o
    D7i{'                     @   s   d Z ddlZddlmZmZmZmZ ddlmZ ddlm	Z	m
Z
 ddlmZmZmZmZmZ G dd	 d	Z	
				ddedededededededeee ef fddZedkr{ddlZeejdkr}eejd ejd \ZZede  dS dS dS )z
Vector-based validation for glossary entries
- Connect to Qdrant (10.10.10.25:6333)
- Generate embeddings for terms
- Search corpus for validation
- Calculate confidence scores
    N)ListDictTupleOptional)tqdm)ThreadPoolExecutoras_completed   )QdrantHelperOpenAIHelperis_valid_glossary_entrydetect_languageloggerc                
   @   s   e Zd ZdZ				d&dededed	efd
dZdefddZ	dedede
e deeef fddZdedefddZ		d'de
e dedede
e fddZde
e de
e fddZdefd d!Zd"efd#d$Zd%S )(GlossaryValidatorz-Validate glossary entries using vector search10.10.10.25  machine_docs333333?qdrant_hostqdrant_port
collectionconfidence_thresholdc                 C   s8   t |||| _t | _|| _g | _dddddd| _d S )Nr           )total_validatedhigh_confidencelow_confidencecorpus_hitsaverage_confidence)r
   qdrantr   openair   validated_entriesstats)selfr   r   r   r    r#   C/var/www/html/PEPCVSON/glossary_cleaner/src/validate_with_vector.py__init__   s   zGlossaryValidator.__init__returnc                 C   s   | j  }| j }|o|S )zConnect to Qdrant and OpenAI)r   connectr   )r"   	qdrant_ok	openai_okr#   r#   r$   r'   -   s   

zGlossaryValidator.connectsourcetargetsearch_resultsc                 C   s:  dddg d}t ||\}}|sd|d< |d d|  t|}|dkr8|d  d	9  < |d d
|  |rytdd |D }||d< | }	|D ] }
|
di }|ddp`|dd}|	| v rmd|d<  nqM|d ry|d d |d rdnd	}|d d |d d  |d  }tdtd|}||fS )zf
        Calculate confidence score for an entry
        Returns: (confidence_score, details)
        Fr         ?
corpus_hitsemantic_scoreformat_scorereasonsr1   r2   zformat_invalid: turkish      ?target_lang: c                 s       | ]}|d  V  qdS )scoreNr#   ).0rr#   r#   r$   	<genexpr>R       z9GlossaryValidator.calculate_confidence.<locals>.<genexpr>r0   payloadtext contentTr/   corpus_match333333?g?)r   appendr   maxlowergetmin)r"   r*   r+   r,   detailsis_validreasontarget_lang
best_scoresource_lowerresultr<   r=   corpus_bonus
confidencer#   r#   r$   calculate_confidence3   sF   

z&GlossaryValidator.calculate_confidenceentryc                 C   s`   |d }|d }| j |}g }|r| jj|dd}| |||\}}i ||||| jkdS )zValidate a single entryr*   r+      limitrO   validation_detailsrH   )r   get_embeddingr   searchrP   r   )r"   rQ   r*   r+   	embeddingr,   rO   rG   r#   r#   r$   validate_entryp   s   z GlossaryValidator.validate_entry2      entries
batch_sizemax_workersc                 C   s  t dt| d g }d}ttdt||ddD ]}||||  }dd |D }| j|}	t|D ]c\}
}|
t|	k rD|	|
 nd	}g }|rR| jj	|d
d}| 
|d |d |\}}i ||||| jkd}|| ||7 }|d r| jd  d7  < n	| jd  d7  < |d r| jd  d7  < q6q|| _t|| jd< |r|t| nd| jd< t d| jd  d| jd  d |S )zValidate entries in batcheszStarting validation of  entries...r   r   
Validatingdescc                 S   s   g | ]}|d  qS )r*   r#   r8   er#   r#   r$   
<listcomp>   s    z4GlossaryValidator.validate_batch.<locals>.<listcomp>NrR   rS   r*   r+   rU   rH   r   r	   r   r/   r   r   r   zValidation complete: z high confidence, z low confidence)r   infolenr   ranger   get_embeddings_batch	enumerater   rX   rP   r   rB   r!   r    )r"   r]   r^   r_   	validatedtotal_confidenceibatchsources
embeddingsjrQ   rY   r,   rO   rG   validated_entryr#   r#   r$   validate_batch   sL   
$z GlossaryValidator.validate_batchc                 C   s<  t dt| d g }t|ddD ]j}|d }|d }t||\}}t|}|r,dnd}	|d	kr6|	d
9 }	dd|r<dnd|sB|gng d}
|d	krT|
d d|  i ||	|
|	| jkd}|| |d rt| jd  d7  < q| jd  d7  < q|| _	t|| jd< |rt
dd |D t| nd| jd< |S )z
        Simple validation without vector search (rule-based only)
        Use this when Qdrant/OpenAI is not available
        zRunning simple validation on r`   zSimple validationrb   r*   r+   g?rA   r3   r4   Fr   r-   r.   r2   r5   rU   rH   r   r	   r   r   c                 s   r6   )rO   Nr#   rd   r#   r#   r$   r:      r;   z4GlossaryValidator.validate_simple.<locals>.<genexpr>r   r   )r   rg   rh   r   r   r   rB   r   r!   r    sum)r"   r]   rl   rQ   r*   r+   rH   rI   rJ   rO   rG   rs   r#   r#   r$   validate_simple   s@   

(z!GlossaryValidator.validate_simplec                 C   s   | j S )zGet validation statistics)r!   )r"   r#   r#   r$   	get_stats   s   zGlossaryValidator.get_statsfilepathc                 C   sb   t |ddd}tj| j|ddd W d   n1 sw   Y  tdt| j d	|  dS )
zSave validation results to JSONwutf-8encodingF   )ensure_asciiindentNzSaved z validated entries to )openjsondumpr    r   rg   rh   )r"   rx   fr#   r#   r$   save_results   s    zGlossaryValidator.save_resultsN)r   r   r   r   )r[   r\   )__name__
__module____qualname____doc__strintfloatr%   boolr'   r   r   r   rP   rZ   rt   rv   rw   r   r#   r#   r#   r$   r      sP    


=
93r   r   r   r   r   T
input_fileoutput_filer   r   r   r   
use_vectorr&   c                 C   s   t | ddd}t|}W d   n1 sw   Y  t||||d}	|r;|	 r0|	|}
ntd |	|}
n|	|}
|		| |
|	
 fS )zQ
    Main function to run validation
    Returns: (validated_entries, stats)
    r9   rz   r{   N)r   r   r   r   z>Vector services unavailable, falling back to simple validation)r   r   loadr   r'   rt   r   warningrv   r   rw   )r   r   r   r   r   r   r   r   r]   	validatorrl   r#   r#   r$   run_validation   s"   


r   __main__   r}   zStats: )r   r   r   r   T)r   r   typingr   r   r   r   r   concurrent.futuresr   r   utilsr
   r   r   r   r   r   r   r   r   r   r   r   sysrh   argvr]   r!   printr#   r#   r#   r$   <module>   sJ    	 k
(