o
    t8i4L                     @   s  d Z ddlZddlZddlZddlZddlZddlmZmZm	Z	m
Z
 ddlmZ ddlmZ ddlmZ g dZedd	e d
 ejZd/dededee fddZdedee fddZdededee fddZdede	eee f fddZdee deeee f fddZdee defdd Zdee defd!d"ZG d#d$ d$Zd%ee d&efd'd(Zd)ed&efd*d+Zd,d- Z e!d.kre   dS dS )0z
Glossary Vector Validation System
- Merge glossaries
- Separate by type (single word, term, sentence)
- Validate against corpus via Qdrant
- Export verified glossaries
    N)ListDictTupleOptional)datetime)defaultdict)tqdm):checkinstallremovereplaceadjustverifyensureinspecttightenloosendrainfillcleanmeasureconnect
disconnectassembledisassembletestrepairmaintainservice	lubricatetorquealign	calibrateoperatestartstoprunturnpresspushpullliftlowerraisemoverotateapplyusesetmakedobehavegetputtakegivekeepreferseenoteconfirmperformcompletefollowz\b(|z)(s|ed|ing|e)?\bTfilepath
has_headerreturnc                 C   s   g }t j| s
|S t| ddd0}t|}|rt|d |D ]}t|dkr8||d 	 |d 	 d q W d   |S 1 sDw   Y  |S )	zLoad CSV filerutf-8encodingN   r      sourcetarget)
ospathexistsopencsvreadernextlenappendstrip)rD   rE   entriesfrU   row r]   @/var/www/html/PEPCVSON/glossary_cleaner/src/vector_validation.pyload_csv#   s&   






r_   c                 C   s   g }t j| s
|S t| ddd+}|D ]}| d}t|dkr3||d  |d  d qW d	   |S 1 s?w   Y  |S )
zLoad TSV filerG   rH   rI   	rK   r   rL   rM   N)rP   rQ   rR   rS   rY   splitrW   rX   )rD   rZ   r[   linepartsr]   r]   r^   load_tsv6   s"   


rd   
deepl_filegoogle_filec           	      C   s  t d t| dd}t|dd}t dt| d t dt| d i }|D ]}|d   }|d |d	 d
d||< q(d}|D ]+}|d   }||v rb|d7 }|d |d	 dd||< qC|d |d	 dd||< qCt| }t dt| d| d |S )zY
    Merge DeepL and Google glossaries
    Google entries take priority on conflicts
    u   📂 Loading glossaries...F)rE   Tz
   DeepL:  entriesz   Google: rN   rO   deepl)rN   rO   originr   rL   googleu   ✅ Merged: z unique entries (z conflicts, Google prioritized))printr_   rW   r,   rY   listvalues)	re   rf   deepl_entriesgoogle_entriesmergedentrykey	conflictsresultr]   r]   r^   merge_glossariesG   s8   ru   rN   c                 C   sT   |   }t|}t| }dd |D }|dkrd|fS |dkr&|s&d|fS d|fS )z_
    Classify entry as: single_word, term, or sentence
    Returns: (type, detected_verbs)
    c                 S   s   g | ]}|d    qS )r   )r,   ).0vr]   r]   r^   
<listcomp>   s    z"classify_entry.<locals>.<listcomp>rK   single_word   termsentence)ra   rW   
VERB_REGEXfindall)rN   words
word_countverbs_found	verb_listr]   r]   r^   classify_entryy   s   
r   rZ   c                 C   s   t d g g g d}t| ddD ]!}t|d \}}i ||t|d  |d}|| | qt d t dt|d	   t d
t|d   t dt|d   |S )zZ
    Separate entries by type
    Returns dict with keys: single_word, term, sentence
    u   
🔀 Separating by type...)ry   r{   r|   ClassifyingdescrN   )typer   detected_verbsu   ✅ Classification complete:z   Single words (1-2): ry   z   Terms (3-4, no verbs): r{   z   Sentences (5+ or verbs): r|   )rk   r   r   rW   ra   rX   )rZ   rt   rq   
entry_typeverbsclassified_entryr]   r]   r^   separate_by_type   s(   r   	sentencesoutput_filec                 C   s   t d|  t|dddd2}t|}|g d | D ]}||d |d |d	 d
|d |ddg qW d   n1 sDw   Y  t dt|  d dS )z(Save sentences to translation memory CSVu!   
💾 Saving translation memory: wrH    rJ   newline)rN   rO   r   r   ri   rN   rO   r   ;r   ri   unknownN
   ✅ Saved z  sentences to translation memory)rk   rS   rT   writerwriterowjoinr7   rW   )r   r   r[   r   rq   r]   r]   r^   save_translation_memory   s   

r   c                 C   sf   t d|  t|ddd}tj| |ddd W d   n1 s"w   Y  t d	t|  d
 dS )z)Save glossary terms to JSON for next stepu   
💾 Saving glossary terms: r   rH   rI   FrK   )ensure_asciiindentNr   z terms for validation)rk   rS   jsondumprW   )rZ   r   r[   r]   r]   r^   save_glossary_terms   s
   r   c                   @   s   e Zd ZdZd"dededefdd	Zd
efddZded
e	e
e  fddZde
e d
e
e	e
e   fddZd#de
e ded
e
e fddZd$de
e ded
e
e fddZd%de
e defdd Zd!S )&QdrantValidatorz$Validate terms against Qdrant corpus10.10.10.25  machine_docshostport
collectionc                 C   s"   || _ || _|| _d | _d | _d S )N)r   r   r   qdrant_clientopenai_client)selfr   r   r   r]   r]   r^   __init__   s
   
zQdrantValidator.__init__rF   c              
   C   s   z#ddl m} || j| jdd| _ | j | j}td|jdd W n ty= } ztd|  W Y d	}~d
S d	}~ww zddl	m
} | | _td W dS  tyi } ztd|  W Y d	}~d
S d	}~ww )zConnect to Qdrant and OpenAIr   )QdrantClient<   )r   r   timeoutu   ✅ Qdrant connected: ,z vectorsu   ❌ Qdrant connection failed: NF)OpenAIu   ✅ OpenAI connectedu   ❌ OpenAI connection failed: T)r   r   r   r   get_collectionr   rk   points_count	Exceptionopenair   r   )r   r   infoer   r]   r]   r^   r      s(   
zQdrantValidator.connecttextc              
   C   sV   z| j jj|dd}|jd jW S  ty* } ztd|  W Y d}~dS d}~ww )zGet embedding for texttext-embedding-3-largeinputmodelr   u   ⚠️ Embedding error: N)r   
embeddingscreatedata	embeddingr   rk   )r   r   responser   r]   r]   r^   get_embedding   s   zQdrantValidator.get_embeddingtextsc              
   C   sf   z| j jj|dd}dd |jD W S  ty2 } ztd|  dgt| W  Y d}~S d}~ww )z!Get embeddings for batch of textsr   r   c                 S   s   g | ]}|j qS r]   )r   )rv   itemr]   r]   r^   rx      s    z8QdrantValidator.get_embeddings_batch.<locals>.<listcomp>u   ⚠️ Batch embedding error: N)r   r   r   r   r   rk   rW   )r   r   r   r   r]   r]   r^   get_embeddings_batch   s   z$QdrantValidator.get_embeddings_batch   vectorlimitc              
   C   s`   z| j j| j||dd}dd |jD W S  ty/ } ztd|  g W  Y d}~S d}~ww )zSearch in QdrantT)collection_namequeryr   with_payloadc                 S   s   g | ]	}|j |jd qS )scorepayloadr   )rv   rG   r]   r]   r^   rx     s    z*QdrantValidator.search.<locals>.<listcomp>u   ⚠️ Search error: N)r   query_pointsr   pointsr   rk   )r   r   r   resultsr   r]   r]   r^   search  s   zQdrantValidator.search2   rZ   
batch_sizec              	   C   sV  t dt| d g }ttdt||ddD ]d}||||  }dd |D }| |}t|D ]I\}}	|t|k r@|| nd}
|
rf| j|
d	d
}|rS|d d nd}|dkr\d}n|dkrcd}nd}nd}d}|i |	||t	 
 d q2qtt}|D ]}||d   d7  < qt d t| D ]\}}t d| d|  q|S )zValidate entries in batchesu   
🔍 Validating z terms against corpus...r   zValidating batchesr   c                 S      g | ]}|d  qS rN   r]   rv   r   r]   r]   r^   rx         z2QdrantValidator.validate_batch.<locals>.<listcomp>Nr   )r   r   gffffff?verifiedg      ?	uncertainrejectederror
confidencestatusvalidated_atr   rL   u   ✅ Validation complete:z   : )rk   rW   r   ranger   	enumerater   rX   r   now	isoformatr   intsorteditems)r   rZ   r   	validatedibatchsourcesr   jrq   r   r   
best_scorer   statsrw   countr]   r]   r^   validate_batch  sB   

zQdrantValidator.validate_batchglossary_termsr   c                 C   s  ddl m}m}m} dd |D }td| dt| d z| j| td | j| W n   Y | jj	||d	|j
d
d g }dd |D }td ttdt|dddD ]C}	||	|	d  }
dd |
D }| |}t|
D ](\}}|| r||t||| |d |d |d |d |dddd quq[tdt| d tdt|dD ]}	||	|	d  }
| jj||
d qtd| dt| d  t|S )!z:Create glossary collection in Qdrant with verified entriesr   )VectorParamsDistancePointStructc                 S      g | ]
}|d  dkr|qS r   r   r]   r   r]   r]   r^   rx   H      z>QdrantValidator.create_glossary_collection.<locals>.<listcomp>u   
📦 Creating collection 'z' with z verified entries...z!   Collection exists, deleting...i   )sizedistance)r   vectors_configc                 S   r   r   r]   r   r]   r]   r^   rx   [  r   z-   Getting embeddings for verified entries...r   	Embeddingr   c                 S   r   r   r]   r   r]   r]   r^   rx   `  r   rN   rO   r   r   ri   r   )rN   rO   r   r   ri   )idr   r   z   Inserting z
 points...d   )r   r   u   ✅ Collection 'z' created with rg   )qdrant_client.modelsr   r   r   rk   rW   r   r   delete_collectioncreate_collectionCOSINEr   r   r   r   rX   r7   upsert)r   rZ   r   r   r   r   r   r   r   r   r   batch_sourcesr   r   rq   r]   r]   r^   create_glossary_collectionD  sP   

z*QdrantValidator.create_glossary_collectionN)r   r   r   )r   )r   )r   )__name__
__module____qualname____doc__strr   r   boolr   r   r   floatr   r   r   r   r   r  r]   r]   r]   r^   r      s    "  1r   r   
output_dirc                 C   s  t d dd | D }dd | D }dd | D }tj|d}t|ddd	d
}t|}|D ]}||d |d g q0W d   n1 sHw   Y  t d| dt| d tj|d}	t|	ddd}|D ]}|	|d  d|d  d qkW d   n1 sw   Y  t d|	 dt| d tj|d}
t|
ddd	d
1}t|}|g d |D ]}||d |d |d d|d |
dd	g qW d   n1 sw   Y  t d|
 dt| d tj|d}t|ddd	d
2}t|}|g d |D ]}||d |d |d d|d |
dd	g qW d   n	1 s3w   Y  t d| dt| d dS )zGenerate export filesu    
📤 Generating export files...c                 S   r   r   r]   r   r]   r]   r^   rx     r   z$generate_exports.<locals>.<listcomp>c                 S   r   )r   r   r]   r   r]   r]   r^   rx     r   c                 S   r   )r   r   r]   r   r]   r]   r^   rx     r   zdeepl_glossary_final.csvr   rH   r   r   rN   rO   Nu      ✅ r   rg   zgoogle_glossary_final.tsvrI   r`   
zglossary_uncertain.csv)rN   rO   r   r   ri   r   z.3fr   ri   zglossary_rejected.csv)rk   rP   rQ   r   rS   rT   r   r   rW   writer7   )r   r
  r   r   r   re   r[   r   rq   rf   uncertain_filerejected_filer]   r]   r^   generate_exports{  sV   
 


r  r   c                 C   s  t j|d}t j|dd t j|d}t|dddf}|dt d	 d
| 	dddd| 	dddd| 	dddd| 	dddd| 	dddd| 	dddd| 	dddd| 	dddd| 	dddd| 	dddd  W d!   n1 sw   Y  t j|d"}t|ddd}t
ji | d#t  i|d$d% W d!   n1 sw   Y  td&|  d!S )'zGenerate validation reportreportsT)exist_okzvalidation_log.mdr   rH   rI   z-# Glossary Validation Report

**Generated:** z%Y-%m-%d %H:%M:%SzR

## Input Summary

| Source | Entries |
|--------|---------|
| DeepL (cleaned) | deepl_countr   r   z |
| Google | google_countz |
| Merged (unique) | merged_countz& |
| Conflicts (Google prioritized) | rs   zp |

## Type Classification

| Type | Count | Destination |
|------|-------|-------------|
| Single word (1-2) | ry   z' | Glossary |
| Term (3-4, no verbs) | r{   z) | Glossary |
| Sentence (5+ or verbs) | r|   zl | Translation Memory |

## Validation Results

| Status | Count |
|--------|-------|
| Verified (>= 0.7) | r   z |
| Uncertain (0.5-0.7) | r   z |
| Rejected (< 0.5) | r   a   |

## Output Files

- `deepl_glossary_final.csv` - DeepL upload format
- `google_glossary_final.tsv` - Google Cloud format  
- `glossary_uncertain.csv` - Manual review needed
- `glossary_rejected.csv` - Not verified in corpus
- `translation_memory.csv` - Sentence translations
Nz
stats.jsongenerated_atrK   )r   u   
📊 Reports generated in )rP   rQ   r   makedirsrS   r  r   r   strftimer7   r   r   r   rk   )r   r
  
report_dirreport_filer[   
stats_filer]   r]   r^   generate_report  sB   

	








(&r  c                  C   s  ddl } | jdd}|jdddd |jd	dd
d | }tjtjtjt}tj	|d}tj	|d}tj	|dd}t
d t
d t
d i }t||}t||d< t|}	t|	d |d< t|	d |d< t|	d |d< tj	|d}
t|	d |
 |	d |	d  }t
dt|  |jrt
d dd |D }nt }| r||}|js|| nt
d dS |D ]}|dd}||dd ||< qt|| t|| t
d t
d t
d dS ) zMain executionr   NzGlossary Vector Validation)descriptionz--skip-validation
store_truezSkip Qdrant validation)actionhelpz--skip-collectionzSkip collection creationr   zdeepl_glossary.csvr   zgoogle_glossary_original.csvz<============================================================zGLOSSARY VECTOR VALIDATIONr  ry   r{   r|   ztranslation_memory.csvu'   
📚 Glossary entries for validation: u/   
⏭️ Skipping validation (--skip-validation)c                 S   s(   g | ]}i |d dt   dqS )g?r   r   )r   r   r   r   r]   r]   r^   rx     s   ( zmain.<locals>.<listcomp>u3   ❌ Cannot proceed without Qdrant/OpenAI connectionr   r   rL   z=
============================================================	COMPLETED)argparseArgumentParseradd_argument
parse_argsrP   rQ   dirnameabspath__file__r   rk   ru   rW   r   r   skip_validationr   r   r   skip_collectionr  r7   r  r  )r!  parserargsbase_dirdata_dirre   rf   r   rp   by_typetm_fileglossary_entriesr   	validatorrw   r   r]   r]   r^   main  sT   




r2  __main__)T)"r  rP   sysrT   r   retypingr   r   r   r   r   collectionsr   r   VERB_PATTERNScompiler   
IGNORECASEr}   r  r  r_   rd   ru   r   r   r   r   r   r  r  r2  r  r]   r]   r]   r^   <module>   s6   2"!
 .06I
