o
    7i                     @   s  d Z ddlZddlZddlZddlZddlmZmZmZm	Z	 ddl
m
Z
 ejejdd eeZddd	d
dddddddddddZi ddddddddddddddd d d!d!d"dd#d$d%d&d'd(d)d)d)d)d*d+d,d+d-d.iZed/ejed0ejed1ejd2Zd3ed4efd5d6Zd3ed4efd7d8Zd3ed4efd9d:Zd3ed4efd;d<Zd=ed>ed4e	eef fd?d@ZG dAdB dBZG dCdD dDZdEed4efdFdGZdHedIefdJdKZ dIed4efdLdMZ!d4efdNdOZ"dS )Pz
Utility functions for glossary cleaning system
- Turkish character handling
- Qdrant client wrapper
- OpenAI embedding helper
- Logging functions
    N)ListDictOptionalTuple)datetimez)%(asctime)s - %(levelname)s - %(message)s)levelformatcCgGiIoOsSuU)   ç   Ç   ğ   Ğ   ır      İr      ö   Ö   ş   Ş   ü   Ü   ýr      þr      Ýr      Þr      ðr      Ðr   r   r      ÿ   ×x   ÷/z: "'",
    '"u   –-u   —u   …z...zz\b(il|della|degli|anelli|preferibilmente|lubrificare|vedere|sopra|regolare|sostituire|lato|nella|sono|essere|fare|avere)\buH   \b(mit|und|oder|die|das|der|ist|sind|werden|haben|nicht|auch|für|auf)\bz5\b(el|la|los|las|del|para|con|por|una|uno|que|como)\bitaliangermanspanishtextreturnc                 C   s"   t  D ]
\}}| ||} q| S )zFix common OCR errors in text)OCR_CORRECTIONSitemsreplace)r3   wrongcorrect r:   4/var/www/html/PEPCVSON/glossary_cleaner/src/utils.pyfix_ocr_errorsA   s   r<   c                 C   s   t | } | S )z,Normalize Turkish text - fix encoding issues)r<   r3   r:   r:   r;   normalize_turkishH   s   r>   c                 C   s   |  dd dd} |  S )z2Convert to lowercase preserving Turkish charactersr   r   r   r   )r7   lowerr=   r:   r:   r;   to_lowercase_turkishN   s   r@   c                    sn   t  D ]\}}|| r|  S qtd t fdd| D }|s5td| r5|  }t|dkr5dS dS )z1Detect if text contains foreign language patterns   çğıöşüÇĞİÖŞÜc                 3   s    | ]}| v V  qd S Nr:   .0r	   turkish_charsr:   r;   	<genexpr>]       z"detect_language.<locals>.<genexpr>z^[A-Za-z0-9\s\-\.,/\(\)]+$   englishturkish)	FOREIGN_PATTERNSr6   searchsetanyrematchsplitlen)r3   langpatternhas_turkishwordsr:   rE   r;   detect_languageU   s   
rX   sourcetargetc                 C   s   | r|sdS t |  dk st | dk rdS td|r dS t|}|dv r1dd|  fS td	d
 | D }t|}|rF|dkrFdS d|v rf|ddkrfdd |dD }t	dd
 |D rfdS t | dkrtt |dkrtdS dS )zC
    Validate a glossary entry
    Returns: (is_valid, reason)
    )FEMPTY_VALUE   )F	TOO_SHORTu   [ÖýþÝÞ]{2,})FOCR_GARBAGEr/   FFOREIGN_LANG_c                 s   s    | ]}|d v V  qdS )rA   Nr:   rC   r:   r:   r;   rG   }   rH   z*is_valid_glossary_entry.<locals>.<genexpr>rJ   )FWRONG_DIRECTION,c                 S   s   g | ]}|  qS r:   )striprD   pr:   r:   r;   
<listcomp>   s    z+is_valid_glossary_entry.<locals>.<listcomp>c                 s   s    | ]	}t |d k V  qdS )   N)rS   rc   r:   r:   r;   rG      s    )FMULTIPLE_ALTERNATIVESrI   2   )FLENGTH_MISMATCH)TOK)
rS   rb   rP   rM   rX   upperrO   countrR   all)rY   rZ   target_langsource_has_turkishpartsr:   r:   r;   is_valid_glossary_entryg   s(    rq   c                   @   sP   e Zd ZdZddededefdd	Zd
d Zddee	 dedee
 fddZdS )QdrantHelperz"Helper class for Qdrant operations10.10.10.25  machine_docshostport
collectionc                 C   s   || _ || _|| _d | _d S rB   )rv   rw   rx   client)selfrv   rw   rx   r:   r:   r;   __init__   s   
zQdrantHelper.__init__c              
   C   s   z%ddl m} || j| jdd| _| j| j}td|j	dd W dS  t
y@ } ztd	|  W Y d
}~dS d
}~ww )zConnect to Qdrantr   )QdrantClient   )rv   rw   timeoutzConnected to Qdrant: ra   z vectorsTzQdrant connection error: NF)qdrant_clientr|   rv   rw   ry   get_collectionrx   loggerinfopoints_count	Exceptionerror)rz   r|   r   er:   r:   r;   connect   s   zQdrantHelper.connect   query_vectorlimitr4   c              
   C   sl   | j sg S z| j j| j||dd}dd |jD W S  ty5 } ztd|  g W  Y d}~S d}~ww )zSearch in QdrantT)collection_namequeryr   with_payloadc                 S   s   g | ]	}|j |jd qS )scorepayloadr   )rD   rr:   r:   r;   re      s    z'QdrantHelper.search.<locals>.<listcomp>zQdrant search error: N)ry   query_pointsrx   pointsr   r   r   )rz   r   r   resultsr   r:   r:   r;   rM      s"   zQdrantHelper.searchN)rs   rt   ru   )r   )__name__
__module____qualname____doc__strintr{   r   r   floatr   rM   r:   r:   r:   r;   rr      s
    $rr   c                
   @   sj   e Zd ZdZddefddZdd Zded	eee	  fd
dZ
ddee ded	eeee	   fddZdS )OpenAIHelperz"Helper class for OpenAI embeddingstext-embedding-3-largemodelc                 C   s   || _ d | _d S rB   )r   ry   )rz   r   r:   r:   r;   r{      s   
zOpenAIHelper.__init__c              
   C   sf   zddl m} | | _td| j d W dS  ty2 } ztd|  W Y d}~dS d}~ww )	zInitialize OpenAI clientr   )OpenAIz"OpenAI client initialized (model: )TzOpenAI initialization error: NF)openair   ry   r   r   r   r   r   )rz   r   r   r:   r:   r;   r      s   zOpenAIHelper.connectr3   r4   c              
   C   sd   | j sdS z| j jj|| jd}|jd jW S  ty1 } ztd|  W Y d}~dS d}~ww )zGet embedding for textNinputr   r   zEmbedding error: )	ry   
embeddingscreater   data	embeddingr   r   r   )rz   r3   responser   r:   r:   r;   get_embedding   s   zOpenAIHelper.get_embeddingd   texts
batch_sizec           	      C   s   | j s
dgt| S g }tdt||D ]F}||||  }z| j jj|| jd}|jD ]}||j q,W q t	yZ } zt
d|  |dgt|  W Y d}~qd}~ww |S )z!Get embeddings for multiple textsNr   r   zBatch embedding error: )ry   rS   ranger   r   r   r   appendr   r   r   r   extend)	rz   r   r   r   r   batchr   itemr   r:   r:   r;   get_embeddings_batch   s&   
 z!OpenAIHelper.get_embeddings_batchN)r   )r   )r   r   r   r   r   r{   r   r   r   r   r   r   r   r:   r:   r:   r;   r      s    ,r   config_pathc                 C   sD   ddl }t| ddd}||W  d   S 1 sw   Y  dS )z!Load configuration from YAML filer   Nr   utf-8encoding)yamlopen	safe_load)r   r   fr:   r:   r;   load_config   s   $r   r   filepathc                 C   sF   t |ddd}tj| |ddd W d   dS 1 sw   Y  dS )zSave data to JSON filewr   r   Fr\   )ensure_asciiindentN)r   jsondump)r   r   r   r:   r:   r;   	save_json   s   "r   c                 C   s<   t | ddd}t|W  d   S 1 sw   Y  dS )zLoad data from JSON filer   r   r   N)r   r   load)r   r   r:   r:   r;   	load_json   s   $r   c                   C   s   t  dS )zGenerate timestamp stringz%Y-%m-%d %H:%M:%S)r   nowstrftimer:   r:   r:   r;   generate_timestamp  s   r   )#r   rP   osr   loggingtypingr   r   r   r   r   basicConfigINFO	getLoggerr   r   TURKISH_CHARSr5   compile
IGNORECASErL   r   r<   r>   r@   rX   boolrq   rr   r   r   rO   r   r   r   r:   r:   r:   r;   <module>   s    
	
(-7