o
    38i!                     @   s2  d Z ddlZddlZddlmZmZmZmZmZ ddl	m
Z
 ejdd zddlZW n ey:   ddlmZ Y nw G dd	 d	Zed
kre Zg dZed eD ]\ZZZede de de d qSeeZedee d eD ]Zeded  ded  ded  ded dd	 qxdS dS )u[   
Term Cleaner Module
===================
Stop-word temizliği, filtreleme ve normalizasyon
    N)ListDictTupleSetOptional)Counterz..   )configc                   @   s  e Zd ZdZdd Zd"dededefdd	Zd
ededefddZd#dededefddZ		d$de
eeeef  dede
e fddZ	d$de
e dede
e fddZde
e de
e fddZ	d%de
e dede
e fddZde
eeeef  de
e fd d!ZdS )&TermCleanerum   
    Terim temizleme ve filtreleme sınıfı.
    Stop-word, frekans ve güven skoru filtreleri uygular.
    c                 C   s   t j| _t j| _g d| _d S )N))z\s+ )z	^\s+|\s+$ )z["\']r   )z	\([^)]*\)r   )z
\[[^\]]*\]r   )[0-9]+r   )r	   ENGLISH_STOP_WORDSen_stopwordsTURKISH_STOP_WORDStr_stopwordsclean_patternsself r   ;/var/www/html/PEPCVSON/terminology-extractor/src/cleaner.py__init__   s   zTermCleaner.__init__Ftermremove_numbersreturnc                 C   sN   |sdS |  }| jD ]\}}|dkr|sqt|||}qd| }|S )u   
        Tek bir terimi temizle.
        
        Args:
            term: Temizlenecek terim
            remove_numbers: Sayıları kaldır
            
        Returns:
            Temizlenmiş terim
        r   r   r   )stripr   resubjoinsplit)r   r   r   cleanedpatternreplacementr   r   r   
clean_term(   s   zTermCleaner.clean_termsourcetargetc                    s   |r|sdS t |tjk rdS t |tjk rdS | }| }t |tjkr)dS t |tjkr2dS t fdd|D r?dS t fdd|D rLdS |dd rVdS |dd r`dS dS )u   
        Terim çiftinin geçerli olup olmadığını kontrol et.
        
        Args:
            source: Kaynak terim
            target: Hedef terim
            
        Returns:
            True if valid
        Fc                 3       | ]
}|   jv V  qd S N)lowerr   .0wr   r   r   	<genexpr>b       z,TermCleaner.is_valid_term.<locals>.<genexpr>c                 3   r&   r'   )r(   r   r)   r   r   r   r,   d   r-   r   r   T)lenr	   MIN_TERM_LENGTHr   MAX_TERM_WORDSallreplaceisdigit)r   r$   r%   	src_words	tgt_wordsr   r   r   is_valid_termC   s*   zTermCleaner.is_valid_termenlangc                    s8   |dkr| j n| j | } fdd|D }d|S )u   
        Terimden stop-word'leri kaldır.
        
        Args:
            term: İşlenecek terim
            lang: Dil ('en' veya 'tr')
            
        Returns:
            Stop-word'suz terim
        r7   c                    s   g | ]
}|   vr|qS r   )r(   r)   	stopwordsr   r   
<listcomp>}   s    z0TermCleaner.remove_stopwords.<locals>.<listcomp>r   )r   r   r   r   )r   r   r8   wordsfilteredr   r9   r   remove_stopwordso   s   
zTermCleaner.remove_stopwordsNtermsmin_freqc              	   C   s   |pt j}t }i }|D ]#\}}}| | f}||  d7  < ||vs+||| kr/|||< qg }	| D ]\\}}}
|
|krN|	|||
|||f d q6|	S )u   
        Terimleri frekansa göre filtrele ve say.
        
        Args:
            terms: List of (source, target, score) tuples
            min_freq: Minimum frekans
            
        Returns:
            Filtered and counted terms
           )r$   r%   	frequency
confidence)r	   MIN_FREQUENCYr   r(   most_commonappend)r   r?   r@   counterscoressrctgtscorekeyresultfreqr   r   r   filter_by_frequency   s(   

zTermCleaner.filter_by_frequencymin_confc                    s    pt j  fdd|D S )u   
        Terimleri güven skoruna göre filtrele.
        
        Args:
            terms: List of term dicts
            min_conf: Minimum güven skoru
            
        Returns:
            Filtered terms
        c                    s    g | ]}| d d kr|qS )rC   r   )get)r*   trP   r   r   r;      s     z4TermCleaner.filter_by_confidence.<locals>.<listcomp>)r	   MIN_CONFIDENCE)r   r?   rP   r   rS   r   filter_by_confidence   s   
z TermCleaner.filter_by_confidencec                 C   sT   i }|D ]}|d   }||vr|||< q|d || d kr#|||< qt| S )u   
        Tekrarlı terimleri kaldır, en yüksek frekanslı olanı tut.
        
        Args:
            terms: List of term dicts
            
        Returns:
            Deduplicated terms
        r$   rB   )r(   listvalues)r   r?   seenr   rL   r   r   r   deduplicate   s   

zTermCleaner.deduplicateT	lowercasec                 C   sH   g }|D ]}|  }|r|d  |d< |d  |d< || q|S )u   
        Terim case'ini normalize et.
        
        Args:
            terms: List of term dicts
            lowercase: Küçük harfe çevir
            
        Returns:
            Normalized terms
        r$   r%   )copyr(   rF   )r   r?   rZ   rM   r   new_termr   r   r   normalize_case   s   zTermCleaner.normalize_casec                 C   sx   g }|D ]\}}}|  |}|  |}| ||r!||||f q| |}| |}	| |	}
|
jdd dd |
S )z
        Batch halinde terimleri temizle ve filtrele.
        
        Args:
            terms: List of (source, target, score) tuples
            
        Returns:
            Cleaned and filtered terms
        c                 S   s   | d S )NrB   r   )xr   r   r   <lambda>  s    z)TermCleaner.clean_batch.<locals>.<lambda>T)rL   reverse)r#   r6   rF   rO   rU   rY   sort)r   r?   r    rI   rJ   rK   	clean_src	clean_tgtcounted	confidentuniquer   r   r   clean_batch   s   




zTermCleaner.clean_batch)F)r7   r'   )T)__name__
__module____qualname____doc__r   strboolr#   r6   r>   r   r   floatintr   rO   rU   rY   r]   rg   r   r   r   r   r
      s4    ,
(

(r
   __main__))zhydraulic pumpzhidrolik pompagffffff?)zHydraulic PumpzHidrolik Pompagq=
ףp?)zthe pumppompag?)safety valveemniyet valfig)\(?)rr   rs   g?)abirGz?)123456rv   )r   r   rv   zTest Terms:z  z -> z	 (score: )z
Cleaned Terms (z):r$   r%   z (freq: rB   z, conf: rC   z.2f)rk   r   systypingr   r   r   r   r   collectionsr   pathinsertr	   ImportErrorr   r
   rh   cleaner
test_termsprintrI   rJ   rK   rg   rM   r.   r   r   r   r   r   <module>   s:     y
