o
    38i!                  	   @   s.  d Z ddlZddlmZmZmZmZ ddlmZ ej	dd zddl
Z
W n ey4   ddlm
Z
 Y nw G dd	 d	Zed
kre ZdZdZede  ede  eeeZed eD ]Zeded  ded   qaeeeZed eD ]\ZZZede de dedd q~dS dS )ua   
Word Alignment Module
=====================
BERT tabanlı kelime hizalama (simalign kullanarak)
    N)ListDictTupleOptional)tqdmz..   )configc                   @   s   e Zd ZdZddedefddZdd Zd	ed
edee fddZ			ddee
eef  dededeee  fddZ	dd	ed
ededee
eeef  fddZ	dd	ed
ededee
eeef  fddZdS )WordAlignerui   
    BERT tabanlı kelime hizalama sınıfı.
    simalign kütüphanesini wrapper olarak kullanır.
    N
model_namedevicec                 C   s0   |pt j| _|pt jrdnd| _d| _d| _dS )u   
        Args:
            model_name: BERT model adı (default: config'den)
            device: 'cpu' veya 'cuda' (default: config'den)
        cpucudaNF)r   ALIGNMENT_MODELr
   CPU_ONLYr   aligner_initialized)selfr
   r    r   =/var/www/html/PEPCVSON/terminology-extractor/src/alignment.py__init__   s   
zWordAligner.__init__c              
   C   s   | j rdS z)ddlm} td| j  td| j  || jddd| _d	| _ td
 W dS  ty:   td   tyM } ztd|   d}~ww )u   Modeli yükle (lazy loading)Nr   )SentenceAligneru   🔄 Loading alignment model: z   Device: bpemai)model
token_typematching_methodsTu   ✅ Alignment model loadedu5   ❌ simalign not installed. Run: pip install simalignu$   ❌ Failed to load alignment model: )	r   simalignr   printr
   r   r   ImportError	Exception)r   r   er   r   r   
initialize$   s*   zWordAligner.initializesourcetargetreturnc              
   C   s   | j s|   |r|sg S zK|  }|  }|r|s!g W S | j||}g }|d|dg }|D ]\}}	|t|k rU|	t|k rU|||	|| ||	 dd q6|W S  t	ys }
 zt
d|
  g W  Y d}
~
S d}
~
ww )u@  
        Tek bir cümle çifti için kelime hizalaması yap.
        
        Args:
            source: İngilizce cümle
            target: Türkçe cümle
            
        Returns:
            List of alignments: [{'src_idx': 0, 'tgt_idx': 1, 'src_word': 'pump', 'tgt_word': 'pompa', 'score': 0.95}, ...]
        itermaxinterg      ?)src_idxtgt_idxsrc_wordtgt_wordscoreu   ⚠️ Alignment error: N)r   r!   stripsplitr   get_word_alignsgetlenappendr   r   )r   r"   r#   
src_tokens
tgt_tokens
alignmentsresultalign_pairsr'   r(   r    r   r   r   align_sentence_pair?   s8   zWordAligner.align_sentence_pairTpairs
batch_sizeshow_progressc           	      C   sX   | j s|   |ptj}g }|rt|ddn|}|D ]\}}| ||}|| q|S )uA  
        Batch halinde cümle çiftlerini hizala.
        
        Args:
            pairs: List of (source, target) tuples
            batch_size: Batch boyutu (default: config'den)
            show_progress: Progress bar göster
            
        Returns:
            List of alignment results for each pair
        Aligning)desc)r   r!   r   
BATCH_SIZEr   r7   r1   )	r   r8   r9   r:   resultsiteratorr"   r#   r4   r   r   r   align_batchr   s   
zWordAligner.align_batch	min_scorec                 C   sP   |pt j}| ||}g }|D ]}|d |kr%||d |d |d f q|S )u'  
        Hizalanmış terimleri çıkar.
        
        Args:
            source: İngilizce cümle
            target: Türkçe cümle
            min_score: Minimum skor (default: config'den)
            
        Returns:
            List of (source_term, target_term, score) tuples
        r+   r)   r*   )r   MIN_CONFIDENCEr7   r1   )r   r"   r#   rA   r4   termsalignr   r   r   extract_aligned_terms   s   
z!WordAligner.extract_aligned_termsmax_phrase_lenc                 C   sB  |pt j}| ||}|sg S |jdd d g }g }g }g }d}	d}
|D ]]}|d |	d krOt|d |
 dkrO||d  ||d	  ||d
  n+|rkt||krk|d|d|t|t| f |d g}|d	 g}|d
 g}|d }	|d }
q%|rt||kr|d|d|t|t| f |S )uf  
        Çok kelimeli ifadeleri (phrase) hizala.
        Ardışık hizalanmış kelimeleri grupla.
        
        Args:
            source: İngilizce cümle
            target: Türkçe cümle
            max_phrase_len: Maksimum kelime sayısı
            
        Returns:
            List of (source_phrase, target_phrase, avg_score) tuples
        c                 S   s   | d S )Nr'   r   )xr   r   r   <lambda>   s    z7WordAligner.extract_phrase_alignments.<locals>.<lambda>)keyr'      r(   r)   r*   r+    )	r   MAX_TERM_WORDSr7   sortabsr1   r0   joinsum)r   r"   r#   rF   r4   phrasescurrent_srccurrent_tgtcurrent_scoreslast_src_idxlast_tgt_idxrD   r   r   r   extract_phrase_alignments   sF   




z%WordAligner.extract_phrase_alignments)NN)NT)N)__name__
__module____qualname____doc__strr   r!   r   r   r7   r   intboolr@   floatrE   rX   r   r   r   r   r	      s4    4


r	   __main__z"Check the hydraulic pump pressure.u)   Hidrolik pompa basıncını kontrol edin.z	
Source: zTarget: z
Word Alignments:z  r)   z -> r*   z
Phrase Alignments:z	 (score: z.2f))r\   systypingr   r   r   r   r   pathinsertr   r    r	   rY   r   r"   r#   r   r7   r4   arX   rR   srctgtr+   r   r   r   r   <module>   s8     ] 