"""
Term Extractor Module
=====================
POS tagging and noun phrase extraction with spaCy.
"""

import os
import sys
from typing import List, Dict, Optional
from collections import defaultdict

# Config import: resolve the path relative to this file so the module
# works regardless of the current working directory
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
try:
    import config
except ImportError:
    from .. import config


class TermExtractor:
    """
    spaCy-based term extraction class.
    Detects noun phrases and technical terms.
    """
    
    def __init__(self, en_model: Optional[str] = None,
                 tr_model: Optional[str] = None):
        """
        Args:
            en_model: English spaCy model name
            tr_model: Turkish spaCy model name
        """
        self.en_model_name = en_model or config.SPACY_EN_MODEL
        self.tr_model_name = tr_model or config.SPACY_TR_MODEL
        self.nlp_en = None
        self.nlp_tr = None
        self._initialized = False
    
    def initialize(self):
        """spaCy modellerini yükle (lazy loading)"""
        if self._initialized:
            return
        
        import spacy
        
        # Load English model
        try:
            print(f"🔄 Loading spaCy English model: {self.en_model_name}")
            self.nlp_en = spacy.load(self.en_model_name)
            print("✅ English model loaded")
        except OSError:
            print(f"⚠️ Model not found. Downloading {self.en_model_name}...")
            spacy.cli.download(self.en_model_name)
            self.nlp_en = spacy.load(self.en_model_name)
        
        # Load Turkish/multilingual model
        try:
            print(f"🔄 Loading spaCy Turkish model: {self.tr_model_name}")
            self.nlp_tr = spacy.load(self.tr_model_name)
            print("✅ Turkish model loaded")
        except OSError:
            print(f"⚠️ Model not found. Trying multilingual model...")
            try:
                # Fallback to multilingual
                self.nlp_tr = spacy.load("xx_ent_wiki_sm")
            except OSError:
                print("⚠️ Using English model for Turkish (limited accuracy)")
                self.nlp_tr = self.nlp_en
        
        self._initialized = True
    
    def extract_noun_phrases(self, text: str, lang: str = 'en') -> List[Dict]:
        """
        Extract noun phrases from the text.

        Args:
            text: Text to process
            lang: Language ('en' or 'tr')
            
        Returns:
            List of noun phrases with metadata
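
        Example (illustrative only; exact chunks depend on the loaded model):
            >>> extractor = TermExtractor()
            >>> extractor.extract_noun_phrases("the hydraulic pump", 'en')
            [{'text': 'the hydraulic pump', 'root': 'pump', 'root_pos': 'NOUN',
              'start': 0, 'end': 3, 'word_count': 3}]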
        """
        if not self._initialized:
            self.initialize()
        
        nlp = self.nlp_en if lang == 'en' else self.nlp_tr
        doc = nlp(text)
        
        phrases = []

        # doc.noun_chunks requires a dependency parse; the multilingual
        # fallback model (xx_ent_wiki_sm) ships without a parser, so bail
        # out early instead of raising.
        if not doc.has_annotation("DEP"):
            return phrases

        # Extract noun chunks (noun phrases)
        for chunk in doc.noun_chunks:
            # Filter by word count
            word_count = len(chunk.text.split())
            if word_count > config.MAX_TERM_WORDS:
                continue
            
            phrases.append({
                'text': chunk.text,
                'root': chunk.root.text,
                'root_pos': chunk.root.pos_,
                'start': chunk.start,
                'end': chunk.end,
                'word_count': word_count
            })
        
        return phrases
    
    def extract_technical_terms(self, text: str, lang: str = 'en') -> List[Dict]:
        """
        Extract technical terms (tokens whose POS is in ALLOWED_POS_TAGS,
        e.g. NOUN, ADJ) from the text.

        Args:
            text: Text to process
            lang: Language ('en' or 'tr')
            
        Returns:
            List of technical terms with POS info
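
        Example (illustrative only; tags and lemmas depend on the loaded
        model and the thresholds in config):
            >>> extractor = TermExtractor()
            >>> extractor.extract_technical_terms("hydraulic pump", 'en')
            [{'text': 'hydraulic', 'lemma': 'hydraulic', 'pos': 'ADJ', 'idx': 0},
             {'text': 'pump', 'lemma': 'pump', 'pos': 'NOUN', 'idx': 1}]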
        """
        if not self._initialized:
            self.initialize()
        
        nlp = self.nlp_en if lang == 'en' else self.nlp_tr
        doc = nlp(text)
        
        terms = []
        
        for token in doc:
            # Only keep allowed POS tags
            if token.pos_ in config.ALLOWED_POS_TAGS:
                # Skip stop words (both lists are checked, so
                # mixed-language segments are handled uniformly)
                lowered = token.text.lower()
                if (lowered in config.ENGLISH_STOP_WORDS
                        or lowered in config.TURKISH_STOP_WORDS):
                    continue
                
                # Skip short tokens
                if len(token.text) < config.MIN_TERM_LENGTH:
                    continue
                
                terms.append({
                    'text': token.text,
                    'lemma': token.lemma_,
                    'pos': token.pos_,
                    'idx': token.i
                })
        
        return terms
    
    def extract_compound_terms(self, text: str, lang: str = 'en',
                               max_words: Optional[int] = None) -> List[str]:
        """
        Extract runs of consecutive nouns/adjectives,
        e.g. "hydraulic pump", "safety valve".

        Args:
            text: Text to process
            lang: Language
            max_words: Maximum number of words per compound
            
        Returns:
            List of compound terms
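
        Example (illustrative only; depends on the loaded model and the
        POS tags allowed in config):
            >>> extractor = TermExtractor()
            >>> extractor.extract_compound_terms("Replace the safety valve.", 'en')
            ['safety valve']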
        """
        if not self._initialized:
            self.initialize()
        
        max_words = max_words or config.MAX_TERM_WORDS
        nlp = self.nlp_en if lang == 'en' else self.nlp_tr
        doc = nlp(text)
        
        compounds = []
        current_compound: List[str] = []

        def flush():
            """Emit the current run if it is a valid compound, then reset."""
            if current_compound and len(current_compound) <= max_words:
                words_lower = [w.lower() for w in current_compound]
                # Skip runs made up entirely of stop words
                if not all(w in config.ENGLISH_STOP_WORDS or
                           w in config.TURKISH_STOP_WORDS
                           for w in words_lower):
                    compounds.append(' '.join(current_compound))
            current_compound.clear()

        for token in doc:
            if token.pos_ in config.ALLOWED_POS_TAGS:
                current_compound.append(token.text)
            else:
                flush()  # a non-term token ends the current run
        flush()  # don't forget a compound at the end of the text

        return compounds
    
    def filter_by_pos(self, alignments: List[Dict], 
                      source_text: str, target_text: str) -> List[Dict]:
        """
        Filter aligned term pairs by POS tag, keeping only those whose
        source word carries an allowed tag (e.g. NOUN, ADJ).

        Args:
            alignments: Word alignment results
            source_text: Source text
            target_text: Target text
            
        Returns:
            Filtered alignments
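
        Example (illustrative only; assumes alignment dicts carry
        'src_word'/'tgt_word' keys as produced upstream):
            >>> extractor = TermExtractor()
            >>> aligns = [{'src_word': 'pump', 'tgt_word': 'pompa'}]
            >>> extractor.filter_by_pos(aligns, "the pump", "pompa")
            [{'src_word': 'pump', 'tgt_word': 'pompa',
              'src_pos': 'NOUN', 'tgt_pos': 'NOUN'}]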
        """
        if not self._initialized:
            self.initialize()
        
        # Map lowercased token text to its POS tag for quick lookup.
        # If a word occurs with several tags, the last occurrence wins.
        doc_en = self.nlp_en(source_text)
        src_pos = {token.text.lower(): token.pos_ for token in doc_en}

        doc_tr = self.nlp_tr(target_text)
        tgt_pos = {token.text.lower(): token.pos_ for token in doc_tr}
        
        filtered = []
        for align in alignments:
            src_word = align.get('src_word', '').lower()
            tgt_word = align.get('tgt_word', '').lower()
            
            # Keep the pair only if the source word carries an allowed POS
            # tag and neither side is a stop word
            src_tag = src_pos.get(src_word, 'X')
            if (src_tag in config.ALLOWED_POS_TAGS
                    and src_word not in config.ENGLISH_STOP_WORDS
                    and tgt_word not in config.TURKISH_STOP_WORDS):
                align['src_pos'] = src_tag
                align['tgt_pos'] = tgt_pos.get(tgt_word, 'X')
                filtered.append(align)
        
        return filtered
    
    def get_pos_distribution(self, text: str, lang: str = 'en') -> Dict[str, int]:
        """
        Compute the POS tag distribution of the text.

        Args:
            text: Text to process
            lang: Language
            
        Returns:
            POS tag counts
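
        Example (illustrative only; counts depend on the loaded model):
            >>> extractor = TermExtractor()
            >>> extractor.get_pos_distribution("the pump", 'en')
            {'DET': 1, 'NOUN': 1}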
        """
        if not self._initialized:
            self.initialize()
        
        nlp = self.nlp_en if lang == 'en' else self.nlp_tr
        doc = nlp(text)
        
        distribution = defaultdict(int)
        for token in doc:
            distribution[token.pos_] += 1
        
        return dict(distribution)


# Test
if __name__ == "__main__":
    extractor = TermExtractor()
    
    # Test sentences
    text_en = "Check the hydraulic pump pressure and replace the safety valve."
    text_tr = "Hidrolik pompa basıncını kontrol edin ve emniyet valfini değiştirin."
    
    print(f"\nEnglish: {text_en}")
    
    # Noun phrases
    nps = extractor.extract_noun_phrases(text_en, 'en')
    print(f"\nNoun Phrases:")
    for np in nps:
        print(f"  {np['text']} (root: {np['root']}, POS: {np['root_pos']})")
    
    # Technical terms
    terms = extractor.extract_technical_terms(text_en, 'en')
    print(f"\nTechnical Terms:")
    for t in terms:
        print(f"  {t['text']} ({t['pos']})")
    
    # Compound terms
    compounds = extractor.extract_compound_terms(text_en, 'en')
    print(f"\nCompound Terms:")
    for c in compounds:
        print(f"  {c}")
    
    # POS distribution
    dist = extractor.get_pos_distribution(text_en, 'en')
    print(f"\nPOS Distribution: {dist}")

