import sqlite3
import os
from collections import Counter

# Filesystem path of the SQLite dictionary database that analyze_roots() checks
# for existence and opens.
DB_PATH = '/mnt/pdfs/dictionary.db'

def _count_aligned_words(rows):
    """Tally position-aligned Turkish/English word pairs from term rows.

    Args:
        rows: Iterable of ``(canonical_tr, canonical_en)`` pairs, e.g. the
            rows fetched from the ``technical_terms`` table.

    Returns:
        A ``(pair_counts, tr_word_counts, existing_terms)`` tuple where
        ``pair_counts`` maps ``(tr_word, en_word)`` to how often the two words
        occupied the same position in a phrase, ``tr_word_counts`` maps each
        Turkish word to its total number of counted occurrences, and
        ``existing_terms`` is the set of lower-cased, stripped Turkish terms
        seen in ``rows``.
    """
    pair_counts = Counter()
    tr_word_counts = Counter()
    existing_terms = set()

    for tr_raw, en_raw in rows:
        if not isinstance(tr_raw, str):
            continue

        tr_full = tr_raw.lower().strip()
        # Remember every full term so words that already exist as standalone
        # entries are not reported as missing later.
        existing_terms.add(tr_full)

        # A NULL (or non-text) English side cannot be aligned word-by-word;
        # the Turkish term still counts as existing, but the row is skipped.
        if not isinstance(en_raw, str):
            continue

        tr_words = tr_full.split()
        en_words = en_raw.lower().strip().split()

        # Only 2-4 word phrases with the same word count on both sides are
        # analysed: positional alignment is a heuristic and longer or uneven
        # phrases make it unreliable.
        if not (2 <= len(tr_words) <= 4 and len(tr_words) == len(en_words)):
            continue

        for tr_word, en_word in zip(tr_words, en_words):
            # Drop surrounding punctuation before counting.
            tr_word = tr_word.strip('.,-()')
            en_word = en_word.strip('.,-()')

            if len(tr_word) < 3 or len(en_word) < 3:
                continue

            pair_counts[(tr_word, en_word)] += 1
            tr_word_counts[tr_word] += 1

    return pair_counts, tr_word_counts, existing_terms


def analyze_roots(db_path=None, min_count=5, min_confidence=0.8, max_results=50):
    """Print Turkish words that look like missing single-word dictionary entries.

    Reads every ``(canonical_tr, canonical_en)`` row of ``technical_terms``
    whose Turkish term is longer than two characters, aligns the words of
    multi-word phrases by position, and reports Turkish words that are
    consistently paired with the same English word but do not exist as a
    term on their own.

    Args:
        db_path: Path of the SQLite database. ``None`` (the default) uses the
            module-level ``DB_PATH``.
        min_count: Minimum number of times a (TR, EN) word pair must occur.
        min_confidence: A pair is reported only when the share of the Turkish
            word's occurrences that map to this English word is strictly
            greater than this value.
        max_results: Stop after reporting this many candidates.

    Returns:
        None. Results are written to standard output. If the database file
        does not exist, a message is printed and nothing else happens.
    """
    if db_path is None:
        db_path = DB_PATH

    # sqlite3.connect() would silently create an empty database, so bail out
    # explicitly when the file is missing.
    if not os.path.exists(db_path):
        print("Veritabanı bulunamadı.")
        return

    conn = sqlite3.connect(db_path)
    try:
        # Fetch all terms.
        print("⏳ Sözlük taranıyor...")
        all_terms = conn.execute(
            "SELECT canonical_tr, canonical_en FROM technical_terms WHERE length(canonical_tr) > 2"
        ).fetchall()
    finally:
        # Always release the connection, even if the query fails.
        conn.close()

    print(f"📋 Toplam {len(all_terms)} terim inceleniyor...")

    pair_counts, tr_word_counts, existing_terms = _count_aligned_words(all_terms)

    # Report the results.
    missing_roots = []

    print("\n🔍 POTANSİYEL EKSİK KÖK KELİMELER:")
    print("-" * 60)
    print(f"{'TR KELİME':<20} | {'EN ADAYI':<20} | {'FREKANS':<10} | {'GÜVEN'}")
    print("-" * 60)

    for (tr, en), count in pair_counts.most_common():
        # most_common() yields counts in descending order, so nothing after
        # the first pair below the threshold can qualify.
        if count < min_count:
            break

        # Skip words that already exist in the dictionary on their own.
        if tr in existing_terms:
            continue

        # Confidence: of all counted occurrences of this Turkish word, the
        # fraction that was paired with this particular English word.
        confidence = count / tr_word_counts[tr]
        if confidence <= min_confidence:
            continue

        print(f"{tr:<20} | {en:<20} | {count:<10} | %{int(confidence*100)}")
        missing_roots.append((tr, en, count))

        # Only the first `max_results` candidates are shown.
        if len(missing_roots) >= max_results:
            break

    print("-" * 60)
    print(f"\n💡 Toplam {len(missing_roots)} adet eksik kök kelime tespit edildi.")
    print("Bunları sözlüğe eklemek için onay verin.")

# Run the analysis only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    analyze_roots()