#!/usr/bin/env python3
"""
Sözlük Kalite Skorlama - TÜM TERİMLER
DeepL ile karşılaştırma, 80+ skorlu olanlar yeni DB'ye
"""

import json
import os
import sqlite3
import sys
import time
from datetime import datetime
from urllib.parse import quote

import requests

# Configuration
# Every setting can be overridden through an environment variable so that
# credentials and paths do not have to live in the source file.
# NOTE(security): the literal key below is kept only as a backward-compatible
# fallback. A key committed to source should be treated as exposed: rotate it
# and supply the replacement via the DEEPL_API_KEY environment variable.
DEEPL_API_KEY = os.environ.get("DEEPL_API_KEY", "b121dc7e-8e98-427f-8984-54c4d4f0851e")
DEEPL_API_URL = os.environ.get("DEEPL_API_URL", "https://api.deepl.com/v2/translate")
OLD_DB = os.environ.get("DICTIONARY_DB", "/mnt/pdfs/dictionary.db")
NEW_DB = os.environ.get("DICTIONARY_QUALITY_DB", "/mnt/pdfs/dictionary_quality.db")

# Points awarded per term category (the "category" component of the score).
# Categories that are not listed here are given 5 points by the scorer.
CATEGORY_SCORES = {
    # 20 points
    "parts-catalog": 20,
    "equipment": 20,
    "compound": 20,
    "parts": 20,
    # 15 points
    "katalog-detailed": 15,
    # 10 points
    "deep-scan": 10,
    "general": 10,
    "catalog": 10,
    "maden": 10,
    "maden-vision": 10,
    # 15 points
    "hmk-katalog": 15,
    "dynapac": 15,
    # 10 points
    "gemini-extract": 10,
    # 5 points
    "table": 5,
    "parenthesis": 5,
}

def translate_with_deepl(text: str, source_lang: str, target_lang: str) -> tuple:
    """Translate *text* with the DeepL REST API.

    Args:
        text: The text to translate.
        source_lang: Source language code sent to DeepL (e.g. ``"EN"``).
        target_lang: Target language code sent to DeepL (e.g. ``"TR"``).

    Returns:
        A ``(translation, error)`` pair. On success ``translation`` is the
        translated string and ``error`` is ``None``; on any failure
        ``translation`` is ``None`` and ``error`` is a short description.
        The function never raises for transport or response-format problems.
    """
    # Nothing to translate: skip the network round-trip entirely.
    if not text or not text.strip():
        return None, "empty text"

    try:
        response = requests.post(
            DEEPL_API_URL,
            headers={
                "Authorization": f"DeepL-Auth-Key {DEEPL_API_KEY}",
                "Content-Type": "application/json",
            },
            json={
                "text": [text],
                "source_lang": source_lang,
                "target_lang": target_lang,
            },
            timeout=15,
        )
    except requests.RequestException as exc:
        # Connection errors, timeouts, invalid URL, ...
        return None, str(exc)

    if response.status_code != 200:
        return None, f"HTTP {response.status_code}"

    try:
        translations = response.json()["translations"]
        return translations[0]["text"], None
    except (ValueError, KeyError, IndexError, TypeError) as exc:
        # 200 OK but the body is not JSON or lacks a translation entry.
        # Report that explicitly instead of the misleading "HTTP 200".
        return None, f"unexpected response: {exc!r}"

def normalize_text(text: str) -> str:
    """Return *text* in a canonical form suitable for comparison.

    The text is lower-cased, surrounding whitespace is stripped, and then
    every hyphen and underscore is replaced by a space. Falsy input
    (``None`` or an empty string) yields ``""``.
    """
    if text:
        separators_to_spaces = str.maketrans("-_", "  ")
        return text.lower().strip().translate(separators_to_spaces)
    return ""

def calculate_similarity(text1: str, text2: str) -> float:
    """Return the word-level overlap of two texts as a value in [0.0, 1.0].

    Both texts are normalized with ``normalize_text`` and split on
    whitespace; the result is the size of the intersection of the two word
    sets divided by the size of their union. If either text is empty/``None``
    or contains no words after normalization, the result is ``0.0``.
    """
    if not (text1 and text2):
        return 0.0

    words_a = set(normalize_text(text1).split())
    words_b = set(normalize_text(text2).split())
    if not (words_a and words_b):
        return 0.0

    # Both sets are non-empty here, so the union can never be empty.
    shared = words_a.intersection(words_b)
    combined = words_a.union(words_b)
    return len(shared) / len(combined)

def calculate_quality_score(en_term, dict_tr, deepl_tr, category):
    """Compute the quality score of one dictionary entry.

    The total is the sum of four components:

    * ``deepl_mismatch`` (0-40): based on the word similarity between the
      dictionary translation and the DeepL translation; 0 when no DeepL
      translation is available.
      NOTE(review): lower similarity yields MORE points here, i.e. entries
      that disagree with DeepL score higher. Confirm that this is intended.
    * ``category`` (5-20): looked up in ``CATEGORY_SCORES``; 5 if unknown.
    * ``length_format`` (0-10): based on the length of the English term.
    * ``roundtrip`` (0-30): an estimate based only on the length ratio of
      the English term and the dictionary translation; 0 if either is empty.

    Args:
        en_term: English term.
        dict_tr: Turkish translation stored in the dictionary.
        deepl_tr: Turkish translation returned by DeepL, or ``None``.
        category: Category name of the term.

    Returns:
        A dict with ``total`` (sum of the components), ``breakdown`` (the
        per-component scores) and ``deepl_translation`` (``deepl_tr``).
    """
    # Key order is kept stable because callers serialize the breakdown.
    breakdown = dict.fromkeys(
        ('deepl_mismatch', 'category', 'length_format', 'roundtrip'), 0
    )

    # 1. DeepL mismatch component (0-40)
    if deepl_tr:
        similarity = calculate_similarity(dict_tr, deepl_tr)
        for upper_bound, points in ((0.3, 40), (0.5, 30), (0.7, 20), (0.9, 10)):
            if similarity < upper_bound:
                breakdown['deepl_mismatch'] = points
                break
        else:
            breakdown['deepl_mismatch'] = 5

    # 2. Category component (0-20)
    breakdown['category'] = CATEGORY_SCORES.get(category, 5)

    # 3. Length/format component (0-10)
    term_length = len(en_term) if en_term else 0
    if term_length < 3 or term_length > 50:
        length_points = 0
    elif term_length <= 30:
        length_points = 10
    else:
        length_points = 5
    breakdown['length_format'] = length_points

    # 4. Round-trip estimate (0-30)
    if dict_tr and en_term:
        shorter, longer = sorted((len(dict_tr), len(en_term)))
        ratio = shorter / longer if longer > 0 else 0
        if ratio > 0.5:
            roundtrip_points = 30
        elif ratio > 0.3:
            roundtrip_points = 20
        else:
            roundtrip_points = 10
        breakdown['roundtrip'] = roundtrip_points

    return {
        'total': sum(breakdown.values()),
        'breakdown': breakdown,
        'deepl_translation': deepl_tr,
    }

_INSERT_QUALITY_TERM_SQL = '''
    INSERT INTO quality_terms
    (original_id, canonical_en, canonical_tr, deepl_tr, category, quality_score, score_breakdown)
    VALUES (?, ?, ?, ?, ?, ?, ?)
'''


def _load_terms():
    """Read every scorable term from the source dictionary, opened read-only."""
    # mode=ro enforces the read-only intent and prevents sqlite3 from
    # silently creating an empty database when the file does not exist.
    source_uri = f"file:{quote(OLD_DB)}?mode=ro"
    conn = sqlite3.connect(source_uri, uri=True)
    try:
        conn.row_factory = sqlite3.Row
        return conn.execute('''
            SELECT term_id, canonical_en, canonical_tr, category
            FROM technical_terms
            WHERE canonical_en IS NOT NULL
            AND canonical_tr IS NOT NULL
            AND LENGTH(canonical_en) >= 3
        ''').fetchall()
    finally:
        conn.close()


def _create_quality_table(conn):
    """Drop and recreate the ``quality_terms`` output table."""
    conn.execute('DROP TABLE IF EXISTS quality_terms')
    conn.execute('''
        CREATE TABLE quality_terms (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            original_id INTEGER,
            canonical_en TEXT,
            canonical_tr TEXT,
            deepl_tr TEXT,
            category TEXT,
            quality_score INTEGER,
            score_breakdown TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    ''')
    conn.commit()


def _insert_batch(conn, rows):
    """Insert the buffered high-quality rows and commit them."""
    conn.executemany(_INSERT_QUALITY_TERM_SQL, rows)
    conn.commit()


def _percent(part, whole):
    """Return ``part / whole`` as a percentage, or 0.0 when ``whole`` is 0."""
    return part / whole * 100 if whole else 0.0


def _print_summary(conn, stats):
    """Print the score distribution and up to 20 of the best stored terms."""
    print("-" * 70)
    print("\n" + "=" * 70)
    print("📈 SONUÇLAR")
    print("=" * 70)

    total = stats['total']
    print(f"📊 Toplam işlenen: {total:,}")
    print(f"⭐ Yüksek kalite (80+): {stats['high_quality']:,} ({_percent(stats['high_quality'], total):.1f}%)")
    print(f"✅ İyi kalite (60-79): {stats['good_quality']:,} ({_percent(stats['good_quality'], total):.1f}%)")
    print(f"⚠️ Orta kalite (40-59): {stats['medium_quality']:,} ({_percent(stats['medium_quality'], total):.1f}%)")
    print(f"❌ Düşük kalite (0-39): {stats['low_quality']:,} ({_percent(stats['low_quality'], total):.1f}%)")
    print(f"🔴 DeepL hataları: {stats['deepl_errors']:,}")

    print("\n📝 ÖRNEK YÜKSEK KALİTELİ TERİMLER:")
    sample_rows = conn.execute('''
        SELECT quality_score, canonical_en, canonical_tr, deepl_tr
        FROM quality_terms
        ORDER BY quality_score DESC
        LIMIT 20
    ''').fetchall()
    for score, en, dict_tr, deepl_tr in sample_rows:
        print(f"   [{score}] {en[:30]:<30} → Sözlük: {dict_tr[:25]:<25} | DeepL: {str(deepl_tr)[:25]}")


def main():
    """Score every dictionary term and store the 80+ ones in the new DB.

    Steps:
      1. Read all terms from ``OLD_DB`` (opened read-only).
      2. Recreate the ``quality_terms`` table in ``NEW_DB``.
      3. For each term, request a DeepL translation, compute the quality
         score and buffer terms scoring 80 or more; buffered rows are
         written in batches of 100.
      4. Print the score distribution and a sample of the stored terms.

    The source is read before the output table is dropped, so a failure
    while reading leaves any previous output untouched. Both connections
    are closed even if an error occurs.
    """
    print("=" * 70)
    print("📊 SÖZLÜK KALİTE SKORLAMA - TÜM TERİMLER")
    print("=" * 70)
    print(f"⏰ Başlangıç: {datetime.now().strftime('%H:%M:%S')}")
    sys.stdout.flush()

    # Fetch all terms
    print("\n📥 Terimler okunuyor...")
    sys.stdout.flush()

    terms = _load_terms()
    total_terms = len(terms)
    print(f"✅ {total_terms:,} terim alındı")
    print("-" * 70)
    sys.stdout.flush()

    new_conn = sqlite3.connect(NEW_DB)
    try:
        _create_quality_table(new_conn)

        # Statistics
        stats = {
            'total': 0,
            'high_quality': 0,
            'good_quality': 0,
            'medium_quality': 0,
            'low_quality': 0,
            'deepl_errors': 0,
        }

        batch_size = 100  # flush to the DB every 100 high-quality terms
        batch_data = []
        start_time = time.time()

        for processed, term in enumerate(terms, start=1):
            term_id = term['term_id']
            en = term['canonical_en']
            dict_tr = term['canonical_tr']
            category = term['category'] or 'general'

            # DeepL translation
            deepl_tr, error = translate_with_deepl(en, "EN", "TR")
            if error:
                stats['deepl_errors'] += 1
                deepl_tr = None

            # Compute the score
            score_result = calculate_quality_score(en, dict_tr, deepl_tr, category)
            total_score = score_result['total']

            # Update statistics; only 80+ terms are stored
            stats['total'] += 1
            if total_score >= 80:
                stats['high_quality'] += 1
                batch_data.append((
                    term_id, en, dict_tr, deepl_tr, category,
                    total_score, json.dumps(score_result['breakdown']),
                ))
            elif total_score >= 60:
                stats['good_quality'] += 1
            elif total_score >= 40:
                stats['medium_quality'] += 1
            else:
                stats['low_quality'] += 1

            # Flush a full batch
            if len(batch_data) >= batch_size:
                _insert_batch(new_conn, batch_data)
                batch_data = []

            # Progress report (every 500 terms)
            if processed % 500 == 0:
                elapsed = time.time() - start_time
                rate = processed / elapsed
                remaining = (total_terms - processed) / rate / 60
                pct = processed / total_terms * 100
                print(f"   ✓ {processed:,}/{total_terms:,} işlendi... (80+: {stats['high_quality']:,}) [{pct:.1f}%] ~{remaining:.0f}dk kaldı")
                sys.stdout.flush()

            # Rate limiting
            time.sleep(0.03)

        # Flush the remaining partial batch
        if batch_data:
            _insert_batch(new_conn, batch_data)

        _print_summary(new_conn, stats)

        elapsed_total = (time.time() - start_time) / 60
        print(f"\n⏱️ Toplam süre: {elapsed_total:.1f} dakika")
        print(f"✅ Yeni DB kaydedildi: {NEW_DB}")
        print(f"⏰ Bitiş: {datetime.now().strftime('%H:%M:%S')}")
    finally:
        new_conn.close()

# Run the scoring job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

