#!/usr/bin/env python3
"""
Sözlük Kalite Skorlama - DeepL ile
1000 rastgele terim test edilir, 80+ skorlu olanlar yeni DB'ye kopyalanır
"""

import json
import os
import sqlite3
import time
from datetime import datetime

import requests

# Configuration
# SECURITY NOTE(review): a DeepL API key is committed in this file. Prefer
# supplying it through the DEEPL_API_KEY environment variable; the literal is
# kept only as a fallback so existing runs keep working. It should be rotated
# and removed from source control.
DEEPL_API_KEY = os.environ.get("DEEPL_API_KEY") or "b121dc7e-8e98-427f-8984-54c4d4f0851e"
DEEPL_API_URL = "https://api.deepl.com/v2/translate"
OLD_DB = "/mnt/pdfs/dictionary.db"  # source database, read by main()
NEW_DB = "/mnt/pdfs/dictionary_quality.db"  # destination for high-scoring terms
SAMPLE_SIZE = 1000  # number of random terms sampled per run

# Category scores: points awarded to a term according to its category.
# The four tiers are named only by relative size.
_TIER_TOP = 20
_TIER_HIGH = 15
_TIER_MID = 10
_TIER_LOW = 5

CATEGORY_SCORES = {
    "parts-catalog": _TIER_TOP,
    "equipment": _TIER_TOP,
    "compound": _TIER_TOP,
    "parts": _TIER_TOP,
    "katalog-detailed": _TIER_HIGH,
    "deep-scan": _TIER_MID,
    "general": _TIER_MID,
    "catalog": _TIER_MID,
    "maden": _TIER_MID,
    "maden-vision": _TIER_MID,
    "hmk-katalog": _TIER_HIGH,
    "dynapac": _TIER_HIGH,
    "gemini-extract": _TIER_MID,
    "table": _TIER_LOW,
    "parenthesis": _TIER_LOW,
}

def translate_with_deepl(text: str, source_lang: str, target_lang: str) -> tuple:
    """Translate *text* with the DeepL REST API.

    Sends a single-item translation request to ``DEEPL_API_URL`` with a
    10-second timeout.

    Args:
        text: The text to translate.
        source_lang: Source language code passed to DeepL (e.g. ``"EN"``).
        target_lang: Target language code passed to DeepL (e.g. ``"TR"``).

    Returns:
        A ``(translation, error)`` pair. On success ``error`` is ``None``.
        On a transport failure, a non-200 status, or an unusable response
        body, ``translation`` is ``None`` and ``error`` is a short
        description of what went wrong.
    """
    try:
        response = requests.post(
            DEEPL_API_URL,
            headers={
                "Authorization": f"DeepL-Auth-Key {DEEPL_API_KEY}",
                "Content-Type": "application/json",
            },
            json={
                "text": [text],
                "source_lang": source_lang,
                "target_lang": target_lang,
            },
            timeout=10,
        )
    except requests.RequestException as e:
        # Connection errors, timeouts, invalid URLs, etc.
        return None, str(e)

    if response.status_code != 200:
        return None, f"HTTP {response.status_code}"

    # A 200 response can still carry a body that is not JSON or lacks a
    # translation; report that as its own error instead of "HTTP 200".
    try:
        return response.json()["translations"][0]["text"], None
    except (ValueError, KeyError, IndexError, TypeError) as e:
        return None, f"invalid DeepL response: {e!r}"

def normalize_text(text: str) -> str:
    """Normalize *text* so two strings can be compared loosely.

    Lower-cases the text, turns ``-`` and ``_`` into spaces and trims
    surrounding whitespace. Falsy input (``None`` or ``""``) yields ``""``.
    """
    if not text:
        return ""
    # str.lower() maps Turkish capital "İ" (U+0130) to "i" followed by the
    # combining dot U+0307, which would not compare equal to a plain "i".
    lowered = text.lower().replace("i\u0307", "i")
    # Strip last so a leading/trailing separator does not leave a stray space.
    return lowered.replace("-", " ").replace("_", " ").strip()

def calculate_similarity(text1: str, text2: str) -> float:
    """Return the word-level overlap of two texts as a value in [0, 1].

    Both texts are passed through ``normalize_text`` and split on whitespace;
    the result is the number of shared words divided by the number of distinct
    words across both texts. Returns 0.0 when either text is empty/None or
    contains no words.
    """
    if not (text1 and text2):
        return 0.0

    words_a = set(normalize_text(text1).split())
    words_b = set(normalize_text(text2).split())
    if not (words_a and words_b):
        return 0.0

    shared_count = len(words_a.intersection(words_b))
    distinct_count = len(words_a.union(words_b))
    if distinct_count > 0:
        return shared_count / distinct_count
    return 0.0

def calculate_quality_score(
    en_term: str,
    dict_tr: str,
    deepl_tr: str,
    category: str
) -> dict:
    """Compute a heuristic quality score for one dictionary entry.

    The total is the sum of four components:

    * ``deepl_mismatch`` (0-40): the less the dictionary translation overlaps
      with DeepL's, the higher the score (a differing translation is treated
      as a sign of a technical term). 0 when ``deepl_tr`` is empty/None.
    * ``category`` (0-20): looked up in ``CATEGORY_SCORES``; 5 if unknown.
    * ``length_format`` (0-10): 10 for English terms of 3-30 characters,
      5 for 31-50, otherwise 0.
    * ``roundtrip`` (0-30): a simplified stand-in based on the length ratio
      between ``dict_tr`` and ``en_term``; 0 when either is empty/None.

    Returns:
        A dict with ``'total'`` (sum of the components), ``'breakdown'``
        (the per-component scores) and ``'deepl_translation'``.
    """
    # 1. DeepL mismatch: first bound the similarity falls under wins.
    mismatch_points = 0
    if deepl_tr:
        similarity = calculate_similarity(dict_tr, deepl_tr)
        mismatch_points = 5  # (near-)identical to DeepL: general term
        for upper_bound, points in ((0.3, 40), (0.5, 30), (0.7, 20), (0.9, 10)):
            if similarity < upper_bound:
                mismatch_points = points
                break

    # 2. Category bonus.
    category_points = CATEGORY_SCORES.get(category, 5)

    # 3. Length/format of the English term.
    en_length = len(en_term) if en_term else 0
    if 3 <= en_length <= 30:
        length_points = 10
    elif 30 < en_length <= 50:
        length_points = 5
    else:
        length_points = 0

    # 4. Simplified round-trip estimate from the shorter/longer length ratio.
    roundtrip_points = 0
    if dict_tr and en_term:
        shorter, longer = sorted((len(dict_tr), len(en_term)))
        ratio = shorter / longer if longer > 0 else 0
        if ratio > 0.5:
            roundtrip_points = 30
        elif ratio > 0.3:
            roundtrip_points = 20
        else:
            roundtrip_points = 10

    breakdown = {
        'deepl_mismatch': mismatch_points,
        'category': category_points,
        'length_format': length_points,
        'roundtrip': roundtrip_points,
    }

    return {
        'total': sum(breakdown.values()),
        'breakdown': breakdown,
        'deepl_translation': deepl_tr
    }

def main():
    """Score a random sample of dictionary terms and store the best ones.

    Steps:
      1. Read up to ``SAMPLE_SIZE`` random rows from ``technical_terms`` in
         ``OLD_DB``.
      2. Translate each English term with DeepL and compute its quality score.
      3. Insert every term scoring 80 or more into ``quality_terms`` in
         ``NEW_DB`` (the table is created if missing).
      4. Print the score distribution and up to 15 sample high-scoring terms.

    Both SQLite connections are closed even if a step raises.
    """
    from contextlib import closing  # local import: only main() needs it

    print("=" * 70)
    print("📊 SÖZLÜK KALİTE SKORLAMA")
    print("=" * 70)
    print(f"⏰ Başlangıç: {datetime.now().strftime('%H:%M:%S')}")

    # Counters for the final summary.
    stats = {
        'total': 0,
        'high_quality': 0,  # 80+
        'good_quality': 0,  # 60-79
        'medium_quality': 0,  # 40-59
        'low_quality': 0,  # 0-39
        'deepl_errors': 0
    }
    high_quality_terms = []

    with closing(sqlite3.connect(OLD_DB)) as old_conn, \
            closing(sqlite3.connect(NEW_DB)) as new_conn:
        old_conn.row_factory = sqlite3.Row
        old_cursor = old_conn.cursor()
        new_cursor = new_conn.cursor()

        # Create the destination table on first run.
        new_cursor.execute('''
            CREATE TABLE IF NOT EXISTS quality_terms (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                original_id INTEGER,
                canonical_en TEXT,
                canonical_tr TEXT,
                deepl_tr TEXT,
                category TEXT,
                quality_score INTEGER,
                score_breakdown TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        new_conn.commit()

        # Fetch the random sample; the limit is bound as a parameter rather
        # than formatted into the SQL text.
        print(f"\n📥 {SAMPLE_SIZE} rastgele terim alınıyor...")
        old_cursor.execute('''
            SELECT term_id, canonical_en, canonical_tr, category 
            FROM technical_terms 
            WHERE canonical_en IS NOT NULL 
            AND canonical_tr IS NOT NULL
            AND LENGTH(canonical_en) >= 3
            ORDER BY RANDOM() 
            LIMIT ?
        ''', (SAMPLE_SIZE,))

        terms = old_cursor.fetchall()
        print(f"✅ {len(terms)} terim alındı")

        print("\n🔄 DeepL ile skorlama başlıyor...")
        print("-" * 70)

        for processed, term in enumerate(terms, start=1):
            term_id = term['term_id']
            en = term['canonical_en']
            dict_tr = term['canonical_tr']
            category = term['category'] or 'general'

            # A failed translation is counted and scored without DeepL input.
            deepl_tr, error = translate_with_deepl(en, "EN", "TR")
            if error:
                stats['deepl_errors'] += 1
                deepl_tr = None

            score_result = calculate_quality_score(en, dict_tr, deepl_tr, category)
            total_score = score_result['total']

            stats['total'] += 1
            if total_score >= 80:
                stats['high_quality'] += 1
                high_quality_terms.append({
                    'original_id': term_id,
                    'en': en,
                    'dict_tr': dict_tr,
                    'deepl_tr': deepl_tr,
                    'category': category,
                    'score': total_score,
                    'breakdown': score_result['breakdown']
                })
            elif total_score >= 60:
                stats['good_quality'] += 1
            elif total_score >= 40:
                stats['medium_quality'] += 1
            else:
                stats['low_quality'] += 1

            # Progress report every 100 terms.
            if processed % 100 == 0:
                print(f"   ✓ {processed}/{len(terms)} işlendi... (80+: {stats['high_quality']})")

            # Rate limiting: wait 50 ms between API calls.
            time.sleep(0.05)

        print("-" * 70)

        # Persist the high-scoring terms in one batch.
        print(f"\n💾 {len(high_quality_terms)} yüksek kaliteli terim kaydediliyor...")
        new_cursor.executemany('''
            INSERT INTO quality_terms 
            (original_id, canonical_en, canonical_tr, deepl_tr, category, quality_score, score_breakdown)
            VALUES (?, ?, ?, ?, ?, ?, ?)
        ''', [
            (
                term['original_id'],
                term['en'],
                term['dict_tr'],
                term['deepl_tr'],
                term['category'],
                term['score'],
                json.dumps(term['breakdown']),
            )
            for term in high_quality_terms
        ])
        new_conn.commit()

    total = stats['total']

    def pct(count):
        # An empty sample (no matching rows) must not divide by zero.
        return count / total * 100 if total else 0.0

    # Summary.
    print("\n" + "=" * 70)
    print("📈 SONUÇLAR")
    print("=" * 70)
    print(f"📊 Toplam test edilen: {total}")
    print(f"⭐ Yüksek kalite (80+): {stats['high_quality']} ({pct(stats['high_quality']):.1f}%)")
    print(f"✅ İyi kalite (60-79): {stats['good_quality']} ({pct(stats['good_quality']):.1f}%)")
    print(f"⚠️ Orta kalite (40-59): {stats['medium_quality']} ({pct(stats['medium_quality']):.1f}%)")
    print(f"❌ Düşük kalite (0-39): {stats['low_quality']} ({pct(stats['low_quality']):.1f}%)")
    print(f"🔴 DeepL hataları: {stats['deepl_errors']}")

    # Sample of high-scoring terms.
    print("\n📝 ÖRNEK YÜKSEK KALİTELİ TERİMLER (80+):")
    for term in high_quality_terms[:15]:
        print(f"   [{term['score']}] {term['en'][:30]:<30} → Sözlük: {term['dict_tr'][:25]:<25} | DeepL: {str(term['deepl_tr'])[:25]}")

    print(f"\n✅ Yeni DB kaydedildi: {NEW_DB}")
    print(f"⏰ Bitiş: {datetime.now().strftime('%H:%M:%S')}")

# Run the scoring pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()