#!/usr/bin/env python3
"""
Dictionary quality scoring - parallel version.
Processes terms quickly using MAX_WORKERS concurrent threads.
"""

import json
import os
import sqlite3
import sys
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime

import requests

# Configuration
# SECURITY: credentials belong in the environment, not in source control.
# The DEEPL_API_KEY environment variable takes precedence; the literal below is
# kept only as a backward-compatible fallback and should be rotated and removed.
DEEPL_API_KEY = os.environ.get("DEEPL_API_KEY") or "b121dc7e-8e98-427f-8984-54c4d4f0851e"
DEEPL_API_URL = "https://api.deepl.com/v2/translate"
OLD_DB = "/mnt/pdfs/dictionary.db"  # source database (read only by this script)
NEW_DB = "/mnt/pdfs/dictionary_quality.db"  # output database for high-scoring terms
MAX_WORKERS = 50  # worker thread count; tuned for a Ryzen 9 7950X + 128GB RAM host

# Thread-safe counters: every mutation of `stats` must happen while holding `lock`.
lock = threading.Lock()
stats = {
    'processed': 0,
    'high_quality': 0,
    'good_quality': 0,
    'medium_quality': 0,
    'low_quality': 0,
    'deepl_errors': 0
}

# Points awarded per source category; categories not listed here get a
# fallback value chosen by the scoring code.
CATEGORY_SCORES = {
    'parts-catalog': 20, 'equipment': 20, 'compound': 20, 'parts': 20,
    'katalog-detailed': 15, 'hmk-katalog': 15, 'dynapac': 15,
    'deep-scan': 10, 'general': 10, 'catalog': 10, 'maden': 10,
    'maden-vision': 10, 'gemini-extract': 10,
    'table': 5, 'parenthesis': 5,
}

def translate_with_deepl(text: str, max_retries: int = 5) -> tuple:
    """Translate *text* from English to Turkish through the DeepL API.

    The request carries a heavy-machinery context string and asks for the
    more formal register, so technical terms are translated in that domain.

    Args:
        text: English source text to translate.
        max_retries: Maximum number of retries after an HTTP 429 response
            before giving up. Retries use exponential backoff (1s, 2s, 4s,
            ... capped at 30s) instead of unbounded recursion.

    Returns:
        A ``(translation, error)`` pair: ``(str, None)`` on success and
        ``(None, message)`` on any failure. Request and decoding errors are
        reported through the error element rather than raised.
    """
    headers = {
        "Authorization": f"DeepL-Auth-Key {DEEPL_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "text": [text],
        "source_lang": "EN",
        "target_lang": "TR",
        "context": "Heavy machinery, excavator, loader, construction equipment, hydraulic systems, service manual, parts catalog",
        "formality": "prefer_more"  # technical / formal tone
    }
    delay = 1.0

    # Broad catch is deliberate: this runs inside worker threads and callers
    # rely on the (None, error) contract instead of exceptions.
    try:
        for attempt in range(max_retries + 1):
            response = requests.post(
                DEEPL_API_URL,
                headers=headers,
                json=payload,
                timeout=15
            )

            if response.status_code == 200:
                result = response.json()
                if "translations" in result and result["translations"]:
                    return result["translations"][0]["text"], None
                # A 200 without usable translations is still a failure.
                return None, f"HTTP {response.status_code}"

            if response.status_code != 429:
                return None, f"HTTP {response.status_code}"

            # Rate limited: back off and retry, but only a bounded number of times.
            if attempt < max_retries:
                time.sleep(delay)
                delay = min(delay * 2, 30.0)

        return None, f"HTTP 429 (rate limited after {max_retries} retries)"
    except Exception as e:
        return None, str(e)

def calculate_similarity(text1: str, text2: str) -> float:
    """Return the Jaccard overlap between the word sets of two texts.

    Each text is lowercased, stripped, has hyphens replaced by spaces and is
    split on whitespace; the result is ``|A ∩ B| / |A ∪ B|`` over those
    word sets. Returns 0.0 when either text is falsy or yields no words.
    """
    def _word_set(raw):
        # Hyphenated compounds count as separate words.
        return set(raw.lower().strip().replace("-", " ").split())

    if not (text1 and text2):
        return 0.0

    words_a = _word_set(text1)
    words_b = _word_set(text2)
    if not (words_a and words_b):
        return 0.0

    shared = words_a.intersection(words_b)
    combined = words_a.union(words_b)
    return len(shared) / len(combined)

def calculate_score(en_term, dict_tr, deepl_tr, category):
    """Compute the quality score of a dictionary entry.

    The score is the sum of four components:

    * DeepL disagreement (0-40): based on the word-set similarity between
      the dictionary translation and the DeepL translation; the lower the
      similarity, the more points. Skipped when ``deepl_tr`` is falsy.
    * Category (5-20): looked up in ``CATEGORY_SCORES``, 5 if unknown.
    * Length (0-10): 10 for English terms of 3-30 characters, 5 for any
      other non-empty term of at most 50 characters.
    * Length ratio (10-30): shorter/longer length of the two terms; only
      applied when both terms are non-empty.

    Args:
        en_term: English term.
        dict_tr: Turkish translation stored in the dictionary.
        deepl_tr: Turkish translation returned by DeepL, or a falsy value.
        category: Category key of the term.

    Returns:
        The integer score.
    """
    # DeepL disagreement (0-40)
    mismatch_points = 0
    if deepl_tr:
        similarity = calculate_similarity(dict_tr, deepl_tr)
        for upper_bound, points in ((0.3, 40), (0.5, 30), (0.7, 20), (0.9, 10)):
            if similarity < upper_bound:
                mismatch_points = points
                break
        else:
            mismatch_points = 5

    # Category (0-20)
    category_points = CATEGORY_SCORES.get(category, 5)

    # Length (0-10)
    length_points = 0
    if en_term:
        term_length = len(en_term)
        if 3 <= term_length <= 30:
            length_points = 10
        elif term_length <= 50:
            length_points = 5

    # Length ratio (0-30)
    ratio_points = 0
    if dict_tr and en_term:
        shorter, longer = sorted((len(dict_tr), len(en_term)))
        ratio = shorter / longer
        if ratio > 0.5:
            ratio_points = 30
        elif ratio > 0.3:
            ratio_points = 20
        else:
            ratio_points = 10

    return mismatch_points + category_points + length_points + ratio_points

def process_term(term_data):
    """Translate and score a single term, updating the shared counters.

    Args:
        term_data: ``(term_id, english, dictionary_turkish, category)`` tuple.

    Returns:
        ``(term_id, english, dictionary_turkish, deepl_turkish, category,
        score)`` when the score is at least 80, otherwise ``None``.
    """
    term_id, english, dictionary_tr, category = term_data

    deepl_tr, error = translate_with_deepl(english)

    # Count the attempt (and any DeepL failure) under the shared lock.
    with lock:
        stats['processed'] += 1
        if error:
            stats['deepl_errors'] += 1

    # A missing category is scored as 'general'; the stored value is untouched.
    score = calculate_score(english, dictionary_tr, deepl_tr, category or 'general')

    if score >= 80:
        bucket = 'high_quality'
    elif score >= 60:
        bucket = 'good_quality'
    elif score >= 40:
        bucket = 'medium_quality'
    else:
        bucket = 'low_quality'

    with lock:
        stats[bucket] += 1

    # Only high-quality terms are handed back for persistence.
    if bucket != 'high_quality':
        return None
    return (term_id, english, dictionary_tr, deepl_tr, category, score)

def _insert_quality_terms(cursor, rows):
    """Bulk-insert high-scoring result rows into the ``quality_terms`` table."""
    cursor.executemany('''
        INSERT INTO quality_terms (original_id, canonical_en, canonical_tr, deepl_tr, category, quality_score)
        VALUES (?, ?, ?, ?, ?, ?)
    ''', rows)


def _percent(part, whole):
    """Return ``part / whole`` as a percentage, or 0.0 when ``whole`` is zero."""
    return part / whole * 100 if whole else 0.0


def main():
    """Run the full scoring pipeline.

    Reads every term with a non-null English/Turkish pair (English length of
    at least 3) from ``OLD_DB``, scores the terms in parallel with
    ``MAX_WORKERS`` threads, recreates the ``quality_terms`` table in
    ``NEW_DB`` and stores the rows returned by ``process_term`` there in
    batches of 500. Progress is printed roughly every two seconds and a
    summary is printed at the end.

    Both database connections are closed even if an error occurs, and the
    summary no longer divides by zero when no term was processed.
    """
    print("=" * 70)
    print("📊 SÖZLÜK KALİTE SKORLAMA - PARALEL ({} thread)".format(MAX_WORKERS))
    print("=" * 70)
    start_time = datetime.now()
    print(f"⏰ Başlangıç: {start_time.strftime('%H:%M:%S')}")
    sys.stdout.flush()

    # Fetch the terms from the old database.
    old_conn = sqlite3.connect(OLD_DB)
    try:
        old_cursor = old_conn.cursor()

        print("\n📥 Terimler okunuyor...")
        old_cursor.execute('''
            SELECT term_id, canonical_en, canonical_tr, category 
            FROM technical_terms 
            WHERE canonical_en IS NOT NULL 
            AND canonical_tr IS NOT NULL
            AND LENGTH(canonical_en) >= 3
        ''')
        terms = old_cursor.fetchall()
    finally:
        old_conn.close()
    total = len(terms)

    print(f"✅ {total:,} terim alındı")
    print("-" * 70)
    sys.stdout.flush()

    # Prepare the new database; any previous results table is dropped.
    new_conn = sqlite3.connect(NEW_DB)
    try:
        new_cursor = new_conn.cursor()
        new_cursor.execute('DROP TABLE IF EXISTS quality_terms')
        new_cursor.execute('''
            CREATE TABLE quality_terms (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                original_id INTEGER,
                canonical_en TEXT,
                canonical_tr TEXT,
                deepl_tr TEXT,
                category TEXT,
                quality_score INTEGER,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        new_conn.commit()

        # Parallel processing. All SQLite writes stay on this (main) thread;
        # workers only translate and score.
        high_quality_results = []
        last_print_time = time.time()

        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = [executor.submit(process_term, t) for t in terms]

            for future in as_completed(futures):
                result = future.result()
                if result:
                    high_quality_results.append(result)

                # Show progress about every 2 seconds.
                if time.time() - last_print_time >= 2:
                    done = stats['processed']
                    pct = _percent(done, total)
                    elapsed = (datetime.now() - start_time).total_seconds()
                    rate = done / elapsed if elapsed > 0 else 0
                    remaining = (total - done) / rate / 60 if rate > 0 else 0

                    print(f"   ✓ {done:,}/{total:,} [{pct:.1f}%] | 80+: {stats['high_quality']:,} | {rate:.0f}/sn | ~{remaining:.0f}dk kaldı")
                    sys.stdout.flush()
                    last_print_time = time.time()

                # Persist in batches of 500 to bound memory and commit cost.
                if len(high_quality_results) >= 500:
                    _insert_quality_terms(new_cursor, high_quality_results)
                    new_conn.commit()
                    high_quality_results = []

        # Persist whatever is left over from the last partial batch.
        if high_quality_results:
            _insert_quality_terms(new_cursor, high_quality_results)
            new_conn.commit()

        # Summary
        print("-" * 70)
        print("\n" + "=" * 70)
        print("📈 SONUÇLAR")
        print("=" * 70)

        total_processed = stats['processed']
        print(f"📊 Toplam işlenen: {total_processed:,}")
        print(f"⭐ Yüksek kalite (80+): {stats['high_quality']:,} ({_percent(stats['high_quality'], total_processed):.1f}%)")
        print(f"✅ İyi kalite (60-79): {stats['good_quality']:,} ({_percent(stats['good_quality'], total_processed):.1f}%)")
        print(f"⚠️ Orta kalite (40-59): {stats['medium_quality']:,} ({_percent(stats['medium_quality'], total_processed):.1f}%)")
        print(f"❌ Düşük kalite (0-39): {stats['low_quality']:,} ({_percent(stats['low_quality'], total_processed):.1f}%)")
        print(f"🔴 DeepL hataları: {stats['deepl_errors']:,}")

        # Sample of the best-scoring stored terms.
        print("\n📝 ÖRNEK YÜKSEK KALİTELİ TERİMLER:")
        new_cursor.execute('SELECT quality_score, canonical_en, canonical_tr, deepl_tr FROM quality_terms ORDER BY quality_score DESC LIMIT 15')
        for row in new_cursor.fetchall():
            print(f"   [{row[0]}] {str(row[1])[:30]:<30} → Sözlük: {str(row[2])[:20]:<20} | DeepL: {str(row[3])[:20]}")

        end_time = datetime.now()
        duration = (end_time - start_time).total_seconds() / 60
        print(f"\n⏱️ Toplam süre: {duration:.1f} dakika")
        print(f"✅ Yeni DB: {NEW_DB}")
        print(f"⏰ Bitiş: {end_time.strftime('%H:%M:%S')}")
    finally:
        new_conn.close()

# Script entry point: run the scoring pipeline only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()

