#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Master Heavy Glossary - Qdrant Mining Script
~30,000 teknik terimi Qdrant VectorDB üzerinden doğrular ve zenginleştirir.
"""

import sqlite3
import csv
import os
from qdrant_client import QdrantClient
from qdrant_client.http import models
from tqdm import tqdm

# --- AYARLAR ---
QDRANT_HOST = "10.10.10.25"
QDRANT_PORT = 6333
COLLECTION_NAME = "machine_docs"

# Kaynak dosyalar (terminology-extractor'dan)
BASE_DIR = "/var/www/html/PEPCVSON"
SOURCE_DB = f"{BASE_DIR}/terminology-extractor/dictionary_export/dictionary_v2.db"
SOURCE_CSV = f"{BASE_DIR}/terminology-extractor/dictionary_export/dictionary_terms.csv"
TARGET_DB = f"{BASE_DIR}/glossary-mining/output/Master_Heavy_Glossary.db"

# --- FONKSİYONLAR ---

def extract_brand(file_path):
    """Dosya yolundan marka adını çıkarır
    Path: /mnt/storagebox/PDFLER/İŞ MAKİNASI GRUBU/JCB/JS220/dosya.pdf
    veya: /home/poyraz/pdfs/İŞ MAKİNASI GRUBU/JCB/JS220/dosya.pdf
    """
    if not file_path:
        return "UNKNOWN"
    
    try:
        parts = file_path.split('/')
        # /mnt/storagebox/PDFLER/GRUP/MARKA/MODEL/dosya.pdf
        # 0   1           2      3    4     5     6
        
        # Farklı path yapılarını dene
        for i, part in enumerate(parts):
            if "GRUBU" in part.upper() or "GRUP" in part.upper():
                # Bir sonraki klasör marka
                if i + 1 < len(parts):
                    return parts[i + 1]
        
        # Fallback: 5. veya 6. index'i dene
        if len(parts) >= 6:
            return parts[5] if parts[5] else parts[4]
            
    except Exception:
        pass
    
    return "UNKNOWN"


def get_context_snippet(full_text, term, context_chars=150):
    """Metinden terimin geçtiği yeri bulup etrafını keser"""
    if not full_text:
        return ""
    
    text_upper = full_text.upper()
    term_upper = term.upper()
    
    # Terimi bul
    start_index = text_upper.find(term_upper)
    
    if start_index != -1:
        # Terim etrafından kes
        cut_start = max(0, start_index - 50)
        cut_end = min(len(full_text), start_index + len(term) + context_chars)
        
        snippet = full_text[cut_start:cut_end].replace('\n', ' ').strip()
        return f"...{snippet}..."
    
    # Terim bulunamadıysa baştan al
    return full_text[:200].replace('\n', ' ').strip()


def calculate_validation_score(frequency, brand_count):
    """Frekans ve marka çeşitliliğine göre güvenilirlik skoru hesapla"""
    # Frekans skoru: 5+ sonuç = tam puan (1.0)
    freq_score = min(1.0, frequency / 5)
    
    # Marka bonusu: Her farklı marka +0.05 (max +0.2)
    brand_bonus = min(0.2, brand_count * 0.05)
    
    return round(freq_score + brand_bonus, 2)


# --- 1. QDRANT BAĞLANTISI ---
print("🔌 Qdrant'a bağlanılıyor...")
try:
    client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT, timeout=60)
    collection_info = client.get_collection(COLLECTION_NAME)
    print(f"✅ Qdrant Bağlantısı Başarılı: {QDRANT_HOST}:{QDRANT_PORT}")
    print(f"   Collection: {COLLECTION_NAME} ({collection_info.points_count:,} nokta)")
except Exception as e:
    print(f"❌ Qdrant Bağlantı Hatası: {e}")
    exit(1)

# --- 2. HEDEF VERİTABANI HAZIRLIĞI ---
print("\n📁 Hedef veritabanı hazırlanıyor...")
os.makedirs(os.path.dirname(TARGET_DB), exist_ok=True)

conn_dest = sqlite3.connect(TARGET_DB)
cursor_dest = conn_dest.cursor()

cursor_dest.execute("""
    CREATE TABLE IF NOT EXISTS master_terms (
        term_en TEXT PRIMARY KEY,
        term_tr TEXT,
        context_sample TEXT,
        related_brands TEXT,
        frequency INTEGER DEFAULT 0,
        validation_score REAL,
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    )
""")
conn_dest.commit()
print(f"   Hedef: {TARGET_DB}")

# --- 3. TERİMLERİ RAM'E YÜKLE ---
print("\n📦 Sözlük verileri toplanıyor...")
term_lookup = {}

# A) SQLite DB'den oku
if os.path.exists(SOURCE_DB):
    try:
        conn_src = sqlite3.connect(SOURCE_DB)
        cursor_src = conn_src.cursor()
        
        # Tablo yapısını kontrol et
        cursor_src.execute("PRAGMA table_info(technical_terms)")
        columns = [col[1] for col in cursor_src.fetchall()]
        
        # Sütun adlarına göre sorgu
        if 'canonical_en' in columns:
            query = "SELECT canonical_en, canonical_tr FROM technical_terms WHERE canonical_en IS NOT NULL"
        else:
            query = "SELECT term_en, term_tr FROM technical_terms WHERE term_en IS NOT NULL"
        
        for row in cursor_src.execute(query):
            if row[0] and row[1]:
                term_lookup[row[0].strip().upper()] = row[1].strip()
        
        conn_src.close()
        print(f"   ✓ DB'den {len(term_lookup)} terim yüklendi")
    except Exception as e:
        print(f"   ⚠️ DB okuma hatası: {e}")

# B) CSV'den oku
if os.path.exists(SOURCE_CSV):
    try:
        csv_count = 0
        with open(SOURCE_CSV, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                # Sütun adlarını dene
                en_term = row.get('canonical_en') or row.get('term_en') or row.get('EN', '')
                tr_term = row.get('canonical_tr') or row.get('term_tr') or row.get('TR', '')
                
                if en_term and tr_term:
                    key = en_term.strip().upper()
                    if key not in term_lookup:
                        term_lookup[key] = tr_term.strip()
                        csv_count += 1
        
        print(f"   ✓ CSV'den {csv_count} yeni terim eklendi")
    except Exception as e:
        print(f"   ⚠️ CSV okuma hatası: {e}")

print(f"\n🚀 TOPLAM {len(term_lookup):,} terim Qdrant üzerinde taranacak...")

# --- 4. ANA MADENCİLİK DÖNGÜSÜ ---
hits = 0
misses = 0
errors = 0

print("\n⛏️  Qdrant Mining başlıyor...\n")

for i, (en_term, tr_term) in enumerate(tqdm(term_lookup.items(), desc="Mining", unit="terim")):
    
    try:
        # MatchText ile tam kelime araması
        results, _ = client.scroll(
            collection_name=COLLECTION_NAME,
            scroll_filter=models.Filter(
                must=[
                    models.FieldCondition(
                        key="text",
                        match=models.MatchText(text=en_term)
                    )
                ]
            ),
            limit=10,  # Daha fazla sonuç = daha iyi marka analizi
            with_payload=True
        )
        
        if results:
            hits += 1
            
            # TÜM sonuçlardan marka topla
            brands = set()
            for hit in results:
                brand = extract_brand(hit.payload.get('file_path', ''))
                if brand and brand != "UNKNOWN":
                    brands.add(brand)
            
            brands_str = ", ".join(sorted(brands)) if brands else "UNKNOWN"
            
            # İlk sonuçtan context al
            first_hit = results[0].payload
            context = get_context_snippet(first_hit.get('text', ''), en_term)
            
            # Validation score hesapla
            frequency = len(results)
            validation_score = calculate_validation_score(frequency, len(brands))
            
            # Kaydet
            cursor_dest.execute("""
                INSERT OR REPLACE INTO master_terms 
                (term_en, term_tr, context_sample, related_brands, frequency, validation_score)
                VALUES (?, ?, ?, ?, ?, ?)
            """, (en_term, tr_term, context, brands_str, frequency, validation_score))
            
        else:
            # Qdrant'ta bulunamadı ama sözlükte var - yine de kaydet
            misses += 1
            cursor_dest.execute("""
                INSERT OR IGNORE INTO master_terms 
                (term_en, term_tr, context_sample, related_brands, frequency, validation_score)
                VALUES (?, ?, ?, ?, ?, ?)
            """, (en_term, tr_term, "", "", 0, 0.0))
            
    except Exception as e:
        errors += 1
        # Hata olsa bile devam et
        if errors <= 5:  # İlk 5 hatayı göster
            tqdm.write(f"⚠️ Hata ({en_term[:20]}...): {str(e)[:50]}")
    
    # Batch Commit (Her 100 işlemde bir)
    if i % 100 == 0:
        conn_dest.commit()

# Son commit
conn_dest.commit()
conn_dest.close()

# --- 5. SONUÇ RAPORU ---
print("\n" + "="*60)
print("🏁 MADENCİLİK TAMAMLANDI!")
print("="*60)
print(f"   ✅ Doğrulanan (Qdrant'ta bulunan): {hits:,}")
print(f"   ⚠️ Bulunamayan (frequency=0):      {misses:,}")
print(f"   ❌ Hata:                           {errors:,}")
print(f"\n   📁 Çıktı: {TARGET_DB}")
print("="*60)

# İstatistikler
conn_check = sqlite3.connect(TARGET_DB)
cursor_check = conn_check.cursor()

cursor_check.execute("SELECT COUNT(*) FROM master_terms WHERE frequency > 0")
verified = cursor_check.fetchone()[0]

cursor_check.execute("SELECT COUNT(*) FROM master_terms WHERE frequency >= 5")
high_freq = cursor_check.fetchone()[0]

cursor_check.execute("SELECT AVG(validation_score) FROM master_terms WHERE frequency > 0")
avg_score = cursor_check.fetchone()[0] or 0

print(f"\n📊 VERİTABANI İSTATİSTİKLERİ:")
print(f"   Doğrulanmış terimler: {verified:,}")
print(f"   Yüksek frekanslı (5+): {high_freq:,}")
print(f"   Ortalama güvenilirlik: {avg_score:.2f}")

conn_check.close()

