#!/usr/bin/env python3
"""
Smart Translator - Gemini Powered Technical Translation
=======================================================
Replaces legacy Regex+DeepL approach with Context-Aware AI Translation.
Uses a SQLite terminology dictionary (path taken from DICTIONARY_PATH) to enforce technical terminology while maintaining natural grammar.
"""

import os
import json
import sqlite3
import re
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass

# Runtime configuration, read once at import time from the environment.
# DICTIONARY_PATH: location of the SQLite terminology database.
# OPENAI_API_KEY: not referenced by the code visible in this file; kept so
# that anything importing it from this module keeps working.
DICTIONARY_PATH = os.environ.get("DICTIONARY_PATH", "/mnt/pdfs/dictionary_v2.db")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

@dataclass
class TranslationResult:
    """Outcome of one translation request, as returned by SmartTranslator.translate()."""
    # The input text exactly as the caller passed it.
    original_text: str
    # The model output, or the unchanged input when no translation was produced.
    translated_text: str
    # Dictionary terms (term -> counterpart) handed to the model as mandatory
    # terminology; empty on a fallback result.
    used_terms: Dict[str, str]
    # 0.0 marks a fallback result (model unavailable or the call failed);
    # defaults to 1.0.
    confidence: float = 1.0

class SmartTranslator:
    """Dictionary-aware technical translator backed by a Gemini model.

    Terms are read from the ``technical_terms`` table of a SQLite database
    into an in-memory, bidirectional cache (EN -> TR and TR -> EN, all keys
    lower-cased). Terms found in an input text are passed to the model as
    mandatory terminology.

    When no API key is configured ``self.model`` is ``None``: ``translate``
    then returns the input unchanged with confidence 0.0 and
    ``correct_terminology`` returns its input as is. When the database file
    is missing ``self.conn`` is ``None`` and the term cache stays empty.
    """

    # Longest word sequence (in words) probed against the term cache.
    MAX_NGRAM_WORDS = 5

    def __init__(self, api_key: Optional[str] = None, dict_path: str = DICTIONARY_PATH):
        """Open the dictionary, load the term cache and set up the model.

        Args:
            api_key: Gemini API key; falls back to the ``GEMINI_API_KEY``
                environment variable when omitted.
            dict_path: Path of the SQLite terminology database.
        """
        self.api_key = api_key or os.getenv("GEMINI_API_KEY")
        self.dict_path = dict_path
        self.conn: Optional[sqlite3.Connection] = None
        self._term_cache: Dict[str, str] = {}
        self.model = None

        self._connect_db()
        self._load_dictionary()

        if self.api_key:
            # Imported lazily so the module can be used for dictionary
            # maintenance without the Gemini client being configured.
            import google.generativeai as genai
            genai.configure(api_key=self.api_key)
            self.model = genai.GenerativeModel('gemini-2.0-flash')
        else:
            print("⚠️ Gemini API Key missing for SmartTranslator")

    def close(self) -> None:
        """Close the dictionary connection; safe to call more than once."""
        if self.conn is not None:
            self.conn.close()
            self.conn = None

    def _connect_db(self) -> None:
        """Open ``self.dict_path`` if it exists; leave ``self.conn`` as None otherwise."""
        try:
            if not os.path.exists(self.dict_path):
                # Previously silent, which made an empty dictionary hard to diagnose.
                print(f"⚠️ Dictionary DB not found: {self.dict_path}")
                return
            self.conn = sqlite3.connect(self.dict_path)
            self.conn.row_factory = sqlite3.Row
        except Exception as e:
            print(f"⚠️ DB Connection Error: {e}")

    def _rollback(self) -> None:
        """Best-effort rollback after a failed write, so no transaction stays open."""
        if self.conn is None:
            return
        try:
            self.conn.rollback()
        except sqlite3.Error as e:
            print(f"⚠️ DB Rollback Error: {e}")

    def _forget_pair(self, en: Optional[str], tr: Optional[str]) -> None:
        """Drop the cache entries that represent the pair ``en <-> tr``.

        An entry is removed only while it still points at the other half of
        the pair, so a key that currently maps to a different term is kept.
        """
        en = (en or "").lower().strip()
        tr = (tr or "").lower().strip()
        if self._term_cache.get(en) == tr:
            del self._term_cache[en]
        if self._term_cache.get(tr) == en:
            del self._term_cache[tr]

    def _load_dictionary(self) -> None:
        """Load high-quality terms into memory for quick lookup.

        Both directions are cached. When a key occurs in several rows the
        first row read wins.
        """
        if not self.conn:
            return
        try:
            c = self.conn.cursor()
            c.execute('''
                SELECT canonical_tr, canonical_en, category 
                FROM technical_terms 
                WHERE (category IN ('parts-catalog', 'katalog-detailed', 'equipment', 'parts', 'auto-root', 'root', 'user-added') 
                OR category IS NULL)
                AND length(canonical_en) > 2 AND length(canonical_tr) > 2
            ''')
            count = 0
            for row in c.fetchall():
                tr = row['canonical_tr'].lower().strip()
                en = row['canonical_en'].lower().strip()
                self._term_cache.setdefault(tr, en)
                self._term_cache.setdefault(en, tr)
                count += 1
            print(f"✓ SmartTranslator loaded {count} terms.")
        except Exception as e:
            print(f"⚠️ Dictionary Load Error: {e}")

    def add_term(self, en: str, tr: str, category: str = 'user-added') -> bool:
        """Add a new term to the dictionary database and cache.

        If a row already has the same English or Turkish term, that row is
        updated instead of inserting a duplicate.

        Args:
            en: English term.
            tr: Turkish term.
            category: Value stored in the ``category`` column.

        Returns:
            True when the term was stored; False when there is no database
            connection, a term is empty, or the write failed.
        """
        if not self.conn:
            return False
        try:
            en = en.strip().lower()
            tr = tr.strip().lower()
            if not en or not tr:
                print("⚠️ Term Add Error: empty term")
                return False

            c = self.conn.cursor()
            # Look the term up first so an existing row is updated rather
            # than tripping a UNIQUE constraint on insert.
            c.execute(
                "SELECT id, canonical_en, canonical_tr FROM technical_terms "
                "WHERE canonical_en = ? OR canonical_tr = ?",
                (en, tr),
            )
            existing = c.fetchone()

            if existing:
                c.execute("""
                    UPDATE technical_terms 
                    SET canonical_tr = ?, canonical_en = ?, category = ?, updated_at = CURRENT_TIMESTAMP
                    WHERE id = ?
                """, (tr, en, category, existing[0]))
            else:
                c.execute("""
                    INSERT INTO technical_terms (canonical_en, canonical_tr, category, confidence_score)
                    VALUES (?, ?, ?, 1.0)
                """, (en, tr, category))

            self.conn.commit()

            # An update replaces the old pair: drop its cache entries so the
            # previous counterpart does not linger as a stale translation.
            if existing:
                self._forget_pair(existing[1], existing[2])
            self._term_cache[en] = tr
            self._term_cache[tr] = en
            print(f"✓ Term added/updated: {en} <-> {tr}")
            return True
        except Exception as e:
            self._rollback()
            print(f"⚠️ Term Add Error: {e}")
            return False

    def delete_term(self, term: str) -> bool:
        """Delete a term by English or Turkish key.

        Every row whose English or Turkish term equals ``term`` is removed
        from the database, together with the matching cache entries.

        Returns:
            True when the delete statement ran (even if no row matched);
            False when there is no connection or the delete failed.
        """
        if not self.conn:
            return False
        try:
            term = term.strip().lower()
            c = self.conn.cursor()

            # Remember the pairs that are about to disappear so exactly
            # their cache entries can be removed afterwards.
            c.execute(
                "SELECT canonical_en, canonical_tr FROM technical_terms "
                "WHERE canonical_en = ? OR canonical_tr = ?",
                (term, term),
            )
            removed_pairs = [(row[0], row[1]) for row in c.fetchall()]

            c.execute("DELETE FROM technical_terms WHERE canonical_en = ? OR canonical_tr = ?", (term, term))
            self.conn.commit()

            for old_en, old_tr in removed_pairs:
                self._forget_pair(old_en, old_tr)

            # The term itself always leaves the cache. Its counterpart is
            # removed only if it still points back at the term; otherwise
            # that key belongs to another dictionary row and must survive.
            partner = self._term_cache.pop(term, None)
            if partner is not None and self._term_cache.get(partner) == term:
                del self._term_cache[partner]

            print(f"✓ Term deleted: {term}")
            return True
        except Exception as e:
            self._rollback()
            print(f"⚠️ Term Delete Error: {e}")
            return False

    def _find_relevant_terms(self, text: str) -> Dict[str, str]:
        """Return the dictionary terms that occur in ``text``.

        The text is lower-cased and split into word tokens, and every run of
        1 to ``MAX_NGRAM_WORDS`` consecutive tokens is looked up in the
        cache, so both single words and compound terms such as
        "hidrolik pompa" are found. The cache holds both directions, so no
        separate reverse lookup is needed.

        Returns:
            Mapping of each matched term to its cached counterpart.
        """
        if not text:
            return {}

        tokens = re.findall(r'\b\w+\b', text.lower().strip())
        found_terms: Dict[str, str] = {}
        for n in range(1, self.MAX_NGRAM_WORDS + 1):
            for i in range(len(tokens) - n + 1):
                gram = ' '.join(tokens[i:i + n])
                if gram in self._term_cache:
                    found_terms[gram] = self._term_cache[gram]
        return found_terms

    def get_relevant_terms(self, text: str) -> Dict[str, str]:
        """Public wrapper around the dictionary term lookup for ``text``."""
        return self._find_relevant_terms(text)

    def translate(self, text: str, source_lang: str = "EN", target_lang: str = "TR", terms_override: Optional[Dict[str, str]] = None) -> TranslationResult:
        """Translate ``text`` into ``target_lang`` using the dictionary terms.

        Args:
            text: Text to translate.
            source_lang: Accepted for interface compatibility; the prompt
                does not use it.
            target_lang: Language named in the prompt as the target.
            terms_override: Terms to enforce instead of the ones detected in
                ``text`` (for example terms found in a corrected query).

        Returns:
            A TranslationResult. On any failure (no model, API error, empty
            model response) the input is returned unchanged with confidence
            0.0. Blank input is returned unchanged without calling the model.
        """
        if not self.model:
            return TranslationResult(text, text, {}, 0.0)

        # Nothing to translate: do not ask the model to invent content.
        if not text or not text.strip():
            return TranslationResult(text, text, {}, 1.0)

        # Use caller-supplied terms when given; otherwise detect them in the
        # raw text.
        relevant_terms = terms_override if terms_override is not None else self._find_relevant_terms(text)

        prompt = f"""
        You are an expert technical translator specializing in CONSTRUCTION MACHINERY and HEAVY EQUIPMENT (Excavators, Loaders, Backhoes, Dozers, etc.).
        
        DOMAIN CONTEXT:
        - The text belongs to the "World of Heavy Machinery" (İş Makinası Dünyası).
        - Avoid generic, medical, or standard IT translations. 
        - For example: "Travel" means "Seyir/Yürüyüş", not "Seyahat". "Shoe" means "Pabuç", not "Ayakkabı".
        
        TASK:
        Translate the input text to {target_lang} with high technical accuracy.
        
        CRITICAL RULES:
        1. If text is already in {target_lang}, just correct terminology.
        2. Maintain sentence structure and style.
        3. STRICTLY USE these dictionary terms in your translation: 
           {json.dumps(relevant_terms, ensure_ascii=False)}
        4. Output ONLY the translated text. Do not add explanations.
        
        Input Text: "{text}"
        """

        try:
            response = self.model.generate_content(prompt)
            translated_text = response.text.strip()
        except Exception as e:
            print(f"Translation Error: {e}")
            return TranslationResult(text, text, {}, 0.0)

        if not translated_text:
            # An empty answer is a failed translation, not a confident one.
            print("Translation Error: empty response")
            return TranslationResult(text, text, {}, 0.0)
        return TranslationResult(text, translated_text, relevant_terms, 1.0)

    @staticmethod
    def _strip_wrapping_quotes(text: str) -> str:
        """Remove one pair of double quotes that encloses the whole text.

        Quotes inside the text are kept, so inch marks such as ``12" hose``
        survive.
        """
        text = text.strip()
        if len(text) >= 2 and text[0] == '"' and text[-1] == '"':
            return text[1:-1].strip()
        return text

    def correct_terminology(self, text: str) -> str:
        """Fix typos and broken grammar in a user query without translating it.

        Returns:
            The corrected text, or ``text`` unchanged when no model is
            configured, the input is blank, the call fails, or the model
            returns nothing.
        """
        if not self.model:
            return text
        if not text or not text.strip():
            return text

        # Correction only; the prompt forbids translation.
        prompt = f"""
        Act as a spell checker and grammar corrector for Heavy Equipment/Machinery terminology.
        
        Task: 
        1. Correct spelling mistakes (typos) in the input text.
        2. Fix broken sentence structures.
        3. Normalize technical terms (e.g. "yüüyüş" -> "yürüyüş").
        4. Do NOT translate. Keep the original language (Turkish or English).
        5. Output ONLY the corrected text.
        
        Input: "{text}"
        """

        try:
            response = self.model.generate_content(prompt)
            # The prompt quotes the input, so the model may echo the quotes
            # around its answer; strip only that wrapping pair.
            corrected_text = self._strip_wrapping_quotes(response.text)
        except Exception as e:
            print(f"Correction Error: {e}")
            return text

        # Never replace the user's query with an empty string.
        return corrected_text or text

# Smoke test: building the translator connects to the dictionary database,
# loads the term cache and reports a missing Gemini API key.
if __name__ == "__main__":
    translator = SmartTranslator()
