#!/usr/bin/env python3
"""
Glossary Vector Validation System
- Merge glossaries
- Separate by type (single word, term, sentence)
- Validate against corpus via Qdrant
- Export verified glossaries
"""

import os
import sys
import csv
import json
import re
from typing import List, Dict, Tuple, Optional
from datetime import datetime
from collections import defaultdict
from tqdm import tqdm

# Verb list (used for sentence detection)
VERB_PATTERNS = [
    'check', 'install', 'remove', 'replace', 'adjust', 'verify', 'ensure',
    'inspect', 'tighten', 'loosen', 'drain', 'fill', 'clean', 'measure',
    'connect', 'disconnect', 'assemble', 'disassemble', 'test', 'repair',
    'maintain', 'service', 'lubricate', 'torque', 'align', 'calibrate',
    'operate', 'start', 'stop', 'run', 'turn', 'press', 'push', 'pull',
    'lift', 'lower', 'raise', 'move', 'rotate', 'apply', 'use', 'set',
    'make', 'do', 'be', 'have', 'get', 'put', 'take', 'give', 'keep',
    'refer', 'see', 'note', 'confirm', 'perform', 'complete', 'follow'
]

VERB_REGEX = re.compile(r'\b(' + '|'.join(VERB_PATTERNS) + r')(s|ed|ing|e)?\b', re.IGNORECASE)
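# Note: findall() returns (verb, suffix) tuples because the pattern has two
# capturing groups; classify_entry() keeps only the verb. "check", "checks",
# "checked" and "checking" all match, but inflected forms of "e"-ending verbs
# (e.g. "removed", "removing") slip past this simple suffix pattern.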


def load_csv(filepath: str, has_header: bool = True) -> List[Dict]:
    """Load CSV file"""
    entries = []
    if not os.path.exists(filepath):
        return entries
    
    with open(filepath, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        if has_header:
            next(reader, None)
        for row in reader:
            if len(row) >= 2:
                entries.append({
                    'source': row[0].strip(),
                    'target': row[1].strip()
                })
    return entries


def load_tsv(filepath: str) -> List[Dict]:
    """Load TSV file"""
    entries = []
    if not os.path.exists(filepath):
        return entries
    
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) >= 2:
                entries.append({
                    'source': parts[0].strip(),
                    'target': parts[1].strip()
                })
    return entries


def merge_glossaries(deepl_file: str, google_file: str) -> Tuple[List[Dict], Dict]:
    """
    Merge DeepL and Google glossaries.
    Google entries take priority on conflicts.
    Returns (merged_entries, merge_stats).
    """
    print("📂 Loading glossaries...")
    
    # Load both
    deepl_entries = load_csv(deepl_file, has_header=False)
    google_entries = load_csv(google_file, has_header=True)
    
    print(f"   DeepL: {len(deepl_entries)} entries")
    print(f"   Google: {len(google_entries)} entries")
    
    # Build index (lowercase key for deduplication)
    merged = {}
    
    # Add DeepL first
    for entry in deepl_entries:
        key = entry['source'].lower().strip()
        merged[key] = {
            'source': entry['source'],
            'target': entry['target'],
            'origin': 'deepl'
        }
    
    # Add Google entries (Google takes priority and overwrites DeepL on conflict)
    conflicts = 0
    for entry in google_entries:
        key = entry['source'].lower().strip()
        if key in merged:
            conflicts += 1
        merged[key] = {
            'source': entry['source'],
            'target': entry['target'],
            'origin': 'google'
        }
    
    result = list(merged.values())
    print(f"✅ Merged: {len(result)} unique entries ({conflicts} conflicts, Google prioritized)")
    
    merge_stats = {
        'deepl_count': len(deepl_entries),
        'google_count': len(google_entries),
        'conflicts': conflicts
    }
    return result, merge_stats


def classify_entry(source: str) -> Tuple[str, List[str]]:
    """
    Classify entry as: single_word, term, or sentence
    Returns: (type, detected_verbs)
    """
    words = source.split()
    word_count = len(words)
    
    # Detect verbs
    verbs_found = VERB_REGEX.findall(source)
    verb_list = [v[0].lower() for v in verbs_found]
    
    # Classification rules
    if word_count <= 2:
        return 'single_word', verb_list
    elif word_count <= 4 and not verb_list:
        return 'term', verb_list
    else:
        # 5+ words OR has verbs = sentence
        return 'sentence', verb_list


def separate_by_type(entries: List[Dict]) -> Dict[str, List[Dict]]:
    """
    Separate entries by type
    Returns dict with keys: single_word, term, sentence
    """
    print("\n🔀 Separating by type...")
    
    result = {
        'single_word': [],
        'term': [],
        'sentence': []
    }
    
    for entry in tqdm(entries, desc="Classifying"):
        entry_type, verbs = classify_entry(entry['source'])
        
        classified_entry = {
            **entry,
            'type': entry_type,
            'word_count': len(entry['source'].split()),
            'detected_verbs': verbs
        }
        
        result[entry_type].append(classified_entry)
    
    print(f"✅ Classification complete:")
    print(f"   Single words (1-2): {len(result['single_word'])}")
    print(f"   Terms (3-4, no verbs): {len(result['term'])}")
    print(f"   Sentences (5+ or verbs): {len(result['sentence'])}")
    
    return result


def save_translation_memory(sentences: List[Dict], output_file: str):
    """Save sentences to translation memory CSV"""
    print(f"\n💾 Saving translation memory: {output_file}")
    
    with open(output_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['source', 'target', 'word_count', 'detected_verbs', 'origin'])
        
        for entry in sentences:
            writer.writerow([
                entry['source'],
                entry['target'],
                entry['word_count'],
                ';'.join(entry['detected_verbs']),
                entry.get('origin', 'unknown')
            ])
    
    print(f"✅ Saved {len(sentences)} sentences to translation memory")


def save_glossary_terms(entries: List[Dict], output_file: str):
    """Save glossary terms to JSON for next step"""
    print(f"\n💾 Saving glossary terms: {output_file}")
    
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(entries, f, ensure_ascii=False, indent=2)
    
    print(f"✅ Saved {len(entries)} terms for validation")


class QdrantValidator:
    """Validate terms against Qdrant corpus"""
    
    def __init__(self, host: str = "10.10.10.25", port: int = 6333, collection: str = "machine_docs"):
        self.host = host
        self.port = port
        self.collection = collection
        self.qdrant_client = None
        self.openai_client = None
    
    def connect(self) -> bool:
        """Connect to Qdrant and OpenAI"""
        try:
            from qdrant_client import QdrantClient
            self.qdrant_client = QdrantClient(host=self.host, port=self.port, timeout=60)
            info = self.qdrant_client.get_collection(self.collection)
            print(f"✅ Qdrant connected: {info.points_count:,} vectors")
        except Exception as e:
            print(f"❌ Qdrant connection failed: {e}")
            return False
        
        try:
            from openai import OpenAI
            self.openai_client = OpenAI()
            print("✅ OpenAI connected")
        except Exception as e:
            print(f"❌ OpenAI connection failed: {e}")
            return False
        
        return True
    
    def get_embedding(self, text: str) -> Optional[List[float]]:
        """Get embedding for text"""
        try:
            response = self.openai_client.embeddings.create(
                input=text,
                model="text-embedding-3-large"
            )
            return response.data[0].embedding
        except Exception as e:
            print(f"⚠️ Embedding error: {e}")
            return None
    
    def get_embeddings_batch(self, texts: List[str]) -> List[Optional[List[float]]]:
        """Get embeddings for batch of texts"""
        try:
            response = self.openai_client.embeddings.create(
                input=texts,
                model="text-embedding-3-large"
            )
            return [item.embedding for item in response.data]
        except Exception as e:
            print(f"⚠️ Batch embedding error: {e}")
            return [None] * len(texts)
    
    def search(self, vector: List[float], limit: int = 3) -> List[Dict]:
        """Search in Qdrant"""
        try:
            results = self.qdrant_client.query_points(
                collection_name=self.collection,
                query=vector,
                limit=limit,
                with_payload=True
            )
            return [{'score': r.score, 'payload': r.payload} for r in results.points]
        except Exception as e:
            print(f"⚠️ Search error: {e}")
            return []
    
    def validate_batch(self, entries: List[Dict], batch_size: int = 50) -> List[Dict]:
        """Validate entries in batches"""
        print(f"\n🔍 Validating {len(entries)} terms against corpus...")
        
        validated = []
        
        for i in tqdm(range(0, len(entries), batch_size), desc="Validating batches"):
            batch = entries[i:i+batch_size]
            sources = [e['source'] for e in batch]
            
            # Get embeddings
            embeddings = self.get_embeddings_batch(sources)
            
            for j, entry in enumerate(batch):
                embedding = embeddings[j] if j < len(embeddings) else None
                
                if embedding:
                    results = self.search(embedding, limit=3)
                    best_score = results[0]['score'] if results else 0
                    
                    # Determine status
                    if best_score >= 0.7:
                        status = 'verified'
                    elif best_score >= 0.5:
                        status = 'uncertain'
                    else:
                        status = 'rejected'
                else:
                    best_score = 0
                    status = 'error'
                
                validated.append({
                    **entry,
                    'confidence': best_score,
                    'status': status,
                    'validated_at': datetime.now().isoformat()
                })
        
        # Stats
        stats = defaultdict(int)
        for v in validated:
            stats[v['status']] += 1
        
        print(f"✅ Validation complete:")
        for status, count in sorted(stats.items()):
            print(f"   {status}: {count}")
        
        return validated
    
    def create_glossary_collection(self, entries: List[Dict], collection_name: str = "glossary_terms"):
        """Create glossary collection in Qdrant with verified entries"""
        from qdrant_client.models import VectorParams, Distance, PointStruct
        
        verified = [e for e in entries if e['status'] == 'verified']
        print(f"\n📦 Creating collection '{collection_name}' with {len(verified)} verified entries...")
        
        # Check if collection exists
        try:
            self.qdrant_client.get_collection(collection_name)
            print(f"   Collection exists, deleting...")
            self.qdrant_client.delete_collection(collection_name)
        except Exception:
            pass  # collection does not exist yet; nothing to delete
        
        # Create collection
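        # (text-embedding-3-large produces 3072-dimensional vectors, hence size=3072)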
        self.qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=3072, distance=Distance.COSINE)
        )
        
        # Insert points
        points = []
        
        print("   Getting embeddings for verified entries...")
        for i in tqdm(range(0, len(verified), 50), desc="Embedding"):
            batch = verified[i:i+50]
            batch_sources = [e['source'] for e in batch]
            embeddings = self.get_embeddings_batch(batch_sources)
            
            for j, entry in enumerate(batch):
                if embeddings[j]:
                    points.append(PointStruct(
                        id=len(points),
                        vector=embeddings[j],
                        payload={
                            'source': entry['source'],
                            'target': entry['target'],
                            'type': entry['type'],
                            'confidence': entry['confidence'],
                            'origin': entry.get('origin', 'unknown')
                        }
                    ))
        
        # Upsert in batches
        print(f"   Inserting {len(points)} points...")
        for i in range(0, len(points), 100):
            batch = points[i:i+100]
            self.qdrant_client.upsert(collection_name=collection_name, points=batch)
        
        print(f"✅ Collection '{collection_name}' created with {len(points)} entries")
        return len(points)


def generate_exports(validated: List[Dict], output_dir: str):
    """Generate export files"""
    print(f"\n📤 Generating export files...")
    
    verified = [e for e in validated if e['status'] == 'verified']
    uncertain = [e for e in validated if e['status'] == 'uncertain']
    rejected = [e for e in validated if e['status'] == 'rejected']
    
    # DeepL format (CSV)
    deepl_file = os.path.join(output_dir, 'deepl_glossary_final.csv')
    with open(deepl_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        for entry in verified:
            writer.writerow([entry['source'], entry['target']])
    print(f"   ✅ {deepl_file}: {len(verified)} entries")
    
    # Google format (TSV)
    google_file = os.path.join(output_dir, 'google_glossary_final.tsv')
    with open(google_file, 'w', encoding='utf-8') as f:
        for entry in verified:
            f.write(f"{entry['source']}\t{entry['target']}\n")
    print(f"   ✅ {google_file}: {len(verified)} entries")
    
    # Uncertain (for review)
    uncertain_file = os.path.join(output_dir, 'glossary_uncertain.csv')
    with open(uncertain_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['source', 'target', 'confidence', 'type', 'origin'])
        for entry in uncertain:
            writer.writerow([
                entry['source'], entry['target'],
                f"{entry['confidence']:.3f}", entry['type'], entry.get('origin', '')
            ])
    print(f"   ✅ {uncertain_file}: {len(uncertain)} entries")
    
    # Rejected
    rejected_file = os.path.join(output_dir, 'glossary_rejected.csv')
    with open(rejected_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['source', 'target', 'confidence', 'type', 'origin'])
        for entry in rejected:
            writer.writerow([
                entry['source'], entry['target'],
                f"{entry['confidence']:.3f}", entry['type'], entry.get('origin', '')
            ])
    print(f"   ✅ {rejected_file}: {len(rejected)} entries")


def generate_report(stats: Dict, output_dir: str):
    """Generate validation report"""
    report_dir = os.path.join(output_dir, 'reports')
    os.makedirs(report_dir, exist_ok=True)
    
    # Markdown report
    report_file = os.path.join(report_dir, 'validation_log.md')
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write(f"""# Glossary Validation Report

**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Input Summary

| Source | Entries |
|--------|---------|
| DeepL (cleaned) | {stats.get('deepl_count', 0):,} |
| Google | {stats.get('google_count', 0):,} |
| Merged (unique) | {stats.get('merged_count', 0):,} |
| Conflicts (Google prioritized) | {stats.get('conflicts', 0):,} |

## Type Classification

| Type | Count | Destination |
|------|-------|-------------|
| Single word (1-2) | {stats.get('single_word', 0):,} | Glossary |
| Term (3-4, no verbs) | {stats.get('term', 0):,} | Glossary |
| Sentence (5+ or verbs) | {stats.get('sentence', 0):,} | Translation Memory |

## Validation Results

| Status | Count |
|--------|-------|
| Verified (>= 0.7) | {stats.get('verified', 0):,} |
| Uncertain (0.5-0.7) | {stats.get('uncertain', 0):,} |
| Rejected (< 0.5) | {stats.get('rejected', 0):,} |

## Output Files

- `deepl_glossary_final.csv` - DeepL upload format
- `google_glossary_final.tsv` - Google Cloud format  
- `glossary_uncertain.csv` - Manual review needed
- `glossary_rejected.csv` - Not verified in corpus
- `translation_memory.csv` - Sentence translations
""")
    
    # JSON stats
    stats_file = os.path.join(report_dir, 'stats.json')
    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump({**stats, 'generated_at': datetime.now().isoformat()}, f, indent=2)
    
    print(f"\n📊 Reports generated in {report_dir}")


def main():
    """Main execution"""
    import argparse
    
    parser = argparse.ArgumentParser(description='Glossary Vector Validation')
    parser.add_argument('--skip-validation', action='store_true', help='Skip Qdrant validation')
    parser.add_argument('--skip-collection', action='store_true', help='Skip collection creation')
    args = parser.parse_args()
    
    # Paths
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    data_dir = os.path.join(base_dir, 'data')
    
    deepl_file = os.path.join(data_dir, 'deepl_glossary.csv')
    google_file = os.path.join(data_dir, 'input', 'google_glossary_original.csv')
    
    print("=" * 60)
    print("GLOSSARY VECTOR VALIDATION")
    print("=" * 60)
    
    stats = {}
    
    # Step 1: Merge glossaries
    merged, merge_stats = merge_glossaries(deepl_file, google_file)
    stats.update(merge_stats)
    stats['merged_count'] = len(merged)
    
    # Step 2: Separate by type
    by_type = separate_by_type(merged)
    stats['single_word'] = len(by_type['single_word'])
    stats['term'] = len(by_type['term'])
    stats['sentence'] = len(by_type['sentence'])
    
    # Step 3: Save translation memory (sentences)
    tm_file = os.path.join(data_dir, 'translation_memory.csv')
    save_translation_memory(by_type['sentence'], tm_file)
    
    # Combine glossary entries (single_word + term)
    glossary_entries = by_type['single_word'] + by_type['term']
    print(f"\n📚 Glossary entries for validation: {len(glossary_entries)}")
    
    if args.skip_validation:
        print("\n⏭️ Skipping validation (--skip-validation)")
        # Mark all as verified for testing
        validated = [{**e, 'confidence': 0.8, 'status': 'verified', 'validated_at': datetime.now().isoformat()} for e in glossary_entries]
    else:
        # Step 4 & 5: Validate against corpus
        validator = QdrantValidator()
        if validator.connect():
            validated = validator.validate_batch(glossary_entries)
            
            # Step 6: Create collection
            if not args.skip_collection:
                validator.create_glossary_collection(validated)
        else:
            print("❌ Cannot proceed without Qdrant/OpenAI connection")
            return
    
    # Count stats
    for v in validated:
        status = v.get('status', 'unknown')
        stats[status] = stats.get(status, 0) + 1
    
    # Step 7: Generate exports
    generate_exports(validated, data_dir)
    
    # Generate report
    generate_report(stats, data_dir)
    
    print("\n" + "=" * 60)
    print("COMPLETED")
    print("=" * 60)


if __name__ == "__main__":
    main()
