#!/usr/bin/env python3
"""
Glossary Cleaner - Main Runner
Cleans, normalizes, validates, and exports glossary files.
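
Command-line options (defined with argparse below):
    --config/-c        path to the YAML config file (default: config.yaml)
    --input/-i         input TSV file; overrides the value from the config
    --mode/-m          validation mode: 'vector' or 'simple'
    --skip-validation  skip the validation step entirely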
"""

import os
import sys
import argparse
import yaml
from pathlib import Path

# Add this script's directory to sys.path so the src package is importable
sys.path.insert(0, str(Path(__file__).parent))

from src.normalize import run_normalization
from src.validate_with_vector import run_validation
from src.generate_output import run_output_generation
from src.utils import logger


def load_config(config_path: str) -> dict:
    """Load configuration from YAML file"""
    with open(config_path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def ensure_dirs(base_dir: Path, config: dict):
    """Create the output directories (data, reports, data/input) and return (data_dir, reports_dir)"""
    data_dir = os.path.join(base_dir, config['output']['data_dir'])
    reports_dir = os.path.join(base_dir, config['output']['reports_dir'])
    
    os.makedirs(data_dir, exist_ok=True)
    os.makedirs(reports_dir, exist_ok=True)
    os.makedirs(os.path.join(data_dir, 'input'), exist_ok=True)
    
    return data_dir, reports_dir


def main():
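    """Run the full pipeline: load config, normalize, optionally validate, and generate outputs."""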
    parser = argparse.ArgumentParser(description='Glossary Cleaner')
    parser.add_argument('--config', '-c', default='config.yaml', help='Config file path')
    parser.add_argument('--input', '-i', help='Input TSV file (overrides config)')
    parser.add_argument('--mode', '-m', choices=['vector', 'simple'], help='Validation mode')
    parser.add_argument('--skip-validation', action='store_true', help='Skip validation step')
    args = parser.parse_args()
    
    # Get base directory
    base_dir = Path(__file__).parent
    
    # Load config
    config_path = os.path.join(base_dir, args.config)
    config = load_config(config_path)
    
    # Override config with args
    if args.input:
        config['input']['file'] = args.input
    if args.mode:
        config['validation_mode'] = args.mode
    
    # Ensure directories
    data_dir, reports_dir = ensure_dirs(base_dir, config)
    
    # Input file
    input_file = config['input']['file']
    if not os.path.isabs(input_file):
        input_file = os.path.join(base_dir, input_file)
    
    if not os.path.exists(input_file):
        logger.error(f"Input file not found: {input_file}")
        sys.exit(1)
    
    logger.info("=" * 60)
    logger.info("GLOSSARY CLEANER - Starting")
    logger.info("=" * 60)
    logger.info(f"Input: {input_file}")
    logger.info(f"Validation mode: {config['validation_mode']}")
    logger.info(f"Confidence threshold: {config['thresholds']['confidence_min']}")
    
    # Combined stats
    all_stats = {}
    
    # Step 1: Normalization
    logger.info("\n" + "=" * 40)
    logger.info("STEP 1: NORMALIZATION")
    logger.info("=" * 40)
    
    normalized_file = os.path.join(data_dir, 'normalized_terms.json')
    normalized_entries, norm_stats = run_normalization(
        input_file,
        normalized_file,
        fuzzy_threshold=config['thresholds']['fuzzy_match']
    )
    all_stats.update(norm_stats)
    
    # Step 2: Validation
    if not args.skip_validation:
        logger.info("\n" + "=" * 40)
        logger.info("STEP 2: VALIDATION")
        logger.info("=" * 40)
        
        validated_file = os.path.join(data_dir, 'validated_terms.json')
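        # Vector (Qdrant-backed) validation only when mode is 'vector'; 'simple' mode passes use_vector=False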
        use_vector = config['validation_mode'] == 'vector'
        
        validated_entries, val_stats = run_validation(
            normalized_file,
            validated_file,
            qdrant_host=config['qdrant']['host'],
            qdrant_port=config['qdrant']['port'],
            collection=config['qdrant']['collection'],
            confidence_threshold=config['thresholds']['confidence_min'],
            use_vector=use_vector
        )
        all_stats.update(val_stats)
    else:
        logger.info("\n[Validation skipped]")
        validated_file = normalized_file
        # Validation skipped: fill in placeholder stats so report generation still works
        all_stats['total_validated'] = len(normalized_entries)
        all_stats['high_confidence'] = len(normalized_entries)
        all_stats['low_confidence'] = 0
        all_stats['corpus_hits'] = 0
        all_stats['average_confidence'] = 0.8  # nominal default; no real confidence scores without validation
    
    # Step 3: Generate outputs
    logger.info("\n" + "=" * 40)
    logger.info("STEP 3: GENERATE OUTPUTS")
    logger.info("=" * 40)
    
    output_stats = run_output_generation(
        validated_file,
        data_dir,
        reports_dir,
        all_stats,
        confidence_threshold=config['thresholds']['confidence_min']
    )
    
    # Final summary
    logger.info("\n" + "=" * 60)
    logger.info("COMPLETED - SUMMARY")
    logger.info("=" * 60)
    logger.info(f"Total input entries:     {all_stats.get('total_input', 0):,}")
    logger.info(f"After normalization:     {all_stats.get('after_normalization', 0):,}")
    logger.info(f"Duplicates removed:      {all_stats.get('duplicates_removed', 0):,}")
    logger.info(f"Invalid removed:         {all_stats.get('invalid_removed', 0):,}")
    logger.info(f"Clean entries:           {output_stats.get('cleaned', 0):,}")
    logger.info(f"Flagged for review:      {output_stats.get('flagged', 0):,}")
    logger.info(f"Average confidence:      {all_stats.get('average_confidence', 0):.3f}")
    logger.info("")
    logger.info("Output files:")
    logger.info(f"  - {data_dir}/cleaned_glossary.csv")
    logger.info(f"  - {data_dir}/deepl_glossary.csv")
    logger.info(f"  - {data_dir}/google_glossary.tsv")
    logger.info(f"  - {data_dir}/flagged.csv")
    logger.info(f"  - {reports_dir}/validation_log.md")
    logger.info(f"  - {reports_dir}/stats.json")
    logger.info("=" * 60)


if __name__ == "__main__":
    main()

