"""
Generate output files for different glossary formats
- cleaned_glossary.csv (internal format with comments)
- deepl_glossary.csv (DeepL format)
- google_glossary.tsv (Google format)
- flagged.csv (manual review list)
"""

import csv
import json
import os
from collections import Counter
from datetime import datetime
from typing import List, Dict

from .utils import logger
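
# Each validated entry is expected to look roughly like the sketch below.
# The shape is inferred from the field accesses in this module; the values
# are illustrative, not real data:
#
#   {
#       "source": "Drehmoment",
#       "target": "torque",
#       "is_valid": True,
#       "confidence": 0.87,
#       "validation_details": {
#           "corpus_hit": True,
#           "reasons": ["..."],
#       },
#   }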


class OutputGenerator:
    """Generate various glossary output formats"""
    
    def __init__(self, validated_entries: List[Dict], confidence_threshold: float = 0.6):
        self.entries = validated_entries
        self.confidence_threshold = confidence_threshold
        self.clean_entries = []
        self.flagged_entries = []
        self._separate_entries()
    
    def _separate_entries(self):
        """Separate entries into clean and flagged"""
        for entry in self.entries:
            if entry.get('is_valid', False) and entry.get('confidence', 0) >= self.confidence_threshold:
                self.clean_entries.append(entry)
            else:
                self.flagged_entries.append(entry)
        
        logger.info(f"Separated: {len(self.clean_entries)} clean, {len(self.flagged_entries)} flagged")
    
    def _generate_comment(self, entry: Dict) -> str:
        """Generate a comment/description for the entry"""
        details = entry.get('validation_details', {})
        reasons = details.get('reasons', [])
        confidence = entry.get('confidence', 0)
        
        parts = []
        if details.get('corpus_hit'):
            parts.append("corpus_verified")
        
        parts.append(f"conf:{confidence:.2f}")
        
        if reasons:
            parts.extend(reasons[:2])  # Max 2 reasons
        
        return "; ".join(parts)
    
    def generate_cleaned_csv(self, filepath: str):
        """
        Generate internal format CSV with comments
        Format: source,target,comment,confidence
        """
        with open(filepath, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['source', 'target', 'comment', 'confidence'])
            
            for entry in self.clean_entries:
                comment = self._generate_comment(entry)
                writer.writerow([
                    entry['source'],
                    entry['target'],
                    comment,
                    f"{entry.get('confidence', 0):.3f}"
                ])
        
        logger.info(f"Generated cleaned_glossary.csv: {len(self.clean_entries)} entries")
    
    def generate_deepl_csv(self, filepath: str):
        """
        Generate DeepL format CSV
        Format: source,target (no header, comma separated)
        """
        with open(filepath, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            # DeepL glossary files take no header row
            for entry in self.clean_entries:
                writer.writerow([entry['source'], entry['target']])
        
        logger.info(f"Generated deepl_glossary.csv: {len(self.clean_entries)} entries")
    
    def generate_google_tsv(self, filepath: str):
        """
        Generate Google format TSV
        Format: source<TAB>target (no header)
        """
        with open(filepath, 'w', encoding='utf-8') as f:
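            # Values are written verbatim; this assumes source and target
            # contain no embedded tabs or newlines.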
            for entry in self.clean_entries:
                f.write(f"{entry['source']}\t{entry['target']}\n")
        
        logger.info(f"Generated google_glossary.tsv: {len(self.clean_entries)} entries")
    
    def generate_flagged_csv(self, filepath: str):
        """
        Generate flagged entries for manual review
        Format: source,target,reason,confidence
        """
        with open(filepath, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['source', 'target', 'reason', 'confidence'])
            
            for entry in self.flagged_entries:
                details = entry.get('validation_details', {})
                reasons = details.get('reasons') or []
                reason_str = "; ".join(reasons) if reasons else "low_confidence"
                
                writer.writerow([
                    entry['source'],
                    entry['target'],
                    reason_str,
                    f"{entry.get('confidence', 0):.3f}"
                ])
        
        logger.info(f"Generated flagged.csv: {len(self.flagged_entries)} entries")
    
    def generate_validation_log(self, filepath: str, stats: Dict):
        """Generate markdown validation log"""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        
        content = f"""# Glossary Validation Log

**Generated:** {timestamp}

## Summary

| Metric | Value |
|--------|-------|
| Total Input | {stats.get('total_input', 0):,} |
| After Normalization | {stats.get('after_normalization', 0):,} |
| Duplicates Removed | {stats.get('duplicates_removed', 0):,} |
| Invalid Removed | {stats.get('invalid_removed', 0):,} |
| Variants Merged | {stats.get('variants_merged', 0):,} |
| Total Validated | {stats.get('total_validated', 0):,} |
| High Confidence | {stats.get('high_confidence', 0):,} |
| Low Confidence (Flagged) | {stats.get('low_confidence', 0):,} |
| Corpus Hits | {stats.get('corpus_hits', 0):,} |
| Average Confidence | {stats.get('average_confidence', 0):.3f} |

## Output Files

| File | Entries | Description |
|------|---------|-------------|
| `cleaned_glossary.csv` | {len(self.clean_entries):,} | Internal format with comments |
| `deepl_glossary.csv` | {len(self.clean_entries):,} | DeepL upload format |
| `google_glossary.tsv` | {len(self.clean_entries):,} | Google Cloud format |
| `flagged.csv` | {len(self.flagged_entries):,} | Manual review needed |

## Confidence Distribution

```
High (>= {self.confidence_threshold}): {len(self.clean_entries):,} entries
Low (< {self.confidence_threshold}):  {len(self.flagged_entries):,} entries
```

## Top Flagged Reasons

"""
        # Count flagged reasons, using the same fallback as flagged.csv
        reason_counts = Counter()
        for entry in self.flagged_entries:
            details = entry.get('validation_details', {})
            for reason in details.get('reasons') or ['low_confidence']:
                reason_counts[reason] += 1

        for reason, count in reason_counts.most_common(10):
            content += f"- `{reason}`: {count:,}\n"
        
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)
        
        logger.info(f"Generated validation_log.md")
    
    def generate_stats_json(self, filepath: str, stats: Dict):
        """Generate stats JSON file"""
        output_stats = {
            **stats,
            'clean_entries': len(self.clean_entries),
            'flagged_entries': len(self.flagged_entries),
            'confidence_threshold': self.confidence_threshold,
            'generated_at': datetime.now().isoformat()
        }
        
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(output_stats, f, ensure_ascii=False, indent=2)
        
        logger.info(f"Generated stats.json")
    
    def generate_all(self, data_dir: str, reports_dir: str, stats: Dict):
        """Generate all output files"""
        # Data files
        self.generate_cleaned_csv(f"{data_dir}/cleaned_glossary.csv")
        self.generate_deepl_csv(f"{data_dir}/deepl_glossary.csv")
        self.generate_google_tsv(f"{data_dir}/google_glossary.tsv")
        self.generate_flagged_csv(f"{data_dir}/flagged.csv")
        
        # Reports
        self.generate_validation_log(f"{reports_dir}/validation_log.md", stats)
        self.generate_stats_json(f"{reports_dir}/stats.json", stats)
        
        return {
            'cleaned': len(self.clean_entries),
            'flagged': len(self.flagged_entries)
        }


def run_output_generation(
    validated_file: str,
    data_dir: str,
    reports_dir: str,
    stats: Dict,
    confidence_threshold: float = 0.6
) -> Dict:
    """
    Main function to generate all outputs
    Returns: output stats
    """
    # Load validated entries
    with open(validated_file, 'r', encoding='utf-8') as f:
        entries = json.load(f)
    
    generator = OutputGenerator(entries, confidence_threshold)
    return generator.generate_all(data_dir, reports_dir, stats)


if __name__ == "__main__":
    import sys
    if len(sys.argv) >= 4:
        stats = run_output_generation(sys.argv[1], sys.argv[2], sys.argv[3], {})
        print(f"Output stats: {stats}")

