#!/usr/bin/env python3
"""
Ingest Script - PDF İndeksleme ve Payload Güncelleme
====================================================
PDF dosyalarını vektörize ederken payload'a marka/model bilgisi ekler.
Mevcut vektörlerin payload'larını da güncelleyebilir.

Kullanım:
    # Mevcut payload'ları güncelle (marka/model ekle)
    python3 ingest.py --update-payloads --batch-size 1000
    
    # Belirli bir PDF'i indeksle
    python3 ingest.py --index-pdf /path/to/file.pdf
    
    # Tüm PDF'leri tarayıp eksik marka/model olanları bul
    python3 ingest.py --scan-missing
"""

import os
import sys
import json
import time
import uuid
import argparse
from typing import Dict, List, Optional, Generator
from datetime import datetime, timezone

try:
    from qdrant_client import QdrantClient
    from qdrant_client.models import PointStruct, UpdateStatus
except ImportError:
    print("❌ qdrant-client yüklü değil: pip install qdrant-client")
    sys.exit(1)

from query_analyzer import QueryAnalyzer


# ==================== CONFIGURATION ====================

# Qdrant connection settings; each one can be overridden via the environment.
QDRANT_HOST = os.environ.get("QDRANT_HOST", "10.10.10.25")
QDRANT_PORT = int(os.environ.get("QDRANT_PORT", "6333"))
COLLECTION_NAME = os.environ.get("QDRANT_COLLECTION", "machine_docs")

# Default number of points handled per bulk step.
BATCH_SIZE = 500


class PayloadUpdater:
    """Backfill brand/model/category metadata into existing Qdrant payloads.

    The updater scrolls through the configured collection, derives metadata
    from each point's ``pdf_path`` / ``pdf_filename`` / ``pdf_name`` payload
    fields via ``QueryAnalyzer`` and writes the result back with
    ``set_payload``. Running counters are kept in ``self.stats``; they are
    not reset between calls.
    """

    # Payload fields this updater may write, in a fixed order.
    _FIELDS = ("brand", "model", "category")

    # A progress line is printed each time this many more points were scanned.
    _PROGRESS_EVERY = 10_000

    def __init__(self, host: str = QDRANT_HOST, port: int = QDRANT_PORT):
        """Connect to Qdrant and initialise the analyzer and counters.

        Args:
            host: Qdrant host name or address.
            port: Qdrant HTTP port.
        """
        self.qdrant = QdrantClient(host=host, port=port, timeout=60)
        self.collection = COLLECTION_NAME
        self.analyzer = QueryAnalyzer()

        # Statistics
        self.stats = {
            "total_scanned": 0,
            "updated": 0,
            "skipped": 0,
            "errors": 0,
            "brands_found": {},
            "models_found": {}
        }

    def _scroll_all_points(self, batch_size: int = BATCH_SIZE) -> Generator[List, None, None]:
        """Yield every point of the collection, one scroll page at a time.

        Payloads are fetched, vectors are not. Iteration stops when a page
        comes back empty or the server returns no next-page offset.
        """
        offset = None

        while True:
            results, offset = self.qdrant.scroll(
                collection_name=self.collection,
                limit=batch_size,
                offset=offset,
                with_payload=True,
                with_vectors=False
            )

            if not results:
                break

            yield results

            if offset is None:
                break

    def _extract_metadata(self, payload: Dict) -> Dict[str, Optional[str]]:
        """Derive brand/model/category for a payload.

        The ``pdf_path`` is tried first; its result is used only when it
        contains a brand. Otherwise the file name (``pdf_filename``, falling
        back to ``pdf_name``) is analysed like a query. When nothing usable
        is present, all three fields are ``None``.
        """
        # Try the path first
        pdf_path = payload.get("pdf_path") or ""
        if pdf_path:
            extracted = self.analyzer.extract_from_path(pdf_path)
            if extracted.get("brand"):
                return extracted

        # Then the file name; ``or`` also covers keys stored as None/""
        pdf_filename = payload.get("pdf_filename") or payload.get("pdf_name") or ""
        if pdf_filename:
            # Analyse the file name as if it were a query
            analysis = self.analyzer.analyze(pdf_filename)
            return {
                "brand": analysis.brand,
                "model": analysis.model,
                "category": None
            }

        return {"brand": None, "model": None, "category": None}

    def _build_update(self, payload: Dict, force: bool) -> Dict:
        """Return the fields to write for one point ({} means nothing to do).

        A point is only updated when a brand or a model could be derived.
        Without ``force``, fields that already hold a value in ``payload``
        are left untouched, so only missing fields are filled in.
        """
        metadata = self._extract_metadata(payload)
        if not metadata.get("brand") and not metadata.get("model"):
            return {}

        update = {}
        for field in self._FIELDS:
            value = metadata.get(field)
            if not value:
                continue
            if not force and payload.get(field):
                # Existing values are only replaced on explicit request.
                continue
            update[field] = value
        return update

    def _record_found(self, update: Dict) -> None:
        """Add the brand/model of a prepared update to the frequency counters."""
        for field, bucket in (("brand", "brands_found"), ("model", "models_found")):
            value = update.get(field)
            if value:
                counter = self.stats[bucket]
                counter[value] = counter.get(value, 0) + 1

    def _flush_updates(self, pending: List) -> None:
        """Write prepared ``(point_id, update)`` pairs to Qdrant.

        Points that receive an identical update are sent in a single
        ``set_payload`` call. A failing call is reported and counted under
        ``errors`` for its own points only; the remaining groups are still
        attempted. Successful points are counted under ``updated``.
        """
        groups = {}
        for point_id, update in pending:
            key = tuple(update.get(field) for field in self._FIELDS)
            groups.setdefault(key, (update, []))[1].append(point_id)

        for update, point_ids in groups.values():
            try:
                self.qdrant.set_payload(
                    collection_name=self.collection,
                    payload=update,
                    points=point_ids
                )
            except Exception as e:
                # Best effort: report, count and carry on with the next group.
                print(f"⚠️ Güncelleme hatası: {e}")
                self.stats["errors"] += len(point_ids)
            else:
                self.stats["updated"] += len(point_ids)

    def update_payloads(
        self,
        batch_size: int = BATCH_SIZE,
        dry_run: bool = False,
        force: bool = False
    ) -> Dict:
        """
        Update the payloads of all vectors in the collection.

        Args:
            batch_size: Number of points fetched (and flushed) per step.
            dry_run: Simulate only; nothing is written to Qdrant.
            force: Also overwrite brand/model/category values that already
                exist. Without it, points that already have both brand and
                model are skipped and only missing fields are filled in.

        Returns:
            The ``self.stats`` dictionary. ``updated`` counts points written
            successfully (or, in a dry run, points that would be written);
            ``errors`` counts points whose write failed. ``brands_found`` and
            ``models_found`` count values prepared for writing.
        """
        print(f"\n{'='*60}")
        print(f"   PAYLOAD GÜNCELLEME {'(DRY RUN)' if dry_run else ''}")
        print(f"{'='*60}")
        print(f"Collection: {self.collection}")
        print(f"Batch Size: {batch_size}")
        print(f"Force: {force}\n")

        start_time = time.time()
        next_report = self._PROGRESS_EVERY

        for batch in self._scroll_all_points(batch_size):
            # Rebuilt per page so memory stays bounded, dry run included.
            pending = []

            for point in batch:
                self.stats["total_scanned"] += 1

                payload = point.payload or {}

                # Skip points that already carry brand and model unless forced
                if not force and payload.get("brand") and payload.get("model"):
                    self.stats["skipped"] += 1
                    continue

                update = self._build_update(payload, force)
                if not update:
                    self.stats["skipped"] += 1
                    continue

                self._record_found(update)
                pending.append((point.id, update))

            if dry_run:
                self.stats["updated"] += len(pending)
            elif pending:
                self._flush_updates(pending)

            # Threshold-based so progress shows for any batch size, not only
            # when the running total lands exactly on a multiple.
            scanned = self.stats["total_scanned"]
            if scanned >= next_report:
                elapsed = time.time() - start_time
                rate = scanned / elapsed if elapsed > 0 else 0
                print(f"   Taranan: {self.stats['total_scanned']:,}, "
                      f"Güncellenen: {self.stats['updated']:,}, "
                      f"Hız: {rate:.0f}/s")
                next_report = (scanned // self._PROGRESS_EVERY + 1) * self._PROGRESS_EVERY

        elapsed = time.time() - start_time

        # Final report
        print(f"\n{'='*60}")
        print("   SONUÇ RAPORU")
        print(f"{'='*60}")
        print(f"Toplam Taranan  : {self.stats['total_scanned']:,}")
        print(f"Güncellenen     : {self.stats['updated']:,}")
        print(f"Atlanan         : {self.stats['skipped']:,}")
        print(f"Hata            : {self.stats['errors']:,}")
        print(f"Süre            : {elapsed:.1f}s")

        if self.stats["brands_found"]:
            print(f"\nBulunan Markalar ({len(self.stats['brands_found'])}):")
            top_brands = sorted(self.stats["brands_found"].items(),
                                key=lambda item: item[1], reverse=True)[:10]
            for brand, count in top_brands:
                print(f"   {brand}: {count:,}")

        return self.stats

    def scan_missing(self, sample_size: int = 1000) -> Dict:
        """Report how many of the first ``sample_size`` points lack brand/model.

        Args:
            sample_size: Maximum number of points to inspect. Zero or a
                negative value inspects nothing.

        Returns:
            Counts keyed by ``no_brand`` (model only), ``no_model`` (brand
            only), ``no_both`` and ``complete``.
        """
        print(f"\n{'='*60}")
        print("   EKSİK MARKA/MODEL TARAMASI")
        print(f"{'='*60}\n")

        missing = {"no_brand": 0, "no_model": 0, "no_both": 0, "complete": 0}
        # Up to three example paths of points that have neither field.
        no_both_samples = []

        count = 0
        if sample_size > 0:
            # Never fetch a larger page than the sample needs.
            page_size = min(BATCH_SIZE, sample_size)
            for batch in self._scroll_all_points(page_size):
                for point in batch:
                    if count >= sample_size:
                        break

                    payload = point.payload or {}
                    has_brand = bool(payload.get("brand"))
                    has_model = bool(payload.get("model"))

                    if has_brand and has_model:
                        missing["complete"] += 1
                    elif not has_brand and not has_model:
                        missing["no_both"] += 1
                        if len(no_both_samples) < 3:
                            # ``or ""`` guards against a path stored as None
                            no_both_samples.append(str(payload.get("pdf_path") or "")[:60])
                    elif not has_brand:
                        missing["no_brand"] += 1
                    else:
                        missing["no_model"] += 1

                    count += 1

                if count >= sample_size:
                    break

        def pct(part: int) -> int:
            # Integer percentage; 0 when nothing was scanned (avoids x // 0).
            return part * 100 // count if count else 0

        print(f"Taranan: {count:,} kayıt")
        print("\n📊 Dağılım:")
        print(f"   ✓ Tam (brand+model): {missing['complete']} ({pct(missing['complete'])}%)")
        print(f"   ⚠ Sadece model: {missing['no_brand']} ({pct(missing['no_brand'])}%)")
        print(f"   ⚠ Sadece brand: {missing['no_model']} ({pct(missing['no_model'])}%)")
        print(f"   ✗ Hiçbiri yok: {missing['no_both']} ({pct(missing['no_both'])}%)")

        if no_both_samples:
            print("\n📁 Brand+Model eksik örnekler:")
            for sample in no_both_samples:
                print(f"   - {sample}")

        return missing


class PDFIndexer:
    """Index new PDF chunks into Qdrant with brand/model metadata attached."""

    def __init__(self, host: str = QDRANT_HOST, port: int = QDRANT_PORT):
        """Connect to Qdrant and create the analyzer used for path parsing.

        Args:
            host: Qdrant host name or address.
            port: Qdrant HTTP port.
        """
        self.qdrant = QdrantClient(host=host, port=port, timeout=60)
        self.collection = COLLECTION_NAME
        self.analyzer = QueryAnalyzer()

    def create_payload(self, pdf_path: str, page_number: int, text: str) -> Dict:
        """
        Build the payload for one PDF chunk.

        Brand, model and category are taken from what the analyzer extracts
        from ``pdf_path``; missing values are stored as ``None``.

        Args:
            pdf_path: Path of the source PDF.
            page_number: Page the chunk was taken from.
            text: Chunk text.

        Returns:
            The payload dictionary to store next to the vector.
        """
        # Extract metadata from the path
        metadata = self.analyzer.extract_from_path(pdf_path)

        # datetime.utcnow() is deprecated; dropping tzinfo keeps the stored
        # string in the same naive-UTC ISO format as before.
        created_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

        return {
            "pdf_path": pdf_path,
            "pdf_filename": os.path.basename(pdf_path),
            "page_number": page_number,
            "text": text,
            "brand": metadata.get("brand"),
            "model": metadata.get("model"),
            "category": metadata.get("category"),
            "created_at": created_at,
            "source": "ingest_v2"
        }

    @staticmethod
    def _default_point_id(chunk: Dict) -> str:
        """Return a deterministic UUID string for a chunk without an id.

        The id is derived from path, page and text, so separate calls cannot
        collide on positional ids and re-indexing the same chunk overwrites
        the same point.
        """
        name = f"{chunk['pdf_path']}|{chunk['page_number']}|{chunk['text']}"
        return str(uuid.uuid5(uuid.NAMESPACE_URL, name))

    def index_chunks(
        self,
        chunks: List[Dict],
        vectors: List[List[float]]
    ) -> bool:
        """
        Upsert chunks and their vectors into Qdrant.

        Args:
            chunks: [{"pdf_path": ..., "page_number": ..., "text": ...}, ...];
                an optional "id" key sets the point id explicitly.
            vectors: Embedding vectors, one per chunk, in the same order.

        Returns:
            True on success (including empty input), False when the upsert
            raised; the error is printed.

        Raises:
            ValueError: If ``chunks`` and ``vectors`` differ in length.
        """
        if len(chunks) != len(vectors):
            raise ValueError("Chunk ve vektör sayısı eşleşmiyor!")

        if not chunks:
            # Nothing to send; skip the request.
            return True

        points = []
        for chunk, vector in zip(chunks, vectors):
            payload = self.create_payload(
                pdf_path=chunk["pdf_path"],
                page_number=chunk["page_number"],
                text=chunk["text"]
            )

            # ``is None`` so that an explicit id of 0 is still honoured.
            point_id = chunk.get("id")
            if point_id is None:
                point_id = self._default_point_id(chunk)

            points.append(PointStruct(
                id=point_id,
                vector=vector,
                payload=payload
            ))

        try:
            self.qdrant.upsert(
                collection_name=self.collection,
                points=points
            )
        except Exception as e:
            print(f"❌ İndeksleme hatası: {e}")
            return False
        return True


# ==================== CLI ====================

def main():
    parser = argparse.ArgumentParser(description="PDF İndeksleme ve Payload Güncelleme")
    
    parser.add_argument("--update-payloads", action="store_true",
                        help="Mevcut payload'lara marka/model ekle")
    parser.add_argument("--scan-missing", action="store_true",
                        help="Eksik marka/model olan kayıtları tara")
    parser.add_argument("--batch-size", type=int, default=500,
                        help="Toplu işlem boyutu")
    parser.add_argument("--dry-run", action="store_true",
                        help="Değişiklik yapma, sadece simüle et")
    parser.add_argument("--force", action="store_true",
                        help="Mevcut değerleri üzerine yaz")
    parser.add_argument("--sample-size", type=int, default=1000,
                        help="Tarama örneklem boyutu")
    
    args = parser.parse_args()
    
    if args.scan_missing:
        updater = PayloadUpdater()
        updater.scan_missing(args.sample_size)
    
    elif args.update_payloads:
        updater = PayloadUpdater()
        updater.update_payloads(
            batch_size=args.batch_size,
            dry_run=args.dry_run,
            force=args.force
        )
    
    else:
        parser.print_help()


if __name__ == "__main__":
    main()

