# Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website, Klausur-Service,
# School-Service, Voice-Service, Geo-Service, BreakPilot Drive, Agent-Core
"""
|
|
NiBiS Ingestion Pipeline
|
|
Automatisierte Verarbeitung von Abitur-Erwartungshorizonten aus Niedersachsen.
|
|
|
|
Unterstützt:
|
|
- Mehrere Jahre (2016, 2017, 2024, 2025, ...)
|
|
- Verschiedene Namenskonventionen (alt: *Lehrer/*L.pdf, neu: *_EWH.pdf)
|
|
- Automatisches Entpacken von ZIP-Dateien
|
|
- Flexible Erweiterung für andere Bundesländer
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import zipfile
|
|
import hashlib
|
|
import json
|
|
from pathlib import Path
|
|
from typing import List, Dict, Optional, Tuple
|
|
from dataclasses import dataclass, asdict
|
|
from datetime import datetime
|
|
import asyncio
|
|
|
|
# Local imports
|
|
from eh_pipeline import chunk_text, generate_embeddings, extract_text_from_pdf, get_vector_size, EMBEDDING_BACKEND
|
|
from qdrant_service import QdrantService
|
|
|
|
# Configuration
#
# The docs/ base path defaulted to a machine-specific absolute path; it can
# now be overridden via the BREAKPILOT_DOCS_PATH environment variable so the
# pipeline runs on other machines / in CI without code changes.
DOCS_BASE_PATH = Path(os.environ.get(
    "BREAKPILOT_DOCS_PATH",
    "/Users/benjaminadmin/projekte/breakpilot-pwa/docs",
))

# Sub-directories of docs/ that hold downloaded ZA (Zentralabitur) archives.
ZA_DOWNLOAD_DIRS = ["za-download", "za-download-2", "za-download-3"]

# Qdrant collection for NiBiS data (separate from user EH)
NIBIS_COLLECTION = "bp_nibis_eh"
|
|
|
|
|
|
@dataclass
class NiBiSDocument:
    """Structured representation of one NiBiS document (a single PDF).

    Instances are built by discover_documents() from parsed filenames plus
    per-file metadata (hash, timestamps).
    """
    id: str  # stable id built from year/subject/niveau/task + file hash
    file_path: str  # absolute path to the PDF on disk
    year: int  # exam year parsed from the filename (e.g. 2016, 2025)
    subject: str  # long subject name (via SUBJECT_MAPPING) or capitalized raw token
    niveau: str  # eA, gA, EA, GA
    task_number: Optional[int]  # task number if encoded in the name, else None
    doc_type: str  # EWH, Aufgabe, Material, GBU, etc.
    bundesland: str  # federal state code; currently always "NI"
    source_dir: str  # which ZA_DOWNLOAD_DIRS entry the file was found in
    file_hash: str  # truncated SHA-256 of the PDF (see compute_file_hash)
    extracted_at: datetime  # when this record was created

    # Metadata taken from the raw filename
    raw_filename: str  # original file name, unparsed
    variant: Optional[str] = None  # BG, Tech, Wirt, etc.

    def to_dict(self) -> dict:
        """Return a JSON-serializable dict (datetime -> ISO-8601 string)."""
        d = asdict(self)
        # asdict() keeps the datetime object; JSON needs a string.
        d['extracted_at'] = d['extracted_at'].isoformat()
        return d
|
|
|
|
|
|
# Subject mapping (short token from the filename -> display long form).
# Keys are the lowercased subject substring captured by the filename parsers.
SUBJECT_MAPPING = {
    "deutsch": "Deutsch",
    "englisch": "Englisch",
    "englischbg": "Englisch (Berufliches Gymnasium)",
    "mathe": "Mathematik",
    "mathebg": "Mathematik (Berufliches Gymnasium)",
    "mathezwb": "Mathematik (Zweiter Bildungsweg)",
    "informatik": "Informatik",
    "biologie": "Biologie",
    "chemie": "Chemie",
    "physik": "Physik",
    "geschichte": "Geschichte",
    "erdkunde": "Erdkunde/Geografie",
    "kunst": "Kunst",
    "musik": "Musik",
    "sport": "Sport",
    "latein": "Latein",
    "griechisch": "Griechisch",
    "französisch": "Französisch",
    "franzîsisch": "Französisch",  # mis-encoded "französisch" seen in the 2017 archive
    "spanisch": "Spanisch",
    "kathreligion": "Katholische Religion",
    "evreligion": "Evangelische Religion",
    "wertenormen": "Werte und Normen",
    "brc": "Betriebswirtschaft mit Rechnungswesen/Controlling",
    "bvw": "Betriebswirtschaft mit Rechnungswesen",
    "gespfl": "Gesundheit-Pflege",
}

# Niveau mapping (lowercased token from the filename -> canonical label).
NIVEAU_MAPPING = {
    "ea": "eA",  # "erhöhtes Anforderungsniveau" (advanced level)
    "ga": "gA",  # "grundlegendes Anforderungsniveau" (basic level)
    "neuga": "gA (neu einsetzend)",  # newly started foreign language, basic level
    "neuea": "eA (neu einsetzend)",  # newly started foreign language, advanced level
}
|
|
|
|
|
|
def compute_file_hash(file_path: Path) -> str:
    """Return the first 16 hex characters of the file's SHA-256 digest."""
    digest = hashlib.sha256()
    with open(file_path, "rb") as handle:
        # Stream in 8 KiB blocks so large PDFs never have to fit in memory.
        while True:
            block = handle.read(8192)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()[:16]
|
|
|
|
|
|
def extract_zip_files(base_path: Path) -> List[Path]:
    """Unpack every ZIP archive found in the za-download directories.

    Args:
        base_path: Root docs/ directory containing the ZA_DOWNLOAD_DIRS.

    Returns:
        Directories that hold extracted archive contents (both freshly
        extracted and ones already present from a previous run).
    """
    extracted = []

    for za_dir in ZA_DOWNLOAD_DIRS:
        za_path = base_path / za_dir
        if not za_path.exists():
            continue

        for zip_file in za_path.glob("*.zip"):
            # Target directory = archive name without ".zip".
            target_dir = za_path / zip_file.stem

            if target_dir.exists():
                # Already unpacked on a previous run -> reuse as-is.
                print(f" Bereits entpackt: {zip_file.name} -> {target_dir.name}/")
                extracted.append(target_dir)
                continue

            print(f" Entpacke: {zip_file.name}...")
            try:
                # NOTE(review): archives come from the local za-download
                # dirs and are assumed trusted; extractall() does not guard
                # against path traversal in hostile archives.
                with zipfile.ZipFile(zip_file, 'r') as zf:
                    zf.extractall(target_dir)
                print(f" -> {len(list(target_dir.rglob('*')))} Dateien extrahiert")
                extracted.append(target_dir)
            except Exception as e:
                print(f" FEHLER: {e}")
                # Remove a half-extracted directory; otherwise the next run
                # would see it and wrongly treat it as "already extracted".
                import shutil
                shutil.rmtree(target_dir, ignore_errors=True)

    return extracted
|
|
|
|
|
|
def parse_filename_old_format(filename: str, file_path: Path) -> Optional[Dict]:
    """
    Parse the old naming convention (2016, 2017):
    - {Year}{Subject}{Niveau}Lehrer/{Year}{Subject}{Niveau}A{Nr}L.pdf
    - Example: 2016DeutschEALehrer/2016DeutschEAA1L.pdf

    Args:
        filename: Bare PDF file name.
        file_path: Full path; used to detect "Lehrer" directories and variants.

    Returns:
        Metadata dict (year, subject, niveau, task_number, doc_type, variant),
        or None when the name does not match the convention.
    """
    # The subject group is non-greedy and the niveau alternation lists the
    # longer "Neu*" tokens first. The previous greedy pattern
    # ([A-Za-z...]+)(EA|GA|NeuGA|NeuEA) let the subject swallow the "Neu"
    # prefix (e.g. "2016SpanischNeuGA..." parsed as subject "spanischneu" +
    # niveau "GA"), so the "neuga"/"neuea" entries in NIVEAU_MAPPING were
    # unreachable for real filenames.
    pattern = r"(\d{4})([A-Za-zäöüÄÖÜ]+?)(NeuEA|NeuGA|EA|GA)(?:Lehrer)?.*?(?:A(\d+)|Aufg(\d+))?L?\.pdf$"

    match = re.search(pattern, filename, re.IGNORECASE)
    if not match:
        return None

    year = int(match.group(1))
    subject_raw = match.group(2).lower()
    niveau = match.group(3).upper()
    # Task number is encoded either as "A<n>" or "Aufg<n>".
    task_num = match.group(4) or match.group(5)

    # Teacher documents (= expectation horizons, EWH) live in "...Lehrer"
    # directories or end in "L.pdf".
    is_ewh = "lehrer" in str(file_path).lower() or filename.endswith("L.pdf")

    # Extract variant (Tech, Wirt, CAS, GTR, ...) from anywhere in the path.
    variant = None
    variant_patterns = ["Tech", "Wirt", "CAS", "GTR", "Pflicht", "BG", "mitExp", "ohneExp"]
    for v in variant_patterns:
        if v.lower() in str(file_path).lower():
            variant = v
            break

    return {
        "year": year,
        "subject": subject_raw,
        "niveau": NIVEAU_MAPPING.get(niveau.lower(), niveau),
        "task_number": int(task_num) if task_num else None,
        "doc_type": "EWH" if is_ewh else "Aufgabe",
        "variant": variant,
    }
|
|
|
|
|
|
def parse_filename_new_format(filename: str, file_path: Path) -> Optional[Dict]:
    """
    Parse the new naming convention (2024, 2025):
    - {Year}_{Subject}_{niveau}_{No}_EWH.pdf
    - Example: 2025_Deutsch_eA_I_EWH.pdf

    Returns a metadata dict, or None when the name does not match.
    """
    new_style = r"(\d{4})_([A-Za-zäöüÄÖÜ]+)(?:BG)?_(eA|gA)(?:_([IVX\d]+))?(?:_(.+))?\.pdf$"
    hit = re.search(new_style, filename, re.IGNORECASE)
    if hit is None:
        return None

    year = int(hit.group(1))
    subject_raw = hit.group(2).lower()
    niveau = hit.group(3)
    task_token = hit.group(4)
    suffix = hit.group(5) or ""

    # Task number: Roman numeral (I..V) or plain digits; anything else -> None.
    task_num = None
    if task_token:
        romans = {"I": 1, "II": 2, "III": 3, "IV": 4, "V": 5}
        if task_token in romans:
            task_num = romans[task_token]
        elif task_token.isdigit():
            task_num = int(task_token)

    # EWH marker anywhere in the name, case-insensitive.
    is_ewh = "ewh" in filename.lower()

    # Special document types are encoded in the trailing suffix; the first
    # matching marker wins, mirroring the original if/elif cascade.
    doc_type = "EWH" if is_ewh else "Aufgabe"
    for marker, mapped_type in (
        ("Material", "Material"),
        ("GBU", "GBU"),
        ("Ergebnis", "Ergebnis"),
        ("Bewertungsbogen", "Bewertungsbogen"),
        ("HV", "Hörverstehen"),
        ("ME", "Mediation"),
    ):
        if marker in suffix:
            doc_type = mapped_type
            break

    # Variant: "mitExp" in the full path takes precedence over a plain "BG"
    # in the filename.
    if "mitExp" in str(file_path):
        variant = "mitExp"
    elif "BG" in filename:
        variant = "BG"
    else:
        variant = None

    return {
        "year": year,
        "subject": subject_raw,
        "niveau": NIVEAU_MAPPING.get(niveau.lower(), niveau),
        "task_number": task_num,
        "doc_type": doc_type,
        "variant": variant,
    }
|
|
|
|
|
|
def discover_documents(base_path: Path, ewh_only: bool = True) -> List[NiBiSDocument]:
    """
    Find all relevant documents in the za-download directories.

    Args:
        base_path: Base path to docs/.
        ewh_only: Only expectation horizons (skip task sheets etc.).

    Returns:
        One NiBiSDocument per PDF whose filename could be parsed.
    """
    documents = []

    for za_dir in ZA_DOWNLOAD_DIRS:
        za_path = base_path / za_dir
        if not za_path.exists():
            continue

        print(f"\nSuche in {za_dir}...")

        for pdf_file in za_path.rglob("*.pdf"):
            filename = pdf_file.name

            # Try both naming conventions, newest first.
            parsed = parse_filename_new_format(filename, pdf_file)
            if not parsed:
                parsed = parse_filename_old_format(filename, pdf_file)

            if not parsed:
                # Unknown format
                continue

            # Filter: EWH only?
            if ewh_only and parsed["doc_type"] != "EWH":
                continue

            # Hash each file exactly once; the previous version hashed every
            # PDF twice (once for the id, once for the file_hash field),
            # reading each file fully from disk twice.
            file_hash = compute_file_hash(pdf_file)

            # NOTE(review): parsed["task_number"] may be None, which renders
            # as the literal "None" in the id; the ", 0" default never kicks
            # in because the key always exists. Kept as-is so existing ids
            # stay stable.
            doc_id = f"nibis_{parsed['year']}_{parsed['subject']}_{parsed['niveau']}_{parsed.get('task_number', 0)}_{file_hash}"

            doc = NiBiSDocument(
                id=doc_id,
                file_path=str(pdf_file),
                year=parsed["year"],
                subject=SUBJECT_MAPPING.get(parsed["subject"], parsed["subject"].capitalize()),
                niveau=parsed["niveau"],
                task_number=parsed.get("task_number"),
                doc_type=parsed["doc_type"],
                bundesland="NI",  # Niedersachsen
                source_dir=za_dir,
                file_hash=file_hash,
                extracted_at=datetime.now(),
                raw_filename=filename,
                variant=parsed.get("variant"),
            )

            documents.append(doc)

    return documents
|
|
|
|
|
|
async def index_document_to_qdrant(
    doc: NiBiSDocument,
    qdrant: QdrantService,
    collection: str = NIBIS_COLLECTION
) -> int:
    """
    Index a single document into Qdrant.

    Args:
        doc: Parsed NiBiS document (metadata + path to the PDF on disk).
        qdrant: Qdrant service wrapper used for the upsert.
        collection: Target collection name.

    Returns:
        Number of indexed chunks. Returns 0 on any failure — errors are
        printed, never raised, so one bad PDF cannot abort a batch run.
    """
    # 1. Read the PDF
    try:
        with open(doc.file_path, "rb") as f:
            pdf_content = f.read()
    except Exception as e:
        print(f" FEHLER beim Lesen: {e}")
        return 0

    # 2. Extract text
    try:
        text = extract_text_from_pdf(pdf_content)
        if not text or len(text.strip()) < 50:
            # Bug fix: len(text) raised TypeError here whenever
            # extract_text_from_pdf returned None (the "not text" branch).
            print(f" Warnung: Wenig Text extrahiert ({len(text) if text else 0} Zeichen)")
            return 0
    except Exception as e:
        print(f" FEHLER bei PDF-Extraktion: {e}")
        return 0

    # 3. Chunking
    chunks = chunk_text(text)
    if not chunks:
        return 0

    # 4. Generate embeddings
    try:
        embeddings = await generate_embeddings(chunks)
    except Exception as e:
        print(f" FEHLER bei Embedding-Generierung: {e}")
        return 0

    # 5. Index into Qdrant
    points = []
    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
        # NOTE(review): Qdrant itself only accepts unsigned ints or UUIDs as
        # point ids; this assumes QdrantService translates/accepts string
        # ids — verify against its upsert implementation.
        point_id = f"{doc.id}_chunk_{i}"

        payload = {
            "doc_id": doc.id,
            "chunk_index": i,
            "text": chunk,
            "year": doc.year,
            "subject": doc.subject,
            "niveau": doc.niveau,
            "task_number": doc.task_number,
            "doc_type": doc.doc_type,
            "bundesland": doc.bundesland,
            "variant": doc.variant,
            "source": "nibis",
            "training_allowed": True,  # NiBiS data may be used for training
        }

        points.append({
            "id": point_id,
            "vector": embedding,
            "payload": payload,
        })

    # Batch upload
    try:
        await qdrant.upsert_points(collection, points)
        return len(points)
    except Exception as e:
        print(f" FEHLER beim Qdrant-Upload: {e}")
        return 0
|
|
|
|
|
|
async def run_ingestion(
    ewh_only: bool = True,
    dry_run: bool = False,
    year_filter: Optional[int] = None,
    subject_filter: Optional[str] = None,
) -> Dict:
    """
    Main entry point of the ingestion pipeline.

    Unpacks ZIP archives, discovers documents, and indexes them into Qdrant,
    printing a progress report to stdout along the way.

    Args:
        ewh_only: Index only expectation horizons (EWH).
        dry_run: Analyze only, do not index.
        year_filter: Optional — restrict to a single year.
        subject_filter: Optional — restrict to subjects containing this
            substring (case-insensitive).

    Returns:
        Statistics dict about the ingestion run (counts, errors, timings).
    """
    stats = {
        "started_at": datetime.now().isoformat(),
        "zip_extracted": 0,
        "documents_found": 0,
        "documents_indexed": 0,
        "chunks_created": 0,
        "errors": [],          # one "<filename>: <error>" string per failure
        "by_year": {},         # year (str) -> document count
        "by_subject": {},      # subject long name -> document count
    }

    print("=" * 60)
    print("NiBiS Ingestion Pipeline")
    print("=" * 60)

    # 1. Unpack ZIP archives
    print("\n1. Entpacke ZIP-Dateien...")
    extracted = extract_zip_files(DOCS_BASE_PATH)
    stats["zip_extracted"] = len(extracted)

    # 2. Discover documents
    print("\n2. Suche Dokumente...")
    documents = discover_documents(DOCS_BASE_PATH, ewh_only=ewh_only)

    # Apply optional filters
    if year_filter:
        documents = [d for d in documents if d.year == year_filter]
    if subject_filter:
        documents = [d for d in documents if subject_filter.lower() in d.subject.lower()]

    stats["documents_found"] = len(documents)

    print(f"\n Gefunden: {len(documents)} Dokumente")

    # Aggregate counts per year and per subject
    for doc in documents:
        year_key = str(doc.year)
        stats["by_year"][year_key] = stats["by_year"].get(year_key, 0) + 1
        stats["by_subject"][doc.subject] = stats["by_subject"].get(doc.subject, 0) + 1

    print("\n Nach Jahr:")
    for year, count in sorted(stats["by_year"].items()):
        print(f" {year}: {count}")

    print("\n Nach Fach (Top 10):")
    sorted_subjects = sorted(stats["by_subject"].items(), key=lambda x: -x[1])[:10]
    for subject, count in sorted_subjects:
        print(f" {subject}: {count}")

    # Dry run stops before touching Qdrant.
    if dry_run:
        print("\n[DRY RUN] Keine Indexierung durchgeführt.")
        return stats

    # 3. Initialize Qdrant
    vector_size = get_vector_size()
    print(f"\n3. Initialisiere Qdrant...")
    print(f" Embedding Backend: {EMBEDDING_BACKEND}")
    print(f" Vektorgröße: {vector_size} Dimensionen")
    qdrant = QdrantService()
    await qdrant.ensure_collection(NIBIS_COLLECTION, vector_size=vector_size)

    # 4. Index documents (per-document errors are recorded, not fatal)
    print("\n4. Indexiere Dokumente...")
    for i, doc in enumerate(documents, 1):
        print(f" [{i}/{len(documents)}] {doc.raw_filename}...")

        try:
            chunk_count = await index_document_to_qdrant(doc, qdrant)
            if chunk_count > 0:
                stats["documents_indexed"] += 1
                stats["chunks_created"] += chunk_count
                print(f" -> {chunk_count} Chunks indexiert")
        except Exception as e:
            error_msg = f"{doc.raw_filename}: {str(e)}"
            stats["errors"].append(error_msg)
            print(f" FEHLER: {e}")

    stats["completed_at"] = datetime.now().isoformat()

    # 5. Summary
    print("\n" + "=" * 60)
    print("ZUSAMMENFASSUNG")
    print("=" * 60)
    print(f" ZIP-Dateien entpackt: {stats['zip_extracted']}")
    print(f" Dokumente gefunden: {stats['documents_found']}")
    print(f" Dokumente indexiert: {stats['documents_indexed']}")
    print(f" Chunks erstellt: {stats['chunks_created']}")
    print(f" Fehler: {len(stats['errors'])}")

    return stats
|
|
|
|
|
|
def generate_manifest(documents: List[NiBiSDocument], output_path: Path) -> None:
    """Write a JSON manifest describing every discovered document."""
    payload = {
        "generated_at": datetime.now().isoformat(),
        "total_documents": len(documents),
        "documents": [entry.to_dict() for entry in documents],
    }

    with open(output_path, "w", encoding="utf-8") as fh:
        json.dump(payload, fh, indent=2, ensure_ascii=False)

    print(f"Manifest geschrieben: {output_path}")
|
|
|
|
|
|
# CLI Entry Point
if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser(description="NiBiS Ingestion Pipeline")
    cli.add_argument("--dry-run", action="store_true", help="Nur analysieren")
    cli.add_argument("--year", type=int, help="Filter nach Jahr")
    cli.add_argument("--subject", type=str, help="Filter nach Fach")
    cli.add_argument("--all-docs", action="store_true", help="Alle Dokumente (nicht nur EWH)")
    cli.add_argument("--manifest", type=str, help="Manifest-Datei erstellen")

    opts = cli.parse_args()
    include_all = opts.all_docs

    if opts.manifest:
        # Manifest mode: only discover and describe, never index.
        found_docs = discover_documents(DOCS_BASE_PATH, ewh_only=not include_all)
        generate_manifest(found_docs, Path(opts.manifest))
    else:
        # Default mode: run the full ingestion pipeline.
        asyncio.run(
            run_ingestion(
                ewh_only=not include_all,
                dry_run=opts.dry_run,
                year_filter=opts.year,
                subject_filter=opts.subject,
            )
        )
|