"""NiBiS ingestion pipeline.

Automated processing of Abitur expectation horizons ("Erwartungshorizonte")
from Lower Saxony (NiBiS). Supports:

- Multiple exam years (2016, 2017, 2024, 2025, ...)
- Both naming conventions (old: *Lehrer/*L.pdf, new: *_EWH.pdf)
- Automatic extraction of downloaded ZIP archives
- Flexible extension to other German states via the ``bundesland`` field
"""

import os
import re
import zipfile
import hashlib
import json
from pathlib import Path
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass, asdict
from datetime import datetime
import asyncio

# Local imports (project modules; only available inside the repo)
from eh_pipeline import chunk_text, generate_embeddings, extract_text_from_pdf, get_vector_size, EMBEDDING_BACKEND
from qdrant_service import QdrantService

# Configuration
DOCS_BASE_PATH = Path("/Users/benjaminadmin/projekte/breakpilot-pwa/docs")
ZA_DOWNLOAD_DIRS = ["za-download", "za-download-2", "za-download-3"]

# Qdrant collection for NiBiS data (kept separate from user EH uploads)
NIBIS_COLLECTION = "bp_nibis_eh"


@dataclass
class NiBiSDocument:
    """Structured representation of a single NiBiS document."""

    id: str
    file_path: str
    year: int
    subject: str
    niveau: str  # eA, gA, EA, GA
    task_number: Optional[int]
    doc_type: str  # EWH, Aufgabe, Material, GBU, etc.
    bundesland: str
    source_dir: str
    file_hash: str
    extracted_at: datetime
    # Metadata parsed from the filename
    raw_filename: str
    variant: Optional[str] = None  # BG, Tech, Wirt, etc.

    def to_dict(self) -> dict:
        """Return a JSON-serializable dict (datetime -> ISO-8601 string)."""
        d = asdict(self)
        d['extracted_at'] = d['extracted_at'].isoformat()
        return d


# Subject mapping (short form -> long form)
SUBJECT_MAPPING = {
    "deutsch": "Deutsch",
    "englisch": "Englisch",
    "englischbg": "Englisch (Berufliches Gymnasium)",
    "mathe": "Mathematik",
    "mathebg": "Mathematik (Berufliches Gymnasium)",
    "mathezwb": "Mathematik (Zweiter Bildungsweg)",
    "informatik": "Informatik",
    "biologie": "Biologie",
    "chemie": "Chemie",
    "physik": "Physik",
    "geschichte": "Geschichte",
    "erdkunde": "Erdkunde/Geografie",
    "kunst": "Kunst",
    "musik": "Musik",
    "sport": "Sport",
    "latein": "Latein",
    "griechisch": "Griechisch",
    "französisch": "Französisch",
    "franzîsisch": "Französisch",  # encoding glitch observed in the 2017 batch
    "spanisch": "Spanisch",
    "kathreligion": "Katholische Religion",
    "evreligion": "Evangelische Religion",
    "wertenormen": "Werte und Normen",
    "brc": "Betriebswirtschaft mit Rechnungswesen/Controlling",
    "bvw": "Betriebswirtschaft mit Rechnungswesen",
    "gespfl": "Gesundheit-Pflege",
}

# Niveau mapping (lowercase token -> canonical label)
NIVEAU_MAPPING = {
    "ea": "eA",  # erhöhtes Anforderungsniveau (advanced level)
    "ga": "gA",  # grundlegendes Anforderungsniveau (basic level)
    "neuga": "gA (neu einsetzend)",
    "neuea": "eA (neu einsetzend)",
}


def compute_file_hash(file_path: Path) -> str:
    """Return the first 16 hex chars of the file's SHA-256 digest.

    Reads in 8 KiB chunks so large PDFs are never loaded whole.
    """
    sha256 = hashlib.sha256()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            sha256.update(chunk)
    return sha256.hexdigest()[:16]


def extract_zip_files(base_path: Path) -> List[Path]:
    """Extract every ZIP archive found in the za-download directories.

    Archives whose target directory already exists are skipped (treated as
    previously extracted) but still included in the returned list.

    Args:
        base_path: Base path to the docs/ directory.

    Returns:
        List of directories containing extracted archive contents.
    """
    extracted = []
    for za_dir in ZA_DOWNLOAD_DIRS:
        za_path = base_path / za_dir
        if not za_path.exists():
            continue
        for zip_file in za_path.glob("*.zip"):
            # Target directory = archive name without the .zip suffix
            target_dir = za_path / zip_file.stem
            if target_dir.exists():
                print(f" Bereits entpackt: {zip_file.name} -> {target_dir.name}/")
                extracted.append(target_dir)
                continue
            print(f" Entpacke: {zip_file.name}...")
            try:
                # NOTE(review): extractall trusts archive member names; fine for
                # self-downloaded NiBiS ZIPs — do not point this at untrusted input.
                with zipfile.ZipFile(zip_file, 'r') as zf:
                    zf.extractall(target_dir)
                print(f" -> {len(list(target_dir.rglob('*')))} Dateien extrahiert")
                extracted.append(target_dir)
            except Exception as e:
                print(f" FEHLER: {e}")
    return extracted


def parse_filename_old_format(filename: str, file_path: Path) -> Optional[Dict]:
    """Parse the old naming convention (2016, 2017).

    Layout:  {Jahr}{Fach}{Niveau}Lehrer/{Jahr}{Fach}{Niveau}A{Nr}L.pdf
    Example: 2016DeutschEALehrer/2016DeutschEAA1L.pdf

    Returns:
        Metadata dict, or None when the filename does not match.
    """
    # Pattern for teacher files.
    # NOTE(review): the greedy subject group backtracks from the longest match,
    # so "NeuGA"/"NeuEA" filenames can be mis-read as plain "GA"/"EA" (the
    # "Neu" prefix absorbed into the subject). Verify against the actual
    # 2016/2017 file lists before relying on the neu-einsetzend mapping here.
    pattern = r"(\d{4})([A-Za-zäöüÄÖÜ]+)(EA|GA|NeuGA|NeuEA)(?:Lehrer)?.*?(?:A(\d+)|Aufg(\d+))?L?\.pdf$"
    match = re.search(pattern, filename, re.IGNORECASE)
    if not match:
        return None

    year = int(match.group(1))
    subject_raw = match.group(2).lower()
    niveau = match.group(3).upper()
    task_num = match.group(4) or match.group(5)

    # Teacher documents carry the expectation horizon (EWH)
    is_ewh = "lehrer" in str(file_path).lower() or filename.endswith("L.pdf")

    # Extract variant (Tech, Wirt, CAS, GTR, ...) from the full path
    variant = None
    variant_patterns = ["Tech", "Wirt", "CAS", "GTR", "Pflicht", "BG", "mitExp", "ohneExp"]
    for v in variant_patterns:
        if v.lower() in str(file_path).lower():
            variant = v
            break

    return {
        "year": year,
        "subject": subject_raw,
        "niveau": NIVEAU_MAPPING.get(niveau.lower(), niveau),
        "task_number": int(task_num) if task_num else None,
        "doc_type": "EWH" if is_ewh else "Aufgabe",
        "variant": variant,
    }


def parse_filename_new_format(filename: str, file_path: Path) -> Optional[Dict]:
    """Parse the new naming convention (2024, 2025).

    Layout:  {Jahr}_{Fach}_{niveau}_{Nr}_EWH.pdf
    Example: 2025_Deutsch_eA_I_EWH.pdf

    Returns:
        Metadata dict, or None when the filename does not match.
    """
    # Pattern for new-style files
    pattern = r"(\d{4})_([A-Za-zäöüÄÖÜ]+)(?:BG)?_(eA|gA)(?:_([IVX\d]+))?(?:_(.+))?\.pdf$"
    match = re.search(pattern, filename, re.IGNORECASE)
    if not match:
        return None

    year = int(match.group(1))
    subject_raw = match.group(2).lower()
    niveau = match.group(3)
    task_id = match.group(4)
    suffix = match.group(5) or ""

    # Task number from roman numerals (or plain digits)
    task_num = None
    if task_id:
        roman_map = {"I": 1, "II": 2, "III": 3, "IV": 4, "V": 5}
        # BUGFIX: the regex matches case-insensitively, so task_id may arrive
        # lowercase ("i", "ii"); normalize before the roman-numeral lookup,
        # otherwise the task number was silently dropped.
        task_num = roman_map.get(task_id.upper()) or (int(task_id) if task_id.isdigit() else None)

    # Document type
    is_ewh = "EWH" in filename or "ewh" in filename.lower()

    # Special document types, keyed off the filename suffix
    doc_type = "EWH" if is_ewh else "Aufgabe"
    if "Material" in suffix:
        doc_type = "Material"
    elif "GBU" in suffix:
        doc_type = "GBU"
    elif "Ergebnis" in suffix:
        doc_type = "Ergebnis"
    elif "Bewertungsbogen" in suffix:
        doc_type = "Bewertungsbogen"
    elif "HV" in suffix:
        doc_type = "Hörverstehen"
    elif "ME" in suffix:
        doc_type = "Mediation"

    # BG variant (Berufliches Gymnasium); path-level experiment marker wins
    variant = "BG" if "BG" in filename else None
    if "mitExp" in str(file_path):
        variant = "mitExp"

    return {
        "year": year,
        "subject": subject_raw,
        "niveau": NIVEAU_MAPPING.get(niveau.lower(), niveau),
        "task_number": task_num,
        "doc_type": doc_type,
        "variant": variant,
    }


def discover_documents(base_path: Path, ewh_only: bool = True) -> List[NiBiSDocument]:
    """Find all relevant documents in the za-download directories.

    Tries the new filename format first, then falls back to the old one;
    files matching neither are skipped silently.

    Args:
        base_path: Base path to docs/
        ewh_only: Only expectation horizons (no task sheets)

    Returns:
        List of parsed NiBiSDocument records.
    """
    documents = []

    for za_dir in ZA_DOWNLOAD_DIRS:
        za_path = base_path / za_dir
        if not za_path.exists():
            continue

        print(f"\nSuche in {za_dir}...")

        for pdf_file in za_path.rglob("*.pdf"):
            filename = pdf_file.name

            # Try both formats, newest first
            parsed = parse_filename_new_format(filename, pdf_file)
            if not parsed:
                parsed = parse_filename_old_format(filename, pdf_file)
            if not parsed:
                # Unknown format
                continue

            # Filter: EWH only?
            if ewh_only and parsed["doc_type"] != "EWH":
                continue

            # PERF: hash once per file — previously computed twice
            # (for the id and for the file_hash field), re-reading the PDF.
            file_hash = compute_file_hash(pdf_file)
            doc_id = f"nibis_{parsed['year']}_{parsed['subject']}_{parsed['niveau']}_{parsed.get('task_number', 0)}_{file_hash}"

            doc = NiBiSDocument(
                id=doc_id,
                file_path=str(pdf_file),
                year=parsed["year"],
                subject=SUBJECT_MAPPING.get(parsed["subject"], parsed["subject"].capitalize()),
                niveau=parsed["niveau"],
                task_number=parsed.get("task_number"),
                doc_type=parsed["doc_type"],
                bundesland="NI",  # Lower Saxony
                source_dir=za_dir,
                file_hash=file_hash,
                extracted_at=datetime.now(),
                raw_filename=filename,
                variant=parsed.get("variant"),
            )
            documents.append(doc)

    return documents


async def index_document_to_qdrant(
    doc: NiBiSDocument,
    qdrant: QdrantService,
    collection: str = NIBIS_COLLECTION
) -> int:
    """Index a single document into Qdrant.

    Steps: read PDF -> extract text -> chunk -> embed -> upsert.
    All failures are reported on stdout; the function never raises.

    Returns:
        Number of chunks indexed (0 on any failure).
    """
    # 1. Read the PDF from disk
    try:
        with open(doc.file_path, "rb") as f:
            pdf_content = f.read()
    except Exception as e:
        print(f" FEHLER beim Lesen: {e}")
        return 0

    # 2. Extract text
    try:
        text = extract_text_from_pdf(pdf_content)
        if not text or len(text.strip()) < 50:
            # BUGFIX: guard len() — if extract_text_from_pdf returns None,
            # len(None) raised TypeError here, which the except below
            # swallowed and mis-reported as "FEHLER bei PDF-Extraktion".
            print(f" Warnung: Wenig Text extrahiert ({len(text) if text else 0} Zeichen)")
            return 0
    except Exception as e:
        print(f" FEHLER bei PDF-Extraktion: {e}")
        return 0

    # 3. Chunking
    chunks = chunk_text(text)
    if not chunks:
        return 0

    # 4. Generate embeddings
    try:
        embeddings = await generate_embeddings(chunks)
    except Exception as e:
        print(f" FEHLER bei Embedding-Generierung: {e}")
        return 0

    # 5. Build Qdrant points
    points = []
    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
        # NOTE(review): raw Qdrant requires point IDs to be UUIDs or unsigned
        # ints; assumes QdrantService maps these string IDs — confirm.
        point_id = f"{doc.id}_chunk_{i}"
        payload = {
            "doc_id": doc.id,
            "chunk_index": i,
            "text": chunk,
            "year": doc.year,
            "subject": doc.subject,
            "niveau": doc.niveau,
            "task_number": doc.task_number,
            "doc_type": doc.doc_type,
            "bundesland": doc.bundesland,
            "variant": doc.variant,
            "source": "nibis",
            "training_allowed": True,  # NiBiS data may be used for training
        }
        points.append({
            "id": point_id,
            "vector": embedding,
            "payload": payload,
        })

    # Batch upload
    try:
        await qdrant.upsert_points(collection, points)
        return len(points)
    except Exception as e:
        print(f" FEHLER beim Qdrant-Upload: {e}")
        return 0


async def run_ingestion(
    ewh_only: bool = True,
    dry_run: bool = False,
    year_filter: Optional[int] = None,
    subject_filter: Optional[str] = None,
) -> Dict:
    """Main entry point for the ingestion pipeline.

    Args:
        ewh_only: Only index expectation horizons
        dry_run: Analyze only, do not index
        year_filter: Optional — restrict to a single year
        subject_filter: Optional — restrict to a single subject (substring match)

    Returns:
        Statistics dict describing the ingestion run.
    """
    stats = {
        "started_at": datetime.now().isoformat(),
        "zip_extracted": 0,
        "documents_found": 0,
        "documents_indexed": 0,
        "chunks_created": 0,
        "errors": [],
        "by_year": {},
        "by_subject": {},
    }

    print("=" * 60)
    print("NiBiS Ingestion Pipeline")
    print("=" * 60)

    # 1. Extract ZIP archives
    print("\n1. Entpacke ZIP-Dateien...")
    extracted = extract_zip_files(DOCS_BASE_PATH)
    stats["zip_extracted"] = len(extracted)

    # 2. Discover documents
    print("\n2. Suche Dokumente...")
    documents = discover_documents(DOCS_BASE_PATH, ewh_only=ewh_only)

    # Apply filters
    if year_filter:
        documents = [d for d in documents if d.year == year_filter]
    if subject_filter:
        documents = [d for d in documents if subject_filter.lower() in d.subject.lower()]

    stats["documents_found"] = len(documents)
    print(f"\n Gefunden: {len(documents)} Dokumente")

    # Per-year / per-subject counts
    for doc in documents:
        year_key = str(doc.year)
        stats["by_year"][year_key] = stats["by_year"].get(year_key, 0) + 1
        stats["by_subject"][doc.subject] = stats["by_subject"].get(doc.subject, 0) + 1

    print("\n Nach Jahr:")
    for year, count in sorted(stats["by_year"].items()):
        print(f" {year}: {count}")

    print("\n Nach Fach (Top 10):")
    sorted_subjects = sorted(stats["by_subject"].items(), key=lambda x: -x[1])[:10]
    for subject, count in sorted_subjects:
        print(f" {subject}: {count}")

    if dry_run:
        print("\n[DRY RUN] Keine Indexierung durchgeführt.")
        return stats

    # 3. Initialize Qdrant
    vector_size = get_vector_size()
    print(f"\n3. Initialisiere Qdrant...")
    print(f" Embedding Backend: {EMBEDDING_BACKEND}")
    print(f" Vektorgröße: {vector_size} Dimensionen")
    qdrant = QdrantService()
    await qdrant.ensure_collection(NIBIS_COLLECTION, vector_size=vector_size)

    # 4. Index documents one by one (errors are collected, not fatal)
    print("\n4. Indexiere Dokumente...")
    for i, doc in enumerate(documents, 1):
        print(f" [{i}/{len(documents)}] {doc.raw_filename}...")
        try:
            chunk_count = await index_document_to_qdrant(doc, qdrant)
            if chunk_count > 0:
                stats["documents_indexed"] += 1
                stats["chunks_created"] += chunk_count
                print(f" -> {chunk_count} Chunks indexiert")
        except Exception as e:
            error_msg = f"{doc.raw_filename}: {str(e)}"
            stats["errors"].append(error_msg)
            print(f" FEHLER: {e}")

    stats["completed_at"] = datetime.now().isoformat()

    # 5. Summary
    print("\n" + "=" * 60)
    print("ZUSAMMENFASSUNG")
    print("=" * 60)
    print(f" ZIP-Dateien entpackt: {stats['zip_extracted']}")
    print(f" Dokumente gefunden: {stats['documents_found']}")
    print(f" Dokumente indexiert: {stats['documents_indexed']}")
    print(f" Chunks erstellt: {stats['chunks_created']}")
    print(f" Fehler: {len(stats['errors'])}")

    return stats


def generate_manifest(documents: List[NiBiSDocument], output_path: Path) -> None:
    """Write a JSON manifest of all discovered documents to output_path."""
    manifest = {
        "generated_at": datetime.now().isoformat(),
        "total_documents": len(documents),
        "documents": [doc.to_dict() for doc in documents],
    }
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2, ensure_ascii=False)
    print(f"Manifest geschrieben: {output_path}")


# CLI entry point
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="NiBiS Ingestion Pipeline")
    parser.add_argument("--dry-run", action="store_true", help="Nur analysieren")
    parser.add_argument("--year", type=int, help="Filter nach Jahr")
    parser.add_argument("--subject", type=str, help="Filter nach Fach")
    parser.add_argument("--all-docs", action="store_true", help="Alle Dokumente (nicht nur EWH)")
    parser.add_argument("--manifest", type=str, help="Manifest-Datei erstellen")
    args = parser.parse_args()

    if args.manifest:
        # Manifest only: discover and dump, no indexing
        docs = discover_documents(DOCS_BASE_PATH, ewh_only=not args.all_docs)
        generate_manifest(docs, Path(args.manifest))
    else:
        # Full ingestion run
        asyncio.run(run_ingestion(
            ewh_only=not args.all_docs,
            dry_run=args.dry_run,
            year_filter=args.year,
            subject_filter=args.subject,
        ))