""" NiBiS Ingestion Pipeline Automatisierte Verarbeitung von Abitur-Erwartungshorizonten aus Niedersachsen. Unterstützt: - Mehrere Jahre (2016, 2017, 2024, 2025, ...) - Verschiedene Namenskonventionen (alt: *Lehrer/*L.pdf, neu: *_EWH.pdf) - Automatisches Entpacken von ZIP-Dateien - Flexible Erweiterung für andere Bundesländer """ import os import zipfile import hashlib import json from pathlib import Path from typing import List, Dict, Optional from dataclasses import dataclass, asdict from datetime import datetime import asyncio # Local imports from eh_pipeline import chunk_text, generate_embeddings, extract_text_from_pdf, get_vector_size, EMBEDDING_BACKEND from qdrant_service import QdrantService from nibis_parsers import parse_filename_old_format, parse_filename_new_format # Configuration DOCS_BASE_PATH = Path("/Users/benjaminadmin/projekte/breakpilot-pwa/docs") ZA_DOWNLOAD_DIRS = ["za-download", "za-download-2", "za-download-3"] # Qdrant collection for NiBiS data (separate from user EH) NIBIS_COLLECTION = "bp_nibis_eh" @dataclass class NiBiSDocument: """Strukturierte Repräsentation eines NiBiS-Dokuments.""" id: str file_path: str year: int subject: str niveau: str # eA, gA, EA, GA task_number: Optional[int] doc_type: str # EWH, Aufgabe, Material, GBU, etc. bundesland: str source_dir: str file_hash: str extracted_at: datetime # Metadaten aus Dateinamen raw_filename: str variant: Optional[str] = None # BG, Tech, Wirt, etc. def to_dict(self) -> dict: d = asdict(self) d['extracted_at'] = d['extracted_at'].isoformat() return d # Fach-Mapping (Kurzform -> Langform) SUBJECT_MAPPING = { "deutsch": "Deutsch", "englisch": "Englisch", "englischbg": "Englisch (Berufliches Gymnasium)", "mathe": "Mathematik", "mathebg": "Mathematik (Berufliches Gymnasium)", "mathezwb": "Mathematik (Zweiter Bildungsweg)", "informatik": "Informatik", "biologie": "Biologie", "chemie": "Chemie", "physik": "Physik", "geschichte": "Geschichte", "erdkunde": "Erdkunde/Geografie", "kunst": "Kunst", "musik": "Musik", "sport": "Sport", "latein": "Latein", "griechisch": "Griechisch", "französisch": "Französisch", "franzîsisch": "Französisch", # Encoding-Problem in 2017 "spanisch": "Spanisch", "kathreligion": "Katholische Religion", "evreligion": "Evangelische Religion", "wertenormen": "Werte und Normen", "brc": "Betriebswirtschaft mit Rechnungswesen/Controlling", "bvw": "Betriebswirtschaft mit Rechnungswesen", "gespfl": "Gesundheit-Pflege", } def compute_file_hash(file_path: Path) -> str: """Berechnet SHA-256 Hash einer Datei.""" sha256 = hashlib.sha256() with open(file_path, "rb") as f: for chunk in iter(lambda: f.read(8192), b""): sha256.update(chunk) return sha256.hexdigest()[:16] def extract_zip_files(base_path: Path) -> List[Path]: """Entpackt alle ZIP-Dateien in den za-download Verzeichnissen.""" extracted = [] for za_dir in ZA_DOWNLOAD_DIRS: za_path = base_path / za_dir if not za_path.exists(): continue for zip_file in za_path.glob("*.zip"): # Zielverzeichnis = Name ohne .zip target_dir = za_path / zip_file.stem if target_dir.exists(): print(f" Bereits entpackt: {zip_file.name} -> {target_dir.name}/") extracted.append(target_dir) continue print(f" Entpacke: {zip_file.name}...") try: with zipfile.ZipFile(zip_file, 'r') as zf: zf.extractall(target_dir) print(f" -> {len(list(target_dir.rglob('*')))} Dateien extrahiert") extracted.append(target_dir) except Exception as e: print(f" FEHLER: {e}") return extracted def discover_documents(base_path: Path, ewh_only: bool = True) -> List[NiBiSDocument]: """ Findet alle relevanten Dokumente in den za-download Verzeichnissen. Args: base_path: Basis-Pfad zu docs/ ewh_only: Nur Erwartungshorizonte (keine Aufgaben) """ documents = [] for za_dir in ZA_DOWNLOAD_DIRS: za_path = base_path / za_dir if not za_path.exists(): continue print(f"\nSuche in {za_dir}...") for pdf_file in za_path.rglob("*.pdf"): filename = pdf_file.name # Versuche beide Formate parsed = parse_filename_new_format(filename, pdf_file) if not parsed: parsed = parse_filename_old_format(filename, pdf_file) if not parsed: # Unbekanntes Format continue # Filter: Nur EWH? if ewh_only and parsed["doc_type"] != "EWH": continue # Erstelle Dokument doc_id = f"nibis_{parsed['year']}_{parsed['subject']}_{parsed['niveau']}_{parsed.get('task_number', 0)}_{compute_file_hash(pdf_file)}" doc = NiBiSDocument( id=doc_id, file_path=str(pdf_file), year=parsed["year"], subject=SUBJECT_MAPPING.get(parsed["subject"], parsed["subject"].capitalize()), niveau=parsed["niveau"], task_number=parsed.get("task_number"), doc_type=parsed["doc_type"], bundesland="NI", # Niedersachsen source_dir=za_dir, file_hash=compute_file_hash(pdf_file), extracted_at=datetime.now(), raw_filename=filename, variant=parsed.get("variant"), ) documents.append(doc) return documents async def index_document_to_qdrant( doc: NiBiSDocument, qdrant: QdrantService, collection: str = NIBIS_COLLECTION ) -> int: """ Indexiert ein einzelnes Dokument in Qdrant. Returns: Anzahl der indexierten Chunks """ # 1. PDF lesen try: with open(doc.file_path, "rb") as f: pdf_content = f.read() except Exception as e: print(f" FEHLER beim Lesen: {e}") return 0 # 2. Text extrahieren try: text = extract_text_from_pdf(pdf_content) if not text or len(text.strip()) < 50: print(f" Warnung: Wenig Text extrahiert ({len(text)} Zeichen)") return 0 except Exception as e: print(f" FEHLER bei PDF-Extraktion: {e}") return 0 # 3. Chunking chunks = chunk_text(text) if not chunks: return 0 # 4. Embeddings generieren try: embeddings = await generate_embeddings(chunks) except Exception as e: print(f" FEHLER bei Embedding-Generierung: {e}") return 0 # 5. In Qdrant indexieren points = [] for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)): point_id = f"{doc.id}_chunk_{i}" payload = { "doc_id": doc.id, "chunk_index": i, "text": chunk, "year": doc.year, "subject": doc.subject, "niveau": doc.niveau, "task_number": doc.task_number, "doc_type": doc.doc_type, "bundesland": doc.bundesland, "variant": doc.variant, "source": "nibis", "training_allowed": True, # NiBiS-Daten dürfen für Training genutzt werden } points.append({ "id": point_id, "vector": embedding, "payload": payload, }) # Batch-Upload try: await qdrant.upsert_points(collection, points) return len(points) except Exception as e: print(f" FEHLER beim Qdrant-Upload: {e}") return 0 async def run_ingestion( ewh_only: bool = True, dry_run: bool = False, year_filter: Optional[int] = None, subject_filter: Optional[str] = None, ) -> Dict: """ Hauptfunktion für die Ingestion-Pipeline. Args: ewh_only: Nur Erwartungshorizonte indexieren dry_run: Nur analysieren, nicht indexieren year_filter: Optional - nur bestimmtes Jahr subject_filter: Optional - nur bestimmtes Fach Returns: Statistiken über die Ingestion """ stats = { "started_at": datetime.now().isoformat(), "zip_extracted": 0, "documents_found": 0, "documents_indexed": 0, "chunks_created": 0, "errors": [], "by_year": {}, "by_subject": {}, } print("=" * 60) print("NiBiS Ingestion Pipeline") print("=" * 60) # 1. ZIP-Dateien entpacken print("\n1. Entpacke ZIP-Dateien...") extracted = extract_zip_files(DOCS_BASE_PATH) stats["zip_extracted"] = len(extracted) # 2. Dokumente finden print("\n2. Suche Dokumente...") documents = discover_documents(DOCS_BASE_PATH, ewh_only=ewh_only) # Filter anwenden if year_filter: documents = [d for d in documents if d.year == year_filter] if subject_filter: documents = [d for d in documents if subject_filter.lower() in d.subject.lower()] stats["documents_found"] = len(documents) print(f"\n Gefunden: {len(documents)} Dokumente") # Statistiken nach Jahr/Fach for doc in documents: year_key = str(doc.year) stats["by_year"][year_key] = stats["by_year"].get(year_key, 0) + 1 stats["by_subject"][doc.subject] = stats["by_subject"].get(doc.subject, 0) + 1 print("\n Nach Jahr:") for year, count in sorted(stats["by_year"].items()): print(f" {year}: {count}") print("\n Nach Fach (Top 10):") sorted_subjects = sorted(stats["by_subject"].items(), key=lambda x: -x[1])[:10] for subject, count in sorted_subjects: print(f" {subject}: {count}") if dry_run: print("\n[DRY RUN] Keine Indexierung durchgeführt.") return stats # 3. Qdrant initialisieren vector_size = get_vector_size() print(f"\n3. Initialisiere Qdrant...") print(f" Embedding Backend: {EMBEDDING_BACKEND}") print(f" Vektorgröße: {vector_size} Dimensionen") qdrant = QdrantService() await qdrant.ensure_collection(NIBIS_COLLECTION, vector_size=vector_size) # 4. Dokumente indexieren print("\n4. Indexiere Dokumente...") for i, doc in enumerate(documents, 1): print(f" [{i}/{len(documents)}] {doc.raw_filename}...") try: chunk_count = await index_document_to_qdrant(doc, qdrant) if chunk_count > 0: stats["documents_indexed"] += 1 stats["chunks_created"] += chunk_count print(f" -> {chunk_count} Chunks indexiert") except Exception as e: error_msg = f"{doc.raw_filename}: {str(e)}" stats["errors"].append(error_msg) print(f" FEHLER: {e}") stats["completed_at"] = datetime.now().isoformat() # 5. Zusammenfassung print("\n" + "=" * 60) print("ZUSAMMENFASSUNG") print("=" * 60) print(f" ZIP-Dateien entpackt: {stats['zip_extracted']}") print(f" Dokumente gefunden: {stats['documents_found']}") print(f" Dokumente indexiert: {stats['documents_indexed']}") print(f" Chunks erstellt: {stats['chunks_created']}") print(f" Fehler: {len(stats['errors'])}") return stats def generate_manifest(documents: List[NiBiSDocument], output_path: Path) -> None: """Erstellt ein Manifest aller gefundenen Dokumente.""" manifest = { "generated_at": datetime.now().isoformat(), "total_documents": len(documents), "documents": [doc.to_dict() for doc in documents], } with open(output_path, "w", encoding="utf-8") as f: json.dump(manifest, f, indent=2, ensure_ascii=False) print(f"Manifest geschrieben: {output_path}") # CLI Entry Point if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description="NiBiS Ingestion Pipeline") parser.add_argument("--dry-run", action="store_true", help="Nur analysieren") parser.add_argument("--year", type=int, help="Filter nach Jahr") parser.add_argument("--subject", type=str, help="Filter nach Fach") parser.add_argument("--all-docs", action="store_true", help="Alle Dokumente (nicht nur EWH)") parser.add_argument("--manifest", type=str, help="Manifest-Datei erstellen") args = parser.parse_args() # Manifest erstellen? if args.manifest: docs = discover_documents(DOCS_BASE_PATH, ewh_only=not args.all_docs) generate_manifest(docs, Path(args.manifest)) else: # Ingestion ausführen asyncio.run(run_ingestion( ewh_only=not args.all_docs, dry_run=args.dry_run, year_filter=args.year, subject_filter=args.subject, ))