Files
breakpilot-lehrer/klausur-service/backend/nibis_ingestion.py
Benjamin Boenisch 5a31f52310 Initial commit: breakpilot-lehrer - Lehrer KI Platform
Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website,
Klausur-Service, School-Service, Voice-Service, Geo-Service,
BreakPilot Drive, Agent-Core

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 23:47:26 +01:00

517 lines
16 KiB
Python

"""
NiBiS Ingestion Pipeline
Automatisierte Verarbeitung von Abitur-Erwartungshorizonten aus Niedersachsen.
Unterstützt:
- Mehrere Jahre (2016, 2017, 2024, 2025, ...)
- Verschiedene Namenskonventionen (alt: *Lehrer/*L.pdf, neu: *_EWH.pdf)
- Automatisches Entpacken von ZIP-Dateien
- Flexible Erweiterung für andere Bundesländer
"""
import os
import re
import zipfile
import hashlib
import json
from pathlib import Path
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass, asdict
from datetime import datetime
import asyncio
# Local imports
from eh_pipeline import chunk_text, generate_embeddings, extract_text_from_pdf, get_vector_size, EMBEDDING_BACKEND
from qdrant_service import QdrantService
# Configuration
# Base directory containing the downloaded NiBiS material (machine-specific path)
DOCS_BASE_PATH = Path("/Users/benjaminadmin/projekte/breakpilot-pwa/docs")
# Download folders (relative to DOCS_BASE_PATH) scanned for ZIP archives and PDFs
ZA_DOWNLOAD_DIRS = ["za-download", "za-download-2", "za-download-3"]
# Qdrant collection for NiBiS data (separate from user EH)
NIBIS_COLLECTION = "bp_nibis_eh"
@dataclass
class NiBiSDocument:
    """Structured representation of one NiBiS document.

    Holds everything parsed from the file name plus bookkeeping metadata
    (content hash, discovery directory, extraction timestamp).
    """
    id: str
    file_path: str
    year: int
    subject: str
    niveau: str  # eA, gA, EA, GA
    task_number: Optional[int]
    doc_type: str  # EWH, Aufgabe, Material, GBU, etc.
    bundesland: str
    source_dir: str
    file_hash: str
    extracted_at: datetime
    # Metadata derived from the file name
    raw_filename: str
    variant: Optional[str] = None  # BG, Tech, Wirt, etc.

    def to_dict(self) -> dict:
        """Return a JSON-serializable dict (datetime rendered as ISO-8601)."""
        payload = asdict(self)
        payload['extracted_at'] = self.extracted_at.isoformat()
        return payload
# Subject mapping (filename short form -> display long form).
# Keys are the lower-cased subject tokens extracted from NiBiS file names.
SUBJECT_MAPPING = {
    "deutsch": "Deutsch",
    "englisch": "Englisch",
    "englischbg": "Englisch (Berufliches Gymnasium)",
    "mathe": "Mathematik",
    "mathebg": "Mathematik (Berufliches Gymnasium)",
    "mathezwb": "Mathematik (Zweiter Bildungsweg)",
    "informatik": "Informatik",
    "biologie": "Biologie",
    "chemie": "Chemie",
    "physik": "Physik",
    "geschichte": "Geschichte",
    "erdkunde": "Erdkunde/Geografie",
    "kunst": "Kunst",
    "musik": "Musik",
    "sport": "Sport",
    "latein": "Latein",
    "griechisch": "Griechisch",
    "französisch": "Französisch",
    "franzîsisch": "Französisch",  # mojibake spelling present in the 2017 archives
    "spanisch": "Spanisch",
    "kathreligion": "Katholische Religion",
    "evreligion": "Evangelische Religion",
    "wertenormen": "Werte und Normen",
    "brc": "Betriebswirtschaft mit Rechnungswesen/Controlling",
    "bvw": "Betriebswirtschaft mit Rechnungswesen",
    "gespfl": "Gesundheit-Pflege",
}
# Niveau mapping (lower-cased filename token -> canonical display form)
NIVEAU_MAPPING = {
    "ea": "eA",  # "erhöhtes Anforderungsniveau" (advanced requirement level)
    "ga": "gA",  # "grundlegendes Anforderungsniveau" (basic requirement level)
    "neuga": "gA (neu einsetzend)",
    "neuea": "eA (neu einsetzend)",
}
def compute_file_hash(file_path: Path) -> str:
    """Return the first 16 hex characters of the file's SHA-256 digest."""
    digest = hashlib.sha256()
    with open(file_path, "rb") as handle:
        while True:
            block = handle.read(8192)  # stream in 8 KiB pieces, never whole file
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()[:16]
def extract_zip_files(base_path: Path) -> List[Path]:
    """Unpack every ZIP archive found in the za-download directories.

    Each archive is extracted next to itself into a directory named after
    the archive (without the .zip suffix). Archives whose target directory
    already exists are treated as done and skipped.

    Returns:
        List of extraction target directories (existing and newly created).
    """
    extracted: List[Path] = []
    for dir_name in ZA_DOWNLOAD_DIRS:
        za_path = base_path / dir_name
        if not za_path.exists():
            continue
        for zip_file in za_path.glob("*.zip"):
            # Target directory = archive name minus the .zip extension
            target_dir = za_path / zip_file.stem
            if target_dir.exists():
                print(f" Bereits entpackt: {zip_file.name} -> {target_dir.name}/")
                extracted.append(target_dir)
                continue
            print(f" Entpacke: {zip_file.name}...")
            try:
                # NOTE(review): extractall() trusts member paths; fine for
                # locally downloaded archives, unsafe for untrusted ZIPs.
                with zipfile.ZipFile(zip_file, 'r') as zf:
                    zf.extractall(target_dir)
                print(f" -> {len(list(target_dir.rglob('*')))} Dateien extrahiert")
                extracted.append(target_dir)
            except Exception as e:
                print(f" FEHLER: {e}")
    return extracted
def parse_filename_old_format(filename: str, file_path: Path) -> Optional[Dict]:
    """Parse the old (2016/2017) naming convention.

    Examples:
        2016DeutschEALehrer/2016DeutschEAA1L.pdf
        2017SpanischNeuGALehrer/2017SpanischNeuGAA1L.pdf

    Args:
        filename: bare file name.
        file_path: full path, used for EWH detection and variant scanning.

    Returns:
        Dict with year/subject/niveau/task_number/doc_type/variant,
        or None when the name does not match the convention.

    Fix: the subject group is non-greedy and the niveau alternation is
    ordered longest-first (NeuGA/NeuEA before EA/GA). The previous greedy
    pattern swallowed the "Neu" prefix into the subject, so neu-einsetzend
    files were mis-parsed as plain GA/EA with a bogus subject like
    "spanischneu".
    """
    # Teacher-file pattern: {year}{subject}{niveau}[Lehrer]...[A{n}|Aufg{n}][L].pdf
    pattern = r"(\d{4})([A-Za-zäöüÄÖÜ]+?)(NeuGA|NeuEA|EA|GA)(?:Lehrer)?.*?(?:A(\d+)|Aufg(\d+))?L?\.pdf$"
    match = re.search(pattern, filename, re.IGNORECASE)
    if not match:
        return None
    year = int(match.group(1))
    subject_raw = match.group(2).lower()
    niveau = match.group(3).upper()
    task_num = match.group(4) or match.group(5)
    # Teacher documents ("Lehrer" path segment or ...L.pdf suffix) carry the EWH
    is_ewh = "lehrer" in str(file_path).lower() or filename.endswith("L.pdf")
    # Extract variant (Tech, Wirt, CAS, GTR, ...) from anywhere in the path
    variant = None
    variant_patterns = ["Tech", "Wirt", "CAS", "GTR", "Pflicht", "BG", "mitExp", "ohneExp"]
    path_lower = str(file_path).lower()  # hoisted out of the loop
    for v in variant_patterns:
        if v.lower() in path_lower:
            variant = v
            break
    return {
        "year": year,
        "subject": subject_raw,
        "niveau": NIVEAU_MAPPING.get(niveau.lower(), niveau),
        "task_number": int(task_num) if task_num else None,
        "doc_type": "EWH" if is_ewh else "Aufgabe",
        "variant": variant,
    }
def parse_filename_new_format(filename: str, file_path: Path) -> Optional[Dict]:
    """Parse the new (2024/2025) naming convention.

    Pattern:  {Jahr}_{Fach}_{niveau}_{Nr}_{suffix}.pdf
    Example:  2025_Deutsch_eA_I_EWH.pdf

    Args:
        filename: bare file name.
        file_path: full path, used only for the mitExp variant check.

    Returns:
        Dict with year/subject/niveau/task_number/doc_type/variant,
        or None when the name does not match the convention.

    Fix: the regex is case-insensitive, but the roman-numeral lookup was
    case-sensitive ("i" mapped to nothing) — the task id is now upper-cased
    first, and the roman map covers I..X instead of I..V.
    """
    pattern = r"(\d{4})_([A-Za-zäöüÄÖÜ]+)(?:BG)?_(eA|gA)(?:_([IVX\d]+))?(?:_(.+))?\.pdf$"
    match = re.search(pattern, filename, re.IGNORECASE)
    if not match:
        return None
    year = int(match.group(1))
    subject_raw = match.group(2).lower()
    niveau = match.group(3)
    task_id = match.group(4)
    suffix = match.group(5) or ""
    # Task number from roman numerals or plain digits
    task_num = None
    if task_id:
        task_id = task_id.upper()  # regex matched case-insensitively
        roman_map = {"I": 1, "II": 2, "III": 3, "IV": 4, "V": 5,
                     "VI": 6, "VII": 7, "VIII": 8, "IX": 9, "X": 10}
        task_num = roman_map.get(task_id) or (int(task_id) if task_id.isdigit() else None)
    # EWH marker anywhere in the name means Erwartungshorizont
    is_ewh = "ewh" in filename.lower()
    doc_type = "EWH" if is_ewh else "Aufgabe"
    # Special document types encoded in the trailing suffix override the default
    if "Material" in suffix:
        doc_type = "Material"
    elif "GBU" in suffix:
        doc_type = "GBU"
    elif "Ergebnis" in suffix:
        doc_type = "Ergebnis"
    elif "Bewertungsbogen" in suffix:
        doc_type = "Bewertungsbogen"
    elif "HV" in suffix:
        doc_type = "Hörverstehen"
    elif "ME" in suffix:
        doc_type = "Mediation"
    # BG (berufliches Gymnasium) / mitExp variant markers
    variant = "BG" if "BG" in filename else None
    if "mitExp" in str(file_path):
        variant = "mitExp"
    return {
        "year": year,
        "subject": subject_raw,
        "niveau": NIVEAU_MAPPING.get(niveau.lower(), niveau),
        "task_number": task_num,
        "doc_type": doc_type,
        "variant": variant,
    }
def discover_documents(base_path: Path, ewh_only: bool = True) -> List[NiBiSDocument]:
    """Discover all relevant PDFs below the za-download directories.

    Args:
        base_path: base path to docs/
        ewh_only: keep only Erwartungshorizonte (drop Aufgaben etc.)

    Returns:
        One NiBiSDocument per PDF whose file name matched a known convention.

    Fix: the SHA-256 hash used to be computed twice per PDF (once for the
    id, once for file_hash) — i.e. every file was read in full twice; it is
    now computed once and reused.
    """
    documents = []
    for za_dir in ZA_DOWNLOAD_DIRS:
        za_path = base_path / za_dir
        if not za_path.exists():
            continue
        print(f"\nSuche in {za_dir}...")
        for pdf_file in za_path.rglob("*.pdf"):
            filename = pdf_file.name
            # Try the new convention first, fall back to the old one
            parsed = parse_filename_new_format(filename, pdf_file)
            if not parsed:
                parsed = parse_filename_old_format(filename, pdf_file)
            if not parsed:
                # Unknown naming scheme -> skip silently
                continue
            # Filter: EWH documents only?
            if ewh_only and parsed["doc_type"] != "EWH":
                continue
            file_hash = compute_file_hash(pdf_file)  # hash once, reuse below
            # NOTE(review): task_number is always present in `parsed` (possibly
            # None), so the `.get(..., 0)` default never applies and the id can
            # contain "None". Left as-is: changing it would change existing ids.
            doc_id = f"nibis_{parsed['year']}_{parsed['subject']}_{parsed['niveau']}_{parsed.get('task_number', 0)}_{file_hash}"
            doc = NiBiSDocument(
                id=doc_id,
                file_path=str(pdf_file),
                year=parsed["year"],
                subject=SUBJECT_MAPPING.get(parsed["subject"], parsed["subject"].capitalize()),
                niveau=parsed["niveau"],
                task_number=parsed.get("task_number"),
                doc_type=parsed["doc_type"],
                bundesland="NI",  # Niedersachsen
                source_dir=za_dir,
                file_hash=file_hash,
                extracted_at=datetime.now(),
                raw_filename=filename,
                variant=parsed.get("variant"),
            )
            documents.append(doc)
    return documents
async def index_document_to_qdrant(
    doc: NiBiSDocument,
    qdrant: QdrantService,
    collection: str = NIBIS_COLLECTION
) -> int:
    """Index a single document into Qdrant.

    Pipeline: read PDF bytes -> extract text -> chunk -> embed -> upsert.
    Every failure is reported on stdout and mapped to a 0 return value so
    the caller's loop can continue.

    Args:
        doc: document to index (file_path and metadata are read from it).
        qdrant: Qdrant service used for the upsert.
        collection: target collection name.

    Returns:
        Number of chunks indexed (0 on any failure or near-empty text).
    """
    # 1. Read the PDF bytes
    try:
        with open(doc.file_path, "rb") as f:
            pdf_content = f.read()
    except Exception as e:
        print(f" FEHLER beim Lesen: {e}")
        return 0
    # 2. Extract text
    try:
        text = extract_text_from_pdf(pdf_content)
        if not text or len(text.strip()) < 50:
            # `text or ''` guards against text being None, which previously
            # raised a TypeError inside this very warning message
            print(f" Warnung: Wenig Text extrahiert ({len(text or '')} Zeichen)")
            return 0
    except Exception as e:
        print(f" FEHLER bei PDF-Extraktion: {e}")
        return 0
    # 3. Chunking
    chunks = chunk_text(text)
    if not chunks:
        return 0
    # 4. Generate embeddings (one per chunk)
    try:
        embeddings = await generate_embeddings(chunks)
    except Exception as e:
        print(f" FEHLER bei Embedding-Generierung: {e}")
        return 0
    # 5. Build points and upsert into Qdrant
    points = []
    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
        # NOTE(review): Qdrant point ids must be unsigned ints or UUIDs;
        # confirm QdrantService converts this arbitrary string id.
        point_id = f"{doc.id}_chunk_{i}"
        payload = {
            "doc_id": doc.id,
            "chunk_index": i,
            "text": chunk,
            "year": doc.year,
            "subject": doc.subject,
            "niveau": doc.niveau,
            "task_number": doc.task_number,
            "doc_type": doc.doc_type,
            "bundesland": doc.bundesland,
            "variant": doc.variant,
            "source": "nibis",
            "training_allowed": True,  # NiBiS data may be used for training
        }
        points.append({
            "id": point_id,
            "vector": embedding,
            "payload": payload,
        })
    # Batch upload
    try:
        await qdrant.upsert_points(collection, points)
        return len(points)
    except Exception as e:
        print(f" FEHLER beim Qdrant-Upload: {e}")
        return 0
async def run_ingestion(
    ewh_only: bool = True,
    dry_run: bool = False,
    year_filter: Optional[int] = None,
    subject_filter: Optional[str] = None,
) -> Dict:
    """Main entry point of the ingestion pipeline.

    Steps: unpack ZIP archives -> discover documents -> apply filters ->
    (unless dry_run) initialize Qdrant and index every document, printing
    progress and a final summary along the way.

    Args:
        ewh_only: index only Erwartungshorizonte (teacher expectation sheets)
        dry_run: analyze and report only, do not index anything
        year_filter: optional - restrict to a single year
        subject_filter: optional - case-insensitive substring match on subject

    Returns:
        Statistics dict (counts, per-year/per-subject tallies, error list).
    """
    stats = {
        "started_at": datetime.now().isoformat(),
        "zip_extracted": 0,
        "documents_found": 0,
        "documents_indexed": 0,
        "chunks_created": 0,
        "errors": [],
        "by_year": {},
        "by_subject": {},
    }
    print("=" * 60)
    print("NiBiS Ingestion Pipeline")
    print("=" * 60)
    # 1. Unpack ZIP archives
    print("\n1. Entpacke ZIP-Dateien...")
    extracted = extract_zip_files(DOCS_BASE_PATH)
    stats["zip_extracted"] = len(extracted)
    # 2. Discover documents
    print("\n2. Suche Dokumente...")
    documents = discover_documents(DOCS_BASE_PATH, ewh_only=ewh_only)
    # Apply the optional year/subject filters
    if year_filter:
        documents = [d for d in documents if d.year == year_filter]
    if subject_filter:
        documents = [d for d in documents if subject_filter.lower() in d.subject.lower()]
    stats["documents_found"] = len(documents)
    print(f"\n Gefunden: {len(documents)} Dokumente")
    # Per-year and per-subject tallies
    for doc in documents:
        year_key = str(doc.year)
        stats["by_year"][year_key] = stats["by_year"].get(year_key, 0) + 1
        stats["by_subject"][doc.subject] = stats["by_subject"].get(doc.subject, 0) + 1
    print("\n Nach Jahr:")
    for year, count in sorted(stats["by_year"].items()):
        print(f" {year}: {count}")
    print("\n Nach Fach (Top 10):")
    sorted_subjects = sorted(stats["by_subject"].items(), key=lambda x: -x[1])[:10]
    for subject, count in sorted_subjects:
        print(f" {subject}: {count}")
    if dry_run:
        print("\n[DRY RUN] Keine Indexierung durchgeführt.")
        return stats
    # 3. Initialize Qdrant (vector size depends on the embedding backend)
    vector_size = get_vector_size()
    print(f"\n3. Initialisiere Qdrant...")
    print(f" Embedding Backend: {EMBEDDING_BACKEND}")
    print(f" Vektorgröße: {vector_size} Dimensionen")
    qdrant = QdrantService()
    await qdrant.ensure_collection(NIBIS_COLLECTION, vector_size=vector_size)
    # 4. Index documents one by one; a failure in one document must not
    # abort the whole run, so errors are collected and reported at the end
    print("\n4. Indexiere Dokumente...")
    for i, doc in enumerate(documents, 1):
        print(f" [{i}/{len(documents)}] {doc.raw_filename}...")
        try:
            chunk_count = await index_document_to_qdrant(doc, qdrant)
            if chunk_count > 0:
                stats["documents_indexed"] += 1
                stats["chunks_created"] += chunk_count
                print(f" -> {chunk_count} Chunks indexiert")
        except Exception as e:
            error_msg = f"{doc.raw_filename}: {str(e)}"
            stats["errors"].append(error_msg)
            print(f" FEHLER: {e}")
    stats["completed_at"] = datetime.now().isoformat()
    # 5. Summary
    print("\n" + "=" * 60)
    print("ZUSAMMENFASSUNG")
    print("=" * 60)
    print(f" ZIP-Dateien entpackt: {stats['zip_extracted']}")
    print(f" Dokumente gefunden: {stats['documents_found']}")
    print(f" Dokumente indexiert: {stats['documents_indexed']}")
    print(f" Chunks erstellt: {stats['chunks_created']}")
    print(f" Fehler: {len(stats['errors'])}")
    return stats
def generate_manifest(documents: List[NiBiSDocument], output_path: Path) -> None:
    """Write a JSON manifest describing every discovered document."""
    manifest = {
        "generated_at": datetime.now().isoformat(),
        "total_documents": len(documents),
        "documents": [entry.to_dict() for entry in documents],
    }
    # UTF-8 + ensure_ascii=False keeps umlauts readable in the output file
    with open(output_path, "w", encoding="utf-8") as handle:
        json.dump(manifest, handle, indent=2, ensure_ascii=False)
    print(f"Manifest geschrieben: {output_path}")
# CLI Entry Point
if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser(description="NiBiS Ingestion Pipeline")
    cli.add_argument("--dry-run", action="store_true", help="Nur analysieren")
    cli.add_argument("--year", type=int, help="Filter nach Jahr")
    cli.add_argument("--subject", type=str, help="Filter nach Fach")
    cli.add_argument("--all-docs", action="store_true", help="Alle Dokumente (nicht nur EWH)")
    cli.add_argument("--manifest", type=str, help="Manifest-Datei erstellen")
    opts = cli.parse_args()

    include_ewh_only = not opts.all_docs
    if opts.manifest:
        # Manifest mode: discover and describe only, no indexing
        found = discover_documents(DOCS_BASE_PATH, ewh_only=include_ewh_only)
        generate_manifest(found, Path(opts.manifest))
    else:
        # Full ingestion run
        asyncio.run(run_ingestion(
            ewh_only=include_ewh_only,
            dry_run=opts.dry_run,
            year_filter=opts.year,
            subject_filter=opts.subject,
        ))