# Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website, Klausur-Service,
# School-Service, Voice-Service, Geo-Service, BreakPilot Drive, Agent-Core
"""
|
|
NiBiS Ingestion Pipeline
|
|
Automatisierte Verarbeitung von Abitur-Erwartungshorizonten aus Niedersachsen.
|
|
|
|
Unterstützt:
|
|
- Mehrere Jahre (2016, 2017, 2024, 2025, ...)
|
|
- Verschiedene Namenskonventionen (alt: *Lehrer/*L.pdf, neu: *_EWH.pdf)
|
|
- Automatisches Entpacken von ZIP-Dateien
|
|
- Flexible Erweiterung für andere Bundesländer
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import zipfile
|
|
import hashlib
|
|
import json
|
|
from pathlib import Path
|
|
from typing import List, Dict, Optional, Tuple
|
|
from dataclasses import dataclass, asdict
|
|
from datetime import datetime
|
|
import asyncio
|
|
|
|
# Local imports
|
|
from eh_pipeline import chunk_text, generate_embeddings, extract_text_from_pdf, get_vector_size, EMBEDDING_BACKEND
|
|
from qdrant_service import QdrantService
|
|
|
|
# Configuration
#
# The docs/ base path defaulted to a machine-specific absolute path; it can
# now be overridden via the BREAKPILOT_DOCS_PATH environment variable so the
# pipeline runs on other machines / in CI without code changes.
DOCS_BASE_PATH = Path(os.environ.get(
    "BREAKPILOT_DOCS_PATH",
    "/Users/benjaminadmin/projekte/breakpilot-pwa/docs",
))

# Sub-directories of docs/ that hold downloaded ZA (Zentralabitur) archives.
ZA_DOWNLOAD_DIRS = ["za-download", "za-download-2", "za-download-3"]

# Qdrant collection for NiBiS data (separate from user EH)
NIBIS_COLLECTION = "bp_nibis_eh"
|
|
|
|
|
|
@dataclass
class NiBiSDocument:
    """Structured representation of one NiBiS document (a single PDF).

    Instances are built by discover_documents() from parsed filenames plus
    per-file metadata (hash, timestamps).
    """
    id: str  # stable id built from year/subject/niveau/task + file hash
    file_path: str  # absolute path to the PDF on disk
    year: int  # exam year parsed from the filename (e.g. 2016, 2025)
    subject: str  # long subject name (via SUBJECT_MAPPING) or capitalized raw token
    niveau: str  # eA, gA, EA, GA
    task_number: Optional[int]  # task number if encoded in the name, else None
    doc_type: str  # EWH, Aufgabe, Material, GBU, etc.
    bundesland: str  # federal state code; currently always "NI"
    source_dir: str  # which ZA_DOWNLOAD_DIRS entry the file was found in
    file_hash: str  # truncated SHA-256 of the PDF (see compute_file_hash)
    extracted_at: datetime  # when this record was created

    # Metadata taken from the raw filename
    raw_filename: str  # original file name, unparsed
    variant: Optional[str] = None  # BG, Tech, Wirt, etc.

    def to_dict(self) -> dict:
        """Return a JSON-serializable dict (datetime -> ISO-8601 string)."""
        d = asdict(self)
        # asdict() keeps the datetime object; JSON needs a string.
        d['extracted_at'] = d['extracted_at'].isoformat()
        return d
|
|
|
|
|
|
# Subject mapping (short token from the filename -> display long form).
# Keys are the lowercased subject substring captured by the filename parsers.
SUBJECT_MAPPING = {
    "deutsch": "Deutsch",
    "englisch": "Englisch",
    "englischbg": "Englisch (Berufliches Gymnasium)",
    "mathe": "Mathematik",
    "mathebg": "Mathematik (Berufliches Gymnasium)",
    "mathezwb": "Mathematik (Zweiter Bildungsweg)",
    "informatik": "Informatik",
    "biologie": "Biologie",
    "chemie": "Chemie",
    "physik": "Physik",
    "geschichte": "Geschichte",
    "erdkunde": "Erdkunde/Geografie",
    "kunst": "Kunst",
    "musik": "Musik",
    "sport": "Sport",
    "latein": "Latein",
    "griechisch": "Griechisch",
    "französisch": "Französisch",
    "franzîsisch": "Französisch",  # mis-encoded "französisch" seen in the 2017 archive
    "spanisch": "Spanisch",
    "kathreligion": "Katholische Religion",
    "evreligion": "Evangelische Religion",
    "wertenormen": "Werte und Normen",
    "brc": "Betriebswirtschaft mit Rechnungswesen/Controlling",
    "bvw": "Betriebswirtschaft mit Rechnungswesen",
    "gespfl": "Gesundheit-Pflege",
}

# Niveau mapping (lowercased token from the filename -> canonical label).
NIVEAU_MAPPING = {
    "ea": "eA",  # "erhöhtes Anforderungsniveau" (advanced level)
    "ga": "gA",  # "grundlegendes Anforderungsniveau" (basic level)
    "neuga": "gA (neu einsetzend)",  # newly started foreign language, basic level
    "neuea": "eA (neu einsetzend)",  # newly started foreign language, advanced level
}
|
|
|
|
|
|
def compute_file_hash(file_path: Path) -> str:
    """Return the first 16 hex characters of the file's SHA-256 digest."""
    digest = hashlib.sha256()
    with open(file_path, "rb") as handle:
        # Stream in 8 KiB blocks so large PDFs never have to fit in memory.
        while True:
            block = handle.read(8192)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()[:16]
|
|
|
|
|
|
def extract_zip_files(base_path: Path) -> List[Path]:
    """Unpack every ZIP archive found in the za-download directories.

    Args:
        base_path: Root docs/ directory containing the ZA_DOWNLOAD_DIRS.

    Returns:
        Directories that hold extracted archive contents (both freshly
        extracted and ones already present from a previous run).
    """
    extracted = []

    for za_dir in ZA_DOWNLOAD_DIRS:
        za_path = base_path / za_dir
        if not za_path.exists():
            continue

        for zip_file in za_path.glob("*.zip"):
            # Target directory = archive name without ".zip".
            target_dir = za_path / zip_file.stem

            if target_dir.exists():
                # Already unpacked on a previous run -> reuse as-is.
                print(f" Bereits entpackt: {zip_file.name} -> {target_dir.name}/")
                extracted.append(target_dir)
                continue

            print(f" Entpacke: {zip_file.name}...")
            try:
                # NOTE(review): archives come from the local za-download
                # dirs and are assumed trusted; extractall() does not guard
                # against path traversal in hostile archives.
                with zipfile.ZipFile(zip_file, 'r') as zf:
                    zf.extractall(target_dir)
                print(f" -> {len(list(target_dir.rglob('*')))} Dateien extrahiert")
                extracted.append(target_dir)
            except Exception as e:
                print(f" FEHLER: {e}")
                # Remove a half-extracted directory; otherwise the next run
                # would see it and wrongly treat it as "already extracted".
                import shutil
                shutil.rmtree(target_dir, ignore_errors=True)

    return extracted
|
|
|
|
|
|
def parse_filename_old_format(filename: str, file_path: Path) -> Optional[Dict]:
    """
    Parse the old naming convention (2016, 2017):
    - {Year}{Subject}{Niveau}Lehrer/{Year}{Subject}{Niveau}A{Nr}L.pdf
    - Example: 2016DeutschEALehrer/2016DeutschEAA1L.pdf

    Args:
        filename: Bare PDF file name.
        file_path: Full path; used to detect "Lehrer" directories and variants.

    Returns:
        Metadata dict (year, subject, niveau, task_number, doc_type, variant),
        or None when the name does not match the convention.
    """
    # The subject group is non-greedy and the niveau alternation lists the
    # longer "Neu*" tokens first. The previous greedy pattern
    # ([A-Za-z...]+)(EA|GA|NeuGA|NeuEA) let the subject swallow the "Neu"
    # prefix (e.g. "2016SpanischNeuGA..." parsed as subject "spanischneu" +
    # niveau "GA"), so the "neuga"/"neuea" entries in NIVEAU_MAPPING were
    # unreachable for real filenames.
    pattern = r"(\d{4})([A-Za-zäöüÄÖÜ]+?)(NeuEA|NeuGA|EA|GA)(?:Lehrer)?.*?(?:A(\d+)|Aufg(\d+))?L?\.pdf$"

    match = re.search(pattern, filename, re.IGNORECASE)
    if not match:
        return None

    year = int(match.group(1))
    subject_raw = match.group(2).lower()
    niveau = match.group(3).upper()
    # Task number is encoded either as "A<n>" or "Aufg<n>".
    task_num = match.group(4) or match.group(5)

    # Teacher documents (= expectation horizons, EWH) live in "...Lehrer"
    # directories or end in "L.pdf".
    is_ewh = "lehrer" in str(file_path).lower() or filename.endswith("L.pdf")

    # Extract variant (Tech, Wirt, CAS, GTR, ...) from anywhere in the path.
    variant = None
    variant_patterns = ["Tech", "Wirt", "CAS", "GTR", "Pflicht", "BG", "mitExp", "ohneExp"]
    for v in variant_patterns:
        if v.lower() in str(file_path).lower():
            variant = v
            break

    return {
        "year": year,
        "subject": subject_raw,
        "niveau": NIVEAU_MAPPING.get(niveau.lower(), niveau),
        "task_number": int(task_num) if task_num else None,
        "doc_type": "EWH" if is_ewh else "Aufgabe",
        "variant": variant,
    }
|
|
|
|
|
|
def parse_filename_new_format(filename: str, file_path: Path) -> Optional[Dict]:
    """
    Parse the new naming convention (2024, 2025):
    - {Year}_{Subject}_{niveau}_{No}_EWH.pdf
    - Example: 2025_Deutsch_eA_I_EWH.pdf

    Returns a metadata dict, or None when the name does not match.
    """
    new_style = r"(\d{4})_([A-Za-zäöüÄÖÜ]+)(?:BG)?_(eA|gA)(?:_([IVX\d]+))?(?:_(.+))?\.pdf$"
    hit = re.search(new_style, filename, re.IGNORECASE)
    if hit is None:
        return None

    year = int(hit.group(1))
    subject_raw = hit.group(2).lower()
    niveau = hit.group(3)
    task_token = hit.group(4)
    suffix = hit.group(5) or ""

    # Task number: Roman numeral (I..V) or plain digits; anything else -> None.
    task_num = None
    if task_token:
        romans = {"I": 1, "II": 2, "III": 3, "IV": 4, "V": 5}
        if task_token in romans:
            task_num = romans[task_token]
        elif task_token.isdigit():
            task_num = int(task_token)

    # EWH marker anywhere in the name, case-insensitive.
    is_ewh = "ewh" in filename.lower()

    # Special document types are encoded in the trailing suffix; the first
    # matching marker wins, mirroring the original if/elif cascade.
    doc_type = "EWH" if is_ewh else "Aufgabe"
    for marker, mapped_type in (
        ("Material", "Material"),
        ("GBU", "GBU"),
        ("Ergebnis", "Ergebnis"),
        ("Bewertungsbogen", "Bewertungsbogen"),
        ("HV", "Hörverstehen"),
        ("ME", "Mediation"),
    ):
        if marker in suffix:
            doc_type = mapped_type
            break

    # Variant: "mitExp" in the full path takes precedence over a plain "BG"
    # in the filename.
    if "mitExp" in str(file_path):
        variant = "mitExp"
    elif "BG" in filename:
        variant = "BG"
    else:
        variant = None

    return {
        "year": year,
        "subject": subject_raw,
        "niveau": NIVEAU_MAPPING.get(niveau.lower(), niveau),
        "task_number": task_num,
        "doc_type": doc_type,
        "variant": variant,
    }
|
|
|
|
|
|
def discover_documents(base_path: Path, ewh_only: bool = True) -> List[NiBiSDocument]:
    """
    Find all relevant documents in the za-download directories.

    Args:
        base_path: Base path to docs/.
        ewh_only: Only expectation horizons (skip task sheets etc.).

    Returns:
        One NiBiSDocument per PDF whose filename could be parsed.
    """
    documents = []

    for za_dir in ZA_DOWNLOAD_DIRS:
        za_path = base_path / za_dir
        if not za_path.exists():
            continue

        print(f"\nSuche in {za_dir}...")

        for pdf_file in za_path.rglob("*.pdf"):
            filename = pdf_file.name

            # Try both naming conventions, newest first.
            parsed = parse_filename_new_format(filename, pdf_file)
            if not parsed:
                parsed = parse_filename_old_format(filename, pdf_file)

            if not parsed:
                # Unknown format
                continue

            # Filter: EWH only?
            if ewh_only and parsed["doc_type"] != "EWH":
                continue

            # Hash each file exactly once; the previous version hashed every
            # PDF twice (once for the id, once for the file_hash field),
            # reading each file fully from disk twice.
            file_hash = compute_file_hash(pdf_file)

            # NOTE(review): parsed["task_number"] may be None, which renders
            # as the literal "None" in the id; the ", 0" default never kicks
            # in because the key always exists. Kept as-is so existing ids
            # stay stable.
            doc_id = f"nibis_{parsed['year']}_{parsed['subject']}_{parsed['niveau']}_{parsed.get('task_number', 0)}_{file_hash}"

            doc = NiBiSDocument(
                id=doc_id,
                file_path=str(pdf_file),
                year=parsed["year"],
                subject=SUBJECT_MAPPING.get(parsed["subject"], parsed["subject"].capitalize()),
                niveau=parsed["niveau"],
                task_number=parsed.get("task_number"),
                doc_type=parsed["doc_type"],
                bundesland="NI",  # Niedersachsen
                source_dir=za_dir,
                file_hash=file_hash,
                extracted_at=datetime.now(),
                raw_filename=filename,
                variant=parsed.get("variant"),
            )

            documents.append(doc)

    return documents
|
|
|
|
|
|
async def index_document_to_qdrant(
    doc: NiBiSDocument,
    qdrant: QdrantService,
    collection: str = NIBIS_COLLECTION
) -> int:
    """
    Index a single document into Qdrant.

    Args:
        doc: Parsed NiBiS document (metadata + path to the PDF on disk).
        qdrant: Qdrant service wrapper used for the upsert.
        collection: Target collection name.

    Returns:
        Number of indexed chunks. Returns 0 on any failure — errors are
        printed, never raised, so one bad PDF cannot abort a batch run.
    """
    # 1. Read the PDF
    try:
        with open(doc.file_path, "rb") as f:
            pdf_content = f.read()
    except Exception as e:
        print(f" FEHLER beim Lesen: {e}")
        return 0

    # 2. Extract text
    try:
        text = extract_text_from_pdf(pdf_content)
        if not text or len(text.strip()) < 50:
            # Bug fix: len(text) raised TypeError here whenever
            # extract_text_from_pdf returned None (the "not text" branch).
            print(f" Warnung: Wenig Text extrahiert ({len(text) if text else 0} Zeichen)")
            return 0
    except Exception as e:
        print(f" FEHLER bei PDF-Extraktion: {e}")
        return 0

    # 3. Chunking
    chunks = chunk_text(text)
    if not chunks:
        return 0

    # 4. Generate embeddings
    try:
        embeddings = await generate_embeddings(chunks)
    except Exception as e:
        print(f" FEHLER bei Embedding-Generierung: {e}")
        return 0

    # 5. Index into Qdrant
    points = []
    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
        # NOTE(review): Qdrant itself only accepts unsigned ints or UUIDs as
        # point ids; this assumes QdrantService translates/accepts string
        # ids — verify against its upsert implementation.
        point_id = f"{doc.id}_chunk_{i}"

        payload = {
            "doc_id": doc.id,
            "chunk_index": i,
            "text": chunk,
            "year": doc.year,
            "subject": doc.subject,
            "niveau": doc.niveau,
            "task_number": doc.task_number,
            "doc_type": doc.doc_type,
            "bundesland": doc.bundesland,
            "variant": doc.variant,
            "source": "nibis",
            "training_allowed": True,  # NiBiS data may be used for training
        }

        points.append({
            "id": point_id,
            "vector": embedding,
            "payload": payload,
        })

    # Batch upload
    try:
        await qdrant.upsert_points(collection, points)
        return len(points)
    except Exception as e:
        print(f" FEHLER beim Qdrant-Upload: {e}")
        return 0
|
|
|
|
|
|
async def run_ingestion(
    ewh_only: bool = True,
    dry_run: bool = False,
    year_filter: Optional[int] = None,
    subject_filter: Optional[str] = None,
) -> Dict:
    """
    Main entry point of the ingestion pipeline.

    Unpacks ZIP archives, discovers documents, and indexes them into Qdrant,
    printing a progress report to stdout along the way.

    Args:
        ewh_only: Index only expectation horizons (EWH).
        dry_run: Analyze only, do not index.
        year_filter: Optional — restrict to a single year.
        subject_filter: Optional — restrict to subjects containing this
            substring (case-insensitive).

    Returns:
        Statistics dict about the ingestion run (counts, errors, timings).
    """
    stats = {
        "started_at": datetime.now().isoformat(),
        "zip_extracted": 0,
        "documents_found": 0,
        "documents_indexed": 0,
        "chunks_created": 0,
        "errors": [],          # one "<filename>: <error>" string per failure
        "by_year": {},         # year (str) -> document count
        "by_subject": {},      # subject long name -> document count
    }

    print("=" * 60)
    print("NiBiS Ingestion Pipeline")
    print("=" * 60)

    # 1. Unpack ZIP archives
    print("\n1. Entpacke ZIP-Dateien...")
    extracted = extract_zip_files(DOCS_BASE_PATH)
    stats["zip_extracted"] = len(extracted)

    # 2. Discover documents
    print("\n2. Suche Dokumente...")
    documents = discover_documents(DOCS_BASE_PATH, ewh_only=ewh_only)

    # Apply optional filters
    if year_filter:
        documents = [d for d in documents if d.year == year_filter]
    if subject_filter:
        documents = [d for d in documents if subject_filter.lower() in d.subject.lower()]

    stats["documents_found"] = len(documents)

    print(f"\n Gefunden: {len(documents)} Dokumente")

    # Aggregate counts per year and per subject
    for doc in documents:
        year_key = str(doc.year)
        stats["by_year"][year_key] = stats["by_year"].get(year_key, 0) + 1
        stats["by_subject"][doc.subject] = stats["by_subject"].get(doc.subject, 0) + 1

    print("\n Nach Jahr:")
    for year, count in sorted(stats["by_year"].items()):
        print(f" {year}: {count}")

    print("\n Nach Fach (Top 10):")
    sorted_subjects = sorted(stats["by_subject"].items(), key=lambda x: -x[1])[:10]
    for subject, count in sorted_subjects:
        print(f" {subject}: {count}")

    # Dry run stops before touching Qdrant.
    if dry_run:
        print("\n[DRY RUN] Keine Indexierung durchgeführt.")
        return stats

    # 3. Initialize Qdrant
    vector_size = get_vector_size()
    print(f"\n3. Initialisiere Qdrant...")
    print(f" Embedding Backend: {EMBEDDING_BACKEND}")
    print(f" Vektorgröße: {vector_size} Dimensionen")
    qdrant = QdrantService()
    await qdrant.ensure_collection(NIBIS_COLLECTION, vector_size=vector_size)

    # 4. Index documents (per-document errors are recorded, not fatal)
    print("\n4. Indexiere Dokumente...")
    for i, doc in enumerate(documents, 1):
        print(f" [{i}/{len(documents)}] {doc.raw_filename}...")

        try:
            chunk_count = await index_document_to_qdrant(doc, qdrant)
            if chunk_count > 0:
                stats["documents_indexed"] += 1
                stats["chunks_created"] += chunk_count
                print(f" -> {chunk_count} Chunks indexiert")
        except Exception as e:
            error_msg = f"{doc.raw_filename}: {str(e)}"
            stats["errors"].append(error_msg)
            print(f" FEHLER: {e}")

    stats["completed_at"] = datetime.now().isoformat()

    # 5. Summary
    print("\n" + "=" * 60)
    print("ZUSAMMENFASSUNG")
    print("=" * 60)
    print(f" ZIP-Dateien entpackt: {stats['zip_extracted']}")
    print(f" Dokumente gefunden: {stats['documents_found']}")
    print(f" Dokumente indexiert: {stats['documents_indexed']}")
    print(f" Chunks erstellt: {stats['chunks_created']}")
    print(f" Fehler: {len(stats['errors'])}")

    return stats
|
|
|
|
|
|
def generate_manifest(documents: List[NiBiSDocument], output_path: Path) -> None:
    """Write a JSON manifest describing every discovered document."""
    payload = {
        "generated_at": datetime.now().isoformat(),
        "total_documents": len(documents),
        "documents": [entry.to_dict() for entry in documents],
    }

    with open(output_path, "w", encoding="utf-8") as fh:
        json.dump(payload, fh, indent=2, ensure_ascii=False)

    print(f"Manifest geschrieben: {output_path}")
|
|
|
|
|
|
# CLI Entry Point
if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser(description="NiBiS Ingestion Pipeline")
    cli.add_argument("--dry-run", action="store_true", help="Nur analysieren")
    cli.add_argument("--year", type=int, help="Filter nach Jahr")
    cli.add_argument("--subject", type=str, help="Filter nach Fach")
    cli.add_argument("--all-docs", action="store_true", help="Alle Dokumente (nicht nur EWH)")
    cli.add_argument("--manifest", type=str, help="Manifest-Datei erstellen")

    opts = cli.parse_args()
    include_all = opts.all_docs

    if opts.manifest:
        # Manifest mode: only discover and describe, never index.
        found_docs = discover_documents(DOCS_BASE_PATH, ewh_only=not include_all)
        generate_manifest(found_docs, Path(opts.manifest))
    else:
        # Default mode: run the full ingestion pipeline.
        asyncio.run(
            run_ingestion(
                ewh_only=not include_all,
                dry_run=opts.dry_run,
                year_filter=opts.year,
                subject_filter=opts.subject,
            )
        )
|