Files
breakpilot-lehrer/klausur-service/backend/dsfa_corpus_ingestion.py
Benjamin Admin 9ba420fa91
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Fix: Remove broken getKlausurApiUrl and clean up empty lines
sed replacement left orphaned hostname references in story page
and empty lines in getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 16:02:04 +02:00

198 lines
6.7 KiB
Python

"""
DSFA Corpus Ingestion Pipeline.
Indexes DSFA guidance documents into Qdrant with full source attribution.
Collections:
- bp_dsfa_corpus: All DSFA-related documents (WP248, DSK, Muss-Listen)
Usage:
python dsfa_corpus_ingestion.py --init-sources # Register all sources
python dsfa_corpus_ingestion.py --ingest WP248 # Ingest specific source
python dsfa_corpus_ingestion.py --ingest-all # Ingest all sources
python dsfa_corpus_ingestion.py --status # Show ingestion status
"""
import os
import asyncio
import argparse
from typing import List
from collections import defaultdict
import asyncpg
# ── Re-exports for backward compatibility ──────────────────────────────────
# All existing `from dsfa_corpus_ingestion import X` continue to work.
from dsfa_sources_registry import ( # noqa: F401
LICENSE_REGISTRY,
DSFA_SOURCES,
DSFA_CHUNK_CONFIG,
)
from dsfa_corpus_store import ( # noqa: F401
DSFACorpusStore,
DSFAChunkPayload,
DSFASearchResult,
)
from dsfa_qdrant_service import ( # noqa: F401
DSFAQdrantService,
DSFA_COLLECTION,
)
from dsfa_chunking import ( # noqa: F401
chunk_text_recursive,
chunk_by_sections,
chunk_by_list_items,
chunk_document,
)
# ── Configuration ──────────────────────────────────────────────────────────
DATABASE_URL = os.getenv(
"DATABASE_URL",
"postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db",
)
# ── Attribution Functions ──────────────────────────────────────────────────
def generate_attribution_notice(results: List[DSFASearchResult]) -> str:
    """Generate a combined attribution notice for the sources in *results*.

    Groups results by license code and emits one line per license whose
    registry entry requires attribution. Unknown license codes default to
    requiring attribution (the safe side).

    Returns an empty string when no source requires attribution.
    """
    by_license = defaultdict(list)
    for r in results:
        by_license[r.license_code].append(r)

    notices = []
    for license_code, items in by_license.items():
        license_info = LICENSE_REGISTRY.get(license_code, {})
        if license_info.get("attribution_required", True):
            # Sort the de-duplicated source names: joining a raw set yields
            # arbitrary order, which made the notice nondeterministic.
            sources = ", ".join(sorted({r.source_name for r in items}))
            license_name = license_info.get("name", license_code)
            notices.append(f"{sources} - {license_name}")

    if notices:
        # "Quellennachweis" = citation of sources (user-facing German label).
        return "Quellennachweis:\n" + "\n".join(notices)
    return ""
def get_license_label(license_code: str) -> str:
    """Return the human-readable name for *license_code*.

    Falls back to the raw code when the registry has no entry (or the
    entry has no "name" field).
    """
    return LICENSE_REGISTRY.get(license_code, {}).get("name", license_code)
# ── Main Functions ─────────────────────────────────────────────────────────
async def init_dsfa_tables(pool: asyncpg.Pool):
    """Initialize the DSFA tables by executing the bundled SQL migration.

    Reads migrations/003_dsfa_rag_tables.sql (located relative to this
    module) and executes it on a connection acquired from *pool*.
    """
    migration_path = os.path.join(
        os.path.dirname(__file__),
        "migrations",
        "003_dsfa_rag_tables.sql",
    )
    async with pool.acquire() as conn:
        # Explicit encoding: the migration may contain non-ASCII text
        # (e.g. German umlauts) and must not depend on the platform default.
        with open(migration_path, "r", encoding="utf-8") as f:
            await conn.execute(f.read())
    print("DSFA tables initialized successfully")
async def register_all_sources(pool: asyncpg.Pool):
    """Register all DSFA sources in the database (skips migrated sources)."""
    store = DSFACorpusStore(pool)
    counts = {"registered": 0, "skipped": 0}
    for entry in DSFA_SOURCES:
        # Sources carrying a "migrated_to" marker live elsewhere now.
        target = entry.get("migrated_to")
        if target:
            print(f"Skipping migrated source: {entry['source_code']} -> {target}")
            counts["skipped"] += 1
            continue
        new_id = await store.register_source(entry)
        print(f"Registered source: {entry['source_code']} -> {new_id}")
        counts["registered"] += 1
    print(f"\nTotal sources registered: {counts['registered']} (skipped {counts['skipped']} migrated)")
async def get_ingestion_status(pool: asyncpg.Pool):
    """Print the current ingestion status: PostgreSQL stats, then Qdrant stats."""
    store = DSFACorpusStore(pool)
    qdrant = DSFAQdrantService()

    print("\n=== DSFA Corpus Status ===\n")

    # PostgreSQL: one table row per registered source.
    divider = "-" * 80
    stats = await store.get_source_stats()
    print("PostgreSQL Sources:")
    print(divider)
    print(f"{'Source Code':<25} {'Documents':>10} {'Chunks':>10} {'Last Indexed':<20}")
    print(divider)

    doc_sum = 0
    chunk_sum = 0
    for row in stats:
        docs = row.get("document_count", 0)
        chunks = row.get("chunk_count", 0)
        doc_sum += docs
        chunk_sum += chunks
        ts = row.get("last_indexed_at")
        ts_label = ts.strftime("%Y-%m-%d %H:%M") if ts else "Never"
        print(f"{row['source_code']:<25} {docs:>10} {chunks:>10} {ts_label:<20}")

    print(divider)
    print(f"{'TOTAL':<25} {doc_sum:>10} {chunk_sum:>10}")

    # Qdrant: single vector collection holding the DSFA corpus chunks.
    print("\nQdrant Collection:")
    qdrant_stats = await qdrant.get_stats()
    if "error" in qdrant_stats:
        print(f" Error: {qdrant_stats['error']}")
    else:
        print(f" Collection: {qdrant_stats['collection']}")
        print(f" Points: {qdrant_stats['points_count']}")
        print(f" Status: {qdrant_stats['status']}")
async def main():
    """CLI entry point: parse flags and dispatch the requested actions.

    Flags are not mutually exclusive — several actions can be requested in
    one invocation and run in the order checked below.
    """
    parser = argparse.ArgumentParser(description="DSFA Corpus Ingestion Pipeline")
    parser.add_argument("--init-sources", action="store_true", help="Register all sources")
    parser.add_argument("--init-tables", action="store_true", help="Initialize database tables")
    parser.add_argument("--ingest", type=str, help="Ingest specific source by code")
    parser.add_argument("--ingest-all", action="store_true", help="Ingest all sources")
    parser.add_argument("--status", action="store_true", help="Show ingestion status")
    parser.add_argument("--init-qdrant", action="store_true", help="Initialize Qdrant collection")
    args = parser.parse_args()

    pool = await asyncpg.create_pool(DATABASE_URL)
    try:
        if args.init_tables:
            await init_dsfa_tables(pool)
        if args.init_sources:
            await register_all_sources(pool)
        if args.init_qdrant:
            service = DSFAQdrantService()
            await service.ensure_collection()
            print(f"Qdrant collection {DSFA_COLLECTION} initialized")
        if args.status:
            await get_ingestion_status(pool)
        if args.ingest:
            print(f"Ingesting source: {args.ingest}")
            # TODO: Implement document ingestion
        if args.ingest_all:
            print("Ingesting all sources...")
            # TODO: Implement bulk ingestion
    finally:
        # Always release the pool, even when an action raises.
        await pool.close()
# Script entry point: drive the async CLI through asyncio's event loop.
if __name__ == "__main__":
    asyncio.run(main())