""" DSFA Corpus Ingestion Pipeline. Indexes DSFA guidance documents into Qdrant with full source attribution. Collections: - bp_dsfa_corpus: All DSFA-related documents (WP248, DSK, Muss-Listen) Usage: python dsfa_corpus_ingestion.py --init-sources # Register all sources python dsfa_corpus_ingestion.py --ingest WP248 # Ingest specific source python dsfa_corpus_ingestion.py --ingest-all # Ingest all sources python dsfa_corpus_ingestion.py --status # Show ingestion status """ import os import asyncio import argparse from typing import List from collections import defaultdict import asyncpg # ── Re-exports for backward compatibility ────────────────────────────────── # All existing `from dsfa_corpus_ingestion import X` continue to work. from dsfa_sources_registry import ( # noqa: F401 LICENSE_REGISTRY, DSFA_SOURCES, DSFA_CHUNK_CONFIG, ) from dsfa_corpus_store import ( # noqa: F401 DSFACorpusStore, DSFAChunkPayload, DSFASearchResult, ) from dsfa_qdrant_service import ( # noqa: F401 DSFAQdrantService, DSFA_COLLECTION, ) from dsfa_chunking import ( # noqa: F401 chunk_text_recursive, chunk_by_sections, chunk_by_list_items, chunk_document, ) # ── Configuration ────────────────────────────────────────────────────────── DATABASE_URL = os.getenv( "DATABASE_URL", "postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db", ) # ── Attribution Functions ────────────────────────────────────────────────── def generate_attribution_notice(results: List[DSFASearchResult]) -> str: """Generate combined attribution notice for all used sources.""" by_license = defaultdict(list) for r in results: by_license[r.license_code].append(r) notices = [] for license_code, items in by_license.items(): license_info = LICENSE_REGISTRY.get(license_code, {}) if license_info.get("attribution_required", True): sources = ", ".join(set(r.source_name for r in items)) license_name = license_info.get("name", license_code) notices.append(f"• {sources} - {license_name}") if notices: return "Quellennachweis:\n" + "\n".join(notices) return "" def get_license_label(license_code: str) -> str: """Get human-readable license label.""" license_info = LICENSE_REGISTRY.get(license_code, {}) return license_info.get("name", license_code) # ── Main Functions ───────────────────────────────────────────────────────── async def init_dsfa_tables(pool: asyncpg.Pool): """Initialize DSFA tables by running migration.""" migration_path = os.path.join( os.path.dirname(__file__), "migrations", "003_dsfa_rag_tables.sql" ) async with pool.acquire() as conn: with open(migration_path, "r") as f: await conn.execute(f.read()) print("DSFA tables initialized successfully") async def register_all_sources(pool: asyncpg.Pool): """Register all DSFA sources in the database (skips migrated sources).""" store = DSFACorpusStore(pool) registered = 0 skipped = 0 for source in DSFA_SOURCES: if source.get("migrated_to"): print(f"Skipping migrated source: {source['source_code']} -> {source['migrated_to']}") skipped += 1 continue source_id = await store.register_source(source) print(f"Registered source: {source['source_code']} -> {source_id}") registered += 1 print(f"\nTotal sources registered: {registered} (skipped {skipped} migrated)") async def get_ingestion_status(pool: asyncpg.Pool): """Get current ingestion status.""" store = DSFACorpusStore(pool) qdrant = DSFAQdrantService() print("\n=== DSFA Corpus Status ===\n") # PostgreSQL stats stats = await store.get_source_stats() print("PostgreSQL Sources:") print("-" * 80) print(f"{'Source Code':<25} {'Documents':>10} {'Chunks':>10} {'Last Indexed':<20}") print("-" * 80) total_docs = 0 total_chunks = 0 for s in stats: total_docs += s.get("document_count", 0) total_chunks += s.get("chunk_count", 0) last_indexed = s.get("last_indexed_at") last_indexed_str = last_indexed.strftime("%Y-%m-%d %H:%M") if last_indexed else "Never" print(f"{s['source_code']:<25} {s.get('document_count', 0):>10} {s.get('chunk_count', 0):>10} {last_indexed_str:<20}") print("-" * 80) print(f"{'TOTAL':<25} {total_docs:>10} {total_chunks:>10}") # Qdrant stats print("\nQdrant Collection:") qdrant_stats = await qdrant.get_stats() if "error" in qdrant_stats: print(f" Error: {qdrant_stats['error']}") else: print(f" Collection: {qdrant_stats['collection']}") print(f" Points: {qdrant_stats['points_count']}") print(f" Status: {qdrant_stats['status']}") async def main(): """Main entry point.""" parser = argparse.ArgumentParser(description="DSFA Corpus Ingestion Pipeline") parser.add_argument("--init-sources", action="store_true", help="Register all sources") parser.add_argument("--init-tables", action="store_true", help="Initialize database tables") parser.add_argument("--ingest", type=str, help="Ingest specific source by code") parser.add_argument("--ingest-all", action="store_true", help="Ingest all sources") parser.add_argument("--status", action="store_true", help="Show ingestion status") parser.add_argument("--init-qdrant", action="store_true", help="Initialize Qdrant collection") args = parser.parse_args() pool = await asyncpg.create_pool(DATABASE_URL) try: if args.init_tables: await init_dsfa_tables(pool) if args.init_sources: await register_all_sources(pool) if args.init_qdrant: qdrant = DSFAQdrantService() await qdrant.ensure_collection() print(f"Qdrant collection {DSFA_COLLECTION} initialized") if args.status: await get_ingestion_status(pool) if args.ingest: print(f"Ingesting source: {args.ingest}") # TODO: Implement document ingestion if args.ingest_all: print("Ingesting all sources...") # TODO: Implement bulk ingestion finally: await pool.close() if __name__ == "__main__": asyncio.run(main())