Files
breakpilot-lehrer/klausur-service/backend/dsfa_corpus_ingestion.py
Benjamin Admin 9ba420fa91
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Fix: Remove broken getKlausurApiUrl and clean up empty lines
sed replacement left orphaned hostname references in story page
and empty lines in getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 16:02:04 +02:00

198 lines
6.7 KiB
Python

"""
DSFA Corpus Ingestion Pipeline.
Indexes DSFA guidance documents into Qdrant with full source attribution.
Collections:
- bp_dsfa_corpus: All DSFA-related documents (WP248, DSK, Muss-Listen)
Usage:
python dsfa_corpus_ingestion.py --init-sources # Register all sources
python dsfa_corpus_ingestion.py --ingest WP248 # Ingest specific source
python dsfa_corpus_ingestion.py --ingest-all # Ingest all sources
python dsfa_corpus_ingestion.py --status # Show ingestion status
"""
import os
import asyncio
import argparse
from typing import List
from collections import defaultdict
import asyncpg
# ── Re-exports for backward compatibility ──────────────────────────────────
# All existing `from dsfa_corpus_ingestion import X` continue to work.
from dsfa_sources_registry import ( # noqa: F401
LICENSE_REGISTRY,
DSFA_SOURCES,
DSFA_CHUNK_CONFIG,
)
from dsfa_corpus_store import ( # noqa: F401
DSFACorpusStore,
DSFAChunkPayload,
DSFASearchResult,
)
from dsfa_qdrant_service import ( # noqa: F401
DSFAQdrantService,
DSFA_COLLECTION,
)
from dsfa_chunking import ( # noqa: F401
chunk_text_recursive,
chunk_by_sections,
chunk_by_list_items,
chunk_document,
)
# ── Configuration ──────────────────────────────────────────────────────────
DATABASE_URL = os.getenv(
"DATABASE_URL",
"postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db",
)
# ── Attribution Functions ──────────────────────────────────────────────────
def generate_attribution_notice(results: List[DSFASearchResult]) -> str:
    """Generate a combined attribution notice for the sources in *results*.

    Groups results by license code and emits one line per license whose
    registry entry requires attribution. Unknown license codes default to
    requiring attribution (the safe side).

    Returns an empty string when no source requires attribution.
    """
    by_license = defaultdict(list)
    for r in results:
        by_license[r.license_code].append(r)

    notices = []
    for license_code, items in by_license.items():
        license_info = LICENSE_REGISTRY.get(license_code, {})
        if license_info.get("attribution_required", True):
            # Sort the de-duplicated source names: joining a raw set yields
            # arbitrary order, which made the notice nondeterministic.
            sources = ", ".join(sorted({r.source_name for r in items}))
            license_name = license_info.get("name", license_code)
            notices.append(f"{sources} - {license_name}")

    if notices:
        # "Quellennachweis" = citation of sources (user-facing German label).
        return "Quellennachweis:\n" + "\n".join(notices)
    return ""
def get_license_label(license_code: str) -> str:
    """Return the human-readable name for *license_code*.

    Falls back to the raw code when the registry has no entry (or the
    entry has no "name" field).
    """
    return LICENSE_REGISTRY.get(license_code, {}).get("name", license_code)
# ── Main Functions ─────────────────────────────────────────────────────────
async def init_dsfa_tables(pool: asyncpg.Pool):
    """Initialize the DSFA tables by executing the bundled SQL migration.

    Reads migrations/003_dsfa_rag_tables.sql (located relative to this
    module) and executes it on a connection acquired from *pool*.
    """
    migration_path = os.path.join(
        os.path.dirname(__file__),
        "migrations",
        "003_dsfa_rag_tables.sql",
    )
    async with pool.acquire() as conn:
        # Explicit encoding: the migration may contain non-ASCII text
        # (e.g. German umlauts) and must not depend on the platform default.
        with open(migration_path, "r", encoding="utf-8") as f:
            await conn.execute(f.read())
    print("DSFA tables initialized successfully")
async def register_all_sources(pool: asyncpg.Pool):
    """Register all DSFA sources in the database (skips migrated sources)."""
    store = DSFACorpusStore(pool)
    counts = {"registered": 0, "skipped": 0}
    for entry in DSFA_SOURCES:
        # Sources carrying a "migrated_to" marker live elsewhere now.
        target = entry.get("migrated_to")
        if target:
            print(f"Skipping migrated source: {entry['source_code']} -> {target}")
            counts["skipped"] += 1
            continue
        new_id = await store.register_source(entry)
        print(f"Registered source: {entry['source_code']} -> {new_id}")
        counts["registered"] += 1
    print(f"\nTotal sources registered: {counts['registered']} (skipped {counts['skipped']} migrated)")
async def get_ingestion_status(pool: asyncpg.Pool):
    """Print the current ingestion status: PostgreSQL stats, then Qdrant stats."""
    store = DSFACorpusStore(pool)
    qdrant = DSFAQdrantService()

    print("\n=== DSFA Corpus Status ===\n")

    # PostgreSQL: one table row per registered source.
    divider = "-" * 80
    stats = await store.get_source_stats()
    print("PostgreSQL Sources:")
    print(divider)
    print(f"{'Source Code':<25} {'Documents':>10} {'Chunks':>10} {'Last Indexed':<20}")
    print(divider)

    doc_sum = 0
    chunk_sum = 0
    for row in stats:
        docs = row.get("document_count", 0)
        chunks = row.get("chunk_count", 0)
        doc_sum += docs
        chunk_sum += chunks
        ts = row.get("last_indexed_at")
        ts_label = ts.strftime("%Y-%m-%d %H:%M") if ts else "Never"
        print(f"{row['source_code']:<25} {docs:>10} {chunks:>10} {ts_label:<20}")

    print(divider)
    print(f"{'TOTAL':<25} {doc_sum:>10} {chunk_sum:>10}")

    # Qdrant: single vector collection holding the DSFA corpus chunks.
    print("\nQdrant Collection:")
    qdrant_stats = await qdrant.get_stats()
    if "error" in qdrant_stats:
        print(f" Error: {qdrant_stats['error']}")
    else:
        print(f" Collection: {qdrant_stats['collection']}")
        print(f" Points: {qdrant_stats['points_count']}")
        print(f" Status: {qdrant_stats['status']}")
async def main():
    """CLI entry point: parse flags and dispatch the requested actions.

    Flags are not mutually exclusive — several actions can be requested in
    one invocation and run in the order checked below.
    """
    parser = argparse.ArgumentParser(description="DSFA Corpus Ingestion Pipeline")
    parser.add_argument("--init-sources", action="store_true", help="Register all sources")
    parser.add_argument("--init-tables", action="store_true", help="Initialize database tables")
    parser.add_argument("--ingest", type=str, help="Ingest specific source by code")
    parser.add_argument("--ingest-all", action="store_true", help="Ingest all sources")
    parser.add_argument("--status", action="store_true", help="Show ingestion status")
    parser.add_argument("--init-qdrant", action="store_true", help="Initialize Qdrant collection")
    args = parser.parse_args()

    pool = await asyncpg.create_pool(DATABASE_URL)
    try:
        if args.init_tables:
            await init_dsfa_tables(pool)
        if args.init_sources:
            await register_all_sources(pool)
        if args.init_qdrant:
            service = DSFAQdrantService()
            await service.ensure_collection()
            print(f"Qdrant collection {DSFA_COLLECTION} initialized")
        if args.status:
            await get_ingestion_status(pool)
        if args.ingest:
            print(f"Ingesting source: {args.ingest}")
            # TODO: Implement document ingestion
        if args.ingest_all:
            print("Ingesting all sources...")
            # TODO: Implement bulk ingestion
    finally:
        # Always release the pool, even when an action raises.
        await pool.close()
# Script entry point: drive the async CLI through asyncio's event loop.
if __name__ == "__main__":
    asyncio.run(main())