Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
198 lines
6.7 KiB
Python
198 lines
6.7 KiB
Python
"""
|
|
DSFA Corpus Ingestion Pipeline.
|
|
|
|
Indexes DSFA guidance documents into Qdrant with full source attribution.
|
|
|
|
Collections:
|
|
- bp_dsfa_corpus: All DSFA-related documents (WP248, DSK, Muss-Listen)
|
|
|
|
Usage:
|
|
python dsfa_corpus_ingestion.py --init-sources # Register all sources
|
|
python dsfa_corpus_ingestion.py --ingest WP248 # Ingest specific source
|
|
python dsfa_corpus_ingestion.py --ingest-all # Ingest all sources
|
|
python dsfa_corpus_ingestion.py --status # Show ingestion status
|
|
"""
|
|
|
|
import os
|
|
import asyncio
|
|
import argparse
|
|
from typing import List
|
|
from collections import defaultdict
|
|
|
|
import asyncpg
|
|
|
|
# ── Re-exports for backward compatibility ──────────────────────────────────
|
|
# All existing `from dsfa_corpus_ingestion import X` continue to work.
|
|
|
|
from dsfa_sources_registry import ( # noqa: F401
|
|
LICENSE_REGISTRY,
|
|
DSFA_SOURCES,
|
|
DSFA_CHUNK_CONFIG,
|
|
)
|
|
from dsfa_corpus_store import ( # noqa: F401
|
|
DSFACorpusStore,
|
|
DSFAChunkPayload,
|
|
DSFASearchResult,
|
|
)
|
|
from dsfa_qdrant_service import ( # noqa: F401
|
|
DSFAQdrantService,
|
|
DSFA_COLLECTION,
|
|
)
|
|
from dsfa_chunking import ( # noqa: F401
|
|
chunk_text_recursive,
|
|
chunk_by_sections,
|
|
chunk_by_list_items,
|
|
chunk_document,
|
|
)
|
|
|
|
# ── Configuration ──────────────────────────────────────────────────────────
|
|
|
|
# PostgreSQL connection string; overridable via the DATABASE_URL environment
# variable. Default targets the docker-compose "postgres" service.
DATABASE_URL = os.getenv(
    "DATABASE_URL",
    "postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db",
)
|
|
|
|
|
|
# ── Attribution Functions ──────────────────────────────────────────────────
|
|
|
|
def generate_attribution_notice(results: List[DSFASearchResult]) -> str:
    """Generate a combined attribution notice for all used sources.

    Results are grouped by license code; every license that requires
    attribution contributes one bullet line listing its source names.

    Args:
        results: Search results whose sources may require attribution.

    Returns:
        A "Quellennachweis:" block with one line per license, or an empty
        string when no source requires attribution.
    """
    by_license = defaultdict(list)
    for r in results:
        by_license[r.license_code].append(r)

    notices = []
    for license_code, items in by_license.items():
        license_info = LICENSE_REGISTRY.get(license_code, {})
        # Unknown licenses default to requiring attribution (safe side).
        if license_info.get("attribution_required", True):
            # Sort the deduplicated names: plain set iteration order varies
            # between runs, which made the notice text nondeterministic.
            sources = ", ".join(sorted({r.source_name for r in items}))
            license_name = license_info.get("name", license_code)
            notices.append(f"• {sources} - {license_name}")

    if notices:
        return "Quellennachweis:\n" + "\n".join(notices)
    return ""
|
|
|
|
|
|
def get_license_label(license_code: str) -> str:
    """Return the human-readable name for *license_code*.

    Falls back to the raw code when the license is not registered.
    """
    return LICENSE_REGISTRY.get(license_code, {}).get("name", license_code)
|
|
|
|
|
|
# ── Main Functions ─────────────────────────────────────────────────────────
|
|
|
|
async def init_dsfa_tables(pool: asyncpg.Pool):
    """Initialize DSFA tables by executing the SQL migration script.

    Reads migrations/003_dsfa_rag_tables.sql (relative to this file) and
    executes it on a connection taken from *pool*.

    Args:
        pool: asyncpg connection pool for the target database.
    """
    migration_path = os.path.join(
        os.path.dirname(__file__),
        "migrations",
        "003_dsfa_rag_tables.sql",
    )

    # Read the file before acquiring a connection so a missing or unreadable
    # migration does not tie up a pool slot; explicit encoding avoids
    # depending on the platform default codec.
    with open(migration_path, "r", encoding="utf-8") as f:
        migration_sql = f.read()

    async with pool.acquire() as conn:
        await conn.execute(migration_sql)

    print("DSFA tables initialized successfully")
|
|
|
|
|
|
async def register_all_sources(pool: asyncpg.Pool):
    """Register every DSFA source in the database.

    Sources carrying a ``migrated_to`` marker are reported and skipped; all
    others are handed to the corpus store. A summary line is printed at the
    end.

    Args:
        pool: asyncpg connection pool backing the corpus store.
    """
    store = DSFACorpusStore(pool)

    done, migrated = 0, 0
    for entry in DSFA_SOURCES:
        target = entry.get("migrated_to")
        if target:
            print(f"Skipping migrated source: {entry['source_code']} -> {target}")
            migrated += 1
        else:
            new_id = await store.register_source(entry)
            print(f"Registered source: {entry['source_code']} -> {new_id}")
            done += 1

    print(f"\nTotal sources registered: {done} (skipped {migrated} migrated)")
|
|
|
|
|
|
async def get_ingestion_status(pool: asyncpg.Pool):
    """Print the current corpus ingestion status.

    Shows a per-source table from PostgreSQL (documents, chunks, last index
    time) followed by the Qdrant collection statistics.

    Args:
        pool: asyncpg connection pool backing the corpus store.
    """
    store = DSFACorpusStore(pool)
    qdrant = DSFAQdrantService()

    print("\n=== DSFA Corpus Status ===\n")

    rule = "-" * 80

    # PostgreSQL side: one table row per registered source.
    stats = await store.get_source_stats()
    print("PostgreSQL Sources:")
    print(rule)
    print(f"{'Source Code':<25} {'Documents':>10} {'Chunks':>10} {'Last Indexed':<20}")
    print(rule)

    total_docs = 0
    total_chunks = 0
    for row in stats:
        docs = row.get("document_count", 0)
        chunks = row.get("chunk_count", 0)
        total_docs += docs
        total_chunks += chunks
        ts = row.get("last_indexed_at")
        when = ts.strftime("%Y-%m-%d %H:%M") if ts else "Never"
        print(f"{row['source_code']:<25} {docs:>10} {chunks:>10} {when:<20}")

    print(rule)
    print(f"{'TOTAL':<25} {total_docs:>10} {total_chunks:>10}")

    # Qdrant side: collection-level stats, or the error it reported.
    print("\nQdrant Collection:")
    qdrant_stats = await qdrant.get_stats()
    if "error" in qdrant_stats:
        print(f" Error: {qdrant_stats['error']}")
    else:
        print(f" Collection: {qdrant_stats['collection']}")
        print(f" Points: {qdrant_stats['points_count']}")
        print(f" Status: {qdrant_stats['status']}")
|
|
|
|
|
|
async def main():
    """CLI entry point: parse flags and run the requested pipeline steps."""
    ap = argparse.ArgumentParser(description="DSFA Corpus Ingestion Pipeline")
    ap.add_argument("--init-sources", action="store_true", help="Register all sources")
    ap.add_argument("--init-tables", action="store_true", help="Initialize database tables")
    ap.add_argument("--ingest", type=str, help="Ingest specific source by code")
    ap.add_argument("--ingest-all", action="store_true", help="Ingest all sources")
    ap.add_argument("--status", action="store_true", help="Show ingestion status")
    ap.add_argument("--init-qdrant", action="store_true", help="Initialize Qdrant collection")
    opts = ap.parse_args()

    pool = await asyncpg.create_pool(DATABASE_URL)

    try:
        # Flags are not mutually exclusive; each requested step runs in order.
        if opts.init_tables:
            await init_dsfa_tables(pool)

        if opts.init_sources:
            await register_all_sources(pool)

        if opts.init_qdrant:
            service = DSFAQdrantService()
            await service.ensure_collection()
            print(f"Qdrant collection {DSFA_COLLECTION} initialized")

        if opts.status:
            await get_ingestion_status(pool)

        if opts.ingest:
            print(f"Ingesting source: {opts.ingest}")
            # TODO: Implement document ingestion

        if opts.ingest_all:
            print("Ingesting all sources...")
            # TODO: Implement bulk ingestion

    finally:
        await pool.close()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(main())
|