Files
breakpilot-lehrer/klausur-service/backend/full_reingestion.py
Benjamin Boenisch 5a31f52310 Initial commit: breakpilot-lehrer - Lehrer KI Platform
Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website,
Klausur-Service, School-Service, Voice-Service, Geo-Service,
BreakPilot Drive, Agent-Core

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 23:47:26 +01:00

117 lines
3.9 KiB
Python

#!/usr/bin/env python3
"""Full re-ingestion of the legal corpus and UCCA into Qdrant.

The script:
1. Deletes every existing chunk from ``bp_legal_corpus``.
2. Re-ingests all 19 regulations using the improved semantic chunking.
3. Writes progress to a log file so the run can be monitored.

Intended to run in the background on the Mac Mini:
    nohup python full_reingestion.py > /tmp/reingestion.log 2>&1 &
"""
import asyncio
import logging
import os
import sys
from datetime import datetime

# Log to stdout (captured by nohup) and to a dedicated file in parallel.
_log_handlers = [
    logging.StreamHandler(sys.stdout),
    logging.FileHandler('/tmp/legal_corpus_reingestion.log'),
]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=_log_handlers,
)
logger = logging.getLogger(__name__)

# Default to Docker-network service names unless the caller overrides them.
os.environ.setdefault("QDRANT_HOST", "qdrant")
os.environ.setdefault("EMBEDDING_SERVICE_URL", "http://embedding-service:8087")

# NOTE(review): imported after the env vars are set — presumably the module
# reads them at import time; confirm before reordering.
from legal_corpus_ingestion import LegalCorpusIngestion, REGULATIONS, LEGAL_CORPUS_COLLECTION
from qdrant_client import QdrantClient
async def main():
    """Drop and fully rebuild the legal-corpus Qdrant collection.

    Steps:
      1. Record the current chunk count, then delete the
         ``bp_legal_corpus`` collection.
      2. Re-ingest every regulation in ``REGULATIONS`` with the current
         semantic chunking, logging a per-regulation result.
      3. Log a before/after summary (including the combined BSI-TR-03161
         parts).

    Failures on individual regulations are logged and recorded as 0
    chunks; they do not abort the run.
    """
    logger.info("=" * 60)
    logger.info("FULL LEGAL CORPUS RE-INGESTION")
    logger.info(f"Started at: {datetime.now().isoformat()}")
    logger.info("=" * 60)

    ingestion = None
    try:
        # Step 1: delete all existing points by dropping the collection.
        logger.info("\n[Step 1] Deleting all existing chunks...")
        qdrant = QdrantClient(host=os.getenv("QDRANT_HOST", "qdrant"), port=6333)

        # Record the current size so the summary can report the delta.
        try:
            collection_info = qdrant.get_collection(LEGAL_CORPUS_COLLECTION)
            old_count = collection_info.points_count
            logger.info(f" Current chunk count: {old_count}")
        except Exception as e:
            logger.warning(f" Could not get collection info: {e}")
            old_count = 0

        logger.info(" Deleting collection and recreating...")
        try:
            qdrant.delete_collection(LEGAL_CORPUS_COLLECTION)
            logger.info(" Collection deleted.")
        except Exception as e:
            logger.warning(f" Could not delete collection: {e}")

        # Construct the ingestion client only AFTER the old collection is
        # gone: its constructor recreates the collection. (Previously a
        # first instance was built up-front and then replaced here without
        # ever being closed — a client-resource leak.)
        ingestion = LegalCorpusIngestion()
        logger.info(" Collection recreated.")

        # Step 2: re-ingest every regulation. Derive the count from
        # REGULATIONS instead of hard-coding 19, so the log stays correct
        # when regulations are added or removed.
        total_regs = len(REGULATIONS)
        logger.info(f"\n[Step 2] Re-ingesting all {total_regs} regulations...")
        logger.info(f" Regulations: {[r.code for r in REGULATIONS]}")
        results = {}
        total_chunks = 0
        for i, regulation in enumerate(REGULATIONS, 1):
            logger.info(f"\n [{i}/{total_regs}] Processing {regulation.code}: {regulation.name}")
            try:
                count = await ingestion.ingest_regulation(regulation)
                results[regulation.code] = count
                total_chunks += count
                logger.info(f" -> {count} chunks indexed")
            except Exception as e:
                # Best-effort: record the failure and continue with the rest.
                logger.error(f" -> FAILED: {e}")
                results[regulation.code] = 0

        # Step 3: summary.
        logger.info("\n" + "=" * 60)
        logger.info("SUMMARY")
        logger.info("=" * 60)
        logger.info(f" Previous chunk count: {old_count}")
        logger.info(f" New chunk count: {total_chunks}")
        logger.info(f" Difference: {total_chunks - old_count:+d}")
        logger.info("\n Per regulation:")
        for code, count in sorted(results.items()):
            logger.info(f" {code}: {count} chunks")

        # BSI TR-03161 is split into three parts; report their combined total.
        bsi_total = sum(results.get(f"BSI-TR-03161-{i}", 0) for i in [1, 2, 3])
        logger.info(f"\n BSI-TR-03161 total: {bsi_total} chunks (was 18)")

        logger.info("\n" + "=" * 60)
        logger.info(f"Completed at: {datetime.now().isoformat()}")
        logger.info("=" * 60)
    finally:
        # Close whichever ingestion client was actually created.
        if ingestion is not None:
            await ingestion.close()
# Entry point: run the async re-ingestion to completion.
if __name__ == "__main__":
    asyncio.run(main())