#!/usr/bin/env python3
|
|
"""
|
|
Full Re-Ingestion Script for Legal Corpus and UCCA.
|
|
|
|
This script:
|
|
1. Deletes all existing chunks from bp_legal_corpus
|
|
2. Re-ingests all 19 regulations with improved semantic chunking
|
|
3. Logs progress to a file for monitoring
|
|
|
|
Run in background on Mac Mini:
|
|
nohup python full_reingestion.py > /tmp/reingestion.log 2>&1 &
|
|
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
import sys
|
|
from datetime import datetime
|
|
|
|
# Logging setup: mirror every record to stdout and to a persistent file so a
# detached background run (nohup) can still be monitored with tail -f.
_LOG_HANDLERS = [
    logging.StreamHandler(sys.stdout),
    logging.FileHandler('/tmp/legal_corpus_reingestion.log'),
]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=_LOG_HANDLERS,
)
logger = logging.getLogger(__name__)
|
|
|
|
# Default service endpoints for the Docker network. setdefault never
# overwrites, so values already present in the real environment win.
for _key, _default in (
    ("QDRANT_HOST", "qdrant"),
    ("EMBEDDING_SERVICE_URL", "http://embedding-service:8087"),
):
    os.environ.setdefault(_key, _default)
|
|
|
|
from legal_corpus_ingestion import LegalCorpusIngestion, REGULATIONS, LEGAL_CORPUS_COLLECTION
|
|
from qdrant_client import QdrantClient
|
|
|
|
|
|
async def main():
    """Run the full legal-corpus re-ingestion.

    Steps:
      1. Delete the existing Qdrant collection (dropping all old chunks).
      2. Re-ingest every regulation in ``REGULATIONS`` with the improved
         semantic chunking.
      3. Log a per-regulation breakdown and an aggregate summary.
    """
    logger.info("=" * 60)
    logger.info("FULL LEGAL CORPUS RE-INGESTION")
    logger.info(f"Started at: {datetime.now().isoformat()}")
    logger.info("=" * 60)

    # Initialize (the constructor appears to create the collection when it is
    # missing -- see the recreate step below; TODO confirm against
    # LegalCorpusIngestion).
    ingestion = LegalCorpusIngestion()

    try:
        # Step 1: Delete all existing points
        logger.info("\n[Step 1] Deleting all existing chunks...")
        qdrant = QdrantClient(host=os.getenv("QDRANT_HOST", "qdrant"), port=6333)

        # Record the current size so the summary can report a delta.
        try:
            collection_info = qdrant.get_collection(LEGAL_CORPUS_COLLECTION)
            # points_count may be None; normalize to 0 so the summary
            # arithmetic (total_chunks - old_count) can never raise.
            old_count = collection_info.points_count or 0
            logger.info(f" Current chunk count: {old_count}")
        except Exception as e:
            logger.warning(f" Could not get collection info: {e}")
            old_count = 0

        # Delete all points by recreating collection
        logger.info(" Deleting collection and recreating...")
        try:
            qdrant.delete_collection(LEGAL_CORPUS_COLLECTION)
            logger.info(" Collection deleted.")
        except Exception as e:
            logger.warning(f" Could not delete collection: {e}")
        finally:
            # The admin client is only needed for step 1; release it here.
            qdrant.close()

        # Close the stale ingestion instance before replacing it so its
        # underlying connections are not leaked; the fresh instance
        # recreates the collection.
        await ingestion.close()
        ingestion = LegalCorpusIngestion()
        logger.info(" Collection recreated.")

        # Step 2: Re-ingest all regulations (count derived from the data,
        # not hard-coded, so adding a regulation needs no edit here).
        total = len(REGULATIONS)
        logger.info(f"\n[Step 2] Re-ingesting all {total} regulations...")
        logger.info(f" Regulations: {[r.code for r in REGULATIONS]}")

        results = {}
        total_chunks = 0

        for i, regulation in enumerate(REGULATIONS, 1):
            logger.info(f"\n [{i}/{total}] Processing {regulation.code}: {regulation.name}")
            try:
                count = await ingestion.ingest_regulation(regulation)
                results[regulation.code] = count
                total_chunks += count
                logger.info(f" -> {count} chunks indexed")
            except Exception:
                # logger.exception records the full traceback, which plain
                # logger.error(f"...{e}") discarded.
                logger.exception(" -> FAILED")
                results[regulation.code] = 0

        # Step 3: Summary
        logger.info("\n" + "=" * 60)
        logger.info("SUMMARY")
        logger.info("=" * 60)
        logger.info(f" Previous chunk count: {old_count}")
        logger.info(f" New chunk count: {total_chunks}")
        logger.info(f" Difference: {total_chunks - old_count:+d}")
        logger.info("\n Per regulation:")
        for code, count in sorted(results.items()):
            logger.info(f" {code}: {count} chunks")

        # BSI specific: aggregate the three TR-03161 parts, tracked because
        # the previous chunking yielded only 18 chunks for them.
        bsi_total = sum(results.get(f"BSI-TR-03161-{i}", 0) for i in [1, 2, 3])
        logger.info(f"\n BSI-TR-03161 total: {bsi_total} chunks (was 18)")

        logger.info("\n" + "=" * 60)
        logger.info(f"Completed at: {datetime.now().isoformat()}")
        logger.info("=" * 60)

    finally:
        # Always release the (possibly re-created) ingestion client.
        await ingestion.close()
|
|
|
|
|
|
# Script entry point: drive the async workflow on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
|