#!/usr/bin/env python3
"""
Full Re-Ingestion Script for Legal Corpus and UCCA.

This script:
1. Deletes all existing chunks from bp_legal_corpus
2. Re-ingests all 19 regulations with improved semantic chunking
3. Logs progress to a file for monitoring

Run in background on Mac Mini:
    nohup python full_reingestion.py > /tmp/reingestion.log 2>&1 &
"""

import asyncio
import logging
import os
import sys
from datetime import datetime

# Log to stdout AND a file so a backgrounded (nohup) run can still be
# followed with `tail -f /tmp/legal_corpus_reingestion.log`.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('/tmp/legal_corpus_reingestion.log')
    ]
)
logger = logging.getLogger(__name__)

# Set environment variables for the Docker network BEFORE importing the
# ingestion module, which reads them when it is constructed.
os.environ.setdefault("QDRANT_HOST", "qdrant")
os.environ.setdefault("EMBEDDING_SERVICE_URL", "http://embedding-service:8087")

from legal_corpus_ingestion import LegalCorpusIngestion, REGULATIONS, LEGAL_CORPUS_COLLECTION
from qdrant_client import QdrantClient


async def main():
    """Wipe the legal-corpus collection and re-ingest every regulation.

    Steps:
        1. Delete the existing Qdrant collection (after logging its size).
        2. Re-ingest all regulations with the improved semantic chunking.
        3. Log a per-regulation and overall summary.

    The ``LegalCorpusIngestion`` instance is created only AFTER the old
    collection is deleted (its constructor recreates the collection), and is
    always closed in ``finally`` — the previous version leaked a first
    instance by constructing one up front and then replacing it.
    """
    logger.info("=" * 60)
    logger.info("FULL LEGAL CORPUS RE-INGESTION")
    logger.info(f"Started at: {datetime.now().isoformat()}")
    logger.info("=" * 60)

    ingestion = None  # created after the old collection is removed
    try:
        # Step 1: Delete all existing points by dropping the collection.
        logger.info("\n[Step 1] Deleting all existing chunks...")
        qdrant = QdrantClient(host=os.getenv("QDRANT_HOST", "qdrant"), port=6333)

        # Record the current size so the summary can report the delta.
        try:
            collection_info = qdrant.get_collection(LEGAL_CORPUS_COLLECTION)
            old_count = collection_info.points_count
            logger.info(f"  Current chunk count: {old_count}")
        except Exception as e:
            # Collection may not exist yet — treat as empty, keep going.
            logger.warning(f"  Could not get collection info: {e}")
            old_count = 0

        logger.info("  Deleting collection and recreating...")
        try:
            qdrant.delete_collection(LEGAL_CORPUS_COLLECTION)
            logger.info("  Collection deleted.")
        except Exception as e:
            logger.warning(f"  Could not delete collection: {e}")

        # The ingestion class recreates the collection on construction.
        ingestion = LegalCorpusIngestion()
        logger.info("  Collection recreated.")

        # Step 2: Re-ingest all regulations.
        num_regulations = len(REGULATIONS)
        logger.info(f"\n[Step 2] Re-ingesting all {num_regulations} regulations...")
        logger.info(f"  Regulations: {[r.code for r in REGULATIONS]}")

        results = {}
        total_chunks = 0
        for i, regulation in enumerate(REGULATIONS, 1):
            logger.info(
                f"\n  [{i}/{num_regulations}] Processing {regulation.code}: {regulation.name}"
            )
            try:
                count = await ingestion.ingest_regulation(regulation)
                results[regulation.code] = count
                total_chunks += count
                logger.info(f"    -> {count} chunks indexed")
            except Exception:
                # logger.exception preserves the traceback for post-mortems;
                # a failed regulation must not abort the rest of the run.
                logger.exception("    -> FAILED")
                results[regulation.code] = 0

        # Step 3: Summary.
        logger.info("\n" + "=" * 60)
        logger.info("SUMMARY")
        logger.info("=" * 60)
        logger.info(f"  Previous chunk count: {old_count}")
        logger.info(f"  New chunk count: {total_chunks}")
        logger.info(f"  Difference: {total_chunks - old_count:+d}")
        logger.info("\n  Per regulation:")
        for code, count in sorted(results.items()):
            logger.info(f"    {code}: {count} chunks")

        # BSI-TR-03161 is split into three parts; report the combined total.
        bsi_total = sum(results.get(f"BSI-TR-03161-{i}", 0) for i in [1, 2, 3])
        logger.info(f"\n  BSI-TR-03161 total: {bsi_total} chunks (was 18)")

        logger.info("\n" + "=" * 60)
        logger.info(f"Completed at: {datetime.now().isoformat()}")
        logger.info("=" * 60)

    finally:
        # Guard: ingestion is None if Step 1 raised before construction.
        if ingestion is not None:
            await ingestion.close()


if __name__ == "__main__":
    asyncio.run(main())