This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues, create pull requests, or push commits.
Files
breakpilot-pwa/klausur-service/backend/full_reingestion.py
BreakPilot Dev 19855efacc
Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
feat: BreakPilot PWA - Full codebase (clean push without large binaries)
All services: admin-v2, studio-v2, website, ai-compliance-sdk,
consent-service, klausur-service, voice-service, and infrastructure.
Large PDFs and compiled binaries excluded via .gitignore.
2026-02-11 13:25:58 +01:00

117 lines
3.9 KiB
Python

#!/usr/bin/env python3
"""
Full Re-Ingestion Script for Legal Corpus and UCCA.

This script:
1. Deletes all existing chunks from bp_legal_corpus
2. Re-ingests all 19 regulations with improved semantic chunking
3. Logs progress to a file for monitoring

Run in background on Mac Mini:
nohup python full_reingestion.py > /tmp/reingestion.log 2>&1 &
"""
import asyncio
import logging
import os
import sys
from datetime import datetime

# Configure logging: mirror all output to stdout AND a persistent file so a
# nohup'd background run (see module docstring) can be tailed from /tmp.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('/tmp/legal_corpus_reingestion.log')
    ]
)
logger = logging.getLogger(__name__)

# Set environment variables for the Docker network BEFORE the project imports
# below — presumably legal_corpus_ingestion picks these up at import or
# construction time (TODO confirm). The late, PEP 8-violating import position
# is therefore deliberate; do not reorder these statements.
os.environ.setdefault("QDRANT_HOST", "qdrant")
os.environ.setdefault("EMBEDDING_SERVICE_URL", "http://embedding-service:8087")

from legal_corpus_ingestion import LegalCorpusIngestion, REGULATIONS, LEGAL_CORPUS_COLLECTION
from qdrant_client import QdrantClient
async def main() -> None:
    """Delete and rebuild the entire legal-corpus Qdrant collection.

    Steps:
      1. Record the current chunk count, then drop the collection.
      2. Re-ingest every regulation in REGULATIONS with the improved
         semantic chunking (constructing LegalCorpusIngestion recreates
         the collection).
      3. Log a per-regulation and aggregate summary.

    A failure on a single regulation is logged (with traceback) and
    recorded as 0 chunks; the run continues with the next regulation.
    """
    logger.info("=" * 60)
    logger.info("FULL LEGAL CORPUS RE-INGESTION")
    logger.info("Started at: %s", datetime.now().isoformat())
    logger.info("=" * 60)

    total = len(REGULATIONS)  # avoid hard-coding the regulation count
    ingestion = LegalCorpusIngestion()
    try:
        # Step 1: Delete all existing points by dropping the collection.
        logger.info("\n[Step 1] Deleting all existing chunks...")
        qdrant = QdrantClient(host=os.getenv("QDRANT_HOST", "qdrant"), port=6333)

        # Capture the pre-deletion count for the summary; tolerate a missing
        # collection (e.g. first run) by falling back to 0.
        try:
            collection_info = qdrant.get_collection(LEGAL_CORPUS_COLLECTION)
            old_count = collection_info.points_count
            logger.info(" Current chunk count: %s", old_count)
        except Exception as e:
            logger.warning(" Could not get collection info: %s", e)
            old_count = 0

        logger.info(" Deleting collection and recreating...")
        try:
            qdrant.delete_collection(LEGAL_CORPUS_COLLECTION)
            logger.info(" Collection deleted.")
        except Exception as e:
            logger.warning(" Could not delete collection: %s", e)

        # Close the first ingestion instance before re-instantiating so its
        # connections are not leaked; the new constructor recreates the
        # (now deleted) collection.
        await ingestion.close()
        ingestion = LegalCorpusIngestion()
        logger.info(" Collection recreated.")

        # Step 2: Re-ingest all regulations.
        logger.info("\n[Step 2] Re-ingesting all %d regulations...", total)
        logger.info(" Regulations: %s", [r.code for r in REGULATIONS])

        results: dict = {}
        total_chunks = 0
        for i, regulation in enumerate(REGULATIONS, 1):
            logger.info("\n [%d/%d] Processing %s: %s",
                        i, total, regulation.code, regulation.name)
            try:
                count = await ingestion.ingest_regulation(regulation)
            except Exception as e:
                # logger.exception preserves the traceback (logger.error
                # would not); best-effort: continue with the next regulation.
                logger.exception(" -> FAILED: %s", e)
                results[regulation.code] = 0
            else:
                results[regulation.code] = count
                total_chunks += count
                logger.info(" -> %d chunks indexed", count)

        # Step 3: Summary.
        logger.info("\n" + "=" * 60)
        logger.info("SUMMARY")
        logger.info("=" * 60)
        logger.info(" Previous chunk count: %s", old_count)
        logger.info(" New chunk count: %s", total_chunks)
        logger.info(" Difference: %+d", total_chunks - old_count)
        logger.info("\n Per regulation:")
        for code, count in sorted(results.items()):
            logger.info(" %s: %s chunks", code, count)

        # BSI specific: the three TR-03161 parts previously yielded only
        # 18 chunks in total; track the improvement explicitly.
        bsi_total = sum(results.get(f"BSI-TR-03161-{i}", 0) for i in (1, 2, 3))
        logger.info("\n BSI-TR-03161 total: %d chunks (was 18)", bsi_total)

        logger.info("\n" + "=" * 60)
        logger.info("Completed at: %s", datetime.now().isoformat())
        logger.info("=" * 60)
    finally:
        # Always release the ingestion client's resources, even on failure.
        await ingestion.close()
# Script entry point: drive the async re-ingestion to completion.
if __name__ == "__main__":
    asyncio.run(main())