"""
|
|
Robust Legal Corpus Ingestion for UCCA RAG Integration.
|
|
|
|
This version handles large documents and unstable embedding services by:
|
|
- Processing one text at a time
|
|
- Health checks before each embedding
|
|
- Automatic retry with exponential backoff
|
|
- Progress tracking for resume capability
|
|
- Longer delays to prevent service overload
|
|
|
|
Usage:
|
|
python legal_corpus_robust.py --ingest DPF
|
|
python legal_corpus_robust.py --ingest-all-missing
|
|
python legal_corpus_robust.py --status
|
|
"""
|
|
|
|
import asyncio
|
|
import hashlib
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
import sys
|
|
from dataclasses import dataclass
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Tuple
|
|
|
|
import httpx
|
|
from qdrant_client import QdrantClient
|
|
from qdrant_client.models import (
|
|
Distance,
|
|
FieldCondition,
|
|
Filter,
|
|
MatchValue,
|
|
PointStruct,
|
|
VectorParams,
|
|
)
|
|
|
|
# Configure logging for the whole script (timestamped INFO-level output).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# Configuration (connection settings overridable via environment variables)
QDRANT_HOST = os.getenv("QDRANT_HOST", "localhost")
QDRANT_PORT = int(os.getenv("QDRANT_PORT", "6333"))
EMBEDDING_SERVICE_URL = os.getenv("EMBEDDING_SERVICE_URL", "http://localhost:8087")
LEGAL_CORPUS_COLLECTION = "bp_legal_corpus"  # Qdrant collection name
VECTOR_SIZE = 1024    # embedding dimensionality the collection is created with
CHUNK_SIZE = 800      # target chunk length in characters (see _chunk_text_semantic)
CHUNK_OVERLAP = 150   # max character overlap carried between consecutive chunks

# Robust settings
MAX_RETRIES = 5                  # embedding attempts per chunk before giving up
INITIAL_DELAY = 2.0              # base (seconds) for exponential backoff: INITIAL_DELAY * 2**attempt
DELAY_BETWEEN_EMBEDDINGS = 2.0   # pause (seconds) after each indexed chunk to avoid overload
HEALTH_CHECK_INTERVAL = 10  # Check health every N embeddings
|
|
|
|
|
|
@dataclass
class Regulation:
    """Regulation metadata.

    Describes one legal/technical document to ingest: identity, the URL
    it is fetched from, and payload fields stored alongside each chunk.
    """

    code: str             # short unique identifier, e.g. "DPF" or "BSI-TR-03161-1"
    name: str             # human-readable short name
    full_name: str        # official full title of the regulation
    regulation_type: str  # category tag, e.g. "eu_regulation" or "bsi_standard"
    source_url: str       # URL the document text is fetched from (HTML expected)
    description: str      # one-line summary stored for display purposes
    language: str = "de"  # ISO language code of the document text
|
|
|
|
|
|
# Regulations that need robust loading (large documents / flaky sources).
# Each entry is processed one chunk at a time by RobustLegalCorpusIngestion.
ROBUST_REGULATIONS: List[Regulation] = [
    Regulation(
        code="DPF",
        name="EU-US Data Privacy Framework",
        full_name="Durchführungsbeschluss (EU) 2023/1795",
        regulation_type="eu_regulation",
        source_url="https://eur-lex.europa.eu/eli/dec_impl/2023/1795/oj",
        description="Angemessenheitsbeschluss für USA-Transfers.",
    ),
    Regulation(
        code="BSI-TR-03161-1",
        name="BSI-TR-03161 Teil 1",
        full_name="BSI Technische Richtlinie - Allgemeine Anforderungen",
        regulation_type="bsi_standard",
        source_url="https://www.bsi.bund.de/SharedDocs/Downloads/DE/BSI/Publikationen/TechnischeRichtlinien/TR03161/BSI-TR-03161-1.pdf",
        description="Allgemeine Sicherheitsanforderungen (45 Prüfaspekte).",
    ),
    Regulation(
        code="BSI-TR-03161-2",
        name="BSI-TR-03161 Teil 2",
        full_name="BSI Technische Richtlinie - Web-Anwendungen",
        regulation_type="bsi_standard",
        source_url="https://www.bsi.bund.de/SharedDocs/Downloads/DE/BSI/Publikationen/TechnischeRichtlinien/TR03161/BSI-TR-03161-2.pdf",
        description="Web-Sicherheit (40 Prüfaspekte).",
    ),
    Regulation(
        code="BSI-TR-03161-3",
        name="BSI-TR-03161 Teil 3",
        full_name="BSI Technische Richtlinie - Hintergrundsysteme",
        regulation_type="bsi_standard",
        source_url="https://www.bsi.bund.de/SharedDocs/Downloads/DE/BSI/Publikationen/TechnischeRichtlinien/TR03161/BSI-TR-03161-3.pdf",
        description="Backend-Sicherheit (35 Prüfaspekte).",
    ),
]
|
|
|
|
|
|
class RobustLegalCorpusIngestion:
    """Handles robust ingestion of large legal documents.

    Combines a synchronous Qdrant client with a lazily created async HTTP
    client, adding health checks, exponential-backoff retries, and
    chunk-level resume so a flaky embedding service cannot abort a long run.
    """

    def __init__(self):
        # Synchronous Qdrant client used for all vector-store operations.
        self.qdrant = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)
        # httpx.AsyncClient, created on first use (see _get_client).
        self.http_client = None
        # Embeddings generated since the last health probe; drives the
        # periodic check in _generate_single_embedding.
        self.embeddings_since_health_check = 0
        self._ensure_collection()

    def _ensure_collection(self):
        """Create the legal corpus collection if it doesn't exist."""
        collections = self.qdrant.get_collections().collections
        collection_names = [c.name for c in collections]

        if LEGAL_CORPUS_COLLECTION not in collection_names:
            logger.info(f"Creating collection: {LEGAL_CORPUS_COLLECTION}")
            self.qdrant.create_collection(
                collection_name=LEGAL_CORPUS_COLLECTION,
                vectors_config=VectorParams(
                    size=VECTOR_SIZE,
                    distance=Distance.COSINE,
                ),
            )

    async def _get_client(self) -> httpx.AsyncClient:
        """Get or create the shared HTTP client (120 s default timeout)."""
        if self.http_client is None:
            self.http_client = httpx.AsyncClient(timeout=120.0)
        return self.http_client

    async def _check_embedding_service_health(self) -> bool:
        """Return True iff GET /health on the embedding service answers 200."""
        try:
            client = await self._get_client()
            response = await client.get(f"{EMBEDDING_SERVICE_URL}/health", timeout=10.0)
            return response.status_code == 200
        except Exception as e:
            logger.warning(f"Health check failed: {e}")
            return False

    async def _wait_for_healthy_service(self, max_wait: int = 60) -> bool:
        """Poll the embedding service every 5 s until healthy or timed out.

        Args:
            max_wait: Maximum time to wait, in seconds.

        Returns:
            True if the service became healthy within max_wait seconds.
        """
        logger.info("Waiting for embedding service to become healthy...")
        start = datetime.now()
        # FIX: use total_seconds() — timedelta.seconds is only the
        # seconds-within-a-day component, not the total elapsed time.
        while (datetime.now() - start).total_seconds() < max_wait:
            if await self._check_embedding_service_health():
                logger.info("Embedding service is healthy")
                return True
            await asyncio.sleep(5)
        logger.error("Embedding service did not become healthy")
        return False

    async def _generate_single_embedding(self, text: str) -> Optional[List[float]]:
        """Generate an embedding for a single text with robust retry.

        Performs a health probe every HEALTH_CHECK_INTERVAL calls, retries
        up to MAX_RETRIES times with exponential backoff, and recreates the
        HTTP client after connection-level failures.

        Returns:
            The embedding vector, or None if every attempt failed.
        """
        for attempt in range(MAX_RETRIES):
            try:
                # Health check periodically (every HEALTH_CHECK_INTERVAL embeddings).
                self.embeddings_since_health_check += 1
                if self.embeddings_since_health_check >= HEALTH_CHECK_INTERVAL:
                    if not await self._check_embedding_service_health():
                        await self._wait_for_healthy_service()
                    self.embeddings_since_health_check = 0

                client = await self._get_client()
                response = await client.post(
                    f"{EMBEDDING_SERVICE_URL}/embed",
                    json={"texts": [text]},
                    timeout=60.0,
                )
                response.raise_for_status()
                data = response.json()
                return data["embeddings"][0]

            except Exception as e:
                # Exponential backoff: INITIAL_DELAY * 2**attempt seconds.
                delay = INITIAL_DELAY * (2 ** attempt)
                logger.warning(f"Embedding attempt {attempt + 1}/{MAX_RETRIES} failed: {e}")
                logger.info(f"Waiting {delay}s before retry...")

                # Close and recreate client on connection errors, then wait
                # for the service itself to come back before retrying.
                if "disconnect" in str(e).lower() or "connection" in str(e).lower():
                    if self.http_client:
                        await self.http_client.aclose()
                        self.http_client = None
                    await asyncio.sleep(delay)
                    if not await self._wait_for_healthy_service():
                        continue
                else:
                    await asyncio.sleep(delay)

        logger.error(f"Failed to generate embedding after {MAX_RETRIES} attempts")
        return None

    def _chunk_text_semantic(self, text: str) -> List[Tuple[str, int]]:
        """Chunk text semantically, respecting German sentence boundaries.

        Splits on sentence-ending punctuation followed by an uppercase
        (incl. umlaut) letter, then packs sentences into chunks of at most
        ~CHUNK_SIZE characters with up to CHUNK_OVERLAP characters of
        trailing sentences repeated at the start of the next chunk.

        Returns:
            List of (chunk_text, approximate_start_position) tuples.
        """
        sentence_endings = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ])')
        sentences = sentence_endings.split(text)

        chunks = []
        current_chunk = []
        current_length = 0
        chunk_start = 0
        position = 0

        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue

            sentence_length = len(sentence)

            if current_length + sentence_length > CHUNK_SIZE and current_chunk:
                chunk_text = " ".join(current_chunk)
                chunks.append((chunk_text, chunk_start))

                # Keep trailing sentences (up to CHUNK_OVERLAP chars) so
                # consecutive chunks share context.
                overlap_sentences = []
                overlap_length = 0
                for s in reversed(current_chunk):
                    if overlap_length + len(s) > CHUNK_OVERLAP:
                        break
                    overlap_sentences.insert(0, s)
                    overlap_length += len(s)

                current_chunk = overlap_sentences
                current_length = overlap_length
                chunk_start = position - overlap_length

            current_chunk.append(sentence)
            current_length += sentence_length
            position += sentence_length + 1  # +1 for the joining space

        if current_chunk:
            chunk_text = " ".join(current_chunk)
            chunks.append((chunk_text, chunk_start))

        return chunks

    def _extract_article_info(self, text: str) -> Optional[Dict]:
        """Extract article number and paragraph from text.

        Matches German citation forms ("Artikel 5" / "Art. 5" and
        "Absatz 2" / "Abs. 2").

        Returns:
            {"article": str, "paragraph": str | None} or None when no
            article reference is found.
        """
        article_match = re.search(r'(?:Artikel|Art\.?)\s+(\d+)', text)
        paragraph_match = re.search(r'(?:Absatz|Abs\.?)\s+(\d+)', text)

        if article_match:
            return {
                "article": article_match.group(1),
                "paragraph": paragraph_match.group(1) if paragraph_match else None,
            }
        return None

    async def _fetch_document_text(self, regulation: Regulation) -> Optional[str]:
        """Fetch document text from the regulation's source URL.

        Strips <script>/<style> blocks and all remaining tags, then
        collapses whitespace. NOTE(review): the BSI URLs point at PDFs,
        which this HTML-oriented stripping will not parse meaningfully —
        verify those sources or add a PDF extractor.

        Returns:
            Plain text, or None on any fetch error.
        """
        logger.info(f"Fetching {regulation.code} from: {regulation.source_url}")
        try:
            client = await self._get_client()
            response = await client.get(
                regulation.source_url,
                follow_redirects=True,
                headers={"Accept": "text/html,application/xhtml+xml"},
                timeout=60.0,
            )
            response.raise_for_status()

            html_content = response.text
            html_content = re.sub(r'<script[^>]*>.*?</script>', '', html_content, flags=re.DOTALL)
            html_content = re.sub(r'<style[^>]*>.*?</style>', '', html_content, flags=re.DOTALL)
            text = re.sub(r'<[^>]+>', ' ', html_content)
            text = re.sub(r'\s+', ' ', text).strip()

            return text
        except Exception as e:
            logger.error(f"Failed to fetch {regulation.code}: {e}")
            return None

    def get_existing_chunk_count(self, regulation_code: str) -> int:
        """Get count of existing chunks for a regulation (0 on any error)."""
        try:
            result = self.qdrant.count(
                collection_name=LEGAL_CORPUS_COLLECTION,
                count_filter=Filter(
                    must=[
                        FieldCondition(
                            key="regulation_code",
                            match=MatchValue(value=regulation_code),
                        )
                    ]
                ),
            )
            return result.count
        # FIX: was a bare `except:`, which also swallowed KeyboardInterrupt
        # and SystemExit; best-effort fallback to 0 is kept.
        except Exception:
            return 0

    async def ingest_regulation_robust(self, regulation: Regulation, resume: bool = True) -> int:
        """
        Ingest a regulation with robust error handling.

        Fetches the document, chunks it, embeds one chunk at a time, and
        upserts each point individually so progress survives crashes.

        Args:
            regulation: The regulation to ingest
            resume: If True, skip already indexed chunks

        Returns:
            Number of chunks indexed (existing + newly added)
        """
        logger.info(f"=== Starting robust ingestion for {regulation.code} ===")

        # Check existing chunks (resume point when `resume` is True).
        existing_count = self.get_existing_chunk_count(regulation.code)
        logger.info(f"Existing chunks for {regulation.code}: {existing_count}")

        # Fetch document; <100 chars is treated as a failed/empty fetch.
        text = await self._fetch_document_text(regulation)
        if not text or len(text) < 100:
            logger.warning(f"No text found for {regulation.code}")
            return 0

        # Chunk the text
        chunks = self._chunk_text_semantic(text)
        total_chunks = len(chunks)
        logger.info(f"Total chunks to process: {total_chunks}")

        if resume and existing_count >= total_chunks:
            logger.info(f"{regulation.code} already fully indexed")
            return existing_count

        # Determine starting point (assumes chunks 0..existing_count-1 were
        # indexed in order on a previous run).
        start_idx = existing_count if resume else 0
        logger.info(f"Starting from chunk {start_idx}")

        indexed = 0
        for idx, (chunk_text, position) in enumerate(chunks[start_idx:], start=start_idx):
            # Progress logging every 10 chunks.
            if idx % 10 == 0:
                logger.info(f"Progress: {idx}/{total_chunks} chunks ({idx*100//total_chunks}%)")

            # Generate embedding; a None result means MAX_RETRIES exhausted,
            # so stop and let a later resume pick up from here.
            embedding = await self._generate_single_embedding(chunk_text)
            if embedding is None:
                logger.error(f"Failed to embed chunk {idx}, stopping")
                break

            # Deterministic point id so re-ingesting overwrites rather than
            # duplicates. NOTE(review): relies on Qdrant accepting a 32-hex
            # md5 digest as a UUID-style id — confirm against the client
            # version in use.
            point_id = hashlib.md5(f"{regulation.code}-{idx}".encode()).hexdigest()
            article_info = self._extract_article_info(chunk_text)

            point = PointStruct(
                id=point_id,
                vector=embedding,
                payload={
                    "text": chunk_text,
                    "regulation_code": regulation.code,
                    "regulation_name": regulation.name,
                    "regulation_full_name": regulation.full_name,
                    "regulation_type": regulation.regulation_type,
                    "source_url": regulation.source_url,
                    "chunk_index": idx,
                    "chunk_position": position,
                    "article": article_info.get("article") if article_info else None,
                    "paragraph": article_info.get("paragraph") if article_info else None,
                    "language": regulation.language,
                    # NOTE(review): utcnow() is naive and deprecated in
                    # Python 3.12 — consider datetime.now(timezone.utc),
                    # but that changes the stored timestamp format.
                    "indexed_at": datetime.utcnow().isoformat(),
                    "training_allowed": False,
                },
            )

            # Upsert single point so partial progress is durable.
            self.qdrant.upsert(
                collection_name=LEGAL_CORPUS_COLLECTION,
                points=[point],
            )
            indexed += 1

            # Delay between embeddings to avoid overloading the service.
            await asyncio.sleep(DELAY_BETWEEN_EMBEDDINGS)

        logger.info(f"=== Completed {regulation.code}: {indexed} new chunks indexed ===")
        return existing_count + indexed

    def get_status(self) -> Dict:
        """Get ingestion status for all robust regulations.

        Returns:
            {"collection": ..., "regulations": {code: {name, chunks, status}}}.
            NOTE: "complete" only means "has at least one chunk" — a
            partially ingested regulation is also reported as complete.
        """
        status = {
            "collection": LEGAL_CORPUS_COLLECTION,
            "regulations": {},
        }

        for reg in ROBUST_REGULATIONS:
            count = self.get_existing_chunk_count(reg.code)
            status["regulations"][reg.code] = {
                "name": reg.name,
                "chunks": count,
                "status": "complete" if count > 0 else "missing",
            }

        return status

    async def close(self):
        """Close the HTTP client (the Qdrant client needs no explicit close)."""
        if self.http_client:
            await self.http_client.aclose()
|
|
|
|
|
|
async def main():
    """CLI entry point: parse arguments and dispatch to the ingestion class."""
    import argparse

    parser = argparse.ArgumentParser(description="Robust Legal Corpus Ingestion")
    parser.add_argument("--ingest", nargs="+", metavar="CODE", help="Ingest specific regulations")
    parser.add_argument("--ingest-all-missing", action="store_true", help="Ingest all missing regulations")
    parser.add_argument("--status", action="store_true", help="Show status")
    parser.add_argument("--no-resume", action="store_true", help="Don't resume from existing chunks")
    args = parser.parse_args()

    ingestion = RobustLegalCorpusIngestion()
    resume = not args.no_resume

    try:
        if args.status:
            # Dump the per-regulation chunk counts as pretty JSON.
            print(json.dumps(ingestion.get_status(), indent=2))

        elif args.ingest_all_missing:
            # Only regulations with zero indexed chunks are considered "missing".
            print("Ingesting all missing regulations...")
            for reg in ROBUST_REGULATIONS:
                if ingestion.get_existing_chunk_count(reg.code) == 0:
                    total = await ingestion.ingest_regulation_robust(reg, resume=resume)
                    print(f"{reg.code}: {total} chunks")

        elif args.ingest:
            # Resolve requested codes against the known regulation list.
            by_code = {reg.code: reg for reg in ROBUST_REGULATIONS}
            for code in args.ingest:
                reg = by_code.get(code)
                if reg is None:
                    print(f"Unknown regulation: {code}")
                    continue
                total = await ingestion.ingest_regulation_robust(reg, resume=resume)
                print(f"{code}: {total} chunks")

        else:
            parser.print_help()

    finally:
        # Always release the HTTP client, even on error.
        await ingestion.close()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(main())
|