klausur-service (7 monoliths): - grid_editor_helpers.py (1,737 → 5 files: columns, filters, headers, zones) - cv_cell_grid.py (1,675 → 7 files: build, legacy, streaming, merge, vocab) - worksheet_editor_api.py (1,305 → 4 files: models, AI, reconstruct, routes) - legal_corpus_ingestion.py (1,280 → 3 files: registry, chunking, ingestion) - cv_review.py (1,248 → 4 files: pipeline, spell, LLM, barrel) - cv_preprocessing.py (1,166 → 3 files: deskew, dewarp, barrel) - rbac.py, admin_api.py, routes/eh.py remain (next batch) backend-lehrer (1 monolith): - classroom_engine/repository.py (1,705 → 7 files by domain) All re-export barrels preserve backward compatibility. Zero import errors verified. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
501 lines
19 KiB
Python
501 lines
19 KiB
Python
"""
|
|
Legal Corpus Ingestion for UCCA RAG Integration.
|
|
|
|
Indexes all regulations from the Compliance Hub into Qdrant for
|
|
semantic search during UCCA assessments and explanations.
|
|
Includes EU regulations, DACH national laws, and EDPB guidelines.
|
|
|
|
Collections:
|
|
- bp_legal_corpus: All regulation texts (GDPR, AI Act, CRA, BSI, etc.)
|
|
|
|
Split modules:
|
|
- legal_corpus_registry: Regulation dataclass + REGULATIONS list (pure data)
|
|
- legal_corpus_chunking: Sentence/paragraph splitting, semantic chunking, HTML-to-text
|
|
|
|
Usage:
|
|
python legal_corpus_ingestion.py --ingest-all
|
|
python legal_corpus_ingestion.py --ingest GDPR AIACT
|
|
python legal_corpus_ingestion.py --status
|
|
"""
|
|
|
|
import asyncio
import hashlib
import json
import logging
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urlparse

import httpx
from qdrant_client import QdrantClient
from qdrant_client.models import (
    Distance,
    FieldCondition,
    Filter,
    MatchValue,
    PointStruct,
    VectorParams,
)

# Re-export for backward compatibility
from legal_corpus_registry import Regulation, REGULATIONS  # noqa: F401
from legal_corpus_chunking import (  # noqa: F401
    chunk_text_semantic,
    extract_article_info,
    html_to_text,
    split_into_sentences,
    split_into_paragraphs,
    GERMAN_ABBREVIATIONS,
)
|
|
|
|
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration - Support both QDRANT_URL and QDRANT_HOST/PORT.
# A full QDRANT_URL (e.g. "http://qdrant:6333") takes precedence; otherwise
# fall back to the individual QDRANT_HOST / QDRANT_PORT variables.
_qdrant_url = os.getenv("QDRANT_URL", "")
if _qdrant_url:
    _parsed = urlparse(_qdrant_url)
    QDRANT_HOST = _parsed.hostname or "localhost"
    QDRANT_PORT = _parsed.port or 6333
else:
    QDRANT_HOST = os.getenv("QDRANT_HOST", "localhost")
    QDRANT_PORT = int(os.getenv("QDRANT_PORT", "6333"))
EMBEDDING_SERVICE_URL = os.getenv("EMBEDDING_SERVICE_URL", "http://localhost:8087")
LEGAL_CORPUS_COLLECTION = "bp_legal_corpus"  # Qdrant collection name
VECTOR_SIZE = 1024  # BGE-M3 dimension

# Chunking configuration - matched to NIBIS settings for semantic chunking.
# Both values are forwarded to chunk_text_semantic() (see legal_corpus_chunking).
CHUNK_SIZE = int(os.getenv("LEGAL_CHUNK_SIZE", "1000"))
CHUNK_OVERLAP = int(os.getenv("LEGAL_CHUNK_OVERLAP", "200"))

# Base path for local PDF/HTML files.
# Default is ../docs/legal_corpus relative to this file, overridable via
# LEGAL_DOCS_PATH; the container path /app/docs/legal_corpus wins if present.
_default_docs_path = Path(__file__).parent.parent / "docs" / "legal_corpus"
LEGAL_DOCS_PATH = Path(os.getenv("LEGAL_DOCS_PATH", str(_default_docs_path)))
if Path("/app/docs/legal_corpus").exists():
    LEGAL_DOCS_PATH = Path("/app/docs/legal_corpus")
|
|
|
|
|
|
class LegalCorpusIngestion:
    """Handles ingestion of legal documents into Qdrant.

    Responsibilities:
    - ensure the ``bp_legal_corpus`` collection exists,
    - fetch regulation texts (local .txt/.pdf, EUR-Lex CELEX, or source URL),
    - chunk, embed (via the external embedding service) and upsert chunks,
    - report indexing status and answer semantic-search queries.
    """

    def __init__(self):
        # Synchronous Qdrant client; async HTTP client shared by the
        # embedding-service calls and document downloads.
        self.qdrant = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)
        self.http_client = httpx.AsyncClient(timeout=60.0)
        self._ensure_collection()

    def _ensure_collection(self):
        """Create the legal corpus collection if it doesn't exist."""
        collections = self.qdrant.get_collections().collections
        collection_names = [c.name for c in collections]

        if LEGAL_CORPUS_COLLECTION not in collection_names:
            logger.info(f"Creating collection: {LEGAL_CORPUS_COLLECTION}")
            self.qdrant.create_collection(
                collection_name=LEGAL_CORPUS_COLLECTION,
                vectors_config=VectorParams(
                    size=VECTOR_SIZE,
                    distance=Distance.COSINE,
                ),
            )
            logger.info(f"Collection {LEGAL_CORPUS_COLLECTION} created")
        else:
            logger.info(f"Collection {LEGAL_CORPUS_COLLECTION} already exists")

    async def _generate_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings via the embedding service.

        Raises:
            Exception: re-raises any HTTP or parse error after logging it,
                so callers (see ingest_regulation) can apply their own retry
                policy.
        """
        try:
            response = await self.http_client.post(
                f"{EMBEDDING_SERVICE_URL}/embed",
                json={"texts": texts},
                timeout=120.0,
            )
            response.raise_for_status()
            data = response.json()
            return data["embeddings"]
        except Exception as e:
            logger.error(f"Embedding generation failed: {e}")
            raise

    # Delegate chunking/text methods to legal_corpus_chunking module.
    # Kept as instance members for backward compatibility with callers that
    # used them before the monolith was split.
    GERMAN_ABBREVIATIONS = GERMAN_ABBREVIATIONS

    def _split_into_sentences(self, text: str) -> List[str]:
        return split_into_sentences(text)

    def _split_into_paragraphs(self, text: str) -> List[str]:
        return split_into_paragraphs(text)

    def _chunk_text_semantic(self, text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP):
        return chunk_text_semantic(text, chunk_size, overlap)

    def _extract_article_info(self, text: str):
        return extract_article_info(text)

    def _html_to_text(self, html_content: str) -> str:
        return html_to_text(html_content)

    async def _fetch_document_text(self, regulation: Regulation) -> Optional[str]:
        """
        Fetch document text from local file or URL.

        Priority:
        1. Local file in docs/legal_corpus/ (.txt or .pdf)
        2. EUR-Lex via CELEX URL (for EU regulations/directives)
        3. Fallback to original source URL

        Returns the plain text, or None when every source fails.
        """
        # Check for local file first
        local_file = LEGAL_DOCS_PATH / f"{regulation.code}.txt"
        if local_file.exists():
            logger.info(f"Loading {regulation.code} from local file: {local_file}")
            return local_file.read_text(encoding="utf-8")

        local_pdf = LEGAL_DOCS_PATH / f"{regulation.code}.pdf"
        if local_pdf.exists():
            logger.info(f"Extracting text from PDF: {local_pdf}")
            try:
                # FIX: use a context manager so the PDF handle is always
                # closed (the previous bare open() leaked the file object).
                with open(local_pdf, "rb") as pdf_file:
                    response = await self.http_client.post(
                        f"{EMBEDDING_SERVICE_URL}/extract-pdf",
                        files={"file": pdf_file},
                        timeout=120.0,
                    )
                response.raise_for_status()
                data = response.json()
                return data.get("text", "")
            except Exception as e:
                # Fall through to the remote sources below on failure.
                logger.error(f"PDF extraction failed for {regulation.code}: {e}")

        # Try EUR-Lex CELEX URL if available
        if regulation.celex:
            celex_url = f"https://eur-lex.europa.eu/legal-content/DE/TXT/HTML/?uri=CELEX:{regulation.celex}"
            logger.info(f"Fetching {regulation.code} from EUR-Lex CELEX: {celex_url}")
            try:
                response = await self.http_client.get(
                    celex_url,
                    follow_redirects=True,
                    headers={
                        "Accept": "text/html,application/xhtml+xml",
                        "Accept-Language": "de-DE,de;q=0.9",
                        "User-Agent": "Mozilla/5.0 (compatible; LegalCorpusIndexer/1.0)",
                    },
                    timeout=120.0,
                )
                response.raise_for_status()

                html_content = response.text

                # EUR-Lex sometimes serves a CAPTCHA page; a genuine document
                # is large, so a short body also signals a bad response.
                if "verify that you're not a robot" not in html_content and len(html_content) > 10000:
                    text = self._html_to_text(html_content)
                    if text and len(text) > 1000:
                        logger.info(f"Successfully fetched {regulation.code} via CELEX ({len(text)} chars)")
                        return text
                    else:
                        logger.warning(f"CELEX response too short for {regulation.code}, trying fallback")
                else:
                    logger.warning(f"CELEX returned CAPTCHA for {regulation.code}, trying fallback")
            except Exception as e:
                logger.warning(f"CELEX fetch failed for {regulation.code}: {e}, trying fallback")

        # Fallback to original source URL
        logger.info(f"Fetching {regulation.code} from: {regulation.source_url}")
        try:
            parsed_url = urlparse(regulation.source_url)
            is_pdf_url = parsed_url.path.lower().endswith('.pdf')
            if is_pdf_url:
                # Download the PDF bytes, then hand them to the embedding
                # service's /extract-pdf endpoint for text extraction.
                logger.info(f"Downloading PDF from URL for {regulation.code}")
                response = await self.http_client.get(
                    regulation.source_url,
                    follow_redirects=True,
                    headers={
                        "Accept": "application/pdf",
                        "User-Agent": "Mozilla/5.0 (compatible; LegalCorpusIndexer/1.0)",
                    },
                    timeout=180.0,
                )
                response.raise_for_status()

                pdf_content = response.content
                extract_response = await self.http_client.post(
                    f"{EMBEDDING_SERVICE_URL}/extract-pdf",
                    files={"file": ("document.pdf", pdf_content, "application/pdf")},
                    timeout=180.0,
                )
                extract_response.raise_for_status()
                data = extract_response.json()
                text = data.get("text", "")
                if text:
                    logger.info(f"Successfully extracted PDF text for {regulation.code} ({len(text)} chars)")
                    return text
                else:
                    logger.warning(f"PDF extraction returned empty text for {regulation.code}")
                    return None
            else:
                response = await self.http_client.get(
                    regulation.source_url,
                    follow_redirects=True,
                    headers={
                        "Accept": "text/html,application/xhtml+xml",
                        "Accept-Language": "de-DE,de;q=0.9",
                        "User-Agent": "Mozilla/5.0 (compatible; LegalCorpusIndexer/1.0)",
                    },
                    timeout=120.0,
                )
                response.raise_for_status()

                text = self._html_to_text(response.text)
                return text
        except Exception as e:
            logger.error(f"Failed to fetch {regulation.code}: {e}")
            return None

    async def ingest_regulation(self, regulation: Regulation) -> int:
        """Ingest a single regulation into Qdrant. Returns number of chunks indexed."""
        logger.info(f"Ingesting {regulation.code}: {regulation.name}")

        text = await self._fetch_document_text(regulation)
        if not text or len(text) < 100:
            logger.warning(f"No text found for {regulation.code}, skipping")
            return 0

        chunks = self._chunk_text_semantic(text)
        logger.info(f"Created {len(chunks)} chunks for {regulation.code}")

        if not chunks:
            return 0

        # Small batches + sleeps keep load on the embedding service modest.
        batch_size = 4
        all_points = []
        max_retries = 3

        for i in range(0, len(chunks), batch_size):
            batch_chunks = chunks[i:i + batch_size]
            chunk_texts = [c[0] for c in batch_chunks]

            embeddings = None
            for retry in range(max_retries):
                try:
                    embeddings = await self._generate_embeddings(chunk_texts)
                    break
                except Exception as e:
                    logger.warning(f"Embedding attempt {retry+1}/{max_retries} failed for batch {i//batch_size}: {e}")
                    if retry < max_retries - 1:
                        # Linear backoff: 3s, 6s, ...
                        await asyncio.sleep(3 * (retry + 1))
                    else:
                        logger.error(f"Embedding failed permanently for batch {i//batch_size}")

            if embeddings is None:
                # Skip this batch; remaining batches may still succeed.
                continue

            await asyncio.sleep(1.5)

            for j, ((chunk_text, position), embedding) in enumerate(zip(batch_chunks, embeddings)):
                chunk_idx = i + j
                # Deterministic point id so re-ingesting upserts instead of
                # duplicating (md5 used as a stable fingerprint, not security).
                point_id = hashlib.md5(f"{regulation.code}-{chunk_idx}".encode()).hexdigest()

                article_info = self._extract_article_info(chunk_text)

                point = PointStruct(
                    id=point_id,
                    vector=embedding,
                    payload={
                        "text": chunk_text,
                        "regulation_code": regulation.code,
                        "regulation_name": regulation.name,
                        "regulation_full_name": regulation.full_name,
                        "regulation_type": regulation.regulation_type,
                        "source_url": regulation.source_url,
                        "chunk_index": chunk_idx,
                        "chunk_position": position,
                        "article": article_info.get("article") if article_info else None,
                        "paragraph": article_info.get("paragraph") if article_info else None,
                        "language": regulation.language,
                        # FIX: timezone-aware timestamp; datetime.utcnow() is
                        # deprecated and produced a naive ISO string.
                        "indexed_at": datetime.now(timezone.utc).isoformat(),
                        "training_allowed": False,
                    },
                )
                all_points.append(point)

        if all_points:
            self.qdrant.upsert(
                collection_name=LEGAL_CORPUS_COLLECTION,
                points=all_points,
            )
            logger.info(f"Indexed {len(all_points)} chunks for {regulation.code}")

        return len(all_points)

    async def ingest_all(self) -> Dict[str, int]:
        """Ingest all regulations. Returns a {code: chunk_count} mapping."""
        results = {}
        total = 0

        for regulation in REGULATIONS:
            try:
                count = await self.ingest_regulation(regulation)
                results[regulation.code] = count
                total += count
            except Exception as e:
                # One failing regulation must not abort the whole run.
                logger.error(f"Failed to ingest {regulation.code}: {e}")
                results[regulation.code] = 0

        logger.info(f"Ingestion complete: {total} total chunks indexed")
        return results

    async def ingest_selected(self, codes: List[str]) -> Dict[str, int]:
        """Ingest selected regulations by code. Unknown codes map to 0."""
        results = {}

        for code in codes:
            regulation = next((r for r in REGULATIONS if r.code == code), None)
            if not regulation:
                logger.warning(f"Unknown regulation code: {code}")
                results[code] = 0
                continue

            try:
                count = await self.ingest_regulation(regulation)
                results[code] = count
            except Exception as e:
                logger.error(f"Failed to ingest {code}: {e}")
                results[code] = 0

        return results

    def get_status(self) -> Dict:
        """Get collection status and per-regulation indexed chunk counts."""
        try:
            collection_info = self.qdrant.get_collection(LEGAL_CORPUS_COLLECTION)

            regulation_counts = {}
            for reg in REGULATIONS:
                result = self.qdrant.count(
                    collection_name=LEGAL_CORPUS_COLLECTION,
                    count_filter=Filter(
                        must=[
                            FieldCondition(
                                key="regulation_code",
                                match=MatchValue(value=reg.code),
                            )
                        ]
                    ),
                )
                regulation_counts[reg.code] = result.count

            return {
                "collection": LEGAL_CORPUS_COLLECTION,
                "total_points": collection_info.points_count,
                "vector_size": VECTOR_SIZE,
                "regulations": regulation_counts,
                "status": "ready" if collection_info.points_count > 0 else "empty",
            }
        except Exception as e:
            # Surface the failure as data instead of raising; the CLI prints it.
            return {
                "collection": LEGAL_CORPUS_COLLECTION,
                "error": str(e),
                "status": "error",
            }

    async def search(
        self,
        query: str,
        regulation_codes: Optional[List[str]] = None,
        top_k: int = 5,
    ) -> List[Dict]:
        """Search the legal corpus for relevant passages.

        Args:
            query: Natural-language query, embedded via the embedding service.
            regulation_codes: Optional allow-list; matches any listed code.
            top_k: Maximum number of hits to return.
        """
        embeddings = await self._generate_embeddings([query])
        query_vector = embeddings[0]

        search_filter = None
        if regulation_codes:
            # "should" = OR across the requested codes.
            search_filter = Filter(
                should=[
                    FieldCondition(
                        key="regulation_code",
                        match=MatchValue(value=code),
                    )
                    for code in regulation_codes
                ]
            )

        results = self.qdrant.search(
            collection_name=LEGAL_CORPUS_COLLECTION,
            query_vector=query_vector,
            query_filter=search_filter,
            limit=top_k,
        )

        return [
            {
                "text": hit.payload.get("text"),
                "regulation_code": hit.payload.get("regulation_code"),
                "regulation_name": hit.payload.get("regulation_name"),
                "article": hit.payload.get("article"),
                "paragraph": hit.payload.get("paragraph"),
                "source_url": hit.payload.get("source_url"),
                "score": hit.score,
            }
            for hit in results
        ]

    async def close(self):
        """Close HTTP client."""
        await self.http_client.aclose()
|
|
|
|
|
|
async def main():
    """CLI entry point: parse flags, dispatch to the ingestion service."""
    import argparse

    arg_parser = argparse.ArgumentParser(description="Legal Corpus Ingestion for UCCA")
    arg_parser.add_argument("--ingest-all", action="store_true", help="Ingest all regulations")
    arg_parser.add_argument("--ingest", nargs="+", metavar="CODE", help="Ingest specific regulations by code")
    arg_parser.add_argument("--status", action="store_true", help="Show collection status")
    arg_parser.add_argument("--search", type=str, help="Test search query")
    opts = arg_parser.parse_args()

    service = LegalCorpusIngestion()

    try:
        if opts.status:
            print(json.dumps(service.get_status(), indent=2))

        elif opts.ingest_all:
            print(f"Ingesting all {len(REGULATIONS)} regulations...")
            outcome = await service.ingest_all()
            print("\nResults:")
            for code, count in outcome.items():
                print(f"  {code}: {count} chunks")
            print(f"\nTotal: {sum(outcome.values())} chunks")

        elif opts.ingest:
            print(f"Ingesting: {', '.join(opts.ingest)}")
            outcome = await service.ingest_selected(opts.ingest)
            print("\nResults:")
            for code, count in outcome.items():
                print(f"  {code}: {count} chunks")

        elif opts.search:
            print(f"Searching: {opts.search}")
            hits = await service.search(opts.search)
            print(f"\nFound {len(hits)} results:")
            for rank, hit in enumerate(hits, 1):
                print(f"\n{rank}. [{hit['regulation_code']}] Score: {hit['score']:.3f}")
                if hit.get('article'):
                    print(f"  Art. {hit['article']}" + (f" Abs. {hit['paragraph']}" if hit.get('paragraph') else ""))
                print(f"  {hit['text'][:200]}...")

        else:
            arg_parser.print_help()

    finally:
        # Always release the shared httpx client, even on error paths.
        await service.close()
|
|
|
|
|
|
# Run the async CLI entry point when executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
|