Files
breakpilot-lehrer/klausur-service/backend/legal_templates_ingestion.py
Benjamin Admin b6983ab1dc [split-required] Split 500-1000 LOC files across all services
backend-lehrer (5 files):
- alerts_agent/db/repository.py (992 → 5), abitur_docs_api.py (956 → 3)
- teacher_dashboard_api.py (951 → 3), services/pdf_service.py (916 → 3)
- mail/mail_db.py (987 → 6)

klausur-service (5 files):
- legal_templates_ingestion.py (942 → 3), ocr_pipeline_postprocess.py (929 → 4)
- ocr_pipeline_words.py (876 → 3), ocr_pipeline_ocr_merge.py (616 → 2)
- KorrekturPage.tsx (956 → 6)

website (5 pages):
- mail (985 → 9), edu-search (958 → 8), mac-mini (950 → 7)
- ocr-labeling (946 → 7), audit-workspace (871 → 4)

studio-v2 (5 files + 1 deleted):
- page.tsx (946 → 5), MessagesContext.tsx (925 → 4)
- korrektur (914 → 6), worksheet-cleanup (899 → 6)
- useVocabWorksheet.ts (888 → 3)
- Deleted dead page-original.tsx (934 LOC)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 23:35:37 +02:00

501 lines
18 KiB
Python

"""
Legal Templates Ingestion Pipeline for RAG.
Indexes legal template documents from various open-source repositories
into Qdrant for semantic search. Supports multiple license types with
proper attribution tracking.
Collection: bp_legal_templates
Usage:
python legal_templates_cli.py --ingest-all
python legal_templates_cli.py --ingest-source github-site-policy
python legal_templates_cli.py --status
python legal_templates_cli.py --search "Datenschutzerklaerung"
"""
import asyncio
import hashlib
import logging
import os
from datetime import datetime
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse
import httpx
from qdrant_client import QdrantClient
from qdrant_client.models import (
Distance,
FieldCondition,
Filter,
MatchValue,
PointStruct,
VectorParams,
)
from template_sources import (
LICENSES,
TEMPLATE_SOURCES,
TEMPLATE_TYPES,
LicenseType,
SourceConfig,
get_enabled_sources,
get_sources_by_priority,
)
from github_crawler import (
ExtractedDocument,
GitHubCrawler,
RepositoryDownloader,
)
# Re-export from chunking module for backward compatibility
from legal_templates_chunking import ( # noqa: F401
IngestionStatus,
TemplateChunk,
chunk_text,
create_chunks,
infer_clause_category,
infer_template_type,
split_sentences,
)
# Configure module-level logging (INFO by default)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Qdrant connection - a full QDRANT_URL takes precedence over QDRANT_HOST/QDRANT_PORT
_qdrant_url = os.getenv("QDRANT_URL", "")
if _qdrant_url:
    # Derive host/port from the URL, falling back to localhost:6333
    _parsed = urlparse(_qdrant_url)
    QDRANT_HOST = _parsed.hostname or "localhost"
    QDRANT_PORT = _parsed.port or 6333
else:
    QDRANT_HOST = os.getenv("QDRANT_HOST", "localhost")
    QDRANT_PORT = int(os.getenv("QDRANT_PORT", "6333"))
# Base URL of the external embedding service
EMBEDDING_SERVICE_URL = os.getenv("EMBEDDING_SERVICE_URL", "http://localhost:8087")
# Target Qdrant collection for legal template chunks
LEGAL_TEMPLATES_COLLECTION = "bp_legal_templates"
VECTOR_SIZE = 1024  # BGE-M3 dimension
# Chunking configuration (characters per chunk / overlap between consecutive chunks)
CHUNK_SIZE = int(os.getenv("TEMPLATE_CHUNK_SIZE", "1000"))
CHUNK_OVERLAP = int(os.getenv("TEMPLATE_CHUNK_OVERLAP", "200"))
# Batch processing: small embedding batches, retried with growing backoff
EMBEDDING_BATCH_SIZE = 4
MAX_RETRIES = 3
RETRY_DELAY = 3.0  # base delay in seconds; multiplied by the attempt number
class LegalTemplatesIngestion:
    """Ingests legal template documents into Qdrant for semantic search.

    Crawls the configured sources, chunks every extracted document,
    embeds the chunks via the external embedding service, and upserts
    them into the ``bp_legal_templates`` collection together with
    license / attribution metadata so search results can be filtered by
    usage rights.
    """

    def __init__(self):
        # Blocking Qdrant client; the embedding service is called via async httpx.
        self.qdrant = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)
        self.http_client = httpx.AsyncClient(timeout=120.0)
        self._ensure_collection()
        # Per-source ingestion progress, keyed by source name (in-memory only).
        self._ingestion_status: Dict[str, IngestionStatus] = {}

    def _ensure_collection(self) -> None:
        """Create the legal templates collection if it doesn't exist."""
        collections = self.qdrant.get_collections().collections
        collection_names = [c.name for c in collections]
        if LEGAL_TEMPLATES_COLLECTION not in collection_names:
            logger.info(f"Creating collection: {LEGAL_TEMPLATES_COLLECTION}")
            self.qdrant.create_collection(
                collection_name=LEGAL_TEMPLATES_COLLECTION,
                vectors_config=VectorParams(
                    size=VECTOR_SIZE,  # must match the embedding model's output dimension
                    distance=Distance.COSINE,
                ),
            )
            logger.info(f"Collection {LEGAL_TEMPLATES_COLLECTION} created")
        else:
            logger.info(f"Collection {LEGAL_TEMPLATES_COLLECTION} already exists")

    def _count_by_field(self, key: str, value: Any) -> int:
        """Return the number of points whose payload field *key* equals *value*."""
        result = self.qdrant.count(
            collection_name=LEGAL_TEMPLATES_COLLECTION,
            count_filter=Filter(
                must=[FieldCondition(key=key, match=MatchValue(value=value))]
            ),
        )
        return result.count

    async def _generate_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings for *texts* via the embedding service.

        Raises:
            Exception: re-raises any HTTP/parsing error after logging it;
                callers (see ingest_source) implement their own retry policy.
        """
        try:
            response = await self.http_client.post(
                f"{EMBEDDING_SERVICE_URL}/embed",
                json={"texts": texts},
                timeout=120.0,
            )
            response.raise_for_status()
            data = response.json()
            return data["embeddings"]
        except Exception as e:
            logger.error(f"Embedding generation failed: {e}")
            raise

    async def ingest_source(self, source: SourceConfig) -> IngestionStatus:
        """Ingest a single source into Qdrant.

        Crawls the source repository (if it has a repo URL), chunks every
        extracted document, embeds the chunks in small batches with retries,
        and upserts the resulting points. A batch that fails all embedding
        attempts is skipped and recorded in ``status.errors`` without
        aborting the whole source.

        Args:
            source: Source configuration (repo URL, license, metadata).

        Returns:
            The IngestionStatus for this run (also stored in
            ``self._ingestion_status``); its ``status`` field ends up
            "completed" or "failed".
        """
        status = IngestionStatus(
            source_name=source.name,
            status="running",
            started_at=datetime.utcnow(),
        )
        self._ingestion_status[source.name] = status
        logger.info(f"Ingesting source: {source.name}")
        try:
            # Crawl the source repository and collect extracted documents.
            documents: List[ExtractedDocument] = []
            if source.repo_url:
                async with GitHubCrawler() as crawler:
                    async for doc in crawler.crawl_repository(source):
                        documents.append(doc)
                        status.documents_found += 1
            logger.info(f"Found {len(documents)} documents in {source.name}")
            if not documents:
                # Nothing to index - still a successful (empty) run.
                status.status = "completed"
                status.completed_at = datetime.utcnow()
                return status
            # Chunk all documents up front so batching below is uniform.
            all_chunks: List[TemplateChunk] = []
            for doc in documents:
                chunks = create_chunks(doc, source, CHUNK_SIZE, CHUNK_OVERLAP)
                all_chunks.extend(chunks)
                status.chunks_created += len(chunks)
            logger.info(f"Created {len(all_chunks)} chunks from {source.name}")
            # Embed and index in small batches to keep request sizes bounded.
            for i in range(0, len(all_chunks), EMBEDDING_BATCH_SIZE):
                batch_chunks = all_chunks[i:i + EMBEDDING_BATCH_SIZE]
                chunk_texts = [c.text for c in batch_chunks]
                # Retry embeddings with a linearly growing backoff; if every
                # attempt fails, record the error and skip this batch.
                embeddings = None
                for retry in range(MAX_RETRIES):
                    try:
                        embeddings = await self._generate_embeddings(chunk_texts)
                        break
                    except Exception as e:
                        logger.warning(
                            f"Embedding attempt {retry+1}/{MAX_RETRIES} failed: {e}"
                        )
                        if retry < MAX_RETRIES - 1:
                            await asyncio.sleep(RETRY_DELAY * (retry + 1))
                        else:
                            status.errors.append(f"Embedding failed for batch {i}: {e}")
                if embeddings is None:
                    continue
                # Build Qdrant points. IDs are deterministic (MD5 of
                # source/file/chunk-index), so re-ingestion upserts in place
                # instead of duplicating points.
                points = []
                for chunk, embedding in zip(batch_chunks, embeddings):
                    point_id = hashlib.md5(
                        f"{source.name}-{chunk.source_file}-{chunk.chunk_index}".encode()
                    ).hexdigest()
                    payload = {
                        "text": chunk.text,
                        "chunk_index": chunk.chunk_index,
                        "document_title": chunk.document_title,
                        "template_type": chunk.template_type,
                        "clause_category": chunk.clause_category,
                        "language": chunk.language,
                        "jurisdiction": chunk.jurisdiction,
                        "license_id": chunk.license_id,
                        "license_name": chunk.license_name,
                        "license_url": chunk.license_url,
                        "attribution_required": chunk.attribution_required,
                        "share_alike": chunk.share_alike,
                        "no_derivatives": chunk.no_derivatives,
                        "commercial_use": chunk.commercial_use,
                        "source_name": chunk.source_name,
                        "source_url": chunk.source_url,
                        "source_repo": chunk.source_repo,
                        "source_commit": chunk.source_commit,
                        "source_file": chunk.source_file,
                        "source_hash": chunk.source_hash,
                        "attribution_text": chunk.attribution_text,
                        "copyright_notice": chunk.copyright_notice,
                        "is_complete_document": chunk.is_complete_document,
                        "is_modular": chunk.is_modular,
                        "requires_customization": chunk.requires_customization,
                        "placeholders": chunk.placeholders,
                        "training_allowed": chunk.training_allowed,
                        "output_allowed": chunk.output_allowed,
                        "modification_allowed": chunk.modification_allowed,
                        "distortion_prohibited": chunk.distortion_prohibited,
                        "indexed_at": datetime.utcnow().isoformat(),
                    }
                    points.append(PointStruct(
                        id=point_id,
                        vector=embedding,
                        payload=payload,
                    ))
                # Upsert the batch to Qdrant.
                if points:
                    self.qdrant.upsert(
                        collection_name=LEGAL_TEMPLATES_COLLECTION,
                        points=points,
                    )
                    status.chunks_indexed += len(points)
                # Rate limiting between batches against the embedding service.
                await asyncio.sleep(1.0)
            status.status = "completed"
            status.completed_at = datetime.utcnow()
            logger.info(
                f"Completed {source.name}: {status.chunks_indexed} chunks indexed"
            )
        except Exception as e:
            # Any unexpected failure marks the whole source run as failed.
            status.status = "failed"
            status.errors.append(str(e))
            status.completed_at = datetime.utcnow()
            logger.error(f"Failed to ingest {source.name}: {e}")
        return status

    async def ingest_all(self, max_priority: int = 5) -> Dict[str, IngestionStatus]:
        """Ingest all enabled sources up to a priority level.

        Sources are processed sequentially (one at a time) to limit load
        on the embedding service.
        """
        sources = get_sources_by_priority(max_priority)
        results = {}
        for source in sources:
            result = await self.ingest_source(source)
            results[source.name] = result
        return results

    async def ingest_by_license(self, license_type: LicenseType) -> Dict[str, IngestionStatus]:
        """Ingest all sources of a specific license type, sequentially."""
        from template_sources import get_sources_by_license
        sources = get_sources_by_license(license_type)
        results = {}
        for source in sources:
            result = await self.ingest_source(source)
            results[source.name] = result
        return results

    def get_status(self) -> Dict[str, Any]:
        """Get collection status and ingestion results.

        Returns:
            A dict with the total point count plus per-source, per-license,
            per-template-type and per-language breakdowns, or an error
            payload if the collection cannot be queried.
        """
        try:
            collection_info = self.qdrant.get_collection(LEGAL_TEMPLATES_COLLECTION)
            # Count points per source
            source_counts = {
                source.name: self._count_by_field("source_name", source.name)
                for source in TEMPLATE_SOURCES
            }
            # Count by license type
            license_counts = {
                license_type.value: self._count_by_field("license_id", license_type.value)
                for license_type in LicenseType
            }
            # Count by template type (only types that actually have points)
            template_type_counts = {}
            for template_type in TEMPLATE_TYPES.keys():
                count = self._count_by_field("template_type", template_type)
                if count > 0:
                    template_type_counts[template_type] = count
            # Count by language
            language_counts = {
                lang: self._count_by_field("language", lang)
                for lang in ["de", "en"]
            }
            return {
                "collection": LEGAL_TEMPLATES_COLLECTION,
                "total_points": collection_info.points_count,
                "vector_size": VECTOR_SIZE,
                "sources": source_counts,
                "licenses": license_counts,
                "template_types": template_type_counts,
                "languages": language_counts,
                "status": "ready" if collection_info.points_count > 0 else "empty",
                "ingestion_status": {
                    name: {
                        "status": s.status,
                        "documents_found": s.documents_found,
                        "chunks_indexed": s.chunks_indexed,
                        "errors": s.errors,
                    }
                    for name, s in self._ingestion_status.items()
                },
            }
        except Exception as e:
            return {
                "collection": LEGAL_TEMPLATES_COLLECTION,
                "error": str(e),
                "status": "error",
            }

    async def search(
        self,
        query: str,
        template_type: Optional[str] = None,
        license_types: Optional[List[str]] = None,
        language: Optional[str] = None,
        jurisdiction: Optional[str] = None,
        attribution_required: Optional[bool] = None,
        top_k: int = 10,
    ) -> List[Dict[str, Any]]:
        """Search the legal templates collection.

        Args:
            query: Free-text query; embedded via the embedding service.
            template_type: Exact-match filter on the template type.
            license_types: OR-filter - results must match at least one of
                these license IDs when provided.
            language: Exact-match filter on document language.
            jurisdiction: Exact-match filter on jurisdiction.
            attribution_required: Filter on the attribution flag.
            top_k: Maximum number of hits to return.

        Returns:
            A list of hit dicts (id, score, text and selected payload fields).
        """
        # Generate the query embedding.
        embeddings = await self._generate_embeddings([query])
        query_vector = embeddings[0]
        # Build AND (must) filter conditions from the exact-match params.
        must_conditions = []
        if template_type:
            must_conditions.append(
                FieldCondition(key="template_type", match=MatchValue(value=template_type))
            )
        if language:
            must_conditions.append(
                FieldCondition(key="language", match=MatchValue(value=language))
            )
        if jurisdiction:
            must_conditions.append(
                FieldCondition(key="jurisdiction", match=MatchValue(value=jurisdiction))
            )
        if attribution_required is not None:
            must_conditions.append(
                FieldCondition(key="attribution_required", match=MatchValue(value=attribution_required))
            )
        # License type filter (OR condition via Qdrant "should").
        should_conditions = []
        if license_types:
            for lt in license_types:
                should_conditions.append(
                    FieldCondition(key="license_id", match=MatchValue(value=lt))
                )
        # Combine into a single filter, or None when unfiltered.
        search_filter = None
        if must_conditions or should_conditions:
            filter_dict = {}
            if must_conditions:
                filter_dict["must"] = must_conditions
            if should_conditions:
                filter_dict["should"] = should_conditions
            search_filter = Filter(**filter_dict)
        # Execute the vector search.
        results = self.qdrant.search(
            collection_name=LEGAL_TEMPLATES_COLLECTION,
            query_vector=query_vector,
            query_filter=search_filter,
            limit=top_k,
        )
        return [
            {
                "id": hit.id,
                "score": hit.score,
                "text": hit.payload.get("text"),
                "document_title": hit.payload.get("document_title"),
                "template_type": hit.payload.get("template_type"),
                "clause_category": hit.payload.get("clause_category"),
                "language": hit.payload.get("language"),
                "jurisdiction": hit.payload.get("jurisdiction"),
                "license_id": hit.payload.get("license_id"),
                "license_name": hit.payload.get("license_name"),
                "attribution_required": hit.payload.get("attribution_required"),
                "attribution_text": hit.payload.get("attribution_text"),
                "source_name": hit.payload.get("source_name"),
                "source_url": hit.payload.get("source_url"),
                "placeholders": hit.payload.get("placeholders"),
                "is_complete_document": hit.payload.get("is_complete_document"),
                "requires_customization": hit.payload.get("requires_customization"),
                "output_allowed": hit.payload.get("output_allowed"),
                "modification_allowed": hit.payload.get("modification_allowed"),
            }
            for hit in results
        ]

    def delete_source(self, source_name: str) -> int:
        """Delete all chunks from a specific source.

        Returns:
            The number of matching points, counted just before deletion.
        """
        # Build the filter once; it is used for both the count and the delete.
        source_filter = Filter(
            must=[FieldCondition(key="source_name", match=MatchValue(value=source_name))]
        )
        count_result = self.qdrant.count(
            collection_name=LEGAL_TEMPLATES_COLLECTION,
            count_filter=source_filter,
        )
        self.qdrant.delete(
            collection_name=LEGAL_TEMPLATES_COLLECTION,
            points_selector=source_filter,
        )
        return count_result.count

    def reset_collection(self):
        """Delete and recreate the collection, clearing ingestion state."""
        logger.warning(f"Resetting collection: {LEGAL_TEMPLATES_COLLECTION}")
        try:
            self.qdrant.delete_collection(LEGAL_TEMPLATES_COLLECTION)
        except Exception:
            # Best-effort: the collection may not exist yet.
            pass
        self._ensure_collection()
        self._ingestion_status.clear()
        logger.info(f"Collection {LEGAL_TEMPLATES_COLLECTION} reset")

    async def close(self):
        """Close the async HTTP client."""
        await self.http_client.aclose()