[split-required] Split 500-1000 LOC files across all services

backend-lehrer (5 files):
- alerts_agent/db/repository.py (992 → 5), abitur_docs_api.py (956 → 3)
- teacher_dashboard_api.py (951 → 3), services/pdf_service.py (916 → 3)
- mail/mail_db.py (987 → 6)

klausur-service (5 files):
- legal_templates_ingestion.py (942 → 3), ocr_pipeline_postprocess.py (929 → 4)
- ocr_pipeline_words.py (876 → 3), ocr_pipeline_ocr_merge.py (616 → 2)
- KorrekturPage.tsx (956 → 6)

website (5 pages):
- mail (985 → 9), edu-search (958 → 8), mac-mini (950 → 7)
- ocr-labeling (946 → 7), audit-workspace (871 → 4)

studio-v2 (5 files + 1 deleted):
- page.tsx (946 → 5), MessagesContext.tsx (925 → 4)
- korrektur (914 → 6), worksheet-cleanup (899 → 6)
- useVocabWorksheet.ts (888 → 3)
- Deleted dead page-original.tsx (934 LOC)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-24 23:35:37 +02:00
parent 6811264756
commit b6983ab1dc
99 changed files with 13484 additions and 16106 deletions

View File

@@ -8,18 +8,16 @@ proper attribution tracking.
Collection: bp_legal_templates
Usage:
python legal_templates_ingestion.py --ingest-all
python legal_templates_ingestion.py --ingest-source github-site-policy
python legal_templates_ingestion.py --status
python legal_templates_ingestion.py --search "Datenschutzerklaerung"
python legal_templates_cli.py --ingest-all
python legal_templates_cli.py --ingest-source github-site-policy
python legal_templates_cli.py --status
python legal_templates_cli.py --search "Datenschutzerklaerung"
"""
import asyncio
import hashlib
import json
import logging
import os
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse
@@ -50,6 +48,17 @@ from github_crawler import (
RepositoryDownloader,
)
# Re-export from chunking module for backward compatibility
from legal_templates_chunking import ( # noqa: F401
IngestionStatus,
TemplateChunk,
chunk_text,
create_chunks,
infer_clause_category,
infer_template_type,
split_sentences,
)
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@@ -78,54 +87,6 @@ MAX_RETRIES = 3
RETRY_DELAY = 3.0
@dataclass
class IngestionStatus:
    """Status of a source ingestion.

    Progress counters and collected errors for one source as it moves
    through the pipeline; `status` tracks the lifecycle state.
    """
    source_name: str  # name of the source configuration being ingested
    status: str  # "pending", "running", "completed", "failed"
    documents_found: int = 0  # documents discovered in the source
    chunks_created: int = 0  # chunks produced from those documents
    chunks_indexed: int = 0  # chunks actually written to the index
    errors: List[str] = field(default_factory=list)  # error messages collected during the run
    started_at: Optional[datetime] = None  # presumably set when ingestion starts — confirm in ingest_source
    completed_at: Optional[datetime] = None  # presumably set when ingestion ends — confirm in ingest_source
@dataclass
class TemplateChunk:
    """A chunk of template text ready for indexing.

    Carries the chunk text plus all payload metadata: document identity,
    license terms, provenance, document-shape flags, and usage permissions.
    """
    # --- content / identity ---
    text: str  # the chunk text itself
    chunk_index: int  # position of this chunk within its document
    document_title: str
    template_type: str  # e.g. "privacy_policy", "terms_of_service"
    clause_category: Optional[str]  # e.g. "haftung"; None when no category was inferred
    language: str
    jurisdiction: str
    # --- license terms (copied from the source's license info) ---
    license_id: str
    license_name: str
    license_url: str
    attribution_required: bool
    share_alike: bool
    no_derivatives: bool
    commercial_use: bool
    # --- provenance ---
    source_name: str
    source_url: str
    source_repo: Optional[str]
    source_commit: Optional[str]
    source_file: str
    source_hash: str
    attribution_text: Optional[str]  # pre-rendered attribution line, when required
    copyright_notice: Optional[str]
    # --- document shape ---
    is_complete_document: bool  # single sizeable chunk covering the whole document
    is_modular: bool  # document has sections or markdown headings
    requires_customization: bool  # document contains placeholders
    placeholders: List[str]
    # --- usage permissions ---
    training_allowed: bool
    output_allowed: bool
    modification_allowed: bool
    distortion_prohibited: bool
class LegalTemplatesIngestion:
"""Handles ingestion of legal templates into Qdrant."""
@@ -168,212 +129,6 @@ class LegalTemplatesIngestion:
logger.error(f"Embedding generation failed: {e}")
raise
def _chunk_text(self, text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """
    Split text into overlapping chunks.

    Respects paragraph boundaries where possible; paragraphs longer than
    `chunk_size` are split into sentences. When a sentence-level chunk is
    flushed, trailing sentences totalling at most `overlap` characters
    (always at least one) are carried into the next chunk for context.

    Fix: the `overlap` parameter was previously accepted but ignored —
    a fixed third of the sentence buffer was carried over regardless.

    Args:
        text: Text to split; empty input yields an empty list.
        chunk_size: Target maximum chunk length in characters.
        overlap: Character budget for the sentence overlap between chunks.

    Returns:
        List of non-empty, stripped chunk strings.
    """
    if not text:
        return []
    if len(text) <= chunk_size:
        return [text.strip()]

    chunks: List[str] = []
    current_chunk: List[str] = []
    current_length = 0

    for para in text.split('\n\n'):
        para = para.strip()
        if not para:
            continue
        para_length = len(para)

        if para_length > chunk_size:
            # Oversized paragraph: flush the paragraph buffer, then go sentence by sentence.
            if current_chunk:
                chunks.append('\n\n'.join(current_chunk))
                current_chunk = []
                current_length = 0
            for sentence in self._split_sentences(para):
                if current_length + len(sentence) + 1 > chunk_size:
                    if current_chunk:
                        chunks.append(' '.join(current_chunk))
                        # Carry over trailing sentences up to `overlap` chars (min one).
                        kept: List[str] = []
                        budget = 0
                        for s in reversed(current_chunk):
                            if kept and budget + len(s) + 1 > overlap:
                                break
                            kept.insert(0, s)
                            budget += len(s) + 1
                        current_chunk = kept
                        current_length = sum(len(s) + 1 for s in current_chunk)
                current_chunk.append(sentence)
                current_length += len(sentence) + 1
        elif current_length + para_length + 2 > chunk_size:
            # Paragraph would overflow the current chunk: start a fresh one with it.
            if current_chunk:
                chunks.append('\n\n'.join(current_chunk))
            current_chunk = [para]
            current_length = para_length
        else:
            current_chunk.append(para)
            current_length += para_length + 2

    # Flush whatever remains.
    if current_chunk:
        chunks.append('\n\n'.join(current_chunk))
    return [c.strip() for c in chunks if c.strip()]
def _split_sentences(self, text: str) -> List[str]:
"""Split text into sentences with basic abbreviation handling."""
import re
# Protect common abbreviations
abbreviations = ['bzw', 'ca', 'd.h', 'etc', 'ggf', 'inkl', 'u.a', 'usw', 'z.B', 'z.b', 'e.g', 'i.e', 'vs', 'no']
protected = text
for abbr in abbreviations:
pattern = re.compile(r'\b' + re.escape(abbr) + r'\.', re.IGNORECASE)
protected = pattern.sub(abbr.replace('.', '<DOT>') + '<ABBR>', protected)
# Protect decimal numbers
protected = re.sub(r'(\d)\.(\d)', r'\1<DECIMAL>\2', protected)
# Split on sentence endings
sentences = re.split(r'(?<=[.!?])\s+', protected)
# Restore protected characters
result = []
for s in sentences:
s = s.replace('<DOT>', '.').replace('<ABBR>', '.').replace('<DECIMAL>', '.')
s = s.strip()
if s:
result.append(s)
return result
def _infer_template_type(self, doc: ExtractedDocument, source: SourceConfig) -> str:
    """Infer the template type from document content and metadata.

    Scans the lower-cased text and title for known indicator substrings,
    returning the first matching type in declaration order. Falls back to
    the source's first configured template type, then to "clause".
    """
    haystacks = (doc.text.lower(), doc.title.lower())
    # Indicator substrings per template type; earlier entries win.
    type_indicators = {
        "privacy_policy": ["datenschutz", "privacy", "personal data", "personenbezogen"],
        "terms_of_service": ["nutzungsbedingungen", "terms of service", "terms of use", "agb"],
        "cookie_banner": ["cookie", "cookies", "tracking"],
        "impressum": ["impressum", "legal notice", "imprint"],
        "widerruf": ["widerruf", "cancellation", "withdrawal", "right to cancel"],
        "dpa": ["auftragsverarbeitung", "data processing agreement", "dpa"],
        "sla": ["service level", "availability", "uptime"],
        "nda": ["confidential", "non-disclosure", "geheimhaltung", "vertraulich"],
        "community_guidelines": ["community", "guidelines", "conduct", "verhaltens"],
        "acceptable_use": ["acceptable use", "acceptable usage", "nutzungsrichtlinien"],
    }
    for template_type, indicators in type_indicators.items():
        if any(ind in hay for ind in indicators for hay in haystacks):
            return template_type
    # No textual match: fall back to the source's first configured type.
    if source.template_types:
        return source.template_types[0]
    return "clause"  # Generic fallback
def _infer_clause_category(self, text: str) -> Optional[str]:
"""Infer the clause category from text content."""
text_lower = text.lower()
categories = {
"haftung": ["haftung", "liability", "haftungsausschluss", "limitation"],
"datenschutz": ["datenschutz", "privacy", "personal data", "personenbezogen"],
"widerruf": ["widerruf", "cancellation", "withdrawal"],
"gewaehrleistung": ["gewaehrleistung", "warranty", "garantie"],
"kuendigung": ["kuendigung", "termination", "beendigung"],
"zahlung": ["zahlung", "payment", "preis", "price"],
"gerichtsstand": ["gerichtsstand", "jurisdiction", "governing law"],
"aenderungen": ["aenderung", "modification", "amendment"],
"schlussbestimmungen": ["schlussbestimmung", "miscellaneous", "final provisions"],
}
for category, indicators in categories.items():
for indicator in indicators:
if indicator in text_lower:
return category
return None
def _create_chunks(
    self,
    doc: ExtractedDocument,
    source: SourceConfig,
) -> List[TemplateChunk]:
    """Create template chunks from an extracted document.

    The document text is split via `_chunk_text`, and each piece is
    wrapped in a `TemplateChunk` carrying the license, provenance and
    usage metadata taken from the source configuration.
    """
    license_info = source.license_info
    template_type = self._infer_template_type(doc, source)
    # Chunk the text
    text_chunks = self._chunk_text(doc.text)
    chunks = []
    for i, chunk_text in enumerate(text_chunks):
        # Determine if this is a complete document or a clause:
        # a single sizeable chunk means the document was never split.
        is_complete = len(text_chunks) == 1 and len(chunk_text) > 500
        is_modular = len(doc.sections) > 0 or '##' in doc.text
        requires_customization = len(doc.placeholders) > 0
        # Generate attribution text only when the license demands it.
        attribution_text = None
        if license_info.attribution_required:
            attribution_text = license_info.get_attribution_text(
                source.name,
                doc.source_url or source.get_source_url()
            )
        chunk = TemplateChunk(
            text=chunk_text,
            chunk_index=i,
            document_title=doc.title,
            template_type=template_type,
            clause_category=self._infer_clause_category(chunk_text),
            language=doc.language,
            jurisdiction=source.jurisdiction,
            license_id=license_info.id.value,
            license_name=license_info.name,
            license_url=license_info.url,
            attribution_required=license_info.attribution_required,
            share_alike=license_info.share_alike,
            no_derivatives=license_info.no_derivatives,
            commercial_use=license_info.commercial_use,
            source_name=source.name,
            source_url=doc.source_url or source.get_source_url(),
            source_repo=source.repo_url,
            source_commit=doc.source_commit,
            source_file=doc.file_path,
            source_hash=doc.source_hash,
            attribution_text=attribution_text,
            copyright_notice=None,  # Could be extracted from doc if present
            is_complete_document=is_complete,
            is_modular=is_modular,
            requires_customization=requires_customization,
            placeholders=doc.placeholders,
            training_allowed=license_info.training_allowed,
            output_allowed=license_info.output_allowed,
            modification_allowed=license_info.modification_allowed,
            distortion_prohibited=license_info.distortion_prohibited,
        )
        chunks.append(chunk)
    return chunks
async def ingest_source(self, source: SourceConfig) -> IngestionStatus:
"""Ingest a single source into Qdrant."""
status = IngestionStatus(
@@ -405,7 +160,7 @@ class LegalTemplatesIngestion:
# Create chunks from all documents
all_chunks: List[TemplateChunk] = []
for doc in documents:
chunks = self._create_chunks(doc, source)
chunks = create_chunks(doc, source, CHUNK_SIZE, CHUNK_OVERLAP)
all_chunks.extend(chunks)
status.chunks_created += len(chunks)
@@ -637,21 +392,7 @@ class LegalTemplatesIngestion:
attribution_required: Optional[bool] = None,
top_k: int = 10,
) -> List[Dict[str, Any]]:
"""
Search the legal templates collection.
Args:
query: Search query text
template_type: Filter by template type (e.g., "privacy_policy")
license_types: Filter by license types (e.g., ["cc0", "mit"])
language: Filter by language (e.g., "de")
jurisdiction: Filter by jurisdiction (e.g., "DE")
attribution_required: Filter by attribution requirement
top_k: Number of results to return
Returns:
List of search results with full metadata
"""
"""Search the legal templates collection."""
# Generate query embedding
embeddings = await self._generate_embeddings([query])
query_vector = embeddings[0]
@@ -661,45 +402,27 @@ class LegalTemplatesIngestion:
if template_type:
must_conditions.append(
FieldCondition(
key="template_type",
match=MatchValue(value=template_type),
)
FieldCondition(key="template_type", match=MatchValue(value=template_type))
)
if language:
must_conditions.append(
FieldCondition(
key="language",
match=MatchValue(value=language),
)
FieldCondition(key="language", match=MatchValue(value=language))
)
if jurisdiction:
must_conditions.append(
FieldCondition(
key="jurisdiction",
match=MatchValue(value=jurisdiction),
)
FieldCondition(key="jurisdiction", match=MatchValue(value=jurisdiction))
)
if attribution_required is not None:
must_conditions.append(
FieldCondition(
key="attribution_required",
match=MatchValue(value=attribution_required),
)
FieldCondition(key="attribution_required", match=MatchValue(value=attribution_required))
)
# License type filter (OR condition)
should_conditions = []
if license_types:
for license_type in license_types:
for lt in license_types:
should_conditions.append(
FieldCondition(
key="license_id",
match=MatchValue(value=license_type),
)
FieldCondition(key="license_id", match=MatchValue(value=lt))
)
# Construct filter
@@ -747,196 +470,31 @@ class LegalTemplatesIngestion:
def delete_source(self, source_name: str) -> int:
    """Delete all chunks from a specific source.

    Counts the matching points first (delete does not report how many it
    removed), then deletes them by the same filter.

    Fix: the flattened diff left two `must=` keywords inside each
    Filter(...) call (old multi-line and new one-line form side by
    side), which is a syntax error; only one remains, and the filter
    is built once and reused.

    Args:
        source_name: Value of the `source_name` payload field to match.

    Returns:
        Number of chunks that matched the filter (and were deleted).
    """
    source_filter = Filter(
        must=[FieldCondition(key="source_name", match=MatchValue(value=source_name))]
    )
    # Count first so the caller can be told how many chunks were removed.
    count_result = self.qdrant.count(
        collection_name=LEGAL_TEMPLATES_COLLECTION,
        count_filter=source_filter,
    )
    # Delete everything matching the same filter.
    self.qdrant.delete(
        collection_name=LEGAL_TEMPLATES_COLLECTION,
        points_selector=source_filter,
    )
    return count_result.count
def reset_collection(self):
    """Delete and recreate the collection.

    Also clears the cached per-source ingestion status, since it refers
    to data that no longer exists after the reset.

    Fix: removed a stray duplicate `pass` statement left behind by the
    flattened diff.
    """
    logger.warning(f"Resetting collection: {LEGAL_TEMPLATES_COLLECTION}")
    try:
        self.qdrant.delete_collection(LEGAL_TEMPLATES_COLLECTION)
    except Exception:
        pass  # Collection might not exist yet; a failed delete is fine.
    # Recreate the empty collection and reset the bookkeeping.
    self._ensure_collection()
    self._ingestion_status.clear()
    logger.info(f"Collection {LEGAL_TEMPLATES_COLLECTION} reset")
async def close(self):
"""Close HTTP client."""
await self.http_client.aclose()
async def main():
    """CLI entry point.

    Parses command-line flags and dispatches to exactly one action
    (reset, delete-source, status, ingest-all, ingest-source,
    ingest-license, or search); prints help when no action flag is
    given. The ingestion client is always closed on exit.
    """
    import argparse
    parser = argparse.ArgumentParser(description="Legal Templates Ingestion")
    parser.add_argument(
        "--ingest-all",
        action="store_true",
        help="Ingest all enabled sources"
    )
    parser.add_argument(
        "--ingest-source",
        type=str,
        metavar="NAME",
        help="Ingest a specific source by name"
    )
    parser.add_argument(
        "--ingest-license",
        type=str,
        choices=["cc0", "mit", "cc_by_4", "public_domain"],
        help="Ingest all sources of a specific license type"
    )
    parser.add_argument(
        "--max-priority",
        type=int,
        default=3,
        help="Maximum priority level to ingest (1=highest, 5=lowest)"
    )
    parser.add_argument(
        "--status",
        action="store_true",
        help="Show collection status"
    )
    parser.add_argument(
        "--search",
        type=str,
        metavar="QUERY",
        help="Test search query"
    )
    parser.add_argument(
        "--template-type",
        type=str,
        help="Filter search by template type"
    )
    parser.add_argument(
        "--language",
        type=str,
        help="Filter search by language"
    )
    parser.add_argument(
        "--reset",
        action="store_true",
        help="Reset (delete and recreate) the collection"
    )
    parser.add_argument(
        "--delete-source",
        type=str,
        metavar="NAME",
        help="Delete all chunks from a source"
    )
    args = parser.parse_args()
    ingestion = LegalTemplatesIngestion()
    try:
        # Exactly one action runs; precedence follows this if/elif chain.
        if args.reset:
            ingestion.reset_collection()
            print("Collection reset successfully")
        elif args.delete_source:
            count = ingestion.delete_source(args.delete_source)
            print(f"Deleted {count} chunks from {args.delete_source}")
        elif args.status:
            status = ingestion.get_status()
            print(json.dumps(status, indent=2, default=str))
        elif args.ingest_all:
            print(f"Ingesting all sources (max priority: {args.max_priority})...")
            results = await ingestion.ingest_all(max_priority=args.max_priority)
            print("\nResults:")
            for name, status in results.items():
                print(f" {name}: {status.chunks_indexed} chunks ({status.status})")
                if status.errors:
                    for error in status.errors:
                        print(f" ERROR: {error}")
            total = sum(s.chunks_indexed for s in results.values())
            print(f"\nTotal: {total} chunks indexed")
        elif args.ingest_source:
            # Resolve the named source from the static registry.
            source = next(
                (s for s in TEMPLATE_SOURCES if s.name == args.ingest_source),
                None
            )
            if not source:
                print(f"Unknown source: {args.ingest_source}")
                print("Available sources:")
                for s in TEMPLATE_SOURCES:
                    print(f" - {s.name}")
                return
            print(f"Ingesting: {source.name}")
            status = await ingestion.ingest_source(source)
            print(f"\nResult: {status.chunks_indexed} chunks ({status.status})")
            if status.errors:
                for error in status.errors:
                    print(f" ERROR: {error}")
        elif args.ingest_license:
            license_type = LicenseType(args.ingest_license)
            print(f"Ingesting all {license_type.value} sources...")
            results = await ingestion.ingest_by_license(license_type)
            print("\nResults:")
            for name, status in results.items():
                print(f" {name}: {status.chunks_indexed} chunks ({status.status})")
        elif args.search:
            print(f"Searching: {args.search}")
            results = await ingestion.search(
                args.search,
                template_type=args.template_type,
                language=args.language,
            )
            print(f"\nFound {len(results)} results:")
            for i, result in enumerate(results, 1):
                print(f"\n{i}. [{result['template_type']}] {result['document_title']}")
                print(f" Score: {result['score']:.3f}")
                print(f" License: {result['license_name']}")
                print(f" Source: {result['source_name']}")
                print(f" Language: {result['language']}")
                if result['attribution_required']:
                    print(f" Attribution: {result['attribution_text']}")
                print(f" Text: {result['text'][:200]}...")
        else:
            parser.print_help()
    finally:
        # Always release the HTTP client, even when an action failed.
        await ingestion.close()


if __name__ == "__main__":
    asyncio.run(main())