[split-required] Split 500-1000 LOC files across all services

backend-lehrer (5 files):
- alerts_agent/db/repository.py (992 → 5), abitur_docs_api.py (956 → 3)
- teacher_dashboard_api.py (951 → 3), services/pdf_service.py (916 → 3)
- mail/mail_db.py (987 → 6)

klausur-service (5 files):
- legal_templates_ingestion.py (942 → 3), ocr_pipeline_postprocess.py (929 → 4)
- ocr_pipeline_words.py (876 → 3), ocr_pipeline_ocr_merge.py (616 → 2)
- KorrekturPage.tsx (956 → 6)

website (5 pages):
- mail (985 → 9), edu-search (958 → 8), mac-mini (950 → 7)
- ocr-labeling (946 → 7), audit-workspace (871 → 4)

studio-v2 (5 files + 1 deleted):
- page.tsx (946 → 5), MessagesContext.tsx (925 → 4)
- korrektur (914 → 6), worksheet-cleanup (899 → 6)
- useVocabWorksheet.ts (888 → 3)
- Deleted dead page-original.tsx (934 LOC)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-24 23:35:37 +02:00
parent 6811264756
commit b6983ab1dc
99 changed files with 13484 additions and 16106 deletions

View File

@@ -0,0 +1,282 @@
"""
Legal Templates Chunking — text splitting, type inference, and chunk creation.
Extracted from legal_templates_ingestion.py to keep files under 500 LOC.
Lizenz: Apache 2.0
"""
import re
from dataclasses import dataclass, field
from datetime import datetime
from typing import List, Optional
from template_sources import SourceConfig
from github_crawler import ExtractedDocument
# Chunking configuration defaults (can be overridden by env vars in ingestion module)
DEFAULT_CHUNK_SIZE = 1000  # target maximum characters per chunk
DEFAULT_CHUNK_OVERLAP = 200  # requested overlap between adjacent chunks, in characters
@dataclass
class TemplateChunk:
    """A chunk of template text ready for indexing.

    Bundles the chunk's text with document-, license-, and provenance-level
    metadata copied from the source, so every indexed chunk is
    self-describing without a lookup back to the source record.
    """
    # --- Content and position within the source document ---
    text: str
    chunk_index: int
    document_title: str
    # --- Classification (see infer_template_type / infer_clause_category) ---
    template_type: str
    clause_category: Optional[str]
    language: str
    jurisdiction: str
    # --- License metadata (copied from the source's license_info) ---
    license_id: str
    license_name: str
    license_url: str
    attribution_required: bool
    share_alike: bool
    no_derivatives: bool
    commercial_use: bool
    # --- Provenance: where the chunk's text came from ---
    source_name: str
    source_url: str
    source_repo: Optional[str]
    source_commit: Optional[str]
    source_file: str
    source_hash: str
    # --- Attribution / copyright strings (None when not required or unknown) ---
    attribution_text: Optional[str]
    copyright_notice: Optional[str]
    # --- Characteristics of the template text itself ---
    is_complete_document: bool
    is_modular: bool
    requires_customization: bool
    placeholders: List[str]
    # --- License-derived usage permissions ---
    training_allowed: bool
    output_allowed: bool
    modification_allowed: bool
    distortion_prohibited: bool
@dataclass
class IngestionStatus:
    """Status of a source ingestion.

    Mutable progress record for the ingestion of one source: counters and
    error messages are filled in as the run advances.
    """
    source_name: str
    status: str  # "pending", "running", "completed", "failed"
    documents_found: int = 0
    chunks_created: int = 0
    chunks_indexed: int = 0
    errors: List[str] = field(default_factory=list)  # error messages collected during the run
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
def split_sentences(text: str) -> List[str]:
    """Split text into sentences with basic abbreviation handling.

    Periods that belong to known abbreviations ("z.B.", "etc.") and to
    decimal numbers ("3.5") are temporarily masked so they are not treated
    as sentence terminators, then restored afterwards.

    Args:
        text: Raw paragraph text (German or English).

    Returns:
        Non-empty, stripped sentence strings in original order.
    """
    abbreviations = ['bzw', 'ca', 'd.h', 'etc', 'ggf', 'inkl', 'u.a', 'usw', 'z.B', 'z.b', 'e.g', 'i.e', 'vs', 'no']
    protected = text
    for abbr in abbreviations:
        pattern = re.compile(r'\b' + re.escape(abbr) + r'\.', re.IGNORECASE)
        # Re-emit the matched text (minus its final dot) instead of the
        # fixed list entry: with IGNORECASE, substituting the list entry
        # would silently rewrite the input's casing (e.g. "z.b." -> "z.B.",
        # "No." -> "no.") in the returned sentences.
        protected = pattern.sub(
            lambda m: m.group(0)[:-1].replace('.', '<DOT>') + '<ABBR>',
            protected,
        )
    # Protect decimal numbers
    protected = re.sub(r'(\d)\.(\d)', r'\1<DECIMAL>\2', protected)
    # Split after sentence-ending punctuation followed by whitespace
    sentences = re.split(r'(?<=[.!?])\s+', protected)
    # Restore protected characters and drop empty fragments
    result = []
    for s in sentences:
        s = s.replace('<DOT>', '.').replace('<ABBR>', '.').replace('<DECIMAL>', '.')
        s = s.strip()
        if s:
            result.append(s)
    return result
def chunk_text(
    text: str,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    overlap: int = DEFAULT_CHUNK_OVERLAP,
) -> List[str]:
    """
    Split text into overlapping chunks.
    Respects paragraph and sentence boundaries where possible.

    NOTE(review): the ``overlap`` parameter is not read anywhere in the
    body — sentence-level overlap is derived as one third of the running
    chunk instead (see below). Confirm whether that is intentional.
    """
    if not text:
        return []
    # Short texts fit in a single chunk unchanged.
    if len(text) <= chunk_size:
        return [text.strip()]
    # Split into paragraphs first
    paragraphs = text.split('\n\n')
    chunks = []
    current_chunk: List[str] = []  # pieces (paragraphs or sentences) of the chunk being built
    current_length = 0  # running character count of current_chunk incl. separators
    for para in paragraphs:
        para = para.strip()
        if not para:
            continue
        para_length = len(para)
        if para_length > chunk_size:
            # Large paragraph: split by sentences
            # Flush whatever was accumulated so far as its own chunk first.
            if current_chunk:
                chunks.append('\n\n'.join(current_chunk))
                current_chunk = []
                current_length = 0
            # Split long paragraph by sentences
            sentences = split_sentences(para)
            for sentence in sentences:
                if current_length + len(sentence) + 1 > chunk_size:
                    if current_chunk:
                        chunks.append(' '.join(current_chunk))
                        # Keep overlap: carry the trailing third of the
                        # sentences into the next chunk for context.
                        overlap_count = max(1, len(current_chunk) // 3)
                        current_chunk = current_chunk[-overlap_count:]
                        current_length = sum(len(s) + 1 for s in current_chunk)
                current_chunk.append(sentence)
                current_length += len(sentence) + 1
            # NOTE(review): leftover sentences stay in current_chunk here and
            # may be joined with later paragraphs via '\n\n' — verify that
            # mixing is acceptable.
        elif current_length + para_length + 2 > chunk_size:
            # Paragraph would exceed chunk size
            if current_chunk:
                chunks.append('\n\n'.join(current_chunk))
                current_chunk = []
                current_length = 0
            current_chunk.append(para)
            current_length = para_length
        else:
            current_chunk.append(para)
            current_length += para_length + 2
    # Add final chunk
    if current_chunk:
        chunks.append('\n\n'.join(current_chunk))
    return [c.strip() for c in chunks if c.strip()]
def infer_template_type(doc: ExtractedDocument, source: SourceConfig) -> str:
    """Infer the template type from document content and metadata.

    Scans the document body and title (case-insensitively) for known
    keyword indicators; falls back to the source's declared template
    types, then to a generic "clause".
    """
    body = doc.text.lower()
    heading = doc.title.lower()
    # Ordered map: the first type with any matching indicator wins.
    type_indicators = {
        "privacy_policy": ["datenschutz", "privacy", "personal data", "personenbezogen"],
        "terms_of_service": ["nutzungsbedingungen", "terms of service", "terms of use", "agb"],
        "cookie_banner": ["cookie", "cookies", "tracking"],
        "impressum": ["impressum", "legal notice", "imprint"],
        "widerruf": ["widerruf", "cancellation", "withdrawal", "right to cancel"],
        "dpa": ["auftragsverarbeitung", "data processing agreement", "dpa"],
        "sla": ["service level", "availability", "uptime"],
        "nda": ["confidential", "non-disclosure", "geheimhaltung", "vertraulich"],
        "community_guidelines": ["community", "guidelines", "conduct", "verhaltens"],
        "acceptable_use": ["acceptable use", "acceptable usage", "nutzungsrichtlinien"],
    }
    for candidate, needles in type_indicators.items():
        if any(needle in body or needle in heading for needle in needles):
            return candidate
    # No indicator matched: fall back to the source's first template type.
    if source.template_types:
        return source.template_types[0]
    return "clause"  # Generic fallback
def infer_clause_category(text: str) -> Optional[str]:
    """Infer the clause category from text content.

    Returns the first category (in declaration order) whose keyword
    indicators appear in the lower-cased text, or None if none match.
    """
    haystack = text.lower()
    category_markers = {
        "haftung": ["haftung", "liability", "haftungsausschluss", "limitation"],
        "datenschutz": ["datenschutz", "privacy", "personal data", "personenbezogen"],
        "widerruf": ["widerruf", "cancellation", "withdrawal"],
        "gewaehrleistung": ["gewaehrleistung", "warranty", "garantie"],
        "kuendigung": ["kuendigung", "termination", "beendigung"],
        "zahlung": ["zahlung", "payment", "preis", "price"],
        "gerichtsstand": ["gerichtsstand", "jurisdiction", "governing law"],
        "aenderungen": ["aenderung", "modification", "amendment"],
        "schlussbestimmungen": ["schlussbestimmung", "miscellaneous", "final provisions"],
    }
    return next(
        (
            name
            for name, markers in category_markers.items()
            if any(marker in haystack for marker in markers)
        ),
        None,
    )
def create_chunks(
    doc: ExtractedDocument,
    source: SourceConfig,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
) -> List[TemplateChunk]:
    """Create template chunks from an extracted document.

    Splits the document text via chunk_text() and wraps each piece in a
    TemplateChunk carrying the source's license and provenance metadata.
    """
    license_info = source.license_info
    inferred_type = infer_template_type(doc, source)
    pieces = chunk_text(doc.text, chunk_size, chunk_overlap)
    result: List[TemplateChunk] = []
    for idx, piece in enumerate(pieces):
        # A single large piece counts as a complete document rather than a clause.
        complete = len(pieces) == 1 and len(piece) > 500
        modular = len(doc.sections) > 0 or '##' in doc.text
        customization_needed = len(doc.placeholders) > 0
        # Attribution text is only generated for licenses that require it.
        attribution = None
        if license_info.attribution_required:
            attribution = license_info.get_attribution_text(
                source.name,
                doc.source_url or source.get_source_url()
            )
        result.append(TemplateChunk(
            text=piece,
            chunk_index=idx,
            document_title=doc.title,
            template_type=inferred_type,
            clause_category=infer_clause_category(piece),
            language=doc.language,
            jurisdiction=source.jurisdiction,
            license_id=license_info.id.value,
            license_name=license_info.name,
            license_url=license_info.url,
            attribution_required=license_info.attribution_required,
            share_alike=license_info.share_alike,
            no_derivatives=license_info.no_derivatives,
            commercial_use=license_info.commercial_use,
            source_name=source.name,
            source_url=doc.source_url or source.get_source_url(),
            source_repo=source.repo_url,
            source_commit=doc.source_commit,
            source_file=doc.file_path,
            source_hash=doc.source_hash,
            attribution_text=attribution,
            copyright_notice=None,
            is_complete_document=complete,
            is_modular=modular,
            requires_customization=customization_needed,
            placeholders=doc.placeholders,
            training_allowed=license_info.training_allowed,
            output_allowed=license_info.output_allowed,
            modification_allowed=license_info.modification_allowed,
            distortion_prohibited=license_info.distortion_prohibited,
        ))
    return result