[split-required] Split 500-1000 LOC files across all services
backend-lehrer (5 files): - alerts_agent/db/repository.py (992 → 5), abitur_docs_api.py (956 → 3) - teacher_dashboard_api.py (951 → 3), services/pdf_service.py (916 → 3) - mail/mail_db.py (987 → 6) klausur-service (5 files): - legal_templates_ingestion.py (942 → 3), ocr_pipeline_postprocess.py (929 → 4) - ocr_pipeline_words.py (876 → 3), ocr_pipeline_ocr_merge.py (616 → 2) - KorrekturPage.tsx (956 → 6) website (5 pages): - mail (985 → 9), edu-search (958 → 8), mac-mini (950 → 7) - ocr-labeling (946 → 7), audit-workspace (871 → 4) studio-v2 (5 files + 1 deleted): - page.tsx (946 → 5), MessagesContext.tsx (925 → 4) - korrektur (914 → 6), worksheet-cleanup (899 → 6) - useVocabWorksheet.ts (888 → 3) - Deleted dead page-original.tsx (934 LOC) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
282
klausur-service/backend/legal_templates_chunking.py
Normal file
282
klausur-service/backend/legal_templates_chunking.py
Normal file
@@ -0,0 +1,282 @@
|
||||
"""
|
||||
Legal Templates Chunking — text splitting, type inference, and chunk creation.
|
||||
|
||||
Extracted from legal_templates_ingestion.py to keep files under 500 LOC.
|
||||
|
||||
Lizenz: Apache 2.0
|
||||
"""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import List, Optional
|
||||
|
||||
from template_sources import SourceConfig
|
||||
from github_crawler import ExtractedDocument
|
||||
|
||||
|
||||
# Chunking configuration defaults (can be overridden by env vars in ingestion module)
|
||||
DEFAULT_CHUNK_SIZE = 1000
|
||||
DEFAULT_CHUNK_OVERLAP = 200
|
||||
|
||||
|
||||
@dataclass
class TemplateChunk:
    """A chunk of template text ready for indexing.

    Bundles the chunk text with classification, license, provenance, and
    usage-permission metadata so each chunk is a self-contained record.
    """
    # Chunk content and its position within the source document.
    text: str
    chunk_index: int
    # Classification of the template this chunk belongs to.
    document_title: str
    template_type: str
    clause_category: Optional[str]
    language: str
    jurisdiction: str
    # License metadata (copied from the source's license info).
    license_id: str
    license_name: str
    license_url: str
    attribution_required: bool
    share_alike: bool
    no_derivatives: bool
    commercial_use: bool
    # Provenance: where the text was extracted from.
    source_name: str
    source_url: str
    source_repo: Optional[str]
    source_commit: Optional[str]
    source_file: str
    source_hash: str
    attribution_text: Optional[str]
    copyright_notice: Optional[str]
    # Structural flags describing the document/chunk.
    is_complete_document: bool
    is_modular: bool
    requires_customization: bool
    placeholders: List[str]
    # Usage permissions derived from the license.
    training_allowed: bool
    output_allowed: bool
    modification_allowed: bool
    distortion_prohibited: bool
|
||||
|
||||
|
||||
@dataclass
class IngestionStatus:
    """Status of a source ingestion.

    Tracks progress counters, accumulated errors, and timing for one
    ingestion run of a single source.
    """
    source_name: str
    status: str # "pending", "running", "completed", "failed"
    documents_found: int = 0
    chunks_created: int = 0
    chunks_indexed: int = 0
    # Error messages collected during the run (empty list when clean).
    errors: List[str] = field(default_factory=list)
    # Timestamps bracketing the run; None until the run starts/finishes.
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
|
||||
|
||||
|
||||
def split_sentences(text: str) -> List[str]:
    """Split text into sentences with basic abbreviation handling.

    Protects common German/English abbreviations and decimal numbers from
    being treated as sentence endings, splits on `.`, `!`, or `?` followed
    by whitespace, then restores the protected characters.

    Args:
        text: Raw text to split.

    Returns:
        List of stripped, non-empty sentence strings (empty list for
        empty input).
    """
    # Protect common abbreviations
    abbreviations = ['bzw', 'ca', 'd.h', 'etc', 'ggf', 'inkl', 'u.a', 'usw', 'z.B', 'z.b', 'e.g', 'i.e', 'vs', 'no']
    protected = text
    for abbr in abbreviations:
        pattern = re.compile(r'\b' + re.escape(abbr) + r'\.', re.IGNORECASE)
        # Bug fix: reuse the matched text (minus its trailing dot) instead of
        # the lowercase list entry, so the original casing is preserved —
        # the previous fixed-string replacement turned e.g. "Z.B." into "z.B.".
        protected = pattern.sub(
            lambda m: m.group(0)[:-1].replace('.', '<DOT>') + '<ABBR>',
            protected,
        )

    # Protect decimal numbers
    protected = re.sub(r'(\d)\.(\d)', r'\1<DECIMAL>\2', protected)

    # Split on sentence endings
    sentences = re.split(r'(?<=[.!?])\s+', protected)

    # Restore protected characters
    result = []
    for s in sentences:
        s = s.replace('<DOT>', '.').replace('<ABBR>', '.').replace('<DECIMAL>', '.')
        s = s.strip()
        if s:
            result.append(s)

    return result
|
||||
|
||||
|
||||
def chunk_text(
    text: str,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    overlap: int = DEFAULT_CHUNK_OVERLAP,
) -> List[str]:
    """
    Split text into overlapping chunks.
    Respects paragraph and sentence boundaries where possible.

    Args:
        text: Raw text to split.
        chunk_size: Soft upper bound, in characters, for each chunk.
        overlap: NOTE(review): accepted but currently unused — for oversized
            paragraphs the carried-over overlap is derived as one third of
            the buffered sentences instead; confirm whether this parameter
            was meant to drive that.

    Returns:
        List of non-empty, stripped chunk strings.
    """
    if not text:
        return []

    # Short input fits in a single chunk.
    if len(text) <= chunk_size:
        return [text.strip()]

    # Split into paragraphs first
    paragraphs = text.split('\n\n')
    chunks = []
    current_chunk: List[str] = []
    current_length = 0

    for para in paragraphs:
        para = para.strip()
        if not para:
            continue

        para_length = len(para)

        if para_length > chunk_size:
            # Large paragraph: split by sentences
            # Flush buffered paragraphs before switching to sentence mode.
            if current_chunk:
                chunks.append('\n\n'.join(current_chunk))
                current_chunk = []
                current_length = 0

            # Split long paragraph by sentences
            sentences = split_sentences(para)
            for sentence in sentences:
                if current_length + len(sentence) + 1 > chunk_size:
                    if current_chunk:
                        chunks.append(' '.join(current_chunk))
                        # Keep overlap: carry the trailing third of the
                        # sentences into the next chunk for shared context.
                        overlap_count = max(1, len(current_chunk) // 3)
                        current_chunk = current_chunk[-overlap_count:]
                        current_length = sum(len(s) + 1 for s in current_chunk)
                current_chunk.append(sentence)
                current_length += len(sentence) + 1
            # NOTE(review): sentences left buffered here may later be joined
            # with '\n\n' together with following small paragraphs — confirm
            # that mixing is intended.

        elif current_length + para_length + 2 > chunk_size:
            # Paragraph would exceed chunk size
            if current_chunk:
                chunks.append('\n\n'.join(current_chunk))
                current_chunk = []
                current_length = 0
            current_chunk.append(para)
            current_length = para_length

        else:
            # Paragraph fits: accumulate it (+2 accounts for the '\n\n' join).
            current_chunk.append(para)
            current_length += para_length + 2

    # Add final chunk
    if current_chunk:
        chunks.append('\n\n'.join(current_chunk))

    return [c.strip() for c in chunks if c.strip()]
|
||||
|
||||
|
||||
def infer_template_type(doc: ExtractedDocument, source: SourceConfig) -> str:
    """Infer the template type from document content and metadata.

    Scans the document body and title (case-insensitively) for known
    keyword indicators; falls back to the source's first configured
    template type, and finally to a generic "clause".
    """
    haystacks = (doc.text.lower(), doc.title.lower())

    # Ordered mapping: earlier entries take priority when several match.
    type_indicators = {
        "privacy_policy": ["datenschutz", "privacy", "personal data", "personenbezogen"],
        "terms_of_service": ["nutzungsbedingungen", "terms of service", "terms of use", "agb"],
        "cookie_banner": ["cookie", "cookies", "tracking"],
        "impressum": ["impressum", "legal notice", "imprint"],
        "widerruf": ["widerruf", "cancellation", "withdrawal", "right to cancel"],
        "dpa": ["auftragsverarbeitung", "data processing agreement", "dpa"],
        "sla": ["service level", "availability", "uptime"],
        "nda": ["confidential", "non-disclosure", "geheimhaltung", "vertraulich"],
        "community_guidelines": ["community", "guidelines", "conduct", "verhaltens"],
        "acceptable_use": ["acceptable use", "acceptable usage", "nutzungsrichtlinien"],
    }

    for template_type, indicators in type_indicators.items():
        if any(needle in hay for needle in indicators for hay in haystacks):
            return template_type

    # No keyword hit: defer to the source's configured template types.
    if source.template_types:
        return source.template_types[0]

    return "clause"  # Generic fallback
|
||||
|
||||
|
||||
def infer_clause_category(text: str) -> Optional[str]:
    """Infer the clause category from text content.

    Returns the first category (in priority order) whose indicator keywords
    appear in the lowercased text, or None when nothing matches.
    """
    lowered = text.lower()

    # Ordered (category, indicators) pairs: the first match wins.
    category_indicators = [
        ("haftung", ("haftung", "liability", "haftungsausschluss", "limitation")),
        ("datenschutz", ("datenschutz", "privacy", "personal data", "personenbezogen")),
        ("widerruf", ("widerruf", "cancellation", "withdrawal")),
        ("gewaehrleistung", ("gewaehrleistung", "warranty", "garantie")),
        ("kuendigung", ("kuendigung", "termination", "beendigung")),
        ("zahlung", ("zahlung", "payment", "preis", "price")),
        ("gerichtsstand", ("gerichtsstand", "jurisdiction", "governing law")),
        ("aenderungen", ("aenderung", "modification", "amendment")),
        ("schlussbestimmungen", ("schlussbestimmung", "miscellaneous", "final provisions")),
    ]

    for category, indicators in category_indicators:
        if any(indicator in lowered for indicator in indicators):
            return category

    return None
|
||||
|
||||
|
||||
def create_chunks(
    doc: ExtractedDocument,
    source: SourceConfig,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
) -> List[TemplateChunk]:
    """Create template chunks from an extracted document.

    Splits the document text into chunks and wraps each one in a
    TemplateChunk carrying the source's license, provenance, and
    usage-permission metadata.
    """
    license_info = source.license_info
    template_type = infer_template_type(doc, source)

    # Chunk the raw document text first.
    pieces = chunk_text(doc.text, chunk_size, chunk_overlap)
    single_piece = len(pieces) == 1

    result: List[TemplateChunk] = []
    for index, piece in enumerate(pieces):
        # A lone, reasonably long piece counts as the complete document;
        # section markers or placeholders flag structure/customization needs.
        is_complete = single_piece and len(piece) > 500
        is_modular = len(doc.sections) > 0 or '##' in doc.text
        requires_customization = len(doc.placeholders) > 0

        # Attribution text is only generated when the license demands it.
        attribution_text = None
        if license_info.attribution_required:
            attribution_text = license_info.get_attribution_text(
                source.name,
                doc.source_url or source.get_source_url()
            )

        result.append(TemplateChunk(
            text=piece,
            chunk_index=index,
            document_title=doc.title,
            template_type=template_type,
            clause_category=infer_clause_category(piece),
            language=doc.language,
            jurisdiction=source.jurisdiction,
            license_id=license_info.id.value,
            license_name=license_info.name,
            license_url=license_info.url,
            attribution_required=license_info.attribution_required,
            share_alike=license_info.share_alike,
            no_derivatives=license_info.no_derivatives,
            commercial_use=license_info.commercial_use,
            source_name=source.name,
            source_url=doc.source_url or source.get_source_url(),
            source_repo=source.repo_url,
            source_commit=doc.source_commit,
            source_file=doc.file_path,
            source_hash=doc.source_hash,
            attribution_text=attribution_text,
            copyright_notice=None,
            is_complete_document=is_complete,
            is_modular=is_modular,
            requires_customization=requires_customization,
            placeholders=doc.placeholders,
            training_allowed=license_info.training_allowed,
            output_allowed=license_info.output_allowed,
            modification_allowed=license_info.modification_allowed,
            distortion_prohibited=license_info.distortion_prohibited,
        ))

    return result
|
||||
Reference in New Issue
Block a user