[split-required] Split 500-1000 LOC files across all services

backend-lehrer (5 files):
- alerts_agent/db/repository.py (992 → 5), abitur_docs_api.py (956 → 3)
- teacher_dashboard_api.py (951 → 3), services/pdf_service.py (916 → 3)
- mail/mail_db.py (987 → 6)

klausur-service (5 files):
- legal_templates_ingestion.py (942 → 3), ocr_pipeline_postprocess.py (929 → 4)
- ocr_pipeline_words.py (876 → 3), ocr_pipeline_ocr_merge.py (616 → 2)
- KorrekturPage.tsx (956 → 6)

website (5 pages):
- mail (985 → 9), edu-search (958 → 8), mac-mini (950 → 7)
- ocr-labeling (946 → 7), audit-workspace (871 → 4)

studio-v2 (5 files + 1 deleted):
- page.tsx (946 → 5), MessagesContext.tsx (925 → 4)
- korrektur (914 → 6), worksheet-cleanup (899 → 6)
- useVocabWorksheet.ts (888 → 3)
- Deleted dead page-original.tsx (934 LOC)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-24 23:35:37 +02:00
parent 6811264756
commit b6983ab1dc
99 changed files with 13484 additions and 16106 deletions

View File

@@ -8,18 +8,16 @@ proper attribution tracking.
Collection: bp_legal_templates
Usage:
python legal_templates_ingestion.py --ingest-all
python legal_templates_ingestion.py --ingest-source github-site-policy
python legal_templates_ingestion.py --status
python legal_templates_ingestion.py --search "Datenschutzerklaerung"
python legal_templates_cli.py --ingest-all
python legal_templates_cli.py --ingest-source github-site-policy
python legal_templates_cli.py --status
python legal_templates_cli.py --search "Datenschutzerklaerung"
"""
import asyncio
import hashlib
import json
import logging
import os
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse
@@ -50,6 +48,17 @@ from github_crawler import (
RepositoryDownloader,
)
# Re-export from chunking module for backward compatibility
from legal_templates_chunking import ( # noqa: F401
IngestionStatus,
TemplateChunk,
chunk_text,
create_chunks,
infer_clause_category,
infer_template_type,
split_sentences,
)
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@@ -78,54 +87,6 @@ MAX_RETRIES = 3
RETRY_DELAY = 3.0
@dataclass
class IngestionStatus:
    """Status of a source ingestion.

    Progress counters and collected errors for one source as it moves
    through the pipeline; `status` tracks the lifecycle state.
    """
    source_name: str  # name of the source configuration being ingested
    status: str  # "pending", "running", "completed", "failed"
    documents_found: int = 0  # documents discovered in the source
    chunks_created: int = 0  # chunks produced from those documents
    chunks_indexed: int = 0  # chunks actually written to the index
    errors: List[str] = field(default_factory=list)  # error messages collected during the run
    started_at: Optional[datetime] = None  # presumably set when ingestion starts — confirm in ingest_source
    completed_at: Optional[datetime] = None  # presumably set when ingestion ends — confirm in ingest_source
@dataclass
class TemplateChunk:
    """A chunk of template text ready for indexing.

    Carries the chunk text plus all payload metadata: document identity,
    license terms, provenance, document-shape flags, and usage permissions.
    """
    # --- content / identity ---
    text: str  # the chunk text itself
    chunk_index: int  # position of this chunk within its document
    document_title: str
    template_type: str  # e.g. "privacy_policy", "terms_of_service"
    clause_category: Optional[str]  # e.g. "haftung"; None when no category was inferred
    language: str
    jurisdiction: str
    # --- license terms (copied from the source's license info) ---
    license_id: str
    license_name: str
    license_url: str
    attribution_required: bool
    share_alike: bool
    no_derivatives: bool
    commercial_use: bool
    # --- provenance ---
    source_name: str
    source_url: str
    source_repo: Optional[str]
    source_commit: Optional[str]
    source_file: str
    source_hash: str
    attribution_text: Optional[str]  # pre-rendered attribution line, when required
    copyright_notice: Optional[str]
    # --- document shape ---
    is_complete_document: bool  # single sizeable chunk covering the whole document
    is_modular: bool  # document has sections or markdown headings
    requires_customization: bool  # document contains placeholders
    placeholders: List[str]
    # --- usage permissions ---
    training_allowed: bool
    output_allowed: bool
    modification_allowed: bool
    distortion_prohibited: bool
class LegalTemplatesIngestion:
"""Handles ingestion of legal templates into Qdrant."""
@@ -168,212 +129,6 @@ class LegalTemplatesIngestion:
logger.error(f"Embedding generation failed: {e}")
raise
def _chunk_text(self, text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """
    Split text into overlapping chunks.

    Respects paragraph boundaries where possible; paragraphs longer than
    `chunk_size` are split into sentences. When a sentence-level chunk is
    flushed, trailing sentences totalling at most `overlap` characters
    (always at least one) are carried into the next chunk for context.

    Fix: the `overlap` parameter was previously accepted but ignored —
    a fixed third of the sentence buffer was carried over regardless.

    Args:
        text: Text to split; empty input yields an empty list.
        chunk_size: Target maximum chunk length in characters.
        overlap: Character budget for the sentence overlap between chunks.

    Returns:
        List of non-empty, stripped chunk strings.
    """
    if not text:
        return []
    if len(text) <= chunk_size:
        return [text.strip()]

    chunks: List[str] = []
    current_chunk: List[str] = []
    current_length = 0

    for para in text.split('\n\n'):
        para = para.strip()
        if not para:
            continue
        para_length = len(para)

        if para_length > chunk_size:
            # Oversized paragraph: flush the paragraph buffer, then go sentence by sentence.
            if current_chunk:
                chunks.append('\n\n'.join(current_chunk))
                current_chunk = []
                current_length = 0
            for sentence in self._split_sentences(para):
                if current_length + len(sentence) + 1 > chunk_size:
                    if current_chunk:
                        chunks.append(' '.join(current_chunk))
                        # Carry over trailing sentences up to `overlap` chars (min one).
                        kept: List[str] = []
                        budget = 0
                        for s in reversed(current_chunk):
                            if kept and budget + len(s) + 1 > overlap:
                                break
                            kept.insert(0, s)
                            budget += len(s) + 1
                        current_chunk = kept
                        current_length = sum(len(s) + 1 for s in current_chunk)
                current_chunk.append(sentence)
                current_length += len(sentence) + 1
        elif current_length + para_length + 2 > chunk_size:
            # Paragraph would overflow the current chunk: start a fresh one with it.
            if current_chunk:
                chunks.append('\n\n'.join(current_chunk))
            current_chunk = [para]
            current_length = para_length
        else:
            current_chunk.append(para)
            current_length += para_length + 2

    # Flush whatever remains.
    if current_chunk:
        chunks.append('\n\n'.join(current_chunk))
    return [c.strip() for c in chunks if c.strip()]
def _split_sentences(self, text: str) -> List[str]:
"""Split text into sentences with basic abbreviation handling."""
import re
# Protect common abbreviations
abbreviations = ['bzw', 'ca', 'd.h', 'etc', 'ggf', 'inkl', 'u.a', 'usw', 'z.B', 'z.b', 'e.g', 'i.e', 'vs', 'no']
protected = text
for abbr in abbreviations:
pattern = re.compile(r'\b' + re.escape(abbr) + r'\.', re.IGNORECASE)
protected = pattern.sub(abbr.replace('.', '<DOT>') + '<ABBR>', protected)
# Protect decimal numbers
protected = re.sub(r'(\d)\.(\d)', r'\1<DECIMAL>\2', protected)
# Split on sentence endings
sentences = re.split(r'(?<=[.!?])\s+', protected)
# Restore protected characters
result = []
for s in sentences:
s = s.replace('<DOT>', '.').replace('<ABBR>', '.').replace('<DECIMAL>', '.')
s = s.strip()
if s:
result.append(s)
return result
def _infer_template_type(self, doc: ExtractedDocument, source: SourceConfig) -> str:
    """Infer the template type from document content and metadata.

    Scans the lower-cased text and title for known indicator substrings,
    returning the first matching type in declaration order. Falls back to
    the source's first configured template type, then to "clause".
    """
    haystacks = (doc.text.lower(), doc.title.lower())
    # Indicator substrings per template type; earlier entries win.
    type_indicators = {
        "privacy_policy": ["datenschutz", "privacy", "personal data", "personenbezogen"],
        "terms_of_service": ["nutzungsbedingungen", "terms of service", "terms of use", "agb"],
        "cookie_banner": ["cookie", "cookies", "tracking"],
        "impressum": ["impressum", "legal notice", "imprint"],
        "widerruf": ["widerruf", "cancellation", "withdrawal", "right to cancel"],
        "dpa": ["auftragsverarbeitung", "data processing agreement", "dpa"],
        "sla": ["service level", "availability", "uptime"],
        "nda": ["confidential", "non-disclosure", "geheimhaltung", "vertraulich"],
        "community_guidelines": ["community", "guidelines", "conduct", "verhaltens"],
        "acceptable_use": ["acceptable use", "acceptable usage", "nutzungsrichtlinien"],
    }
    for template_type, indicators in type_indicators.items():
        if any(ind in hay for ind in indicators for hay in haystacks):
            return template_type
    # No textual match: fall back to the source's first configured type.
    if source.template_types:
        return source.template_types[0]
    return "clause"  # Generic fallback
def _infer_clause_category(self, text: str) -> Optional[str]:
"""Infer the clause category from text content."""
text_lower = text.lower()
categories = {
"haftung": ["haftung", "liability", "haftungsausschluss", "limitation"],
"datenschutz": ["datenschutz", "privacy", "personal data", "personenbezogen"],
"widerruf": ["widerruf", "cancellation", "withdrawal"],
"gewaehrleistung": ["gewaehrleistung", "warranty", "garantie"],
"kuendigung": ["kuendigung", "termination", "beendigung"],
"zahlung": ["zahlung", "payment", "preis", "price"],
"gerichtsstand": ["gerichtsstand", "jurisdiction", "governing law"],
"aenderungen": ["aenderung", "modification", "amendment"],
"schlussbestimmungen": ["schlussbestimmung", "miscellaneous", "final provisions"],
}
for category, indicators in categories.items():
for indicator in indicators:
if indicator in text_lower:
return category
return None
def _create_chunks(
    self,
    doc: ExtractedDocument,
    source: SourceConfig,
) -> List[TemplateChunk]:
    """Create template chunks from an extracted document.

    The document text is split via `_chunk_text`, and each piece is
    wrapped in a `TemplateChunk` carrying the license, provenance and
    usage metadata taken from the source configuration.
    """
    license_info = source.license_info
    template_type = self._infer_template_type(doc, source)
    # Chunk the text
    text_chunks = self._chunk_text(doc.text)
    chunks = []
    for i, chunk_text in enumerate(text_chunks):
        # Determine if this is a complete document or a clause:
        # a single sizeable chunk means the document was never split.
        is_complete = len(text_chunks) == 1 and len(chunk_text) > 500
        is_modular = len(doc.sections) > 0 or '##' in doc.text
        requires_customization = len(doc.placeholders) > 0
        # Generate attribution text only when the license demands it.
        attribution_text = None
        if license_info.attribution_required:
            attribution_text = license_info.get_attribution_text(
                source.name,
                doc.source_url or source.get_source_url()
            )
        chunk = TemplateChunk(
            text=chunk_text,
            chunk_index=i,
            document_title=doc.title,
            template_type=template_type,
            clause_category=self._infer_clause_category(chunk_text),
            language=doc.language,
            jurisdiction=source.jurisdiction,
            license_id=license_info.id.value,
            license_name=license_info.name,
            license_url=license_info.url,
            attribution_required=license_info.attribution_required,
            share_alike=license_info.share_alike,
            no_derivatives=license_info.no_derivatives,
            commercial_use=license_info.commercial_use,
            source_name=source.name,
            source_url=doc.source_url or source.get_source_url(),
            source_repo=source.repo_url,
            source_commit=doc.source_commit,
            source_file=doc.file_path,
            source_hash=doc.source_hash,
            attribution_text=attribution_text,
            copyright_notice=None,  # Could be extracted from doc if present
            is_complete_document=is_complete,
            is_modular=is_modular,
            requires_customization=requires_customization,
            placeholders=doc.placeholders,
            training_allowed=license_info.training_allowed,
            output_allowed=license_info.output_allowed,
            modification_allowed=license_info.modification_allowed,
            distortion_prohibited=license_info.distortion_prohibited,
        )
        chunks.append(chunk)
    return chunks
async def ingest_source(self, source: SourceConfig) -> IngestionStatus:
"""Ingest a single source into Qdrant."""
status = IngestionStatus(
@@ -405,7 +160,7 @@ class LegalTemplatesIngestion:
# Create chunks from all documents
all_chunks: List[TemplateChunk] = []
for doc in documents:
chunks = self._create_chunks(doc, source)
chunks = create_chunks(doc, source, CHUNK_SIZE, CHUNK_OVERLAP)
all_chunks.extend(chunks)
status.chunks_created += len(chunks)
@@ -637,21 +392,7 @@ class LegalTemplatesIngestion:
attribution_required: Optional[bool] = None,
top_k: int = 10,
) -> List[Dict[str, Any]]:
"""
Search the legal templates collection.
Args:
query: Search query text
template_type: Filter by template type (e.g., "privacy_policy")
license_types: Filter by license types (e.g., ["cc0", "mit"])
language: Filter by language (e.g., "de")
jurisdiction: Filter by jurisdiction (e.g., "DE")
attribution_required: Filter by attribution requirement
top_k: Number of results to return
Returns:
List of search results with full metadata
"""
"""Search the legal templates collection."""
# Generate query embedding
embeddings = await self._generate_embeddings([query])
query_vector = embeddings[0]
@@ -661,45 +402,27 @@ class LegalTemplatesIngestion:
if template_type:
must_conditions.append(
FieldCondition(
key="template_type",
match=MatchValue(value=template_type),
)
FieldCondition(key="template_type", match=MatchValue(value=template_type))
)
if language:
must_conditions.append(
FieldCondition(
key="language",
match=MatchValue(value=language),
)
FieldCondition(key="language", match=MatchValue(value=language))
)
if jurisdiction:
must_conditions.append(
FieldCondition(
key="jurisdiction",
match=MatchValue(value=jurisdiction),
)
FieldCondition(key="jurisdiction", match=MatchValue(value=jurisdiction))
)
if attribution_required is not None:
must_conditions.append(
FieldCondition(
key="attribution_required",
match=MatchValue(value=attribution_required),
)
FieldCondition(key="attribution_required", match=MatchValue(value=attribution_required))
)
# License type filter (OR condition)
should_conditions = []
if license_types:
for license_type in license_types:
for lt in license_types:
should_conditions.append(
FieldCondition(
key="license_id",
match=MatchValue(value=license_type),
)
FieldCondition(key="license_id", match=MatchValue(value=lt))
)
# Construct filter
@@ -747,196 +470,31 @@ class LegalTemplatesIngestion:
def delete_source(self, source_name: str) -> int:
    """Delete all chunks from a specific source.

    Counts the matching points first (delete does not report how many it
    removed), then deletes them by the same filter.

    Fix: the flattened diff left two `must=` keywords inside each
    Filter(...) call (old multi-line and new one-line form side by
    side), which is a syntax error; only one remains, and the filter
    is built once and reused.

    Args:
        source_name: Value of the `source_name` payload field to match.

    Returns:
        Number of chunks that matched the filter (and were deleted).
    """
    source_filter = Filter(
        must=[FieldCondition(key="source_name", match=MatchValue(value=source_name))]
    )
    # Count first so the caller can be told how many chunks were removed.
    count_result = self.qdrant.count(
        collection_name=LEGAL_TEMPLATES_COLLECTION,
        count_filter=source_filter,
    )
    # Delete everything matching the same filter.
    self.qdrant.delete(
        collection_name=LEGAL_TEMPLATES_COLLECTION,
        points_selector=source_filter,
    )
    return count_result.count
def reset_collection(self):
    """Delete and recreate the collection.

    Also clears the cached per-source ingestion status, since it refers
    to data that no longer exists after the reset.

    Fix: removed a stray duplicate `pass` statement left behind by the
    flattened diff.
    """
    logger.warning(f"Resetting collection: {LEGAL_TEMPLATES_COLLECTION}")
    try:
        self.qdrant.delete_collection(LEGAL_TEMPLATES_COLLECTION)
    except Exception:
        pass  # Collection might not exist yet; a failed delete is fine.
    # Recreate the empty collection and reset the bookkeeping.
    self._ensure_collection()
    self._ingestion_status.clear()
    logger.info(f"Collection {LEGAL_TEMPLATES_COLLECTION} reset")
async def close(self):
"""Close HTTP client."""
await self.http_client.aclose()
async def main():
    """CLI entry point.

    Parses command-line flags and dispatches to exactly one action
    (reset, delete-source, status, ingest-all, ingest-source,
    ingest-license, or search); prints help when no action flag is
    given. The ingestion client is always closed on exit.
    """
    import argparse
    parser = argparse.ArgumentParser(description="Legal Templates Ingestion")
    parser.add_argument(
        "--ingest-all",
        action="store_true",
        help="Ingest all enabled sources"
    )
    parser.add_argument(
        "--ingest-source",
        type=str,
        metavar="NAME",
        help="Ingest a specific source by name"
    )
    parser.add_argument(
        "--ingest-license",
        type=str,
        choices=["cc0", "mit", "cc_by_4", "public_domain"],
        help="Ingest all sources of a specific license type"
    )
    parser.add_argument(
        "--max-priority",
        type=int,
        default=3,
        help="Maximum priority level to ingest (1=highest, 5=lowest)"
    )
    parser.add_argument(
        "--status",
        action="store_true",
        help="Show collection status"
    )
    parser.add_argument(
        "--search",
        type=str,
        metavar="QUERY",
        help="Test search query"
    )
    parser.add_argument(
        "--template-type",
        type=str,
        help="Filter search by template type"
    )
    parser.add_argument(
        "--language",
        type=str,
        help="Filter search by language"
    )
    parser.add_argument(
        "--reset",
        action="store_true",
        help="Reset (delete and recreate) the collection"
    )
    parser.add_argument(
        "--delete-source",
        type=str,
        metavar="NAME",
        help="Delete all chunks from a source"
    )
    args = parser.parse_args()
    ingestion = LegalTemplatesIngestion()
    try:
        # Exactly one action runs; precedence follows this if/elif chain.
        if args.reset:
            ingestion.reset_collection()
            print("Collection reset successfully")
        elif args.delete_source:
            count = ingestion.delete_source(args.delete_source)
            print(f"Deleted {count} chunks from {args.delete_source}")
        elif args.status:
            status = ingestion.get_status()
            print(json.dumps(status, indent=2, default=str))
        elif args.ingest_all:
            print(f"Ingesting all sources (max priority: {args.max_priority})...")
            results = await ingestion.ingest_all(max_priority=args.max_priority)
            print("\nResults:")
            for name, status in results.items():
                print(f" {name}: {status.chunks_indexed} chunks ({status.status})")
                if status.errors:
                    for error in status.errors:
                        print(f" ERROR: {error}")
            total = sum(s.chunks_indexed for s in results.values())
            print(f"\nTotal: {total} chunks indexed")
        elif args.ingest_source:
            # Resolve the named source from the static registry.
            source = next(
                (s for s in TEMPLATE_SOURCES if s.name == args.ingest_source),
                None
            )
            if not source:
                print(f"Unknown source: {args.ingest_source}")
                print("Available sources:")
                for s in TEMPLATE_SOURCES:
                    print(f" - {s.name}")
                return
            print(f"Ingesting: {source.name}")
            status = await ingestion.ingest_source(source)
            print(f"\nResult: {status.chunks_indexed} chunks ({status.status})")
            if status.errors:
                for error in status.errors:
                    print(f" ERROR: {error}")
        elif args.ingest_license:
            license_type = LicenseType(args.ingest_license)
            print(f"Ingesting all {license_type.value} sources...")
            results = await ingestion.ingest_by_license(license_type)
            print("\nResults:")
            for name, status in results.items():
                print(f" {name}: {status.chunks_indexed} chunks ({status.status})")
        elif args.search:
            print(f"Searching: {args.search}")
            results = await ingestion.search(
                args.search,
                template_type=args.template_type,
                language=args.language,
            )
            print(f"\nFound {len(results)} results:")
            for i, result in enumerate(results, 1):
                print(f"\n{i}. [{result['template_type']}] {result['document_title']}")
                print(f" Score: {result['score']:.3f}")
                print(f" License: {result['license_name']}")
                print(f" Source: {result['source_name']}")
                print(f" Language: {result['language']}")
                if result['attribution_required']:
                    print(f" Attribution: {result['attribution_text']}")
                print(f" Text: {result['text'][:200]}...")
        else:
            parser.print_help()
    finally:
        # Always release the HTTP client, even when an action failed.
        await ingestion.close()


if __name__ == "__main__":
    asyncio.run(main())