fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
455
klausur-service/backend/legal_corpus_robust.py
Normal file
455
klausur-service/backend/legal_corpus_robust.py
Normal file
@@ -0,0 +1,455 @@
|
||||
"""
|
||||
Robust Legal Corpus Ingestion for UCCA RAG Integration.
|
||||
|
||||
This version handles large documents and unstable embedding services by:
|
||||
- Processing one text at a time
|
||||
- Health checks before each embedding
|
||||
- Automatic retry with exponential backoff
|
||||
- Progress tracking for resume capability
|
||||
- Longer delays to prevent service overload
|
||||
|
||||
Usage:
|
||||
python legal_corpus_robust.py --ingest DPF
|
||||
python legal_corpus_robust.py --ingest-all-missing
|
||||
python legal_corpus_robust.py --status
|
||||
"""
|
||||
|
||||
import asyncio
import hashlib
import json
import logging
import os
import re
import sys
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import httpx
from qdrant_client import QdrantClient
from qdrant_client.models import (
    Distance,
    FieldCondition,
    Filter,
    MatchValue,
    PointStruct,
    VectorParams,
)
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)s - %(message)s"
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Configuration
|
||||
QDRANT_HOST = os.getenv("QDRANT_HOST", "localhost")
|
||||
QDRANT_PORT = int(os.getenv("QDRANT_PORT", "6333"))
|
||||
EMBEDDING_SERVICE_URL = os.getenv("EMBEDDING_SERVICE_URL", "http://localhost:8087")
|
||||
LEGAL_CORPUS_COLLECTION = "bp_legal_corpus"
|
||||
VECTOR_SIZE = 1024
|
||||
CHUNK_SIZE = 800
|
||||
CHUNK_OVERLAP = 150
|
||||
|
||||
# Robust settings
|
||||
MAX_RETRIES = 5
|
||||
INITIAL_DELAY = 2.0
|
||||
DELAY_BETWEEN_EMBEDDINGS = 2.0
|
||||
HEALTH_CHECK_INTERVAL = 10 # Check health every N embeddings
|
||||
|
||||
|
||||
@dataclass
|
||||
class Regulation:
|
||||
"""Regulation metadata."""
|
||||
code: str
|
||||
name: str
|
||||
full_name: str
|
||||
regulation_type: str
|
||||
source_url: str
|
||||
description: str
|
||||
language: str = "de"
|
||||
|
||||
|
||||
# Regulations that need robust loading
|
||||
ROBUST_REGULATIONS: List[Regulation] = [
|
||||
Regulation(
|
||||
code="DPF",
|
||||
name="EU-US Data Privacy Framework",
|
||||
full_name="Durchführungsbeschluss (EU) 2023/1795",
|
||||
regulation_type="eu_regulation",
|
||||
source_url="https://eur-lex.europa.eu/eli/dec_impl/2023/1795/oj",
|
||||
description="Angemessenheitsbeschluss für USA-Transfers.",
|
||||
),
|
||||
Regulation(
|
||||
code="BSI-TR-03161-1",
|
||||
name="BSI-TR-03161 Teil 1",
|
||||
full_name="BSI Technische Richtlinie - Allgemeine Anforderungen",
|
||||
regulation_type="bsi_standard",
|
||||
source_url="https://www.bsi.bund.de/SharedDocs/Downloads/DE/BSI/Publikationen/TechnischeRichtlinien/TR03161/BSI-TR-03161-1.pdf",
|
||||
description="Allgemeine Sicherheitsanforderungen (45 Prüfaspekte).",
|
||||
),
|
||||
Regulation(
|
||||
code="BSI-TR-03161-2",
|
||||
name="BSI-TR-03161 Teil 2",
|
||||
full_name="BSI Technische Richtlinie - Web-Anwendungen",
|
||||
regulation_type="bsi_standard",
|
||||
source_url="https://www.bsi.bund.de/SharedDocs/Downloads/DE/BSI/Publikationen/TechnischeRichtlinien/TR03161/BSI-TR-03161-2.pdf",
|
||||
description="Web-Sicherheit (40 Prüfaspekte).",
|
||||
),
|
||||
Regulation(
|
||||
code="BSI-TR-03161-3",
|
||||
name="BSI-TR-03161 Teil 3",
|
||||
full_name="BSI Technische Richtlinie - Hintergrundsysteme",
|
||||
regulation_type="bsi_standard",
|
||||
source_url="https://www.bsi.bund.de/SharedDocs/Downloads/DE/BSI/Publikationen/TechnischeRichtlinien/TR03161/BSI-TR-03161-3.pdf",
|
||||
description="Backend-Sicherheit (35 Prüfaspekte).",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
class RobustLegalCorpusIngestion:
|
||||
"""Handles robust ingestion of large legal documents."""
|
||||
|
||||
def __init__(self):
|
||||
self.qdrant = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)
|
||||
self.http_client = None
|
||||
self.embeddings_since_health_check = 0
|
||||
self._ensure_collection()
|
||||
|
||||
def _ensure_collection(self):
|
||||
"""Create the legal corpus collection if it doesn't exist."""
|
||||
collections = self.qdrant.get_collections().collections
|
||||
collection_names = [c.name for c in collections]
|
||||
|
||||
if LEGAL_CORPUS_COLLECTION not in collection_names:
|
||||
logger.info(f"Creating collection: {LEGAL_CORPUS_COLLECTION}")
|
||||
self.qdrant.create_collection(
|
||||
collection_name=LEGAL_CORPUS_COLLECTION,
|
||||
vectors_config=VectorParams(
|
||||
size=VECTOR_SIZE,
|
||||
distance=Distance.COSINE,
|
||||
),
|
||||
)
|
||||
|
||||
async def _get_client(self) -> httpx.AsyncClient:
|
||||
"""Get or create HTTP client."""
|
||||
if self.http_client is None:
|
||||
self.http_client = httpx.AsyncClient(timeout=120.0)
|
||||
return self.http_client
|
||||
|
||||
async def _check_embedding_service_health(self) -> bool:
|
||||
"""Check if embedding service is healthy."""
|
||||
try:
|
||||
client = await self._get_client()
|
||||
response = await client.get(f"{EMBEDDING_SERVICE_URL}/health", timeout=10.0)
|
||||
return response.status_code == 200
|
||||
except Exception as e:
|
||||
logger.warning(f"Health check failed: {e}")
|
||||
return False
|
||||
|
||||
async def _wait_for_healthy_service(self, max_wait: int = 60) -> bool:
|
||||
"""Wait for embedding service to become healthy."""
|
||||
logger.info("Waiting for embedding service to become healthy...")
|
||||
start = datetime.now()
|
||||
while (datetime.now() - start).seconds < max_wait:
|
||||
if await self._check_embedding_service_health():
|
||||
logger.info("Embedding service is healthy")
|
||||
return True
|
||||
await asyncio.sleep(5)
|
||||
logger.error("Embedding service did not become healthy")
|
||||
return False
|
||||
|
||||
async def _generate_single_embedding(self, text: str) -> Optional[List[float]]:
|
||||
"""Generate embedding for a single text with robust retry."""
|
||||
for attempt in range(MAX_RETRIES):
|
||||
try:
|
||||
# Health check periodically
|
||||
self.embeddings_since_health_check += 1
|
||||
if self.embeddings_since_health_check >= HEALTH_CHECK_INTERVAL:
|
||||
if not await self._check_embedding_service_health():
|
||||
await self._wait_for_healthy_service()
|
||||
self.embeddings_since_health_check = 0
|
||||
|
||||
client = await self._get_client()
|
||||
response = await client.post(
|
||||
f"{EMBEDDING_SERVICE_URL}/embed",
|
||||
json={"texts": [text]},
|
||||
timeout=60.0,
|
||||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
return data["embeddings"][0]
|
||||
|
||||
except Exception as e:
|
||||
delay = INITIAL_DELAY * (2 ** attempt)
|
||||
logger.warning(f"Embedding attempt {attempt + 1}/{MAX_RETRIES} failed: {e}")
|
||||
logger.info(f"Waiting {delay}s before retry...")
|
||||
|
||||
# Close and recreate client on connection errors
|
||||
if "disconnect" in str(e).lower() or "connection" in str(e).lower():
|
||||
if self.http_client:
|
||||
await self.http_client.aclose()
|
||||
self.http_client = None
|
||||
# Wait for service to recover
|
||||
await asyncio.sleep(delay)
|
||||
if not await self._wait_for_healthy_service():
|
||||
continue
|
||||
else:
|
||||
await asyncio.sleep(delay)
|
||||
|
||||
logger.error(f"Failed to generate embedding after {MAX_RETRIES} attempts")
|
||||
return None
|
||||
|
||||
def _chunk_text_semantic(self, text: str) -> List[Tuple[str, int]]:
|
||||
"""Chunk text semantically, respecting German sentence boundaries."""
|
||||
sentence_endings = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ])')
|
||||
sentences = sentence_endings.split(text)
|
||||
|
||||
chunks = []
|
||||
current_chunk = []
|
||||
current_length = 0
|
||||
chunk_start = 0
|
||||
position = 0
|
||||
|
||||
for sentence in sentences:
|
||||
sentence = sentence.strip()
|
||||
if not sentence:
|
||||
continue
|
||||
|
||||
sentence_length = len(sentence)
|
||||
|
||||
if current_length + sentence_length > CHUNK_SIZE and current_chunk:
|
||||
chunk_text = " ".join(current_chunk)
|
||||
chunks.append((chunk_text, chunk_start))
|
||||
|
||||
# Keep some sentences for overlap
|
||||
overlap_sentences = []
|
||||
overlap_length = 0
|
||||
for s in reversed(current_chunk):
|
||||
if overlap_length + len(s) > CHUNK_OVERLAP:
|
||||
break
|
||||
overlap_sentences.insert(0, s)
|
||||
overlap_length += len(s)
|
||||
|
||||
current_chunk = overlap_sentences
|
||||
current_length = overlap_length
|
||||
chunk_start = position - overlap_length
|
||||
|
||||
current_chunk.append(sentence)
|
||||
current_length += sentence_length
|
||||
position += sentence_length + 1
|
||||
|
||||
if current_chunk:
|
||||
chunk_text = " ".join(current_chunk)
|
||||
chunks.append((chunk_text, chunk_start))
|
||||
|
||||
return chunks
|
||||
|
||||
def _extract_article_info(self, text: str) -> Optional[Dict]:
|
||||
"""Extract article number and paragraph from text."""
|
||||
article_match = re.search(r'(?:Artikel|Art\.?)\s+(\d+)', text)
|
||||
paragraph_match = re.search(r'(?:Absatz|Abs\.?)\s+(\d+)', text)
|
||||
|
||||
if article_match:
|
||||
return {
|
||||
"article": article_match.group(1),
|
||||
"paragraph": paragraph_match.group(1) if paragraph_match else None,
|
||||
}
|
||||
return None
|
||||
|
||||
async def _fetch_document_text(self, regulation: Regulation) -> Optional[str]:
|
||||
"""Fetch document text from URL."""
|
||||
logger.info(f"Fetching {regulation.code} from: {regulation.source_url}")
|
||||
try:
|
||||
client = await self._get_client()
|
||||
response = await client.get(
|
||||
regulation.source_url,
|
||||
follow_redirects=True,
|
||||
headers={"Accept": "text/html,application/xhtml+xml"},
|
||||
timeout=60.0,
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
html_content = response.text
|
||||
html_content = re.sub(r'<script[^>]*>.*?</script>', '', html_content, flags=re.DOTALL)
|
||||
html_content = re.sub(r'<style[^>]*>.*?</style>', '', html_content, flags=re.DOTALL)
|
||||
text = re.sub(r'<[^>]+>', ' ', html_content)
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
return text
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to fetch {regulation.code}: {e}")
|
||||
return None
|
||||
|
||||
def get_existing_chunk_count(self, regulation_code: str) -> int:
|
||||
"""Get count of existing chunks for a regulation."""
|
||||
try:
|
||||
result = self.qdrant.count(
|
||||
collection_name=LEGAL_CORPUS_COLLECTION,
|
||||
count_filter=Filter(
|
||||
must=[
|
||||
FieldCondition(
|
||||
key="regulation_code",
|
||||
match=MatchValue(value=regulation_code),
|
||||
)
|
||||
]
|
||||
),
|
||||
)
|
||||
return result.count
|
||||
except:
|
||||
return 0
|
||||
|
||||
async def ingest_regulation_robust(self, regulation: Regulation, resume: bool = True) -> int:
|
||||
"""
|
||||
Ingest a regulation with robust error handling.
|
||||
|
||||
Args:
|
||||
regulation: The regulation to ingest
|
||||
resume: If True, skip already indexed chunks
|
||||
|
||||
Returns:
|
||||
Number of chunks indexed
|
||||
"""
|
||||
logger.info(f"=== Starting robust ingestion for {regulation.code} ===")
|
||||
|
||||
# Check existing chunks
|
||||
existing_count = self.get_existing_chunk_count(regulation.code)
|
||||
logger.info(f"Existing chunks for {regulation.code}: {existing_count}")
|
||||
|
||||
# Fetch document
|
||||
text = await self._fetch_document_text(regulation)
|
||||
if not text or len(text) < 100:
|
||||
logger.warning(f"No text found for {regulation.code}")
|
||||
return 0
|
||||
|
||||
# Chunk the text
|
||||
chunks = self._chunk_text_semantic(text)
|
||||
total_chunks = len(chunks)
|
||||
logger.info(f"Total chunks to process: {total_chunks}")
|
||||
|
||||
if resume and existing_count >= total_chunks:
|
||||
logger.info(f"{regulation.code} already fully indexed")
|
||||
return existing_count
|
||||
|
||||
# Determine starting point
|
||||
start_idx = existing_count if resume else 0
|
||||
logger.info(f"Starting from chunk {start_idx}")
|
||||
|
||||
indexed = 0
|
||||
for idx, (chunk_text, position) in enumerate(chunks[start_idx:], start=start_idx):
|
||||
# Progress logging
|
||||
if idx % 10 == 0:
|
||||
logger.info(f"Progress: {idx}/{total_chunks} chunks ({idx*100//total_chunks}%)")
|
||||
|
||||
# Generate embedding
|
||||
embedding = await self._generate_single_embedding(chunk_text)
|
||||
if embedding is None:
|
||||
logger.error(f"Failed to embed chunk {idx}, stopping")
|
||||
break
|
||||
|
||||
# Create point
|
||||
point_id = hashlib.md5(f"{regulation.code}-{idx}".encode()).hexdigest()
|
||||
article_info = self._extract_article_info(chunk_text)
|
||||
|
||||
point = PointStruct(
|
||||
id=point_id,
|
||||
vector=embedding,
|
||||
payload={
|
||||
"text": chunk_text,
|
||||
"regulation_code": regulation.code,
|
||||
"regulation_name": regulation.name,
|
||||
"regulation_full_name": regulation.full_name,
|
||||
"regulation_type": regulation.regulation_type,
|
||||
"source_url": regulation.source_url,
|
||||
"chunk_index": idx,
|
||||
"chunk_position": position,
|
||||
"article": article_info.get("article") if article_info else None,
|
||||
"paragraph": article_info.get("paragraph") if article_info else None,
|
||||
"language": regulation.language,
|
||||
"indexed_at": datetime.utcnow().isoformat(),
|
||||
"training_allowed": False,
|
||||
},
|
||||
)
|
||||
|
||||
# Upsert single point
|
||||
self.qdrant.upsert(
|
||||
collection_name=LEGAL_CORPUS_COLLECTION,
|
||||
points=[point],
|
||||
)
|
||||
indexed += 1
|
||||
|
||||
# Delay between embeddings
|
||||
await asyncio.sleep(DELAY_BETWEEN_EMBEDDINGS)
|
||||
|
||||
logger.info(f"=== Completed {regulation.code}: {indexed} new chunks indexed ===")
|
||||
return existing_count + indexed
|
||||
|
||||
def get_status(self) -> Dict:
|
||||
"""Get ingestion status for all robust regulations."""
|
||||
status = {
|
||||
"collection": LEGAL_CORPUS_COLLECTION,
|
||||
"regulations": {},
|
||||
}
|
||||
|
||||
for reg in ROBUST_REGULATIONS:
|
||||
count = self.get_existing_chunk_count(reg.code)
|
||||
status["regulations"][reg.code] = {
|
||||
"name": reg.name,
|
||||
"chunks": count,
|
||||
"status": "complete" if count > 0 else "missing",
|
||||
}
|
||||
|
||||
return status
|
||||
|
||||
async def close(self):
|
||||
"""Close HTTP client."""
|
||||
if self.http_client:
|
||||
await self.http_client.aclose()
|
||||
|
||||
|
||||
async def main():
    """CLI entry point: ingest regulations or report ingestion status."""
    import argparse

    parser = argparse.ArgumentParser(description="Robust Legal Corpus Ingestion")
    parser.add_argument("--ingest", nargs="+", metavar="CODE", help="Ingest specific regulations")
    parser.add_argument("--ingest-all-missing", action="store_true", help="Ingest all missing regulations")
    parser.add_argument("--status", action="store_true", help="Show status")
    parser.add_argument("--no-resume", action="store_true", help="Don't resume from existing chunks")
    args = parser.parse_args()

    ingestion = RobustLegalCorpusIngestion()
    resume = not args.no_resume

    try:
        if args.status:
            # Dump the per-regulation chunk counts as pretty-printed JSON.
            print(json.dumps(ingestion.get_status(), indent=2))

        elif args.ingest_all_missing:
            print("Ingesting all missing regulations...")
            for reg in ROBUST_REGULATIONS:
                # Only regulations with zero indexed chunks count as missing.
                if ingestion.get_existing_chunk_count(reg.code) != 0:
                    continue
                total = await ingestion.ingest_regulation_robust(reg, resume=resume)
                print(f"{reg.code}: {total} chunks")

        elif args.ingest:
            known = {reg.code: reg for reg in ROBUST_REGULATIONS}
            for code in args.ingest:
                reg = known.get(code)
                if reg is None:
                    print(f"Unknown regulation: {code}")
                    continue
                total = await ingestion.ingest_regulation_robust(reg, resume=resume)
                print(f"{code}: {total} chunks")

        else:
            parser.print_help()

    finally:
        # Always release the HTTP client, even on errors.
        await ingestion.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
Reference in New Issue
Block a user