Restructure: Move 52 files into 7 domain packages

korrektur/ zeugnis/ admin/ compliance/ worksheet/ training/ metrics/ 52 shims, relative imports, RAG untouched. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 22:10:48 +02:00
parent 0504d22b8e
commit 165c493d1e
111 changed files with 11859 additions and 11609 deletions
--- a/klausur-service/backend/zeugnis/text.py
+++ b/klausur-service/backend/zeugnis/text.py
@@ -0,0 +1,110 @@
+"""
+Zeugnis Crawler - Text extraction, chunking, and hashing utilities.
+"""
+
+import hashlib
+from typing import List
+
+CHUNK_SIZE = 1000
+CHUNK_OVERLAP = 200
+
+
+def extract_text_from_pdf(content: bytes) -> str:
+    """Return the text of a PDF given as raw bytes.
+
+    The text of every page that yields any is joined with a blank line
+    between pages. Extraction is best-effort: any failure (including a
+    missing PyPDF2 installation) is printed and an empty string is returned.
+    """
+    try:
+        from PyPDF2 import PdfReader
+        import io
+
+        pdf = PdfReader(io.BytesIO(content))
+        page_texts = (page.extract_text() for page in pdf.pages)
+        # Pages that produce no text are skipped entirely.
+        return "\n\n".join(page_text for page_text in page_texts if page_text)
+    except Exception as exc:
+        print(f"PDF extraction failed: {exc}")
+        return ""
+
+
+def extract_text_from_html(content: bytes, encoding: str = "utf-8") -> str:
+    """Return the visible text of an HTML document given as raw bytes.
+
+    The bytes are decoded with ``encoding`` (undecodable bytes are replaced),
+    script/style/nav/header/footer elements are removed, and the remaining
+    text is returned as non-empty, stripped lines joined by newlines.
+    Extraction is best-effort: any failure (including a missing
+    BeautifulSoup installation) is printed and an empty string is returned.
+    """
+    try:
+        from bs4 import BeautifulSoup
+
+        soup = BeautifulSoup(content.decode(encoding, errors="replace"), "html.parser")
+
+        # Drop non-content elements: scripts, styles and page chrome.
+        for tag in soup(["script", "style", "nav", "header", "footer"]):
+            tag.decompose()
+
+        raw_text = soup.get_text(separator="\n", strip=True)
+
+        # Normalise whitespace: strip every line and discard the empty ones.
+        stripped_lines = (line.strip() for line in raw_text.splitlines())
+        return "\n".join(line for line in stripped_lines if line)
+    except Exception as exc:
+        print(f"HTML extraction failed: {exc}")
+        return ""
+
+
+def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
+    """Split text into chunks, each prefixed with the tail of its predecessor.
+
+    The text is split recursively on progressively finer separators
+    (paragraph break, newline, sentence end, space), greedily packing
+    adjacent parts into pieces of at most ``chunk_size`` characters. A part
+    that contains none of the separators and is still too long is cut into
+    fixed windows of ``chunk_size`` characters.
+
+    After splitting, every chunk except the first is prefixed with the last
+    ``overlap`` characters of the preceding (un-prefixed) chunk, so a
+    returned chunk can be up to ``chunk_size + overlap`` characters long.
+    The separator at a chunk boundary is not preserved; the overlap prefix
+    is concatenated directly.
+
+    Args:
+        text: The text to split. Empty text yields an empty list.
+        chunk_size: Maximum length of a chunk before the overlap prefix is
+            added. Must be positive.
+        overlap: Number of trailing characters of the previous chunk to
+            prepend to each chunk. Values <= 0 disable the overlap.
+
+    Returns:
+        The chunks in document order; whitespace-only pieces are dropped.
+
+    Raises:
+        ValueError: If ``chunk_size`` is not positive (and text is non-empty).
+    """
+    if not text:
+        return []
+    if chunk_size <= 0:
+        raise ValueError("chunk_size must be a positive integer")
+
+    separators = ["\n\n", "\n", ". ", " "]
+
+    def split_recursive(segment: str, sep_index: int = 0) -> List[str]:
+        if len(segment) <= chunk_size:
+            return [segment] if segment.strip() else []
+
+        if sep_index >= len(separators):
+            # No separator left: cut into consecutive, non-overlapping
+            # windows. The overlap is added exactly once by the final pass
+            # below; stepping by ``chunk_size - overlap`` here would repeat
+            # the overlapping text twice and would fail or drop text when
+            # ``overlap >= chunk_size``.
+            windows = (
+                segment[start:start + chunk_size]
+                for start in range(0, len(segment), chunk_size)
+            )
+            return [window for window in windows if window.strip()]
+
+        sep = separators[sep_index]
+        result: List[str] = []
+        current = ""
+
+        for part in segment.split(sep):
+            if len(current) + len(sep) + len(part) <= chunk_size:
+                current = current + sep + part if current else part
+            else:
+                # ``current`` is full. It may itself be a single oversized
+                # part, in which case the recursion splits it further with
+                # the next, finer separator.
+                if current.strip():
+                    result.extend(split_recursive(current, sep_index + 1))
+                current = part
+
+        if current.strip():
+            result.extend(split_recursive(current, sep_index + 1))
+
+        return result
+
+    chunks = split_recursive(text)
+
+    # Prefix each chunk with the tail of the previous un-prefixed chunk.
+    if overlap > 0 and len(chunks) > 1:
+        chunks = [chunks[0]] + [
+            previous[-overlap:] + following
+            for previous, following in zip(chunks, chunks[1:])
+        ]
+
+    return chunks
+
+
+def compute_hash(content: bytes) -> str:
+    """Return the SHA-256 digest of ``content`` as a lowercase hex string."""
+    hasher = hashlib.sha256()
+    hasher.update(content)
+    return hasher.hexdigest()