# breakpilot-lehrer/klausur-service/backend/zeugnis/text.py

"""
Zeugnis Crawler - Text extraction, chunking, and hashing utilities.
"""

import hashlib
from typing import List

# Default target chunk length, in characters, for chunk_text(); measured
# before the overlap prefix is prepended.
CHUNK_SIZE = 1000
# Default overlap, in characters, between consecutive chunks produced by
# chunk_text().
CHUNK_OVERLAP = 200


def extract_text_from_pdf(content: bytes) -> str:
    """Return the text of a PDF given as raw bytes.

    The text of every page that yields non-empty output is joined with a
    blank line between pages. Extraction is best-effort: any failure
    (including PyPDF2 not being importable) is printed and an empty string
    is returned instead of raising.
    """
    try:
        # Imported lazily, inside the try, so a missing dependency is handled
        # like any other extraction failure.
        import io
        from PyPDF2 import PdfReader

        pdf = PdfReader(io.BytesIO(content))
        page_texts = (page.extract_text() for page in pdf.pages)
        # Pages that produce no text (None or "") are skipped.
        return "\n\n".join(page_text for page_text in page_texts if page_text)
    except Exception as e:
        print(f"PDF extraction failed: {e}")
        return ""


def extract_text_from_html(content: bytes, encoding: str = "utf-8") -> str:
    """Return the visible text of an HTML document given as raw bytes.

    The bytes are decoded with *encoding* (undecodable bytes are replaced),
    script/style/nav/header/footer elements are dropped, and the remaining
    text is returned one non-empty, stripped line per row. Extraction is
    best-effort: any failure (including bs4 not being importable) is printed
    and an empty string is returned instead of raising.
    """
    try:
        # Imported lazily, inside the try, so a missing dependency is handled
        # like any other extraction failure.
        from bs4 import BeautifulSoup

        markup = content.decode(encoding, errors="replace")
        document = BeautifulSoup(markup, "html.parser")

        # Drop elements whose content is not wanted in the extracted text.
        unwanted_tags = ["script", "style", "nav", "header", "footer"]
        for tag in document.find_all(unwanted_tags):
            tag.decompose()

        raw_text = document.get_text(separator="\n", strip=True)

        # Normalise whitespace: strip every line and discard the empty ones.
        stripped_lines = (line.strip() for line in raw_text.splitlines())
        return "\n".join(line for line in stripped_lines if line)
    except Exception as e:
        print(f"HTML extraction failed: {e}")
        return ""


def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split text into overlapping chunks.

    The text is split recursively on progressively finer separators
    (paragraph break, line break, sentence end, space). Adjacent pieces are
    merged back together as long as the merged piece stays within
    *chunk_size* characters. A piece that contains none of the separators and
    is still too long is cut into adjacent windows of *chunk_size* characters.

    After splitting, every chunk except the first is prefixed with the last
    *overlap* characters of the chunk before it, so a returned chunk can be up
    to ``chunk_size + overlap`` characters long.

    Args:
        text: The text to split.
        chunk_size: Maximum chunk length in characters before the overlap
            prefix is added. Must be positive.
        overlap: Number of trailing characters of the previous chunk to
            prepend to each following chunk. Values <= 0 disable overlap.

    Returns:
        The chunks in document order; an empty list for empty text.
        Whitespace-only pieces are dropped.

    Raises:
        ValueError: If *text* is non-empty and *chunk_size* is not positive.
    """
    if not text:
        return []
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    separators = ["\n\n", "\n", ". ", " "]

    def split_recursive(segment: str, sep_index: int = 0) -> List[str]:
        """Split *segment* using separators[sep_index:] and then hard windows."""
        if len(segment) <= chunk_size:
            return [segment] if segment.strip() else []

        if sep_index >= len(separators):
            # No separator left: cut into adjacent, non-overlapping windows.
            # Overlap is added exactly once, for all chunks, by the pass at
            # the end of chunk_text(); overlapping the windows here as well
            # would duplicate text inside the resulting chunks (and a step of
            # chunk_size - overlap breaks when overlap >= chunk_size).
            windows = (
                segment[start:start + chunk_size]
                for start in range(0, len(segment), chunk_size)
            )
            return [window for window in windows if window.strip()]

        def finalize(piece: str) -> List[str]:
            """Turn a merged piece into chunks, re-splitting it if too long."""
            if not piece.strip():
                return []
            if len(piece) > chunk_size:
                # Only a single oversized part can end up here; retry it with
                # the next, finer separator.
                return split_recursive(piece, sep_index + 1)
            return [piece]

        sep = separators[sep_index]
        result: List[str] = []
        current = ""

        for part in segment.split(sep):
            # Greedily merge parts while the merged piece fits the budget.
            if len(current) + len(sep) + len(part) <= chunk_size:
                current = current + sep + part if current else part
            else:
                result.extend(finalize(current))
                current = part

        result.extend(finalize(current))
        return result

    chunks = split_recursive(text)

    # Add overlap: prefix each chunk with the tail of its (un-prefixed)
    # predecessor.
    # NOTE: the separator consumed at a chunk boundary is not restored, so the
    # prefix is joined directly to the start of the next chunk.
    if overlap > 0 and len(chunks) > 1:
        chunks = [chunks[0]] + [
            previous[-overlap:] + following
            for previous, following in zip(chunks, chunks[1:])
        ]

    return chunks


def compute_hash(content: bytes) -> str:
    """Return the SHA-256 digest of *content* as a hexadecimal string."""
    digest = hashlib.sha256()
    digest.update(content)
    return digest.hexdigest()