Files
breakpilot-lehrer/klausur-service/backend/zeugnis/text.py
Benjamin Admin 165c493d1e
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m22s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 23s
Restructure: Move 52 files into 7 domain packages
korrektur/ zeugnis/ admin/ compliance/ worksheet/ training/ metrics/
52 shims, relative imports, RAG untouched.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 22:10:48 +02:00

111 lines
3.2 KiB
Python

"""
Zeugnis Crawler - Text extraction, chunking, and hashing utilities.
"""
import hashlib
from typing import List
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
def extract_text_from_pdf(content: bytes) -> str:
    """Extract plain text from PDF bytes; returns "" on any failure.

    Best-effort: a missing PyPDF2 install or an unparseable document is
    logged and swallowed so the crawler can continue with other files.
    """
    try:
        import io

        from PyPDF2 import PdfReader

        pages = PdfReader(io.BytesIO(content)).pages
        # Keep only pages that actually yielded text, joined as paragraphs.
        extracted = [t for t in (page.extract_text() for page in pages) if t]
        return "\n\n".join(extracted)
    except Exception as e:
        print(f"PDF extraction failed: {e}")
        return ""
def extract_text_from_html(content: bytes, encoding: str = "utf-8") -> str:
    """Extract visible text from HTML bytes; returns "" on any failure.

    Decodes with the given encoding (undecodable bytes are replaced),
    strips boilerplate elements, and normalizes whitespace to one
    non-empty line per text fragment.
    """
    try:
        from bs4 import BeautifulSoup

        markup = content.decode(encoding, errors="replace")
        soup = BeautifulSoup(markup, "html.parser")
        # Drop non-content elements before extracting text.
        for tag in soup(["script", "style", "nav", "header", "footer"]):
            tag.decompose()
        raw = soup.get_text(separator="\n", strip=True)
        # Keep only non-blank lines, trimmed.
        stripped = (line.strip() for line in raw.splitlines())
        return "\n".join(line for line in stripped if line)
    except Exception as e:
        print(f"HTML extraction failed: {e}")
        return ""
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split text into overlapping chunks.

    Tries to break on natural boundaries in order of preference
    (paragraphs, lines, sentences, words) before falling back to a hard
    character split. After splitting, the last ``overlap`` characters of
    each chunk are prepended to the following chunk so context is shared
    across chunk boundaries.

    Args:
        text: Text to split. Empty/falsy input yields an empty list.
        chunk_size: Target maximum chunk length in characters.
        overlap: Characters of context shared between adjacent chunks.

    Returns:
        List of non-blank chunk strings. Chunks may slightly exceed
        ``chunk_size`` once the overlap prefix is added.
    """
    if not text:
        return []
    separators = ["\n\n", "\n", ". ", " "]

    def split_recursive(text: str, sep_index: int = 0) -> List[str]:
        if len(text) <= chunk_size:
            return [text] if text.strip() else []
        if sep_index >= len(separators):
            # No separators left: force a hard split every chunk_size chars.
            # Guard the stride: overlap >= chunk_size would otherwise give a
            # step of 0 (ValueError) or a negative step (empty range, text
            # silently dropped).
            step = max(1, chunk_size - overlap)
            result = []
            for i in range(0, len(text), step):
                chunk = text[i:i + chunk_size]
                if chunk.strip():
                    result.append(chunk)
            return result
        sep = separators[sep_index]
        parts = text.split(sep)
        result = []
        current = ""
        for part in parts:
            if len(current) + len(sep) + len(part) <= chunk_size:
                current = current + sep + part if current else part
            else:
                if current.strip():
                    # A still-oversized accumulation is re-split with the
                    # next, finer-grained separator.
                    result.extend(split_recursive(current, sep_index + 1) if len(current) > chunk_size else [current])
                current = part
        if current.strip():
            result.extend(split_recursive(current, sep_index + 1) if len(current) > chunk_size else [current])
        return result

    chunks = split_recursive(text)
    # Prepend the tail of each previous chunk to share context at boundaries.
    if overlap > 0 and len(chunks) > 1:
        overlapped = []
        for i, chunk in enumerate(chunks):
            if i > 0:
                chunk = chunks[i - 1][-overlap:] + chunk
            overlapped.append(chunk)
        chunks = overlapped
    return chunks
def compute_hash(content: bytes) -> str:
    """Return the hex-encoded SHA-256 digest of *content*."""
    digest = hashlib.sha256()
    digest.update(content)
    return digest.hexdigest()