[split-required] Split 500-850 LOC files (batch 2)
backend-lehrer (10 files): - game/database.py (785 → 5), correction_api.py (683 → 4) - classroom_engine/antizipation.py (676 → 5) - llm_gateway schools/edu_search already done in prior batch klausur-service (12 files): - orientation_crop_api.py (694 → 5), pdf_export.py (677 → 4) - zeugnis_crawler.py (676 → 5), grid_editor_api.py (671 → 5) - eh_templates.py (658 → 5), mail/api.py (651 → 5) - qdrant_service.py (638 → 5), training_api.py (625 → 4) website (6 pages): - middleware (696 → 8), mail (733 → 6), consent (628 → 8) - compliance/risks (622 → 5), export (502 → 5), brandbook (629 → 7) studio-v2 (3 components): - B2BMigrationWizard (848 → 3), CleanupPanel (765 → 2) - dashboard-experimental (739 → 2) admin-lehrer (4 files): - uebersetzungen (769 → 4), manager (670 → 2) - ChunkBrowserQA (675 → 6), dsfa/page (674 → 5) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
110
klausur-service/backend/zeugnis_text.py
Normal file
110
klausur-service/backend/zeugnis_text.py
Normal file
@@ -0,0 +1,110 @@
|
||||
"""
|
||||
Zeugnis Crawler - Text extraction, chunking, and hashing utilities.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
from typing import List
|
||||
|
||||
# Default chunking parameters for chunk_text(): target chunk length in
# characters, and how many trailing characters of each chunk are repeated
# at the start of the following chunk to preserve context across chunks.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
|
||||
|
||||
|
||||
def extract_text_from_pdf(content: bytes) -> str:
    """Extract plain text from a PDF document given as raw bytes.

    Pages that yield no extractable text are skipped; the remaining page
    texts are joined with blank lines. Best-effort: on any failure
    (including PyPDF2 not being installed) the error is printed and an
    empty string is returned instead of raising.
    """
    try:
        import io

        from PyPDF2 import PdfReader

        reader = PdfReader(io.BytesIO(content))
        # Keep only pages whose extraction produced non-empty text.
        page_texts = [t for page in reader.pages if (t := page.extract_text())]
        return "\n\n".join(page_texts)
    except Exception as e:
        print(f"PDF extraction failed: {e}")
        return ""
|
||||
|
||||
|
||||
def extract_text_from_html(content: bytes, encoding: str = "utf-8") -> str:
    """Extract visible text from an HTML document given as raw bytes.

    The bytes are decoded with *encoding* (undecodable bytes replaced),
    boilerplate elements (script/style/nav/header/footer) are removed,
    and the remaining text is normalized to one stripped, non-empty line
    per source line. Best-effort: on any failure (including bs4 not
    being installed) the error is printed and an empty string is
    returned instead of raising.
    """
    try:
        from bs4 import BeautifulSoup

        markup = content.decode(encoding, errors="replace")
        soup = BeautifulSoup(markup, "html.parser")

        # Boilerplate tags contribute no useful body text — drop them.
        for tag in soup(["script", "style", "nav", "header", "footer"]):
            tag.decompose()

        raw = soup.get_text(separator="\n", strip=True)

        # Collapse whitespace: keep only non-blank, stripped lines.
        return "\n".join(
            stripped
            for line in raw.splitlines()
            if (stripped := line.strip())
        )
    except Exception as e:
        print(f"HTML extraction failed: {e}")
        return ""
|
||||
|
||||
|
||||
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split *text* into chunks of at most ``chunk_size`` characters.

    Splitting tries progressively finer separators (paragraph, line,
    sentence, word) and only hard-splits mid-token as a last resort.
    When ``overlap > 0``, the trailing ``overlap`` characters of each
    chunk are prepended to the next chunk, so consecutive chunks share
    context (overlapped chunks may therefore exceed ``chunk_size``).

    Args:
        text: Input text; empty input yields an empty list.
        chunk_size: Maximum chunk length before overlap is applied.
        overlap: Number of trailing characters of the previous chunk to
            repeat at the start of the next one.

    Returns:
        List of non-empty chunks in document order.
    """
    if not text:
        return []

    separators = ["\n\n", "\n", ". ", " "]

    # Fix: guard the hard-split stride. With overlap >= chunk_size the
    # original stride (chunk_size - overlap) was <= 0 and range() raised
    # an opaque ValueError deep inside the recursion.
    force_step = max(1, chunk_size - overlap)

    def split_recursive(segment: str, sep_index: int = 0) -> List[str]:
        """Split one segment using separators[sep_index:], finest last."""
        if len(segment) <= chunk_size:
            return [segment] if segment.strip() else []

        if sep_index >= len(separators):
            # No separator left: hard-split at fixed offsets. NOTE: the
            # stride already repeats `overlap` characters here, and the
            # final overlap pass below adds overlap again — preserved
            # from the original implementation.
            result = []
            for i in range(0, len(segment), force_step):
                piece = segment[i:i + chunk_size]
                if piece.strip():
                    result.append(piece)
            return result

        sep = separators[sep_index]
        parts = segment.split(sep)
        result: List[str] = []
        current = ""

        for part in parts:
            if len(current) + len(sep) + len(part) <= chunk_size:
                # Re-join parts with their separator while they still fit.
                current = current + sep + part if current else part
            else:
                if current.strip():
                    result.extend(
                        split_recursive(current, sep_index + 1)
                        if len(current) > chunk_size
                        else [current]
                    )
                current = part

        # Flush the trailing accumulator.
        if current.strip():
            result.extend(
                split_recursive(current, sep_index + 1)
                if len(current) > chunk_size
                else [current]
            )

        return result

    # (Removed the dead `chunks = []` initializer the original had here —
    # it was unconditionally overwritten by this assignment.)
    chunks = split_recursive(text)

    # Prepend the tail of each previous chunk for overlapping context.
    if overlap > 0 and len(chunks) > 1:
        chunks = [
            chunk if i == 0 else chunks[i - 1][-overlap:] + chunk
            for i, chunk in enumerate(chunks)
        ]

    return chunks
|
||||
|
||||
|
||||
def compute_hash(content: bytes) -> str:
    """Return the hex-encoded SHA-256 digest of *content*."""
    digest = hashlib.sha256()
    digest.update(content)
    return digest.hexdigest()
|
||||
Reference in New Issue
Block a user