"""
Zeugnis Crawler - Text extraction, chunking, and hashing utilities.
"""

import hashlib
import io
from typing import List

# Default target size (in characters) of a chunk before overlap is added.
CHUNK_SIZE = 1000
# Default number of characters copied from the end of the previous chunk.
CHUNK_OVERLAP = 200


def extract_text_from_pdf(content: bytes) -> str:
    """Extract text from PDF bytes.

    Args:
        content: Raw bytes of a PDF document.

    Returns:
        The text of all pages that yielded text, joined by blank lines.
        Returns an empty string if extraction fails for any reason
        (including PyPDF2 not being importable); the failure is printed.
    """
    try:
        # Imported lazily so the module can be loaded without PyPDF2.
        from PyPDF2 import PdfReader

        reader = PdfReader(io.BytesIO(content))
        text_parts = []
        for page in reader.pages:
            text = page.extract_text()
            # Pages without extractable text are skipped.
            if text:
                text_parts.append(text)
        return "\n\n".join(text_parts)
    except Exception as e:
        # Best-effort: a broken document must not abort the caller.
        print(f"PDF extraction failed: {e}")
        return ""


def extract_text_from_html(content: bytes, encoding: str = "utf-8") -> str:
    """Extract text from HTML bytes.

    Args:
        content: Raw bytes of an HTML document.
        encoding: Encoding used to decode ``content``; undecodable bytes
            are replaced rather than raising.

    Returns:
        The document text with one non-empty, stripped line per row.
        Returns an empty string if extraction fails for any reason
        (including bs4 not being importable); the failure is printed.
    """
    try:
        # Imported lazily so the module can be loaded without bs4.
        from bs4 import BeautifulSoup

        html = content.decode(encoding, errors="replace")
        soup = BeautifulSoup(html, "html.parser")

        # Remove script, style, nav, header and footer elements
        for element in soup(["script", "style", "nav", "header", "footer"]):
            element.decompose()

        # Get text
        text = soup.get_text(separator="\n", strip=True)

        # Clean up whitespace: drop blank lines, trim the rest
        lines = [line.strip() for line in text.splitlines() if line.strip()]
        return "\n".join(lines)
    except Exception as e:
        # Best-effort: a broken document must not abort the caller.
        print(f"HTML extraction failed: {e}")
        return ""


def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split text into overlapping chunks.

    The text is split on progressively finer separators (paragraph, line,
    sentence, word). Pieces are packed greedily into chunks of at most
    ``chunk_size`` characters; text that contains no separator at all is
    cut at fixed ``chunk_size`` intervals. Afterwards every chunk except
    the first is prefixed with the last ``overlap`` characters of the chunk
    before it, so a returned chunk may be up to ``chunk_size + overlap``
    characters long.

    Args:
        text: Text to split.
        chunk_size: Maximum chunk length before overlap is added.
        overlap: Number of characters of the previous chunk to prepend.
            Values ``<= 0`` disable overlap.

    Returns:
        The list of chunks; empty for empty or whitespace-only text.

    Raises:
        ValueError: If ``text`` is non-empty and ``chunk_size`` is not
            positive.
    """
    if not text:
        return []
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    # Ordered from coarsest to finest boundary.
    separators = ["\n\n", "\n", ". ", " "]

    def split_recursive(segment: str, sep_index: int = 0) -> List[str]:
        # Split one segment, starting at the given separator level.
        if len(segment) <= chunk_size:
            return [segment] if segment.strip() else []

        if sep_index >= len(separators):
            # Force split at chunk_size. The stride is exactly chunk_size:
            # overlap is added once, in the final pass below. Striding by
            # (chunk_size - overlap) here would duplicate the overlap
            # region and breaks when overlap >= chunk_size.
            pieces = (
                segment[start:start + chunk_size]
                for start in range(0, len(segment), chunk_size)
            )
            return [piece for piece in pieces if piece.strip()]

        sep = separators[sep_index]
        result: List[str] = []
        current = ""

        def flush() -> None:
            # Emit the accumulated text, refining it if it is still too long
            # (happens when a single part exceeds chunk_size).
            if not current.strip():
                return
            if len(current) > chunk_size:
                result.extend(split_recursive(current, sep_index + 1))
            else:
                result.append(current)

        for part in segment.split(sep):
            if len(current) + len(sep) + len(part) <= chunk_size:
                current = current + sep + part if current else part
            else:
                flush()
                current = part
        flush()
        return result

    chunks = split_recursive(text)

    # Add overlap: prefix each chunk with the tail of the previous
    # (un-prefixed) chunk.
    if overlap > 0 and len(chunks) > 1:
        chunks = [chunks[0]] + [
            previous[-overlap:] + chunk
            for previous, chunk in zip(chunks, chunks[1:])
        ]

    return chunks


def compute_hash(content: bytes) -> str:
    """Compute SHA-256 hash of content.

    Args:
        content: Bytes to hash.

    Returns:
        The digest as a lowercase hexadecimal string.
    """
    return hashlib.sha256(content).hexdigest()