diff --git a/embedding-service/main.py b/embedding-service/main.py index 4220f8b..7c02227 100644 --- a/embedding-service/main.py +++ b/embedding-service/main.py @@ -483,7 +483,13 @@ def chunk_text_legal(text: str, chunk_size: int, overlap: int) -> List[str]: Works for both German (DSGVO, BGB, AI Act DE) and English (NIST, SLSA, CRA EN) texts. """ if not text or len(text) <= chunk_size: - return [text.strip()] if text and text.strip() else [] + body = (text or "").strip() + if not body: + return [] + # Kurze Dokumente (ein §/Artikel) trotzdem mit Sektions-Prefix versehen, damit + # chunk_text_legal_structured Section/Artikel extrahieren kann (sonst article=""). + hdr = _extract_section_header(body.split("\n", 1)[0]) + return [f"[{hdr[:120]}] {body}"] if hdr else [body] # --- Phase 1: Split into sections by legal headers --- lines = text.split('\n')