Compare commits
1 Commit
| Author | SHA1 | Date | |
|---|---|---|---|
| 870cdc871e |
@@ -483,7 +483,13 @@ def chunk_text_legal(text: str, chunk_size: int, overlap: int) -> List[str]:
|
||||
Works for both German (DSGVO, BGB, AI Act DE) and English (NIST, SLSA, CRA EN) texts.
|
||||
"""
|
||||
if not text or len(text) <= chunk_size:
|
||||
return [text.strip()] if text and text.strip() else []
|
||||
body = (text or "").strip()
|
||||
if not body:
|
||||
return []
|
||||
# Kurze Dokumente (ein §/Artikel) trotzdem mit Sektions-Prefix versehen, damit
|
||||
# chunk_text_legal_structured Section/Artikel extrahieren kann (sonst article="").
|
||||
hdr = _extract_section_header(body.split("\n", 1)[0])
|
||||
return [f"[{hdr[:120]}] {body}"] if hdr else [body]
|
||||
|
||||
# --- Phase 1: Split into sections by legal headers ---
|
||||
lines = text.split('\n')
|
||||
|
||||
Reference in New Issue
Block a user