feat(embedding): add structural metadata to legal chunking (Block D1)

chunk_text_legal_structured() returns metadata per chunk:
- section: "§ 312k", "Art. 5"
- section_title: "Kündigungsbutton"
- paragraph: "Abs. 1", "Nr. 3"
- paragraph_num: 1, 3
- page: (prepared for PDF integration)
- index: sequential position

/chunk endpoint now returns chunks_with_metadata alongside plain chunks.
Backward compatible — existing consumers use chunks field unchanged.

New regex: _PARAGRAPH_RE (Abs/Nr/Satz/lit), _SECTION_NUMBER_RE
New functions: _parse_section_metadata(), _extract_paragraph_ref()

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-01 15:25:23 +02:00
parent d9c16fb914
commit 6ab10415d8
+134 -6
View File
@@ -106,8 +106,19 @@ class ChunkRequest(BaseModel):
strategy: str = Field(default="semantic", description="Chunking strategy: semantic or recursive")
class ChunkMetadata(BaseModel):
    """Structured metadata for one chunk produced by the legal-aware chunker.

    Mirrors the dicts built by chunk_text_legal_structured(); serialized into
    ChunkResponse.chunks_with_metadata for API consumers.
    """

    text: str  # the chunk's full text, including any leading "[...]" section prefix
    section: str = ""  # section reference, e.g. "§ 312k" or "Art. 5"; empty if none found
    section_title: str = ""  # header text after the section reference, e.g. "Kündigungsbutton"
    paragraph: str = ""  # raw paragraph reference as matched in the text, e.g. "Abs. 1"
    paragraph_num: Optional[int] = None  # numeric part of the reference ("lit. a" maps to 1)
    page: Optional[int] = None  # reserved for PDF page mapping; currently always None
    index: int = 0  # sequential position of the chunk within the document
class ChunkResponse(BaseModel):
    """Response body of the /chunk endpoint."""

    chunks: List[str]  # plain chunk texts (pre-existing field; kept for backward compatibility)
    chunks_with_metadata: Optional[List[dict]] = None  # ChunkMetadata-shaped dicts; may be None when not generated
    count: int  # number of entries in `chunks`
    strategy: str  # the strategy name echoed back from the request
@@ -349,6 +360,32 @@ def _split_sentences(text: str) -> List[str]:
return sentences
# Matches in-text paragraph/subsection references (German legal drafting style).
# Exactly one capture group is non-empty for any match. NOTE: re.IGNORECASE
# also lets the lit. branch match uppercase letters (e.g. "LIT. A").
_PARAGRAPH_RE = re.compile(
    r'(?:'
    r'Abs(?:atz|\.)\s*(\d+)'  # Abs. 1, Absatz 2 (requires "Abs." or "Absatz")
    r'|Nr\.\s*(\d+)'          # Nr. 3
    r'|Satz\s+(\d+)'          # Satz 1
    r'|lit\.\s*([a-z])'       # lit. a
    r'|\((\d+)\)'             # (1), (2) — bare parenthesized subsection numbers
    r')',
    re.IGNORECASE
)
# Extracts the section reference from a header line (German and English forms).
# The full match — m.group(0) — is the complete reference, e.g. "§ 312k";
# exactly one capture group holds the bare number/numeral.
_SECTION_NUMBER_RE = re.compile(
    r'(?:'
    r'§\s*(\d+[a-z]*)'               # § 25, § 312k
    r'|Art(?:ikel|icle|\.)\s*(\d+)'  # Artikel 5, Art. 3
    r'|Section\s+(\d[\d.]*)'         # Section 4.2
    r'|Kapitel\s+(\d+)'              # Kapitel 2
    r'|Anhang\s+([IVXLC\d]+)'        # Anhang III (Roman or Arabic numerals)
    r'|Annex\s+([IVXLC\d]+)'         # Annex XII
    r')',
    re.IGNORECASE
)
def _extract_section_header(line: str) -> Optional[str]:
"""Extract a legal section header from a line, or None."""
m = _LEGAL_SECTION_RE.match(line.strip())
@@ -360,6 +397,58 @@ def _extract_section_header(line: str) -> Optional[str]:
return None
def _parse_section_metadata(header: str) -> dict:
    """Parse a legal section header into structured metadata.

    Args:
        header: Header line such as "§ 312k Kündigungsbutton" or
            "Art. 5 Grundsätze". May be empty.

    Returns:
        dict with keys "section" (e.g. "§ 312k") and "section_title" (the text
        after the section reference, stripped of leading separator punctuation).
        Both are empty strings when nothing can be parsed.
    """
    if not header:
        return {"section": "", "section_title": ""}
    m = _SECTION_NUMBER_RE.search(header)
    # The full match is the section reference regardless of which regex
    # alternative fired, so no per-group inspection is needed. (The original
    # looped over m.groups() and built an unused `prefix` local.)
    section = m.group(0).strip() if m else ""
    title = header
    if section:
        idx = header.find(section)
        if idx >= 0:
            # Title = everything after the section reference, with leading
            # whitespace and separator punctuation (".", "-", dashes, ":") removed.
            title = header[idx + len(section):].strip().lstrip(' .-–—:')
    return {"section": section, "section_title": title.strip()}
def _extract_paragraph_ref(text: str) -> dict:
    """Extract the first paragraph/subsection reference from chunk text.

    Only the first 200 characters are scanned: a reference that labels the
    chunk appears near its start, and this keeps the search cheap.

    Args:
        text: Chunk text to scan.

    Returns:
        dict with "paragraph" (raw matched reference, e.g. "Abs. 1") and
        "paragraph_num" (int; letters from "lit. a" map to 1-based positions:
        a -> 1, b -> 2, ...). Empty string / None when no reference is found.
    """
    m = _PARAGRAPH_RE.search(text[:200])
    if not m:
        return {"paragraph": "", "paragraph_num": None}
    # Exactly one alternative's capture group is non-empty for any match;
    # the fallback guards against a future pattern edit breaking that invariant.
    value = next((g for g in m.groups() if g), None)
    if value is None:
        return {"paragraph": "", "paragraph_num": None}
    try:
        num = int(value)
    except ValueError:
        # Letter reference such as "lit. a"; lower() handles IGNORECASE matches.
        num = ord(value.lower()) - ord('a') + 1
    return {"paragraph": m.group(0).strip(), "paragraph_num": num}
def chunk_text_legal(text: str, chunk_size: int, overlap: int) -> List[str]:
"""
Legal-document-aware chunking.
@@ -494,6 +583,38 @@ def chunk_text_legal(text: str, chunk_size: int, overlap: int) -> List[str]:
return [c for c in final_chunks if c]
def chunk_text_legal_structured(text: str, chunk_size: int, overlap: int) -> List[dict]:
    """Legal-aware chunking that returns structured metadata per chunk.

    Delegates the actual splitting to chunk_text_legal() and then derives
    metadata for each resulting chunk:
      - section / section_title: parsed from the "[§ 25 Title]" prefix that
        chunk_text_legal() prepends to chunks (empty when no prefix present)
      - paragraph / paragraph_num: first reference found in the chunk text
      - page: always None for now (reserved for PDF integration)
      - index: sequential chunk position

    Args:
        text: Full document text.
        chunk_size: Target chunk size, passed through to chunk_text_legal().
        overlap: Overlap, passed through to chunk_text_legal().

    Returns:
        List of dicts with keys: text, section, section_title, paragraph,
        paragraph_num, page, index.
    """
    plain_chunks = chunk_text_legal(text, chunk_size, overlap)
    structured = []
    # Loop variable deliberately NOT named "chunk_text": that name shadowed
    # the module-level /chunk endpoint function chunk_text().
    for i, chunk in enumerate(plain_chunks):
        meta = {"text": chunk, "section": "", "section_title": "",
                "paragraph": "", "paragraph_num": None, "page": None, "index": i}
        # The section header travels in a "[...]" prefix added by chunk_text_legal.
        prefix_match = re.match(r'^\[(.+?)\]\s*', chunk)
        if prefix_match:
            section_meta = _parse_section_metadata(prefix_match.group(1))
            meta["section"] = section_meta["section"]
            meta["section_title"] = section_meta["section_title"]
        para_meta = _extract_paragraph_ref(chunk)
        meta["paragraph"] = para_meta["paragraph"]
        meta["paragraph_num"] = para_meta["paragraph_num"]
        structured.append(meta)
    return structured
def chunk_text_recursive(text: str, chunk_size: int, overlap: int) -> List[str]:
"""Recursive character-based chunking (legacy, use legal_recursive for legal docs)."""
if not text or len(text) <= chunk_size:
@@ -879,15 +1000,22 @@ async def chunk_text(request: ChunkRequest):
if request.strategy == "semantic":
overlap_sentences = max(1, request.overlap // 100)
chunks = chunk_text_semantic(request.text, request.chunk_size, overlap_sentences)
else:
# All strategies (recursive, legal_recursive, etc.) use the legal-aware chunker.
# The old plain recursive chunker is no longer exposed via the API.
chunks = chunk_text_legal(request.text, request.chunk_size, request.overlap)
return ChunkResponse(
chunks=chunks,
count=len(chunks),
strategy=request.strategy
strategy=request.strategy,
)
else:
# All strategies use the legal-aware chunker
chunks = chunk_text_legal(request.text, request.chunk_size, request.overlap)
# Also generate structured metadata
structured = chunk_text_legal_structured(request.text, request.chunk_size, request.overlap)
return ChunkResponse(
chunks=chunks,
chunks_with_metadata=structured,
count=len(chunks),
strategy=request.strategy,
)
except Exception as e:
logger.error(f"Chunking error: {e}")