feat(embedding): add structural metadata to legal chunking (Block D1)

chunk_text_legal_structured() returns metadata per chunk:
- section: "§ 312k", "Art. 5"
- section_title: "Kündigungsbutton"
- paragraph: "Abs. 1", "Nr. 3"
- paragraph_num: 1, 3
- page: (prepared for PDF integration)
- index: sequential position

/chunk endpoint now returns chunks_with_metadata alongside plain chunks.
Backward compatible — existing consumers use chunks field unchanged.

New regex: _PARAGRAPH_RE (Abs/Nr/Satz/lit), _SECTION_NUMBER_RE
New functions: _parse_section_metadata(), _extract_paragraph_ref()

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-01 15:25:23 +02:00
parent d9c16fb914
commit 6ab10415d8
+134 -6
View File
@@ -106,8 +106,19 @@ class ChunkRequest(BaseModel):
strategy: str = Field(default="semantic", description="Chunking strategy: semantic or recursive")
class ChunkMetadata(BaseModel):
    """Structured metadata for one chunk produced by the legal-aware chunker.

    Mirrors the dicts built by chunk_text_legal_structured(); serialized into
    ChunkResponse.chunks_with_metadata for API consumers.
    """

    text: str  # the chunk's full text, including any leading "[...]" section prefix
    section: str = ""  # section reference, e.g. "§ 312k" or "Art. 5"; empty if none found
    section_title: str = ""  # header text after the section reference, e.g. "Kündigungsbutton"
    paragraph: str = ""  # raw paragraph reference as matched in the text, e.g. "Abs. 1"
    paragraph_num: Optional[int] = None  # numeric part of the reference ("lit. a" maps to 1)
    page: Optional[int] = None  # reserved for PDF page mapping; currently always None
    index: int = 0  # sequential position of the chunk within the document
class ChunkResponse(BaseModel):
    """Response body of the /chunk endpoint."""

    chunks: List[str]  # plain chunk texts (pre-existing field; kept for backward compatibility)
    chunks_with_metadata: Optional[List[dict]] = None  # ChunkMetadata-shaped dicts; may be None when not generated
    count: int  # number of entries in `chunks`
    strategy: str  # the strategy name echoed back from the request
@@ -349,6 +360,32 @@ def _split_sentences(text: str) -> List[str]:
return sentences
# Matches in-text paragraph/subsection references (German legal drafting style).
# Exactly one capture group is non-empty for any match. NOTE: re.IGNORECASE
# also lets the lit. branch match uppercase letters (e.g. "LIT. A").
_PARAGRAPH_RE = re.compile(
    r'(?:'
    r'Abs(?:atz|\.)\s*(\d+)'  # Abs. 1, Absatz 2 (requires "Abs." or "Absatz")
    r'|Nr\.\s*(\d+)'          # Nr. 3
    r'|Satz\s+(\d+)'          # Satz 1
    r'|lit\.\s*([a-z])'       # lit. a
    r'|\((\d+)\)'             # (1), (2) — bare parenthesized subsection numbers
    r')',
    re.IGNORECASE
)
# Extracts the section reference from a header line (German and English forms).
# The full match — m.group(0) — is the complete reference, e.g. "§ 312k";
# exactly one capture group holds the bare number/numeral.
_SECTION_NUMBER_RE = re.compile(
    r'(?:'
    r'§\s*(\d+[a-z]*)'               # § 25, § 312k
    r'|Art(?:ikel|icle|\.)\s*(\d+)'  # Artikel 5, Art. 3
    r'|Section\s+(\d[\d.]*)'         # Section 4.2
    r'|Kapitel\s+(\d+)'              # Kapitel 2
    r'|Anhang\s+([IVXLC\d]+)'        # Anhang III (Roman or Arabic numerals)
    r'|Annex\s+([IVXLC\d]+)'         # Annex XII
    r')',
    re.IGNORECASE
)
def _extract_section_header(line: str) -> Optional[str]:
"""Extract a legal section header from a line, or None."""
m = _LEGAL_SECTION_RE.match(line.strip())
@@ -360,6 +397,58 @@ def _extract_section_header(line: str) -> Optional[str]:
return None
def _parse_section_metadata(header: str) -> dict:
    """Parse a legal section header into structured metadata.

    Args:
        header: Header line such as "§ 312k Kündigungsbutton" or
            "Art. 5 Grundsätze". May be empty.

    Returns:
        dict with keys "section" (e.g. "§ 312k") and "section_title" (the text
        after the section reference, stripped of leading separator punctuation).
        Both are empty strings when nothing can be parsed.
    """
    if not header:
        return {"section": "", "section_title": ""}
    m = _SECTION_NUMBER_RE.search(header)
    # The full match is the section reference regardless of which regex
    # alternative fired, so no per-group inspection is needed. (The original
    # looped over m.groups() and built an unused `prefix` local.)
    section = m.group(0).strip() if m else ""
    title = header
    if section:
        idx = header.find(section)
        if idx >= 0:
            # Title = everything after the section reference, with leading
            # whitespace and separator punctuation (".", "-", dashes, ":") removed.
            title = header[idx + len(section):].strip().lstrip(' .-–—:')
    return {"section": section, "section_title": title.strip()}
def _extract_paragraph_ref(text: str) -> dict:
    """Extract the first paragraph/subsection reference from chunk text.

    Only the first 200 characters are scanned: a reference that labels the
    chunk appears near its start, and this keeps the search cheap.

    Args:
        text: Chunk text to scan.

    Returns:
        dict with "paragraph" (raw matched reference, e.g. "Abs. 1") and
        "paragraph_num" (int; letters from "lit. a" map to 1-based positions:
        a -> 1, b -> 2, ...). Empty string / None when no reference is found.
    """
    m = _PARAGRAPH_RE.search(text[:200])
    if not m:
        return {"paragraph": "", "paragraph_num": None}
    # Exactly one alternative's capture group is non-empty for any match;
    # the fallback guards against a future pattern edit breaking that invariant.
    value = next((g for g in m.groups() if g), None)
    if value is None:
        return {"paragraph": "", "paragraph_num": None}
    try:
        num = int(value)
    except ValueError:
        # Letter reference such as "lit. a"; lower() handles IGNORECASE matches.
        num = ord(value.lower()) - ord('a') + 1
    return {"paragraph": m.group(0).strip(), "paragraph_num": num}
def chunk_text_legal(text: str, chunk_size: int, overlap: int) -> List[str]:
"""
Legal-document-aware chunking.
@@ -494,6 +583,38 @@ def chunk_text_legal(text: str, chunk_size: int, overlap: int) -> List[str]:
return [c for c in final_chunks if c]
def chunk_text_legal_structured(text: str, chunk_size: int, overlap: int) -> List[dict]:
    """Legal-aware chunking that returns structured metadata per chunk.

    Delegates the actual splitting to chunk_text_legal() and then derives
    metadata for each resulting chunk:
      - section / section_title: parsed from the "[§ 25 Title]" prefix that
        chunk_text_legal() prepends to chunks (empty when no prefix present)
      - paragraph / paragraph_num: first reference found in the chunk text
      - page: always None for now (reserved for PDF integration)
      - index: sequential chunk position

    Args:
        text: Full document text.
        chunk_size: Target chunk size, passed through to chunk_text_legal().
        overlap: Overlap, passed through to chunk_text_legal().

    Returns:
        List of dicts with keys: text, section, section_title, paragraph,
        paragraph_num, page, index.
    """
    plain_chunks = chunk_text_legal(text, chunk_size, overlap)
    structured = []
    # Loop variable deliberately NOT named "chunk_text": that name shadowed
    # the module-level /chunk endpoint function chunk_text().
    for i, chunk in enumerate(plain_chunks):
        meta = {"text": chunk, "section": "", "section_title": "",
                "paragraph": "", "paragraph_num": None, "page": None, "index": i}
        # The section header travels in a "[...]" prefix added by chunk_text_legal.
        prefix_match = re.match(r'^\[(.+?)\]\s*', chunk)
        if prefix_match:
            section_meta = _parse_section_metadata(prefix_match.group(1))
            meta["section"] = section_meta["section"]
            meta["section_title"] = section_meta["section_title"]
        para_meta = _extract_paragraph_ref(chunk)
        meta["paragraph"] = para_meta["paragraph"]
        meta["paragraph_num"] = para_meta["paragraph_num"]
        structured.append(meta)
    return structured
def chunk_text_recursive(text: str, chunk_size: int, overlap: int) -> List[str]:
"""Recursive character-based chunking (legacy, use legal_recursive for legal docs)."""
if not text or len(text) <= chunk_size:
@@ -879,15 +1000,22 @@ async def chunk_text(request: ChunkRequest):
if request.strategy == "semantic":
overlap_sentences = max(1, request.overlap // 100)
chunks = chunk_text_semantic(request.text, request.chunk_size, overlap_sentences)
else:
# All strategies (recursive, legal_recursive, etc.) use the legal-aware chunker.
# The old plain recursive chunker is no longer exposed via the API.
chunks = chunk_text_legal(request.text, request.chunk_size, request.overlap)
return ChunkResponse(
chunks=chunks,
count=len(chunks),
strategy=request.strategy
strategy=request.strategy,
)
else:
# All strategies use the legal-aware chunker
chunks = chunk_text_legal(request.text, request.chunk_size, request.overlap)
# Also generate structured metadata
structured = chunk_text_legal_structured(request.text, request.chunk_size, request.overlap)
return ChunkResponse(
chunks=chunks,
chunks_with_metadata=structured,
count=len(chunks),
strategy=request.strategy,
)
except Exception as e:
logger.error(f"Chunking error: {e}")