From 6ab10415d83aa5c041ca34fa662f6a0aaf4d7df7 Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Fri, 1 May 2026 15:25:23 +0200
Subject: [PATCH] feat(embedding): add structural metadata to legal chunking (Block D1)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

chunk_text_legal_structured() returns metadata per chunk:
- section: "§ 312k", "Art. 5"
- section_title: "Kündigungsbutton"
- paragraph: "Abs. 1", "Nr. 3"
- paragraph_num: 1, 3
- page: (prepared for PDF integration)
- index: sequential position

/chunk endpoint now returns chunks_with_metadata alongside plain chunks.
Backward compatible — existing consumers use chunks field unchanged.

New regex: _PARAGRAPH_RE (Abs/Nr/Satz/lit), _SECTION_NUMBER_RE
New functions: _parse_section_metadata(), _extract_paragraph_ref()

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 embedding-service/main.py | 137 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 133 insertions(+), 4 deletions(-)

diff --git a/embedding-service/main.py b/embedding-service/main.py
index bc1d557..9c533c4 100644
--- a/embedding-service/main.py
+++ b/embedding-service/main.py
@@ -106,8 +106,19 @@ class ChunkRequest(BaseModel):
     strategy: str = Field(default="semantic", description="Chunking strategy: semantic or recursive")
 
 
+class ChunkMetadata(BaseModel):
+    text: str
+    section: str = ""
+    section_title: str = ""
+    paragraph: str = ""
+    paragraph_num: Optional[int] = None
+    page: Optional[int] = None
+    index: int = 0
+
+
 class ChunkResponse(BaseModel):
     chunks: List[str]
+    chunks_with_metadata: Optional[List[dict]] = None
     count: int
     strategy: str
 
@@ -349,6 +360,32 @@ def _split_sentences(text: str) -> List[str]:
     return sentences
 
 
+# Regex for paragraph/subsection references within text
+_PARAGRAPH_RE = re.compile(
+    r'(?:'
+    r'Abs(?:atz|\.)\s*(\d+)'          # Abs. 1, Absatz 2
+    r'|Nr\.\s*(\d+)'                  # Nr. 3
+    r'|Satz\s+(\d+)'                  # Satz 1
+    r'|lit\.\s*([a-z])'               # lit. a
+    r'|\((\d+)\)'                     # (1), (2)
+    r')',
+    re.IGNORECASE
+)
+
+# Regex to extract section number from header
+_SECTION_NUMBER_RE = re.compile(
+    r'(?:'
+    r'§\s*(\d+[a-z]*)'                # § 25, § 312k
+    r'|Art(?:ikel|icle|\.)\s*(\d+)'   # Artikel 5, Art. 3
+    r'|Section\s+(\d[\d.]*)'          # Section 4.2
+    r'|Kapitel\s+(\d+)'               # Kapitel 2
+    r'|Anhang\s+([IVXLC\d]+)'         # Anhang III
+    r'|Annex\s+([IVXLC\d]+)'          # Annex XII
+    r')',
+    re.IGNORECASE
+)
+
+
 def _extract_section_header(line: str) -> Optional[str]:
     """Extract a legal section header from a line, or None."""
     m = _LEGAL_SECTION_RE.match(line.strip())
@@ -360,6 +397,58 @@ def _extract_section_header(line: str) -> Optional[str]:
     return None
 
 
+def _parse_section_metadata(header: str) -> dict:
+    """Parse a section header into structured metadata.
+
+    Returns: {"section": "§ 312k", "section_title": "Kuendigungsbutton"}
+    """
+    if not header:
+        return {"section": "", "section_title": ""}
+
+    # Locate the section reference within the header.  m.group(0)
+    # spans the complete match ("§ 312k", "Art. 5", "Section 4.2",
+    # "Anhang III", ...), so there is no need to scan m.groups() to
+    # find which alternation fired — the previous implementation's
+    # group loop did exactly that, and it also bound an unused
+    # `prefix` local variable along the way.
+    m = _SECTION_NUMBER_RE.search(header)
+    section = ""
+    if m:
+        section = m.group(0).strip()
+
+    # Title = everything after the section number
+    title = header
+    if section:
+        idx = header.find(section)
+        if idx >= 0:
+            title = header[idx + len(section):].strip()
+            # Remove leading punctuation/whitespace
+            title = title.lstrip(' .-–—:')
+
+    return {"section": section, "section_title": title.strip()}
+
+
+def _extract_paragraph_ref(text: str) -> dict:
+    """Extract paragraph/subsection reference from chunk text.
+
+    Returns: {"paragraph": "Abs. 1", "paragraph_num": 1}
+    """
+    m = _PARAGRAPH_RE.search(text[:200])  # Only search first 200 chars
+    if not m:
+        return {"paragraph": "", "paragraph_num": None}
+
+    for g in m.groups():
+        if g:
+            ref = m.group(0).strip()
+            try:
+                num = int(g)
+            except ValueError:
+                num = ord(g.lower()) - ord('a') + 1  # lit. a = 1, b = 2
+            return {"paragraph": ref, "paragraph_num": num}
+
+    return {"paragraph": "", "paragraph_num": None}
+
+
 def chunk_text_legal(text: str, chunk_size: int, overlap: int) -> List[str]:
     """
     Legal-document-aware chunking.
@@ -494,6 +583,38 @@ def chunk_text_legal(text: str, chunk_size: int, overlap: int) -> List[str]:
     return [c for c in final_chunks if c]
 
 
+def chunk_text_legal_structured(text: str, chunk_size: int, overlap: int) -> List[dict]:
+    """Legal-aware chunking that returns structured metadata per chunk.
+
+    Returns list of dicts with: text, section, section_title, paragraph, paragraph_num, index.
+    Uses the same splitting logic as chunk_text_legal but extracts metadata.
+    """
+    plain_chunks = chunk_text_legal(text, chunk_size, overlap)
+
+    # Track which section each chunk belongs to by re-parsing the prefix
+    structured = []
+    for i, chunk in enumerate(plain_chunks):
+        meta = {"text": chunk, "section": "", "section_title": "",
+                "paragraph": "", "paragraph_num": None, "page": None, "index": i}
+
+        # Extract section from the [§ 25 Title] prefix that chunk_text_legal adds
+        prefix_match = re.match(r'^\[(.+?)\]\s*', chunk)
+        if prefix_match:
+            header = prefix_match.group(1)
+            section_meta = _parse_section_metadata(header)
+            meta["section"] = section_meta["section"]
+            meta["section_title"] = section_meta["section_title"]
+
+        # Extract paragraph reference from chunk content
+        para_meta = _extract_paragraph_ref(chunk)
+        meta["paragraph"] = para_meta["paragraph"]
+        meta["paragraph_num"] = para_meta["paragraph_num"]
+
+        structured.append(meta)
+
+    return structured
+
+
 def chunk_text_recursive(text: str, chunk_size: int, overlap: int) -> List[str]:
     """Recursive character-based chunking (legacy, use legal_recursive for legal docs)."""
     if not text or len(text) <= chunk_size:
@@ -879,15 +1000,23 @@ async def chunk_text(request: ChunkRequest):
         if request.strategy == "semantic":
             overlap_sentences = max(1, request.overlap // 100)
             chunks = chunk_text_semantic(request.text, request.chunk_size, overlap_sentences)
+            return ChunkResponse(
+                chunks=chunks,
+                count=len(chunks),
+                strategy=request.strategy,
+            )
         else:
-            # All strategies (recursive, legal_recursive, etc.) use the legal-aware chunker.
-            # The old plain recursive chunker is no longer exposed via the API.
-            chunks = chunk_text_legal(request.text, request.chunk_size, request.overlap)
+            # All strategies use the legal-aware chunker.  chunk_text_legal_structured()
+            # runs the same legal-aware splitter internally, so chunk once and take the
+            # plain text chunks from the structured result instead of chunking twice.
+            structured = chunk_text_legal_structured(request.text, request.chunk_size, request.overlap)
+            chunks = [c["text"] for c in structured]
         return ChunkResponse(
             chunks=chunks,
+            chunks_with_metadata=structured,
             count=len(chunks),
-            strategy=request.strategy
+            strategy=request.strategy,
         )
     except Exception as e:
         logger.error(f"Chunking error: {e}")