feat(embedding): add structural metadata to legal chunking (Block D1)
chunk_text_legal_structured() returns metadata per chunk: - section: "§ 312k", "Art. 5" - section_title: "Kündigungsbutton" - paragraph: "Abs. 1", "Nr. 3" - paragraph_num: 1, 3 - page: (prepared for PDF integration) - index: sequential position /chunk endpoint now returns chunks_with_metadata alongside plain chunks. Backward compatible — existing consumers use chunks field unchanged. New regex: _PARAGRAPH_RE (Abs/Nr/Satz/lit), _SECTION_NUMBER_RE New functions: _parse_section_metadata(), _extract_paragraph_ref() Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
+131
-3
@@ -106,8 +106,19 @@ class ChunkRequest(BaseModel):
|
||||
strategy: str = Field(default="semantic", description="Chunking strategy: semantic or recursive")
|
||||
|
||||
|
||||
class ChunkMetadata(BaseModel):
    """Structured metadata for a single legal-document chunk.

    Field-for-field mirror of the dicts produced by
    chunk_text_legal_structured() (text, section, section_title, paragraph,
    paragraph_num, page, index).  NOTE(review): the /chunk endpoint currently
    returns plain dicts in chunks_with_metadata rather than instances of this
    model — presumably this model documents/validates that shape; confirm.
    """

    text: str  # raw text content of the chunk
    section: str = ""  # section reference, e.g. "§ 312k" or "Art. 5"; "" when none found
    section_title: str = ""  # section title, e.g. "Kündigungsbutton"; "" when unknown
    paragraph: str = ""  # subsection reference, e.g. "Abs. 1" or "Nr. 3"; "" when none
    paragraph_num: Optional[int] = None  # numeric part of `paragraph` ("lit. a" -> 1)
    page: Optional[int] = None  # reserved for PDF page mapping; not populated yet
    index: int = 0  # zero-based sequential position of the chunk
|
||||
|
||||
|
||||
class ChunkResponse(BaseModel):
    """Response model for the /chunk endpoint."""

    chunks: List[str]  # plain chunk texts — the original, backward-compatible field
    chunks_with_metadata: Optional[List[dict]] = None  # per-chunk metadata dicts; not set by the semantic strategy
    count: int  # number of chunks (len(chunks))
    strategy: str  # chunking strategy echoed back from the request
|
||||
|
||||
@@ -349,6 +360,32 @@ def _split_sentences(text: str) -> List[str]:
|
||||
return sentences
|
||||
|
||||
|
||||
# Regex for paragraph/subsection references within text.
# Exactly one capture group is non-None per hit (one per alternative);
# the group holds the bare number (or letter for "lit."), while
# group(0) spans the full reference text, e.g. "Abs. 1".
_PARAGRAPH_RE = re.compile(
    r'(?:'
    r'Abs(?:atz|\.)\s*(\d+)'  # Abs. 1, Absatz 2
    r'|Nr\.\s*(\d+)'          # Nr. 3
    r'|Satz\s+(\d+)'          # Satz 1
    r'|lit\.\s*([a-z])'       # lit. a  (IGNORECASE also admits "Lit. A")
    r'|\((\d+)\)'             # (1), (2)
    r')',
    re.IGNORECASE
)
|
||||
|
||||
# Regex to extract a section number from a header line (German and
# English legal conventions).  One capture group per alternative;
# group(0) spans the whole reference, e.g. "§ 312k" or "Artikel 5".
_SECTION_NUMBER_RE = re.compile(
    r'(?:'
    r'§\s*(\d+[a-z]*)'               # § 25, § 312k
    r'|Art(?:ikel|icle|\.)\s*(\d+)'  # Artikel 5, Art. 3
    r'|Section\s+(\d[\d.]*)'         # Section 4.2
    r'|Kapitel\s+(\d+)'              # Kapitel 2
    r'|Anhang\s+([IVXLC\d]+)'        # Anhang III (Roman numerals or digits)
    r'|Annex\s+([IVXLC\d]+)'         # Annex XII
    r')',
    re.IGNORECASE
)
|
||||
|
||||
|
||||
def _extract_section_header(line: str) -> Optional[str]:
|
||||
"""Extract a legal section header from a line, or None."""
|
||||
m = _LEGAL_SECTION_RE.match(line.strip())
|
||||
@@ -360,6 +397,58 @@ def _extract_section_header(line: str) -> Optional[str]:
|
||||
return None
|
||||
|
||||
|
||||
def _parse_section_metadata(header: str) -> dict:
    """Parse a legal section header into structured metadata.

    Args:
        header: Header text such as "§ 312k Kündigungsbutton".

    Returns:
        {"section": "§ 312k", "section_title": "Kündigungsbutton"}.
        Either value is "" when it cannot be determined.
    """
    if not header:
        return {"section": "", "section_title": ""}

    # Each _SECTION_NUMBER_RE alternative always captures at least one
    # character, so a match needs no per-group inspection: group(0) is the
    # full section reference regardless of which alternative fired.
    # (Replaces a loop that also computed an unused `prefix` variable.)
    m = _SECTION_NUMBER_RE.search(header)
    section = m.group(0).strip() if m else ""

    # Title = everything after the section reference.
    title = header
    if section:
        idx = header.find(section)
        if idx >= 0:
            title = header[idx + len(section):].strip()
            # Remove leading separator punctuation/whitespace.
            title = title.lstrip(' .-–—:')

    return {"section": section, "section_title": title.strip()}
|
||||
|
||||
|
||||
def _extract_paragraph_ref(text: str) -> dict:
    """Extract a paragraph/subsection reference from chunk text.

    Only the first 200 characters are searched, so the reference found is
    the one the chunk opens with rather than a later cross-reference.

    Args:
        text: Chunk text to scan.

    Returns:
        {"paragraph": "Abs. 1", "paragraph_num": 1}; when nothing matches,
        {"paragraph": "", "paragraph_num": None}.  "lit." letters map to
        their ordinal (a -> 1, b -> 2, ...).
    """
    m = _PARAGRAPH_RE.search(text[:200])  # only search first 200 chars
    if not m:
        return {"paragraph": "", "paragraph_num": None}

    # Exactly one alternative — hence one capture group — matches per hit.
    # (Replaces an enumerate loop whose index was never used.)
    group = next((g for g in m.groups() if g), None)
    if group is None:
        return {"paragraph": "", "paragraph_num": None}

    ref = m.group(0).strip()
    try:
        num = int(group)
    except ValueError:
        num = ord(group.lower()) - ord('a') + 1  # lit. a = 1, b = 2
    return {"paragraph": ref, "paragraph_num": num}
|
||||
|
||||
|
||||
def chunk_text_legal(text: str, chunk_size: int, overlap: int) -> List[str]:
|
||||
"""
|
||||
Legal-document-aware chunking.
|
||||
@@ -494,6 +583,38 @@ def chunk_text_legal(text: str, chunk_size: int, overlap: int) -> List[str]:
|
||||
return [c for c in final_chunks if c]
|
||||
|
||||
|
||||
def chunk_text_legal_structured(text: str, chunk_size: int, overlap: int) -> List[dict]:
    """Legal-aware chunking that returns structured metadata per chunk.

    Delegates splitting to chunk_text_legal() (identical chunk boundaries),
    then derives metadata for every resulting chunk.

    Args:
        text: Document text to chunk.
        chunk_size: Target chunk size passed through to chunk_text_legal().
        overlap: Overlap passed through to chunk_text_legal().

    Returns:
        One dict per chunk with keys: text, section, section_title,
        paragraph, paragraph_num, page (always None for now — reserved for
        PDF integration), index.
    """
    plain_chunks = chunk_text_legal(text, chunk_size, overlap)

    # chunk_text_legal() prepends a "[§ 25 Title] " style prefix to chunks;
    # compile the prefix pattern once, outside the loop.
    prefix_re = re.compile(r'^\[(.+?)\]\s*')

    structured = []
    # Loop variable renamed from `chunk_text` to avoid shadowing the
    # module-level chunk_text endpoint function.
    for i, chunk in enumerate(plain_chunks):
        meta = {
            "text": chunk,
            "section": "",
            "section_title": "",
            "paragraph": "",
            "paragraph_num": None,
            "page": None,  # prepared for PDF page mapping
            "index": i,
        }

        # Section/section_title from the "[...]" prefix, if present.
        prefix_match = prefix_re.match(chunk)
        if prefix_match:
            section_meta = _parse_section_metadata(prefix_match.group(1))
            meta["section"] = section_meta["section"]
            meta["section_title"] = section_meta["section_title"]

        # Paragraph reference (Abs./Nr./Satz/lit.) from the chunk content.
        para_meta = _extract_paragraph_ref(chunk)
        meta["paragraph"] = para_meta["paragraph"]
        meta["paragraph_num"] = para_meta["paragraph_num"]

        structured.append(meta)

    return structured
|
||||
|
||||
|
||||
def chunk_text_recursive(text: str, chunk_size: int, overlap: int) -> List[str]:
|
||||
"""Recursive character-based chunking (legacy, use legal_recursive for legal docs)."""
|
||||
if not text or len(text) <= chunk_size:
|
||||
@@ -879,15 +1000,22 @@ async def chunk_text(request: ChunkRequest):
|
||||
if request.strategy == "semantic":
|
||||
overlap_sentences = max(1, request.overlap // 100)
|
||||
chunks = chunk_text_semantic(request.text, request.chunk_size, overlap_sentences)
|
||||
return ChunkResponse(
|
||||
chunks=chunks,
|
||||
count=len(chunks),
|
||||
strategy=request.strategy,
|
||||
)
|
||||
else:
|
||||
# All strategies (recursive, legal_recursive, etc.) use the legal-aware chunker.
|
||||
# The old plain recursive chunker is no longer exposed via the API.
|
||||
# All strategies use the legal-aware chunker
|
||||
chunks = chunk_text_legal(request.text, request.chunk_size, request.overlap)
|
||||
# Also generate structured metadata
|
||||
structured = chunk_text_legal_structured(request.text, request.chunk_size, request.overlap)
|
||||
|
||||
return ChunkResponse(
|
||||
chunks=chunks,
|
||||
chunks_with_metadata=structured,
|
||||
count=len(chunks),
|
||||
strategy=request.strategy
|
||||
strategy=request.strategy,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Chunking error: {e}")
|
||||
|
||||
Reference in New Issue
Block a user