c258fbc3de
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 30s
CI / test-python-voice (push) Successful in 38s
CI / test-bqas (push) Successful in 40s
Add services/recital_ingester.py — parses EU act recitals (Erwägungsgründe) from the eur-lex/CELLAR preamble via the id="rct_N" markers (the table layout that defeats a naive article parser) and tags them as a SEPARATE interpretative source: source_class=recital, authority_weight=60, use_for_primary=false, so they rank below binding articles and surface only as interpretation context. Reuses the Parser-1 download + helpers. Add scripts/ingest_recitals.py (skip-by-existing, no auto re-ingest) + tests/fixture. Tested: 4 unit tests over a synthetic rct_N fixture, ruff + mypy clean, real CELLAR parse of DORA verified end-to-end (106 recitals, interpretative metadata). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
116 lines
4.3 KiB
Python
116 lines
4.3 KiB
Python
"""RecitalIngester (Parser 2): ingests EU act recitals (Erwägungsgründe) as a
|
|
SEPARATE, interpretative source — never a primary obligation source.
|
|
|
|
In eur-lex / CELLAR XHTML each recital sits in a preamble block
|
|
<div class="eli-subdivision" id="rct_N"> with the marker "(N)" and the text in
|
|
adjacent table cells, which is why a naive article parser finds none. This
|
|
parser keys on the id="rct_N" markers and joins the recital's prose.
|
|
|
|
Recitals are tagged source_class=recital / authority_weight=60 /
|
|
use_for_primary=false, so they rank below binding articles and surface only as
|
|
interpretation context (and trip the human-review flag if they ever top
|
|
results). Reuses the eur-lex download + helpers from legal_act_ingester
|
|
(Parser 1).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from dataclasses import dataclass
|
|
from typing import Any
|
|
|
|
from services.legal_act_ingester import RegSpec, UploadUnit, clean, refs_out
|
|
|
|
# Value stored under "authority_weight" in every recital's metadata.
RECITAL_WEIGHT = 60

# Captures N from an id="rct_N" attribute (the per-recital preamble marker).
_RCT_DIV_RE = re.compile(r'id="rct_(\d+)"')
# Captures the inner HTML of each <p class="oj-normal"> paragraph; DOTALL so a
# paragraph may span several lines.
_OJ_P_RE = re.compile(r'<p[^>]*class="oj-normal"[^>]*>(.*?)</p>', re.DOTALL)
# Matches a paragraph that consists solely of the "(N)" recital marker.
_RCT_NUM_RE = re.compile(r"^\(\d+\)$")
# Joined recital prose shorter than this many characters is discarded.
_MIN_RECITAL_CHARS = 20
|
|
|
|
|
|
@dataclass
class Recital:
    """A single parsed recital: its number and its joined prose."""

    # Recital number as captured from the id="rct_N" marker (kept as a string).
    num: str
    # Cleaned recital prose, without the "(N)" marker paragraph.
    text: str
|
|
|
|
|
|
def parse_recitals(raw: str, reg: str) -> list[Recital]:
    """Collect the recitals found in the preamble of an act's markup.

    The preamble is everything before the first ``class="oj-ti-art"``
    occurrence (or the whole document when that marker is absent). Each
    ``id="rct_N"`` marker opens a recital that runs up to the next marker or
    the end of the preamble. The ``oj-normal`` paragraphs inside that span are
    cleaned and joined with single spaces, skipping empty paragraphs and the
    bare "(N)" marker paragraph.

    Args:
        raw: Markup of the act.
        reg: Unused; accepted for symmetry with the other parsers.

    Returns:
        Recitals in document order. A recital whose joined prose is shorter
        than ``_MIN_RECITAL_CHARS`` is dropped.
    """
    del reg  # intentionally unused

    preamble_end = raw.find('class="oj-ti-art"')
    if preamble_end == -1:
        preamble_end = len(raw)

    hits = list(_RCT_DIV_RE.finditer(raw, 0, preamble_end))
    # Each recital stops where the next one starts; the last one stops at the
    # end of the preamble.
    stops = [hit.start() for hit in hits[1:]] + [preamble_end]

    found: list[Recital] = []
    for hit, stop in zip(hits, stops):
        segment = raw[hit.start():stop]
        paragraphs = [clean(fragment) for fragment in _OJ_P_RE.findall(segment)]
        prose = " ".join(
            para for para in paragraphs if para and not _RCT_NUM_RE.match(para)
        )
        if len(prose) < _MIN_RECITAL_CHARS:
            continue
        found.append(Recital(num=hit.group(1), text=prose))
    return found
|
|
|
|
|
|
def self_test(recitals: list[Recital]) -> tuple[bool, list[str]]:
    """Sanity-check parsed recitals; intended as a gate before upload.

    An empty result is reported as a problem (every EU act is expected to
    carry recitals), as is any recital number that occurs more than once.

    Args:
        recitals: Parsed recitals to validate.

    Returns:
        ``(ok, problems)`` where ``ok`` is True only when ``problems`` is empty.
    """
    issues: list[str] = []
    if not recitals:
        issues.append("0 recitals parsed")

    numbers = [rc.num for rc in recitals]
    # A set collapses repeats, so a smaller set means at least one duplicate.
    if len(set(numbers)) != len(numbers):
        issues.append("duplicate recital numbers")

    return (len(issues) == 0, issues)
|
|
|
|
|
|
def _recital_meta(spec: RegSpec, rc: Recital) -> dict[str, Any]:
    """Build the metadata payload attached to one recital.

    The payload marks the unit as an interpretative recital
    (``source_class="recital"``, ``use_for_primary=False``) and copies the
    regulation-level fields from ``spec``.

    Args:
        spec: Regulation descriptor; ``reg``, ``name_de``, ``legal_basis_rank``,
            ``version_date`` and ``celex`` are read from it.
        rc: The recital the metadata describes.

    Returns:
        A new metadata dict for this recital.
    """
    reg = spec.reg
    label = f"{reg} Erwägungsgrund {rc.num}"
    compact_reg = reg.replace(" ", "")

    return {
        # --- regulation identity ---
        "regulation_code": reg,
        "regulation_short": reg,
        "regulation_name_de": spec.name_de,
        # --- classification: interpretative recital ---
        "citation_style": "recital",
        "document_type": "legal_act",
        "source_class": "recital",
        "bindingness": "interpretative",
        "authority_level": 60,
        "authority_weight": RECITAL_WEIGHT,
        "source_type": "law",
        "issuer": "European Union",
        "jurisdiction": "EU",
        "legal_basis_rank": spec.legal_basis_rank,
        "version_date": spec.version_date,
        # --- provenance ---
        "source": "eur-lex.europa.eu",
        "license": "public_eu",
        "category": "recht",
        "celex": spec.celex,
        # Interpretative only: never a primary obligation source.
        "use_for_primary": False,
        "is_recital": True,
        # --- citation fields ---
        "citation_unit": label,
        "article_label": label,
        # Distinct per recital so point IDs do not collide.
        "article": f"Erwaegungsgrund-{rc.num}",
        "chunk_scope": "recital",
        "article_type": "recital",
        "references_out": refs_out(reg, rc.text),
        "norm_id": f"EU-{compact_reg}-Rec{rc.num}",
    }
|
|
|
|
|
|
def build_upload_units(recitals: list[Recital], spec: RegSpec, run_tag: str) -> list[UploadUnit]:
    """Wrap each recital in its own UploadUnit.

    Every unit gets a distinct ``document_version`` of the form
    ``<run_tag>-<slug>-rec<N>``. Per the original author's note, the RAG
    service derives ``article`` from the text, so recitals sharing one version
    would collide.

    Args:
        recitals: Parsed recitals to upload.
        spec: Regulation descriptor; ``reg`` and ``collection`` are read here
            and the whole spec is passed on to ``_recital_meta``.
        run_tag: Prefix identifying this ingest run.

    Returns:
        One UploadUnit per recital, in input order.
    """
    # Lower-cased regulation code without spaces, used in file names and versions.
    reg_slug = spec.reg.lower().replace(" ", "")
    version_prefix = f"{run_tag}-{reg_slug}"

    return [
        UploadUnit(
            filename=f"{reg_slug}_rec{rc.num}.txt",
            # Heading line, blank line, then the recital prose.
            text=f"{spec.reg} Erwägungsgrund {rc.num}\n\n{rc.text}",
            meta=_recital_meta(spec, rc),
            document_version=f"{version_prefix}-rec{rc.num}",
            collection=spec.collection,
        )
        for rc in recitals
    ]
|