569f64a400
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 28s
CI / test-python-voice (push) Successful in 32s
CI / test-bqas (push) Successful in 30s
Add services/legal_act_ingester.py — the EU eur-lex LegalActIngester engine: CELLAR download (with eur-lex fallback, bypassing the HTTP 202 web block on large acts like DORA), parse into articles + annexes with full authority metadata + forward citation edges (references_out), and a self-test gate before upload. Refactor scripts/ingest_eu_regulations.py to use it: parse-based, per-unit upload with a skip-by-CELEX guard (no automatic re-ingest). Recitals are intentionally left to a separate ingester (Parser 2). Tested: parser / metadata / self-test / refs_out over a synthetic eur-lex fixture (7 tests), ruff + mypy clean, real CELLAR fetch of DORA verified end-to-end (64 articles, full authority metadata). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
333 lines
12 KiB
Python
333 lines
12 KiB
Python
"""Production LegalActIngester for EU eur-lex acts (Parser 1 of the corpus stack).

Downloads the German XHTML of an EU act (CELLAR machine endpoint, with the
eur-lex web UI as fallback), parses it into ARTICLES + ANNEXES with full
authority metadata and forward citation edges (references_out), and self-tests
the parse before any upload. The eur-lex / CELLAR XHTML uses uniform CSS classes
(oj-ti-art / oj-sti-art / oj-normal / oj-ti-grseq-1 / oj-doc-ti) across every
act, so one parser covers DSGVO / CRA / AI Act / DORA / NIS2 / MaschinenVO / ...

Recitals are intentionally NOT handled here — they are a separate, lower-weight
source (RecitalIngester, Parser 2). Scope of this module: binding articles + the
annexes that carry the actual obligations.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import html as html_lib
|
|
import json
|
|
import logging
|
|
import re
|
|
from dataclasses import dataclass, field
|
|
from typing import Any
|
|
|
|
import httpx
|
|
|
|
logger = logging.getLogger(__name__)

# Chunking parameters, kept as strings because they are sent as form fields.
CHUNK_SIZE = "2500"
CHUNK_OVERLAP = "200"

# Download sources, tried in this order. CELLAR is the canonical machine
# endpoint and returns the full XHTML even for acts the eur-lex web UI blocks
# with an empty HTTP 202 (e.g. DORA); the web UI is the fallback for anything
# CELLAR cannot serve.
# NOTE(review): the CELLAR URI is plain http — confirm whether the https form
# should be used instead.
CELLAR_URL = "http://publications.europa.eu/resource/celex/{celex}"
EURLEX_URL = "https://eur-lex.europa.eu/legal-content/DE/TXT/HTML/?uri=CELEX:{celex}"
_USER_AGENT = "Mozilla/5.0 (compatible; BreakPilot-LegalActIngester/1.0)"

# <p> elements whose class attribute is exactly one of the listed oj-* classes.
# Group 1 is the class suffix (without "oj-"), group 2 the inner markup.
_P_RE = re.compile(
    r'<p[^>]*class="oj-(ti-art|sti-art|normal|expanded|ti-grseq-1|doc-ti)"[^>]*>(.*?)</p>',
    re.DOTALL,
)
# Any markup tag.
_TAG_RE = re.compile(r"<[^>]+>")
# "Artikel <number>[letter]" — article headings and in-text article citations.
_ART_RE = re.compile(r"Artikel\s+(\d+[a-z]?)")
# "ANHANG <roman numeral>" — annex heading.
_ANNEX_RE = re.compile(r"ANHANG\s+([IVXLC]+)\b")
# "§ <number>[letter]" — paragraph citation.
_PARA_RE = re.compile(r"§\s*(\d+[a-z]?)")
# In-text annex citation.
# NOTE(review): the "e" alternative only matches "Anheng"; possibly "ä" (as in
# "Anhänge") was intended — confirm before changing.
_ANNEX_REF_RE = re.compile(r"Anh[ae]ng\s+([IVXLC]+)\b")
_EMPTY_ANNEX_CHARS = 15  # below this an annex is a table/product list → skip on upload
|
|
|
|
|
|
@dataclass
class RegSpec:
    """Describe one act: the minimum needed to ingest and cite it.

    `reg`, `celex` and `name_de` are required; the remaining fields default to
    the values shown below.
    """

    # Short citation handle, e.g. "CRA".
    reg: str
    # CELEX number, e.g. "32024R2847".
    celex: str
    # Name of the act (presumably the German title — confirm against callers).
    name_de: str
    # Collection the upload units of this act are sent to.
    collection: str = "bp_compliance_ce"
    # ISO date, e.g. "2024-10-23".
    version_date: str = ""
    # "eu_regulation" or "eu_directive".
    legal_basis_rank: str = "eu_regulation"
|
|
|
|
|
|
@dataclass
class Article:
    """One parsed article of an act."""

    # Article number as captured from the heading, e.g. "5" or "12a".
    num: str
    # Article subtitle; empty when the act gives none.
    title: str = ""
    # Cleaned paragraph texts in document order.
    body: list[str] = field(default_factory=list)
    # Chapter / section heading that was current when the article started.
    chapter: str = ""
|
|
|
|
|
|
@dataclass
class Annex:
    """One parsed annex of an act."""

    # Roman numeral of the annex as captured from its heading, e.g. "IV".
    num: str
    # Annex title; empty when none was found.
    title: str = ""
    # Cleaned paragraph texts in document order.
    body: list[str] = field(default_factory=list)
|
|
|
|
|
|
@dataclass
class ParsedAct:
    """Result of parsing one act: its articles and annexes."""

    # Short citation handle of the act the content belongs to.
    reg: str
    # Articles in document order.
    articles: list[Article]
    # Annexes in document order.
    annexes: list[Annex]
|
|
|
|
|
|
@dataclass
class UploadUnit:
    """One document to upload: text, metadata and routing information."""

    # File name sent with the upload.
    filename: str
    # Plain-text content of the unit.
    text: str
    # Metadata dictionary sent alongside the text.
    meta: dict[str, Any]
    # Version tag sent with the upload.
    document_version: str
    # Target collection.
    collection: str
|
|
|
|
|
|
def clean(fragment: str) -> str:
    """Return `fragment` as plain single-line text.

    Markup tags are removed first, HTML entities are decoded afterwards, and
    finally every run of whitespace is collapsed into a single space (leading
    and trailing whitespace is dropped).
    """
    without_tags = _TAG_RE.sub("", fragment)
    decoded = html_lib.unescape(without_tags)
    words = decoded.split()
    return " ".join(words)
|
|
|
|
|
|
def download_act(celex: str, *, client: httpx.Client | None = None) -> str:
    """Fetch an act's German XHTML — CELLAR first, eur-lex fallback.

    Args:
        celex: CELEX number of the act, e.g. "32024R2847".
        client: Optional HTTP client to reuse. When omitted, a temporary client
            is created for this call and closed before returning.

    Returns:
        The body of the first response that has status 200 and contains the
        "oj-ti-art" article marker.

    Raises:
        RuntimeError: If neither source yields a usable document. Transport
            errors of a single source are logged and the next source is tried.
    """
    own_client = client is None
    # Decide with the same `is None` test as `own_client`, so the two can never
    # disagree about who owns (and must close) the client.
    http = httpx.Client(timeout=60.0, follow_redirects=True) if client is None else client
    try:
        attempts: tuple[tuple[str, dict[str, str]], ...] = (
            (
                CELLAR_URL.format(celex=celex),
                {"Accept-Language": "deu", "Accept": "application/xhtml+xml, text/html;q=0.9"},
            ),
            (EURLEX_URL.format(celex=celex), {}),
        )
        for url, extra in attempts:
            try:
                # Redirects are requested per call: a caller-supplied client
                # does not necessarily follow them, and a redirect status
                # would otherwise be treated as "no usable doc".
                resp = http.get(
                    url,
                    headers={"User-Agent": _USER_AGENT, **extra},
                    follow_redirects=True,
                )
            except httpx.HTTPError as exc:
                logger.warning("legal-act fetch error for %s: %s", url, exc)
                continue
            if resp.status_code == 200 and "oj-ti-art" in resp.text:
                logger.info("downloaded CELEX %s from %s (%d chars)", celex, url, len(resp.text))
                return resp.text
            logger.warning(
                "no usable doc for CELEX %s from %s (status=%s, len=%d)",
                celex, url, resp.status_code, len(resp.text),
            )
        raise RuntimeError(f"no usable XHTML for CELEX {celex} (CELLAR + eur-lex failed)")
    finally:
        # Only close what this call created; a caller's client stays open.
        if own_client:
            http.close()
|
|
|
|
|
|
def refs_out(reg: str, text: str) -> list[str]:
    """Collect the forward citation edges mentioned in `text`.

    Three kinds of targets are recognised: articles (emitted as
    "Art. <n> <reg>"), paragraphs (emitted as "§ <n> BDSG") and annexes
    (emitted as "<reg> Anhang <numeral>"). Duplicates are removed and the
    result is returned sorted.
    """
    edges: set[str] = set()
    for article_num in _ART_RE.findall(text):
        edges.add(f"Art. {article_num} {reg}")
    for paragraph_num in _PARA_RE.findall(text):
        edges.add(f"§ {paragraph_num} BDSG")
    for annex_num in _ANNEX_REF_RE.findall(text):
        edges.add(f"{reg} Anhang {annex_num}")
    return sorted(edges)
|
|
|
|
|
|
def parse_html(raw: str, reg: str) -> ParsedAct:
    """Split eur-lex/CELLAR XHTML into articles and annexes (no recitals).

    The matched paragraphs are walked in document order. Everything before the
    first article heading (recitals / preamble) is dropped on purpose — that is
    RecitalIngester's job (Parser 2). Once an annex heading has been seen, all
    following paragraphs belong to annexes.

    Args:
        raw: XHTML of the act.
        reg: Short citation handle stored on the result.

    Returns:
        A ParsedAct with articles and annexes in document order.
    """
    parsed_articles: list[Article] = []
    parsed_annexes: list[Annex] = []
    open_article: Article | None = None
    open_annex: Annex | None = None
    current_chapter = ""

    for css_class, fragment in _P_RE.findall(raw):
        text = clean(fragment)
        if not text:
            continue

        # A document title of the form "ANHANG <numeral>" starts a new annex
        # and closes whatever article or annex was open.
        if css_class == "doc-ti":
            annex_heading = _ANNEX_RE.match(text)
            if annex_heading is not None:
                if open_article is not None:
                    parsed_articles.append(open_article)
                    open_article = None
                if open_annex is not None:
                    parsed_annexes.append(open_annex)
                open_annex = Annex(num=annex_heading.group(1))
                continue

        # Annex mode: the first heading becomes the title, later headings and
        # text paragraphs go to the body; every other class is dropped.
        if open_annex is not None:
            is_heading = css_class in ("doc-ti", "ti-grseq-1")
            if is_heading and not open_annex.title:
                open_annex.title = text
            elif css_class in ("normal", "expanded", "ti-grseq-1"):
                open_annex.body.append(text)
            continue

        # Article mode.
        if css_class == "doc-ti":
            # Document title / preamble headings carry no article content.
            continue
        elif css_class == "ti-grseq-1":
            # Chapter / section heading: remembered for the articles that follow.
            current_chapter = text
        elif css_class == "ti-art":
            article_heading = _ART_RE.match(text)
            if article_heading is not None:
                if open_article is not None:
                    parsed_articles.append(open_article)
                open_article = Article(num=article_heading.group(1), chapter=current_chapter)
        elif open_article is not None:
            if css_class == "sti-art":
                open_article.title = text
            elif css_class in ("normal", "expanded"):
                open_article.body.append(text)

    # Flush whatever is still open at the end of the document.
    if open_article is not None:
        parsed_articles.append(open_article)
    if open_annex is not None:
        parsed_annexes.append(open_annex)
    return ParsedAct(reg=reg, articles=parsed_articles, annexes=parsed_annexes)
|
|
|
|
|
|
def self_test(act: ParsedAct) -> tuple[bool, list[str]]:
    """Check a parsed act before it is uploaded.

    Three conditions are reported as problems: no articles at all, repeated
    article numbers, and articles whose joined body is shorter than 15
    characters. Annexes are not inspected here, so empty annexes (tables) do
    NOT fail — they are skipped on upload.

    Returns:
        (passed, problems) — `passed` is True exactly when `problems` is empty.
    """
    findings: list[str] = []

    if not act.articles:
        findings.append("0 articles parsed")

    numbers = [article.num for article in act.articles]
    if len(set(numbers)) != len(numbers):
        findings.append("duplicate article numbers")

    too_short = [
        article.num
        for article in act.articles
        if len(" ".join(article.body)) < 15
    ]
    if too_short:
        findings.append(f"{len(too_short)} empty articles (e.g. {too_short[:3]})")

    passed = len(findings) == 0
    return (passed, findings)
|
|
|
|
|
|
def _base_meta(spec: RegSpec) -> dict[str, Any]:
    """Build the metadata shared by every unit of the act described by `spec`.

    A new dictionary is returned on each call, so callers may extend it freely.
    Keys are inserted in a fixed order.
    """
    meta: dict[str, Any] = {}

    # Identity of the act.
    meta["regulation_code"] = spec.reg
    meta["regulation_short"] = spec.reg
    meta["regulation_name_de"] = spec.name_de

    # Fixed classification of the source.
    meta["citation_style"] = "article"
    meta["document_type"] = "legal_act"
    meta["source_class"] = "binding_law"
    meta["bindingness"] = "binding"
    meta["authority_level"] = 95
    meta["authority_weight"] = 100
    meta["source_type"] = "law"
    meta["issuer"] = "European Union"
    meta["jurisdiction"] = "EU"

    # Per-act ranking and version.
    meta["legal_basis_rank"] = spec.legal_basis_rank
    meta["version_date"] = spec.version_date

    # Provenance.
    meta["source"] = "eur-lex.europa.eu"
    meta["license"] = "public_eu"
    meta["category"] = "recht"
    meta["celex"] = spec.celex
    meta["use_for_primary"] = True
    return meta
|
|
|
|
|
|
def _article_meta(spec: RegSpec, art: Article) -> dict[str, Any]:
    """Return the full metadata of one article: base metadata plus article keys.

    The citation "Art. <num> <reg>" is used as citation unit, label and parent
    unit alike. The chapter, when present, forms the context hierarchy and is
    prefixed to the display context.
    """
    citation = f"Art. {art.num} {spec.reg}"
    if art.chapter:
        hierarchy = [art.chapter]
        display = art.chapter + " > " + citation
    else:
        hierarchy = []
        display = citation
    body_text = " ".join(art.body)
    compact_reg = spec.reg.replace(" ", "")

    return {
        **_base_meta(spec),
        "citation_unit": citation,
        "article_label": citation,
        "parent_citation_unit": citation,
        "is_citable": True,
        "article": art.num,
        "context_hierarchy": hierarchy,
        "display_context": display,
        "chunk_scope": "section",
        "article_title": art.title,
        "article_type": "obligation",
        "references_out": refs_out(spec.reg, body_text),
        "norm_id": f"EU-{compact_reg}-Art{art.num}",
    }
|
|
|
|
|
|
def _annex_meta(spec: RegSpec, annex: Annex) -> dict[str, Any]:
    """Return the full metadata of one annex: base metadata plus annex keys.

    The citation "<reg> Anhang <num>" is used as citation unit, label, parent
    unit and display context alike.
    """
    citation = f"{spec.reg} Anhang {annex.num}"
    body_text = " ".join(annex.body)
    compact_reg = spec.reg.replace(" ", "")

    return {
        **_base_meta(spec),
        "citation_unit": citation,
        "article_label": citation,
        "parent_citation_unit": citation,
        "is_citable": True,
        # Prefixed so the value differs from any article number; the original
        # note says this avoids point-ID collisions.
        "article": f"Anhang-{annex.num}",
        "context_hierarchy": [f"Anhang {annex.num}"],
        "display_context": citation,
        "chunk_scope": "annex",
        "article_title": annex.title,
        "article_type": "requirement",
        "references_out": refs_out(spec.reg, body_text),
        "norm_id": f"EU-{compact_reg}-Anhang{annex.num}",
    }
|
|
|
|
|
|
def build_upload_units(act: ParsedAct, spec: RegSpec, run_tag: str) -> list[UploadUnit]:
    """Turn a parsed act into upload units, articles first, then annexes.

    Every article becomes one unit and all articles share the document version
    "<run_tag>-<slug>". Every annex gets its own version
    "<run_tag>-<slug>-anhang<num>" (the RAG service derives `article` from text
    and would otherwise collide annexes on chunk_index). Annexes whose joined
    body is shorter than `_EMPTY_ANNEX_CHARS` are left out.

    Args:
        act: Parsed articles and annexes.
        spec: Act description; supplies the citation handle and the collection.
        run_tag: Prefix of every document version.

    Returns:
        The upload units in upload order.
    """
    slug = spec.reg.lower().replace(" ", "")
    base_version = f"{run_tag}-{slug}"

    article_units = [
        UploadUnit(
            filename=f"{slug}_art{art.num}.txt",
            text=f"Art. {art.num} {spec.reg} {art.title}\n\n" + "\n\n".join(art.body),
            meta=_article_meta(spec, art),
            document_version=base_version,
            collection=spec.collection,
        )
        for art in act.articles
    ]

    annex_units = [
        UploadUnit(
            filename=f"{slug}_anhang{annex.num}.txt",
            text=f"{spec.reg} Anhang {annex.num} {annex.title}\n\n" + "\n\n".join(annex.body),
            meta=_annex_meta(spec, annex),
            document_version=f"{base_version}-anhang{annex.num}",
            collection=spec.collection,
        )
        for annex in act.annexes
        # Too little text means a table / correspondence list — no usable prose.
        if len(" ".join(annex.body)) >= _EMPTY_ANNEX_CHARS
    ]

    return article_units + annex_units
|
|
|
|
|
|
def upload_unit(client: httpx.Client, rag_url: str, unit: UploadUnit) -> int:
    """Upload one unit to the RAG service and return its chunk count.

    Args:
        client: HTTP client used for the request.
        rag_url: Base URL of the RAG service; a trailing slash is tolerated.
        unit: The unit to upload.

    Returns:
        The "chunks_count" reported by the service. 0 is returned when the
        service answers with a non-200 status, when the key is missing, or
        when a 200 response carries no usable count (body is not JSON, not an
        object, or the count is not a number); failures are logged.
    """
    # The first four characters of the version date are the year; "2026" is the
    # fallback when the unit carries no version date.
    year = (unit.meta.get("version_date") or "")[:4] or "2026"
    data = {
        "collection": unit.collection,
        "data_type": "compliance",
        "bundesland": "eu",
        "use_case": "legal_reference",
        "year": year,
        "chunk_strategy": "legal",
        "chunk_size": CHUNK_SIZE,
        "chunk_overlap": CHUNK_OVERLAP,
        "metadata_json": json.dumps(unit.meta, ensure_ascii=False),
        "document_version": unit.document_version,
    }
    resp = client.post(
        # Strip a trailing slash so the path never starts with "//".
        f"{rag_url.rstrip('/')}/api/v1/documents/upload",
        files={"file": (unit.filename, unit.text.encode("utf-8"), "text/plain")},
        data=data,
    )
    if resp.status_code != 200:
        logger.error("upload %s failed: %s %s", unit.filename, resp.status_code, resp.text[:200])
        return 0

    # A 200 with a malformed body must not abort the whole run: treat it like
    # any other failed upload and report 0 chunks.
    try:
        payload = resp.json()
    except ValueError:  # json.JSONDecodeError is a ValueError
        payload = None
    raw_count = payload.get("chunks_count", 0) if isinstance(payload, dict) else None
    try:
        return int(raw_count)
    except (TypeError, ValueError):
        logger.error(
            "upload %s returned an unusable response body: %s", unit.filename, resp.text[:200]
        )
        return 0
|