"""Production LegalActIngester for EU eur-lex acts (Parser 1 of the corpus stack). Downloads the German XHTML of an EU act (CELLAR machine endpoint, with the eur-lex web UI as fallback), parses it into ARTICLES + ANNEXES with full authority metadata and forward citation edges (references_out), and self-tests the parse before any upload. The eur-lex / CELLAR XHTML uses uniform CSS classes (oj-ti-art / oj-sti-art / oj-normal / oj-ti-grseq-1 / oj-doc-ti) across every act, so one parser covers DSGVO / CRA / AI Act / DORA / NIS2 / MaschinenVO / ... Recitals are intentionally NOT handled here — they are a separate, lower-weight source (RecitalIngester, Parser 2). Scope of this module: binding articles + the annexes that carry the actual obligations. """ from __future__ import annotations import html as html_lib import json import logging import re from dataclasses import dataclass, field from typing import Any import httpx logger = logging.getLogger(__name__) CHUNK_SIZE = "2500" CHUNK_OVERLAP = "200" # CELLAR is the canonical machine endpoint and returns the full XHTML even for # acts the eur-lex web UI blocks with an empty HTTP 202 (e.g. DORA). Try it # first, fall back to the web UI for anything CELLAR cannot serve. CELLAR_URL = "http://publications.europa.eu/resource/celex/{celex}" EURLEX_URL = "https://eur-lex.europa.eu/legal-content/DE/TXT/HTML/?uri=CELEX:{celex}" _USER_AGENT = "Mozilla/5.0 (compatible; BreakPilot-LegalActIngester/1.0)" _P_RE = re.compile( r'
]*class="oj-(ti-art|sti-art|normal|expanded|ti-grseq-1|doc-ti)"[^>]*>(.*?)
', re.S, ) _TAG_RE = re.compile(r"<[^>]+>") _ART_RE = re.compile(r"Artikel\s+(\d+[a-z]?)") _ANNEX_RE = re.compile(r"ANHANG\s+([IVXLC]+)\b") _PARA_RE = re.compile(r"§\s*(\d+[a-z]?)") _ANNEX_REF_RE = re.compile(r"Anh[ae]ng\s+([IVXLC]+)\b") _EMPTY_ANNEX_CHARS = 15 # below this an annex is a table/product list → skip on upload @dataclass class RegSpec: """The minimum an act needs to be ingested + cited.""" reg: str # short citation handle, e.g. "CRA" celex: str # e.g. "32024R2847" name_de: str collection: str = "bp_compliance_ce" version_date: str = "" # ISO date, e.g. "2024-10-23" legal_basis_rank: str = "eu_regulation" # or "eu_directive" @dataclass class Article: num: str title: str = "" body: list[str] = field(default_factory=list) chapter: str = "" @dataclass class Annex: num: str title: str = "" body: list[str] = field(default_factory=list) @dataclass class ParsedAct: reg: str articles: list[Article] annexes: list[Annex] @dataclass class UploadUnit: filename: str text: str meta: dict[str, Any] document_version: str collection: str def clean(fragment: str) -> str: """Strip tags, unescape entities and collapse whitespace.""" return " ".join(html_lib.unescape(_TAG_RE.sub("", fragment)).split()) def download_act(celex: str, *, client: httpx.Client | None = None) -> str: """Fetch an act's German XHTML — CELLAR first, eur-lex fallback. Raises RuntimeError if neither source yields a usable document (status 200 containing article markers). """ own_client = client is None http = client or httpx.Client(timeout=60.0, follow_redirects=True) try: attempts: tuple[tuple[str, dict[str, str]], ...] = ( ( CELLAR_URL.format(celex=celex), {"Accept-Language": "deu", "Accept": "application/xhtml+xml, text/html;q=0.9"}, ), (EURLEX_URL.format(celex=celex), {}), ) for url, extra in attempts: try: resp = http.get(url, headers={"User-Agent": _USER_AGENT, **extra}) except httpx.HTTPError as exc: logger.warning("legal-act fetch error for %s: %s", url, exc) continue if resp.status_code == 200 and "oj-ti-art" in resp.text: logger.info("downloaded CELEX %s from %s (%d chars)", celex, url, len(resp.text)) return resp.text logger.warning( "no usable doc for CELEX %s from %s (status=%s, len=%d)", celex, url, resp.status_code, len(resp.text), ) raise RuntimeError(f"no usable XHTML for CELEX {celex} (CELLAR + eur-lex failed)") finally: if own_client: http.close() def refs_out(reg: str, text: str) -> list[str]: """Forward citation edges found in `text`: Art→Art, Art→§ (BDSG), Art→Annex.""" out = {f"Art. {m} {reg}" for m in _ART_RE.findall(text)} out |= {f"§ {m} BDSG" for m in _PARA_RE.findall(text)} out |= {f"{reg} Anhang {m}" for m in _ANNEX_REF_RE.findall(text)} return sorted(out) def parse_html(raw: str, reg: str) -> ParsedAct: """Parse eur-lex/CELLAR XHTML into articles + annexes (no recitals). Text before the first article (recitals/preamble) is ignored on purpose — that is RecitalIngester's job (Parser 2). """ articles: list[Article] = [] annexes: list[Annex] = [] cur: Article | None = None ann: Annex | None = None chapter = "" for cls, inner in _P_RE.findall(raw): txt = clean(inner) if not txt: continue annex_match = _ANNEX_RE.match(txt) if cls == "doc-ti" else None if annex_match: if cur is not None: articles.append(cur) cur = None if ann is not None: annexes.append(ann) ann = Annex(num=annex_match.group(1)) continue if ann is not None: # annex mode if cls in ("doc-ti", "ti-grseq-1") and not ann.title: ann.title = txt elif cls in ("normal", "expanded", "ti-grseq-1"): ann.body.append(txt) continue if cls == "doc-ti": # document title / preamble headings continue if cls == "ti-grseq-1": # chapter / section heading chapter = txt continue if cls == "ti-art": art_match = _ART_RE.match(txt) if art_match: if cur is not None: articles.append(cur) cur = Article(num=art_match.group(1), chapter=chapter) continue if cls == "sti-art" and cur is not None: cur.title = txt continue if cls in ("normal", "expanded") and cur is not None: cur.body.append(txt) if cur is not None: articles.append(cur) if ann is not None: annexes.append(ann) return ParsedAct(reg=reg, articles=articles, annexes=annexes) def self_test(act: ParsedAct) -> tuple[bool, list[str]]: """Gate the parse before upload. Empty annexes (tables) do NOT fail — they are skipped on upload. Returns (passed, problems).""" problems: list[str] = [] if not act.articles: problems.append("0 articles parsed") nums = [a.num for a in act.articles] if len(nums) != len(set(nums)): problems.append("duplicate article numbers") short = [a.num for a in act.articles if len(" ".join(a.body)) < 15] if short: problems.append(f"{len(short)} empty articles (e.g. {short[:3]})") return (not problems, problems) def _base_meta(spec: RegSpec) -> dict[str, Any]: return { "regulation_code": spec.reg, "regulation_short": spec.reg, "regulation_name_de": spec.name_de, "citation_style": "article", "document_type": "legal_act", "source_class": "binding_law", "bindingness": "binding", "authority_level": 95, "authority_weight": 100, "source_type": "law", "issuer": "European Union", "jurisdiction": "EU", "legal_basis_rank": spec.legal_basis_rank, "version_date": spec.version_date, "source": "eur-lex.europa.eu", "license": "public_eu", "category": "recht", "celex": spec.celex, "use_for_primary": True, } def _article_meta(spec: RegSpec, art: Article) -> dict[str, Any]: cu = f"Art. {art.num} {spec.reg}" meta = _base_meta(spec) meta.update({ "citation_unit": cu, "article_label": cu, "parent_citation_unit": cu, "is_citable": True, "article": art.num, "context_hierarchy": [art.chapter] if art.chapter else [], "display_context": (art.chapter + " > " if art.chapter else "") + cu, "chunk_scope": "section", "article_title": art.title, "article_type": "obligation", "references_out": refs_out(spec.reg, " ".join(art.body)), "norm_id": f"EU-{spec.reg.replace(' ', '')}-Art{art.num}", }) return meta def _annex_meta(spec: RegSpec, annex: Annex) -> dict[str, Any]: cu = f"{spec.reg} Anhang {annex.num}" meta = _base_meta(spec) meta.update({ "citation_unit": cu, "article_label": cu, "parent_citation_unit": cu, "is_citable": True, "article": f"Anhang-{annex.num}", # distinct → avoids point-ID collisions "context_hierarchy": [f"Anhang {annex.num}"], "display_context": cu, "chunk_scope": "annex", "article_title": annex.title, "article_type": "requirement", "references_out": refs_out(spec.reg, " ".join(annex.body)), "norm_id": f"EU-{spec.reg.replace(' ', '')}-Anhang{annex.num}", }) return meta def build_upload_units(act: ParsedAct, spec: RegSpec, run_tag: str) -> list[UploadUnit]: """One UploadUnit per article/annex. Articles share a document_version; each annex gets its own (the RAG service derives `article` from text and would otherwise collide annexes on chunk_index). Empty annexes are skipped. """ slug = spec.reg.lower().replace(" ", "") base_version = f"{run_tag}-{slug}" units: list[UploadUnit] = [] for art in act.articles: text = f"Art. {art.num} {spec.reg} {art.title}\n\n" + "\n\n".join(art.body) units.append(UploadUnit( filename=f"{slug}_art{art.num}.txt", text=text, meta=_article_meta(spec, art), document_version=base_version, collection=spec.collection, )) for annex in act.annexes: if len(" ".join(annex.body)) < _EMPTY_ANNEX_CHARS: continue # table / correspondence list — no usable prose text = f"{spec.reg} Anhang {annex.num} {annex.title}\n\n" + "\n\n".join(annex.body) units.append(UploadUnit( filename=f"{slug}_anhang{annex.num}.txt", text=text, meta=_annex_meta(spec, annex), document_version=f"{base_version}-anhang{annex.num}", collection=spec.collection, )) return units def upload_unit(client: httpx.Client, rag_url: str, unit: UploadUnit) -> int: """Upload one unit to the RAG service. Returns the chunk count (0 on non-200).""" data = { "collection": unit.collection, "data_type": "compliance", "bundesland": "eu", "use_case": "legal_reference", "year": (unit.meta.get("version_date") or "")[:4] or "2026", "chunk_strategy": "legal", "chunk_size": CHUNK_SIZE, "chunk_overlap": CHUNK_OVERLAP, "metadata_json": json.dumps(unit.meta, ensure_ascii=False), "document_version": unit.document_version, } resp = client.post( f"{rag_url}/api/v1/documents/upload", files={"file": (unit.filename, unit.text.encode("utf-8"), "text/plain")}, data=data, ) if resp.status_code != 200: logger.error("upload %s failed: %s %s", unit.filename, resp.status_code, resp.text[:200]) return 0 return int(resp.json().get("chunks_count", 0))