# breakpilot-core/control-pipeline/services/recital_ingester.py

"""RecitalIngester (Parser 2): ingests EU act recitals (Erwägungsgründe) as a
SEPARATE, interpretative source — never a primary obligation source.

In eur-lex / CELLAR XHTML each recital sits in a preamble block
<div class="eli-subdivision" id="rct_N"> with the marker "(N)" and the text in
adjacent table cells, which is why a naive article parser finds none. This
parser keys on the id="rct_N" markers and joins the recital's prose.

Recitals are tagged source_class=recital / authority_weight=60 /
use_for_primary=false, so they rank below binding articles and surface only as
interpretation context (and trip the human-review flag if they ever top
results). Reuses the eur-lex download + helpers from legal_act_ingester
(Parser 1).
"""

from __future__ import annotations

import re
from dataclasses import dataclass
from typing import Any

from services.legal_act_ingester import RegSpec, UploadUnit, clean, refs_out

# Value written to the "authority_weight" metadata field of every recital.
RECITAL_WEIGHT = 60

# Finds the id="rct_N" attribute that opens a recital; group 1 is the number N.
_RCT_DIV_RE = re.compile(r'id="rct_(\d+)"')
# Captures the inner markup of each <p class="oj-normal"> paragraph; re.S lets
# the capture span line breaks.
_OJ_P_RE = re.compile(r'<p[^>]*class="oj-normal"[^>]*>(.*?)</p>', re.S)
# Matches a paragraph that is nothing but the "(N)" recital marker.
_RCT_NUM_RE = re.compile(r"^\(\d+\)$")
# Recitals whose joined text is shorter than this many characters are dropped.
_MIN_RECITAL_CHARS = 20


@dataclass
class Recital:
    """A single parsed recital: its number and its joined prose."""

    # Recital number as captured from the id="rct_N" marker (digits, kept as str).
    num: str
    # Cleaned paragraph texts joined with single spaces, "(N)" marker excluded.
    text: str


def parse_recitals(raw: str, reg: str) -> list[Recital]:
    """Collect the recitals found in the preamble of `raw`.

    The preamble is everything before the first 'class="oj-ti-art"' occurrence
    (the whole document when there is none). Every id="rct_N" marker inside it
    opens a recital that runs up to the next marker, or to the end of the
    preamble for the last one. Within that span the oj-normal paragraphs are
    cleaned, empty ones and bare "(N)" markers are discarded, and the rest is
    joined with single spaces. Recitals shorter than _MIN_RECITAL_CHARS are
    skipped.

    `reg` is not used; it is accepted for symmetry with the other parsers
    (recitals carry no reg in-text).
    """
    _ = reg

    preamble_end = raw.find('class="oj-ti-art"')
    if preamble_end == -1:
        preamble_end = len(raw)

    numbers: list[str] = []
    offsets: list[int] = []
    for hit in _RCT_DIV_RE.finditer(raw, 0, preamble_end):
        numbers.append(hit.group(1))
        offsets.append(hit.start())

    # Each recital ends where the next one starts; the last ends with the preamble.
    limits = offsets[1:] + [preamble_end]

    found: list[Recital] = []
    for number, begin, limit in zip(numbers, offsets, limits):
        kept: list[str] = []
        # Scan the [begin, limit) window in place instead of slicing `raw`.
        for fragment in _OJ_P_RE.findall(raw, begin, limit):
            cleaned = clean(fragment)
            if not cleaned or _RCT_NUM_RE.match(cleaned):
                continue
            kept.append(cleaned)
        joined = " ".join(kept)
        if len(joined) < _MIN_RECITAL_CHARS:
            continue
        found.append(Recital(num=number, text=joined))
    return found


def self_test(recitals: list[Recital]) -> tuple[bool, list[str]]:
    """Gate before upload: return (ok, problems) for a parsed recital list.

    An empty list is reported as a parse failure (every EU act has recitals),
    and a repeated recital number is reported as a duplicate. `ok` is True
    only when no problem was found.
    """
    if not recitals:
        return (False, ["0 recitals parsed"])

    # A set collapses repeated numbers, so a smaller size means duplicates.
    distinct = {rc.num for rc in recitals}
    if len(distinct) != len(recitals):
        return (False, ["duplicate recital numbers"])

    return (True, [])


def _recital_meta(spec: RegSpec, rc: Recital) -> dict[str, Any]:
    """Build the upload metadata dict for one recital of the act in `spec`.

    The result marks the unit as an interpretative recital
    (use_for_primary=False) and carries the act's identifiers taken from
    `spec` plus recital-specific citation fields derived from `rc.num`.
    """
    reg_code = spec.reg
    label = f"{reg_code} Erwägungsgrund {rc.num}"
    compact_reg = reg_code.replace(" ", "")

    act_identity: dict[str, Any] = {
        "regulation_code": reg_code,
        "regulation_short": reg_code,
        "regulation_name_de": spec.name_de,
    }
    classification: dict[str, Any] = {
        "citation_style": "recital",
        "document_type": "legal_act",
        "source_class": "recital",
        "bindingness": "interpretative",
        "authority_level": 60,
        "authority_weight": RECITAL_WEIGHT,
        "source_type": "law",
        "issuer": "European Union",
        "jurisdiction": "EU",
    }
    provenance: dict[str, Any] = {
        "legal_basis_rank": spec.legal_basis_rank,
        "version_date": spec.version_date,
        "source": "eur-lex.europa.eu",
        "license": "public_eu",
        "category": "recht",
        "celex": spec.celex,
    }
    recital_fields: dict[str, Any] = {
        # Interpretative only: never a primary obligation source.
        "use_for_primary": False,
        "is_recital": True,
        "citation_unit": label,
        "article_label": label,
        # Distinct value so recitals avoid point-ID collisions.
        "article": f"Erwaegungsgrund-{rc.num}",
        "chunk_scope": "recital",
        "article_type": "recital",
        "references_out": refs_out(reg_code, rc.text),
        "norm_id": f"EU-{compact_reg}-Rec{rc.num}",
    }
    # Merge in this order so the key order of the resulting dict is stable.
    return {**act_identity, **classification, **provenance, **recital_fields}


def build_upload_units(recitals: list[Recital], spec: RegSpec, run_tag: str) -> list[UploadUnit]:
    """Turn each recital into its own UploadUnit, in input order.

    Every unit gets a separate document_version of the form
    "<run_tag>-<slug>-rec<N>" (the RAG service derives `article` from text and
    would otherwise collide recitals). The slug is `spec.reg` lower-cased with
    spaces removed, and the unit text is a heading line, a blank line, then the
    recital prose.
    """
    reg_slug = spec.reg.lower().replace(" ", "")
    version_prefix = f"{run_tag}-{reg_slug}"

    def to_unit(recital: Recital) -> UploadUnit:
        # Heading line first so the stored text is self-describing.
        return UploadUnit(
            filename=f"{reg_slug}_rec{recital.num}.txt",
            text=f"{spec.reg} Erwägungsgrund {recital.num}\n\n{recital.text}",
            meta=_recital_meta(spec, recital),
            document_version=f"{version_prefix}-rec{recital.num}",
            collection=spec.collection,
        )

    return [to_unit(recital) for recital in recitals]