"""RecitalIngester (Parser 2): ingests EU act recitals (Erwägungsgründe) as a SEPARATE, interpretative source — never a primary obligation source. In eur-lex / CELLAR XHTML each recital sits in a preamble block
]*class="oj-normal"[^>]*>(.*?)
# NOTE(review): the statements defining _RCT_DIV_RE and _OJ_P_RE belong directly
# above this point, but their text is damaged in this copy of the file (only the
# tail "', re.S)" of the _OJ_P_RE pattern survived). Restore both from the
# original source; parse_recitals() below depends on them.

# A paragraph consisting solely of a bracketed number such as "(12)".
_RCT_NUM_RE = re.compile(r"^\(\d+\)$")
# Recital bodies shorter than this many characters are discarded.
_MIN_RECITAL_CHARS = 20


@dataclass
class Recital:
    """A single parsed recital: its number and its cleaned body text."""

    num: str   # recital number as captured by group 1 of _RCT_DIV_RE
    text: str  # body paragraphs joined with single spaces


def parse_recitals(raw: str, reg: str) -> list[Recital]:
    """Extract recitals from the preamble via the id="rct_N" markers.

    Args:
        raw: Document markup to scan.
        reg: Accepted for symmetry with the other parsers (recitals carry no
            reg in-text); it is not used.

    Returns:
        One ``Recital`` per marker whose joined body is at least
        ``_MIN_RECITAL_CHARS`` characters long, in document order.
    """
    _ = reg
    # Only scan up to the first 'class="oj-ti-art"' occurrence; if it is absent
    # the whole input is scanned.
    end = raw.find('class="oj-ti-art"')
    if end < 0:
        end = len(raw)
    # (recital number, start offset) for every marker before the cut-off.
    markers = [
        (m.group(1), m.start())
        for m in _RCT_DIV_RE.finditer(raw, 0, end)
    ]
    recitals: list[Recital] = []
    for i, (num, start) in enumerate(markers):
        # A recital runs until the next marker, or until the cut-off for the last one.
        stop = markers[i + 1][1] if i + 1 < len(markers) else end
        # clean() is defined outside this block — presumably strips markup; verify.
        parts = [clean(inner) for inner in _OJ_P_RE.findall(raw[start:stop])]
        # Drop empty paragraphs and the bare "(N)" number paragraph.
        body = " ".join(p for p in parts if p and not _RCT_NUM_RE.match(p))
        if len(body) >= _MIN_RECITAL_CHARS:
            recitals.append(Recital(num=num, text=body))
    return recitals


def self_test(recitals: list[Recital]) -> tuple[bool, list[str]]:
    """Gate before upload. Every EU act has recitals, so 0 is a parse failure.

    Returns:
        ``(ok, problems)`` where ``problems`` lists every failed check and
        ``ok`` is True only when that list is empty.
    """
    problems: list[str] = []
    if not recitals:
        problems.append("0 recitals parsed")
    nums = [r.num for r in recitals]
    if len(nums) != len(set(nums)):
        problems.append("duplicate recital numbers")
    return (not problems, problems)


def _recital_meta(spec: RegSpec, rc: Recital) -> dict[str, Any]:
    """Build the metadata dict attached to the upload unit of one recital.

    Reads ``reg``, ``name_de``, ``legal_basis_rank``, ``version_date`` and
    ``celex`` from ``spec``; everything else is constant or derived from ``rc``.
    """
    cu = f"{spec.reg} Erwägungsgrund {rc.num}"
    return {
        "regulation_code": spec.reg,
        "regulation_short": spec.reg,
        "regulation_name_de": spec.name_de,
        "citation_style": "recital",
        "document_type": "legal_act",
        "source_class": "recital",
        "bindingness": "interpretative",
        "authority_level": 60,
        "authority_weight": RECITAL_WEIGHT,
        "source_type": "law",
        "issuer": "European Union",
        "jurisdiction": "EU",
        "legal_basis_rank": spec.legal_basis_rank,
        "version_date": spec.version_date,
        "source": "eur-lex.europa.eu",
        "license": "public_eu",
        "category": "recht",
        "celex": spec.celex,
        "use_for_primary": False,  # interpretative — never a primary obligation source
        "is_recital": True,
        "citation_unit": cu,
        "article_label": cu,
        "article": f"Erwaegungsgrund-{rc.num}",  # distinct → avoids point-ID collisions
        "chunk_scope": "recital",
        "article_type": "recital",
        "references_out": refs_out(spec.reg, rc.text),
        "norm_id": f"EU-{spec.reg.replace(' ', '')}-Rec{rc.num}",
    }


def build_upload_units(recitals: list[Recital], spec: RegSpec, run_tag: str) -> list[UploadUnit]:
    """Create one UploadUnit per recital, each with its own document_version.

    The RAG service derives `article` from text and would otherwise collide
    recitals, hence the per-recital ``document_version``.

    Args:
        recitals: Parsed recitals to upload.
        spec: Regulation descriptor; ``reg`` and ``collection`` are read here,
            further fields by ``_recital_meta``.
        run_tag: Prefix for every generated ``document_version``.

    Returns:
        The upload units, in the same order as ``recitals``.
    """
    # Lower-cased regulation code without spaces, used in file names and versions.
    slug = spec.reg.lower().replace(" ", "")
    base = f"{run_tag}-{slug}"
    units: list[UploadUnit] = []
    for rc in recitals:
        # The uploaded text starts with a heading line naming the recital.
        text = f"{spec.reg} Erwägungsgrund {rc.num}\n\n{rc.text}"
        units.append(UploadUnit(
            filename=f"{slug}_rec{rc.num}.txt",
            text=text,
            meta=_recital_meta(spec, rc),
            document_version=f"{base}-rec{rc.num}",
            collection=spec.collection,
        ))
    return units