"""Zitierfähigkeits-Join (Wake-up #2): logischer norm_id-Join auf legal_basis. KB-v2-Konvention (Board 2026-07-01, Compliance/KB-v2): `EU--Anhang` (Annex-Ebene, grob) · `EU--Art` (Artikel, in KB-v2 noch zu verifizieren) · Kapitel = TBD-Konvention. Namensvariante: `EU-MaschVO-*` (NICHT MaschinenVO). Kein char-Span nötig — logischer Join auf norm_id. Fügt `norm_ids` (Liste) je legal_basis + `norm_id_status` hinzu; setzt obligation.citation_status auf `norm_id_linked` (annex-grob). KEINE neue Klasse (Attribut). Freeze-safe. """ from __future__ import annotations import glob import json import re ACT = {"CRA": "CRA", "MaschVO": "MaschVO", "MaschinenVO": "MaschVO"} FILES = sorted(glob.glob("obligations/cra*.json")) def derive(source: str, anchor: str) -> tuple[list[str], str]: act = ACT.get(source, source) ids: list[str] = [] for rom in re.findall(r"An(?:hang|nex)\s+([IVX]+)", anchor, re.I): ids.append(f"EU-{act}-Anhang{rom.upper()}") articles = re.findall(r"\bArt(?:icle|\.)?\s*(\d+)", anchor) chapters = re.findall(r"Kapitel\s+([IVX/]+)", anchor, re.I) verify: list[str] = [] for n in articles: verify.append(f"EU-{act}-Art{n}") for grp in chapters: for rom in grp.split("/"): rom = rom.strip() if rom: verify.append(f"EU-{act}-Kapitel{rom.upper()}") # dedup, Annexe zuerst (confirmed), dann verify seen: set[str] = set() ordered = [x for x in ids + verify if not (x in seen or seen.add(x))] status = "annex_confirmed" if ids else ("verify_pending" if verify else "unparsed") return ordered, status def main() -> None: total_lb = linked = unparsed = 0 obl_linked = 0 for f in FILES: d = json.load(open(f, encoding="utf-8")) d.setdefault("norm_id_contract", { "convention": "EU--Anhang (Annex-Ebene) / EU--Art (verify) — KB-v2 bp_compliance_kb_2026_1_build", "act_naming": "EU-MaschVO-* (NICHT MaschinenVO)", "granularity": "annex-grob — 'Annex I Part II (1)' -> EU-CRA-AnhangI; Part/Punkt = KB-Enhancement TBD", "article_status": "EU--Art in KB-v2 noch zu verifizieren; Annex-IDs confirmed", "source": "Board Compliance/KB-v2 2026-07-01", }) for o in d.get("obligations", []): got = False for b in o.get("legal_basis", []): total_lb += 1 nids, st = derive(b.get("source", ""), b.get("anchor", "")) b["norm_ids"] = nids b["norm_id_status"] = st if nids: linked += 1 got = True if st == "unparsed": unparsed += 1 print(f" UNPARSED: {b.get('source')} \"{b.get('anchor')}\"") if got: o["citation_status"] = "norm_id_linked" obl_linked += 1 json.dump(d, open(f, "w", encoding="utf-8"), ensure_ascii=False, indent=1) print(f"legal_basis gesamt {total_lb} | mit norm_ids {linked} | unparsed {unparsed}") print(f"Obligations citation_status -> norm_id_linked: {obl_linked}") if __name__ == "__main__": main()