ffbedfa0dc
Wake-up #2 (Domaene 2): Zitierfaehigkeit ohne char-Level-Spans via logischem norm_id-Join auf KB-v2-Units (bp_compliance_kb_2026_1_build). Konvention (Board Compliance/KB-v2 2026-07-01): EU-<ACT>-Anhang<ROM> (Annex-Ebene, confirmed) / EU-<ACT>-Art<N> + EU-<ACT>-Kapitel<ROM> (verify_pending). Namensvariante EU-MaschVO-* (NICHT MaschinenVO). KEINE neue Klasse — norm_ids ist ein Attribut auf legal_basis (freeze-safe). - 65/65 legal_basis gejoint (CRA 40 + MaschVO 25), 0 unparsed; 64 Obligations citation_status -> norm_id_linked (BP/guidance-anchored bleiben ohne norm_id). - 53 annex_confirmed, 12 verify_pending; distinkt 5 Annex-IDs + 19 Art/Kapitel. - norm_id_manifest.json = KB-v2-Handoff (verify_pending Art-/Kapitel-IDs pruefen). - Granularitaet annex-grob (Part/Punkt = KB-Enhancement TBD); Artikel-norm_ids in KB-v2 noch zu verifizieren. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
76 lines
3.2 KiB
Python
76 lines
3.2 KiB
Python
"""Citability join (wake-up #2): logical norm_id join on legal_basis entries.

KB-v2 convention (Board 2026-07-01, Compliance/KB-v2): `EU-<ACT>-Anhang<ROM>` (annex level, coarse) ·
`EU-<ACT>-Art<N>` (article, still to be verified in KB-v2) · chapter = convention TBD.
Naming variant: `EU-MaschVO-*` (NOT MaschinenVO). No char-level span needed — logical join on norm_id.
Adds `norm_ids` (list) and `norm_id_status` to every legal_basis entry; sets obligation.citation_status
to `norm_id_linked` (annex-coarse). NO new class (attribute only). Freeze-safe.
"""
|
|
from __future__ import annotations
|
|
|
|
import glob
|
|
import json
|
|
import re
|
|
|
|
# Maps the `source` label of a legal_basis entry to the act token used inside
# norm_ids. "MaschinenVO" is folded into "MaschVO"; labels that are not listed
# here are passed through unchanged by derive().
ACT = {"CRA": "CRA", "MaschVO": "MaschVO", "MaschinenVO": "MaschVO"}
# Obligation files processed by main(). The pattern is relative to the current
# working directory and is resolved once, when the module is imported.
FILES = sorted(glob.glob("obligations/cra*.json"))


def derive(source: str, anchor: str) -> tuple[list[str], str]:
    """Derive norm_ids and a link status from one legal_basis entry.

    The anchor text is scanned for three kinds of references:

    * ``Anhang <ROM>`` / ``Annex <ROM>`` (case-insensitive)
      -> ``EU-<act>-Anhang<ROM>``
    * ``Art <N>`` / ``Art. <N>`` / ``Article <N>`` (case-sensitive)
      -> ``EU-<act>-Art<N>``
    * ``Kapitel <ROM>[/<ROM>...]`` (case-insensitive)
      -> one ``EU-<act>-Kapitel<ROM>`` per slash-separated numeral

    Args:
        source: Act label of the legal basis; translated through ``ACT`` and
            used verbatim when it has no entry there.
        anchor: Free-text reference. ``None`` (e.g. a JSON ``null``) is
            treated like an empty string instead of raising ``TypeError``.

    Returns:
        ``(norm_ids, status)``. ``norm_ids`` lists annex ids first, then
        article and chapter ids, with duplicates removed and first-occurrence
        order kept. ``status`` is ``"annex_confirmed"`` if at least one annex
        id was found, otherwise ``"verify_pending"`` if any article or chapter
        id was found, otherwise ``"unparsed"``.
    """
    act = ACT.get(source, source)
    anchor = anchor or ""

    annex_ids = [
        f"EU-{act}-Anhang{rom.upper()}"
        for rom in re.findall(r"An(?:hang|nex)\s+([IVX]+)", anchor, re.I)
    ]

    pending = [
        f"EU-{act}-Art{number}"
        for number in re.findall(r"\bArt(?:icle|\.)?\s*(\d+)", anchor)
    ]
    for group in re.findall(r"Kapitel\s+([IVX/]+)", anchor, re.I):
        # "II/III" names several chapters; empty parts come from stray slashes.
        pending.extend(
            f"EU-{act}-Kapitel{rom.upper()}" for rom in group.split("/") if rom
        )

    # dict.fromkeys drops duplicates while keeping first-occurrence order, so
    # annex ids stay ahead of the article/chapter ids.
    ordered = list(dict.fromkeys(annex_ids + pending))

    if annex_ids:
        status = "annex_confirmed"
    elif pending:
        status = "verify_pending"
    else:
        status = "unparsed"
    return ordered, status
|
|
|
|
|
|
def main() -> None:
    """Annotate every file in ``FILES`` with norm_ids and rewrite it in place.

    For each file:

    * a ``norm_id_contract`` entry describing the id convention is added to
      the top-level object unless one is already present;
    * every ``legal_basis`` entry of every obligation receives ``norm_ids``
      and ``norm_id_status`` as computed by ``derive``;
    * an obligation with at least one legal basis that produced norm_ids gets
      ``citation_status = "norm_id_linked"``.

    Legal bases that could not be parsed are printed as they are found, and
    two summary lines with the counters are printed at the end.
    """
    total_lb = linked = unparsed = 0
    obl_linked = 0
    for path in FILES:
        # Context manager so the read handle is closed deterministically.
        with open(path, encoding="utf-8") as fh:
            doc = json.load(fh)
        doc.setdefault("norm_id_contract", {
            "convention": "EU-<ACT>-Anhang<ROM> (Annex-Ebene) / EU-<ACT>-Art<N> (verify) — KB-v2 bp_compliance_kb_2026_1_build",
            "act_naming": "EU-MaschVO-* (NICHT MaschinenVO)",
            "granularity": "annex-grob — 'Annex I Part II (1)' -> EU-CRA-AnhangI; Part/Punkt = KB-Enhancement TBD",
            "article_status": "EU-<ACT>-Art<N> in KB-v2 noch zu verifizieren; Annex-IDs confirmed",
            "source": "Board Compliance/KB-v2 2026-07-01",
        })
        for obligation in doc.get("obligations", []):
            has_norm_id = False
            for basis in obligation.get("legal_basis", []):
                total_lb += 1
                nids, status = derive(basis.get("source", ""), basis.get("anchor", ""))
                basis["norm_ids"] = nids
                basis["norm_id_status"] = status
                if nids:
                    linked += 1
                    has_norm_id = True
                if status == "unparsed":
                    unparsed += 1
                    print(f" UNPARSED: {basis.get('source')} \"{basis.get('anchor')}\"")
            if has_norm_id:
                obligation["citation_status"] = "norm_id_linked"
                obl_linked += 1
        # Serialise before opening for writing: opening with "w" truncates the
        # file, so an encoding failure must not happen after that point.
        payload = json.dumps(doc, ensure_ascii=False, indent=1)
        with open(path, "w", encoding="utf-8") as fh:
            fh.write(payload)
    print(f"legal_basis gesamt {total_lb} | mit norm_ids {linked} | unparsed {unparsed}")
    print(f"Obligations citation_status -> norm_id_linked: {obl_linked}")
|
|
|
|
|
|
# Run the join only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|