feat(control-pipeline): persist SGE Knowledge Compiler (capability execution engine)

Move the proven, behavior-equivalent SGE compiler from the macmini build env into the repo (release hygiene, no behavior change, no new capability/architecture): - scripts/sge_build.py — structured guidance extractor + capability execution engine. main() runs run_engine(): C6 Tables -> C7 ReadingOrder -> C1/C2 Sections -> C4 References, order derived topologically from each capability consume/produce contract (not hardcoded). Region ownership: C6 claims tables, C7 residual prose. - scripts/capability_pipeline.py — Region IR {id,bbox,type,state,owner}, claim/consume/produce capabilities, topological resolve_order(). - scripts/reading_order.py — C7 reading-order reconstruction (multi-column reflow; identity gate on single-column so output==input). Verified bit-identical: artifact graph (IDs, parent/child, metadata, text) unchanged vs the pre-engine direct path across 8 docs (4 layout families x EN/DE, 0 mismatch / 0 only_base / 0 only_engine); Golden degraded=0. BUILD_CP default is now __file__-relative so the script self-locates control-pipeline/services.
2026-06-28 13:38:18 +02:00
parent 3b466be140
commit a8412e3db7
3 changed files with 954 additions and 0 deletions
@@ -0,0 +1,92 @@
 """Capability Execution Engine (Prototyp). Region-IR mit Runtime-Owner; Capabilities
 deklarieren claims/consumes/produces; Ausführungsreihenfolge wird aus dem Artefakt-Graphen
 ABGELEITET (topologisch), nicht hartkodiert. Realisiert C6/C7 als Pipeline-Stages mit Region-Ownership."""
 from dataclasses import dataclass, field
 import reading_order as RO
@dataclass
 class Region:
    id:int; bbox:tuple; type:str="unknown"; state:str="unclaimed"; owner:str=None
@dataclass
 class Artifact:
    kind:str; payload:object; source_region:int=None
 class Capability:
    name=""; consumes=[]; produces=[]; residual=False
    def claims(self, region): return False
    def run(self, regions, page, artifacts): return []
 class TableExtraction(Capability):
    name="C6_TableExtraction"; consumes=["table_region"]; produces=["table_units"]
    def claims(self, region): return region.type=="table"
    def run(self, regions, page, artifacts):
        out=[]
        for r in regions:
            crop=page.crop(_clamp(r.bbox, page))
            tbls=crop.find_tables()
            rows=tbls[0].extract() if tbls else []
            out.append(Artifact("table_units", {"region":r.id,"rows":len(rows)}, r.id))
        return out
 class ReadingOrder(Capability):
    name="C7_ReadingOrder"; consumes=["prose_region"]; produces=["ordered_prose"]; residual=True
    def claims(self, region): return region.type=="prose"
    def run(self, regions, page, artifacts):
        table_bboxes=[a.payload for a in artifacts if a.kind=="_table_bbox"]
        ws=[w for w in page.extract_words() if not _in_any(w, table_bboxes)]
        text=RO.emit_words(ws, float(page.width)) if hasattr(RO,"emit_words") else ""
        return [Artifact("ordered_prose", {"words":len(ws),"chars":len(text)}, None)]
 class FigureExtraction(Capability):
    name="C8_FigureExtraction"; consumes=["figure_region"]; produces=["figure_units"]
    def claims(self, region): return region.type=="figure"
 class References(Capability):
    name="C4_References"; consumes=["ordered_prose"]; produces=["citation_units"]
 def _in_any(w, bboxes):
    cx=(w["x0"]+w["x1"])/2; cy=(w["top"]+w["bottom"])/2
    for (x0,t,x1,b) in bboxes:
        if x0<=cx<=x1 and t<=cy<=b: return True
    return False
 def _clamp(b, page):
    x0,t,x1,bt=b
    return (max(0,x0),max(0,t),min(float(page.width),x1),min(float(page.height),bt))
 def segment(page):
    regions=[]; rid=0; tbb=[]; W=float(page.width); H=float(page.height)
    for t in page.find_tables():
        b=_clamp(t.bbox,page)
        if (b[2]-b[0])>=0.25*W and (b[3]-b[1])>=25:   # substanzielle Tabelle, keine Footer-Artefakte
            regions.append(Region(rid,b,"table")); tbb.append(b); rid+=1
    regions.append(Region(rid,(0,0,W,H),"prose")); rid+=1
    return regions, tbb
 def resolve_order(caps, raw_types):
    available=set(raw_types); ordered=[]; remaining=list(caps)
    while remaining:
        progressed=False
        for c in list(remaining):
            if all(dep in available for dep in c.consumes):
                ordered.append(c); available.update(c.produces); remaining.remove(c); progressed=True
        if not progressed: raise RuntimeError("unsatisfiable: "+str([c.name for c in remaining]))
    return ordered
 def run_pipeline(page, caps):
    regions, tbb=segment(page)
    raw_types=set(r.type+"_region" for r in regions)
    order=resolve_order(caps, raw_types)
    artifacts=[Artifact("_table_bbox",b) for b in tbb]   # geometrie für C7-Ausschluss
    # CLAIM-Phase: spezifische Claimer zuerst, residual zuletzt
    for c in sorted(order, key=lambda c:c.residual):
        for r in regions:
            if r.state=="unclaimed" and c.claims(r):
                r.state="claimed"; r.owner=c.name
    # RUN-Phase in abgeleiteter Reihenfolge
    for c in order:
        owned=[r for r in regions if r.owner==c.name]
        if owned or any(dep=="ordered_prose" for dep in c.consumes):
            artifacts+= c.run(owned, page, artifacts)
    return regions, [a for a in artifacts if not a.kind.startswith("_")], order
@@ -0,0 +1,107 @@
 """C7 Reading Order Reconstruction (Pilot). ReadingRegion-Modell, Identity-Gate.
 Scope: NUR Detect Regions / Determine Order / Emit Linear. KEINE Tabellen/Bilder/Sidebars/Fussnoten/Callouts."""
 import statistics
 def _lines(words, ytol=3.0):
    ws=sorted(words,key=lambda w:(round(w["top"],1),w["x0"])); lines=[]; cur=[]; cy=None
    for w in ws:
        if cy is None or abs(w["top"]-cy)<=ytol: cur.append(w); cy=w["top"] if cy is None else cy
        else: lines.append(cur); cur=[w]; cy=w["top"]
    if cur: lines.append(cur)
    return lines
 def _gutters(words, W):
    G=160; xs=[W*i/G for i in range(G+1)]
    cov=[sum(1 for w in words if w["x0"]<=x<=w["x1"]) for x in xs]
    pos=[c for c in cov if c>0]
    if not pos: return []
    body=statistics.median(pos)
    if body<6: return []
    thr=max(1,0.15*body); gut=[]; i=0
    while i<=G:
        if cov[i]<thr and 0.15*W<=xs[i]<=0.85*W:
            j=i
            while j<=G and cov[j]<thr: j+=1
            if xs[min(j,G)]-xs[i]>=0.02*W: gut.append((xs[i]+xs[min(j-1,G)])/2)
            i=j
        else: i+=1
    return gut
 def detect_regions(pg):
    ws=pg.extract_words(); W=float(pg.width)
    if len(ws)<60: return {"type":"single","reason":"sparse"}, ws
    cuts=_gutters(ws,W)
    if not cuts: return {"type":"single","reason":"no-gutter"}, ws
    def rc(a,b): return sum(1 for w in ws if a<=(w["x0"]+w["x1"])/2<b)
    minw=max(25,0.12*len(ws))
    keep=list(cuts); changed=True
    while keep and changed:
        changed=False; bnds=[0]+keep+[W]
        cnt=[rc(bnds[i],bnds[i+1]) for i in range(len(bnds)-1)]
        mn=min(range(len(cnt)),key=lambda i:cnt[i])
        if cnt[mn]<minw:
            if mn==0: del keep[0]
            elif mn==len(cnt)-1: del keep[-1]
            elif cnt[mn-1]<=cnt[mn+1]: del keep[mn-1]
            else: del keep[mn]
            changed=True
    if not keep: return {"type":"single","reason":"thin-merged"}, ws
    bounds=[0]+keep+[W]; cols=[(bounds[k],bounds[k+1]) for k in range(len(bounds)-1)]
    return {"type":"multi","cols":cols,"cuts":keep,"ncols":len(cols)}, ws
 def emit_linear(pg):
    info,ws=detect_regions(pg)
    if info["type"]=="single": return pg.extract_text() or ""
    cuts=info["cuts"]; cols=info["cols"]; W=float(pg.width)
    def colidx(x):
        for k,c in enumerate(cols):
            if c[0]<=x<c[1]: return k
        return len(cols)-1
    seq=[]
    for ln in _lines(ws):
        sw=sorted(ln,key=lambda w:w["x0"]); frags=[[sw[0]]]
        for i in range(1,len(sw)):
            if any(sw[i-1]["x1"]<=c<=sw[i]["x0"] for c in cuts): frags.append([sw[i]])
            else: frags[-1].append(sw[i])
        for fr in frags:
            x0=min(w["x0"] for w in fr); x1=max(w["x1"] for w in fr); top=min(w["top"] for w in fr)
            text=" ".join(w["text"] for w in fr); spans=sum(1 for c in cuts if x0<c<x1)
            seq.append(("full",None,top,text) if spans>=1 else ("col",colidx((x0+x1)/2),top,text))
    out=[]; buf=[]
    def flush(b):
        res=[]
        for k in sorted(set(x[1] for x in b)):
            for x in sorted([x for x in b if x[1]==k], key=lambda x:x[2]): res.append(x[3])
        return res
    for it in seq:
        if it[0]=="full":
            if buf: out+=flush(buf); buf=[]
            out.append(it[3])
        else: buf.append(it)
    if buf: out+=flush(buf)
    return "\n".join(out)
 def emit_words(ws, W):
    flat=lambda L: " ".join(w["text"] for w in sorted(L,key=lambda w:(round(w["top"],1),w["x0"])))
    if len(ws)<60: return flat(ws)
    cuts=_gutters(ws,W)
    if not cuts: return flat(ws)
    bounds=[0]+cuts+[W]; cols=[(bounds[k],bounds[k+1]) for k in range(len(cuts)+1)]
    def colidx(x):
        for k,c in enumerate(cols):
            if c[0]<=x<c[1]: return k
        return len(cols)-1
    buf={}
    for ln in _lines(ws):
        sw=sorted(ln,key=lambda w:w["x0"]); frags=[[sw[0]]]
        for i in range(1,len(sw)):
            if any(sw[i-1]["x1"]<=c<=sw[i]["x0"] for c in cuts): frags.append([sw[i]])
            else: frags[-1].append(sw[i])
        for fr in frags:
            mid=(min(w["x0"] for w in fr)+max(w["x1"] for w in fr))/2
            buf.setdefault(colidx(mid),[]).append((min(w["top"] for w in fr)," ".join(w["text"] for w in fr)))
    out=[]
    for k in sorted(buf):
        for top,t in sorted(buf[k]): out.append(t)
    return chr(10).join(out)
@@ -0,0 +1,755 @@
 #!/usr/bin/env python3
 """Structured Guidance Extractor — GENERIC builder (Wave 1b compiler).
 One deterministic parser, parametrized per document + language.
  python3 sge_build.py --doc EDPB_WP248_DPIA --lang de [--dry-run]
 Source per (doc,lang) = {"pdf_url"} OR {"zip_url","inner"[, "local"]}.
 Layout parser identical to the WP243 pilot (pdfplumber MIT, NO LLM). Mehrsprachigkeit
 = representation property: SAME document_id, language=<lang>, own document_version
 namespace (no point-id collision). Emits chunks/page + refs/chunk metrics.
 """
 import argparse, datetime, io, logging, os, re, subprocess, sys, time, zipfile
 from collections import Counter
 CP = os.getenv("BUILD_CP") or os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.insert(0, CP)
 import httpx  # noqa: E402
 import pdfplumber  # noqa: E402
 from services.legal_act_ingester import UploadUnit, upload_unit  # noqa: E402
 logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
 log = logging.getLogger("sge")
 RAG_URL = os.getenv("RAG_URL", "https://localhost:8097")
 BUILD_COLLECTION = "bp_compliance_kb_2026_1_build"
 MANIFEST_VERSION = "regulatory_build_manifest_v3/2026.1"
 # --- Layout-Familien (Phase 2): detected family-String -> F-Nummer + Familienkarte (erwartete Familie je Quelle) ---
 FAMILY_F = {"arabic-hierarchical": "F1", "roman-hierarchical": "F2",
            "arabic-caps+paragraph": "F3", "unnumbered-toc": "F4"}
 EXPECTED_FAMILY = {  # Familienkarte: VOR-Build-Erwartung, beim Build gegen detected gegated (FAMILY-GATE)
    "EDPB_WP243_DPO": "F1", "EDPB_WP248_DPIA": "F2", "EDPB_GL_05_2020_CONSENT": "F3",
    "EDPB_GL_09_2022_BREACH_NOTIFICATION": "F2", "EDPB_GL_07_2020_CONTROLLER_PROCESSOR": "F3",
    "EDPB_WP260_TRANSPARENCY": "F4",
    "DSK_SDM": "F1", "DSK_OH_KI_2024": "F1", "DSK_OH_CLOUD_2014": "F1",
    "DSK_KP_05_DSFA": "F1",  # FAMILY-GATE-Korrektur: parst 13 Sekt. via Heuristik, kein edge-short
    "DSK_KP_12_DSB": "edge-short", "DSK_KP_13_AV": "edge-short",  # echte Kurz-Flyer (0 Sekt., gate=FAIL)
    # verbleibende EDPB (vorab klassifiziert = reasoned, beim Build verifiziert):
    "EDPB_GL_01_2021_BREACH_EXAMPLES": "F3", "EDPB_GL_02_2019_ART6_1B_ONLINE": "F3",
    "EDPB_REC_01_2020_SUPPL_MEASURES": "F3", "EDPB_GL_05_2021_ART3_CHAPTERV": "F3",
    # ENISA-Charge (3. Herausgeber): erwartete HEADING-Familie (F1-F4); Capabilities (Tabellen/Mehrspaltig) orthogonal
    "ENISA_HANDBOOK_PDP": "F1", "ENISA_GL_SME_PDP": "F1", "ENISA_GL_EECC_SEC": "F3",
    "ENISA_ISPS_SME": "F1", "ENISA_TL_2023": "?",  # TL = Capability-Boundary-Kandidat (mehrspaltig/visuell)
    # NIST (US, Public Domain) — 4. Herausgeber, doc_type=Technical Standard (refs=section/control), tabellenlastig
    "NIST_SP_800_53B": "F1", "NIST_SP_800_171": "F1",
 }
 # --- Manifest-Vertrag: document_type -> expected_reference_types (User 2026-06-28: DEKLARATIV, kein Parser-Bug).
 # Der Validator verlangt Artikel-Referenzen NUR, wenn der doc_type sie erwartet -> ENISA/NIST (Standards) scheitern nicht mehr.
 ISSUER_DOCTYPE = {"Article 29 WP / EDPB": "Guidance", "EDPB": "Guidance", "DSK": "Guidance",
                  "ENISA": "Technical Standard", "NIST": "Technical Standard"}
 DOC_TYPE_REFS = {"EU Regulation": ["article", "recital", "annex"], "German Law": ["paragraph"],
                 "Guidance": ["article", "guidance"], "Technical Standard": ["section", "control"],
                 "Threat Report": ["cve", "cwe", "attack"], "Whitepaper": ["bibliography"]}
 # --- per-document registry (resolved sources pinned) ---
 NS = "https://ec.europa.eu/newsroom/just/document.cfm?doc_id="
 DOCS = {
    "EDPB_WP243_DPO": {  # Pilot (regression reference) — arabisch-Schema
        "reg": "EDPB WP243", "issuer": "Article 29 WP / EDPB", "expected_sections": 9,
        "name": "EDPB/WP29 Guidelines on Data Protection Officers (DPOs), WP 243 rev.01",
        "sources": {
            "en": {"pdf_url": NS + "44100"},
            "de": {"zip_url": NS + "48137", "inner": "wp243rev01_de.pdf", "local": "/tmp/doc_48137.bin"},
        },
    },
    "EDPB_WP248_DPIA": {
        "reg": "EDPB WP248", "issuer": "Article 29 WP / EDPB", "expected_sections": 8,
        "name": "EDPB/WP29 Guidelines on Data Protection Impact Assessment (DPIA), WP 248 rev.01",
        "sources": {
            "en": {"pdf_url": NS + "47711"},
            "de": {"zip_url": NS + "48464", "inner": "wp248 rev.01_de.pdf", "local": "/tmp/wp248_48464.bin"},
        },
    },
    "EDPB_GL_05_2020_CONSENT": {
        "reg": "EDPB GL 05/2020", "issuer": "EDPB", "expected_sections": 7,
        "name": "EDPB Guidelines 05/2020 on consent under Regulation 2016/679 (v1.1)",
        "sources": {
            "en": {"pdf_url": "https://www.edpb.europa.eu/system/files/documents/files/file1/edpb_guidelines_202005_consent_en.pdf"},
            "de": {"pdf_url": "https://www.edpb.europa.eu/system/files/documents/files/file1/edpb_guidelines_202005_consent_de.pdf"},
        },
    },
    "EDPB_GL_09_2022_BREACH_NOTIFICATION": {
        "reg": "EDPB GL 09/2022", "issuer": "EDPB", "expected_sections": 6,
        "name": "EDPB Guidelines 9/2022 on personal data breach notification under GDPR (v2.0)",
        "sources": {
            "en": {"pdf_url": "https://www.edpb.europa.eu/system/files/2023-04/edpb_guidelines_202209_personal_data_breach_notification_v2.0_en.pdf"},
            "de": {"pdf_url": "https://www.edpb.europa.eu/system/files/2024-10/edpb_guidelines_202209_personal_data_breach_notification_v2.0_de_0.pdf"},
        },
    },
    "EDPB_GL_07_2020_CONTROLLER_PROCESSOR": {
        "reg": "EDPB GL 07/2020", "issuer": "EDPB", "expected_sections": 5,
        "name": "EDPB Guidelines 07/2020 on the concepts of controller and processor in the GDPR (v2.0)",
        "sources": {
            "en": {"pdf_url": "https://www.edpb.europa.eu/system/files/2023-10/EDPB_guidelines_202007_controllerprocessor_final_en.pdf"},
            "de": {"pdf_url": "https://www.edpb.europa.eu/system/files/2023-10/EDPB_guidelines_202007_controllerprocessor_final_de.pdf"},
        },
    },
    "EDPB_WP260_TRANSPARENCY": {
        "reg": "EDPB WP260", "issuer": "Article 29 WP / EDPB", "expected_sections": 5,
        "name": "EDPB/WP29 Guidelines on transparency under Regulation 2016/679, WP 260 rev.01",
        "sources": {
            "en": {"pdf_url": "https://www.edpb.europa.eu/system/files/documents/2023-09/wp260rev01_en.pdf"},
            "de": {"pdf_url": "https://www.edpb.europa.eu/system/files/documents/2023-09/wp260rev01_de.pdf"},
        },
    },
    "EDPB_GL_01_2021_BREACH_EXAMPLES": {
        "reg": "EDPB GL 01/2021", "issuer": "EDPB", "expected_sections": 4,
        "name": "EDPB Guidelines 01/2021 on Examples regarding Personal Data Breach Notification",
        "sources": {
            "en": {"pdf_url": "https://www.edpb.europa.eu/system/files/documents/2022-01/edpb_guidelines_012021_pdbnotification_adopted_en.pdf"},
            "de": {"pdf_url": "https://www.edpb.europa.eu/system/files/2022-04/edpb_guidelines_012021_pdbnotification_adopted_de.pdf"},
        },
    },
    "EDPB_GL_02_2019_ART6_1B_ONLINE": {
        "reg": "EDPB GL 02/2019", "issuer": "EDPB", "expected_sections": 4,
        "name": "EDPB Guidelines 2/2019 on processing under Article 6(1)(b) GDPR (online services)",
        "sources": {
            "en": {"pdf_url": "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines-art_6-1-b-adopted_after_public_consultation_en.pdf"},
            "de": {"pdf_url": "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines-art_6-1-b-adopted_after_public_consultation_de_0.pdf"},
        },
    },
    "EDPB_REC_01_2020_SUPPL_MEASURES": {
        "reg": "EDPB REC 01/2020", "issuer": "EDPB", "expected_sections": 4,
        "name": "EDPB Recommendations 01/2020 on supplementary measures for transfer tools",
        "sources": {
            "en": {"pdf_url": "https://www.edpb.europa.eu/system/files/2021-06/edpb_recommendations_202001vo.2.0_supplementarymeasurestransferstools_en.pdf"},
            "de": {"pdf_url": "https://www.edpb.europa.eu/system/files/2022-04/edpb_recommendations_202001vo.2.0_supplementarymeasurestransferstools_de.pdf"},
        },
    },
    "EDPB_GL_05_2021_ART3_CHAPTERV": {
        "reg": "EDPB GL 05/2021", "issuer": "EDPB", "expected_sections": 4,
        "name": "EDPB Guidelines 05/2021 on the Interplay of Article 3 and Chapter V GDPR",
        "sources": {
            "en": {"pdf_url": "https://www.edpb.europa.eu/system/files/2023-02/edpb_guidelines_05-2021_interplay_between_the_application_of_art3-chapter_v_of_the_gdpr_v2_en_0.pdf"},
            "de": {"pdf_url": "https://www.edpb.europa.eu/system/files/2023-09/edpb_guidelines_05-2021_interplay_between_the_application_de.pdf"},
        },
    },
    "DSK_SDM": {  # F4-Cross-Issuer-Kandidat (DSK, dt.)
        "reg": "DSK SDM", "issuer": "DSK", "expected_sections": 3,
        "name": "DSK Standard-Datenschutzmodell V3.1",
        "sources": {"de": {"pdf_url": "https://www.datenschutzkonferenz-online.de/media/ah/SDM-Methode-V31.pdf"}},
    },
    "DSK_OH_KI_2024": {
        "reg": "DSK OH KI", "issuer": "DSK", "expected_sections": 3,
        "name": "DSK Orientierungshilfe KI und Datenschutz (2024)",
        "sources": {"de": {"pdf_url": "https://www.datenschutzkonferenz-online.de/media/oh/20240506_DSK_Orientierungshilfe_KI_und_Datenschutz.pdf"}},
    },
    "DSK_KP_12_DSB": {
        "reg": "DSK KP12", "issuer": "DSK", "expected_sections": 2,
        "name": "DSK Kurzpapier Nr. 12 - Datenschutzbeauftragte",
        "sources": {"de": {"pdf_url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_12.pdf"}},
    },
    "DSK_OH_CLOUD_2014": {
        "reg": "DSK OH Cloud", "issuer": "DSK", "expected_sections": 3,
        "name": "DSK Orientierungshilfe - Cloud Computing (2014)",
        "sources": {"de": {"pdf_url": "https://www.datenschutzkonferenz-online.de/media/oh/20141009_oh_cloud_computing.pdf"}},
    },
    "DSK_KP_05_DSFA": {
        "reg": "DSK KP05", "issuer": "DSK", "expected_sections": 2,
        "name": "DSK Kurzpapier Nr. 5 - Datenschutz-Folgenabschätzung",
        "sources": {"de": {"pdf_url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_5.pdf"}},
    },
    "DSK_KP_13_AV": {
        "reg": "DSK KP13", "issuer": "DSK", "expected_sections": 2,
        "name": "DSK Kurzpapier Nr. 13 - Auftragsverarbeitung",
        "sources": {"de": {"pdf_url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_13.pdf"}},
    },
    "ENISA_HANDBOOK_PDP": {  # Stufe 1 — bekannte Welt (Kalibrierung)
        "reg": "ENISA Handbook PDP", "issuer": "ENISA", "expected_sections": 4,
        "name": "ENISA Handbook on Security of Personal Data Processing",
        "sources": {"en": {"pdf_url": "https://www.enisa.europa.eu/sites/default/files/publications/WP2017%20O-2-2-5%20GDPR%20Measures%20Handbook.pdf"}},
    },
    "ENISA_GL_SME_PDP": {  # Stufe 1
        "reg": "ENISA GL SME PDP", "issuer": "ENISA", "expected_sections": 4,
        "name": "ENISA Guidelines for SMEs on the security of personal data processing",
        "sources": {"en": {"pdf_url": "https://www.enisa.europa.eu/sites/default/files/publications/WP2016%203-2%206%20Data%20Controllers%20Risk.pdf"}},
    },
    "ENISA_GL_EECC_SEC": {  # Stufe 2 — bekannte Welt + Annex/Tabellen
        "reg": "ENISA GL EECC", "issuer": "ENISA", "expected_sections": 4,
        "name": "ENISA Guideline on Security Measures under the EECC (4th edition)",
        "sources": {"en": {"pdf_url": "https://www.enisa.europa.eu/sites/default/files/publications/ENISA%20-%20Guideline%20on%20Security%20Measures%20under%20the%20EECC-%204th%20edition.pdf"}},
    },
    "ENISA_ISPS_SME": {  # Stufe 3 — Grenztest (tabellenzentriert)
        "reg": "ENISA ISPS SME", "issuer": "ENISA", "expected_sections": 3,
        "name": "ENISA Information security and privacy standards for SMEs",
        "sources": {"en": {"pdf_url": "https://www.enisa.europa.eu/sites/default/files/publications/Information%20security%20and%20privacy%20standards%20for%20SMEs.pdf"}},
    },
    "ENISA_TL_2023": {  # Stufe 3 — Grenztest (mehrspaltig/visuell)
        "reg": "ENISA TL 2023", "issuer": "ENISA", "expected_sections": 3,
        "name": "ENISA Threat Landscape 2023",
        "sources": {"en": {"pdf_url": "https://www.enisa.europa.eu/sites/default/files/publications/ENISA%20Threat%20Landscape%202023.pdf"}},
    },
    "NIST_SP_800_53B": {  # NIST (US Public Domain) — tabellenzentriert (Control Baselines)
        "reg": "NIST SP 800-53B", "issuer": "NIST", "expected_sections": 3,
        "name": "NIST SP 800-53B Control Baselines for Information Systems and Organizations",
        "sources": {"en": {"pdf_url": "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-53B.pdf"}},
    },
    "NIST_SP_800_171": {  # NIST — strukturierte Guidance + Tabellen
        "reg": "NIST SP 800-171", "issuer": "NIST", "expected_sections": 3,
        "name": "NIST SP 800-171 Rev 2 Protecting Controlled Unclassified Information",
        "sources": {"en": {"pdf_url": "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-171r2.pdf"}},
    },
 }
 # --- language-specific lexical config ---
 LANG = {
    "en": {"artref": re.compile(r'(?:Article|Art\.)\s*(\d+)'),
           "noise": re.compile(r'ARTICLE 29 DATA PROTECTION|^\d{1,3}$|Adopted on|Revised and Adopted|Version \d', re.I)},
    "de": {"artref": re.compile(r'(?:Artikel|Art\.)\s*(\d+)'),
           "noise": re.compile(r'ARTIKEL.?29.?DATENSCHUTZ|ARTICLE 29 DATA PROTECTION|^\d{1,3}$|Angenommen|berarbeitet und|Adopted on|Fassung \d', re.I)},
 }
 # Generic enumerator: arabic (1 / 1.2.3) | roman (I. / III.B.) | Annex N.
 # \d{1,3} excludes years (4 digits) as section numbers. Roman requires trailing dot.
 ROMAN = r'(?:XVIII|VIII|XIII|XVII|III|VII|XII|XIV|XVI|XIX|II|IV|VI|IX|XI|XV|XX|I|V|X)'
 ENUM = re.compile(
    r'^(?:'
    r'(?P<ar>\d{1,3}(?:\.\d+){0,3})\.?'      # arabic 1 / 1.2.3 (self-pathed)
    r'|(?P<ro>' + ROMAN + r')\.'             # roman top-level I. / III. (require dot)
    r'|(?P<le>[A-Z])\.'                       # capital-letter sub A./B. (only under roman scheme)
    r'|(?P<ax>(?:Annex|Annexe|Anhang|Anlage)\s+\d+)'
    r')\s+(?P<title>\S.{0,84})$'
 )
 ANNEX_KW = ("ANNEX", "ANNEXE", "ANHANG", "ANLAGE")
 GDPR_HINT = re.compile(r'DSGVO|GDPR|2016/679|Verordnung \(EU\) 2016/679', re.I)
 def git_sha():
    try:
        return subprocess.check_output(["git", "-C", CP, "rev-parse", "--short", "HEAD"]).decode().strip()
    except Exception:
        return "unknown"
 CACHE_DIR = "/tmp/sge_cache"
 def _http_get(url, timeout=120.0, attempts=4):
    import time as _t
    last = None
    for i in range(attempts):
        try:
            with httpx.Client(timeout=timeout, follow_redirects=True, headers={"User-Agent": "Mozilla/5.0"}) as c:
                data = c.get(url).content
            if data:
                return data
            last = RuntimeError("empty body")
        except Exception as e:
            last = e
            log.info("download attempt %d/%d failed: %s", i + 1, attempts, e)
        _t.sleep(3 * (i + 1))
    raise RuntimeError("download failed after %d attempts: %s (%s)" % (attempts, url, last))
 def fetch(src):
    import hashlib
    os.makedirs(CACHE_DIR, exist_ok=True)
    cache_key = hashlib.md5((src.get("zip_url") or src.get("pdf_url") or "").encode()).hexdigest()
    cache_pdf = os.path.join(CACHE_DIR, cache_key + ".pdf")
    if os.path.exists(cache_pdf):
        pdf = open(cache_pdf, "rb").read()
        if pdf[:4] == b"%PDF":
            log.info("PDF cache hit %s (%d B)", cache_pdf, len(pdf))
            return pdf
    local = src.get("local", "")
    if "zip_url" in src:
        if local and os.path.exists(local):
            data = open(local, "rb").read(); log.info("ZIP local %s (%d B)", local, len(data))
        else:
            data = _http_get(src["zip_url"], timeout=180.0)
            log.info("ZIP downloaded (%d B)", len(data))
        pdf = zipfile.ZipFile(io.BytesIO(data)).read(src["inner"])
    else:
        if local and os.path.exists(local):
            pdf = open(local, "rb").read()
        else:
            pdf = _http_get(src["pdf_url"], timeout=120.0)
    if pdf[:4] != b"%PDF":
        raise RuntimeError("not a PDF: %r" % pdf[:16])
    try:
        open(cache_pdf, "wb").write(pdf)
    except Exception:
        pass
    return pdf
 def src_url(src):
    return src.get("zip_url") or src.get("pdf_url")
 def line_font(pg, ln):
    chs = [c for c in pg.chars if ln["top"] - 1 <= c["top"] <= ln["bottom"] + 1]
    if not chs:
        return 0.0, ""
    sz = Counter(round(c.get("size", 0), 1) for c in chs).most_common(1)[0][0]
    fn = Counter(c.get("fontname", "") for c in chs).most_common(1)[0][0]
    return sz, fn
 def _upper_ratio(title):
    letters = [c for c in title if c.isalpha()]
    return (sum(c.isupper() for c in letters) / len(letters)) if letters else 0.0
 _TTL_STOP = {"der", "die", "das", "den", "des", "dem", "und", "von", "zur", "zum", "für", "auf", "the",
             "of", "and", "to", "for", "in", "on", "an", "under", "with", "sur", "aux"}
 def _ttoks(s):
    out = set()
    for t in re.findall(r'[a-zà-ÿ0-9]+', s.lower()):
        t = re.sub(r'\d+$', '', t)  # Footnote-Ziffern entkleben ("freiwillig12" -> "freiwillig")
        if len(t) > 2 and t not in _TTL_STOP:
            out.add(t)
    return out
 def _is_caption(title):
    # strukturelle All-Caps-Überschrift (PREFACE/VORWORT/ANNEX) — auch ohne TOC-Eintrag gültig
    return _upper_ratio(title) >= 0.8 and len(title) <= 34
 # TOC-Zeile: <enum> <Titel> <lange Punktführung> <Seitenzahl>. EIGENE Regex (NICHT ENUM —
 # dessen Titel-Cap .{0,84} scheitert an 130+-Punkt-Führungen). Titel = non-greedy bis zur Führung.
 TOC_LINE = re.compile(r'^[A-Z0-9][\w./()\-]*\s+(?P<title>.+?)\s*\.{3,}\s*\d{1,3}\s*$')
 def extract_toc(all_lines, body_size):
    """TOC = explizite Strukturdeklaration des Dokuments. Sammelt die Titel-Token-Sets der
    Inhaltsverzeichnis-Einträge aus dem Frontmatter. Absätze stehen NIE im TOC. >=5 => TOC vorhanden."""
    titlesets = []
    for page_number, txt, s, _ in all_lines:
        if page_number > 8:
            break
        m = TOC_LINE.match(txt)
        if not m:
            continue
        ts = _ttoks(m.group("title"))
        if ts:
            titlesets.append(ts)
    return (len(titlesets) >= 5, titlesets)
 def _title_in_toc(title, titlesets):
    # Match relativ zum BODY (inter/|body|): eine echte Überschrift ist ~ein TOC-Titel (alle Body-Wörter
    # im TOC -> 1.0); ein Absatz, der Sektionswörter ENTHÄLT, hat Extra-Wörter (-> <0.75) -> kein Match.
    # Body-Titel ist bei 84 Zeichen gekappt = Präfix des vollen TOC-Titels, daher robust gegen Kappung.
    bt = _ttoks(title)
    if not bt:
        return False
    for ts in titlesets:
        if len(bt & ts) / len(bt) >= 0.75:
            return True
    return False
 def _looks_like_heading(txt, s, fn, body_size):
    # F4-Plausibilitätsfilter — NICHT die Entscheidung (die trifft das TOC). Trimmt nur Fließtext weg,
    # BEVOR das TOC befragt wird: kurz + nicht satzschließend. Bewusst KEIN Bold/Size-Zwang — das TOC
    # entscheidet, Format ist nur Filter (greift eh nur in als unnummeriert erkannten Docs).
    t = txt.rstrip()
    return 3 <= len(t) <= 90 and not t.endswith((".", ";", ":", ","))
 def parse_guidance(pdf_bytes, noise):
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        n_pages = len(pdf.pages)
        szc = Counter()
        all_lines = []  # (page_number, txt, size, fontname) — einmal extrahiert, zweimal genutzt
        n_tables = 0       # detected (Capability: Tables Detect)
        raw_tables = []    # (page_number, rows) — EXTRAHIERT (Capability: Tables Extract). Scope: gelinierte/einfache.
        for pg in pdf.pages:
            try:
                pts = pg.extract_tables()
            except Exception:
                pts = []
            n_tables += len(pts)
            for tbl in pts:
                # erster Scope: >=2 Zeilen + >=3 nicht-leere Zellen (keine verschachtelten/visuellen)
                if tbl and len(tbl) >= 2 and sum(1 for row in tbl for c in row if c and c.strip()) >= 3:
                    raw_tables.append((pg.page_number, [[(c or "").strip() for c in row] for row in tbl]))
            for ch in pg.chars:
                szc[round(ch.get("size", 0), 1)] += 1
            for ln in pg.extract_text_lines(layout=False):
                txt = " ".join(ln["text"].split())
                if not txt:
                    continue
                s, fn = line_font(pg, ln)
                all_lines.append((pg.page_number, txt, s, fn))
        total_chars = sum(szc.values())
        body_size = szc.most_common(1)[0][0]
        # Schema-Erkennung (selbstkalibrierend): nutzt das Dok ALL-CAPS bare-Integer-Sektionsüberschriften
        # (EDPB-Hausstil)? Dann nummeriert es auch Absätze (Satz-case) -> bare-Integer-Heading muss ALL-CAPS
        # sein. WP243 (Title-case-Sektionen, keine ALL-CAPS) -> Regel inaktiv, keine Regression. Schwelle 2
        # gegen einzelne Fluke-Caps-Zeile; EN/DE-formatierungsunabhängig (anders als bare-Integer-Magnitude).
        caps_secs = 0
        for _, txt, s, _ in all_lines:
            if s < body_size - 0.5:
                continue
            mm = ENUM.match(txt)
            if (mm and mm.group("ar") and "." not in mm.group("ar")
                    and not re.search(r'\s\d{1,3}$', mm.group("title"))
                    and _upper_ratio(mm.group("title")) >= 0.6):
                caps_secs += 1
        caps_scheme = caps_secs >= 2
        toc_present, toc = extract_toc(all_lines, body_size)
        # F4-DOC-Erkennung: decken NUMMERIERTE Headings das TOC ab? Wenn kaum -> die Headings sind unnummeriert
        # (F4) -> unnummerierten Pfad aktivieren. Sonst (F1-F3) AUS -> keine Regression.
        numbered_toc_hits = 0
        if toc_present:
            for _, txt, s, _ in all_lines:
                if s < body_size - 0.5:
                    continue
                mm = ENUM.match(txt)
                if not mm or re.search(r'\s\d{1,3}$', mm.group("title")):
                    continue
                ttl = mm.group("title").strip().rstrip('.').strip()
                if _title_in_toc(ttl, toc) or _is_caption(ttl):
                    numbered_toc_hits += 1
        unnumbered_doc = toc_present and numbered_toc_hits < max(3, len(toc) * 0.4)
        sections, cur, started, in_annex, seen_types, max_sec, un_count = [], None, False, False, set(), 0, 0
        top_en, top_type = None, None  # last level-1 enumerator (roman/arabic/annex) for sub-path assembly
        for page_number, txt, s, fn in all_lines:
            m = ENUM.match(txt)
            ok_head = bool(m) and s >= body_size - 0.5 and not re.search(r'\s\d{1,3}$', m.group("title"))
            if ok_head:
                if m.group("ar"): typ, en = "ar", m.group("ar")
                elif m.group("ro"): typ, en = "ro", m.group("ro")
                elif m.group("le"): typ, en = "le", m.group("le")
                else: typ, en = "ax", m.group("ax")
                title = m.group("title").strip().rstrip('.').strip()
                # Großbuchstaben-Sub nur in römisch-Schema (Label-Assembly) — immer
                if typ == "le" and top_type != "ro":
                    ok_head = False
                if ok_head and typ == "ar" and not in_annex:
                    if unnumbered_doc:
                        # F4-Doc: nummerierte Zeilen sind Absätze/Artikel-Refs ("13.1", "39.34"), nicht Headings
                        # -> nur akzeptieren, wenn der Titel im TOC steht.
                        if not _title_in_toc(title, toc):
                            ok_head = False
                    elif "." in en:
                        # dotted = echte Sub-Headings (immer ok), AUSSER Dezimal/Uhrzeit-FP: führende Null im
                        # Sub-Teil ("8.00 Uhr", "17.00"). Echte Sektionsnummern haben keine führende Null (3.1, 3.10).
                        if any(c.startswith("0") for c in en.split(".")[1:]):
                            ok_head = False
                    elif toc_present:
                        # bare-Integer: TOC = Wahrheit (Absätze stehen nie im TOC); All-Caps-Captions auch ohne TOC.
                        if not (_title_in_toc(title, toc) or _is_caption(title)):
                            ok_head = False
                    else:  # Fallback-Heuristiken nur für bare-Integer (kein TOC)
                        if top_type == "ro":
                            ok_head = False
                        elif caps_scheme and _upper_ratio(title) < 0.6:
                            ok_head = False
                        elif int(en) > max_sec + 5:
                            ok_head = False
            elif (m is None and unnumbered_doc and _looks_like_heading(txt, s, fn, body_size)
                  and not re.search(r'\.\d+$', txt.strip()) and _title_in_toc(txt, toc)):
                # F4 — UNNUMMERIERTE Überschrift: TOC = die ENTSCHEIDUNG, looks_like_heading nur Plausibilitätsfilter.
                # Greift NUR wenn ENUM nicht matcht UND der Doc als unnummeriert erkannt wurde (F1-F3 unberührt).
                # NICHT mit ".<Ziffern>" endend -> Fußnoten-Fragmente ("environment.47") raus (schont DE-Komposita).
                ok_head, typ, en, title = True, "ut", None, txt.strip().rstrip('.').strip()
            if ok_head:
                seen_types.add(typ)
                if typ == "le":
                    num, lvl = ((top_en + "." + en) if top_en else en), 2
                elif typ == "ut":
                    un_count += 1
                    num, lvl = str(un_count), 1
                else:
                    num = en
                    lvl = (num.count('.') + 1) if typ == "ar" else 1
                    top_en, top_type = en, typ
                    if typ == "ar" and "." not in en and not in_annex:
                        max_sec = max(max_sec, int(en))
                cur = {"num": num, "title": title, "level": lvl, "in_annex": in_annex,
                       "bold": "bold" in fn.lower(), "page": page_number, "body": []}
                sections.append(cur); started = True
                if typ == "ax" or any(k in (num + " " + title).upper() for k in ANNEX_KW):
                    in_annex = True
            elif started and cur is not None:
                if s >= body_size - 2 and not noise.match(txt):
                    cur["body"].append(txt)
        # TOC-Duplikate generisch entfernen: gleicher LABEL-Key doppelt (TOC-Stub p2-3 + echte Sektion)
        # -> den mit längerem Body behalten. Key = Label-Logik (A-Präfix für numerische Annex-Items),
        # NICHT roher num — sonst kollidieren WP243-Annex-Items (num 1-13) mit Kapiteln (num 1-5).
        def _lk(sc):
            n = sc["num"]
            return ("A" + n) if (sc.get("in_annex") and n.isdigit()) else n
        best = {}
        for sc in sections:
            k = _lk(sc)
            if k not in best or len("\n".join(sc["body"])) > len("\n".join(best[k]["body"])):
                best[k] = sc
        sections = [sc for sc in sections if best.get(_lk(sc)) is sc]
        return {"pages": n_pages, "total_chars": total_chars, "body_size": body_size,
                "sections": sections, "caps_scheme": caps_scheme, "schemes": seen_types,
                "toc": toc_present, "toc_entries": len(toc),
                "tables": n_tables, "annex": any(sc.get("in_annex") for sc in sections),
                "raw_tables": raw_tables}
 def self_test(parsed, expected, artref, expected_refs):
    probs, secs = [], parsed["sections"]
    if parsed["total_chars"] < 5000:
        probs.append("zu wenig embedded text (%d) -> OCR?" % parsed["total_chars"])
    if len(secs) < expected:
        probs.append("nur %d Sektionen < expected %d" % (len(secs), expected))
    full = " ".join(t for sc in secs for t in sc["body"]) + " " + " ".join(sc["title"] for sc in secs)
    arts = set(artref.findall(full))
    # Manifest-Vertrag (doc_type-abhängig): Artikel-Pflicht NUR wenn der doc_type 'article' erwartet
    # (Guidance/EU-VO). Technical Standards (NIST/ENISA) zitieren section/control -> kein FAIL bei fehlenden Artikeln.
    if "article" in expected_refs and not arts:
        probs.append("references_out: KEIN Artikel erkannt (doc_type erwartet article)")
    return (not probs, probs, sorted(arts, key=lambda x: int(x))[:12])
 def _build_units_struct(doc_id, doc, lang, parsed, base_version, prov):
    reg = doc["reg"]
    units = []
    sources = []
    idx = 0
    for sc in parsed["sections"]:
        body = "\n".join(sc["body"]).strip()
        if len(body) < 40:
            continue
        idx += 1
        num = sc["num"]
        lab = ("A" + num) if (sc.get("in_annex") and num.isdigit()) else num
        cu = "%s §%s" % (reg, lab)
        text = "%s §%s %s\n\n%s" % (reg, lab, sc["title"], body)
        m = {
            "regulation_code": reg, "regulation_short": reg, "regulation_name_de": doc["name"],
            "language": lang, "citation_style": "guidance_section", "document_type": "guidance",
            "source_class": "supervisory_guidance", "source_role": "interpretation", "use_for_primary": False,
            "bindingness": "non_binding_interpretative", "authority_level": 70, "authority_weight": 70,
            "source_type": "guidance", "issuer": doc["issuer"], "jurisdiction": "EU",
            "source": "ec.europa.eu", "license": "public_eu", "category": "guidance",
            "citation_unit": cu, "article_label": cu, "parent_citation_unit": reg, "is_citable": True,
            "article": "§%s" % lab, "article_title": sc["title"], "article_type": "interpretation",
            "chunk_scope": "guidance_section", "context_hierarchy": [reg],
            "display_context": "%s > §%s %s" % (reg, lab, sc["title"]),
            "norm_id": "EU-%s-%s-%s" % (doc_id, lang.upper(), lab),
            "references_out": [], "child_tables": sc.get("child_tables", []),
        }
        m.update(prov)
        units.append(UploadUnit(filename="%s_%s_s%d.txt" % (doc_id.lower(), lang, idx),
                                text=text, meta=m,
                                document_version="%s-s%d" % (base_version, idx),
                                collection=BUILD_COLLECTION))
        sources.append((body, sc["title"]))
    return units, sources
 def _attach_refs(units, sources, artref):
    for u, (body, title) in zip(units, sources):
        u.meta["references_out"] = sorted({"Art. %s DSGVO" % n for n in artref.findall(body + " " + title)},
                                          key=lambda x: int(x.split()[1]))
 def build_units(doc_id, doc, lang, parsed, base_version, prov, artref):
    units, sources = _build_units_struct(doc_id, doc, lang, parsed, base_version, prov)
    _attach_refs(units, sources, artref)
    return units
 def _table_md(rows):
    hdr = rows[0]
    out = ["| " + " | ".join(c or "" for c in hdr) + " |",
           "| " + " | ".join("---" for _ in hdr) + " |"]
    for r in rows[1:]:
        out.append("| " + " | ".join((c or "").replace("\n", " ") for c in r) + " |")
    return "\n".join(out)
 def build_table_units(doc_id, doc, lang, parsed, base_version, prov):
    # Capability Tables Extraction: jede Tabelle = EIGENE Knowledge-Unit (Markdown + JSON), an ihre Sektion gehängt.
    # table.parent_section = section.num ; section.child_tables = [table_id]. Separater Pfad — Heading-Parse unberührt.
    reg, sections, units = doc["reg"], parsed["sections"], []
    for ti, (page, rows) in enumerate(parsed.get("raw_tables", []), 1):
        parent = None
        for sc in sections:
            if sc.get("page", 0) <= page:
                parent = sc
            else:
                break
        plabel = parent["num"] if parent else "0"
        md = _table_md(rows)
        if len(md) < 25:
            continue
        tid = "%s-t%d" % (doc_id, ti)
        if parent is not None:
            parent.setdefault("child_tables", []).append(tid)
        cu = "%s §%s Tabelle %d" % (reg, plabel, ti)
        m = {"regulation_code": reg, "regulation_short": reg, "language": lang,
             "source_class": "supervisory_guidance", "source_role": "interpretation", "use_for_primary": False,
             "jurisdiction": "EU", "category": "guidance",
             "is_table": True, "table_id": tid, "parent_section": plabel, "page": page,
             "columns": rows[0], "rows": rows, "markdown": md,
             "extraction_method": "pdfplumber", "confidence": "lined-simple",
             "citation_unit": cu, "article_label": cu, "chunk_scope": "table",
             "display_context": "%s > §%s > Tabelle %d" % (reg, plabel, ti), "references_out": []}
        m.update(prov)
        units.append(UploadUnit(filename="%s_%s_tbl%d.txt" % (doc_id.lower(), lang, ti),
                                text="%s §%s — Tabelle %d (S.%d)\n\n%s" % (reg, plabel, ti, page, md),
                                meta=m, document_version="%s-tbl%d" % (base_version, ti),
                                collection=BUILD_COLLECTION))
    return units
 import capability_pipeline as _CP
 def _region_native_tables(pdf_bytes):
    rt = []
    regions = []
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as _pdf:
        for pg in _pdf.pages:
            try:
                tables = pg.find_tables()
            except Exception:
                tables = []
            for t in tables:
                try:
                    tbl = t.extract()
                except Exception:
                    tbl = None
                if tbl and len(tbl) >= 2 and sum(1 for row in tbl for c in row if c and c.strip()) >= 3:
                    rt.append((pg.page_number, [[(c or "").strip() for c in row] for row in tbl]))
                    regions.append((pg.page_number, t.bbox))
    return rt, regions
 class _C6Tables:
    name = "C6_Tables"
    consumes = ["table_region"]
    produces = ["table_units"]
    def run(self, ctx):
        rtables, regions = _region_native_tables(ctx["pdf"])
        ctx["claimed_table_regions"] = regions
        p2 = dict(ctx["parsed"])
        p2["raw_tables"] = rtables
        ctx["_table_units"] = build_table_units(ctx["doc_id"], ctx["doc"], ctx["lang"], p2, ctx["rt"], ctx["prov"])
 class _C7ReadingOrder:
    name = "C7_ReadingOrder"
    consumes = ["prose_region", "table_units"]
    produces = ["ordered_prose"]
    def run(self, ctx):
        ctx["_c7_owns_prose"] = True
 class _C1C2Sections:
    name = "C1C2_Sections"
    consumes = ["prose_region", "table_units"]
    produces = ["section_struct"]
    def run(self, ctx):
        units, sources = _build_units_struct(ctx["doc_id"], ctx["doc"], ctx["lang"], ctx["parsed"], ctx["rt"], ctx["prov"])
        ctx["_section_units"] = units
        ctx["_sources"] = sources
 class _C4References:
    name = "C4_References"
    consumes = ["section_struct"]
    produces = ["references"]
    def run(self, ctx):
        _attach_refs(ctx["_section_units"], ctx["_sources"], ctx["cfg"]["artref"])
 def run_engine(doc_id, doc, lang, parsed, run_tag, prov, cfg, pdf_bytes):
    ctx = {"doc_id": doc_id, "doc": doc, "lang": lang, "parsed": parsed,
           "rt": run_tag, "prov": prov, "cfg": cfg, "pdf": pdf_bytes}
    caps = [_C4References(), _C7ReadingOrder(), _C1C2Sections(), _C6Tables()]
    order = _CP.resolve_order(caps, {"table_region", "prose_region"})
    for c in order:
        c.run(ctx)
    return ctx["_section_units"], ctx["_table_units"]
 def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--doc", required=True)
    ap.add_argument("--lang", required=True, choices=["en", "de"])
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()
    doc = DOCS[args.doc]
    lang, cfg = args.lang, LANG[args.lang]
    sha = git_sha()
    run_tag = "2026.1-%s-%s" % (args.doc.lower(), lang)
    run_id = "%s-%s" % (run_tag, int(time.time()))
    date = datetime.date.today().isoformat()
    src = doc["sources"][lang]
    log.info("SGE %s [%s] | exp_sec=%d sha=%s dry=%s", args.doc, lang, doc["expected_sections"], sha, args.dry_run)
    doc_type = doc.get("doc_type") or ISSUER_DOCTYPE.get(doc["issuer"], "Guidance")
    expected_refs = DOC_TYPE_REFS.get(doc_type, ["article"])
    pdf = fetch(src)
    _t0 = time.time()
    parsed = parse_guidance(pdf, cfg["noise"])
    parse_ms = int((time.time() - _t0) * 1000)
    ok, probs, arts = self_test(parsed, doc["expected_sections"], cfg["artref"], expected_refs)
    pages = parsed["pages"]
    log.info("PARSED pages=%d chars=%d body=%.1f sections=%d caps_scheme=%s gate=%s %s",
             pages, parsed["total_chars"], parsed["body_size"], len(parsed["sections"]), parsed.get("caps_scheme"), ok, probs)
    log.info("Art-refs: %s", arts)
    for sc in parsed["sections"]:
        log.info("  [%s]%s %s (p%d)", sc["num"], "*" if sc.get("in_annex") else " ", sc["title"][:58], sc["page"])
    sch = parsed.get("schemes", set())
    family = ("unnumbered-toc" if "ut" in sch and not ({"ar", "ro"} & sch)  # F4
              else "roman-hierarchical" if "ro" in sch                       # F2
              else "arabic-caps+paragraph" if parsed.get("caps_scheme")      # F3
              else "arabic-hierarchical")                                    # F1
    detF = FAMILY_F.get(family, "?")
    expF = EXPECTED_FAMILY.get(args.doc, "?")
    gate = "OK" if expF == detF else ("REVIEW(exp=%s)" % expF if expF != "?" else "no-expected")
    prov = {"parser_version": "StructuredGuidanceExtractor@%s+sge_build" % sha, "ingest_run_id": run_id,
            "ingest_date": date, "source_url": src_url(src), "source_inner_file": src.get("inner", ""),
            "build_collection": BUILD_COLLECTION, "manifest_version": MANIFEST_VERSION, "document_id": args.doc,
            "layout_family": detF, "document_type": doc_type, "expected_reference_types": expected_refs}
    units, table_units = run_engine(args.doc, doc, lang, parsed, run_tag, prov, cfg, pdf)  # CUTOVER: Engine-Pfad
    total_refs = sum(len(u.meta["references_out"]) for u in units)
    detect = "TOC(%d)" % parsed.get("toc_entries", 0) if parsed.get("toc") else "heuristic"
    log.info("UNITS=%d table_units=%d | VITALS family=%s(%s) FAMILY-GATE=%s detect=%s tables_detect=%d tables_extract=%d annex=%s parse_ms=%d chunks/page(units)=%.2f refs/unit=%.2f",
             len(units), len(table_units), family, detF, gate, detect, parsed.get("tables", 0), len(table_units),
             "Y" if parsed.get("annex") else "N", parse_ms, len(units) / max(pages, 1), total_refs / max(len(units), 1))
    if gate.startswith("REVIEW"):
        log.warning("FAMILY-GATE REVIEW: %s erwartet %s, erkannt %s — Klassifikation prüfen", args.doc, expF, detF)
    if units:
        m = units[0].meta
        pk = ["parser_version", "ingest_run_id", "ingest_date", "source_url", "build_collection", "manifest_version", "document_id"]
        log.info("sample: label=%r language=%r source_class=%r use_for_primary=%r refs=%s",
                 m.get("article_label"), m.get("language"), m.get("source_class"), m.get("use_for_primary"), m.get("references_out"))
        log.info("provenance present: %s", all(k in m for k in pk))
    if args.dry_run:
        import json as _json, hashlib as _hl
        _g=[{"id":u.document_version,"filename":u.filename,"kind":("table" if u.meta.get("is_table") else "section"),
             "text_sha":_hl.sha256((u.text or "").encode()).hexdigest()[:16],"meta":u.meta} for u in (units+table_units)]
        _bp="/tmp/baseline_%s_%s.json"%(args.doc,lang)
        open(_bp,"w").write(_json.dumps(_g,ensure_ascii=False,indent=1,default=str))
        log.info("DRY RUN baseline -> %s (%d units: %d section + %d table)",_bp,len(_g),len(units),len(table_units)); return
    if not ok:
        log.error("GATE FAILED — aborting"); sys.exit(1)
    n = 0
    with httpx.Client(timeout=600.0, verify=False) as c:
        for u in units + table_units:
            n += upload_unit(c, RAG_URL, u)
    log.info("UPLOADED: %d section + %d table units -> %d chunks", len(units), len(table_units), n)
 if __name__ == "__main__":
    main()