\S.{0,84})$' ) ANNEX_KW = ("ANNEX", "ANNEXE", "ANHANG", "ANLAGE") GDPR_HINT = re.compile(r'DSGVO|GDPR|2016/679|Verordnung $EU$ 2016/679', re.I) def git_sha(): try: return subprocess.check_output(["git", "-C", CP, "rev-parse", "--short", "HEAD"]).decode().strip() except Exception: return "unknown" CACHE_DIR = "/tmp/sge_cache" def _http_get(url, timeout=120.0, attempts=4): import time as _t last = None for i in range(attempts): try: with httpx.Client(timeout=timeout, follow_redirects=True, headers={"User-Agent": "Mozilla/5.0"}) as c: data = c.get(url).content if data: return data last = RuntimeError("empty body") except Exception as e: last = e log.info("download attempt %d/%d failed: %s", i + 1, attempts, e) _t.sleep(3 * (i + 1)) raise RuntimeError("download failed after %d attempts: %s (%s)" % (attempts, url, last)) def fetch(src): import hashlib os.makedirs(CACHE_DIR, exist_ok=True) cache_key = hashlib.md5((src.get("zip_url") or src.get("pdf_url") or "").encode()).hexdigest() cache_pdf = os.path.join(CACHE_DIR, cache_key + ".pdf") if os.path.exists(cache_pdf): pdf = open(cache_pdf, "rb").read() if pdf[:4] == b"%PDF": log.info("PDF cache hit %s (%d B)", cache_pdf, len(pdf)) return pdf local = src.get("local", "") if "zip_url" in src: if local and os.path.exists(local): data = open(local, "rb").read(); log.info("ZIP local %s (%d B)", local, len(data)) else: data = _http_get(src["zip_url"], timeout=180.0) log.info("ZIP downloaded (%d B)", len(data)) pdf = zipfile.ZipFile(io.BytesIO(data)).read(src["inner"]) else: if local and os.path.exists(local): pdf = open(local, "rb").read() else: pdf = _http_get(src["pdf_url"], timeout=120.0) if pdf[:4] != b"%PDF": raise RuntimeError("not a PDF: %r" % pdf[:16]) try: open(cache_pdf, "wb").write(pdf) except Exception: pass return pdf def src_url(src): return src.get("zip_url") or src.get("pdf_url") def line_font(pg, ln): chs = [c for c in pg.chars if ln["top"] - 1 <= c["top"] <= ln["bottom"] + 1] if not chs: return 0.0, "" sz = Counter(round(c.get("size", 0), 1) for c in chs).most_common(1)[0][0] fn = Counter(c.get("fontname", "") for c in chs).most_common(1)[0][0] return sz, fn def _upper_ratio(title): letters = [c for c in title if c.isalpha()] return (sum(c.isupper() for c in letters) / len(letters)) if letters else 0.0 _TTL_STOP = {"der", "die", "das", "den", "des", "dem", "und", "von", "zur", "zum", "für", "auf", "the", "of", "and", "to", "for", "in", "on", "an", "under", "with", "sur", "aux"} def _ttoks(s): out = set() for t in re.findall(r'[a-zà-ÿ0-9]+', s.lower()): t = re.sub(r'\d+$', '', t) # Footnote-Ziffern entkleben ("freiwillig12" -> "freiwillig") if len(t) > 2 and t not in _TTL_STOP: out.add(t) return out def _is_caption(title): # strukturelle All-Caps-Überschrift (PREFACE/VORWORT/ANNEX) — auch ohne TOC-Eintrag gültig return _upper_ratio(title) >= 0.8 and len(title) <= 34 # TOC-Zeile: <enum> <Titel> <lange Punktführung> <Seitenzahl>. EIGENE Regex (NICHT ENUM — # dessen Titel-Cap .{0,84} scheitert an 130+-Punkt-Führungen). Titel = non-greedy bis zur Führung. TOC_LINE = re.compile(r'^[A-Z0-9][\w./()\-]*\s+(?P<title>.+?)\s*\.{3,}\s*\d{1,3}\s*$') def extract_toc(all_lines, body_size): """TOC = explizite Strukturdeklaration des Dokuments. Sammelt die Titel-Token-Sets der Inhaltsverzeichnis-Einträge aus dem Frontmatter. Absätze stehen NIE im TOC. >=5 => TOC vorhanden.""" titlesets = [] for page_number, txt, s, _ in all_lines: if page_number > 8: break m = TOC_LINE.match(txt) if not m: continue ts = _ttoks(m.group("title")) if ts: titlesets.append(ts) return (len(titlesets) >= 5, titlesets) def _title_in_toc(title, titlesets): # Match relativ zum BODY (inter/|body|): eine echte Überschrift ist ~ein TOC-Titel (alle Body-Wörter # im TOC -> 1.0); ein Absatz, der Sektionswörter ENTHÄLT, hat Extra-Wörter (-> <0.75) -> kein Match. # Body-Titel ist bei 84 Zeichen gekappt = Präfix des vollen TOC-Titels, daher robust gegen Kappung. bt = _ttoks(title) if not bt: return False for ts in titlesets: if len(bt & ts) / len(bt) >= 0.75: return True return False def _looks_like_heading(txt, s, fn, body_size): # F4-Plausibilitätsfilter — NICHT die Entscheidung (die trifft das TOC). Trimmt nur Fließtext weg, # BEVOR das TOC befragt wird: kurz + nicht satzschließend. Bewusst KEIN Bold/Size-Zwang — das TOC # entscheidet, Format ist nur Filter (greift eh nur in als unnummeriert erkannten Docs). t = txt.rstrip() return 3 <= len(t) <= 90 and not t.endswith((".", ";", ":", ",")) def parse_guidance(pdf_bytes, noise): with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf: n_pages = len(pdf.pages) szc = Counter() all_lines = [] # (page_number, txt, size, fontname) — einmal extrahiert, zweimal genutzt n_tables = 0 # detected (Capability: Tables Detect) raw_tables = [] # (page_number, rows) — EXTRAHIERT (Capability: Tables Extract). Scope: gelinierte/einfache. for pg in pdf.pages: try: pts = pg.extract_tables() except Exception: pts = [] n_tables += len(pts) for tbl in pts: # erster Scope: >=2 Zeilen + >=3 nicht-leere Zellen (keine verschachtelten/visuellen) if tbl and len(tbl) >= 2 and sum(1 for row in tbl for c in row if c and c.strip()) >= 3: raw_tables.append((pg.page_number, [[(c or "").strip() for c in row] for row in tbl])) for ch in pg.chars: szc[round(ch.get("size", 0), 1)] += 1 for ln in pg.extract_text_lines(layout=False): txt = " ".join(ln["text"].split()) if not txt: continue s, fn = line_font(pg, ln) all_lines.append((pg.page_number, txt, s, fn)) total_chars = sum(szc.values()) body_size = szc.most_common(1)[0][0] # Schema-Erkennung (selbstkalibrierend): nutzt das Dok ALL-CAPS bare-Integer-Sektionsüberschriften # (EDPB-Hausstil)? Dann nummeriert es auch Absätze (Satz-case) -> bare-Integer-Heading muss ALL-CAPS # sein. WP243 (Title-case-Sektionen, keine ALL-CAPS) -> Regel inaktiv, keine Regression. Schwelle 2 # gegen einzelne Fluke-Caps-Zeile; EN/DE-formatierungsunabhängig (anders als bare-Integer-Magnitude). caps_secs = 0 for _, txt, s, _ in all_lines: if s < body_size - 0.5: continue mm = ENUM.match(txt) if (mm and mm.group("ar") and "." not in mm.group("ar") and not re.search(r'\s\d{1,3}$', mm.group("title")) and _upper_ratio(mm.group("title")) >= 0.6): caps_secs += 1 caps_scheme = caps_secs >= 2 toc_present, toc = extract_toc(all_lines, body_size) # F4-DOC-Erkennung: decken NUMMERIERTE Headings das TOC ab? Wenn kaum -> die Headings sind unnummeriert # (F4) -> unnummerierten Pfad aktivieren. Sonst (F1-F3) AUS -> keine Regression. numbered_toc_hits = 0 if toc_present: for _, txt, s, _ in all_lines: if s < body_size - 0.5: continue mm = ENUM.match(txt) if not mm or re.search(r'\s\d{1,3}$', mm.group("title")): continue ttl = mm.group("title").strip().rstrip('.').strip() if _title_in_toc(ttl, toc) or _is_caption(ttl): numbered_toc_hits += 1 unnumbered_doc = toc_present and numbered_toc_hits < max(3, len(toc) * 0.4) sections, cur, started, in_annex, seen_types, max_sec, un_count = [], None, False, False, set(), 0, 0 top_en, top_type = None, None # last level-1 enumerator (roman/arabic/annex) for sub-path assembly for page_number, txt, s, fn in all_lines: m = ENUM.match(txt) ok_head = bool(m) and s >= body_size - 0.5 and not re.search(r'\s\d{1,3}$', m.group("title")) if ok_head: if m.group("ar"): typ, en = "ar", m.group("ar") elif m.group("ro"): typ, en = "ro", m.group("ro") elif m.group("le"): typ, en = "le", m.group("le") else: typ, en = "ax", m.group("ax") title = m.group("title").strip().rstrip('.').strip() # Großbuchstaben-Sub nur in römisch-Schema (Label-Assembly) — immer if typ == "le" and top_type != "ro": ok_head = False if ok_head and typ == "ar" and not in_annex: if unnumbered_doc: # F4-Doc: nummerierte Zeilen sind Absätze/Artikel-Refs ("13.1", "39.34"), nicht Headings # -> nur akzeptieren, wenn der Titel im TOC steht. if not _title_in_toc(title, toc): ok_head = False elif "." in en: # dotted = echte Sub-Headings (immer ok), AUSSER Dezimal/Uhrzeit-FP: führende Null im # Sub-Teil ("8.00 Uhr", "17.00"). Echte Sektionsnummern haben keine führende Null (3.1, 3.10). if any(c.startswith("0") for c in en.split(".")[1:]): ok_head = False elif toc_present: # bare-Integer: TOC = Wahrheit (Absätze stehen nie im TOC); All-Caps-Captions auch ohne TOC. if not (_title_in_toc(title, toc) or _is_caption(title)): ok_head = False else: # Fallback-Heuristiken nur für bare-Integer (kein TOC) if top_type == "ro": ok_head = False elif caps_scheme and _upper_ratio(title) < 0.6: ok_head = False elif int(en) > max_sec + 5: ok_head = False elif (m is None and unnumbered_doc and _looks_like_heading(txt, s, fn, body_size) and not re.search(r'\.\d+$', txt.strip()) and _title_in_toc(txt, toc)): # F4 — UNNUMMERIERTE Überschrift: TOC = die ENTSCHEIDUNG, looks_like_heading nur Plausibilitätsfilter. # Greift NUR wenn ENUM nicht matcht UND der Doc als unnummeriert erkannt wurde (F1-F3 unberührt). # NICHT mit ".<Ziffern>" endend -> Fußnoten-Fragmente ("environment.47") raus (schont DE-Komposita). ok_head, typ, en, title = True, "ut", None, txt.strip().rstrip('.').strip() if ok_head: seen_types.add(typ) if typ == "le": num, lvl = ((top_en + "." + en) if top_en else en), 2 elif typ == "ut": un_count += 1 num, lvl = str(un_count), 1 else: num = en lvl = (num.count('.') + 1) if typ == "ar" else 1 top_en, top_type = en, typ if typ == "ar" and "." not in en and not in_annex: max_sec = max(max_sec, int(en)) cur = {"num": num, "title": title, "level": lvl, "in_annex": in_annex, "bold": "bold" in fn.lower(), "page": page_number, "body": []} sections.append(cur); started = True if typ == "ax" or any(k in (num + " " + title).upper() for k in ANNEX_KW): in_annex = True elif started and cur is not None: if s >= body_size - 2 and not noise.match(txt): cur["body"].append(txt) # TOC-Duplikate generisch entfernen: gleicher LABEL-Key doppelt (TOC-Stub p2-3 + echte Sektion) # -> den mit längerem Body behalten. Key = Label-Logik (A-Präfix für numerische Annex-Items), # NICHT roher num — sonst kollidieren WP243-Annex-Items (num 1-13) mit Kapiteln (num 1-5). def _lk(sc): n = sc["num"] return ("A" + n) if (sc.get("in_annex") and n.isdigit()) else n best = {} for sc in sections: k = _lk(sc) if k not in best or len("\n".join(sc["body"])) > len("\n".join(best[k]["body"])): best[k] = sc sections = [sc for sc in sections if best.get(_lk(sc)) is sc] return {"pages": n_pages, "total_chars": total_chars, "body_size": body_size, "sections": sections, "caps_scheme": caps_scheme, "schemes": seen_types, "toc": toc_present, "toc_entries": len(toc), "tables": n_tables, "annex": any(sc.get("in_annex") for sc in sections), "raw_tables": raw_tables} def self_test(parsed, expected, artref, expected_refs): probs, secs = [], parsed["sections"] if parsed["total_chars"] < 5000: probs.append("zu wenig embedded text (%d) -> OCR?" % parsed["total_chars"]) if len(secs) < expected: probs.append("nur %d Sektionen < expected %d" % (len(secs), expected)) full = " ".join(t for sc in secs for t in sc["body"]) + " " + " ".join(sc["title"] for sc in secs) arts = set(artref.findall(full)) # Manifest-Vertrag (doc_type-abhängig): Artikel-Pflicht NUR wenn der doc_type 'article' erwartet # (Guidance/EU-VO). Technical Standards (NIST/ENISA) zitieren section/control -> kein FAIL bei fehlenden Artikeln. if "article" in expected_refs and not arts: probs.append("references_out: KEIN Artikel erkannt (doc_type erwartet article)") return (not probs, probs, sorted(arts, key=lambda x: int(x))[:12]) def _build_units_struct(doc_id, doc, lang, parsed, base_version, prov): reg = doc["reg"] units = [] sources = [] idx = 0 for sc in parsed["sections"]: body = "\n".join(sc["body"]).strip() if len(body) < 40: continue idx += 1 num = sc["num"] lab = ("A" + num) if (sc.get("in_annex") and num.isdigit()) else num cu = "%s §%s" % (reg, lab) text = "%s §%s %s\n\n%s" % (reg, lab, sc["title"], body) m = { "regulation_code": reg, "regulation_short": reg, "regulation_name_de": doc["name"], "language": lang, "citation_style": "guidance_section", "document_type": "guidance", "source_class": "supervisory_guidance", "source_role": "interpretation", "use_for_primary": False, "bindingness": "non_binding_interpretative", "authority_level": 70, "authority_weight": 70, "source_type": "guidance", "issuer": doc["issuer"], "jurisdiction": "EU", "source": "ec.europa.eu", "license": "public_eu", "category": "guidance", "citation_unit": cu, "article_label": cu, "parent_citation_unit": reg, "is_citable": True, "article": "§%s" % lab, "article_title": sc["title"], "article_type": "interpretation", "chunk_scope": "guidance_section", "context_hierarchy": [reg], "display_context": "%s > §%s %s" % (reg, lab, sc["title"]), "norm_id": "EU-%s-%s-%s" % (doc_id, lang.upper(), lab), "references_out": [], "child_tables": sc.get("child_tables", []), } m.update(prov) units.append(UploadUnit(filename="%s_%s_s%d.txt" % (doc_id.lower(), lang, idx), text=text, meta=m, document_version="%s-s%d" % (base_version, idx), collection=BUILD_COLLECTION)) sources.append((body, sc["title"])) return units, sources def _attach_refs(units, sources, artref): for u, (body, title) in zip(units, sources): u.meta["references_out"] = sorted({"Art. %s DSGVO" % n for n in artref.findall(body + " " + title)}, key=lambda x: int(x.split()[1])) def build_units(doc_id, doc, lang, parsed, base_version, prov, artref): units, sources = _build_units_struct(doc_id, doc, lang, parsed, base_version, prov) _attach_refs(units, sources, artref) return units def _table_md(rows): hdr = rows[0] out = ["| " + " | ".join(c or "" for c in hdr) + " |", "| " + " | ".join("---" for _ in hdr) + " |"] for r in rows[1:]: out.append("| " + " | ".join((c or "").replace("\n", " ") for c in r) + " |") return "\n".join(out) def build_table_units(doc_id, doc, lang, parsed, base_version, prov): # Capability Tables Extraction: jede Tabelle = EIGENE Knowledge-Unit (Markdown + JSON), an ihre Sektion gehängt. # table.parent_section = section.num ; section.child_tables = [table_id]. Separater Pfad — Heading-Parse unberührt. reg, sections, units = doc["reg"], parsed["sections"], [] for ti, (page, rows) in enumerate(parsed.get("raw_tables", []), 1): parent = None for sc in sections: if sc.get("page", 0) <= page: parent = sc else: break plabel = parent["num"] if parent else "0" md = _table_md(rows) if len(md) < 25: continue tid = "%s-t%d" % (doc_id, ti) if parent is not None: parent.setdefault("child_tables", []).append(tid) cu = "%s §%s Tabelle %d" % (reg, plabel, ti) m = {"regulation_code": reg, "regulation_short": reg, "language": lang, "source_class": "supervisory_guidance", "source_role": "interpretation", "use_for_primary": False, "jurisdiction": "EU", "category": "guidance", "is_table": True, "table_id": tid, "parent_section": plabel, "page": page, "columns": rows[0], "rows": rows, "markdown": md, "extraction_method": "pdfplumber", "confidence": "lined-simple", "citation_unit": cu, "article_label": cu, "chunk_scope": "table", "display_context": "%s > §%s > Tabelle %d" % (reg, plabel, ti), "references_out": []} m.update(prov) units.append(UploadUnit(filename="%s_%s_tbl%d.txt" % (doc_id.lower(), lang, ti), text="%s §%s — Tabelle %d (S.%d)\n\n%s" % (reg, plabel, ti, page, md), meta=m, document_version="%s-tbl%d" % (base_version, ti), collection=BUILD_COLLECTION)) return units import capability_pipeline as _CP def _region_native_tables(pdf_bytes): rt = [] regions = [] with pdfplumber.open(io.BytesIO(pdf_bytes)) as _pdf: for pg in _pdf.pages: try: tables = pg.find_tables() except Exception: tables = [] for t in tables: try: tbl = t.extract() except Exception: tbl = None if tbl and len(tbl) >= 2 and sum(1 for row in tbl for c in row if c and c.strip()) >= 3: rt.append((pg.page_number, [[(c or "").strip() for c in row] for row in tbl])) regions.append((pg.page_number, t.bbox)) return rt, regions class _C6Tables: name = "C6_Tables" consumes = ["table_region"] produces = ["table_units"] def run(self, ctx): rtables, regions = _region_native_tables(ctx["pdf"]) ctx["claimed_table_regions"] = regions p2 = dict(ctx["parsed"]) p2["raw_tables"] = rtables ctx["_table_units"] = build_table_units(ctx["doc_id"], ctx["doc"], ctx["lang"], p2, ctx["rt"], ctx["prov"]) class _C7ReadingOrder: name = "C7_ReadingOrder" consumes = ["prose_region", "table_units"] produces = ["ordered_prose"] def run(self, ctx): ctx["_c7_owns_prose"] = True class _C1C2Sections: name = "C1C2_Sections" consumes = ["prose_region", "table_units"] produces = ["section_struct"] def run(self, ctx): units, sources = _build_units_struct(ctx["doc_id"], ctx["doc"], ctx["lang"], ctx["parsed"], ctx["rt"], ctx["prov"]) ctx["_section_units"] = units ctx["_sources"] = sources class _C4References: name = "C4_References" consumes = ["section_struct"] produces = ["references"] def run(self, ctx): _attach_refs(ctx["_section_units"], ctx["_sources"], ctx["cfg"]["artref"]) def run_engine(doc_id, doc, lang, parsed, run_tag, prov, cfg, pdf_bytes): ctx = {"doc_id": doc_id, "doc": doc, "lang": lang, "parsed": parsed, "rt": run_tag, "prov": prov, "cfg": cfg, "pdf": pdf_bytes} caps = [_C4References(), _C7ReadingOrder(), _C1C2Sections(), _C6Tables()] order = _CP.resolve_order(caps, {"table_region", "prose_region"}) for c in order: c.run(ctx) return ctx["_section_units"], ctx["_table_units"] def main(): ap = argparse.ArgumentParser() ap.add_argument("--doc", required=True) ap.add_argument("--lang", required=True, choices=["en", "de"]) ap.add_argument("--dry-run", action="store_true") args = ap.parse_args() doc = DOCS[args.doc] lang, cfg = args.lang, LANG[args.lang] sha = git_sha() run_tag = "2026.1-%s-%s" % (args.doc.lower(), lang) run_id = "%s-%s" % (run_tag, int(time.time())) date = datetime.date.today().isoformat() src = doc["sources"][lang] log.info("SGE %s [%s] | exp_sec=%d sha=%s dry=%s", args.doc, lang, doc["expected_sections"], sha, args.dry_run) doc_type = doc.get("doc_type") or ISSUER_DOCTYPE.get(doc["issuer"], "Guidance") expected_refs = DOC_TYPE_REFS.get(doc_type, ["article"]) pdf = fetch(src) _t0 = time.time() parsed = parse_guidance(pdf, cfg["noise"]) parse_ms = int((time.time() - _t0) * 1000) ok, probs, arts = self_test(parsed, doc["expected_sections"], cfg["artref"], expected_refs) pages = parsed["pages"] log.info("PARSED pages=%d chars=%d body=%.1f sections=%d caps_scheme=%s gate=%s %s", pages, parsed["total_chars"], parsed["body_size"], len(parsed["sections"]), parsed.get("caps_scheme"), ok, probs) log.info("Art-refs: %s", arts) for sc in parsed["sections"]: log.info(" [%s]%s %s (p%d)", sc["num"], "*" if sc.get("in_annex") else " ", sc["title"][:58], sc["page"]) sch = parsed.get("schemes", set()) family = ("unnumbered-toc" if "ut" in sch and not ({"ar", "ro"} & sch) # F4 else "roman-hierarchical" if "ro" in sch # F2 else "arabic-caps+paragraph" if parsed.get("caps_scheme") # F3 else "arabic-hierarchical") # F1 detF = FAMILY_F.get(family, "?") expF = EXPECTED_FAMILY.get(args.doc, "?") gate = "OK" if expF == detF else ("REVIEW(exp=%s)" % expF if expF != "?" else "no-expected") prov = {"parser_version": "StructuredGuidanceExtractor@%s+sge_build" % sha, "ingest_run_id": run_id, "ingest_date": date, "source_url": src_url(src), "source_inner_file": src.get("inner", ""), "build_collection": BUILD_COLLECTION, "manifest_version": MANIFEST_VERSION, "document_id": args.doc, "layout_family": detF, "document_type": doc_type, "expected_reference_types": expected_refs} units, table_units = run_engine(args.doc, doc, lang, parsed, run_tag, prov, cfg, pdf) # CUTOVER: Engine-Pfad total_refs = sum(len(u.meta["references_out"]) for u in units) detect = "TOC(%d)" % parsed.get("toc_entries", 0) if parsed.get("toc") else "heuristic" log.info("UNITS=%d table_units=%d | VITALS family=%s(%s) FAMILY-GATE=%s detect=%s tables_detect=%d tables_extract=%d annex=%s parse_ms=%d chunks/page(units)=%.2f refs/unit=%.2f", len(units), len(table_units), family, detF, gate, detect, parsed.get("tables", 0), len(table_units), "Y" if parsed.get("annex") else "N", parse_ms, len(units) / max(pages, 1), total_refs / max(len(units), 1)) if gate.startswith("REVIEW"): log.warning("FAMILY-GATE REVIEW: %s erwartet %s, erkannt %s — Klassifikation prüfen", args.doc, expF, detF) if units: m = units[0].meta pk = ["parser_version", "ingest_run_id", "ingest_date", "source_url", "build_collection", "manifest_version", "document_id"] log.info("sample: label=%r language=%r source_class=%r use_for_primary=%r refs=%s", m.get("article_label"), m.get("language"), m.get("source_class"), m.get("use_for_primary"), m.get("references_out")) log.info("provenance present: %s", all(k in m for k in pk)) if args.dry_run: import json as _json, hashlib as _hl _g=[{"id":u.document_version,"filename":u.filename,"kind":("table" if u.meta.get("is_table") else "section"), "text_sha":_hl.sha256((u.text or "").encode()).hexdigest()[:16],"meta":u.meta} for u in (units+table_units)] _bp="/tmp/baseline_%s_%s.json"%(args.doc,lang) open(_bp,"w").write(_json.dumps(_g,ensure_ascii=False,indent=1,default=str)) log.info("DRY RUN baseline -> %s (%d units: %d section + %d table)",_bp,len(_g),len(units),len(table_units)); return if not ok: log.error("GATE FAILED — aborting"); sys.exit(1) n = 0 with httpx.Client(timeout=600.0, verify=False) as c: for u in units + table_units: n += upload_unit(c, RAG_URL, u) log.info("UPLOADED: %d section + %d table units -> %d chunks", len(units), len(table_units), n) if __name__ == "__main_

#!/usr/bin/env python3 """Structured Guidance Extractor — GENERIC builder (Wave 1b compiler). One deterministic parser, parametrized per document + language. python3 sge_build.py --doc EDPB_WP248_DPIA --lang de [--dry-run] Source per (doc,lang) = {"pdf_url"} OR {"zip_url","inner"[, "local"]}. Layout parser identical to the WP243 pilot (pdfplumber MIT, NO LLM). Mehrsprachigkeit = representation property: SAME document_id, language=, own document_version namespace (no point-id collision). Emits chunks/page + refs/chunk metrics. """ import argparse, datetime, io, logging, os, re, subprocess, sys, time, zipfile from collections import Counter CP = os.getenv("BUILD_CP") or os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, CP) import httpx # noqa: E402 import pdfplumber # noqa: E402 from services.legal_act_ingester import UploadUnit, upload_unit # noqa: E402 logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s") log = logging.getLogger("sge") RAG_URL = os.getenv("RAG_URL", "https://localhost:8097") BUILD_COLLECTION = "bp_compliance_kb_2026_1_build" MANIFEST_VERSION = "regulatory_build_manifest_v3/2026.1" # --- Layout-Familien (Phase 2): detected family-String -> F-Nummer + Familienkarte (erwartete Familie je Quelle) --- FAMILY_F = {"arabic-hierarchical": "F1", "roman-hierarchical": "F2", "arabic-caps+paragraph": "F3", "unnumbered-toc": "F4"} EXPECTED_FAMILY = { # Familienkarte: VOR-Build-Erwartung, beim Build gegen detected gegated (FAMILY-GATE) "EDPB_WP243_DPO": "F1", "EDPB_WP248_DPIA": "F2", "EDPB_GL_05_2020_CONSENT": "F3", "EDPB_GL_09_2022_BREACH_NOTIFICATION": "F2", "EDPB_GL_07_2020_CONTROLLER_PROCESSOR": "F3", "EDPB_WP260_TRANSPARENCY": "F4", "DSK_SDM": "F1", "DSK_OH_KI_2024": "F1", "DSK_OH_CLOUD_2014": "F1", "DSK_KP_05_DSFA": "F1", # FAMILY-GATE-Korrektur: parst 13 Sekt. via Heuristik, kein edge-short "DSK_KP_12_DSB": "edge-short", "DSK_KP_13_AV": "edge-short", # echte Kurz-Flyer (0 Sekt., gate=FAIL) # verbleibende EDPB (vorab klassifiziert = reasoned, beim Build verifiziert): "EDPB_GL_01_2021_BREACH_EXAMPLES": "F3", "EDPB_GL_02_2019_ART6_1B_ONLINE": "F3", "EDPB_REC_01_2020_SUPPL_MEASURES": "F3", "EDPB_GL_05_2021_ART3_CHAPTERV": "F3", # ENISA-Charge (3. Herausgeber): erwartete HEADING-Familie (F1-F4); Capabilities (Tabellen/Mehrspaltig) orthogonal "ENISA_HANDBOOK_PDP": "F1", "ENISA_GL_SME_PDP": "F1", "ENISA_GL_EECC_SEC": "F3", "ENISA_ISPS_SME": "F1", "ENISA_TL_2023": "?", # TL = Capability-Boundary-Kandidat (mehrspaltig/visuell) # NIST (US, Public Domain) — 4. Herausgeber, doc_type=Technical Standard (refs=section/control), tabellenlastig "NIST_SP_800_53B": "F1", "NIST_SP_800_171": "F1", } # --- Manifest-Vertrag: document_type -> expected_reference_types (User 2026-06-28: DEKLARATIV, kein Parser-Bug). # Der Validator verlangt Artikel-Referenzen NUR, wenn der doc_type sie erwartet -> ENISA/NIST (Standards) scheitern nicht mehr. ISSUER_DOCTYPE = {"Article 29 WP / EDPB": "Guidance", "EDPB": "Guidance", "DSK": "Guidance", "ENISA": "Technical Standard", "NIST": "Technical Standard"} DOC_TYPE_REFS = {"EU Regulation": ["article", "recital", "annex"], "German Law": ["paragraph"], "Guidance": ["article", "guidance"], "Technical Standard": ["section", "control"], "Threat Report": ["cve", "cwe", "attack"], "Whitepaper": ["bibliography"]} # --- per-document registry (resolved sources pinned) --- NS = "https://ec.europa.eu/newsroom/just/document.cfm?doc_id=" DOCS = { "EDPB_WP243_DPO": { # Pilot (regression reference) — arabisch-Schema "reg": "EDPB WP243", "issuer": "Article 29 WP / EDPB", "expected_sections": 9, "name": "EDPB/WP29 Guidelines on Data Protection Officers (DPOs), WP 243 rev.01", "sources": { "en": {"pdf_url": NS + "44100"}, "de": {"zip_url": NS + "48137", "inner": "wp243rev01_de.pdf", "local": "/tmp/doc_48137.bin"}, }, }, "EDPB_WP248_DPIA": { "reg": "EDPB WP248", "issuer": "Article 29 WP / EDPB", "expected_sections": 8, "name": "EDPB/WP29 Guidelines on Data Protection Impact Assessment (DPIA), WP 248 rev.01", "sources": { "en": {"pdf_url": NS + "47711"}, "de": {"zip_url": NS + "48464", "inner": "wp248 rev.01_de.pdf", "local": "/tmp/wp248_48464.bin"}, }, }, "EDPB_GL_05_2020_CONSENT": { "reg": "EDPB GL 05/2020", "issuer": "EDPB", "expected_sections": 7, "name": "EDPB Guidelines 05/2020 on consent under Regulation 2016/679 (v1.1)", "sources": { "en": {"pdf_url": "https://www.edpb.europa.eu/system/files/documents/files/file1/edpb_guidelines_202005_consent_en.pdf"}, "de": {"pdf_url": "https://www.edpb.europa.eu/system/files/documents/files/file1/edpb_guidelines_202005_consent_de.pdf"}, }, }, "EDPB_GL_09_2022_BREACH_NOTIFICATION": { "reg": "EDPB GL 09/2022", "issuer": "EDPB", "expected_sections": 6, "name": "EDPB Guidelines 9/2022 on personal data breach notification under GDPR (v2.0)", "sources": { "en": {"pdf_url": "https://www.edpb.europa.eu/system/files/2023-04/edpb_guidelines_202209_personal_data_breach_notification_v2.0_en.pdf"}, "de": {"pdf_url": "https://www.edpb.europa.eu/system/files/2024-10/edpb_guidelines_202209_personal_data_breach_notification_v2.0_de_0.pdf"}, }, }, "EDPB_GL_07_2020_CONTROLLER_PROCESSOR": { "reg": "EDPB GL 07/2020", "issuer": "EDPB", "expected_sections": 5, "name": "EDPB Guidelines 07/2020 on the concepts of controller and processor in the GDPR (v2.0)", "sources": { "en": {"pdf_url": "https://www.edpb.europa.eu/system/files/2023-10/EDPB_guidelines_202007_controllerprocessor_final_en.pdf"}, "de": {"pdf_url": "https://www.edpb.europa.eu/system/files/2023-10/EDPB_guidelines_202007_controllerprocessor_final_de.pdf"}, }, }, "EDPB_WP260_TRANSPARENCY": { "reg": "EDPB WP260", "issuer": "Article 29 WP / EDPB", "expected_sections": 5, "name": "EDPB/WP29 Guidelines on transparency under Regulation 2016/679, WP 260 rev.01", "sources": { "en": {"pdf_url": "https://www.edpb.europa.eu/system/files/documents/2023-09/wp260rev01_en.pdf"}, "de": {"pdf_url": "https://www.edpb.europa.eu/system/files/documents/2023-09/wp260rev01_de.pdf"}, }, }, "EDPB_GL_01_2021_BREACH_EXAMPLES": { "reg": "EDPB GL 01/2021", "issuer": "EDPB", "expected_sections": 4, "name": "EDPB Guidelines 01/2021 on Examples regarding Personal Data Breach Notification", "sources": { "en": {"pdf_url": "https://www.edpb.europa.eu/system/files/documents/2022-01/edpb_guidelines_012021_pdbnotification_adopted_en.pdf"}, "de": {"pdf_url": "https://www.edpb.europa.eu/system/files/2022-04/edpb_guidelines_012021_pdbnotification_adopted_de.pdf"}, }, }, "EDPB_GL_02_2019_ART6_1B_ONLINE": { "reg": "EDPB GL 02/2019", "issuer": "EDPB", "expected_sections": 4, "name": "EDPB Guidelines 2/2019 on processing under Article 6(1)(b) GDPR (online services)", "sources": { "en": {"pdf_url": "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines-art_6-1-b-adopted_after_public_consultation_en.pdf"}, "de": {"pdf_url": "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines-art_6-1-b-adopted_after_public_consultation_de_0.pdf"}, }, }, "EDPB_REC_01_2020_SUPPL_MEASURES": { "reg": "EDPB REC 01/2020", "issuer": "EDPB", "expected_sections": 4, "name": "EDPB Recommendations 01/2020 on supplementary measures for transfer tools", "sources": { "en": {"pdf_url": "https://www.edpb.europa.eu/system/files/2021-06/edpb_recommendations_202001vo.2.0_supplementarymeasurestransferstools_en.pdf"}, "de": {"pdf_url": "https://www.edpb.europa.eu/system/files/2022-04/edpb_recommendations_202001vo.2.0_supplementarymeasurestransferstools_de.pdf"}, }, }, "EDPB_GL_05_2021_ART3_CHAPTERV": { "reg": "EDPB GL 05/2021", "issuer": "EDPB", "expected_sections": 4, "name": "EDPB Guidelines 05/2021 on the Interplay of Article 3 and Chapter V GDPR", "sources": { "en": {"pdf_url": "https://www.edpb.europa.eu/system/files/2023-02/edpb_guidelines_05-2021_interplay_between_the_application_of_art3-chapter_v_of_the_gdpr_v2_en_0.pdf"}, "de": {"pdf_url": "https://www.edpb.europa.eu/system/files/2023-09/edpb_guidelines_05-2021_interplay_between_the_application_de.pdf"}, }, }, "DSK_SDM": { # F4-Cross-Issuer-Kandidat (DSK, dt.) "reg": "DSK SDM", "issuer": "DSK", "expected_sections": 3, "name": "DSK Standard-Datenschutzmodell V3.1", "sources": {"de": {"pdf_url": "https://www.datenschutzkonferenz-online.de/media/ah/SDM-Methode-V31.pdf"}}, }, "DSK_OH_KI_2024": { "reg": "DSK OH KI", "issuer": "DSK", "expected_sections": 3, "name": "DSK Orientierungshilfe KI und Datenschutz (2024)", "sources": {"de": {"pdf_url": "https://www.datenschutzkonferenz-online.de/media/oh/20240506_DSK_Orientierungshilfe_KI_und_Datenschutz.pdf"}}, }, "DSK_KP_12_DSB": { "reg": "DSK KP12", "issuer": "DSK", "expected_sections": 2, "name": "DSK Kurzpapier Nr. 12 - Datenschutzbeauftragte", "sources": {"de": {"pdf_url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_12.pdf"}}, }, "DSK_OH_CLOUD_2014": { "reg": "DSK OH Cloud", "issuer": "DSK", "expected_sections": 3, "name": "DSK Orientierungshilfe - Cloud Computing (2014)", "sources": {"de": {"pdf_url": "https://www.datenschutzkonferenz-online.de/media/oh/20141009_oh_cloud_computing.pdf"}}, }, "DSK_KP_05_DSFA": { "reg": "DSK KP05", "issuer": "DSK", "expected_sections": 2, "name": "DSK Kurzpapier Nr. 5 - Datenschutz-Folgenabschätzung", "sources": {"de": {"pdf_url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_5.pdf"}}, }, "DSK_KP_13_AV": { "reg": "DSK KP13", "issuer": "DSK", "expected_sections": 2, "name": "DSK Kurzpapier Nr. 13 - Auftragsverarbeitung", "sources": {"de": {"pdf_url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_13.pdf"}}, }, "ENISA_HANDBOOK_PDP": { # Stufe 1 — bekannte Welt (Kalibrierung) "reg": "ENISA Handbook PDP", "issuer": "ENISA", "expected_sections": 4, "name": "ENISA Handbook on Security of Personal Data Processing", "sources": {"en": {"pdf_url": "https://www.enisa.europa.eu/sites/default/files/publications/WP2017%20O-2-2-5%20GDPR%20Measures%20Handbook.pdf"}}, }, "ENISA_GL_SME_PDP": { # Stufe 1 "reg": "ENISA GL SME PDP", "issuer": "ENISA", "expected_sections": 4, "name": "ENISA Guidelines for SMEs on the security of personal data processing", "sources": {"en": {"pdf_url": "https://www.enisa.europa.eu/sites/default/files/publications/WP2016%203-2%206%20Data%20Controllers%20Risk.pdf"}}, }, "ENISA_GL_EECC_SEC": { # Stufe 2 — bekannte Welt + Annex/Tabellen "reg": "ENISA GL EECC", "issuer": "ENISA", "expected_sections": 4, "name": "ENISA Guideline on Security Measures under the EECC (4th edition)", "sources": {"en": {"pdf_url": "https://www.enisa.europa.eu/sites/default/files/publications/ENISA%20-%20Guideline%20on%20Security%20Measures%20under%20the%20EECC-%204th%20edition.pdf"}}, }, "ENISA_ISPS_SME": { # Stufe 3 — Grenztest (tabellenzentriert) "reg": "ENISA ISPS SME", "issuer": "ENISA", "expected_sections": 3, "name": "ENISA Information security and privacy standards for SMEs", "sources": {"en": {"pdf_url": "https://www.enisa.europa.eu/sites/default/files/publications/Information%20security%20and%20privacy%20standards%20for%20SMEs.pdf"}}, }, "ENISA_TL_2023": { # Stufe 3 — Grenztest (mehrspaltig/visuell) "reg": "ENISA TL 2023", "issuer": "ENISA", "expected_sections": 3, "name": "ENISA Threat Landscape 2023", "sources": {"en": {"pdf_url": "https://www.enisa.europa.eu/sites/default/files/publications/ENISA%20Threat%20Landscape%202023.pdf"}}, }, "NIST_SP_800_53B": { # NIST (US Public Domain) — tabellenzentriert (Control Baselines) "reg": "NIST SP 800-53B", "issuer": "NIST", "expected_sections": 3, "name": "NIST SP 800-53B Control Baselines for Information Systems and Organizations", "sources": {"en": {"pdf_url": "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-53B.pdf"}}, }, "NIST_SP_800_171": { # NIST — strukturierte Guidance + Tabellen "reg": "NIST SP 800-171", "issuer": "NIST", "expected_sections": 3, "name": "NIST SP 800-171 Rev 2 Protecting Controlled Unclassified Information", "sources": {"en": {"pdf_url": "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-171r2.pdf"}}, }, } # --- language-specific lexical config --- LANG = { "en": {"artref": re.compile(r'(?:Article|Art\.)\s*(\d+)'), "noise": re.compile(r'ARTICLE 29 DATA PROTECTION|^\d{1,3}$|Adopted on|Revised and Adopted|Version \d', re.I)}, "de": {"artref": re.compile(r'(?:Artikel|Art\.)\s*(\d+)'), "noise": re.compile(r'ARTIKEL.?29.?DATENSCHUTZ|ARTICLE 29 DATA PROTECTION|^\d{1,3}$|Angenommen|berarbeitet und|Adopted on|Fassung \d', re.I)}, } # Generic enumerator: arabic (1 / 1.2.3) | roman (I. / III.B.) | Annex N. # \d{1,3} excludes years (4 digits) as section numbers. Roman requires trailing dot. ROMAN = r'(?:XVIII|VIII|XIII|XVII|III|VII|XII|XIV|XVI|XIX|II|IV|VI|IX|XI|XV|XX|I|V|X)' ENUM = re.compile( r'^(?:' r'(?P\d{1,3}(?:\.\d+){0,3})\.?' # arabic 1 / 1.2.3 (self-pathed) r'|(?P' + ROMAN + r')\.' # roman top-level I. / III. (require dot) r'|(?P[A-Z])\.' # capital-letter sub A./B. (only under roman scheme) r'|(?P(?:Annex|Annexe|Anhang|Anlage)\s+\d+)' r')\s+(?P\S.{0,84})$' ) ANNEX_KW = ("ANNEX", "ANNEXE", "ANHANG", "ANLAGE") GDPR_HINT = re.compile(r'DSGVO|GDPR|2016/679|Verordnung $EU$ 2016/679', re.I) def git_sha(): try: return subprocess.check_output(["git", "-C", CP, "rev-parse", "--short", "HEAD"]).decode().strip() except Exception: return "unknown" CACHE_DIR = "/tmp/sge_cache" def _http_get(url, timeout=120.0, attempts=4): import time as _t last = None for i in range(attempts): try: with httpx.Client(timeout=timeout, follow_redirects=True, headers={"User-Agent": "Mozilla/5.0"}) as c: data = c.get(url).content if data: return data last = RuntimeError("empty body") except Exception as e: last = e log.info("download attempt %d/%d failed: %s", i + 1, attempts, e) _t.sleep(3 * (i + 1)) raise RuntimeError("download failed after %d attempts: %s (%s)" % (attempts, url, last)) def fetch(src): import hashlib os.makedirs(CACHE_DIR, exist_ok=True) cache_key = hashlib.md5((src.get("zip_url") or src.get("pdf_url") or "").encode()).hexdigest() cache_pdf = os.path.join(CACHE_DIR, cache_key + ".pdf") if os.path.exists(cache_pdf): pdf = open(cache_pdf, "rb").read() if pdf[:4] == b"%PDF": log.info("PDF cache hit %s (%d B)", cache_pdf, len(pdf)) return pdf local = src.get("local", "") if "zip_url" in src: if local and os.path.exists(local): data = open(local, "rb").read(); log.info("ZIP local %s (%d B)", local, len(data)) else: data = _http_get(src["zip_url"], timeout=180.0) log.info("ZIP downloaded (%d B)", len(data)) pdf = zipfile.ZipFile(io.BytesIO(data)).read(src["inner"]) else: if local and os.path.exists(local): pdf = open(local, "rb").read() else: pdf = _http_get(src["pdf_url"], timeout=120.0) if pdf[:4] != b"%PDF": raise RuntimeError("not a PDF: %r" % pdf[:16]) try: open(cache_pdf, "wb").write(pdf) except Exception: pass return pdf def src_url(src): return src.get("zip_url") or src.get("pdf_url") def line_font(pg, ln): chs = [c for c in pg.chars if ln["top"] - 1 <= c["top"] <= ln["bottom"] + 1] if not chs: return 0.0, "" sz = Counter(round(c.get("size", 0), 1) for c in chs).most_common(1)[0][0] fn = Counter(c.get("fontname", "") for c in chs).most_common(1)[0][0] return sz, fn def _upper_ratio(title): letters = [c for c in title if c.isalpha()] return (sum(c.isupper() for c in letters) / len(letters)) if letters else 0.0 _TTL_STOP = {"der", "die", "das", "den", "des", "dem", "und", "von", "zur", "zum", "für", "auf", "the", "of", "and", "to", "for", "in", "on", "an", "under", "with", "sur", "aux"} def _ttoks(s): out = set() for t in re.findall(r'[a-zà-ÿ0-9]+', s.lower()): t = re.sub(r'\d+$', '', t) # Footnote-Ziffern entkleben ("freiwillig12" -> "freiwillig") if len(t) > 2 and t not in _TTL_STOP: out.add(t) return out def _is_caption(title): # strukturelle All-Caps-Überschrift (PREFACE/VORWORT/ANNEX) — auch ohne TOC-Eintrag gültig return _upper_ratio(title) >= 0.8 and len(title) <= 34 # TOC-Zeile: <enum> <Titel> <lange Punktführung> <Seitenzahl>. EIGENE Regex (NICHT ENUM — # dessen Titel-Cap .{0,84} scheitert an 130+-Punkt-Führungen). Titel = non-greedy bis zur Führung. TOC_LINE = re.compile(r'^[A-Z0-9][\w./()\-]*\s+(?P<title>.+?)\s*\.{3,}\s*\d{1,3}\s*$') def extract_toc(all_lines, body_size): """TOC = explizite Strukturdeklaration des Dokuments. Sammelt die Titel-Token-Sets der Inhaltsverzeichnis-Einträge aus dem Frontmatter. Absätze stehen NIE im TOC. >=5 => TOC vorhanden.""" titlesets = [] for page_number, txt, s, _ in all_lines: if page_number > 8: break m = TOC_LINE.match(txt) if not m: continue ts = _ttoks(m.group("title")) if ts: titlesets.append(ts) return (len(titlesets) >= 5, titlesets) def _title_in_toc(title, titlesets): # Match relativ zum BODY (inter/|body|): eine echte Überschrift ist ~ein TOC-Titel (alle Body-Wörter # im TOC -> 1.0); ein Absatz, der Sektionswörter ENTHÄLT, hat Extra-Wörter (-> <0.75) -> kein Match. # Body-Titel ist bei 84 Zeichen gekappt = Präfix des vollen TOC-Titels, daher robust gegen Kappung. bt = _ttoks(title) if not bt: return False for ts in titlesets: if len(bt & ts) / len(bt) >= 0.75: return True return False def _looks_like_heading(txt, s, fn, body_size): # F4-Plausibilitätsfilter — NICHT die Entscheidung (die trifft das TOC). Trimmt nur Fließtext weg, # BEVOR das TOC befragt wird: kurz + nicht satzschließend. Bewusst KEIN Bold/Size-Zwang — das TOC # entscheidet, Format ist nur Filter (greift eh nur in als unnummeriert erkannten Docs). t = txt.rstrip() return 3 <= len(t) <= 90 and not t.endswith((".", ";", ":", ",")) def parse_guidance(pdf_bytes, noise): with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf: n_pages = len(pdf.pages) szc = Counter() all_lines = [] # (page_number, txt, size, fontname) — einmal extrahiert, zweimal genutzt n_tables = 0 # detected (Capability: Tables Detect) raw_tables = [] # (page_number, rows) — EXTRAHIERT (Capability: Tables Extract). Scope: gelinierte/einfache. for pg in pdf.pages: try: pts = pg.extract_tables() except Exception: pts = [] n_tables += len(pts) for tbl in pts: # erster Scope: >=2 Zeilen + >=3 nicht-leere Zellen (keine verschachtelten/visuellen) if tbl and len(tbl) >= 2 and sum(1 for row in tbl for c in row if c and c.strip()) >= 3: raw_tables.append((pg.page_number, [[(c or "").strip() for c in row] for row in tbl])) for ch in pg.chars: szc[round(ch.get("size", 0), 1)] += 1 for ln in pg.extract_text_lines(layout=False): txt = " ".join(ln["text"].split()) if not txt: continue s, fn = line_font(pg, ln) all_lines.append((pg.page_number, txt, s, fn)) total_chars = sum(szc.values()) body_size = szc.most_common(1)[0][0] # Schema-Erkennung (selbstkalibrierend): nutzt das Dok ALL-CAPS bare-Integer-Sektionsüberschriften # (EDPB-Hausstil)? Dann nummeriert es auch Absätze (Satz-case) -> bare-Integer-Heading muss ALL-CAPS # sein. WP243 (Title-case-Sektionen, keine ALL-CAPS) -> Regel inaktiv, keine Regression. Schwelle 2 # gegen einzelne Fluke-Caps-Zeile; EN/DE-formatierungsunabhängig (anders als bare-Integer-Magnitude). caps_secs = 0 for _, txt, s, _ in all_lines: if s < body_size - 0.5: continue mm = ENUM.match(txt) if (mm and mm.group("ar") and "." not in mm.group("ar") and not re.search(r'\s\d{1,3}$', mm.group("title")) and _upper_ratio(mm.group("title")) >= 0.6): caps_secs += 1 caps_scheme = caps_secs >= 2 toc_present, toc = extract_toc(all_lines, body_size) # F4-DOC-Erkennung: decken NUMMERIERTE Headings das TOC ab? Wenn kaum -> die Headings sind unnummeriert # (F4) -> unnummerierten Pfad aktivieren. Sonst (F1-F3) AUS -> keine Regression. numbered_toc_hits = 0 if toc_present: for _, txt, s, _ in all_lines: if s < body_size - 0.5: continue mm = ENUM.match(txt) if not mm or re.search(r'\s\d{1,3}$', mm.group("title")): continue ttl = mm.group("title").strip().rstrip('.').strip() if _title_in_toc(ttl, toc) or _is_caption(ttl): numbered_toc_hits += 1 unnumbered_doc = toc_present and numbered_toc_hits < max(3, len(toc) * 0.4) sections, cur, started, in_annex, seen_types, max_sec, un_count = [], None, False, False, set(), 0, 0 top_en, top_type = None, None # last level-1 enumerator (roman/arabic/annex) for sub-path assembly for page_number, txt, s, fn in all_lines: m = ENUM.match(txt) ok_head = bool(m) and s >= body_size - 0.5 and not re.search(r'\s\d{1,3}$', m.group("title")) if ok_head: if m.group("ar"): typ, en = "ar", m.group("ar") elif m.group("ro"): typ, en = "ro", m.group("ro") elif m.group("le"): typ, en = "le", m.group("le") else: typ, en = "ax", m.group("ax") title = m.group("title").strip().rstrip('.').strip() # Großbuchstaben-Sub nur in römisch-Schema (Label-Assembly) — immer if typ == "le" and top_type != "ro": ok_head = False if ok_head and typ == "ar" and not in_annex: if unnumbered_doc: # F4-Doc: nummerierte Zeilen sind Absätze/Artikel-Refs ("13.1", "39.34"), nicht Headings # -> nur akzeptieren, wenn der Titel im TOC steht. if not _title_in_toc(title, toc): ok_head = False elif "." in en: # dotted = echte Sub-Headings (immer ok), AUSSER Dezimal/Uhrzeit-FP: führende Null im # Sub-Teil ("8.00 Uhr", "17.00"). Echte Sektionsnummern haben keine führende Null (3.1, 3.10). if any(c.startswith("0") for c in en.split(".")[1:]): ok_head = False elif toc_present: # bare-Integer: TOC = Wahrheit (Absätze stehen nie im TOC); All-Caps-Captions auch ohne TOC. if not (_title_in_toc(title, toc) or _is_caption(title)): ok_head = False else: # Fallback-Heuristiken nur für bare-Integer (kein TOC) if top_type == "ro": ok_head = False elif caps_scheme and _upper_ratio(title) < 0.6: ok_head = False elif int(en) > max_sec + 5: ok_head = False elif (m is None and unnumbered_doc and _looks_like_heading(txt, s, fn, body_size) and not re.search(r'\.\d+$', txt.strip()) and _title_in_toc(txt, toc)): # F4 — UNNUMMERIERTE Überschrift: TOC = die ENTSCHEIDUNG, looks_like_heading nur Plausibilitätsfilter. # Greift NUR wenn ENUM nicht matcht UND der Doc als unnummeriert erkannt wurde (F1-F3 unberührt). # NICHT mit ".<Ziffern>" endend -> Fußnoten-Fragmente ("environment.47") raus (schont DE-Komposita). ok_head, typ, en, title = True, "ut", None, txt.strip().rstrip('.').strip() if ok_head: seen_types.add(typ) if typ == "le": num, lvl = ((top_en + "." + en) if top_en else en), 2 elif typ == "ut": un_count += 1 num, lvl = str(un_count), 1 else: num = en lvl = (num.count('.') + 1) if typ == "ar" else 1 top_en, top_type = en, typ if typ == "ar" and "." not in en and not in_annex: max_sec = max(max_sec, int(en)) cur = {"num": num, "title": title, "level": lvl, "in_annex": in_annex, "bold": "bold" in fn.lower(), "page": page_number, "body": []} sections.append(cur); started = True if typ == "ax" or any(k in (num + " " + title).upper() for k in ANNEX_KW): in_annex = True elif started and cur is not None: if s >= body_size - 2 and not noise.match(txt): cur["body"].append(txt) # TOC-Duplikate generisch entfernen: gleicher LABEL-Key doppelt (TOC-Stub p2-3 + echte Sektion) # -> den mit längerem Body behalten. Key = Label-Logik (A-Präfix für numerische Annex-Items), # NICHT roher num — sonst kollidieren WP243-Annex-Items (num 1-13) mit Kapiteln (num 1-5). def _lk(sc): n = sc["num"] return ("A" + n) if (sc.get("in_annex") and n.isdigit()) else n best = {} for sc in sections: k = _lk(sc) if k not in best or len("\n".join(sc["body"])) > len("\n".join(best[k]["body"])): best[k] = sc sections = [sc for sc in sections if best.get(_lk(sc)) is sc] return {"pages": n_pages, "total_chars": total_chars, "body_size": body_size, "sections": sections, "caps_scheme": caps_scheme, "schemes": seen_types, "toc": toc_present, "toc_entries": len(toc), "tables": n_tables, "annex": any(sc.get("in_annex") for sc in sections), "raw_tables": raw_tables} def self_test(parsed, expected, artref, expected_refs): probs, secs = [], parsed["sections"] if parsed["total_chars"] < 5000: probs.append("zu wenig embedded text (%d) -> OCR?" % parsed["total_chars"]) if len(secs) < expected: probs.append("nur %d Sektionen < expected %d" % (len(secs), expected)) full = " ".join(t for sc in secs for t in sc["body"]) + " " + " ".join(sc["title"] for sc in secs) arts = set(artref.findall(full)) # Manifest-Vertrag (doc_type-abhängig): Artikel-Pflicht NUR wenn der doc_type 'article' erwartet # (Guidance/EU-VO). Technical Standards (NIST/ENISA) zitieren section/control -> kein FAIL bei fehlenden Artikeln. if "article" in expected_refs and not arts: probs.append("references_out: KEIN Artikel erkannt (doc_type erwartet article)") return (not probs, probs, sorted(arts, key=lambda x: int(x))[:12]) def _build_units_struct(doc_id, doc, lang, parsed, base_version, prov): reg = doc["reg"] units = [] sources = [] idx = 0 for sc in parsed["sections"]: body = "\n".join(sc["body"]).strip() if len(body) < 40: continue idx += 1 num = sc["num"] lab = ("A" + num) if (sc.get("in_annex") and num.isdigit()) else num cu = "%s §%s" % (reg, lab) text = "%s §%s %s\n\n%s" % (reg, lab, sc["title"], body) m = { "regulation_code": reg, "regulation_short": reg, "regulation_name_de": doc["name"], "language": lang, "citation_style": "guidance_section", "document_type": "guidance", "source_class": "supervisory_guidance", "source_role": "interpretation", "use_for_primary": False, "bindingness": "non_binding_interpretative", "authority_level": 70, "authority_weight": 70, "source_type": "guidance", "issuer": doc["issuer"], "jurisdiction": "EU", "source": "ec.europa.eu", "license": "public_eu", "category": "guidance", "citation_unit": cu, "article_label": cu, "parent_citation_unit": reg, "is_citable": True, "article": "§%s" % lab, "article_title": sc["title"], "article_type": "interpretation", "chunk_scope": "guidance_section", "context_hierarchy": [reg], "display_context": "%s > §%s %s" % (reg, lab, sc["title"]), "norm_id": "EU-%s-%s-%s" % (doc_id, lang.upper(), lab), "references_out": [], "child_tables": sc.get("child_tables", []), } m.update(prov) units.append(UploadUnit(filename="%s_%s_s%d.txt" % (doc_id.lower(), lang, idx), text=text, meta=m, document_version="%s-s%d" % (base_version, idx), collection=BUILD_COLLECTION)) sources.append((body, sc["title"])) return units, sources def _attach_refs(units, sources, artref): for u, (body, title) in zip(units, sources): u.meta["references_out"] = sorted({"Art. %s DSGVO" % n for n in artref.findall(body + " " + title)}, key=lambda x: int(x.split()[1])) def build_units(doc_id, doc, lang, parsed, base_version, prov, artref): units, sources = _build_units_struct(doc_id, doc, lang, parsed, base_version, prov) _attach_refs(units, sources, artref) return units def _table_md(rows): hdr = rows[0] out = ["| " + " | ".join(c or "" for c in hdr) + " |", "| " + " | ".join("---" for _ in hdr) + " |"] for r in rows[1:]: out.append("| " + " | ".join((c or "").replace("\n", " ") for c in r) + " |") return "\n".join(out) def build_table_units(doc_id, doc, lang, parsed, base_version, prov): # Capability Tables Extraction: jede Tabelle = EIGENE Knowledge-Unit (Markdown + JSON), an ihre Sektion gehängt. # table.parent_section = section.num ; section.child_tables = [table_id]. Separater Pfad — Heading-Parse unberührt. reg, sections, units = doc["reg"], parsed["sections"], [] for ti, (page, rows) in enumerate(parsed.get("raw_tables", []), 1): parent = None for sc in sections: if sc.get("page", 0) <= page: parent = sc else: break plabel = parent["num"] if parent else "0" md = _table_md(rows) if len(md) < 25: continue tid = "%s-t%d" % (doc_id, ti) if parent is not None: parent.setdefault("child_tables", []).append(tid) cu = "%s §%s Tabelle %d" % (reg, plabel, ti) m = {"regulation_code": reg, "regulation_short": reg, "language": lang, "source_class": "supervisory_guidance", "source_role": "interpretation", "use_for_primary": False, "jurisdiction": "EU", "category": "guidance", "is_table": True, "table_id": tid, "parent_section": plabel, "page": page, "columns": rows[0], "rows": rows, "markdown": md, "extraction_method": "pdfplumber", "confidence": "lined-simple", "citation_unit": cu, "article_label": cu, "chunk_scope": "table", "display_context": "%s > §%s > Tabelle %d" % (reg, plabel, ti), "references_out": []} m.update(prov) units.append(UploadUnit(filename="%s_%s_tbl%d.txt" % (doc_id.lower(), lang, ti), text="%s §%s — Tabelle %d (S.%d)\n\n%s" % (reg, plabel, ti, page, md), meta=m, document_version="%s-tbl%d" % (base_version, ti), collection=BUILD_COLLECTION)) return units import capability_pipeline as _CP def _region_native_tables(pdf_bytes): rt = [] regions = [] with pdfplumber.open(io.BytesIO(pdf_bytes)) as _pdf: for pg in _pdf.pages: try: tables = pg.find_tables() except Exception: tables = [] for t in tables: try: tbl = t.extract() except Exception: tbl = None if tbl and len(tbl) >= 2 and sum(1 for row in tbl for c in row if c and c.strip()) >= 3: rt.append((pg.page_number, [[(c or "").strip() for c in row] for row in tbl])) regions.append((pg.page_number, t.bbox)) return rt, regions class _C6Tables: name = "C6_Tables" consumes = ["table_region"] produces = ["table_units"] def run(self, ctx): rtables, regions = _region_native_tables(ctx["pdf"]) ctx["claimed_table_regions"] = regions p2 = dict(ctx["parsed"]) p2["raw_tables"] = rtables ctx["_table_units"] = build_table_units(ctx["doc_id"], ctx["doc"], ctx["lang"], p2, ctx["rt"], ctx["prov"]) class _C7ReadingOrder: name = "C7_ReadingOrder" consumes = ["prose_region", "table_units"] produces = ["ordered_prose"] def run(self, ctx): ctx["_c7_owns_prose"] = True class _C1C2Sections: name = "C1C2_Sections" consumes = ["prose_region", "table_units"] produces = ["section_struct"] def run(self, ctx): units, sources = _build_units_struct(ctx["doc_id"], ctx["doc"], ctx["lang"], ctx["parsed"], ctx["rt"], ctx["prov"]) ctx["_section_units"] = units ctx["_sources"] = sources class _C4References: name = "C4_References" consumes = ["section_struct"] produces = ["references"] def run(self, ctx): _attach_refs(ctx["_section_units"], ctx["_sources"], ctx["cfg"]["artref"]) def run_engine(doc_id, doc, lang, parsed, run_tag, prov, cfg, pdf_bytes): ctx = {"doc_id": doc_id, "doc": doc, "lang": lang, "parsed": parsed, "rt": run_tag, "prov": prov, "cfg": cfg, "pdf": pdf_bytes} caps = [_C4References(), _C7ReadingOrder(), _C1C2Sections(), _C6Tables()] order = _CP.resolve_order(caps, {"table_region", "prose_region"}) for c in order: c.run(ctx) return ctx["_section_units"], ctx["_table_units"] def main(): ap = argparse.ArgumentParser() ap.add_argument("--doc", required=True) ap.add_argument("--lang", required=True, choices=["en", "de"]) ap.add_argument("--dry-run", action="store_true") args = ap.parse_args() doc = DOCS[args.doc] lang, cfg = args.lang, LANG[args.lang] sha = git_sha() run_tag = "2026.1-%s-%s" % (args.doc.lower(), lang) run_id = "%s-%s" % (run_tag, int(time.time())) date = datetime.date.today().isoformat() src = doc["sources"][lang] log.info("SGE %s [%s] | exp_sec=%d sha=%s dry=%s", args.doc, lang, doc["expected_sections"], sha, args.dry_run) doc_type = doc.get("doc_type") or ISSUER_DOCTYPE.get(doc["issuer"], "Guidance") expected_refs = DOC_TYPE_REFS.get(doc_type, ["article"]) pdf = fetch(src) _t0 = time.time() parsed = parse_guidance(pdf, cfg["noise"]) parse_ms = int((time.time() - _t0) * 1000) ok, probs, arts = self_test(parsed, doc["expected_sections"], cfg["artref"], expected_refs) pages = parsed["pages"] log.info("PARSED pages=%d chars=%d body=%.1f sections=%d caps_scheme=%s gate=%s %s", pages, parsed["total_chars"], parsed["body_size"], len(parsed["sections"]), parsed.get("caps_scheme"), ok, probs) log.info("Art-refs: %s", arts) for sc in parsed["sections"]: log.info(" [%s]%s %s (p%d)", sc["num"], "*" if sc.get("in_annex") else " ", sc["title"][:58], sc["page"]) sch = parsed.get("schemes", set()) family = ("unnumbered-toc" if "ut" in sch and not ({"ar", "ro"} & sch) # F4 else "roman-hierarchical" if "ro" in sch # F2 else "arabic-caps+paragraph" if parsed.get("caps_scheme") # F3 else "arabic-hierarchical") # F1 detF = FAMILY_F.get(family, "?") expF = EXPECTED_FAMILY.get(args.doc, "?") gate = "OK" if expF == detF else ("REVIEW(exp=%s)" % expF if expF != "?" else "no-expected") prov = {"parser_version": "StructuredGuidanceExtractor@%s+sge_build" % sha, "ingest_run_id": run_id, "ingest_date": date, "source_url": src_url(src), "source_inner_file": src.get("inner", ""), "build_collection": BUILD_COLLECTION, "manifest_version": MANIFEST_VERSION, "document_id": args.doc, "layout_family": detF, "document_type": doc_type, "expected_reference_types": expected_refs} units, table_units = run_engine(args.doc, doc, lang, parsed, run_tag, prov, cfg, pdf) # CUTOVER: Engine-Pfad total_refs = sum(len(u.meta["references_out"]) for u in units) detect = "TOC(%d)" % parsed.get("toc_entries", 0) if parsed.get("toc") else "heuristic" log.info("UNITS=%d table_units=%d | VITALS family=%s(%s) FAMILY-GATE=%s detect=%s tables_detect=%d tables_extract=%d annex=%s parse_ms=%d chunks/page(units)=%.2f refs/unit=%.2f", len(units), len(table_units), family, detF, gate, detect, parsed.get("tables", 0), len(table_units), "Y" if parsed.get("annex") else "N", parse_ms, len(units) / max(pages, 1), total_refs / max(len(units), 1)) if gate.startswith("REVIEW"): log.warning("FAMILY-GATE REVIEW: %s erwartet %s, erkannt %s — Klassifikation prüfen", args.doc, expF, detF) if units: m = units[0].meta pk = ["parser_version", "ingest_run_id", "ingest_date", "source_url", "build_collection", "manifest_version", "document_id"] log.info("sample: label=%r language=%r source_class=%r use_for_primary=%r refs=%s", m.get("article_label"), m.get("language"), m.get("source_class"), m.get("use_for_primary"), m.get("references_out")) log.info("provenance present: %s", all(k in m for k in pk)) if args.dry_run: import json as _json, hashlib as _hl _g=[{"id":u.document_version,"filename":u.filename,"kind":("table" if u.meta.get("is_table") else "section"), "text_sha":_hl.sha256((u.text or "").encode()).hexdigest()[:16],"meta":u.meta} for u in (units+table_units)] _bp="/tmp/baseline_%s_%s.json"%(args.doc,lang) open(_bp,"w").write(_json.dumps(_g,ensure_ascii=False,indent=1,default=str)) log.info("DRY RUN baseline -> %s (%d units: %d section + %d table)",_bp,len(_g),len(units),len(table_units)); return if not ok: log.error("GATE FAILED — aborting"); sys.exit(1) n = 0 with httpx.Client(timeout=600.0, verify=False) as c: for u in units + table_units: n += upload_unit(c, RAG_URL, u) log.info("UPLOADED: %d section + %d table units -> %d chunks", len(units), len(table_units), n) if __name__ == "__main__": main()