Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| a8412e3db7 | |||
| 3b466be140 | |||
| 24c618ca2e | |||
| c258fbc3de | |||
| 569f64a400 |
@@ -0,0 +1,92 @@
|
||||
"""Capability Execution Engine (Prototyp). Region-IR mit Runtime-Owner; Capabilities
|
||||
deklarieren claims/consumes/produces; Ausführungsreihenfolge wird aus dem Artefakt-Graphen
|
||||
ABGELEITET (topologisch), nicht hartkodiert. Realisiert C6/C7 als Pipeline-Stages mit Region-Ownership."""
|
||||
from dataclasses import dataclass, field
|
||||
import reading_order as RO
|
||||
|
||||
@dataclass
class Region:
    """Rectangular page area whose ownership is tracked during the claim phase."""

    # Identifier assigned by the segmenter.
    id: int
    # Four-element box; helpers in this module unpack it as (x0, top, x1, bottom).
    bbox: tuple
    # Region category; capabilities compare this against e.g. "table" / "prose".
    type: str = "unknown"
    # "unclaimed" until a capability claims the region.
    state: str = "unclaimed"
    # Name of the claiming capability, or None while unclaimed.
    owner: str = None
|
||||
|
||||
@dataclass
class Artifact:
    """Value produced by a capability and passed along the pipeline."""

    # Artifact category; kinds starting with "_" are treated as internal by the pipeline.
    kind: str
    # Arbitrary content; its shape depends on the producing capability.
    payload: object
    # Id of the region the artifact was derived from, or None if not region-bound.
    source_region: int = None
|
||||
|
||||
class Capability:
    """Base pipeline stage: declares what it consumes/produces and which regions it claims.

    Subclasses override the class attributes and, where needed, ``claims`` and ``run``.
    The defaults claim nothing and produce nothing.
    """

    name = ""
    # Artifact/region kinds that must be available before this stage can run.
    consumes = []
    # Artifact kinds this stage makes available to later stages.
    produces = []
    # Residual stages are sorted after non-residual ones in the claim phase.
    residual = False

    def claims(self, region):
        """Return True if this capability wants to own ``region`` (default: never)."""
        return False

    def run(self, regions, page, artifacts):
        """Process the owned ``regions`` and return new artifacts (default: none)."""
        return []
|
||||
|
||||
class TableExtraction(Capability):
    """C6 stage: claims table regions and reports the row count of each region's first table."""

    name = "C6_TableExtraction"
    consumes = ["table_region"]
    produces = ["table_units"]

    def claims(self, region):
        """Claim only regions typed as ``"table"``."""
        return region.type == "table"

    def run(self, regions, page, artifacts):
        """Crop the page to every owned region and emit one ``table_units`` artifact each.

        Only the first table found inside a crop is extracted; a crop without
        any detected table yields a row count of 0.
        """
        units = []
        for region in regions:
            cropped = page.crop(_clamp(region.bbox, page))
            found = cropped.find_tables()
            if found:
                rows = found[0].extract()
            else:
                rows = []
            payload = {"region": region.id, "rows": len(rows)}
            units.append(Artifact("table_units", payload, region.id))
        return units
|
||||
|
||||
class ReadingOrder(Capability):
    """C7 stage: residual claimer for prose regions; linearises the words outside table boxes."""

    name = "C7_ReadingOrder"
    consumes = ["prose_region"]
    produces = ["ordered_prose"]
    residual = True

    def claims(self, region):
        """Claim only regions typed as ``"prose"``."""
        return region.type == "prose"

    def run(self, regions, page, artifacts):
        """Emit one ``ordered_prose`` artifact with word and character counts.

        Words whose centre lies inside any ``_table_bbox`` artifact are dropped
        before the text is linearised. The owned ``regions`` are not consulted;
        the whole page's words are used.
        """
        excluded = [a.payload for a in artifacts if a.kind == "_table_bbox"]
        words = []
        for word in page.extract_words():
            if not _in_any(word, excluded):
                words.append(word)
        # Guard against a reading_order module that lacks emit_words.
        if hasattr(RO, "emit_words"):
            text = RO.emit_words(words, float(page.width))
        else:
            text = ""
        summary = {"words": len(words), "chars": len(text)}
        return [Artifact("ordered_prose", summary, None)]
|
||||
|
||||
class FigureExtraction(Capability):
    """C8 stage: claims figure regions; ``run`` is inherited and produces nothing yet.

    NOTE(review): the segmenter visible in this module only creates "table" and
    "prose" regions, so "figure_region" looks like it can never be satisfied
    there. Confirm how this capability is meant to be scheduled.
    """

    name = "C8_FigureExtraction"
    consumes = ["figure_region"]
    produces = ["figure_units"]

    def claims(self, region):
        """Claim only regions typed as ``"figure"``."""
        return region.type == "figure"
|
||||
|
||||
class References(Capability):
    """C4 stage: declared consumer of ``ordered_prose``.

    It inherits ``claims`` and ``run`` unchanged, so it owns no regions and
    currently returns no artifacts.
    """

    name = "C4_References"
    consumes = ["ordered_prose"]
    produces = ["citation_units"]
|
||||
|
||||
def _in_any(w, bboxes):
|
||||
cx=(w["x0"]+w["x1"])/2; cy=(w["top"]+w["bottom"])/2
|
||||
for (x0,t,x1,b) in bboxes:
|
||||
if x0<=cx<=x1 and t<=cy<=b: return True
|
||||
return False
|
||||
|
||||
def _clamp(b, page):
|
||||
x0,t,x1,bt=b
|
||||
return (max(0,x0),max(0,t),min(float(page.width),x1),min(float(page.height),bt))
|
||||
|
||||
def segment(page):
    """Split ``page`` into table regions plus one page-wide prose region.

    Returns:
        ``(regions, table_boxes)``: the region list (tables first, the prose
        region last) and the clamped boxes of the accepted tables.
    """
    page_w = float(page.width)
    page_h = float(page.height)
    regions = []
    table_boxes = []
    for table in page.find_tables():
        box = _clamp(table.bbox, page)
        # Keep only substantial tables (>= 25% of page width and >= 25 units
        # tall); this filters out footer artefacts.
        if (box[2] - box[0]) >= 0.25 * page_w and (box[3] - box[1]) >= 25:
            regions.append(Region(len(regions), box, "table"))
            table_boxes.append(box)
    # The residual prose region always spans the full page.
    regions.append(Region(len(regions), (0, 0, page_w, page_h), "prose"))
    return regions, table_boxes
|
||||
|
||||
def resolve_order(caps, raw_types):
    """Order ``caps`` so each one runs after everything it consumes is available.

    Args:
        caps: Objects exposing ``name``, ``consumes`` and ``produces``.
        raw_types: Kinds that are available before any capability runs.

    Returns:
        The capabilities in a dependency-respecting order.

    Raises:
        RuntimeError: If a full pass schedules nothing, i.e. the remaining
            capabilities consume kinds nobody provides.
    """
    available = set(raw_types)
    ordered = []
    pending = list(caps)
    while pending:
        deferred = []
        for cap in pending:
            if all(dep in available for dep in cap.consumes):
                ordered.append(cap)
                # Outputs become visible immediately, also within this pass.
                available.update(cap.produces)
            else:
                deferred.append(cap)
        if len(deferred) == len(pending):
            raise RuntimeError("unsatisfiable: " + str([c.name for c in deferred]))
        pending = deferred
    return ordered
|
||||
|
||||
def run_pipeline(page, caps):
    """Segment ``page``, let capabilities claim regions, then run them in derived order.

    Returns:
        ``(regions, artifacts, order)`` where ``artifacts`` excludes internal
        kinds (those whose name starts with ``"_"``).

    NOTE(review): the raw types are derived from the regions actually found,
    so a capability consuming a region type absent from this page makes
    ``resolve_order`` raise. Confirm callers pass a matching capability list.
    """
    regions, table_boxes = segment(page)
    raw_types = set(region.type + "_region" for region in regions)
    order = resolve_order(caps, raw_types)
    # Internal geometry artifacts, used by C7 to exclude words inside tables.
    artifacts = [Artifact("_table_bbox", box) for box in table_boxes]

    # CLAIM phase: specific claimers first, residual ones last.
    claim_sequence = sorted(order, key=lambda cap: cap.residual)
    for cap in claim_sequence:
        for region in regions:
            if region.state != "unclaimed":
                continue
            if cap.claims(region):
                region.state = "claimed"
                region.owner = cap.name

    # RUN phase in the derived order.
    for cap in order:
        owned = [region for region in regions if region.owner == cap.name]
        # Stages without regions still run when they consume ordered prose.
        if owned or any(dep == "ordered_prose" for dep in cap.consumes):
            artifacts += cap.run(owned, page, artifacts)

    visible = [a for a in artifacts if not a.kind.startswith("_")]
    return regions, visible, order
|
||||
@@ -1,200 +1,139 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Ingest missing EU regulations from EUR-Lex (HTML).
|
||||
"""Ingest EU legal acts from eur-lex/CELLAR via the LegalActIngester engine.
|
||||
|
||||
Downloads German HTML from EUR-Lex via CELEX number, uploads with legal chunking.
|
||||
For each act this downloads the German XHTML (CELLAR, eur-lex fallback), parses
|
||||
it into articles + annexes with full authority metadata + citation edges
|
||||
(services/legal_act_ingester.py), self-tests the parse, and uploads per unit.
|
||||
Acts whose CELEX already exists are SKIPPED — there is no automatic re-ingest.
|
||||
|
||||
Usage (on Mac Mini):
|
||||
Usage (Mac Mini, with the RAG service reachable):
|
||||
python3 control-pipeline/scripts/ingest_eu_regulations.py --dry-run
|
||||
python3 control-pipeline/scripts/ingest_eu_regulations.py
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from typing import TypedDict
|
||||
|
||||
import httpx
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
from services.legal_act_ingester import ( # noqa: E402
|
||||
RegSpec,
|
||||
build_upload_units,
|
||||
download_act,
|
||||
parse_html,
|
||||
self_test,
|
||||
upload_unit,
|
||||
)
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
||||
logger = logging.getLogger("ingest-eu")
|
||||
|
||||
RAG_URL = "https://localhost:8097"
|
||||
QDRANT_URL = "http://localhost:6333"
|
||||
RAG_URL = os.getenv("RAG_URL", "https://localhost:8097")
|
||||
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
|
||||
COLLECTION = "bp_compliance_ce"
|
||||
RUN_TAG = "2026-06-eu-v1"
|
||||
|
||||
EURLEX_URL = "https://eur-lex.europa.eu/legal-content/DE/TXT/HTML/?uri=CELEX:{celex}"
|
||||
|
||||
# ---- EU Regulations to ingest ----
|
||||
REGULATIONS = [
|
||||
{
|
||||
"celex": "32022L2464",
|
||||
"regulation_id": "csrd_2022",
|
||||
"name": "Corporate Sustainability Reporting Directive (CSRD)",
|
||||
"short": "CSRD",
|
||||
"category": "sustainability",
|
||||
},
|
||||
{
|
||||
"celex": "32024L1760",
|
||||
"regulation_id": "csddd_2024",
|
||||
"name": "Corporate Sustainability Due Diligence Directive (CSDDD)",
|
||||
"short": "CSDDD",
|
||||
"category": "sustainability",
|
||||
},
|
||||
{
|
||||
"celex": "32020R0852",
|
||||
"regulation_id": "eu_taxonomy_2020",
|
||||
"name": "EU-Taxonomie-Verordnung",
|
||||
"short": "EU Taxonomy",
|
||||
"category": "sustainability",
|
||||
},
|
||||
{
|
||||
"celex": "32024R1183",
|
||||
"regulation_id": "eidas_2_0_2024",
|
||||
"name": "eIDAS 2.0 Verordnung (EU Digital Identity)",
|
||||
"short": "eIDAS 2.0",
|
||||
"category": "digital_identity",
|
||||
},
|
||||
{
|
||||
"celex": "32023L0970",
|
||||
"regulation_id": "pay_transparency_2023",
|
||||
"name": "Entgelttransparenz-Richtlinie",
|
||||
"short": "Pay Transparency",
|
||||
"category": "employment",
|
||||
},
|
||||
{
|
||||
"celex": "32022R2065",
|
||||
"regulation_id": "dsa_2022_updated",
|
||||
"name": "Digital Services Act (DSA) — aktualisiert",
|
||||
"short": "DSA",
|
||||
"category": "digital_services",
|
||||
"skip_if_exists": "dsa_2022", # already exists under different ID
|
||||
},
|
||||
class IngestResult(TypedDict):
    """Outcome row for one act, as returned by ``ingest_one``."""

    # Short act label (taken from the spec's ``reg``).
    reg: str
    # Outcome tag, e.g. "exists", "download_failed", "gate_failed", "dry_run", "ok".
    status: str
    # Count attached to the outcome: existing chunks, planned units, or uploaded chunks.
    chunks: int
|
||||
|
||||
|
||||
def _rank(celex: str) -> str:
|
||||
"""eu_directive for L-acts, eu_regulation otherwise (CELEX descriptor letter)."""
|
||||
return "eu_directive" if len(celex) > 5 and celex[5] == "L" else "eu_regulation"
|
||||
|
||||
|
||||
def _spec(celex: str, name_de: str, short: str, version_date: str = "") -> RegSpec:
    """Build a ``RegSpec`` for this script's collection, deriving the rank from ``celex``.

    ``short`` becomes the spec's ``reg`` label; ``version_date`` defaults to empty.
    """
    fields = {
        "reg": short,
        "celex": celex,
        "name_de": name_de,
        "collection": COLLECTION,
        "version_date": version_date,
        "legal_basis_rank": _rank(celex),
    }
    return RegSpec(**fields)
|
||||
|
||||
|
||||
# Acts this script ingests. The proven MVP acts (CRA / AI Act / DORA / NIS2 /
|
||||
# MaschinenVO / DSGVO) are already in the corpus and get re-ingested via a
|
||||
# separate, controlled step — not here.
|
||||
SPECS = [
|
||||
_spec("32022L2464", "Corporate Sustainability Reporting Directive (CSRD)", "CSRD"),
|
||||
_spec("32024L1760", "Corporate Sustainability Due Diligence Directive (CSDDD)", "CSDDD"),
|
||||
_spec("32020R0852", "EU-Taxonomie-Verordnung", "EU Taxonomy"),
|
||||
_spec("32024R1183", "eIDAS 2.0 Verordnung (EU Digital Identity)", "eIDAS 2.0"),
|
||||
_spec("32023L0970", "Entgelttransparenz-Richtlinie", "Pay Transparency"),
|
||||
_spec("32022R2065", "Digital Services Act (DSA)", "DSA"),
|
||||
]
|
||||
|
||||
|
||||
def download_eurlex(celex: str) -> str:
|
||||
"""Download EU regulation HTML from EUR-Lex."""
|
||||
url = EURLEX_URL.format(celex=celex)
|
||||
with httpx.Client(timeout=30.0, follow_redirects=True) as c:
|
||||
resp = c.get(url)
|
||||
resp.raise_for_status()
|
||||
return resp.text
|
||||
|
||||
|
||||
def upload_html(html: str, filename: str, reg: dict, dry_run: bool = False):
|
||||
"""Upload HTML to RAG service."""
|
||||
if dry_run:
|
||||
logger.info(" DRY RUN — would upload %d chars", len(html))
|
||||
return {"chunks_count": 0}
|
||||
|
||||
meta = {
|
||||
"regulation_id": reg["regulation_id"],
|
||||
"regulation_name_de": reg["name"],
|
||||
"regulation_short": reg["short"],
|
||||
"celex": reg["celex"],
|
||||
"category": reg["category"],
|
||||
"source": "EUR-Lex",
|
||||
"license": "EU_law",
|
||||
"jurisdiction": "EU",
|
||||
"source_type": "law",
|
||||
}
|
||||
form_data = {
|
||||
"collection": COLLECTION,
|
||||
"data_type": "compliance",
|
||||
"bundesland": "bund",
|
||||
"use_case": "compliance",
|
||||
"year": "2026",
|
||||
"chunk_strategy": "legal",
|
||||
"chunk_size": "1500",
|
||||
"chunk_overlap": "100",
|
||||
"metadata_json": json.dumps(meta, ensure_ascii=False),
|
||||
}
|
||||
with httpx.Client(timeout=600.0, verify=False) as c:
|
||||
resp = c.post(
|
||||
f"{RAG_URL}/api/v1/documents/upload",
|
||||
files={"file": (filename, html.encode("utf-8"), "text/html")},
|
||||
data=form_data,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
return resp.json()
|
||||
|
||||
|
||||
def count_existing(regulation_id: str) -> int:
|
||||
with httpx.Client(timeout=60.0) as c:
|
||||
resp = c.post(
|
||||
def count_existing(celex: str) -> int:
|
||||
"""Chunks already present for this CELEX (old or new tagging) — the skip guard."""
|
||||
with httpx.Client(timeout=60.0, verify=False) as client:
|
||||
resp = client.post(
|
||||
f"{QDRANT_URL}/collections/{COLLECTION}/points/count",
|
||||
json={"filter": {"must": [
|
||||
{"key": "regulation_id", "match": {"value": regulation_id}}
|
||||
]}, "exact": True},
|
||||
json={"filter": {"must": [{"key": "celex", "match": {"value": celex}}]}, "exact": True},
|
||||
)
|
||||
resp.raise_for_status()
|
||||
return resp.json()["result"]["count"]
|
||||
return int(resp.json()["result"]["count"])
|
||||
|
||||
|
||||
def main():
|
||||
def ingest_one(spec: RegSpec, dry_run: bool) -> IngestResult:
    """Run skip-check, download, parse, gate and upload for a single act.

    Args:
        spec: Act to ingest; ``celex`` and ``reg`` are read here and the whole
            spec is passed on to ``build_upload_units``.
        dry_run: When true, stop after building the upload units and report
            how many would have been sent.

    Returns:
        A result row whose ``status`` is ``exists``, ``download_failed``,
        ``gate_failed``, ``dry_run`` or ``ok``.

    Raises:
        Exception: Errors from the count request, parsing or uploading
            propagate. An upload error is logged with the progress made so
            far and then re-raised unchanged.
    """
    if (existing := count_existing(spec.celex)) > 0:
        logger.info(" already present: %d chunks — SKIPPING (no re-ingest)", existing)
        return {"reg": spec.reg, "status": "exists", "chunks": existing}

    try:
        html = download_act(spec.celex)
    except Exception as exc:  # noqa: BLE001 — log + continue with the next act
        logger.error(" download FAILED: %s", exc)
        return {"reg": spec.reg, "status": "download_failed", "chunks": 0}

    act = parse_html(html, spec.reg)
    passed, problems = self_test(act)
    logger.info(" parsed: %d articles, %d annexes", len(act.articles), len(act.annexes))
    if not passed:
        # The self-test is the quality gate: nothing is uploaded on failure.
        logger.error(" GATE FAIL — %s", "; ".join(problems))
        return {"reg": spec.reg, "status": "gate_failed", "chunks": 0}

    units = build_upload_units(act, spec, RUN_TAG)
    if dry_run:
        logger.info(" DRY RUN — would upload %d units", len(units))
        return {"reg": spec.reg, "status": "dry_run", "chunks": len(units)}

    chunks = 0
    done = 0
    # verify=False turns off TLS certificate verification for the RAG service.
    with httpx.Client(timeout=600.0, verify=False) as client:
        try:
            for unit in units:
                chunks += upload_unit(client, RAG_URL, unit)
                done += 1
        except Exception:
            # A failure here can leave a partially stored act behind, which
            # the skip check above may treat as present on the next run.
            # Record how far we got before letting the error propagate.
            logger.error(
                " upload ABORTED after %d/%d units (%d chunks) — CELEX %s may be partially stored",
                done, len(units), chunks, spec.celex,
            )
            raise
    logger.info(" uploaded: %d units, %d chunks", len(units), chunks)
    return {"reg": spec.reg, "status": "ok", "chunks": chunks}
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--dry-run", action="store_true")
|
||||
args = parser.parse_args()
|
||||
|
||||
logger.info("=" * 60)
|
||||
logger.info("Ingest EU Regulations from EUR-Lex")
|
||||
logger.info(" Regulations: %d", len(REGULATIONS))
|
||||
logger.info(" Dry run: %s", args.dry_run)
|
||||
logger.info("LegalActIngester — %d acts | dry_run=%s", len(SPECS), args.dry_run)
|
||||
logger.info("=" * 60)
|
||||
|
||||
results = []
|
||||
for i, reg in enumerate(REGULATIONS, 1):
|
||||
logger.info("\n[%d/%d] %s (CELEX: %s)", i, len(REGULATIONS), reg["name"], reg["celex"])
|
||||
results: list[IngestResult] = []
|
||||
for i, spec in enumerate(SPECS, 1):
|
||||
logger.info("\n[%d/%d] %s (CELEX %s)", i, len(SPECS), spec.name_de, spec.celex)
|
||||
results.append(ingest_one(spec, args.dry_run))
|
||||
if i < len(SPECS):
|
||||
time.sleep(1)
|
||||
|
||||
# Skip if variant already exists
|
||||
skip_id = reg.get("skip_if_exists")
|
||||
if skip_id:
|
||||
existing = count_existing(skip_id)
|
||||
if existing > 0:
|
||||
logger.info(" Already exists as '%s' (%d chunks) — SKIPPING", skip_id, existing)
|
||||
results.append({"reg": reg["short"], "status": "exists", "chunks": existing})
|
||||
continue
|
||||
|
||||
# Check if this exact ID exists
|
||||
existing = count_existing(reg["regulation_id"])
|
||||
if existing > 0:
|
||||
logger.info(" Already exists: %d chunks — SKIPPING", existing)
|
||||
results.append({"reg": reg["short"], "status": "exists", "chunks": existing})
|
||||
continue
|
||||
|
||||
# Download from EUR-Lex
|
||||
logger.info(" Downloading from EUR-Lex...")
|
||||
try:
|
||||
html = download_eurlex(reg["celex"])
|
||||
logger.info(" Downloaded: %d chars", len(html))
|
||||
except Exception as e:
|
||||
logger.error(" Download FAILED: %s", e)
|
||||
results.append({"reg": reg["short"], "status": "download_failed", "chunks": 0})
|
||||
continue
|
||||
|
||||
# Upload
|
||||
filename = f"{reg['regulation_id']}.html"
|
||||
try:
|
||||
result = upload_html(html, filename, reg, args.dry_run)
|
||||
chunks = result.get("chunks_count", 0)
|
||||
logger.info(" Uploaded: %d chunks", chunks)
|
||||
results.append({"reg": reg["short"], "status": "ok", "chunks": chunks})
|
||||
except Exception as e:
|
||||
logger.error(" Upload FAILED: %s", e)
|
||||
results.append({"reg": reg["short"], "status": "error", "chunks": 0})
|
||||
|
||||
if i < len(REGULATIONS):
|
||||
time.sleep(2)
|
||||
|
||||
# Summary
|
||||
logger.info("\n" + "=" * 60)
|
||||
logger.info("RESULTS")
|
||||
logger.info("=" * 60)
|
||||
for r in results:
|
||||
logger.info(" %-20s %s chunks=%d", r["reg"], r["status"].upper(), r["chunks"])
|
||||
|
||||
total_new = sum(r["chunks"] for r in results if r["status"] == "ok")
|
||||
logger.info("\nTotal new chunks: %d", total_new)
|
||||
logger.info(" %-18s %-15s chunks=%s", r["reg"], r["status"].upper(), r["chunks"])
|
||||
total = sum(r["chunks"] for r in results if r["status"] == "ok")
|
||||
logger.info("\nTotal new chunks: %d", total)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -0,0 +1,117 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Ingest EU act recitals via the RecitalIngester engine (Parser 2).
|
||||
|
||||
Downloads each act's XHTML (CELLAR), parses the recitals (Erwägungsgründe),
|
||||
self-tests, and uploads them as a SEPARATE interpretative source
|
||||
(source_class=recital, use_for_primary=false). Acts whose recitals already
|
||||
exist are SKIPPED — no automatic re-ingest.
|
||||
|
||||
Usage (Mac Mini, with the RAG service reachable):
|
||||
python3 control-pipeline/scripts/ingest_recitals.py --dry-run
|
||||
python3 control-pipeline/scripts/ingest_recitals.py
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from typing import TypedDict
|
||||
|
||||
import httpx
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
from services.legal_act_ingester import RegSpec, download_act, upload_unit # noqa: E402
|
||||
from services.recital_ingester import build_upload_units, parse_recitals, self_test # noqa: E402
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
||||
logger = logging.getLogger("ingest-recitals")
|
||||
|
||||
RAG_URL = os.getenv("RAG_URL", "https://localhost:8097")
|
||||
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
|
||||
RUN_TAG = "2026-06-eu-v1"
|
||||
|
||||
# The MVP acts whose recitals add interpretation context. Articles for these are
|
||||
# ingested separately (Parser 1); this only adds the recitals.
|
||||
SPECS = [
|
||||
RegSpec(reg="CRA", celex="32024R2847", name_de="Cyber Resilience Act", version_date="2024-10-23"),
|
||||
RegSpec(reg="AI Act", celex="32024R1689", name_de="Verordnung über Künstliche Intelligenz (AI Act)", version_date="2024-06-13"),
|
||||
RegSpec(reg="DORA", celex="32022R2554", name_de="Digital Operational Resilience Act (DORA)", version_date="2022-12-14"),
|
||||
RegSpec(reg="MaschinenVO", celex="32023R1230", name_de="Maschinenverordnung (EU) 2023/1230", version_date="2023-06-14"),
|
||||
RegSpec(reg="NIS2", celex="32022L2555", name_de="NIS-2-Richtlinie", version_date="2022-12-14", legal_basis_rank="eu_directive"),
|
||||
RegSpec(reg="DSGVO", celex="32016R0679", name_de="Datenschutz-Grundverordnung (DSGVO)",
|
||||
collection="bp_compliance_datenschutz", version_date="2016-04-27"),
|
||||
]
|
||||
|
||||
|
||||
class IngestResult(TypedDict):
    """Outcome row for one act's recital ingest, as returned by ``ingest_one``."""

    # Short act label (taken from the spec's ``reg``).
    reg: str
    # Outcome tag, e.g. "exists", "download_failed", "gate_failed", "dry_run", "ok".
    status: str
    # Count attached to the outcome: existing recitals, planned units, or uploaded chunks.
    chunks: int
|
||||
|
||||
|
||||
def count_existing_recitals(spec: RegSpec) -> int:
    """Return the exact number of stored points tagged as recitals of ``spec``.

    Queries the count endpoint of ``spec.collection`` with a filter on the
    act's CELEX and ``chunk_scope == "recital"``. Raises on a non-success
    HTTP status.
    """
    conditions = [
        {"key": "celex", "match": {"value": spec.celex}},
        {"key": "chunk_scope", "match": {"value": "recital"}},
    ]
    body = {"filter": {"must": conditions}, "exact": True}
    with httpx.Client(timeout=60.0, verify=False) as client:
        endpoint = f"{QDRANT_URL}/collections/{spec.collection}/points/count"
        resp = client.post(endpoint, json=body)
        resp.raise_for_status()
        return int(resp.json()["result"]["count"])
|
||||
|
||||
|
||||
def ingest_one(spec: RegSpec, dry_run: bool) -> IngestResult:
    """Run skip-check, download, recital parse, gate and upload for a single act.

    Args:
        spec: Act whose recitals are ingested; ``celex`` and ``reg`` are read
            here and the whole spec is passed on to ``build_upload_units``.
        dry_run: When true, stop after building the upload units and report
            how many would have been sent.

    Returns:
        A result row whose ``status`` is ``exists``, ``download_failed``,
        ``gate_failed``, ``dry_run`` or ``ok``.

    Raises:
        Exception: Errors from the count request, parsing or uploading
            propagate. An upload error is logged with the progress made so
            far and then re-raised unchanged.
    """
    if (existing := count_existing_recitals(spec)) > 0:
        logger.info(" recitals already present: %d — SKIPPING (no re-ingest)", existing)
        return {"reg": spec.reg, "status": "exists", "chunks": existing}

    try:
        html = download_act(spec.celex)
    except Exception as exc:  # noqa: BLE001 — log + continue with the next act
        logger.error(" download FAILED: %s", exc)
        return {"reg": spec.reg, "status": "download_failed", "chunks": 0}

    recitals = parse_recitals(html, spec.reg)
    passed, problems = self_test(recitals)
    logger.info(" parsed: %d recitals", len(recitals))
    if not passed:
        # The self-test is the quality gate: nothing is uploaded on failure.
        logger.error(" GATE FAIL — %s", "; ".join(problems))
        return {"reg": spec.reg, "status": "gate_failed", "chunks": 0}

    units = build_upload_units(recitals, spec, RUN_TAG)
    if dry_run:
        logger.info(" DRY RUN — would upload %d recital units", len(units))
        return {"reg": spec.reg, "status": "dry_run", "chunks": len(units)}

    chunks = 0
    done = 0
    # verify=False turns off TLS certificate verification for the RAG service.
    with httpx.Client(timeout=600.0, verify=False) as client:
        try:
            for unit in units:
                chunks += upload_unit(client, RAG_URL, unit)
                done += 1
        except Exception:
            # A failure here can leave some recitals stored, which the skip
            # check above may treat as present on the next run. Record how
            # far we got before letting the error propagate.
            logger.error(
                " upload ABORTED after %d/%d units (%d chunks) — CELEX %s may be partially stored",
                done, len(units), chunks, spec.celex,
            )
            raise
    logger.info(" uploaded: %d units, %d chunks", len(units), chunks)
    return {"reg": spec.reg, "status": "ok", "chunks": chunks}
|
||||
|
||||
|
||||
def main() -> None:
    """Parse ``--dry-run``, ingest every act in ``SPECS`` in order, and log a summary."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    logger.info("RecitalIngester — %d acts | dry_run=%s", len(SPECS), args.dry_run)
    results: list[IngestResult] = []
    for position, spec in enumerate(SPECS, 1):
        logger.info("\n[%d/%d] %s (CELEX %s)", position, len(SPECS), spec.name_de, spec.celex)
        outcome = ingest_one(spec, args.dry_run)
        results.append(outcome)
        # Pause between acts, but not after the last one.
        if position < len(SPECS):
            time.sleep(1)

    for row in results:
        logger.info(" %-14s %-15s chunks=%s", row["reg"], row["status"].upper(), row["chunks"])
    # Only successful uploads count towards the total.
    uploaded = sum(row["chunks"] for row in results if row["status"] == "ok")
    logger.info("Total: %d", uploaded)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,107 @@
|
||||
"""C7 Reading Order Reconstruction (Pilot). ReadingRegion-Modell, Identity-Gate.
|
||||
Scope: NUR Detect Regions / Determine Order / Emit Linear. KEINE Tabellen/Bilder/Sidebars/Fussnoten/Callouts."""
|
||||
import statistics
|
||||
|
||||
def _lines(words, ytol=3.0):
|
||||
ws=sorted(words,key=lambda w:(round(w["top"],1),w["x0"])); lines=[]; cur=[]; cy=None
|
||||
for w in ws:
|
||||
if cy is None or abs(w["top"]-cy)<=ytol: cur.append(w); cy=w["top"] if cy is None else cy
|
||||
else: lines.append(cur); cur=[w]; cy=w["top"]
|
||||
if cur: lines.append(cur)
|
||||
return lines
|
||||
|
||||
def _gutters(words, W):
|
||||
G=160; xs=[W*i/G for i in range(G+1)]
|
||||
cov=[sum(1 for w in words if w["x0"]<=x<=w["x1"]) for x in xs]
|
||||
pos=[c for c in cov if c>0]
|
||||
if not pos: return []
|
||||
body=statistics.median(pos)
|
||||
if body<6: return []
|
||||
thr=max(1,0.15*body); gut=[]; i=0
|
||||
while i<=G:
|
||||
if cov[i]<thr and 0.15*W<=xs[i]<=0.85*W:
|
||||
j=i
|
||||
while j<=G and cov[j]<thr: j+=1
|
||||
if xs[min(j,G)]-xs[i]>=0.02*W: gut.append((xs[i]+xs[min(j-1,G)])/2)
|
||||
i=j
|
||||
else: i+=1
|
||||
return gut
|
||||
|
||||
def detect_regions(pg):
    """Classify a page as single- or multi-column and return ``(info, words)``.

    ``info`` is ``{"type": "single", "reason": ...}`` or
    ``{"type": "multi", "cols": [...], "cuts": [...], "ncols": n}``.
    ``words`` is the result of ``pg.extract_words()``.
    """
    ws=pg.extract_words(); W=float(pg.width)
    # Pages with fewer than 60 words are always treated as single-column.
    if len(ws)<60: return {"type":"single","reason":"sparse"}, ws
    cuts=_gutters(ws,W)
    if not cuts: return {"type":"single","reason":"no-gutter"}, ws
    # rc(a, b): number of words whose horizontal centre lies in [a, b).
    def rc(a,b): return sum(1 for w in ws if a<=(w["x0"]+w["x1"])/2<b)
    # A column needs at least 25 words or 12% of the page's words, whichever is larger.
    minw=max(25,0.12*len(ws))
    keep=list(cuts); changed=True
    # Repeatedly dissolve the thinnest column until every column is big enough
    # or no cuts are left.
    while keep and changed:
        changed=False; bnds=[0]+keep+[W]
        cnt=[rc(bnds[i],bnds[i+1]) for i in range(len(bnds)-1)]
        mn=min(range(len(cnt)),key=lambda i:cnt[i])
        if cnt[mn]<minw:
            # Edge columns have only one bounding cut to drop.
            if mn==0: del keep[0]
            elif mn==len(cnt)-1: del keep[-1]
            # Inner column: drop the cut shared with the smaller neighbour.
            elif cnt[mn-1]<=cnt[mn+1]: del keep[mn-1]
            else: del keep[mn]
            changed=True
    if not keep: return {"type":"single","reason":"thin-merged"}, ws
    bounds=[0]+keep+[W]; cols=[(bounds[k],bounds[k+1]) for k in range(len(bounds)-1)]
    return {"type":"multi","cols":cols,"cuts":keep,"ncols":len(cols)}, ws
|
||||
|
||||
def emit_linear(pg):
    """Return the page text in reading order, one line fragment per output line.

    Single-column pages fall back to ``pg.extract_text()``. On multi-column
    pages, fragments that straddle a cut are emitted as full-width lines in
    place. The column fragments between them are emitted column by column,
    top to bottom.
    """
    info,ws=detect_regions(pg)
    if info["type"]=="single": return pg.extract_text() or ""
    cuts=info["cuts"]; cols=info["cols"]; W=float(pg.width)
    # colidx(x): index of the column containing x; falls back to the last column.
    def colidx(x):
        for k,c in enumerate(cols):
            if c[0]<=x<c[1]: return k
        return len(cols)-1
    # seq holds (kind, column index or None, top, text) tuples in line order.
    seq=[]
    for ln in _lines(ws):
        sw=sorted(ln,key=lambda w:w["x0"]); frags=[[sw[0]]]
        # Split the line wherever a cut lies in the gap between two neighbouring words.
        for i in range(1,len(sw)):
            if any(sw[i-1]["x1"]<=c<=sw[i]["x0"] for c in cuts): frags.append([sw[i]])
            else: frags[-1].append(sw[i])
        for fr in frags:
            x0=min(w["x0"] for w in fr); x1=max(w["x1"] for w in fr); top=min(w["top"] for w in fr)
            # spans counts the cuts strictly inside the fragment's horizontal extent.
            text=" ".join(w["text"] for w in fr); spans=sum(1 for c in cuts if x0<c<x1)
            seq.append(("full",None,top,text) if spans>=1 else ("col",colidx((x0+x1)/2),top,text))
    out=[]; buf=[]
    # flush(b): texts of the buffered column fragments, ordered by column then by top.
    def flush(b):
        res=[]
        for k in sorted(set(x[1] for x in b)):
            for x in sorted([x for x in b if x[1]==k], key=lambda x:x[2]): res.append(x[3])
        return res
    for it in seq:
        if it[0]=="full":
            # A full-width line ends the current column block.
            if buf: out+=flush(buf); buf=[]
            out.append(it[3])
        else: buf.append(it)
    if buf: out+=flush(buf)
    return "\n".join(out)
|
||||
|
||||
|
||||
def emit_words(ws, W):
    """Linearise an already-extracted word list for a page of width ``W``.

    With fewer than 60 words, or when no gutter is found, the words are joined
    with spaces in ``(round(top, 1), x0)`` order. Otherwise each line is split
    at the gutters and the fragments are emitted column by column, top to
    bottom, separated by newlines.
    """
    def flat(items):
        ranked = sorted(items, key=lambda w: (round(w["top"], 1), w["x0"]))
        return " ".join(w["text"] for w in ranked)

    if len(ws) < 60:
        return flat(ws)
    cuts = _gutters(ws, W)
    if not cuts:
        return flat(ws)

    bounds = [0] + cuts + [W]
    cols = list(zip(bounds[:-1], bounds[1:]))

    def colidx(x):
        # Index of the column containing x; falls back to the last column.
        for k, (left, right) in enumerate(cols):
            if left <= x < right:
                return k
        return len(cols) - 1

    per_column = {}
    for line in _lines(ws):
        row = sorted(line, key=lambda w: w["x0"])
        frags = [[row[0]]]
        # Split the line wherever a gutter lies in the gap between two words.
        for prev, cur in zip(row, row[1:]):
            if any(prev["x1"] <= c <= cur["x0"] for c in cuts):
                frags.append([cur])
            else:
                frags[-1].append(cur)
        for frag in frags:
            left = min(w["x0"] for w in frag)
            right = max(w["x1"] for w in frag)
            top = min(w["top"] for w in frag)
            text = " ".join(w["text"] for w in frag)
            per_column.setdefault(colidx((left + right) / 2), []).append((top, text))

    out = []
    for k in sorted(per_column):
        out.extend(text for _, text in sorted(per_column[k]))
    return chr(10).join(out)
|
||||
@@ -0,0 +1,755 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Structured Guidance Extractor — GENERIC builder (Wave 1b compiler).
|
||||
|
||||
One deterministic parser, parametrized per document + language.
|
||||
python3 sge_build.py --doc EDPB_WP248_DPIA --lang de [--dry-run]
|
||||
|
||||
Source per (doc,lang) = {"pdf_url"} OR {"zip_url","inner"[, "local"]}.
|
||||
Layout parser identical to the WP243 pilot (pdfplumber MIT, NO LLM). Mehrsprachigkeit
|
||||
= representation property: SAME document_id, language=<lang>, own document_version
|
||||
namespace (no point-id collision). Emits chunks/page + refs/chunk metrics.
|
||||
"""
|
||||
import argparse, datetime, io, logging, os, re, subprocess, sys, time, zipfile
|
||||
from collections import Counter
|
||||
|
||||
CP = os.getenv("BUILD_CP") or os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
sys.path.insert(0, CP)
|
||||
import httpx # noqa: E402
|
||||
import pdfplumber # noqa: E402
|
||||
from services.legal_act_ingester import UploadUnit, upload_unit # noqa: E402
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
||||
log = logging.getLogger("sge")
|
||||
|
||||
RAG_URL = os.getenv("RAG_URL", "https://localhost:8097")
|
||||
BUILD_COLLECTION = "bp_compliance_kb_2026_1_build"
|
||||
MANIFEST_VERSION = "regulatory_build_manifest_v3/2026.1"
|
||||
|
||||
# --- Layout-Familien (Phase 2): detected family-String -> F-Nummer + Familienkarte (erwartete Familie je Quelle) ---
|
||||
FAMILY_F = {"arabic-hierarchical": "F1", "roman-hierarchical": "F2",
|
||||
"arabic-caps+paragraph": "F3", "unnumbered-toc": "F4"}
|
||||
EXPECTED_FAMILY = { # Familienkarte: VOR-Build-Erwartung, beim Build gegen detected gegated (FAMILY-GATE)
|
||||
"EDPB_WP243_DPO": "F1", "EDPB_WP248_DPIA": "F2", "EDPB_GL_05_2020_CONSENT": "F3",
|
||||
"EDPB_GL_09_2022_BREACH_NOTIFICATION": "F2", "EDPB_GL_07_2020_CONTROLLER_PROCESSOR": "F3",
|
||||
"EDPB_WP260_TRANSPARENCY": "F4",
|
||||
"DSK_SDM": "F1", "DSK_OH_KI_2024": "F1", "DSK_OH_CLOUD_2014": "F1",
|
||||
"DSK_KP_05_DSFA": "F1", # FAMILY-GATE-Korrektur: parst 13 Sekt. via Heuristik, kein edge-short
|
||||
"DSK_KP_12_DSB": "edge-short", "DSK_KP_13_AV": "edge-short", # echte Kurz-Flyer (0 Sekt., gate=FAIL)
|
||||
# verbleibende EDPB (vorab klassifiziert = reasoned, beim Build verifiziert):
|
||||
"EDPB_GL_01_2021_BREACH_EXAMPLES": "F3", "EDPB_GL_02_2019_ART6_1B_ONLINE": "F3",
|
||||
"EDPB_REC_01_2020_SUPPL_MEASURES": "F3", "EDPB_GL_05_2021_ART3_CHAPTERV": "F3",
|
||||
# ENISA-Charge (3. Herausgeber): erwartete HEADING-Familie (F1-F4); Capabilities (Tabellen/Mehrspaltig) orthogonal
|
||||
"ENISA_HANDBOOK_PDP": "F1", "ENISA_GL_SME_PDP": "F1", "ENISA_GL_EECC_SEC": "F3",
|
||||
"ENISA_ISPS_SME": "F1", "ENISA_TL_2023": "?", # TL = Capability-Boundary-Kandidat (mehrspaltig/visuell)
|
||||
# NIST (US, Public Domain) — 4. Herausgeber, doc_type=Technical Standard (refs=section/control), tabellenlastig
|
||||
"NIST_SP_800_53B": "F1", "NIST_SP_800_171": "F1",
|
||||
}
|
||||
|
||||
# --- Manifest-Vertrag: document_type -> expected_reference_types (User 2026-06-28: DEKLARATIV, kein Parser-Bug).
|
||||
# Der Validator verlangt Artikel-Referenzen NUR, wenn der doc_type sie erwartet -> ENISA/NIST (Standards) scheitern nicht mehr.
|
||||
ISSUER_DOCTYPE = {"Article 29 WP / EDPB": "Guidance", "EDPB": "Guidance", "DSK": "Guidance",
|
||||
"ENISA": "Technical Standard", "NIST": "Technical Standard"}
|
||||
DOC_TYPE_REFS = {"EU Regulation": ["article", "recital", "annex"], "German Law": ["paragraph"],
|
||||
"Guidance": ["article", "guidance"], "Technical Standard": ["section", "control"],
|
||||
"Threat Report": ["cve", "cwe", "attack"], "Whitepaper": ["bibliography"]}
|
||||
|
||||
# --- per-document registry (resolved sources pinned) ---
|
||||
NS = "https://ec.europa.eu/newsroom/just/document.cfm?doc_id="
|
||||
DOCS = {
|
||||
"EDPB_WP243_DPO": { # Pilot (regression reference) — arabisch-Schema
|
||||
"reg": "EDPB WP243", "issuer": "Article 29 WP / EDPB", "expected_sections": 9,
|
||||
"name": "EDPB/WP29 Guidelines on Data Protection Officers (DPOs), WP 243 rev.01",
|
||||
"sources": {
|
||||
"en": {"pdf_url": NS + "44100"},
|
||||
"de": {"zip_url": NS + "48137", "inner": "wp243rev01_de.pdf", "local": "/tmp/doc_48137.bin"},
|
||||
},
|
||||
},
|
||||
"EDPB_WP248_DPIA": {
|
||||
"reg": "EDPB WP248", "issuer": "Article 29 WP / EDPB", "expected_sections": 8,
|
||||
"name": "EDPB/WP29 Guidelines on Data Protection Impact Assessment (DPIA), WP 248 rev.01",
|
||||
"sources": {
|
||||
"en": {"pdf_url": NS + "47711"},
|
||||
"de": {"zip_url": NS + "48464", "inner": "wp248 rev.01_de.pdf", "local": "/tmp/wp248_48464.bin"},
|
||||
},
|
||||
},
|
||||
"EDPB_GL_05_2020_CONSENT": {
|
||||
"reg": "EDPB GL 05/2020", "issuer": "EDPB", "expected_sections": 7,
|
||||
"name": "EDPB Guidelines 05/2020 on consent under Regulation 2016/679 (v1.1)",
|
||||
"sources": {
|
||||
"en": {"pdf_url": "https://www.edpb.europa.eu/system/files/documents/files/file1/edpb_guidelines_202005_consent_en.pdf"},
|
||||
"de": {"pdf_url": "https://www.edpb.europa.eu/system/files/documents/files/file1/edpb_guidelines_202005_consent_de.pdf"},
|
||||
},
|
||||
},
|
||||
"EDPB_GL_09_2022_BREACH_NOTIFICATION": {
|
||||
"reg": "EDPB GL 09/2022", "issuer": "EDPB", "expected_sections": 6,
|
||||
"name": "EDPB Guidelines 9/2022 on personal data breach notification under GDPR (v2.0)",
|
||||
"sources": {
|
||||
"en": {"pdf_url": "https://www.edpb.europa.eu/system/files/2023-04/edpb_guidelines_202209_personal_data_breach_notification_v2.0_en.pdf"},
|
||||
"de": {"pdf_url": "https://www.edpb.europa.eu/system/files/2024-10/edpb_guidelines_202209_personal_data_breach_notification_v2.0_de_0.pdf"},
|
||||
},
|
||||
},
|
||||
"EDPB_GL_07_2020_CONTROLLER_PROCESSOR": {
|
||||
"reg": "EDPB GL 07/2020", "issuer": "EDPB", "expected_sections": 5,
|
||||
"name": "EDPB Guidelines 07/2020 on the concepts of controller and processor in the GDPR (v2.0)",
|
||||
"sources": {
|
||||
"en": {"pdf_url": "https://www.edpb.europa.eu/system/files/2023-10/EDPB_guidelines_202007_controllerprocessor_final_en.pdf"},
|
||||
"de": {"pdf_url": "https://www.edpb.europa.eu/system/files/2023-10/EDPB_guidelines_202007_controllerprocessor_final_de.pdf"},
|
||||
},
|
||||
},
|
||||
"EDPB_WP260_TRANSPARENCY": {
|
||||
"reg": "EDPB WP260", "issuer": "Article 29 WP / EDPB", "expected_sections": 5,
|
||||
"name": "EDPB/WP29 Guidelines on transparency under Regulation 2016/679, WP 260 rev.01",
|
||||
"sources": {
|
||||
"en": {"pdf_url": "https://www.edpb.europa.eu/system/files/documents/2023-09/wp260rev01_en.pdf"},
|
||||
"de": {"pdf_url": "https://www.edpb.europa.eu/system/files/documents/2023-09/wp260rev01_de.pdf"},
|
||||
},
|
||||
},
|
||||
"EDPB_GL_01_2021_BREACH_EXAMPLES": {
|
||||
"reg": "EDPB GL 01/2021", "issuer": "EDPB", "expected_sections": 4,
|
||||
"name": "EDPB Guidelines 01/2021 on Examples regarding Personal Data Breach Notification",
|
||||
"sources": {
|
||||
"en": {"pdf_url": "https://www.edpb.europa.eu/system/files/documents/2022-01/edpb_guidelines_012021_pdbnotification_adopted_en.pdf"},
|
||||
"de": {"pdf_url": "https://www.edpb.europa.eu/system/files/2022-04/edpb_guidelines_012021_pdbnotification_adopted_de.pdf"},
|
||||
},
|
||||
},
|
||||
"EDPB_GL_02_2019_ART6_1B_ONLINE": {
|
||||
"reg": "EDPB GL 02/2019", "issuer": "EDPB", "expected_sections": 4,
|
||||
"name": "EDPB Guidelines 2/2019 on processing under Article 6(1)(b) GDPR (online services)",
|
||||
"sources": {
|
||||
"en": {"pdf_url": "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines-art_6-1-b-adopted_after_public_consultation_en.pdf"},
|
||||
"de": {"pdf_url": "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines-art_6-1-b-adopted_after_public_consultation_de_0.pdf"},
|
||||
},
|
||||
},
|
||||
"EDPB_REC_01_2020_SUPPL_MEASURES": {
|
||||
"reg": "EDPB REC 01/2020", "issuer": "EDPB", "expected_sections": 4,
|
||||
"name": "EDPB Recommendations 01/2020 on supplementary measures for transfer tools",
|
||||
"sources": {
|
||||
"en": {"pdf_url": "https://www.edpb.europa.eu/system/files/2021-06/edpb_recommendations_202001vo.2.0_supplementarymeasurestransferstools_en.pdf"},
|
||||
"de": {"pdf_url": "https://www.edpb.europa.eu/system/files/2022-04/edpb_recommendations_202001vo.2.0_supplementarymeasurestransferstools_de.pdf"},
|
||||
},
|
||||
},
|
||||
"EDPB_GL_05_2021_ART3_CHAPTERV": {
|
||||
"reg": "EDPB GL 05/2021", "issuer": "EDPB", "expected_sections": 4,
|
||||
"name": "EDPB Guidelines 05/2021 on the Interplay of Article 3 and Chapter V GDPR",
|
||||
"sources": {
|
||||
"en": {"pdf_url": "https://www.edpb.europa.eu/system/files/2023-02/edpb_guidelines_05-2021_interplay_between_the_application_of_art3-chapter_v_of_the_gdpr_v2_en_0.pdf"},
|
||||
"de": {"pdf_url": "https://www.edpb.europa.eu/system/files/2023-09/edpb_guidelines_05-2021_interplay_between_the_application_de.pdf"},
|
||||
},
|
||||
},
|
||||
"DSK_SDM": { # F4-Cross-Issuer-Kandidat (DSK, dt.)
|
||||
"reg": "DSK SDM", "issuer": "DSK", "expected_sections": 3,
|
||||
"name": "DSK Standard-Datenschutzmodell V3.1",
|
||||
"sources": {"de": {"pdf_url": "https://www.datenschutzkonferenz-online.de/media/ah/SDM-Methode-V31.pdf"}},
|
||||
},
|
||||
"DSK_OH_KI_2024": {
|
||||
"reg": "DSK OH KI", "issuer": "DSK", "expected_sections": 3,
|
||||
"name": "DSK Orientierungshilfe KI und Datenschutz (2024)",
|
||||
"sources": {"de": {"pdf_url": "https://www.datenschutzkonferenz-online.de/media/oh/20240506_DSK_Orientierungshilfe_KI_und_Datenschutz.pdf"}},
|
||||
},
|
||||
"DSK_KP_12_DSB": {
|
||||
"reg": "DSK KP12", "issuer": "DSK", "expected_sections": 2,
|
||||
"name": "DSK Kurzpapier Nr. 12 - Datenschutzbeauftragte",
|
||||
"sources": {"de": {"pdf_url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_12.pdf"}},
|
||||
},
|
||||
"DSK_OH_CLOUD_2014": {
|
||||
"reg": "DSK OH Cloud", "issuer": "DSK", "expected_sections": 3,
|
||||
"name": "DSK Orientierungshilfe - Cloud Computing (2014)",
|
||||
"sources": {"de": {"pdf_url": "https://www.datenschutzkonferenz-online.de/media/oh/20141009_oh_cloud_computing.pdf"}},
|
||||
},
|
||||
"DSK_KP_05_DSFA": {
|
||||
"reg": "DSK KP05", "issuer": "DSK", "expected_sections": 2,
|
||||
"name": "DSK Kurzpapier Nr. 5 - Datenschutz-Folgenabschätzung",
|
||||
"sources": {"de": {"pdf_url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_5.pdf"}},
|
||||
},
|
||||
"DSK_KP_13_AV": {
|
||||
"reg": "DSK KP13", "issuer": "DSK", "expected_sections": 2,
|
||||
"name": "DSK Kurzpapier Nr. 13 - Auftragsverarbeitung",
|
||||
"sources": {"de": {"pdf_url": "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_13.pdf"}},
|
||||
},
|
||||
"ENISA_HANDBOOK_PDP": { # Stufe 1 — bekannte Welt (Kalibrierung)
|
||||
"reg": "ENISA Handbook PDP", "issuer": "ENISA", "expected_sections": 4,
|
||||
"name": "ENISA Handbook on Security of Personal Data Processing",
|
||||
"sources": {"en": {"pdf_url": "https://www.enisa.europa.eu/sites/default/files/publications/WP2017%20O-2-2-5%20GDPR%20Measures%20Handbook.pdf"}},
|
||||
},
|
||||
"ENISA_GL_SME_PDP": { # Stufe 1
|
||||
"reg": "ENISA GL SME PDP", "issuer": "ENISA", "expected_sections": 4,
|
||||
"name": "ENISA Guidelines for SMEs on the security of personal data processing",
|
||||
"sources": {"en": {"pdf_url": "https://www.enisa.europa.eu/sites/default/files/publications/WP2016%203-2%206%20Data%20Controllers%20Risk.pdf"}},
|
||||
},
|
||||
"ENISA_GL_EECC_SEC": { # Stufe 2 — bekannte Welt + Annex/Tabellen
|
||||
"reg": "ENISA GL EECC", "issuer": "ENISA", "expected_sections": 4,
|
||||
"name": "ENISA Guideline on Security Measures under the EECC (4th edition)",
|
||||
"sources": {"en": {"pdf_url": "https://www.enisa.europa.eu/sites/default/files/publications/ENISA%20-%20Guideline%20on%20Security%20Measures%20under%20the%20EECC-%204th%20edition.pdf"}},
|
||||
},
|
||||
"ENISA_ISPS_SME": { # Stufe 3 — Grenztest (tabellenzentriert)
|
||||
"reg": "ENISA ISPS SME", "issuer": "ENISA", "expected_sections": 3,
|
||||
"name": "ENISA Information security and privacy standards for SMEs",
|
||||
"sources": {"en": {"pdf_url": "https://www.enisa.europa.eu/sites/default/files/publications/Information%20security%20and%20privacy%20standards%20for%20SMEs.pdf"}},
|
||||
},
|
||||
"ENISA_TL_2023": { # Stufe 3 — Grenztest (mehrspaltig/visuell)
|
||||
"reg": "ENISA TL 2023", "issuer": "ENISA", "expected_sections": 3,
|
||||
"name": "ENISA Threat Landscape 2023",
|
||||
"sources": {"en": {"pdf_url": "https://www.enisa.europa.eu/sites/default/files/publications/ENISA%20Threat%20Landscape%202023.pdf"}},
|
||||
},
|
||||
"NIST_SP_800_53B": { # NIST (US Public Domain) — tabellenzentriert (Control Baselines)
|
||||
"reg": "NIST SP 800-53B", "issuer": "NIST", "expected_sections": 3,
|
||||
"name": "NIST SP 800-53B Control Baselines for Information Systems and Organizations",
|
||||
"sources": {"en": {"pdf_url": "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-53B.pdf"}},
|
||||
},
|
||||
"NIST_SP_800_171": { # NIST — strukturierte Guidance + Tabellen
|
||||
"reg": "NIST SP 800-171", "issuer": "NIST", "expected_sections": 3,
|
||||
"name": "NIST SP 800-171 Rev 2 Protecting Controlled Unclassified Information",
|
||||
"sources": {"en": {"pdf_url": "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-171r2.pdf"}},
|
||||
},
|
||||
}
|
||||
|
||||
# --- language-specific lexical config ---
|
||||
LANG = {
|
||||
"en": {"artref": re.compile(r'(?:Article|Art\.)\s*(\d+)'),
|
||||
"noise": re.compile(r'ARTICLE 29 DATA PROTECTION|^\d{1,3}$|Adopted on|Revised and Adopted|Version \d', re.I)},
|
||||
"de": {"artref": re.compile(r'(?:Artikel|Art\.)\s*(\d+)'),
|
||||
"noise": re.compile(r'ARTIKEL.?29.?DATENSCHUTZ|ARTICLE 29 DATA PROTECTION|^\d{1,3}$|Angenommen|berarbeitet und|Adopted on|Fassung \d', re.I)},
|
||||
}
|
||||
# Generic enumerator: arabic (1 / 1.2.3) | roman (I. / III.B.) | Annex N.
|
||||
# \d{1,3} excludes years (4 digits) as section numbers. Roman requires trailing dot.
|
||||
ROMAN = r'(?:XVIII|VIII|XIII|XVII|III|VII|XII|XIV|XVI|XIX|II|IV|VI|IX|XI|XV|XX|I|V|X)'
|
||||
ENUM = re.compile(
|
||||
r'^(?:'
|
||||
r'(?P<ar>\d{1,3}(?:\.\d+){0,3})\.?' # arabic 1 / 1.2.3 (self-pathed)
|
||||
r'|(?P<ro>' + ROMAN + r')\.' # roman top-level I. / III. (require dot)
|
||||
r'|(?P<le>[A-Z])\.' # capital-letter sub A./B. (only under roman scheme)
|
||||
r'|(?P<ax>(?:Annex|Annexe|Anhang|Anlage)\s+\d+)'
|
||||
r')\s+(?P<title>\S.{0,84})$'
|
||||
)
|
||||
ANNEX_KW = ("ANNEX", "ANNEXE", "ANHANG", "ANLAGE")
|
||||
GDPR_HINT = re.compile(r'DSGVO|GDPR|2016/679|Verordnung \(EU\) 2016/679', re.I)
|
||||
|
||||
|
||||
def git_sha():
    """Return the short HEAD commit hash of the checkout at CP, or "unknown" on any failure."""
    cmd = ["git", "-C", CP, "rev-parse", "--short", "HEAD"]
    try:
        sha = subprocess.check_output(cmd).decode().strip()
    except Exception:
        # Best effort: a missing git binary or a non-repository must not stop the build.
        sha = "unknown"
    return sha
|
||||
|
||||
|
||||
CACHE_DIR = "/tmp/sge_cache"
|
||||
|
||||
|
||||
def _http_get(url, timeout=120.0, attempts=4):
    """Download `url` and return the response body as bytes, retrying on failure.

    url:      address to fetch (redirects are followed).
    timeout:  per-request timeout in seconds handed to httpx.
    attempts: maximum number of tries.

    Returns the first non-empty body of a successful response.
    Raises RuntimeError once every attempt has failed; the message carries the
    URL and the last error seen.

    An HTTP error status (4xx/5xx) counts as a failed attempt instead of having
    its error page returned as payload. The linear back-off (3s, 6s, ...) is
    applied only between attempts, not after the final one.
    """
    last = None
    for i in range(attempts):
        try:
            with httpx.Client(timeout=timeout, follow_redirects=True,
                              headers={"User-Agent": "Mozilla/5.0"}) as c:
                resp = c.get(url)
                resp.raise_for_status()
                data = resp.content
            if data:
                return data
            last = RuntimeError("empty body")
        except Exception as e:
            last = e
        log.info("download attempt %d/%d failed: %s", i + 1, attempts, last)
        if i + 1 < attempts:
            time.sleep(3 * (i + 1))
    raise RuntimeError("download failed after %d attempts: %s (%s)" % (attempts, url, last))
|
||||
|
||||
|
||||
def fetch(src):
    """Return the PDF bytes for one source descriptor, using a local file cache.

    src: dict with either "pdf_url", or "zip_url" plus "inner" (the archive
         member to extract); an optional "local" path is read instead of
         downloading when that file exists.

    Lookup order: cached PDF in CACHE_DIR, then the "local" file, then the
    network. The result is verified to start with the %PDF magic bytes.

    Raises RuntimeError when the payload is not a PDF (and whatever _http_get
    or zipfile raise for a failed download or a bad archive).
    """
    import hashlib

    os.makedirs(CACHE_DIR, exist_ok=True)
    key_src = src.get("zip_url") or src.get("pdf_url") or ""
    if "zip_url" in src:
        # One archive can hold several PDFs: the member name must be part of the key,
        # otherwise two members of the same zip would share one cache file.
        key_src += "#" + str(src.get("inner", ""))
    cache_key = hashlib.md5(key_src.encode()).hexdigest()  # cache file name only, not security relevant
    cache_pdf = os.path.join(CACHE_DIR, cache_key + ".pdf")
    if os.path.exists(cache_pdf):
        with open(cache_pdf, "rb") as fh:
            pdf = fh.read()
        if pdf[:4] == b"%PDF":
            log.info("PDF cache hit %s (%d B)", cache_pdf, len(pdf))
            return pdf
    local = src.get("local", "")
    have_local = bool(local) and os.path.exists(local)
    if "zip_url" in src:
        if have_local:
            with open(local, "rb") as fh:
                data = fh.read()
            log.info("ZIP local %s (%d B)", local, len(data))
        else:
            data = _http_get(src["zip_url"], timeout=180.0)
            log.info("ZIP downloaded (%d B)", len(data))
        with zipfile.ZipFile(io.BytesIO(data)) as zf:
            pdf = zf.read(src["inner"])
    elif have_local:
        with open(local, "rb") as fh:
            pdf = fh.read()
    else:
        pdf = _http_get(src["pdf_url"], timeout=120.0)
    if pdf[:4] != b"%PDF":
        raise RuntimeError("not a PDF: %r" % pdf[:16])
    # Caching is best effort, but must never leave a truncated file behind that a later
    # run would accept as a cache hit: write to a temp file, then rename atomically.
    tmp_path = "%s.%d.tmp" % (cache_pdf, os.getpid())
    try:
        with open(tmp_path, "wb") as fh:
            fh.write(pdf)
        os.replace(tmp_path, cache_pdf)
    except OSError as exc:
        log.warning("PDF cache write failed (%s): %s", cache_pdf, exc)
    return pdf
|
||||
|
||||
|
||||
def src_url(src):
    """Return the source's archive URL if it has one, else its PDF URL (None when neither is set)."""
    archive = src.get("zip_url")
    return archive if archive else src.get("pdf_url")
|
||||
|
||||
|
||||
def line_font(pg, ln):
    """Return (font size, font name) most common among the characters of one text line.

    pg: page object exposing `chars`, a list of character dicts.
    ln: line dict with "top" and "bottom" coordinates.

    A character belongs to the line when its "top" lies within the line's
    vertical extent widened by one unit on each side. Sizes are rounded to one
    decimal. Returns (0.0, "") when no character falls into that band.
    """
    band_lo = ln["top"] - 1
    band_hi = ln["bottom"] + 1
    sizes, fonts = Counter(), Counter()
    for ch in pg.chars:
        if band_lo <= ch["top"] <= band_hi:
            sizes[round(ch.get("size", 0), 1)] += 1
            fonts[ch.get("fontname", "")] += 1
    if not sizes:
        return 0.0, ""
    return sizes.most_common(1)[0][0], fonts.most_common(1)[0][0]
|
||||
|
||||
|
||||
def _upper_ratio(title):
|
||||
letters = [c for c in title if c.isalpha()]
|
||||
return (sum(c.isupper() for c in letters) / len(letters)) if letters else 0.0
|
||||
|
||||
|
||||
_TTL_STOP = {"der", "die", "das", "den", "des", "dem", "und", "von", "zur", "zum", "für", "auf", "the",
|
||||
"of", "and", "to", "for", "in", "on", "an", "under", "with", "sur", "aux"}
|
||||
|
||||
|
||||
def _ttoks(s):
|
||||
out = set()
|
||||
for t in re.findall(r'[a-zà-ÿ0-9]+', s.lower()):
|
||||
t = re.sub(r'\d+$', '', t) # Footnote-Ziffern entkleben ("freiwillig12" -> "freiwillig")
|
||||
if len(t) > 2 and t not in _TTL_STOP:
|
||||
out.add(t)
|
||||
return out
|
||||
|
||||
|
||||
def _is_caption(title):
    """Return True for a short, mostly upper-case title (at most 34 characters, upper-case ratio >= 0.8).

    Such structural captions (PREFACE / VORWORT / ANNEX) are accepted as
    headings even without a matching table-of-contents entry.
    """
    if len(title) > 34:
        return False
    return _upper_ratio(title) >= 0.8
|
||||
|
||||
|
||||
# TOC-Zeile: <enum> <Titel> <lange Punktführung> <Seitenzahl>. EIGENE Regex (NICHT ENUM —
|
||||
# dessen Titel-Cap .{0,84} scheitert an 130+-Punkt-Führungen). Titel = non-greedy bis zur Führung.
|
||||
TOC_LINE = re.compile(r'^[A-Z0-9][\w./()\-]*\s+(?P<title>.+?)\s*\.{3,}\s*\d{1,3}\s*$')
|
||||
|
||||
|
||||
def extract_toc(all_lines, body_size):
    """Collect the title token sets of table-of-contents entries from the front matter.

    all_lines: (page_number, text, size, fontname) tuples; scanning stops at
               the first line whose page number exceeds 8.
    body_size: accepted for signature compatibility; not used here.

    Returns (toc_present, titlesets), where titlesets holds one token set per
    line matching TOC_LINE (entries whose title yields no tokens are skipped)
    and toc_present is True once at least five entries were found.
    """
    collected = []
    for page_number, txt, _size, _font in all_lines:
        if page_number > 8:
            break
        hit = TOC_LINE.match(txt)
        tokens = _ttoks(hit.group("title")) if hit else None
        if tokens:
            collected.append(tokens)
    return (len(collected) >= 5, collected)
|
||||
|
||||
|
||||
def _title_in_toc(title, titlesets):
    """Return True when at least 75% of the title's tokens occur in one TOC entry.

    The ratio is taken relative to the body title (overlap / number of body
    tokens): a real heading is essentially a TOC title, whereas a paragraph
    that merely contains section words carries extra tokens and falls below
    the threshold. A body title that is a truncated prefix of the full TOC
    title still matches. A title without tokens never matches.
    """
    body_tokens = _ttoks(title)
    if not body_tokens:
        return False
    wanted = len(body_tokens)
    return any(len(body_tokens & entry) / wanted >= 0.75 for entry in titlesets)
|
||||
|
||||
|
||||
def _looks_like_heading(txt, s, fn, body_size):
|
||||
# F4-Plausibilitätsfilter — NICHT die Entscheidung (die trifft das TOC). Trimmt nur Fließtext weg,
|
||||
# BEVOR das TOC befragt wird: kurz + nicht satzschließend. Bewusst KEIN Bold/Size-Zwang — das TOC
|
||||
# entscheidet, Format ist nur Filter (greift eh nur in als unnummeriert erkannten Docs).
|
||||
t = txt.rstrip()
|
||||
return 3 <= len(t) <= 90 and not t.endswith((".", ";", ":", ","))
|
||||
|
||||
|
||||
def parse_guidance(pdf_bytes, noise):
    """Parse a guidance PDF into heading-delimited sections plus simple tables.

    pdf_bytes: raw PDF content.
    noise:     compiled regex; body lines it matches at their start are dropped.

    Returns a dict with the keys "pages", "total_chars", "body_size",
    "sections", "caps_scheme", "schemes", "toc", "toc_entries", "tables",
    "annex" and "raw_tables". Each section is a dict with "num", "title",
    "level", "in_annex", "bold", "page" and "body" (list of text lines).

    A PDF without any embedded characters (e.g. a scan) yields body_size 0.0
    and no sections instead of raising, so the caller's self-test can report
    the missing text.
    """
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        n_pages = len(pdf.pages)
        szc = Counter()  # rounded font size -> number of characters
        all_lines = []  # (page_number, txt, size, fontname): extracted once, scanned several times
        n_tables = 0  # every table detected
        raw_tables = []  # (page_number, rows) of the tables that pass the simple-table filter
        for pg in pdf.pages:
            try:
                pts = pg.extract_tables()
            except Exception:
                pts = []
            n_tables += len(pts)
            for tbl in pts:
                # First scope: at least 2 rows and at least 3 non-empty cells.
                if tbl and len(tbl) >= 2 and sum(1 for row in tbl for c in row if c and c.strip()) >= 3:
                    raw_tables.append((pg.page_number, [[(c or "").strip() for c in row] for row in tbl]))
            for ch in pg.chars:
                szc[round(ch.get("size", 0), 1)] += 1
            for ln in pg.extract_text_lines(layout=False):
                txt = " ".join(ln["text"].split())
                if not txt:
                    continue
                s, fn = line_font(pg, ln)
                all_lines.append((pg.page_number, txt, s, fn))
    total_chars = sum(szc.values())
    # Most frequent font size = body text size; guard against PDFs without embedded text.
    body_size = szc.most_common(1)[0][0] if szc else 0.0

    # Scheme detection (self-calibrating): does the document use ALL-CAPS headings with a bare
    # integer enumerator? If so it also numbers its paragraphs, and a bare-integer heading has to
    # be ALL-CAPS. Threshold 2 guards against a single stray caps line.
    caps_secs = 0
    for _, txt, s, _ in all_lines:
        if s < body_size - 0.5:
            continue
        mm = ENUM.match(txt)
        if (mm and mm.group("ar") and "." not in mm.group("ar")
                and not re.search(r'\s\d{1,3}$', mm.group("title"))
                and _upper_ratio(mm.group("title")) >= 0.6):
            caps_secs += 1
    caps_scheme = caps_secs >= 2
    toc_present, toc = extract_toc(all_lines, body_size)

    # Unnumbered-document detection: do NUMBERED headings cover the TOC? If hardly any do, the
    # headings are unnumbered and the unnumbered path below is enabled; otherwise it stays off.
    numbered_toc_hits = 0
    if toc_present:
        for _, txt, s, _ in all_lines:
            if s < body_size - 0.5:
                continue
            mm = ENUM.match(txt)
            if not mm or re.search(r'\s\d{1,3}$', mm.group("title")):
                continue
            ttl = mm.group("title").strip().rstrip('.').strip()
            if _title_in_toc(ttl, toc) or _is_caption(ttl):
                numbered_toc_hits += 1
    unnumbered_doc = toc_present and numbered_toc_hits < max(3, len(toc) * 0.4)

    sections, cur, started, in_annex, seen_types, max_sec, un_count = [], None, False, False, set(), 0, 0
    top_en, top_type = None, None  # last non-letter enumerator, used to assemble letter sub-paths
    for page_number, txt, s, fn in all_lines:
        m = ENUM.match(txt)
        # A trailing " <1-3 digits>" marks a TOC stub line (title followed by a page number).
        ok_head = bool(m) and s >= body_size - 0.5 and not re.search(r'\s\d{1,3}$', m.group("title"))
        if ok_head:
            if m.group("ar"):
                typ, en = "ar", m.group("ar")
            elif m.group("ro"):
                typ, en = "ro", m.group("ro")
            elif m.group("le"):
                typ, en = "le", m.group("le")
            else:
                typ, en = "ax", m.group("ax")
            title = m.group("title").strip().rstrip('.').strip()
            # Capital-letter sub-headings are only valid directly under a roman scheme.
            if typ == "le" and top_type != "ro":
                ok_head = False
            if ok_head and typ == "ar" and not in_annex:
                if unnumbered_doc:
                    # Unnumbered document: numbered lines are paragraphs or article references
                    # ("13.1", "39.34"), so accept them only when the title is in the TOC.
                    if not _title_in_toc(title, toc):
                        ok_head = False
                elif "." in en:
                    # Dotted enumerators are real sub-headings, except decimal/clock false
                    # positives with a leading zero in a sub-part ("8.00 Uhr", "17.00").
                    if any(c.startswith("0") for c in en.split(".")[1:]):
                        ok_head = False
                elif toc_present:
                    # Bare integer: the TOC is the truth (paragraphs never appear in it);
                    # all-caps captions are accepted without a TOC entry.
                    if not (_title_in_toc(title, toc) or _is_caption(title)):
                        ok_head = False
                else:
                    # Fallback heuristics for bare integers when there is no TOC.
                    if top_type == "ro":
                        ok_head = False
                    elif caps_scheme and _upper_ratio(title) < 0.6:
                        ok_head = False
                    elif int(en) > max_sec + 5:
                        ok_head = False
        elif (m is None and unnumbered_doc and _looks_like_heading(txt, s, fn, body_size)
              and not re.search(r'\.\d+$', txt.strip()) and _title_in_toc(txt, toc)):
            # UNNUMBERED heading: the TOC match is the decision, _looks_like_heading only a filter.
            # Applies only when ENUM does not match and the document was detected as unnumbered.
            # Lines ending in ".<digits>" are footnote fragments ("environment.47") and excluded.
            ok_head, typ, en, title = True, "ut", None, txt.strip().rstrip('.').strip()
        if ok_head:
            seen_types.add(typ)
            if typ == "le":
                num, lvl = ((top_en + "." + en) if top_en else en), 2
            elif typ == "ut":
                un_count += 1
                num, lvl = str(un_count), 1
            else:
                num = en
                lvl = (num.count('.') + 1) if typ == "ar" else 1
                top_en, top_type = en, typ
                if typ == "ar" and "." not in en and not in_annex:
                    max_sec = max(max_sec, int(en))
            cur = {"num": num, "title": title, "level": lvl, "in_annex": in_annex,
                   "bold": "bold" in fn.lower(), "page": page_number, "body": []}
            sections.append(cur)
            started = True
            if typ == "ax" or any(k in (num + " " + title).upper() for k in ANNEX_KW):
                in_annex = True
        elif started and cur is not None:
            if s >= body_size - 2 and not noise.match(txt):
                cur["body"].append(txt)

    # Remove TOC duplicates generically: when the same LABEL key occurs twice (TOC stub plus the
    # real section) keep the one with the longer body. The key follows the label logic ("A" prefix
    # for numeric annex items), not the raw number, so annex items do not collide with chapters.
    def _lk(sc):
        n = sc["num"]
        return ("A" + n) if (sc.get("in_annex") and n.isdigit()) else n

    best = {}
    for sc in sections:
        k = _lk(sc)
        if k not in best or len("\n".join(sc["body"])) > len("\n".join(best[k]["body"])):
            best[k] = sc
    sections = [sc for sc in sections if best.get(_lk(sc)) is sc]
    return {"pages": n_pages, "total_chars": total_chars, "body_size": body_size,
            "sections": sections, "caps_scheme": caps_scheme, "schemes": seen_types,
            "toc": toc_present, "toc_entries": len(toc),
            "tables": n_tables, "annex": any(sc.get("in_annex") for sc in sections),
            "raw_tables": raw_tables}
|
||||
|
||||
|
||||
def self_test(parsed, expected, artref, expected_refs):
    """Sanity-check a parse result and return (ok, problems, sample_articles).

    parsed:        parse result; "sections" and "total_chars" are read.
    expected:      minimum number of sections required.
    artref:        compiled regex whose single group captures an article number.
    expected_refs: reference types the document type expects; article
                   references are mandatory only when it contains "article".

    ok is True when no problem was recorded; sample_articles holds at most 12
    distinct article numbers (as strings) in ascending numeric order.
    """
    secs = parsed["sections"]
    probs = []
    n_chars = parsed["total_chars"]
    if n_chars < 5000:
        probs.append("zu wenig embedded text (%d) -> OCR?" % n_chars)
    n_secs = len(secs)
    if n_secs < expected:
        probs.append("nur %d Sektionen < expected %d" % (n_secs, expected))
    bodies = " ".join(line for sc in secs for line in sc["body"])
    titles = " ".join(sc["title"] for sc in secs)
    arts = set(artref.findall(bodies + " " + titles))
    # Contract depends on the document type: technical standards cite sections/controls,
    # so missing article references are a failure only where articles are expected.
    if not arts and "article" in expected_refs:
        probs.append("references_out: KEIN Artikel erkannt (doc_type erwartet article)")
    return (not probs, probs, sorted(arts, key=int)[:12])
|
||||
|
||||
|
||||
def _build_units_struct(doc_id, doc, lang, parsed, base_version, prov):
    """Turn parsed sections into upload units, without reference extraction.

    doc_id:       registry key of the document.
    doc:          registry entry; "reg", "name" and "issuer" are read.
    lang:         language code of this representation.
    parsed:       parse result; its "sections" are consumed.
    base_version: prefix of every unit's document_version ("<base>-s<n>").
    prov:         provenance fields merged into each unit's metadata last, so
                  they override the defaults built here.

    Sections whose joined body is shorter than 40 characters are skipped and
    do not consume a sequence number. Numeric section numbers inside an annex
    get an "A" prefix in their label.

    Returns (units, sources); sources[i] is the (body, title) pair of units[i].
    """
    reg = doc["reg"]
    units, sources = [], []
    seq = 0
    for sc in parsed["sections"]:
        body = "\n".join(sc["body"]).strip()
        if len(body) < 40:
            continue
        seq += 1
        num, title = sc["num"], sc["title"]
        annex_item = sc.get("in_annex") and num.isdigit()
        lab = "A" + num if annex_item else num
        cu = "%s §%s" % (reg, lab)
        meta = {
            "regulation_code": reg,
            "regulation_short": reg,
            "regulation_name_de": doc["name"],
            "language": lang,
            "citation_style": "guidance_section",
            "document_type": "guidance",
            "source_class": "supervisory_guidance",
            "source_role": "interpretation",
            "use_for_primary": False,
            "bindingness": "non_binding_interpretative",
            "authority_level": 70,
            "authority_weight": 70,
            "source_type": "guidance",
            "issuer": doc["issuer"],
            "jurisdiction": "EU",
            "source": "ec.europa.eu",
            "license": "public_eu",
            "category": "guidance",
            "citation_unit": cu,
            "article_label": cu,
            "parent_citation_unit": reg,
            "is_citable": True,
            "article": "§%s" % lab,
            "article_title": title,
            "article_type": "interpretation",
            "chunk_scope": "guidance_section",
            "context_hierarchy": [reg],
            "display_context": "%s > §%s %s" % (reg, lab, title),
            "norm_id": "EU-%s-%s-%s" % (doc_id, lang.upper(), lab),
            "references_out": [],
            "child_tables": sc.get("child_tables", []),
        }
        meta.update(prov)
        unit = UploadUnit(
            filename="%s_%s_s%d.txt" % (doc_id.lower(), lang, seq),
            text="%s §%s %s\n\n%s" % (reg, lab, title, body),
            meta=meta,
            document_version="%s-s%d" % (base_version, seq),
            collection=BUILD_COLLECTION,
        )
        units.append(unit)
        sources.append((body, title))
    return units, sources
|
||||
|
||||
|
||||
def _attach_refs(units, sources, artref):
|
||||
for u, (body, title) in zip(units, sources):
|
||||
u.meta["references_out"] = sorted({"Art. %s DSGVO" % n for n in artref.findall(body + " " + title)},
|
||||
key=lambda x: int(x.split()[1]))
|
||||
|
||||
|
||||
def build_units(doc_id, doc, lang, parsed, base_version, prov, artref):
    """Build the section upload units and attach their outgoing article references."""
    built = _build_units_struct(doc_id, doc, lang, parsed, base_version, prov)
    section_units, section_sources = built
    _attach_refs(section_units, section_sources, artref)
    return section_units
|
||||
|
||||
|
||||
def _table_md(rows):
|
||||
hdr = rows[0]
|
||||
out = ["| " + " | ".join(c or "" for c in hdr) + " |",
|
||||
"| " + " | ".join("---" for _ in hdr) + " |"]
|
||||
for r in rows[1:]:
|
||||
out.append("| " + " | ".join((c or "").replace("\n", " ") for c in r) + " |")
|
||||
return "\n".join(out)
|
||||
|
||||
|
||||
def build_table_units(doc_id, doc, lang, parsed, base_version, prov):
    """Create one upload unit per extracted table and link it to its parent section.

    Each table in parsed["raw_tables"] (a list of (page, rows)) is attached to
    the last section, in list order, that starts on or before the table's
    page; scanning stops at the first section starting on a later page. The
    parent's "child_tables" list receives the table id, so the section dicts
    in `parsed` are modified in place. Tables whose Markdown rendering is
    shorter than 25 characters are skipped but keep their running number.
    `prov` is merged into the metadata last.

    Returns the list of table upload units.
    """
    reg = doc["reg"]
    sections = parsed["sections"]
    units = []
    for ti, (page, rows) in enumerate(parsed.get("raw_tables", []), 1):
        parent = None
        for sc in sections:
            if sc.get("page", 0) > page:
                break
            parent = sc
        plabel = parent["num"] if parent else "0"
        md = _table_md(rows)
        if len(md) < 25:
            continue
        tid = "%s-t%d" % (doc_id, ti)
        if parent is not None:
            parent.setdefault("child_tables", []).append(tid)
        cu = "%s §%s Tabelle %d" % (reg, plabel, ti)
        meta = {
            "regulation_code": reg,
            "regulation_short": reg,
            "language": lang,
            "source_class": "supervisory_guidance",
            "source_role": "interpretation",
            "use_for_primary": False,
            "jurisdiction": "EU",
            "category": "guidance",
            "is_table": True,
            "table_id": tid,
            "parent_section": plabel,
            "page": page,
            "columns": rows[0],
            "rows": rows,
            "markdown": md,
            "extraction_method": "pdfplumber",
            "confidence": "lined-simple",
            "citation_unit": cu,
            "article_label": cu,
            "chunk_scope": "table",
            "display_context": "%s > §%s > Tabelle %d" % (reg, plabel, ti),
            "references_out": [],
        }
        meta.update(prov)
        unit = UploadUnit(
            filename="%s_%s_tbl%d.txt" % (doc_id.lower(), lang, ti),
            text="%s §%s — Tabelle %d (S.%d)\n\n%s" % (reg, plabel, ti, page, md),
            meta=meta,
            document_version="%s-tbl%d" % (base_version, ti),
            collection=BUILD_COLLECTION,
        )
        units.append(unit)
    return units
|
||||
|
||||
|
||||
import capability_pipeline as _CP
|
||||
|
||||
|
||||
def _region_native_tables(pdf_bytes):
    """Extract simple tables together with their page regions.

    Returns (tables, regions): tables is a list of (page_number, rows) with
    every cell stripped (None becomes ""), regions the matching list of
    (page_number, bbox). A table is kept when it has at least 2 rows and at
    least 3 non-empty cells. Failures while finding or extracting tables are
    treated as "no table" for that page or table.
    """
    kept_tables = []
    kept_regions = []
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as _pdf:
        for pg in _pdf.pages:
            try:
                found = pg.find_tables()
            except Exception:
                found = []
            for t in found:
                try:
                    tbl = t.extract()
                except Exception:
                    tbl = None
                if not tbl or len(tbl) < 2:
                    continue
                filled = sum(1 for row in tbl for c in row if c and c.strip())
                if filled < 3:
                    continue
                cleaned = [[(c or "").strip() for c in row] for row in tbl]
                kept_tables.append((pg.page_number, cleaned))
                # NOTE(review): regions are recorded only for tables that pass the filter - confirm.
                kept_regions.append((pg.page_number, t.bbox))
    return kept_tables, kept_regions
|
||||
|
||||
|
||||
class _C6Tables:
|
||||
name = "C6_Tables"
|
||||
consumes = ["table_region"]
|
||||
produces = ["table_units"]
|
||||
def run(self, ctx):
|
||||
rtables, regions = _region_native_tables(ctx["pdf"])
|
||||
ctx["claimed_table_regions"] = regions
|
||||
p2 = dict(ctx["parsed"])
|
||||
p2["raw_tables"] = rtables
|
||||
ctx["_table_units"] = build_table_units(ctx["doc_id"], ctx["doc"], ctx["lang"], p2, ctx["rt"], ctx["prov"])
|
||||
|
||||
|
||||
class _C7ReadingOrder:
|
||||
name = "C7_ReadingOrder"
|
||||
consumes = ["prose_region", "table_units"]
|
||||
produces = ["ordered_prose"]
|
||||
def run(self, ctx):
|
||||
ctx["_c7_owns_prose"] = True
|
||||
|
||||
|
||||
class _C1C2Sections:
|
||||
name = "C1C2_Sections"
|
||||
consumes = ["prose_region", "table_units"]
|
||||
produces = ["section_struct"]
|
||||
def run(self, ctx):
|
||||
units, sources = _build_units_struct(ctx["doc_id"], ctx["doc"], ctx["lang"], ctx["parsed"], ctx["rt"], ctx["prov"])
|
||||
ctx["_section_units"] = units
|
||||
ctx["_sources"] = sources
|
||||
|
||||
|
||||
class _C4References:
|
||||
name = "C4_References"
|
||||
consumes = ["section_struct"]
|
||||
produces = ["references"]
|
||||
def run(self, ctx):
|
||||
_attach_refs(ctx["_section_units"], ctx["_sources"], ctx["cfg"]["artref"])
|
||||
|
||||
|
||||
def run_engine(doc_id, doc, lang, parsed, run_tag, prov, cfg, pdf_bytes):
    """Run all capabilities over one shared context and return the built units.

    The capabilities are listed in arbitrary order on purpose; the execution
    order comes from ``_CP.resolve_order``, seeded with the initially available
    artifacts ``table_region`` and ``prose_region``.

    Returns:
        ``(section_units, table_units)`` as left in the context by the
        capabilities.
    """
    ctx = dict(
        doc_id=doc_id,
        doc=doc,
        lang=lang,
        parsed=parsed,
        rt=run_tag,
        prov=prov,
        cfg=cfg,
        pdf=pdf_bytes,
    )
    capabilities = [_C4References(), _C7ReadingOrder(), _C1C2Sections(), _C6Tables()]
    execution_order = _CP.resolve_order(capabilities, {"table_region", "prose_region"})
    for capability in execution_order:
        capability.run(ctx)
    return ctx["_section_units"], ctx["_table_units"]
|
||||
|
||||
|
||||
def main():
    """Run the structured-guidance build for one document/language pair.

    Parses ``--doc``, ``--lang`` and ``--dry-run``, fetches and parses the
    source PDF, runs the capability engine to build section and table units and
    logs the vitals. With ``--dry-run`` a JSON baseline is written and nothing
    is uploaded; otherwise every unit is uploaded to the RAG service.

    Exits with status 1 if the self-test gate failed and this is not a dry run.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--doc", required=True)
    ap.add_argument("--lang", required=True, choices=["en", "de"])
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()
    doc = DOCS[args.doc]
    lang, cfg = args.lang, LANG[args.lang]
    sha = git_sha()
    run_tag = "2026.1-%s-%s" % (args.doc.lower(), lang)
    run_id = "%s-%s" % (run_tag, int(time.time()))
    date = datetime.date.today().isoformat()
    src = doc["sources"][lang]
    log.info("SGE %s [%s] | exp_sec=%d sha=%s dry=%s", args.doc, lang, doc["expected_sections"], sha, args.dry_run)

    doc_type = doc.get("doc_type") or ISSUER_DOCTYPE.get(doc["issuer"], "Guidance")
    expected_refs = DOC_TYPE_REFS.get(doc_type, ["article"])
    pdf = fetch(src)
    _t0 = time.time()
    parsed = parse_guidance(pdf, cfg["noise"])
    parse_ms = int((time.time() - _t0) * 1000)
    ok, probs, arts = self_test(parsed, doc["expected_sections"], cfg["artref"], expected_refs)
    pages = parsed["pages"]
    log.info("PARSED pages=%d chars=%d body=%.1f sections=%d caps_scheme=%s gate=%s %s",
             pages, parsed["total_chars"], parsed["body_size"], len(parsed["sections"]), parsed.get("caps_scheme"), ok, probs)
    log.info("Art-refs: %s", arts)
    for sc in parsed["sections"]:
        log.info(" [%s]%s %s (p%d)", sc["num"], "*" if sc.get("in_annex") else " ", sc["title"][:58], sc["page"])

    # Classify the layout family from the detected numbering schemes.
    sch = parsed.get("schemes", set())
    family = ("unnumbered-toc" if "ut" in sch and not ({"ar", "ro"} & sch)  # F4
              else "roman-hierarchical" if "ro" in sch  # F2
              else "arabic-caps+paragraph" if parsed.get("caps_scheme")  # F3
              else "arabic-hierarchical")  # F1
    detF = FAMILY_F.get(family, "?")
    expF = EXPECTED_FAMILY.get(args.doc, "?")
    gate = "OK" if expF == detF else ("REVIEW(exp=%s)" % expF if expF != "?" else "no-expected")
    prov = {"parser_version": "StructuredGuidanceExtractor@%s+sge_build" % sha, "ingest_run_id": run_id,
            "ingest_date": date, "source_url": src_url(src), "source_inner_file": src.get("inner", ""),
            "build_collection": BUILD_COLLECTION, "manifest_version": MANIFEST_VERSION, "document_id": args.doc,
            "layout_family": detF, "document_type": doc_type, "expected_reference_types": expected_refs}
    units, table_units = run_engine(args.doc, doc, lang, parsed, run_tag, prov, cfg, pdf)  # CUTOVER: engine path
    total_refs = sum(len(u.meta["references_out"]) for u in units)
    detect = "TOC(%d)" % parsed.get("toc_entries", 0) if parsed.get("toc") else "heuristic"
    log.info("UNITS=%d table_units=%d | VITALS family=%s(%s) FAMILY-GATE=%s detect=%s tables_detect=%d tables_extract=%d annex=%s parse_ms=%d chunks/page(units)=%.2f refs/unit=%.2f",
             len(units), len(table_units), family, detF, gate, detect, parsed.get("tables", 0), len(table_units),
             "Y" if parsed.get("annex") else "N", parse_ms, len(units) / max(pages, 1), total_refs / max(len(units), 1))
    if gate.startswith("REVIEW"):
        log.warning("FAMILY-GATE REVIEW: %s erwartet %s, erkannt %s — Klassifikation prüfen", args.doc, expF, detF)
    if units:
        m = units[0].meta
        pk = ["parser_version", "ingest_run_id", "ingest_date", "source_url", "build_collection", "manifest_version", "document_id"]
        log.info("sample: label=%r language=%r source_class=%r use_for_primary=%r refs=%s",
                 m.get("article_label"), m.get("language"), m.get("source_class"), m.get("use_for_primary"), m.get("references_out"))
        log.info("provenance present: %s", all(k in m for k in pk))

    if args.dry_run:
        import json as _json
        import hashlib as _hl
        _g = [{"id": u.document_version, "filename": u.filename, "kind": ("table" if u.meta.get("is_table") else "section"),
               "text_sha": _hl.sha256((u.text or "").encode()).hexdigest()[:16], "meta": u.meta} for u in (units + table_units)]
        _bp = "/tmp/baseline_%s_%s.json" % (args.doc, lang)
        # ensure_ascii=False emits non-ASCII text, so pin the encoding instead of
        # relying on the locale, and close the handle deterministically.
        with open(_bp, "w", encoding="utf-8") as _fh:
            _fh.write(_json.dumps(_g, ensure_ascii=False, indent=1, default=str))
        log.info("DRY RUN baseline -> %s (%d units: %d section + %d table)", _bp, len(_g), len(units), len(table_units))
        return
    # The gate is enforced only for real uploads; a dry run always writes its baseline.
    if not ok:
        log.error("GATE FAILED — aborting")
        sys.exit(1)
    n = 0
    # NOTE(review): verify=False disables TLS certificate verification — confirm
    # this is intended for the RAG endpoint.
    with httpx.Client(timeout=600.0, verify=False) as c:
        for u in units + table_units:
            n += upload_unit(c, RAG_URL, u)
    log.info("UPLOADED: %d section + %d table units -> %d chunks", len(units), len(table_units), n)
|
||||
|
||||
|
||||
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,132 @@
|
||||
"""GuidanceIngester (Parser 3): ingests supervisory guidance (EDPB / DSK / ENISA
|
||||
/ BSI / CNIL) as a SEPARATE interpretative source — never a primary obligation.
|
||||
|
||||
Guidance documents are heterogeneous PDFs / HTML, unlike the uniform eur-lex
|
||||
XHTML of Parsers 1-2. This module extracts the text (pdfplumber for PDF, a small
|
||||
HTML stripper otherwise), normalises it, and tags it
|
||||
source_class=supervisory_guidance / authority_weight=70 /
|
||||
bindingness=interpretative / use_for_primary=false, with references_out to the
|
||||
binding norms it interprets (Art. N DSGVO / § N BDSG). So guidance ranks BELOW
|
||||
binding law for obligation questions, yet stays fully retrievable as
|
||||
interpretation context (and is the right Top-1 for "what does the EDPB say?").
|
||||
|
||||
Chunking is left to the RAG service (chunk_strategy='legal'); each resulting
|
||||
chunk inherits the guidance metadata. pdfplumber is imported lazily so the
|
||||
module (and its tests) load without it.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import html as html_lib
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
from services.legal_act_ingester import UploadUnit
|
||||
|
||||
GUIDANCE_WEIGHT = 70
|
||||
|
||||
_TAG_RE = re.compile(r"<[^>]+>")
|
||||
_WS_RE = re.compile(r"[ \t]+")
|
||||
_BLANK_RE = re.compile(r"\n{3,}")
|
||||
# "Artikel 37", "Art. 38", "Article 9" → the interpreted article number
|
||||
_ART_REF_RE = re.compile(r"\bArt(?:ikel|icle|\.)?\s*(\d+[a-z]?)", re.IGNORECASE)
|
||||
_PARA_REF_RE = re.compile(r"§\s*(\d+[a-z]?)")
|
||||
_MIN_GUIDANCE_CHARS = 200
|
||||
|
||||
|
||||
@dataclass
class GuidanceSpec:
    """One guidance document plus the binding norm it interprets.

    Attributes:
        source_id: Stable handle, e.g. "edpb_dpo".
        short: Display handle used as regulation_short, e.g. "EDPB DPO".
        title: Full title.
        publisher: EDPB / DSK / ENISA / BSI / CNIL.
        url: Source URL of the document.
        interpreted_reg: Binding norm it interprets, e.g. "DSGVO"
            (used for references_out).
        collection: Target collection name.
        version_date: Version date string; empty by default.
        jurisdiction: Jurisdiction tag; "EU" by default.
    """

    source_id: str
    short: str
    title: str
    publisher: str
    url: str
    interpreted_reg: str
    collection: str = "bp_compliance_datenschutz"
    version_date: str = ""
    jurisdiction: str = "EU"
|
||||
|
||||
|
||||
def normalize_text(text: str) -> str:
    """Unescape HTML entities, squeeze intra-line whitespace and blank-line runs."""
    unescaped = html_lib.unescape(text)
    squeezed = _WS_RE.sub(" ", unescaped)
    trimmed_lines = [line.strip() for line in squeezed.split("\n")]
    # Three or more newlines in a row collapse to exactly one blank line.
    collapsed = _BLANK_RE.sub("\n\n", "\n".join(trimmed_lines))
    return collapsed.strip()
|
||||
|
||||
|
||||
def extract_pdf(path: str) -> str:
    """Return the normalised text of all pages of the PDF at ``path``.

    Pages for which pdfplumber yields no text are skipped; the remaining page
    texts are joined with newlines and passed through ``normalize_text``.
    """
    import pdfplumber  # noqa: PLC0415 — heavy, optional dep; only needed at ingest time

    with pdfplumber.open(path) as document:
        page_texts = [
            page.extract_text(x_tolerance=3, y_tolerance=4)
            for page in document.pages
        ]
    non_empty = [page_text for page_text in page_texts if page_text]
    return normalize_text("\n".join(non_empty))
|
||||
|
||||
|
||||
def extract_html(raw: str) -> str:
    """Replace every tag with a space and normalise the remaining text."""
    without_tags = _TAG_RE.sub(" ", raw)
    return normalize_text(without_tags)
|
||||
|
||||
|
||||
def guidance_refs_out(interpreted_reg: str, text: str) -> list[str]:
    """Return the sorted, de-duplicated norm citations found in ``text``.

    Article references are attributed to ``interpreted_reg``; every ``§ N``
    reference is attributed to the BDSG.
    """
    article_edges = [
        f"Art. {number} {interpreted_reg}" for number in _ART_REF_RE.findall(text)
    ]
    paragraph_edges = [f"§ {number} BDSG" for number in _PARA_REF_RE.findall(text)]
    return sorted(set(article_edges).union(paragraph_edges))
|
||||
|
||||
|
||||
def guidance_meta(spec: GuidanceSpec, text: str) -> dict[str, Any]:
    """Build the metadata dict attached to a guidance upload unit.

    ``text`` is only used to derive ``references_out``.
    """
    # Identity of the document.
    meta: dict[str, Any] = dict(
        regulation_code=spec.short,
        regulation_short=spec.short,
        regulation_name_de=spec.title,
    )
    # Classification and ranking.
    meta.update(
        citation_style="guidance",
        document_type="guidance",
        source_class="supervisory_guidance",
        bindingness="interpretative",
        authority_level=GUIDANCE_WEIGHT,
        authority_weight=GUIDANCE_WEIGHT,
        source_type="guidance",
    )
    # Provenance.
    meta.update(
        issuer=spec.publisher,
        jurisdiction=spec.jurisdiction,
        version_date=spec.version_date,
        source=spec.url,
        license="public_eu",
        category="guidance",
    )
    # Citation behaviour: interpretative, never a primary obligation source.
    meta.update(
        use_for_primary=False,
        is_citable=True,
        citation_unit=spec.title,
        article_label=spec.short,
        chunk_scope="guidance",
        interprets=spec.interpreted_reg,
        references_out=guidance_refs_out(spec.interpreted_reg, text),
        norm_id=f"GUIDANCE-{spec.source_id}",
    )
    return meta
|
||||
|
||||
|
||||
def self_test(text: str) -> tuple[bool, list[str]]:
    """Gate before upload: reject an empty or failed extraction.

    Returns ``(passed, problems)``; the only check is that the stripped text
    has at least ``_MIN_GUIDANCE_CHARS`` characters.
    """
    stripped_length = len(text.strip())
    if stripped_length < _MIN_GUIDANCE_CHARS:
        return (False, [f"extracted text too short ({stripped_length} chars)"])
    return (True, [])
|
||||
|
||||
|
||||
def build_upload_unit(spec: GuidanceSpec, text: str, run_tag: str) -> UploadUnit:
    """Wrap the whole guidance document in a single UploadUnit.

    Chunking is left to the RAG service; the metadata built here is attached
    to the unit as a whole.
    """
    source_id = spec.source_id
    metadata = guidance_meta(spec, text)
    unit = UploadUnit(
        filename=f"{source_id}.txt",
        text=text,
        meta=metadata,
        document_version=f"{run_tag}-{source_id}",
        collection=spec.collection,
    )
    return unit
|
||||
@@ -0,0 +1,332 @@
|
||||
"""Production LegalActIngester for EU eur-lex acts (Parser 1 of the corpus stack).
|
||||
|
||||
Downloads the German XHTML of an EU act (CELLAR machine endpoint, with the
|
||||
eur-lex web UI as fallback), parses it into ARTICLES + ANNEXES with full
|
||||
authority metadata and forward citation edges (references_out), and self-tests
|
||||
the parse before any upload. The eur-lex / CELLAR XHTML uses uniform CSS classes
|
||||
(oj-ti-art / oj-sti-art / oj-normal / oj-ti-grseq-1 / oj-doc-ti) across every
|
||||
act, so one parser covers DSGVO / CRA / AI Act / DORA / NIS2 / MaschinenVO / ...
|
||||
|
||||
Recitals are intentionally NOT handled here — they are a separate, lower-weight
|
||||
source (RecitalIngester, Parser 2). Scope of this module: binding articles + the
|
||||
annexes that carry the actual obligations.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import html as html_lib
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
CHUNK_SIZE = "2500"
|
||||
CHUNK_OVERLAP = "200"
|
||||
|
||||
# CELLAR is the canonical machine endpoint and returns the full XHTML even for
|
||||
# acts the eur-lex web UI blocks with an empty HTTP 202 (e.g. DORA). Try it
|
||||
# first, fall back to the web UI for anything CELLAR cannot serve.
|
||||
CELLAR_URL = "http://publications.europa.eu/resource/celex/{celex}"
|
||||
EURLEX_URL = "https://eur-lex.europa.eu/legal-content/DE/TXT/HTML/?uri=CELEX:{celex}"
|
||||
_USER_AGENT = "Mozilla/5.0 (compatible; BreakPilot-LegalActIngester/1.0)"
|
||||
|
||||
_P_RE = re.compile(
|
||||
r'<p[^>]*class="oj-(ti-art|sti-art|normal|expanded|ti-grseq-1|doc-ti)"[^>]*>(.*?)</p>',
|
||||
re.S,
|
||||
)
|
||||
_TAG_RE = re.compile(r"<[^>]+>")
|
||||
_ART_RE = re.compile(r"Artikel\s+(\d+[a-z]?)")
|
||||
_ANNEX_RE = re.compile(r"ANHANG\s+([IVXLC]+)\b")
|
||||
_PARA_RE = re.compile(r"§\s*(\d+[a-z]?)")
|
||||
_ANNEX_REF_RE = re.compile(r"Anh[ae]ng\s+([IVXLC]+)\b")
|
||||
_EMPTY_ANNEX_CHARS = 15 # below this an annex is a table/product list → skip on upload
|
||||
|
||||
|
||||
@dataclass
class RegSpec:
    """The minimum an act needs to be ingested and cited.

    Attributes:
        reg: Short citation handle, e.g. "CRA".
        celex: CELEX number, e.g. "32024R2847".
        name_de: German name of the act.
        collection: Target collection name.
        version_date: ISO date, e.g. "2024-10-23"; empty by default.
        legal_basis_rank: "eu_regulation" (default) or "eu_directive".
    """

    reg: str
    celex: str
    name_de: str
    collection: str = "bp_compliance_ce"
    version_date: str = ""
    legal_basis_rank: str = "eu_regulation"
|
||||
|
||||
|
||||
@dataclass
class Article:
    """One parsed article: number, title, body paragraphs and enclosing chapter."""

    num: str
    title: str = ""
    # default_factory gives every instance its own list
    body: list[str] = field(default_factory=list)
    chapter: str = ""
|
||||
|
||||
|
||||
@dataclass
class Annex:
    """One parsed annex: numeral, title and body paragraphs."""

    num: str
    title: str = ""
    # default_factory gives every instance its own list
    body: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
class ParsedAct:
    """Parse result for one act: its handle plus the articles and annexes found."""

    reg: str
    articles: list[Article]
    annexes: list[Annex]
|
||||
|
||||
|
||||
@dataclass
class UploadUnit:
    """One document to upload: file name, text, metadata, version and collection."""

    filename: str
    text: str
    meta: dict[str, Any]
    document_version: str
    collection: str
|
||||
|
||||
|
||||
def clean(fragment: str) -> str:
    """Remove tags, unescape HTML entities and collapse all whitespace to single spaces."""
    tagless = _TAG_RE.sub("", fragment)
    words = html_lib.unescape(tagless).split()
    return " ".join(words)
|
||||
|
||||
|
||||
def download_act(celex: str, *, client: httpx.Client | None = None) -> str:
    """Fetch the German XHTML of an act, trying CELLAR first and eur-lex second.

    A response is accepted only if its status is 200 and its body contains the
    ``oj-ti-art`` article marker. A client created here is closed on exit; a
    client passed in by the caller is left open.

    Raises:
        RuntimeError: If neither source yields a usable document.
    """
    own_client = client is None
    http = httpx.Client(timeout=60.0, follow_redirects=True) if own_client else client
    try:
        cellar_headers = {
            "Accept-Language": "deu",
            "Accept": "application/xhtml+xml, text/html;q=0.9",
        }
        sources: tuple[tuple[str, dict[str, str]], ...] = (
            (CELLAR_URL.format(celex=celex), cellar_headers),
            (EURLEX_URL.format(celex=celex), {}),
        )
        for url, extra_headers in sources:
            request_headers = {"User-Agent": _USER_AGENT, **extra_headers}
            try:
                resp = http.get(url, headers=request_headers)
            except httpx.HTTPError as exc:
                # Transport-level failure: move on to the next source.
                logger.warning("legal-act fetch error for %s: %s", url, exc)
                continue
            body = resp.text
            if resp.status_code != 200 or "oj-ti-art" not in body:
                logger.warning(
                    "no usable doc for CELEX %s from %s (status=%s, len=%d)",
                    celex, url, resp.status_code, len(body),
                )
                continue
            logger.info("downloaded CELEX %s from %s (%d chars)", celex, url, len(body))
            return body
        raise RuntimeError(f"no usable XHTML for CELEX {celex} (CELLAR + eur-lex failed)")
    finally:
        if own_client:
            http.close()
|
||||
|
||||
|
||||
def refs_out(reg: str, text: str) -> list[str]:
    """Return the sorted, de-duplicated citations found in ``text``.

    Articles and annexes are attributed to ``reg``; ``§ N`` references are
    attributed to the BDSG.
    """
    edges: set[str] = set()
    edges.update(f"Art. {number} {reg}" for number in _ART_RE.findall(text))
    edges.update(f"§ {number} BDSG" for number in _PARA_RE.findall(text))
    edges.update(f"{reg} Anhang {numeral}" for numeral in _ANNEX_REF_RE.findall(text))
    return sorted(edges)
|
||||
|
||||
|
||||
def parse_html(raw: str, reg: str) -> ParsedAct:
    """Split eur-lex/CELLAR XHTML into articles and annexes.

    Body text before the first article heading is dropped; recitals are
    handled by a separate parser. Once an annex heading has been seen, all
    later paragraphs belong to an annex.
    """
    body_classes = ("normal", "expanded")
    parsed_articles: list[Article] = []
    parsed_annexes: list[Annex] = []
    open_article: Article | None = None
    open_annex: Annex | None = None
    current_chapter = ""

    for match in _P_RE.finditer(raw):
        css_class, fragment = match.group(1), match.group(2)
        text = clean(fragment)
        if not text:
            continue

        # A "doc-ti" paragraph that starts with "ANHANG <numeral>" opens a new annex.
        if css_class == "doc-ti":
            annex_start = _ANNEX_RE.match(text)
            if annex_start is not None:
                if open_article is not None:
                    parsed_articles.append(open_article)
                    open_article = None
                if open_annex is not None:
                    parsed_annexes.append(open_annex)
                open_annex = Annex(num=annex_start.group(1))
                continue

        # Annex mode: the first heading becomes the title, the rest is body.
        if open_annex is not None:
            is_heading = css_class in ("doc-ti", "ti-grseq-1")
            if is_heading and not open_annex.title:
                open_annex.title = text
            elif css_class == "ti-grseq-1" or css_class in body_classes:
                open_annex.body.append(text)
            continue

        # Article mode. Other "doc-ti" paragraphs (document title, preamble
        # headings) match none of the branches below and are ignored.
        if css_class == "ti-grseq-1":
            current_chapter = text  # chapter / section heading
        elif css_class == "ti-art":
            heading = _ART_RE.match(text)
            if heading is not None:
                if open_article is not None:
                    parsed_articles.append(open_article)
                open_article = Article(num=heading.group(1), chapter=current_chapter)
        elif open_article is not None:
            if css_class == "sti-art":
                open_article.title = text
            elif css_class in body_classes:
                open_article.body.append(text)

    # Flush whatever is still open at the end of the document.
    if open_article is not None:
        parsed_articles.append(open_article)
    if open_annex is not None:
        parsed_annexes.append(open_annex)
    return ParsedAct(reg=reg, articles=parsed_articles, annexes=parsed_annexes)
|
||||
|
||||
|
||||
def self_test(act: ParsedAct) -> tuple[bool, list[str]]:
    """Gate the parse before upload and return ``(passed, problems)``.

    Checks that articles exist, that their numbers are unique and that no
    article body is shorter than 15 characters. Annexes are not checked.
    """
    numbers = [article.num for article in act.articles]
    hollow = [
        article.num for article in act.articles if len(" ".join(article.body)) < 15
    ]
    checks = (
        (not act.articles, "0 articles parsed"),
        (len(set(numbers)) != len(numbers), "duplicate article numbers"),
        (bool(hollow), f"{len(hollow)} empty articles (e.g. {hollow[:3]})"),
    )
    problems: list[str] = [message for failed, message in checks if failed]
    return (not problems, problems)
|
||||
|
||||
|
||||
def _base_meta(spec: RegSpec) -> dict[str, Any]:
|
||||
return {
|
||||
"regulation_code": spec.reg,
|
||||
"regulation_short": spec.reg,
|
||||
"regulation_name_de": spec.name_de,
|
||||
"citation_style": "article",
|
||||
"document_type": "legal_act",
|
||||
"source_class": "binding_law",
|
||||
"bindingness": "binding",
|
||||
"authority_level": 95,
|
||||
"authority_weight": 100,
|
||||
"source_type": "law",
|
||||
"issuer": "European Union",
|
||||
"jurisdiction": "EU",
|
||||
"legal_basis_rank": spec.legal_basis_rank,
|
||||
"version_date": spec.version_date,
|
||||
"source": "eur-lex.europa.eu",
|
||||
"license": "public_eu",
|
||||
"category": "recht",
|
||||
"celex": spec.celex,
|
||||
"use_for_primary": True,
|
||||
}
|
||||
|
||||
|
||||
def _article_meta(spec: RegSpec, art: Article) -> dict[str, Any]:
    """Return the base metadata extended with the article-specific fields."""
    citation = f"Art. {art.num} {spec.reg}"
    compact_reg = spec.reg.replace(" ", "")
    return {
        **_base_meta(spec),
        "citation_unit": citation,
        "article_label": citation,
        "parent_citation_unit": citation,
        "is_citable": True,
        "article": art.num,
        "context_hierarchy": [art.chapter] if art.chapter else [],
        # Prefix the chapter only when the article has one.
        "display_context": f"{art.chapter} > {citation}" if art.chapter else citation,
        "chunk_scope": "section",
        "article_title": art.title,
        "article_type": "obligation",
        "references_out": refs_out(spec.reg, " ".join(art.body)),
        "norm_id": f"EU-{compact_reg}-Art{art.num}",
    }
|
||||
|
||||
|
||||
def _annex_meta(spec: RegSpec, annex: Annex) -> dict[str, Any]:
    """Return the base metadata extended with the annex-specific fields."""
    citation = f"{spec.reg} Anhang {annex.num}"
    compact_reg = spec.reg.replace(" ", "")
    return {
        **_base_meta(spec),
        "citation_unit": citation,
        "article_label": citation,
        "parent_citation_unit": citation,
        "is_citable": True,
        "article": f"Anhang-{annex.num}",  # distinct -> avoids point-ID collisions
        "context_hierarchy": [f"Anhang {annex.num}"],
        "display_context": citation,
        "chunk_scope": "annex",
        "article_title": annex.title,
        "article_type": "requirement",
        "references_out": refs_out(spec.reg, " ".join(annex.body)),
        "norm_id": f"EU-{compact_reg}-Anhang{annex.num}",
    }
|
||||
|
||||
|
||||
def build_upload_units(act: ParsedAct, spec: RegSpec, run_tag: str) -> list[UploadUnit]:
    """Build one UploadUnit per article and per non-empty annex.

    All articles share one document_version; each annex gets its own (the RAG
    service derives `article` from text and would otherwise collide annexes on
    chunk_index). Annexes whose joined body is shorter than
    ``_EMPTY_ANNEX_CHARS`` are skipped.
    """
    slug = spec.reg.lower().replace(" ", "")
    shared_version = f"{run_tag}-{slug}"

    article_units = [
        UploadUnit(
            filename=f"{slug}_art{art.num}.txt",
            text=f"Art. {art.num} {spec.reg} {art.title}\n\n" + "\n\n".join(art.body),
            meta=_article_meta(spec, art),
            document_version=shared_version,
            collection=spec.collection,
        )
        for art in act.articles
    ]
    annex_units = [
        UploadUnit(
            filename=f"{slug}_anhang{annex.num}.txt",
            text=f"{spec.reg} Anhang {annex.num} {annex.title}\n\n" + "\n\n".join(annex.body),
            meta=_annex_meta(spec, annex),
            document_version=f"{shared_version}-anhang{annex.num}",
            collection=spec.collection,
        )
        for annex in act.annexes
        # table / correspondence list — no usable prose
        if len(" ".join(annex.body)) >= _EMPTY_ANNEX_CHARS
    ]
    return article_units + annex_units
|
||||
|
||||
|
||||
def upload_unit(client: httpx.Client, rag_url: str, unit: UploadUnit) -> int:
    """POST one unit to the RAG upload endpoint and return its chunk count.

    A non-200 response is logged and reported as 0 chunks.
    """
    # First four characters of version_date, or "2026" when there is none.
    version_year = (unit.meta.get("version_date") or "")[:4]
    form = {
        "collection": unit.collection,
        "data_type": "compliance",
        "bundesland": "eu",
        "use_case": "legal_reference",
        "year": version_year or "2026",
        "chunk_strategy": "legal",
        "chunk_size": CHUNK_SIZE,
        "chunk_overlap": CHUNK_OVERLAP,
        "metadata_json": json.dumps(unit.meta, ensure_ascii=False),
        "document_version": unit.document_version,
    }
    payload = (unit.filename, unit.text.encode("utf-8"), "text/plain")
    response = client.post(
        f"{rag_url}/api/v1/documents/upload",
        files={"file": payload},
        data=form,
    )
    if response.status_code == 200:
        return int(response.json().get("chunks_count", 0))
    logger.error("upload %s failed: %s %s", unit.filename, response.status_code, response.text[:200])
    return 0
|
||||
@@ -0,0 +1,115 @@
|
||||
"""RecitalIngester (Parser 2): ingests EU act recitals (Erwägungsgründe) as a
|
||||
SEPARATE, interpretative source — never a primary obligation source.
|
||||
|
||||
In eur-lex / CELLAR XHTML each recital sits in a preamble block
|
||||
<div class="eli-subdivision" id="rct_N"> with the marker "(N)" and the text in
|
||||
adjacent table cells, which is why a naive article parser finds none. This
|
||||
parser keys on the id="rct_N" markers and joins the recital's prose.
|
||||
|
||||
Recitals are tagged source_class=recital / authority_weight=60 /
|
||||
use_for_primary=false, so they rank below binding articles and surface only as
|
||||
interpretation context (and trip the human-review flag if they ever top
|
||||
results). Reuses the eur-lex download + helpers from legal_act_ingester
|
||||
(Parser 1).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
from services.legal_act_ingester import RegSpec, UploadUnit, clean, refs_out
|
||||
|
||||
RECITAL_WEIGHT = 60
|
||||
|
||||
_RCT_DIV_RE = re.compile(r'id="rct_(\d+)"')
|
||||
_OJ_P_RE = re.compile(r'<p[^>]*class="oj-normal"[^>]*>(.*?)</p>', re.S)
|
||||
_RCT_NUM_RE = re.compile(r"^\(\d+\)$")
|
||||
_MIN_RECITAL_CHARS = 20
|
||||
|
||||
|
||||
@dataclass
class Recital:
    """One parsed recital: its number and its joined prose."""

    num: str
    text: str
|
||||
|
||||
|
||||
def parse_recitals(raw: str, reg: str) -> list[Recital]:
    """Extract the recitals that precede the first article heading.

    Each recital spans from its ``id="rct_N"`` marker to the next marker (or
    to the end of the preamble). Bare "(N)" paragraphs are dropped and
    recitals shorter than ``_MIN_RECITAL_CHARS`` are skipped. ``reg`` is
    unused; it is accepted for symmetry with the other parsers.
    """
    _ = reg
    preamble_end = raw.find('class="oj-ti-art"')
    if preamble_end == -1:
        preamble_end = len(raw)

    hits = list(_RCT_DIV_RE.finditer(raw, 0, preamble_end))
    # Every block ends where the next marker starts; the last ends with the preamble.
    stops = [hit.start() for hit in hits[1:]] + [preamble_end]

    found: list[Recital] = []
    for hit, stop in zip(hits, stops):
        paragraphs = (clean(inner) for inner in _OJ_P_RE.findall(raw[hit.start():stop]))
        prose = " ".join(
            paragraph
            for paragraph in paragraphs
            if paragraph and not _RCT_NUM_RE.match(paragraph)
        )
        if len(prose) < _MIN_RECITAL_CHARS:
            continue
        found.append(Recital(num=hit.group(1), text=prose))
    return found
|
||||
|
||||
|
||||
def self_test(recitals: list[Recital]) -> tuple[bool, list[str]]:
    """Gate before upload and return ``(passed, problems)``.

    Zero recitals counts as a parse failure; duplicate numbers are reported too.
    """
    numbers = [recital.num for recital in recitals]
    checks = (
        (not recitals, "0 recitals parsed"),
        (len(set(numbers)) != len(numbers), "duplicate recital numbers"),
    )
    problems: list[str] = [message for failed, message in checks if failed]
    return (not problems, problems)
|
||||
|
||||
|
||||
def _recital_meta(spec: RegSpec, rc: Recital) -> dict[str, Any]:
    """Build the metadata dict attached to one recital upload unit."""
    citation = f"{spec.reg} Erwägungsgrund {rc.num}"
    # Identity of the act.
    meta: dict[str, Any] = dict(
        regulation_code=spec.reg,
        regulation_short=spec.reg,
        regulation_name_de=spec.name_de,
    )
    # Classification and ranking.
    meta.update(
        citation_style="recital",
        document_type="legal_act",
        source_class="recital",
        bindingness="interpretative",
        authority_level=60,
        authority_weight=RECITAL_WEIGHT,
        source_type="law",
    )
    # Provenance.
    meta.update(
        issuer="European Union",
        jurisdiction="EU",
        legal_basis_rank=spec.legal_basis_rank,
        version_date=spec.version_date,
        source="eur-lex.europa.eu",
        license="public_eu",
        category="recht",
        celex=spec.celex,
    )
    # Citation behaviour: interpretative — never a primary obligation source.
    meta.update(
        use_for_primary=False,
        is_recital=True,
        citation_unit=citation,
        article_label=citation,
        article=f"Erwaegungsgrund-{rc.num}",  # distinct -> avoids point-ID collisions
        chunk_scope="recital",
        article_type="recital",
        references_out=refs_out(spec.reg, rc.text),
        norm_id=f"EU-{spec.reg.replace(' ', '')}-Rec{rc.num}",
    )
    return meta
|
||||
|
||||
|
||||
def build_upload_units(recitals: list[Recital], spec: RegSpec, run_tag: str) -> list[UploadUnit]:
    """Build one UploadUnit per recital, each with its own document_version.

    The RAG service derives `article` from text and would otherwise collide
    recitals, hence the per-recital version suffix.
    """
    slug = spec.reg.lower().replace(" ", "")
    version_prefix = f"{run_tag}-{slug}"
    return [
        UploadUnit(
            filename=f"{slug}_rec{rc.num}.txt",
            text=f"{spec.reg} Erwägungsgrund {rc.num}\n\n{rc.text}",
            meta=_recital_meta(spec, rc),
            document_version=f"{version_prefix}-rec{rc.num}",
            collection=spec.collection,
        )
        for rc in recitals
    ]
|
||||
@@ -0,0 +1,81 @@
|
||||
"""StandardIngester (Parser 4): ingests technical standards / control frameworks
|
||||
(NIST / OWASP / BSI Grundschutz / CSA CCM) as best-practice CONTROLS — never a
|
||||
primary obligation.
|
||||
|
||||
Tagged source_class=technical_standard / authority_weight=80 /
|
||||
bindingness=best_practice / use_for_primary=false, so a standard ranks below
|
||||
binding law AND supervisory guidance for an obligation/interpretation question,
|
||||
but surfaces for "which controls / measures fit?" questions (control-intent in
|
||||
the retriever). Reuses the PDF/HTML extraction helpers from guidance_ingester.
|
||||
|
||||
LICENSE per source matters: NIST = US public domain (free), OWASP = CC-BY-SA
|
||||
(free, share-alike), BSI Grundschutz = check terms. CSA CCM is CC-BY-NC (NON
|
||||
commercial) → NOT usable in a commercial product; carry the license on each unit
|
||||
so the gate can refuse it.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
from services.legal_act_ingester import UploadUnit
|
||||
|
||||
# Extraction (extract_pdf / extract_html / normalize_text) + the self_test gate are
|
||||
# shared with guidance_ingester — the operational runner imports them from there.
|
||||
|
||||
# Value written to both authority_level and authority_weight of standard units.
STANDARD_WEIGHT = 80
|
||||
|
||||
|
||||
@dataclass
class StandardSpec:
    """One technical standard / control framework document.

    Attributes:
        source_id: Stable handle, e.g. "nist_csf_2_0".
        short: Display handle / regulation_short, e.g. "NIST CSF 2.0".
        title: Full title.
        publisher: NIST / OWASP / BSI / CSA.
        url: Source URL of the document.
        license: e.g. "public_domain" | "CC-BY-SA-4.0" | "CC-BY-NC"
            (the latter is commercial-blocked).
        collection: Target collection name.
        version_date: Version date string; empty by default.
        jurisdiction: Jurisdiction tag; "international" by default.
    """

    source_id: str
    short: str
    title: str
    publisher: str
    url: str
    license: str
    collection: str = "bp_compliance_ce"
    version_date: str = ""
    jurisdiction: str = "international"
|
||||
|
||||
|
||||
def standard_meta(spec: StandardSpec) -> dict[str, Any]:
    """Build the metadata dict attached to a technical-standard upload unit."""
    # Identity of the document.
    meta: dict[str, Any] = dict(
        regulation_code=spec.short,
        regulation_short=spec.short,
        regulation_name_de=spec.title,
    )
    # Classification and ranking.
    meta.update(
        citation_style="standard",
        document_type="standard",
        source_class="technical_standard",
        bindingness="best_practice",
        authority_level=STANDARD_WEIGHT,
        authority_weight=STANDARD_WEIGHT,
        source_type="standard",
    )
    # Provenance; the license is carried per unit.
    meta.update(
        issuer=spec.publisher,
        jurisdiction=spec.jurisdiction,
        version_date=spec.version_date,
        source=spec.url,
        license=spec.license,
        category="standard",
    )
    # Citation behaviour: best-practice control — never a primary obligation.
    meta.update(
        use_for_primary=False,
        is_citable=True,
        citation_unit=spec.title,
        article_label=spec.short,
        chunk_scope="standard",
        norm_id=f"STANDARD-{spec.source_id}",
    )
    return meta
|
||||
|
||||
|
||||
def build_upload_unit(spec: StandardSpec, text: str, run_tag: str) -> UploadUnit:
    """Wrap a whole standard into a single ``UploadUnit``.

    The standard is not split here; per the original note, the RAG service
    chunks it and each chunk inherits the technical_standard metadata.

    Args:
        spec: Descriptor of the standard being uploaded.
        text: Full text of the standard.
        run_tag: Run identifier used as the prefix of ``document_version``.

    Returns:
        An ``UploadUnit`` named after ``spec.source_id`` and carrying the
        metadata produced by ``standard_meta``.
    """
    upload_name = f"{spec.source_id}.txt"
    metadata = standard_meta(spec)
    version = f"{run_tag}-{spec.source_id}"
    return UploadUnit(
        filename=upload_name,
        text=text,
        meta=metadata,
        document_version=version,
        collection=spec.collection,
    )
|
||||
@@ -0,0 +1,17 @@
|
||||
<!DOCTYPE html>
|
||||
<html><body>
|
||||
<p class="oj-doc-ti">VERORDNUNG (EU) 2099/1 DES TESTGEBERS</p>
|
||||
<p class="oj-normal">(1) Dieser Erwaegungsgrund steht vor den Artikeln und darf NICHT als Artikel geparst werden.</p>
|
||||
<p class="oj-ti-grseq-1">KAPITEL I</p>
|
||||
<p class="oj-ti-art">Artikel 1</p>
|
||||
<p class="oj-sti-art">Gegenstand</p>
|
||||
<p class="oj-normal">Diese Verordnung legt Anforderungen fest; Einzelheiten regeln Artikel 2 und Anhang I.</p>
|
||||
<p class="oj-ti-art">Artikel 2</p>
|
||||
<p class="oj-sti-art">Begriffsbestimmungen</p>
|
||||
<p class="oj-normal">Im Sinne dieser Verordnung bezeichnet der Ausdruck Produkt eine Sache mit digitalen Elementen.</p>
|
||||
<p class="oj-doc-ti">ANHANG I</p>
|
||||
<p class="oj-ti-grseq-1">GRUNDLEGENDE ANFORDERUNGEN</p>
|
||||
<p class="oj-normal">Die Produkte muessen die grundlegenden Anforderungen gemaess Artikel 1 dauerhaft erfuellen.</p>
|
||||
<p class="oj-doc-ti">ANHANG II</p>
|
||||
<p class="oj-normal">x</p>
|
||||
</body></html>
|
||||
@@ -0,0 +1,19 @@
|
||||
<!DOCTYPE html>
|
||||
<html><body>
|
||||
<p class="oj-normal">DAS EUROPÄISCHE PARLAMENT — in Erwägung nachstehender Gründe:</p>
|
||||
<div class="eli-subdivision" id="rct_1">
|
||||
<table><tbody><tr>
|
||||
<td><p class="oj-normal">(1)</p></td>
|
||||
<td><p class="oj-normal">Dieser erste Erwaegungsgrund erklaert den Hintergrund der Verordnung ausfuehrlich und verweist auf Artikel 5.</p></td>
|
||||
</tr></tbody></table>
|
||||
</div>
|
||||
<div class="eli-subdivision" id="rct_2">
|
||||
<table><tbody><tr>
|
||||
<td><p class="oj-normal">(2)</p></td>
|
||||
<td><p class="oj-normal">Der zweite Erwaegungsgrund ergaenzt den ersten und nennt weitere Ziele der Regelung im Detail.</p></td>
|
||||
</tr></tbody></table>
|
||||
</div>
|
||||
<p class="oj-ti-art">Artikel 1</p>
|
||||
<p class="oj-sti-art">Gegenstand</p>
|
||||
<p class="oj-normal">Der eigentliche Artikeltext, der KEIN Erwaegungsgrund ist und nicht als solcher geparst werden darf.</p>
|
||||
</body></html>
|
||||
@@ -0,0 +1,72 @@
|
||||
"""Unit tests for the GuidanceIngester engine (Parser 3).
|
||||
|
||||
Pure tests on the text + metadata path (PDF extraction is a lazy pdfplumber
|
||||
wrapper, exercised in the container). Covers: normalisation, HTML stripping,
|
||||
references_out to the interpreted norm, the interpretative (non-primary)
|
||||
metadata and the self-test gate.
|
||||
"""
|
||||
|
||||
from services.guidance_ingester import (
|
||||
GuidanceSpec,
|
||||
build_upload_unit,
|
||||
extract_html,
|
||||
guidance_meta,
|
||||
guidance_refs_out,
|
||||
normalize_text,
|
||||
self_test,
|
||||
)
|
||||
|
||||
# Shared guidance descriptor used by the metadata and upload-unit tests below.
SPEC = GuidanceSpec(
    source_id="edpb_dpo",
    short="EDPB DPO",
    title="EDPB Leitlinien zum Datenschutzbeauftragten",
    publisher="EDPB",
    url="https://edpb.europa.eu/guidelines/dpo",
    interpreted_reg="DSGVO",
    version_date="2017-04-05",
)
|
||||
|
||||
|
||||
def test_normalize_text_collapses_whitespace_and_blank_runs():
    """normalize_text squeezes inline whitespace and runs of blank lines."""
    messy = "a b\t c\n\n\n\nd"
    tidy = "a b c\n\nd"
    assert normalize_text(messy) == tidy
|
||||
|
||||
|
||||
def test_extract_html_strips_tags():
    """extract_html returns the visible text without the surrounding markup."""
    plain = extract_html("<p>Hallo <b>Welt</b></p>")
    assert "Hallo Welt" in plain
|
||||
|
||||
|
||||
def test_guidance_refs_out_links_to_interpreted_reg():
    """Article mentions resolve to the interpreted regulation; § refs are kept."""
    sentence = "Gemaess Artikel 37, Art. 38 und Article 9 der Verordnung sowie § 38 BDSG."
    found = guidance_refs_out("DSGVO", sentence)
    expected_refs = ("Art. 37 DSGVO", "Art. 38 DSGVO", "Art. 9 DSGVO", "§ 38 BDSG")
    for ref in expected_refs:
        assert ref in found
|
||||
|
||||
|
||||
def test_guidance_meta_is_interpretative_not_primary():
    """Guidance metadata is tagged interpretative and never primary."""
    meta = guidance_meta(SPEC, "Diese Leitlinie erlaeutert Artikel 37 DSGVO im Detail.")

    # Plain equality checks on the classification / identity fields.
    expected_fields = {
        "source_class": "supervisory_guidance",
        "authority_weight": 70,
        "bindingness": "interpretative",
        "chunk_scope": "guidance",
        "regulation_short": "EDPB DPO",
        "interprets": "DSGVO",
        "issuer": "EDPB",
    }
    for key, wanted in expected_fields.items():
        assert meta[key] == wanted

    # Identity check: the flag must be the literal False, not merely falsy.
    assert meta["use_for_primary"] is False
    assert "Art. 37 DSGVO" in meta["references_out"]
|
||||
|
||||
|
||||
def test_self_test_passes_long_and_flags_short():
    """self_test accepts a 300-char text and reports a short one as a problem."""
    long_passed, _ = self_test("x" * 300)
    assert long_passed

    short_passed, issues = self_test("too short")
    assert not short_passed
    assert "too short" in issues[0]
|
||||
|
||||
|
||||
def test_build_upload_unit_tags_collection_and_version():
    """The upload unit carries the run-tagged version, collection and filename."""
    body = "A" * 300 + " Artikel 35 DSGVO"
    unit = build_upload_unit(SPEC, body, "run9")

    observed = (unit.document_version, unit.collection, unit.filename)
    assert observed == ("run9-edpb_dpo", "bp_compliance_datenschutz", "edpb_dpo.txt")
    assert unit.meta["use_for_primary"] is False
|
||||
@@ -0,0 +1,108 @@
|
||||
"""Unit tests for the LegalActIngester engine (Parser 1).
|
||||
|
||||
Pure parser + metadata tests against a synthetic eur-lex fixture — no network,
|
||||
no RAG service. Covers: article/annex parsing, recital exclusion, references_out,
|
||||
the self-test gate, full authority metadata and empty-annex skipping.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
from services.legal_act_ingester import (
|
||||
RegSpec,
|
||||
build_upload_units,
|
||||
parse_html,
|
||||
refs_out,
|
||||
self_test,
|
||||
)
|
||||
|
||||
# Synthetic eur-lex act located in the "fixtures" directory next to this module.
FIXTURE = os.path.join(
    os.path.dirname(__file__),
    "fixtures",
    "sample_eurlex_act.html",
)

# Regulation descriptor shared by the upload-unit tests.
SPEC = RegSpec(
    reg="TEST",
    celex="32099R0001",
    name_de="Testverordnung",
    version_date="2099-01-01",
    legal_basis_rank="eu_regulation",
)
|
||||
|
||||
|
||||
def _raw() -> str:
    """Return the full text of the fixture file, decoded as UTF-8."""
    with open(FIXTURE, encoding="utf-8") as handle:
        content = handle.read()
    return content
|
||||
|
||||
|
||||
def test_parse_articles_and_annexes():
    """The fixture yields articles 1-2 and annexes I-II with title and chapter."""
    parsed = parse_html(_raw(), "TEST")

    article_nums = [article.num for article in parsed.articles]
    annex_nums = [annex.num for annex in parsed.annexes]
    assert article_nums == ["1", "2"]
    assert annex_nums == ["I", "II"]

    first_article = parsed.articles[0]
    assert first_article.title == "Gegenstand"
    assert first_article.chapter == "KAPITEL I"

    first_annex_paragraph = parsed.annexes[0].body[0]
    assert "grundlegenden Anforderungen" in first_annex_paragraph
|
||||
|
||||
|
||||
def test_recital_before_articles_is_ignored():
    """A recital paragraph ahead of Article 1 must not end up in any article."""
    # The fixture's "(1) Dieser Erwaegungsgrund …" paragraph sits before
    # Article 1; recitals are Parser 2's job, so no article body may contain it.
    parsed = parse_html(_raw(), "TEST")
    paragraphs = []
    for article in parsed.articles:
        for paragraph in article.body:
            paragraphs.append(paragraph)
    all_text = " ".join(paragraphs)
    assert "Erwaegungsgrund" not in all_text
|
||||
|
||||
|
||||
def test_refs_out_extracts_article_and_annex_edges():
    """refs_out finds article and annex references in both directions."""
    parsed = parse_html(_raw(), "TEST")

    # Article 1 mentions Article 2 and Annex I.
    article_text = " ".join(parsed.articles[0].body)
    from_article = refs_out("TEST", article_text)
    assert "Art. 2 TEST" in from_article
    assert "TEST Anhang I" in from_article

    # Annex I points back to Article 1 (the bidirectional graph is built later).
    annex_text = " ".join(parsed.annexes[0].body)
    from_annex = refs_out("TEST", annex_text)
    assert "Art. 1 TEST" in from_annex
|
||||
|
||||
|
||||
def test_self_test_passes_clean_act():
    """The parsed fixture passes the self-test gate."""
    parsed = parse_html(_raw(), "TEST")
    ok, issues = self_test(parsed)
    # The issue list is the assertion message so a failure shows what was flagged.
    assert ok, issues
|
||||
|
||||
|
||||
def test_self_test_flags_empty_and_duplicate():
    """self_test rejects duplicate article numbers and near-empty bodies."""
    from services.legal_act_ingester import Article, ParsedAct

    # Two articles sharing the number "1" must be reported as a duplicate.
    first = Article("1", body=["enough text here ok"])
    second = Article("1", body=["also enough text"])
    duplicated = ParsedAct(reg="X", articles=[first, second], annexes=[])
    dup_ok, dup_issues = self_test(duplicated)
    assert not dup_ok
    assert any("duplicate" in issue for issue in dup_issues)

    # A one-character body must be reported as empty.
    near_empty = ParsedAct(reg="X", articles=[Article("1", body=["x"])], annexes=[])
    empty_ok, empty_issues = self_test(near_empty)
    assert not empty_ok
    assert any("empty" in issue for issue in empty_issues)
|
||||
|
||||
|
||||
def test_build_upload_units_skips_empty_annex_and_tags_authority():
    """Upload units skip the empty annex and carry binding-law metadata."""
    parsed = parse_html(_raw(), "TEST")
    units = build_upload_units(parsed, SPEC, "2099-test")

    # 2 articles + Annex I (Annex II body "x" is skipped) = 3 units
    assert len(units) == 3
    by_citation = {}
    for unit in units:
        by_citation[unit.meta["citation_unit"]] = unit
    assert set(by_citation) == {"Art. 1 TEST", "Art. 2 TEST", "TEST Anhang I"}

    article_unit = by_citation["Art. 1 TEST"]
    article_meta = article_unit.meta
    assert article_meta["chunk_scope"] == "section"
    assert article_meta["source_class"] == "binding_law"
    assert article_meta["authority_weight"] == 100
    assert article_meta["jurisdiction"] == "EU"
    assert article_meta["use_for_primary"] is True
    assert article_unit.document_version == "2099-test-test"

    annex_unit = by_citation["TEST Anhang I"]
    annex_meta = annex_unit.meta
    assert annex_meta["chunk_scope"] == "annex"
    assert annex_meta["article"] == "Anhang-I"
    # per-annex document_version prevents point-ID collisions across annexes
    assert annex_unit.document_version == "2099-test-test-anhangI"
|
||||
|
||||
|
||||
def test_build_upload_units_distinct_annex_versions():
    """Each annex unit gets its own document_version."""
    from services.legal_act_ingester import Annex, Article, ParsedAct

    only_article = Article("1", body=["body text long enough"])
    annex_one = Annex("I", body=["annex one body long enough"])
    annex_two = Annex("II", body=["annex two body long enough"])
    act = ParsedAct(reg="TEST", articles=[only_article], annexes=[annex_one, annex_two])

    units = build_upload_units(act, SPEC, "run9")
    annex_versions = []
    for unit in units:
        if unit.meta["chunk_scope"] == "annex":
            annex_versions.append(unit.document_version)

    assert annex_versions == ["run9-test-anhangI", "run9-test-anhangII"]
    assert len(set(annex_versions)) == 2
|
||||
@@ -0,0 +1,56 @@
|
||||
"""Unit tests for the RecitalIngester engine (Parser 2).
|
||||
|
||||
Pure parser + metadata tests against a synthetic eur-lex recital fixture (the
|
||||
id="rct_N" preamble-table structure). Covers: recital extraction, exclusion of
|
||||
article text, the self-test gate, and the interpretative (non-primary) metadata.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
from services.legal_act_ingester import RegSpec
|
||||
from services.recital_ingester import build_upload_units, parse_recitals, self_test
|
||||
|
||||
# Synthetic eur-lex recital page located in the "fixtures" directory next to this module.
FIXTURE = os.path.join(
    os.path.dirname(__file__),
    "fixtures",
    "sample_eurlex_recitals.html",
)

# Regulation descriptor shared by the upload-unit test.
SPEC = RegSpec(
    reg="TEST",
    celex="32099R0001",
    name_de="Testverordnung",
    version_date="2099-01-01",
)
|
||||
|
||||
|
||||
def _raw() -> str:
    """Return the full text of the recital fixture, decoded as UTF-8."""
    with open(FIXTURE, encoding="utf-8") as handle:
        content = handle.read()
    return content
|
||||
|
||||
|
||||
def test_parse_recitals_from_rct_markers():
    """Both recitals of the fixture are found, numbered "1" and "2"."""
    recitals = parse_recitals(_raw(), "TEST")
    numbers = [recital.num for recital in recitals]
    assert numbers == ["1", "2"]
    assert "Hintergrund" in recitals[0].text
|
||||
|
||||
|
||||
def test_article_text_is_not_captured_as_recital():
    """Recital text excludes the article body and the "(N)" numbering markers."""
    recitals = parse_recitals(_raw(), "TEST")
    combined = " ".join(recital.text for recital in recitals)

    # The article body must stay out of the recitals.
    assert "Artikeltext" not in combined
    # The "(N)" markers are stripped.
    assert not ("(1)" in combined or "(2)" in combined)
|
||||
|
||||
|
||||
def test_self_test_passes_and_flags_empty():
    """self_test accepts the parsed fixture and rejects an empty recital list."""
    parsed_ok, _ = self_test(parse_recitals(_raw(), "TEST"))
    assert parsed_ok

    empty_ok, issues = self_test([])
    assert not empty_ok
    assert "0 recitals" in issues[0]
|
||||
|
||||
|
||||
def test_recital_units_are_interpretative_not_primary():
    """Recital units carry recital metadata and per-recital versions."""
    recitals = parse_recitals(_raw(), "TEST")
    units = build_upload_units(recitals, SPEC, "run")
    assert len(units) == 2

    first_unit = units[0]
    second_unit = units[1]
    meta = first_unit.meta

    assert meta["source_class"] == "recital"
    assert meta["authority_weight"] == 60
    assert meta["use_for_primary"] is False
    assert meta["is_recital"] is True
    assert meta["chunk_scope"] == "recital"
    assert meta["citation_unit"] == "TEST Erwägungsgrund 1"
    assert meta["article"] == "Erwaegungsgrund-1"

    # per-recital document_version prevents point-ID collisions
    assert first_unit.document_version == "run-test-rec1"
    assert second_unit.document_version == "run-test-rec2"

    # recital 1 cites Artikel 5 → forward edge for the citation graph
    assert "Art. 5 TEST" in meta["references_out"]
|
||||
@@ -0,0 +1,35 @@
|
||||
"""Unit tests for the StandardIngester engine (Parser 4)."""
|
||||
|
||||
from services.standard_ingester import StandardSpec, build_upload_unit, standard_meta
|
||||
|
||||
# Shared standard descriptor; collection and jurisdiction keep their defaults.
SPEC = StandardSpec(
    source_id="nist_csf_2_0",
    short="NIST CSF 2.0",
    title="NIST Cybersecurity Framework 2.0",
    publisher="NIST",
    url="https://nist.gov/csf",
    license="public_domain",
    version_date="2024-02-26",
)
|
||||
|
||||
|
||||
def test_standard_meta_is_best_practice_not_primary():
    """Standard metadata is tagged best-practice and never primary."""
    meta = standard_meta(SPEC)

    assert meta["source_class"] == "technical_standard"
    assert meta["authority_weight"] == 80
    assert meta["bindingness"] == "best_practice"
    # Identity check: the flag must be the literal False, not merely falsy.
    assert meta["use_for_primary"] is False

    remaining = {
        "chunk_scope": "standard",
        "regulation_short": "NIST CSF 2.0",
        "issuer": "NIST",
        "license": "public_domain",
    }
    for key, wanted in remaining.items():
        assert meta[key] == wanted
|
||||
|
||||
|
||||
def test_build_upload_unit_tags_version_and_collection():
    """The upload unit carries the run-tagged version, collection and filename."""
    body = "A" * 300
    unit = build_upload_unit(SPEC, body, "run9")

    observed = (unit.document_version, unit.collection, unit.filename)
    assert observed == ("run9-nist_csf_2_0", "bp_compliance_ce", "nist_csf_2_0.txt")
    assert unit.meta["use_for_primary"] is False
|
||||
|
||||
|
||||
def test_noncommercial_license_is_carried_for_the_gate():
    """A non-commercial license tag passes through to the metadata unchanged."""
    ccm = StandardSpec(
        source_id="csa_ccm",
        short="CSA CCM",
        title="Cloud Controls Matrix",
        publisher="CSA",
        url="https://...",
        license="CC-BY-NC",
    )
    carried = standard_meta(ccm)["license"]
    assert carried == "CC-BY-NC"  # commercial gate can refuse it
|
||||
Reference in New Issue
Block a user