feat(pipeline): derive 1,874 doc_check_controls from Master Controls
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 45s
CI / test-python-voice (push) Successful in 44s
CI / test-bqas (push) Successful in 40s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 45s
CI / test-python-voice (push) Successful in 44s
CI / test-bqas (push) Successful in 40s
8 document types: DSE (571), Cookie (381), Löschkonzept (309), Widerrufsbelehrung (153), DSFA (147), AVV (125), AGB (113), Impressum (75). Each control has binary check_question + pass_criteria + fail_criteria. Derived via Claude Haiku from existing MCs filtered by regulation source. Table: compliance.doc_check_controls (local + production synced) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,310 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Derive doc_check_controls from existing Master Controls.
|
||||||
|
|
||||||
|
Filters MCs by document-relevant regulations, then uses Claude Haiku
|
||||||
|
to generate check_question + pass_criteria + fail_criteria per control.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 /app/scripts/derive_doc_check_controls.py --dry-run
|
||||||
|
python3 /app/scripts/derive_doc_check_controls.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
from sqlalchemy import create_engine, text
|
||||||
|
|
||||||
|
# Script-wide logging: INFO level with timestamped, level-tagged records.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("doc-check-derive")

# Database connection string; DATABASE_URL from the environment takes
# precedence over the built-in fallback.
# NOTE(review): the fallback embeds a username and password in source —
# confirm it is only ever used against a local development database.
DB_URL = os.environ.get(
    "DATABASE_URL",
    "postgresql://breakpilot:breakpilot123@postgres:5432/breakpilot_db",
)

# Anthropic Messages API endpoint and key. The key falls back to an empty
# string when ANTHROPIC_API_KEY is not set in the environment.
ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
|
||||||
|
|
||||||
|
# Document types and their regulation sources.
#
# One entry per document type; the key is stored as ``doc_type`` on every
# derived control. Fields, as consumed by get_doc_controls() and main():
#   name         - display name handed to enrich_with_llm() for the prompt
#   sources      - compared for equality with the parent control's
#                  source_citation ->> 'source'
#   articles     - SQL LIKE patterns applied to the parent control's
#                  source_citation ->> 'article'
#   extra_tokens - SQL LIKE patterns applied to master_controls.canonical_name
#
# NOTE(review): the article patterns are unanchored substring matches, so
# "%5%" also matches any article text containing e.g. 15, 25 or 50 —
# confirm this breadth is intended.
DOC_TYPES = {
    "dse": {
        "name": "Datenschutzinformation",
        "sources": ["DSGVO (EU) 2016/679"],
        "articles": ["%13%", "%14%"],
        "extra_tokens": ["personal_data%", "data_subject_rights%", "consent%",
                         "data_processing_register%", "data_transfer%"],
    },
    "cookie": {
        "name": "Cookie-Richtlinie",
        "sources": ["TDDDG", "ePrivacy-Richtlinie"],
        "articles": ["%25%", "%5%"],
        "extra_tokens": ["cookie_consent%", "consent%"],
    },
    "impressum": {
        "name": "Impressum",
        "sources": ["TMG"],
        "articles": ["%5%"],
        "extra_tokens": ["ecommerce%"],
    },
    "widerruf": {
        "name": "Widerrufsbelehrung",
        "sources": ["BGB"],
        "articles": ["%355%", "%312%"],
        "extra_tokens": ["consumer_protection%"],
    },
    "agb": {
        "name": "AGB",
        "sources": ["BGB"],
        "articles": ["%305%", "%307%", "%308%", "%309%"],
        "extra_tokens": ["consumer_protection%"],
    },
    "dsfa": {
        "name": "Datenschutz-Folgenabschaetzung",
        "sources": ["DSGVO (EU) 2016/679"],
        "articles": ["%35%"],
        "extra_tokens": ["dpia%"],
    },
    "avv": {
        "name": "Auftragsverarbeitung",
        "sources": ["DSGVO (EU) 2016/679"],
        "articles": ["%28%"],
        "extra_tokens": ["data_processing_agreement%"],
    },
    "loeschkonzept": {
        "name": "Loeschkonzept",
        "sources": ["DSGVO (EU) 2016/679"],
        "articles": ["%5%", "%17%"],
        "extra_tokens": ["data_retention%"],
    },
}
|
||||||
|
|
||||||
|
SYSTEM_PROMPT = """Du erzeugst binäre Prüfkriterien für Compliance-Dokumente.
|
||||||
|
|
||||||
|
Für jeden Control erzeugst du:
|
||||||
|
1. check_question: Eine JA/NEIN Frage die ein LLM anhand eines Dokuments beantworten kann
|
||||||
|
2. pass_criteria: Konkrete Textinhalte die vorhanden sein MÜSSEN (3-5 Stück)
|
||||||
|
3. fail_criteria: Typische Fehler/Mängel (2-3 Stück)
|
||||||
|
4. severity: HIGH, MEDIUM oder LOW
|
||||||
|
|
||||||
|
REGELN:
|
||||||
|
- check_question muss BINÄR beantwortbar sein (nicht "wie gut")
|
||||||
|
- pass_criteria müssen KONKRET sein ("Name + Rechtsform + Anschrift", nicht "Angaben")
|
||||||
|
- fail_criteria müssen TYPISCHE Fehler beschreiben
|
||||||
|
- Alles auf Deutsch
|
||||||
|
|
||||||
|
Antworte als JSON-Array:
|
||||||
|
[{"id":"...","check_question":"...","pass_criteria":["..."],"fail_criteria":["..."],"severity":"HIGH"}]"""
|
||||||
|
|
||||||
|
|
||||||
|
def _dedupe_by_control_id(controls: list[dict]) -> list[dict]:
    """Return *controls* without repeated ``control_id`` values (first one wins)."""
    seen = set()
    unique = []
    for control in controls:
        control_id = control["control_id"]
        if control_id in seen:
            continue
        seen.add(control_id)
        unique.append(control)
    return unique


def get_doc_controls(engine, doc_type: str, config: dict) -> list[dict]:
    """Collect the controls relevant for one document type.

    Two lookups are combined:

    1. Controls whose parent control cites one of ``config["sources"]`` with
       an article matching one of the ``config["articles"]`` LIKE patterns
       (at most 200 rows per source/article pair).
    2. Controls that are members of a master control whose ``canonical_name``
       matches one of the ``config["extra_tokens"]`` LIKE patterns
       (at most 100 rows per pattern).

    Deprecated and rejected controls are excluded from both lookups.

    Args:
        engine: SQLAlchemy engine connected to the compliance database.
        doc_type: Key of the document type; copied into every result.
        config: The ``DOC_TYPES`` entry for ``doc_type``. ``sources`` and
            ``articles`` are required, ``extra_tokens`` is optional.

    Returns:
        A list of dicts with the keys ``uuid``, ``control_id``, ``title``,
        ``objective``, ``article`` and ``doc_type``, de-duplicated by
        ``control_id``. Results of lookup 1 come first, so a control found by
        both lookups keeps its article.
    """
    # ORDER BY makes the LIMIT deterministic: without it the database may
    # return a different subset on every run once more rows match than the
    # limit allows.
    by_article_sql = text("""
        SELECT cc.id, cc.control_id, cc.title,
               COALESCE(cc.objective, '') as objective,
               pc.source_citation->>'article' as article
        FROM compliance.canonical_controls cc
        JOIN compliance.canonical_controls pc ON pc.id = cc.parent_control_uuid
        WHERE pc.source_citation->>'source' = :source
          AND pc.source_citation->>'article' LIKE :article
          AND cc.release_state NOT IN ('deprecated', 'rejected')
        ORDER BY cc.control_id
        LIMIT 200
    """)
    by_name_sql = text("""
        SELECT cc.id, cc.control_id, cc.title,
               COALESCE(cc.objective, '') as objective
        FROM compliance.master_controls mc
        JOIN compliance.master_control_members mcm ON mcm.master_control_uuid = mc.id
        JOIN compliance.canonical_controls cc ON cc.id = mcm.control_uuid
        WHERE mc.canonical_name LIKE :pattern
          AND cc.release_state NOT IN ('deprecated', 'rejected')
        ORDER BY cc.control_id
        LIMIT 100
    """)

    controls = []

    # One connection serves all lookups instead of a fresh one per query.
    with engine.connect() as conn:
        # Strategy 1: by source + article
        for source in config["sources"]:
            for article in config["articles"]:
                rows = conn.execute(
                    by_article_sql, {"source": source, "article": article}
                ).fetchall()
                for uuid, control_id, title, objective, matched_article in rows:
                    controls.append({
                        "uuid": str(uuid), "control_id": control_id,
                        "title": title or "", "objective": objective or "",
                        "article": matched_article or "", "doc_type": doc_type,
                    })

        # Strategy 2: by master control canonical_name
        for token_pattern in config.get("extra_tokens", []):
            rows = conn.execute(
                by_name_sql, {"pattern": token_pattern}
            ).fetchall()
            for uuid, control_id, title, objective in rows:
                controls.append({
                    "uuid": str(uuid), "control_id": control_id,
                    "title": title or "", "objective": objective or "",
                    "article": "", "doc_type": doc_type,
                })

    return _dedupe_by_control_id(controls)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_results(content: str) -> dict:
    """Parse the outermost JSON array in *content* into an ``id -> item`` map.

    Returns an empty dict when no array is present. Items that are not
    objects or have no string ``id`` are ignored. Raises ``ValueError`` when
    the array text is not valid JSON.
    """
    start = content.find("[")
    end = content.rfind("]") + 1
    if start < 0 or end <= start:
        return {}
    parsed = json.loads(content[start:end])
    return {
        item["id"]: item
        for item in parsed
        if isinstance(item, dict) and isinstance(item.get("id"), str)
    }


def _as_list(value) -> list:
    """Return *value* if it is a list, wrap other truthy values, else ``[]``."""
    if isinstance(value, list):
        return value
    return [value] if value else []


def enrich_with_llm(controls: list[dict], doc_type_name: str) -> list[dict]:
    """Add check_question, pass/fail criteria and severity via Claude Haiku.

    Controls are sent in batches of five. A batch whose request fails or
    whose reply cannot be parsed is logged and skipped; the remaining batches
    are still processed.

    Args:
        controls: Control dicts providing at least ``control_id``, ``title``
            and ``objective``. The input dicts are not modified.
        doc_type_name: Display name of the document type, used in the prompt.

    Returns:
        Copies of those controls for which the model returned a non-empty
        ``check_question``, extended with ``check_question``,
        ``pass_criteria``, ``fail_criteria`` and ``severity``. Controls
        without a usable answer are omitted.
    """
    if not controls:
        return []
    if not ANTHROPIC_API_KEY:
        # Every request would be rejected anyway; avoid one failing HTTP call
        # per batch and report the actual cause once.
        logger.error(
            "ANTHROPIC_API_KEY is not set; skipping enrichment of %d controls",
            len(controls),
        )
        return []

    valid_severities = ("HIGH", "MEDIUM", "LOW")
    batch_size = 5
    enriched = []
    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }

    # A single client reuses the HTTP connection across batches.
    with httpx.Client(headers=headers, timeout=45.0) as client:
        for i in range(0, len(controls), batch_size):
            if i:
                # Pause between requests; nothing to wait for before the first.
                time.sleep(0.5)

            batch = controls[i:i + batch_size]
            items = [
                f'- id="{c["control_id"]}" doc="{doc_type_name}" '
                f't="{c["title"]}" o="{c["objective"][:100]}"'
                for c in batch
            ]
            prompt = (
                f"Dokumenttyp: {doc_type_name}\n"
                f"Erzeuge Prüfkriterien:\n" + "\n".join(items)
            )

            try:
                resp = client.post(ANTHROPIC_URL, json={
                    "model": "claude-haiku-4-5-20251001",
                    "max_tokens": 2000, "temperature": 0.1,
                    "system": SYSTEM_PROMPT,
                    "messages": [{"role": "user", "content": prompt}],
                })
                resp.raise_for_status()
                payload = resp.json()
                # The reply text is read from the first content block;
                # tolerate a missing or empty block list instead of raising.
                blocks = payload.get("content") if isinstance(payload, dict) else None
                first = blocks[0] if isinstance(blocks, list) and blocks else {}
                content = first.get("text", "") if isinstance(first, dict) else ""
                result_map = _extract_results(content or "")
            except (httpx.HTTPError, ValueError) as e:
                # Transport/status errors and malformed JSON affect only this
                # batch; other exception types indicate a bug and propagate.
                logger.error("Batch %d failed: %s", i, e)
                continue

            matched = 0
            for ctrl in batch:
                result = result_map.get(ctrl["control_id"])
                question = result.get("check_question") if result else None
                if not isinstance(question, str) or not question.strip():
                    continue
                severity = str(result.get("severity", "MEDIUM")).upper()
                enriched.append({
                    **ctrl,
                    "check_question": question,
                    "pass_criteria": _as_list(result.get("pass_criteria")),
                    "fail_criteria": _as_list(result.get("fail_criteria")),
                    # Anything outside the three prompted levels falls back
                    # to the default used when severity is absent.
                    "severity": severity if severity in valid_severities else "MEDIUM",
                })
                matched += 1

            if matched < len(batch):
                logger.warning(
                    "Batch %d: %d of %d controls got no usable answer",
                    i, len(batch) - matched, len(batch),
                )

    return enriched
|
||||||
|
|
||||||
|
|
||||||
|
def _ensure_table(conn) -> None:
    """Create the doc_check_controls table and its doc_type index if missing."""
    conn.execute(text("""
        CREATE TABLE IF NOT EXISTS doc_check_controls (
            id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
            control_id VARCHAR(500) NOT NULL,
            control_uuid UUID,
            doc_type VARCHAR(50) NOT NULL,
            title VARCHAR(500),
            regulation VARCHAR(200),
            article VARCHAR(100),
            check_question TEXT NOT NULL,
            pass_criteria JSONB DEFAULT '[]',
            fail_criteria JSONB DEFAULT '[]',
            severity VARCHAR(20) DEFAULT 'MEDIUM',
            created_at TIMESTAMPTZ DEFAULT NOW()
        )
    """))
    conn.execute(text("""
        CREATE INDEX IF NOT EXISTS idx_doc_check_doc_type
        ON doc_check_controls(doc_type)
    """))


def _write_checks(engine, checks: list[dict]) -> None:
    """Replace the stored rows of every doc type present in *checks*.

    Runs in one transaction. Only doc types that actually occur in *checks*
    are deleted and re-inserted; rows of all other doc types stay untouched.
    """
    rows = [
        {
            "cid": dc["control_id"],
            "uuid": dc["uuid"],
            "doc_type": dc["doc_type"],
            "title": dc["title"],
            # The column is VARCHAR(100); store NULL rather than ''.
            "article": (dc.get("article") or "")[:100] or None,
            "question": dc.get("check_question", ""),
            "pass": json.dumps(dc.get("pass_criteria", [])),
            "fail": json.dumps(dc.get("fail_criteria", [])),
            "severity": dc.get("severity", "MEDIUM"),
        }
        for dc in checks
    ]
    refreshed_types = sorted({row["doc_type"] for row in rows})

    with engine.begin() as conn:
        conn.execute(text("SET search_path TO compliance, public"))
        _ensure_table(conn)
        for doc_type in refreshed_types:
            conn.execute(
                text("DELETE FROM doc_check_controls WHERE doc_type = :doc_type"),
                {"doc_type": doc_type},
            )
        # A list of parameter dicts is executed as one batched insert.
        conn.execute(text("""
            INSERT INTO doc_check_controls
                (control_id, control_uuid, doc_type, title, article,
                 check_question, pass_criteria, fail_criteria, severity)
            VALUES (:cid, CAST(:uuid AS uuid), :doc_type, :title, :article,
                    :question, CAST(:pass AS jsonb),
                    CAST(:fail AS jsonb), :severity)
        """), rows)


def main():
    """Derive doc_check_controls for all (or one) document type.

    Command line:
        --dry-run   Collect and enrich controls, log the first five results
                    and exit without creating or modifying any table.
        --doc-type  Restrict the run to a single key of ``DOC_TYPES``.

    Without ``--dry-run`` the enriched controls replace the existing rows of
    the processed document types in ``doc_check_controls`` and are also saved
    to ``/tmp/doc_check_controls.json``. When enrichment yields nothing, the
    database and the JSON file are left as they are.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--doc-type", choices=list(DOC_TYPES.keys()),
                        help="Only one doc type")
    args = parser.parse_args()

    engine = create_engine(
        DB_URL, connect_args={"options": "-c search_path=compliance,public"}
    )

    doc_types = [args.doc_type] if args.doc_type else list(DOC_TYPES.keys())
    all_checks = []

    for dt in doc_types:
        config = DOC_TYPES[dt]
        logger.info("\n=== %s (%s) ===", dt, config["name"])

        controls = get_doc_controls(engine, dt, config)
        logger.info("Found %d relevant controls", len(controls))

        if not controls:
            continue

        enriched = enrich_with_llm(controls, config["name"])
        logger.info("Enriched %d with check criteria", len(enriched))
        all_checks.extend(enriched)

    logger.info("\nTotal: %d doc_check_controls across %d doc types",
                len(all_checks), len(doc_types))

    if args.dry_run:
        for dc in all_checks[:5]:
            logger.info("  [%s] %s: %s", dc["doc_type"], dc["control_id"],
                        dc.get("check_question", "?")[:80])
        logger.info("DRY RUN — not writing")
        return

    if not all_checks:
        # Nothing was derived (e.g. every LLM batch failed); deleting the
        # existing rows now would only lose data.
        logger.warning("No doc_check_controls derived; leaving existing data untouched")
        return

    # Write to DB
    _write_checks(engine, all_checks)
    logger.info("Wrote %d doc_check_controls to DB", len(all_checks))

    # Save as JSON too. The explicit encoding matters because
    # ensure_ascii=False emits non-ASCII characters verbatim.
    Path("/tmp/doc_check_controls.json").write_text(
        json.dumps(all_checks, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    logger.info("Saved to /tmp/doc_check_controls.json")


if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user