Files
Benjamin Admin 8a44e67293 feat(compliance-check): unlock all 1874 MCs + close gap-table items
User: 'wir haben 1800 MCs erstellt um sie zu 10% zu nutzen — das ist
Schwachsinn'. Fixed all 6 gaps from the audit.

#1 max_controls=0 (was 20):
- agent_compliance_check_routes _check_single: passes max_controls=0 to
  check_document_with_controls -> ALL MCs evaluated per doc_type.
- 8 doc_types now use 1874 MCs instead of 160 (10x coverage).
- Regex matching is cheap (<1s per doc); LLM-enrich cap of 10 stays.

#2 LLM-verify fixed:
- llm_verify.py was getting 0/N parsed. Causes: qwen3 thinking-mode
  wrapped output in <think>...</think>, /api/generate doesn't enforce
  JSON, prompt didn't handle code-fence wrappers.
- Now uses /api/chat with format='json' (forces valid JSON).
- _parse_batch_response strips <think> tags, accepts {results:[...]}
  AND bare [...], adds richer regex-fallback parse, logs raw head on
  total parse failure for diagnosis.

#3 Loeschkonzept checklist (new):
- doc_checks/loeschkonzept_checks.py — 9 L1 + 7 L2 checks per DIN 66398
  + Art. 5(1)(e)/17/32 DSGVO: scope+responsibility, data categories,
  retention periods, legal basis refs (HGB/AO/BGB), deletion trigger,
  deletion process+technical+systems, deletion proof, exceptions +
  Art. 18 lock, review cycle, DSGVO references.
- runner.py registered for loeschkonzept/loeschung/loeschfristen.

#4 regulation backfill script:
- backend-compliance/scripts/backfill_mc_regulation.py — regex-detects
  DSGVO/TDDDG/TMG/BGB/HGB/AO/MStV/UWG/VSBG/PAngV/GwG/BDSG/EU-VO
  references in MC title+question+pass_criteria, UPDATEs regulation +
  article fields.
- Idempotent (only NULL rows), --dry-run flag, batched 200/UPDATE.
- Run inside container: docker exec bp-compliance-backend python3 \
    /app/scripts/backfill_mc_regulation.py

#5 MC alias-fallback:
- rag_document_checker._MC_ALIAS_FALLBACK maps doc_types without own
  MCs to a related set: nutzungsbedingungen->agb, social_media->dse,
  sub_processor/scc/tom_annex->avv, loeschfristen->loeschkonzept,
  eu_institution/dsb->dse.
- _load_controls retries with the alias when the primary query
  returns 0 rows.
- 14 additional doc_types now get MC coverage transparently.

#6 cross-domain auto-discovery:
- _autodiscover_missing builds a crawl plan: primary submitted base
  + up to 2 related domains sharing the owner SLD (e.g. BMW Group:
  bmw.de + bmwgroup.com + bmwgroup.jobs).
- Detection: regex over submitted texts for https?://...<owner>...
  hostnames distinct from the primary base.
- Each crawled base contributes documents + cmp_payloads to the
  discovery pool.

Net effect for BMW: 1874 MCs evaluated (90 from cookie alone, was
20), Loeschkonzept mandatory disclosures can now be graded, the LLM
overturns false regex FAILs, and Joint-Controller policies on bmwgroup.jobs
(Social Media) are now discoverable. The same wins will apply to the CRA-Compliance check.
2026-05-17 13:07:50 +02:00

181 lines
5.4 KiB
Python

"""
Backfill the doc_check_controls.regulation + .article fields.
The fields are currently NULL on all 1874 rows. Many MCs cite the
relevant norm inline in title / check_question / pass_criteria
(e.g. 'Art. 6 Abs. 1 lit. a DSGVO', '§ 25 Abs. 1 TDDDG'). We detect
those with regex and UPDATE the row.
Run inside the bp-compliance-backend container:
docker exec bp-compliance-backend python3 /app/scripts/backfill_mc_regulation.py [--dry-run]
The script is idempotent: existing non-null regulation is never overwritten.
"""
from __future__ import annotations
import asyncio
import os
import re
import sys
import asyncpg
# Optional qualifiers that may follow the section number in a "§" citation,
# in the usual citation order: "Abs. 1", "Satz 2", "Nr. 3".
_SECTION_QUALIFIERS = (
    r"(?:\s*(?:abs\.?|absatz)\s*\d+)?"
    r"(?:\s*satz\s*\d+)?"
    r"(?:\s*(?:nr\.?|nummer)\s*\d+)?"
)


def _section_pattern(law: str) -> re.Pattern[str]:
    """Compile a case-insensitive pattern for ``§ <n> [qualifiers] <law>``.

    *law* is a regex fragment matching the statute abbreviation. Group 1
    captures the section number together with any Abs./Satz/Nr. qualifiers,
    so citations such as "§ 257 Abs. 4 HGB" are recognised instead of being
    skipped because of the qualifier between number and statute name.
    """
    return re.compile(
        r"§\s*(\d+[a-z]?" + _SECTION_QUALIFIERS + r")\s*(?:" + law + r")\b",
        re.I,
    )


# Ordered list of (compiled pattern, regulation label); the first pattern that
# matches wins. Patterns with a capture group capture the article/section
# string in group 1; the EU-VO pattern additionally captures the regulation
# number in group 2. The trailing bare-name patterns have no group, which
# results in article=NULL.
_PATTERNS: list[tuple[re.Pattern[str], str]] = [
    # Art. X DSGVO / GDPR / EU 2016/679
    (re.compile(
        r"\b(?:art\.?|artikel)\s*"
        r"(\d+[a-z]?(?:\s*(?:abs\.?|absatz)\s*\d+)?"
        r"(?:\s*(?:lit\.?|litera|buchstabe)\s*[a-z])?"
        r"(?:\s*satz\s*\d+)?)"
        r"\s*(?:dsgvo|gdpr|vo\s*\(eu\)\s*2016/679|eu[\s-]?vo\s*2016/679)",
        re.I,
    ), "DSGVO"),
    # § X TDDDG / TTDSG / TKG (all filed under the TDDDG label). Accepts an
    # optional "Satz N" and ends on a word boundary like the other statutes.
    (re.compile(
        r"§\s*(\d+[a-z]?(?:\s*abs\.?\s*\d+)?(?:\s*satz\s*\d+)?"
        r"(?:\s*(?:nr\.?|lit\.?)\s*\w+)?)\s*"
        r"(?:tdddg|ttdsg|tkg)\b",
        re.I,
    ), "TDDDG"),
    # § X TMG
    (_section_pattern(r"tmg"), "TMG"),
    # § X BGB
    (_section_pattern(r"bgb"), "BGB"),
    # § X HGB
    (_section_pattern(r"hgb"), "HGB"),
    # § X AO
    (_section_pattern(r"ao"), "AO"),
    # § X MStV (Medienstaatsvertrag)
    (_section_pattern(r"m(?:edien)?st(?:aat)?v"), "MStV"),
    # § X UWG
    (_section_pattern(r"uwg"), "UWG"),
    # § X VSBG (Verbraucherstreitbeilegung)
    (_section_pattern(r"vsbg"), "VSBG"),
    # § X PAngV
    (_section_pattern(r"p(?:reis)?ang?v"), "PAngV"),
    # § X GwG
    (_section_pattern(r"gwg"), "GwG"),
    # § X BDSG
    (_section_pattern(r"bdsg"), "BDSG"),
    # EU-VO 524/2013 (ODR), 2018/1725 (EU-DSGVO) etc.
    (re.compile(
        r"\b(?:art\.?|artikel)\s*(\d+)\s*(?:eu[\s-]?vo|vo|verordnung)\s*"
        r"(?:\(eu\)\s*)?(\d+/\d+)",
        re.I,
    ), "EU-VO"),
    # Norm names without numbers, last resort (set article=NULL)
    (re.compile(r"\bdsgvo\b", re.I), "DSGVO"),
    (re.compile(r"\btdddg\b|\bttdsg\b", re.I), "TDDDG"),
    (re.compile(r"\btmg\b", re.I), "TMG"),
    (re.compile(r"\bbgb\b", re.I), "BGB"),
    (re.compile(r"\bmstv\b", re.I), "MStV"),
    (re.compile(r"\buwg\b", re.I), "UWG"),
    (re.compile(r"\bvsbg\b", re.I), "VSBG"),
    (re.compile(r"\bgwg\b", re.I), "GwG"),
    (re.compile(r"\bbdsg\b", re.I), "BDSG"),
    # Appended last so they only affect rows nothing above matched. A bare
    # "AO" fallback is deliberately not added: the two-letter token is too
    # ambiguous without a "§" in front of it.
    (re.compile(r"\bhgb\b", re.I), "HGB"),
    (re.compile(r"\bpangv\b", re.I), "PAngV"),
]
def detect(text: str) -> tuple[str | None, str | None]:
    """Find the first regulation reference in *text*.

    The entries of ``_PATTERNS`` are tried in list order and the first one
    that matches decides the result.

    Returns:
        ``(regulation, article)``. Both are ``None`` when *text* is empty or
        nothing matches. ``article`` is ``None`` when the matching pattern has
        no capture group. For an "EU-VO" match with two captured groups the
        article is rendered as ``"Art. <n> EU-VO <number>"``; otherwise it is
        the captured text with whitespace runs collapsed to single spaces.
    """
    if not text:
        return None, None

    for pattern, regulation in _PATTERNS:
        match = pattern.search(text)
        if match is None:
            continue

        # EU regulations carry their own number, so build a composite label.
        if regulation == "EU-VO" and (match.lastindex or 0) >= 2:
            return regulation, f"Art. {match.group(1)} EU-VO {match.group(2)}"

        # Bare-name patterns define no group and therefore yield no article.
        if not match.groups():
            return regulation, None

        captured = match.group(1)
        if captured:
            return regulation, re.sub(r"\s+", " ", captured).strip()
        return regulation, captured

    return None, None
async def main(dry_run: bool = False) -> None:
    """Backfill regulation/article on controls whose regulation is NULL.

    Connects to the database named by the ``DATABASE_URL`` environment
    variable, selects every row of ``compliance.doc_check_controls`` with a
    NULL regulation, runs ``detect`` over the concatenated title,
    check_question and pass_criteria text, prints a per-regulation summary and
    (unless *dry_run* is set) writes the detected values back in batches.

    Args:
        dry_run: When true, print the summary only and write nothing.

    Raises:
        SystemExit: With exit code 1 when ``DATABASE_URL`` is not set.
    """
    db = os.getenv("DATABASE_URL")
    if not db:
        print("DATABASE_URL not set", file=sys.stderr)
        sys.exit(1)

    conn = await asyncpg.connect(db)
    # try/finally so the connection is released even if a query or the
    # detection step raises part-way through.
    try:
        rows = await conn.fetch(
            "SELECT id, title, check_question, pass_criteria::text AS pc "
            "FROM compliance.doc_check_controls "
            "WHERE regulation IS NULL"
        )
        print(f"{len(rows)} MCs with NULL regulation")

        updates: list[tuple[str | None, str | None, str]] = []
        # Pre-seeded so that ties in the summary keep a fixed order.
        hits = {"DSGVO": 0, "TDDDG": 0, "TMG": 0, "BGB": 0, "MStV": 0,
                "UWG": 0, "VSBG": 0, "EU-VO": 0, "HGB": 0, "AO": 0,
                "PAngV": 0, "GwG": 0, "BDSG": 0}
        no_match = 0
        for r in rows:
            # Search all three text fields at once; empty/NULL fields are skipped.
            combined = " ".join(
                part
                for part in (r["title"], r["check_question"], r["pc"])
                if part
            )
            reg, art = detect(combined)
            if reg:
                hits[reg] = hits.get(reg, 0) + 1
                updates.append((reg, art, str(r["id"])))
            else:
                no_match += 1

        print(f"Detected: {sum(hits.values())} | no match: {no_match}")
        for k, v in sorted(hits.items(), key=lambda x: -x[1]):
            if v:
                print(f"  {k:8s} {v:>5}")

        if dry_run:
            print("\nDRY RUN — no changes written. Re-run without --dry-run to apply.")
            return

        # Apply updates in batches. The "regulation IS NULL" guard keeps the
        # script from overwriting a row that was populated by someone else
        # after the SELECT above ran.
        batch_size = 200
        for start in range(0, len(updates), batch_size):
            chunk = updates[start:start + batch_size]
            await conn.executemany(
                "UPDATE compliance.doc_check_controls "
                "SET regulation = $1, article = $2 "
                "WHERE id = $3::uuid AND regulation IS NULL",
                chunk,
            )
        print(f"\nApplied {len(updates)} updates.")
    finally:
        await conn.close()
if __name__ == "__main__":
    # Parse the command line strictly: a mistyped flag such as "--dryrun"
    # must abort with a usage error instead of silently running in write mode.
    import argparse

    _cli = argparse.ArgumentParser(
        description="Backfill regulation/article on compliance.doc_check_controls.",
    )
    _cli.add_argument(
        "--dry-run",
        action="store_true",
        help="print the detection summary without writing to the database",
    )
    asyncio.run(main(dry_run=_cli.parse_args().dry_run))