8a44e67293
User: 'wir haben 1800 MCs erstellt um sie zu 10% zu nutzen — das ist Schwachsinn'. Fixed all 6 gaps from the audit. #1 max_controls=0 (was 20): - agent_compliance_check_routes _check_single: passes max_controls=0 to check_document_with_controls -> ALL MCs evaluated per doc_type. - 8 doc_types now use 1874 MCs instead of 160 (10x coverage). - Regex matching is cheap (<1s per doc); LLM-enrich cap of 10 stays. #2 LLM-verify fixed: - llm_verify.py was getting 0/N parsed. Causes: qwen3 thinking-mode wrapped output in <think>...</think>, /api/generate doesn't enforce JSON, prompt didn't handle code-fence wrappers. - Now uses /api/chat with format='json' (forces valid JSON). - _parse_batch_response strips <think> tags, accepts {results:[...]} AND bare [...], adds richer regex-fallback parse, logs raw head on total parse failure for diagnosis. #3 Loeschkonzept checklist (new): - doc_checks/loeschkonzept_checks.py — 9 L1 + 7 L2 checks per DIN 66398 + Art. 5(1)(e)/17/32 DSGVO: scope+responsibility, data categories, retention periods, legal basis refs (HGB/AO/BGB), deletion trigger, deletion process+technical+systems, deletion proof, exceptions + Art. 18 lock, review cycle, DSGVO references. - runner.py registered for loeschkonzept/loeschung/loeschfristen. #4 regulation backfill script: - backend-compliance/scripts/backfill_mc_regulation.py — regex-detects DSGVO/TDDDG/TMG/BGB/HGB/AO/MStV/UWG/VSBG/PAngV/GwG/BDSG/EU-VO references in MC title+question+pass_criteria, UPDATEs regulation + article fields. - Idempotent (only NULL rows), --dry-run flag, batched 200/UPDATE. - Run inside container: docker exec bp-compliance-backend python3 \ /app/scripts/backfill_mc_regulation.py #5 MC alias-fallback: - rag_document_checker._MC_ALIAS_FALLBACK maps doc_types without own MCs to a related set: nutzungsbedingungen->agb, social_media->dse, sub_processor/scc/tom_annex->avv, loeschfristen->loeschkonzept, eu_institution/dsb->dse. 
- _load_controls retries with the alias when the primary query returns 0 rows. - 14 additional doc_types now get MC coverage transparently. #6 cross-domain auto-discovery: - _autodiscover_missing builds a crawl plan: primary submitted base + up to 2 related domains sharing the owner SLD (e.g. BMW Group: bmw.de + bmwgroup.com + bmwgroup.jobs). - Detection: regex over submitted texts for https?://...<owner>... hostnames distinct from the primary base. - Each crawled base contributes documents + cmp_payloads to the discovery pool. Net effect for BMW: 1874 MCs evaluated (90 from cookie alone, was 20), Loeschkonzept Pflichtangaben benoten-bar, LLM overturns false regex FAILs, Joint-Controller policies on bmwgroup.jobs (Social Media) jetzt entdeckbar. Same wins will apply to CRA-Compliance check.
181 lines
5.4 KiB
Python
181 lines
5.4 KiB
Python
"""
|
|
Backfill the doc_check_controls.regulation + .article fields.
|
|
|
|
The fields are currently NULL on all 1874 rows. Many MCs cite the
|
|
relevant norm inline in title / check_question / pass_criteria
|
|
(e.g. 'Art. 6 Abs. 1 lit. a DSGVO', '§ 25 Abs. 1 TDDDG'). We detect
|
|
those with regex and UPDATE the row.
|
|
|
|
Run inside the bp-compliance-backend container:
|
|
docker exec bp-compliance-backend python3 /app/scripts/backfill_mc_regulation.py [--dry-run]
|
|
|
|
The script is idempotent: existing non-null regulation is never overwritten.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import os
|
|
import re
|
|
import sys
|
|
|
|
import asyncpg
|
|
|
|
|
|
# Ordered: first match wins. Each pattern captures (article_str, regulation_label).
|
|
_PATTERNS: list[tuple[re.Pattern[str], str]] = [
|
|
# Art. X DSGVO / GDPR / EU 2016/679
|
|
(re.compile(
|
|
r"\b(?:art\.?|artikel)\s*"
|
|
r"(\d+[a-z]?(?:\s*(?:abs\.?|absatz)\s*\d+)?"
|
|
r"(?:\s*(?:lit\.?|litera|buchstabe)\s*[a-z])?"
|
|
r"(?:\s*satz\s*\d+)?)"
|
|
r"\s*(?:dsgvo|gdpr|vo\s*\(eu\)\s*2016/679|eu[\s-]?vo\s*2016/679)",
|
|
re.I,
|
|
), "DSGVO"),
|
|
# § X TDDDG / TTDSG
|
|
(re.compile(
|
|
r"§\s*(\d+[a-z]?(?:\s*abs\.?\s*\d+)?(?:\s*(?:nr\.?|lit\.?)\s*\w+)?)\s*"
|
|
r"(?:tdddg|ttdsg|tkg)",
|
|
re.I,
|
|
), "TDDDG"),
|
|
# § X TMG
|
|
(re.compile(
|
|
r"§\s*(\d+[a-z]?(?:\s*abs\.?\s*\d+)?(?:\s*nr\.?\s*\d+)?)\s*tmg\b",
|
|
re.I,
|
|
), "TMG"),
|
|
# § X BGB
|
|
(re.compile(
|
|
r"§\s*(\d+[a-z]?(?:\s*abs\.?\s*\d+)?)\s*bgb\b",
|
|
re.I,
|
|
), "BGB"),
|
|
# § X HGB
|
|
(re.compile(
|
|
r"§\s*(\d+[a-z]?)\s*hgb\b",
|
|
re.I,
|
|
), "HGB"),
|
|
# § X AO
|
|
(re.compile(
|
|
r"§\s*(\d+[a-z]?)\s*ao\b",
|
|
re.I,
|
|
), "AO"),
|
|
# § X MStV (Medienstaatsvertrag)
|
|
(re.compile(
|
|
r"§\s*(\d+[a-z]?)\s*m(?:edien)?st(?:aat)?v\b",
|
|
re.I,
|
|
), "MStV"),
|
|
# § X UWG
|
|
(re.compile(
|
|
r"§\s*(\d+[a-z]?)\s*uwg\b",
|
|
re.I,
|
|
), "UWG"),
|
|
# § X VSBG (Verbraucherstreitbeilegung)
|
|
(re.compile(
|
|
r"§\s*(\d+[a-z]?)\s*vsbg\b",
|
|
re.I,
|
|
), "VSBG"),
|
|
# § X PAngV
|
|
(re.compile(
|
|
r"§\s*(\d+[a-z]?)\s*p(?:reis)?ang?v\b",
|
|
re.I,
|
|
), "PAngV"),
|
|
# § X GwG
|
|
(re.compile(
|
|
r"§\s*(\d+[a-z]?)\s*gwg\b",
|
|
re.I,
|
|
), "GwG"),
|
|
# § X BDSG
|
|
(re.compile(
|
|
r"§\s*(\d+[a-z]?)\s*bdsg\b",
|
|
re.I,
|
|
), "BDSG"),
|
|
# EU-VO 524/2013 (ODR), 2018/1725 (EU-DSGVO) etc.
|
|
(re.compile(
|
|
r"\bart\.?\s*(\d+)\s*(?:eu[\s-]?vo|vo|verordnung)\s*(?:\(eu\)\s*)?(\d+/\d+)",
|
|
re.I,
|
|
), "EU-VO"),
|
|
# Norm names without numbers, last resort (set article=NULL)
|
|
(re.compile(r"\bdsgvo\b", re.I), "DSGVO"),
|
|
(re.compile(r"\btdddg\b|\bttdsg\b", re.I), "TDDDG"),
|
|
(re.compile(r"\btmg\b", re.I), "TMG"),
|
|
(re.compile(r"\bbgb\b", re.I), "BGB"),
|
|
(re.compile(r"\bmstv\b", re.I), "MStV"),
|
|
(re.compile(r"\buwg\b", re.I), "UWG"),
|
|
(re.compile(r"\bvsbg\b", re.I), "VSBG"),
|
|
(re.compile(r"\bgwg\b", re.I), "GwG"),
|
|
(re.compile(r"\bbdsg\b", re.I), "BDSG"),
|
|
]
|
|
|
|
|
|
def detect(text: str) -> tuple[str | None, str | None]:
|
|
"""Return (regulation, article) for the first pattern that matches."""
|
|
if not text:
|
|
return None, None
|
|
for pat, label in _PATTERNS:
|
|
m = pat.search(text)
|
|
if m:
|
|
article = m.group(1) if m.groups() else None
|
|
if label == "EU-VO" and m.lastindex and m.lastindex >= 2:
|
|
article = f"Art. {m.group(1)} EU-VO {m.group(2)}"
|
|
elif article:
|
|
article = re.sub(r"\s+", " ", article).strip()
|
|
return label, article
|
|
return None, None
|
|
|
|
|
|
async def main(dry_run: bool = False) -> None:
    """Backfill regulation/article on doc_check_controls rows with NULL regulation.

    Reads the connection string from the ``DATABASE_URL`` environment
    variable, loads every row of ``compliance.doc_check_controls`` whose
    ``regulation`` is NULL, runs ``detect()`` over the concatenated
    title / check_question / pass_criteria text and prints a per-regulation
    hit summary.

    Args:
        dry_run: When true, only the summary is printed and nothing is written.

    Raises:
        SystemExit: With code 1 when ``DATABASE_URL`` is not set.
    """
    db = os.getenv("DATABASE_URL")
    if not db:
        print("DATABASE_URL not set", file=sys.stderr)
        sys.exit(1)
    conn = await asyncpg.connect(db)
    try:
        rows = await conn.fetch(
            "SELECT id, title, check_question, pass_criteria::text AS pc "
            "FROM compliance.doc_check_controls "
            "WHERE regulation IS NULL"
        )
        print(f"{len(rows)} MCs with NULL regulation")

        updates: list[tuple[str | None, str | None, str]] = []
        hits = {"DSGVO": 0, "TDDDG": 0, "TMG": 0, "BGB": 0, "MStV": 0,
                "UWG": 0, "VSBG": 0, "EU-VO": 0, "HGB": 0, "AO": 0,
                "PAngV": 0, "GwG": 0, "BDSG": 0}
        no_match = 0
        for r in rows:
            # filter(None, ...) already drops NULL / empty columns.
            combined = " ".join(
                filter(None, (r["title"], r["check_question"], r["pc"]))
            )
            reg, art = detect(combined)
            if reg:
                hits[reg] = hits.get(reg, 0) + 1
                updates.append((reg, art, str(r["id"])))
            else:
                no_match += 1

        print(f"Detected: {sum(hits.values())} | no match: {no_match}")
        for k, v in sorted(hits.items(), key=lambda x: -x[1]):
            if v:
                print(f"  {k:8s} {v:>5}")

        if dry_run:
            print("\nDRY RUN — no changes written. Re-run without --dry-run to apply.")
            return

        # Apply updates in batches inside one transaction so a failure midway
        # does not leave a half-finished backfill behind.
        BATCH = 200
        async with conn.transaction():
            for i in range(0, len(updates), BATCH):
                await conn.executemany(
                    "UPDATE compliance.doc_check_controls "
                    "SET regulation = $1, article = $2 "
                    # Re-check NULL at write time: a row filled in since the
                    # SELECT above must not be overwritten.
                    "WHERE id = $3::uuid AND regulation IS NULL",
                    updates[i:i + BATCH],
                )
        print(f"\nApplied {len(updates)} updates.")
    finally:
        # Close the connection on every exit path, including errors.
        await conn.close()
|
|
|
|
|
|
if __name__ == "__main__":
    # argparse rejects unknown flags, so a typo such as --dryrun aborts
    # instead of silently running the real (writing) backfill.
    import argparse

    _parser = argparse.ArgumentParser(
        description="Backfill regulation/article on compliance.doc_check_controls.",
    )
    _parser.add_argument(
        "--dry-run",
        action="store_true",
        help="report what would be updated without writing to the database",
    )
    asyncio.run(main(dry_run=_parser.parse_args().dry_run))
|