""" Backfill the doc_check_controls.regulation + .article fields. The fields are currently NULL on all 1874 rows. Many MCs cite the relevant norm inline in title / check_question / pass_criteria (e.g. 'Art. 6 Abs. 1 lit. a DSGVO', '§ 25 Abs. 1 TDDDG'). We detect those with regex and UPDATE the row. Run inside the bp-compliance-backend container: docker exec bp-compliance-backend python3 /app/scripts/backfill_mc_regulation.py [--dry-run] The script is idempotent: existing non-null regulation is never overwritten. """ from __future__ import annotations import asyncio import os import re import sys import asyncpg # Ordered: first match wins. Each pattern captures (article_str, regulation_label). _PATTERNS: list[tuple[re.Pattern[str], str]] = [ # Art. X DSGVO / GDPR / EU 2016/679 (re.compile( r"\b(?:art\.?|artikel)\s*" r"(\d+[a-z]?(?:\s*(?:abs\.?|absatz)\s*\d+)?" r"(?:\s*(?:lit\.?|litera|buchstabe)\s*[a-z])?" r"(?:\s*satz\s*\d+)?)" r"\s*(?:dsgvo|gdpr|vo\s*\(eu\)\s*2016/679|eu[\s-]?vo\s*2016/679)", re.I, ), "DSGVO"), # § X TDDDG / TTDSG (re.compile( r"§\s*(\d+[a-z]?(?:\s*abs\.?\s*\d+)?(?:\s*(?:nr\.?|lit\.?)\s*\w+)?)\s*" r"(?:tdddg|ttdsg|tkg)", re.I, ), "TDDDG"), # § X TMG (re.compile( r"§\s*(\d+[a-z]?(?:\s*abs\.?\s*\d+)?(?:\s*nr\.?\s*\d+)?)\s*tmg\b", re.I, ), "TMG"), # § X BGB (re.compile( r"§\s*(\d+[a-z]?(?:\s*abs\.?\s*\d+)?)\s*bgb\b", re.I, ), "BGB"), # § X HGB (re.compile( r"§\s*(\d+[a-z]?)\s*hgb\b", re.I, ), "HGB"), # § X AO (re.compile( r"§\s*(\d+[a-z]?)\s*ao\b", re.I, ), "AO"), # § X MStV (Medienstaatsvertrag) (re.compile( r"§\s*(\d+[a-z]?)\s*m(?:edien)?st(?:aat)?v\b", re.I, ), "MStV"), # § X UWG (re.compile( r"§\s*(\d+[a-z]?)\s*uwg\b", re.I, ), "UWG"), # § X VSBG (Verbraucherstreitbeilegung) (re.compile( r"§\s*(\d+[a-z]?)\s*vsbg\b", re.I, ), "VSBG"), # § X PAngV (re.compile( r"§\s*(\d+[a-z]?)\s*p(?:reis)?ang?v\b", re.I, ), "PAngV"), # § X GwG (re.compile( r"§\s*(\d+[a-z]?)\s*gwg\b", re.I, ), "GwG"), # § X BDSG (re.compile( r"§\s*(\d+[a-z]?)\s*bdsg\b", re.I, ), "BDSG"), # EU-VO 524/2013 (ODR), 2018/1725 (EU-DSGVO) etc. (re.compile( r"\bart\.?\s*(\d+)\s*(?:eu[\s-]?vo|vo|verordnung)\s*(?:\(eu\)\s*)?(\d+/\d+)", re.I, ), "EU-VO"), # Norm names without numbers, last resort (set article=NULL) (re.compile(r"\bdsgvo\b", re.I), "DSGVO"), (re.compile(r"\btdddg\b|\bttdsg\b", re.I), "TDDDG"), (re.compile(r"\btmg\b", re.I), "TMG"), (re.compile(r"\bbgb\b", re.I), "BGB"), (re.compile(r"\bmstv\b", re.I), "MStV"), (re.compile(r"\buwg\b", re.I), "UWG"), (re.compile(r"\bvsbg\b", re.I), "VSBG"), (re.compile(r"\bgwg\b", re.I), "GwG"), (re.compile(r"\bbdsg\b", re.I), "BDSG"), ] def detect(text: str) -> tuple[str | None, str | None]: """Return (regulation, article) for the first pattern that matches.""" if not text: return None, None for pat, label in _PATTERNS: m = pat.search(text) if m: article = m.group(1) if m.groups() else None if label == "EU-VO" and m.lastindex and m.lastindex >= 2: article = f"Art. {m.group(1)} EU-VO {m.group(2)}" elif article: article = re.sub(r"\s+", " ", article).strip() return label, article return None, None async def main(dry_run: bool = False) -> None: db = os.getenv("DATABASE_URL") if not db: print("DATABASE_URL not set", file=sys.stderr) sys.exit(1) conn = await asyncpg.connect(db) rows = await conn.fetch( "SELECT id, title, check_question, pass_criteria::text AS pc " "FROM compliance.doc_check_controls " "WHERE regulation IS NULL" ) print(f"{len(rows)} MCs with NULL regulation") updates: list[tuple[str | None, str | None, str]] = [] hits = {"DSGVO": 0, "TDDDG": 0, "TMG": 0, "BGB": 0, "MStV": 0, "UWG": 0, "VSBG": 0, "EU-VO": 0, "HGB": 0, "AO": 0, "PAngV": 0, "GwG": 0, "BDSG": 0} no_match = 0 for r in rows: combined = " ".join(filter(None, [ r["title"] or "", r["check_question"] or "", r["pc"] or "", ])) reg, art = detect(combined) if reg: hits[reg] = hits.get(reg, 0) + 1 updates.append((reg, art, str(r["id"]))) else: no_match += 1 print(f"Detected: {sum(hits.values())} | no match: {no_match}") for k, v in sorted(hits.items(), key=lambda x: -x[1]): if v: print(f" {k:8s} {v:>5}") if dry_run: print("\nDRY RUN — no changes written. Re-run without --dry-run to apply.") await conn.close() return # Apply updates in batches BATCH = 200 for i in range(0, len(updates), BATCH): chunk = updates[i:i + BATCH] await conn.executemany( "UPDATE compliance.doc_check_controls " "SET regulation = $1, article = $2 WHERE id = $3::uuid", chunk, ) print(f"\nApplied {len(updates)} updates.") await conn.close() if __name__ == "__main__": asyncio.run(main(dry_run="--dry-run" in sys.argv))