Files
breakpilot-core/control-pipeline/scripts/atom_citation_inheritance.py
T
Benjamin Admin f398088fbb
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 36s
CI / test-python-voice (push) Successful in 40s
CI / test-bqas (push) Successful in 38s
feat(controls): atom-inheritance schema-aware (text + jsonb source_citation)
Prod canonical_controls.source_citation ist text-mit-JSON (DB-Swap-Anomalie),
macmini ist jsonb. _art()-Helper nutzt pg_input_is_valid(col::text,'jsonb') +
(col::text)::jsonb->>'article' (PG16+) -> ein Skript fuer beide Schemata.
Prod-Apply 2026-06-21 verifiziert: Zitierfaehigkeit 6,8%->60,8% (+169.755),
Stichprobe 8/8 korrekt. macmini-Dry-Run 0 (idempotent, kein Regress).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-21 22:44:38 +02:00

149 lines
4.8 KiB
Python

#!/usr/bin/env python3
"""Inherit source_citation from parent to atom controls.
Background
==========
citation_backfill.py fills source_citation on the *source-bearing* controls
(those with source_original_text — ~2-7 %) by re-linking them to the
re-ingested, article_label-bearing chunks. The remaining ~93 % are "atom"
controls (decompositions) that carry a parent_control_uuid but no own citation.
They cite the SAME norm as their parent, so the citation can be inherited —
no re-matching needed.
Self-written controls (license_rule = 3) are skipped (no external source).
Runs in idempotent iterations (atom -> master -> grandmaster) and prints
per-stage counts before any write. Safe to rerun — only fills rows whose
source_citation lacks an 'article'.
Usage::

    python3 scripts/atom_citation_inheritance.py \\
        --db-url postgresql://USER:PASSWORD@HOST:5432/breakpilot_db --dry-run
    python3 scripts/atom_citation_inheritance.py \\
        --db-url postgresql://USER:PASSWORD@HOST:5432/breakpilot_db --apply

If --db-url is omitted, $DATABASE_URL is used (falling back to the localhost
default).
"""
from __future__ import annotations
import argparse
import os
import sys
# Default connection string for --db-url: $DATABASE_URL when it is set,
# otherwise a localhost database.
DB_URL = os.environ.get(
    "DATABASE_URL",
    "postgresql://breakpilot:breakpilot@localhost:5432/breakpilot_db",
)
def _art(alias: str) -> str:
"""SQL for source_citation->>'article' that works whether the column is jsonb
(macmini) or text-containing-JSON (prod schema anomaly from the DB swap).
pg_input_is_valid (PG16+) guards rows with invalid JSON so the cast never errors."""
col = f"{alias}.source_citation"
return (
f"(CASE WHEN {col} IS NOT NULL AND pg_input_is_valid({col}::text, 'jsonb') "
f"THEN ({col}::text)::jsonb->>'article' ELSE NULL END)"
)
# A row "needs" a citation while its article is still NULL or empty.
_NEEDS = "({0} IS NULL OR {0} = '')".format(_art("cc"))
# A parent can supply one only when it carries a non-empty article itself.
_PARENT_HAS = "({0} IS NOT NULL AND {0} <> '')".format(_art("p"))
# Report query: buckets every control that still lacks an article (excluding
# license_rule = 3) by whether it has a parent and whether that parent carries
# an article. The parent is aliased ``p`` — the alias _PARENT_HAS is built
# for — so the predicate is embedded as-is instead of being rewritten with
# str.replace, which would also touch any other "p." substring.
SQL_REPORT = f"""
SET search_path TO compliance, public;
SELECT
    CASE WHEN cc.parent_control_uuid IS NULL THEN 'no_parent'
         WHEN {_PARENT_HAS} THEN 'parent_has_article'
         ELSE 'parent_no_article' END AS bucket,
    COUNT(*) AS n
FROM canonical_controls cc
LEFT JOIN canonical_controls p ON cc.parent_control_uuid = p.id
WHERE {_NEEDS}
  AND cc.license_rule IS DISTINCT FROM 3
GROUP BY 1 ORDER BY 2 DESC;
"""
# One inheritance pass: copy the parent's source_citation (and bump updated_at)
# onto every child row that still lacks an article, provided the parent has a
# non-empty article and the child's license_rule is not 3.
SQL_INHERIT = f"""
SET search_path TO compliance, public;
UPDATE canonical_controls cc
SET source_citation = p.source_citation, updated_at = NOW()
FROM canonical_controls p
WHERE cc.parent_control_uuid = p.id
AND {_NEEDS}
AND {_PARENT_HAS}
AND cc.license_rule IS DISTINCT FROM 3;
"""
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of this script.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the script entry point uses.

    Returns:
        Namespace with ``db_url``, ``max_iterations``, ``dry_run`` and
        ``apply``. Exactly one of ``--dry-run`` / ``--apply`` must be given;
        argparse exits with an error otherwise.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring is pre-formatted (heading, usage lines); the
        # default formatter would re-wrap it into a single paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--db-url", default=DB_URL,
                        help="Postgres URL (default: $DATABASE_URL)")
    parser.add_argument("--max-iterations", type=int, default=6,
                        help="Cap on inheritance iterations to avoid loops")
    mode = parser.add_mutually_exclusive_group(required=True)
    mode.add_argument("--dry-run", action="store_true")
    mode.add_argument("--apply", action="store_true")
    return parser.parse_args(argv)
def print_bucket(rows, label: str) -> None:
    """Print a titled table of ``(bucket, count)`` rows followed by a TOTAL line.

    Args:
        rows: Iterable of two-item rows: bucket name and its row count.
        label: Heading printed above the table.
    """
    print(f"\n## {label}")
    grand_total = 0
    for entry in rows:
        name, count = entry
        print(f" {name:20} {count:>8}")
        grand_total = grand_total + count
    print(f" {'TOTAL':20} {grand_total:>8}")
def main() -> int:
    """Report missing citations and, with --apply, inherit them from parents.

    The bucket report is always printed first. With --dry-run the function
    then prints how many rows a first inheritance pass would fill and rolls
    back. With --apply it executes SQL_INHERIT up to --max-iterations times,
    stopping at the first pass that updates no rows, commits once after the
    loop, and prints the report again.

    The connection is closed on every exit path, including errors, so an
    exception before the commit leaves no transaction open on the server.

    Returns:
        0 on success, 2 if psycopg2 is not installed.
    """
    args = parse_args()
    try:
        import psycopg2
    except ImportError:
        print("error: psycopg2 not installed", file=sys.stderr)
        return 2

    conn = psycopg2.connect(args.db_url)
    try:
        conn.autocommit = False
        cur = conn.cursor()

        print("=" * 60)
        print(" Atom citation inheritance — source_citation via parent")
        print(f" Mode: {'DRY-RUN' if args.dry_run else 'APPLY'}")
        print("=" * 60)

        cur.execute(SQL_REPORT)
        print_bucket(cur.fetchall(), "Controls without article (need citation)")

        if args.dry_run:
            # Same predicate as SQL_INHERIT, but only counting the rows.
            cur.execute(
                "SET search_path TO compliance, public; "
                f"SELECT COUNT(*) FROM canonical_controls cc "
                f"JOIN canonical_controls p ON cc.parent_control_uuid = p.id "
                f"WHERE {_NEEDS} AND {_PARENT_HAS} AND cc.license_rule IS DISTINCT FROM 3;"
            )
            print(f"\n## First inherit-pass would fill: {cur.fetchone()[0]} rows")
            print("\nNo writes performed. Use --apply to execute.")
            conn.rollback()
            return 0

        total = 0
        # Each pass sees the rows filled by the previous one (same transaction),
        # so citations propagate one parent level per iteration.
        for i in range(1, args.max_iterations + 1):
            cur.execute(SQL_INHERIT)
            updated = cur.rowcount
            total += updated
            print(f"\n iteration {i}: {updated} rows inherited")
            if updated == 0:
                break
        else:
            # The cap was exhausted without a pass that updated zero rows, so
            # deeper levels may still be unfilled.
            print(
                f"warning: stopped after --max-iterations={args.max_iterations} "
                "without reaching a pass that updates 0 rows; rerun to continue",
                file=sys.stderr,
            )
        conn.commit()
        print(f"\n✓ Total atoms inherited: {total}")

        cur.execute(SQL_REPORT)
        print_bucket(cur.fetchall(), "Remaining without article")
        return 0
    finally:
        # Closing the connection also discards any uncommitted work.
        conn.close()
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())