f398088fbb
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 36s
CI / test-python-voice (push) Successful in 40s
CI / test-bqas (push) Successful in 38s
Prod canonical_controls.source_citation ist text-mit-JSON (DB-Swap-Anomalie), macmini ist jsonb. _art()-Helper nutzt pg_input_is_valid(col::text,'jsonb') + (col::text)::jsonb->>'article' (PG16+) -> ein Skript fuer beide Schemata. Prod-Apply 2026-06-21 verifiziert: Zitierfaehigkeit 6,8%->60,8% (+169.755), Stichprobe 8/8 korrekt. macmini-Dry-Run 0 (idempotent, kein Regress). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
149 lines
4.8 KiB
Python
149 lines
4.8 KiB
Python
#!/usr/bin/env python3
|
|
"""Inherit source_citation from parent to atom controls.
|
|
|
|
Background
|
|
==========
|
|
|
|
citation_backfill.py fills source_citation on the *source-bearing* controls
|
|
(those with source_original_text — ~2-7 %) by re-linking them to the
|
|
re-ingested, article_label-bearing chunks. The remaining ~93 % are "atom"
|
|
controls (decompositions) that carry a parent_control_uuid but no own citation.
|
|
They cite the SAME norm as their parent, so the citation can be inherited —
|
|
no re-matching needed.
|
|
|
|
Self-written controls (license_rule = 3) are skipped (no external source).
|
|
|
|
Runs in idempotent iterations (atom -> master -> grandmaster) and prints
|
|
per-stage counts before any write. Safe to rerun — only fills rows whose
|
|
source_citation lacks an 'article'.
|
|
|
|
Usage::

    python3 scripts/atom_citation_inheritance.py \\
        --db-url postgresql://USER:PASSWORD@HOST:5432/DBNAME --dry-run
    python3 scripts/atom_citation_inheritance.py \\
        --db-url postgresql://USER:PASSWORD@HOST:5432/DBNAME --apply

If --db-url is omitted, the DATABASE_URL environment variable is used.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import os
|
|
import sys
|
|
|
|
# Connection string used when --db-url is not given: $DATABASE_URL if set,
# otherwise a localhost URL.
# NOTE(review): the fallback embeds a username/password; presumably these are
# local development credentials only - confirm before relying on it.
DB_URL = os.environ.get(
    "DATABASE_URL",
    "postgresql://breakpilot:breakpilot@localhost:5432/breakpilot_db",
)
|
|
|
|
def _art(alias: str) -> str:
|
|
"""SQL for source_citation->>'article' that works whether the column is jsonb
|
|
(macmini) or text-containing-JSON (prod schema anomaly from the DB swap).
|
|
pg_input_is_valid (PG16+) guards rows with invalid JSON so the cast never errors."""
|
|
col = f"{alias}.source_citation"
|
|
return (
|
|
f"(CASE WHEN {col} IS NOT NULL AND pg_input_is_valid({col}::text, 'jsonb') "
|
|
f"THEN ({col}::text)::jsonb->>'article' ELSE NULL END)"
|
|
)
|
|
|
|
|
|
# Predicate on alias ``cc``: the row still lacks a citation, i.e. its
# extracted article is NULL or the empty string.
_NEEDS = "({0} IS NULL OR {0} = '')".format(_art("cc"))
# Predicate on alias ``p``: the parent can donate a citation, i.e. its
# extracted article is present and non-empty.
_PARENT_HAS = "({0} IS NOT NULL AND {0} <> '')".format(_art("p"))
|
|
|
|
# Report query: counts the controls that still lack an article (excluding
# license_rule = 3) and groups them into three buckets:
#   no_parent          - the row has no parent_control_uuid
#   parent_has_article - the joined parent carries a non-empty article
#   parent_no_article  - everything else (parent without article, or the
#                        LEFT JOIN found no parent row)
# The parent is joined under the alias ``p`` so that _PARENT_HAS can be used
# verbatim; rewriting the alias with str.replace() would silently corrupt the
# SQL if the predicate ever contained another "p." substring.
SQL_REPORT = f"""
SET search_path TO compliance, public;
SELECT
  CASE WHEN cc.parent_control_uuid IS NULL THEN 'no_parent'
       WHEN {_PARENT_HAS} THEN 'parent_has_article'
       ELSE 'parent_no_article' END AS bucket,
  COUNT(*) AS n
FROM canonical_controls cc
LEFT JOIN canonical_controls p ON cc.parent_control_uuid = p.id
WHERE {_NEEDS}
  AND cc.license_rule IS DISTINCT FROM 3
GROUP BY 1 ORDER BY 2 DESC;
"""
|
|
|
|
SQL_INHERIT = f"""
|
|
SET search_path TO compliance, public;
|
|
UPDATE canonical_controls cc
|
|
SET source_citation = p.source_citation, updated_at = NOW()
|
|
FROM canonical_controls p
|
|
WHERE cc.parent_control_uuid = p.id
|
|
AND {_NEEDS}
|
|
AND {_PARENT_HAS}
|
|
AND cc.license_rule IS DISTINCT FROM 3;
|
|
"""
|
|
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of this script.

    Exactly one of ``--dry-run`` and ``--apply`` must be supplied; argparse
    rejects the call otherwise.

    Returns:
        Namespace with the attributes ``db_url``, ``max_iterations``,
        ``dry_run`` and ``apply``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring is pre-formatted (heading underline, indented
        # usage commands). The default formatter would re-wrap it into a
        # single paragraph, so keep the layout as written.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--db-url",
        default=DB_URL,
        help="Postgres URL (default: $DATABASE_URL)",
    )
    parser.add_argument(
        "--max-iterations",
        type=int,
        default=6,
        help="Cap on inheritance iterations to avoid loops",
    )
    # The two modes exclude each other and one of them is mandatory, so the
    # script can never write without an explicit --apply.
    mode = parser.add_mutually_exclusive_group(required=True)
    mode.add_argument("--dry-run", action="store_true")
    mode.add_argument("--apply", action="store_true")
    return parser.parse_args()
|
|
|
|
|
|
def print_bucket(rows, label: str) -> None:
    """Print a small two-column table of bucket counts plus a TOTAL line.

    Args:
        rows: Iterable of ``(bucket, n)`` pairs; ``bucket`` is formatted
            left-aligned in 20 columns and ``n`` right-aligned in 8.
        label: Heading printed above the table after a blank line.
    """
    lines = [f"\n## {label}"]
    grand_total = 0
    for name, count in rows:
        lines.append(f" {name:20} {count:>8}")
        grand_total += count
    lines.append(f" {'TOTAL':20} {grand_total:>8}")
    # One print call; the emitted text equals printing each line separately.
    print("\n".join(lines))
|
|
|
|
|
|
def main() -> int:
    """Report citation gaps and, with ``--apply``, inherit them from parents.

    Flow:
      1. Print the bucket report of controls that still lack an article.
      2. ``--dry-run``: print how many rows the first inheritance pass would
         fill, roll back and stop without writing.
      3. ``--apply``: run SQL_INHERIT repeatedly (at most
         ``--max-iterations`` times) until a pass updates zero rows, commit
         all passes as a single transaction, then print the report again.

    The connection is always closed on exit, including when a statement
    raises; closing discards any transaction that was not committed.

    Returns:
        0 on success, 2 when psycopg2 is not installed.
    """
    args = parse_args()
    try:
        import psycopg2
    except ImportError:
        print("error: psycopg2 not installed", file=sys.stderr)
        return 2

    conn = psycopg2.connect(args.db_url)
    try:
        conn.autocommit = False
        cur = conn.cursor()

        print("=" * 60)
        print(" Atom citation inheritance — source_citation via parent")
        print(f" Mode: {'DRY-RUN' if args.dry_run else 'APPLY'}")
        print("=" * 60)

        cur.execute(SQL_REPORT)
        print_bucket(cur.fetchall(), "Controls without article (need citation)")

        if args.dry_run:
            # Same join and predicates as SQL_INHERIT, but only counted.
            cur.execute(
                "SET search_path TO compliance, public; "
                "SELECT COUNT(*) FROM canonical_controls cc "
                "JOIN canonical_controls p ON cc.parent_control_uuid = p.id "
                f"WHERE {_NEEDS} AND {_PARENT_HAS} AND cc.license_rule IS DISTINCT FROM 3;"
            )
            print(f"\n## First inherit-pass would fill: {cur.fetchone()[0]} rows")
            print("\nNo writes performed. Use --apply to execute.")
            conn.rollback()
            return 0

        total = 0
        for i in range(1, args.max_iterations + 1):
            cur.execute(SQL_INHERIT)
            # rowcount refers to the UPDATE, the last statement of the batch.
            updated = cur.rowcount
            total += updated
            print(f"\n iteration {i}: {updated} rows inherited")
            if updated == 0:
                # Converged: no child is left whose parent has an article.
                break
        conn.commit()
        print(f"\n✓ Total atoms inherited: {total}")

        cur.execute(SQL_REPORT)
        print_bucket(cur.fetchall(), "Remaining without article")
        return 0
    finally:
        # Release the connection on every path (normal return or exception).
        conn.close()
|
|
|
|
|
|
# Script entry point: run main() and use its return value as the process
# exit status.
if __name__ == "__main__":
    sys.exit(main())
|