feat(controls): Zitierfaehigkeit — Embedding-Re-Link + Atom-Vererbung

citation_backfill Tier-1 von totem sha256-Hash auf Semantik-Suche gegen die
re-ingestierten, article_label-tragenden Chunks umgestellt (Fundstelle aus
article_label); rag_client reicht article_label durch (additiv, Default-Feld).
NEU: scripts/atom_citation_inheritance.py vererbt source_citation parent->atom
(license_rule != 3), iterativ. macmini-Apply verifiziert: Zitierfaehigkeit
6.9%->61.3% (+171.765 Atome), Stichprobe korrekt (Atom == Parent-Fundstelle).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-21 14:17:57 +02:00
parent ff4a743558
commit de542633e2
3 changed files with 238 additions and 111 deletions
@@ -0,0 +1,145 @@
#!/usr/bin/env python3
"""Inherit source_citation from parent to atom controls.
Background
==========
citation_backfill.py fills source_citation on the *source-bearing* controls
(those with source_original_text — ~2-7 %) by re-linking them to the
re-ingested, article_label-bearing chunks. The remaining ~93 % are "atom"
controls (decompositions) that carry a parent_control_uuid but no own citation.
They cite the SAME norm as their parent, so the citation can be inherited —
no re-matching needed.
Self-written controls (license_rule = 3) are skipped (no external source).
Runs in idempotent iterations (atom -> master -> grandmaster) and prints
per-stage counts before any write. Safe to rerun — only fills rows whose
source_citation lacks an 'article'.
Usage::
python3 scripts/atom_citation_inheritance.py \\
--db-url postgresql://USER:PASSWORD@HOST:5432/breakpilot_db --dry-run
python3 scripts/atom_citation_inheritance.py \\
--db-url postgresql://USER:PASSWORD@HOST:5432/breakpilot_db --apply
"""
from __future__ import annotations
import argparse
import os
import sys
# Connection string used as the default for --db-url: taken from the
# DATABASE_URL environment variable, falling back to a local database.
DB_URL = os.getenv("DATABASE_URL", "postgresql://breakpilot:breakpilot@localhost:5432/breakpilot_db")
# SQL predicate on alias ``cc``: the row "needs" a citation, i.e. its
# source_citation is NULL or its 'article' key is NULL or the empty string.
_NEEDS = (
"(cc.source_citation IS NULL "
" OR cc.source_citation->>'article' IS NULL "
" OR cc.source_citation->>'article' = '')"
)
# SQL predicate on alias ``p``: the parent can supply a citation, i.e. its
# source_citation is present and carries a non-empty 'article'.
# NOTE: not wrapped in parentheses — only combine it with AND, or wrap it at
# the point of use (as SQL_REPORT does).
_PARENT_HAS = (
"p.source_citation IS NOT NULL "
"AND p.source_citation->>'article' IS NOT NULL "
"AND p.source_citation->>'article' <> ''"
)
# Report query: counts the rows that still need a citation (excluding
# license_rule = 3; IS DISTINCT FROM also keeps rows whose license_rule is
# NULL), grouped into three buckets:
#   no_parent          - parent_control_uuid is NULL
#   parent_has_article - the joined parent satisfies _PARENT_HAS
#   parent_no_article  - everything else
# The parent is LEFT JOINed under alias ``p2``, so _PARENT_HAS is re-targeted
# from ``p.`` to ``p2.`` by plain string replacement.
SQL_REPORT = f"""
SET search_path TO compliance, public;
SELECT
CASE WHEN cc.parent_control_uuid IS NULL THEN 'no_parent'
WHEN ({_PARENT_HAS.replace('p.', 'p2.')}) THEN 'parent_has_article'
ELSE 'parent_no_article' END AS bucket,
COUNT(*) AS n
FROM canonical_controls cc
LEFT JOIN canonical_controls p2 ON cc.parent_control_uuid = p2.id
WHERE {_NEEDS}
AND cc.license_rule IS DISTINCT FROM 3
GROUP BY 1 ORDER BY 2 DESC;
"""
# One inheritance pass: for every child row that needs a citation and whose
# direct parent has an article, copy the parent's whole source_citation value
# onto the child and set updated_at to NOW(). Rows with license_rule = 3 are
# left alone. A single pass only reaches one level below an already-cited
# row, which is why the caller repeats it.
SQL_INHERIT = f"""
SET search_path TO compliance, public;
UPDATE canonical_controls cc
SET source_citation = p.source_citation, updated_at = NOW()
FROM canonical_controls p
WHERE cc.parent_control_uuid = p.id
AND {_NEEDS}
AND {_PARENT_HAS}
AND cc.license_rule IS DISTINCT FROM 3;
"""
def parse_args() -> argparse.Namespace:
    """Parse the command line.

    Exactly one of ``--dry-run`` / ``--apply`` must be given. ``--db-url``
    defaults to the module-level ``DB_URL`` and ``--max-iterations`` to 6.

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--db-url",
        default=DB_URL,
        help="Postgres URL (default: $DATABASE_URL)",
    )
    parser.add_argument(
        "--max-iterations",
        type=int,
        default=6,
        help="Cap on inheritance iterations to avoid loops",
    )

    # The two run modes exclude each other, and one of them is mandatory.
    mode = parser.add_mutually_exclusive_group(required=True)
    for flag in ("--dry-run", "--apply"):
        mode.add_argument(flag, action="store_true")

    return parser.parse_args()
def print_bucket(rows, label: str) -> None:
    """Print a titled two-column table of ``(bucket, count)`` rows plus a total.

    Args:
        rows: Iterable of ``(bucket, n)`` pairs, consumed in a single pass.
        label: Heading printed above the table.
    """
    print(f"\n## {label}")
    counts = []
    for name, count in rows:
        counts.append(count)
        print(f" {name:20} {count:>8}")
    grand_total = sum(counts)
    print(f" {'TOTAL':20} {grand_total:>8}")
def main() -> int:
    """Report controls lacking an article and, with --apply, inherit citations.

    Always prints the per-bucket report first. In ``--dry-run`` mode it then
    prints how many rows the first inheritance pass would fill and rolls the
    transaction back. In ``--apply`` mode it executes ``SQL_INHERIT``
    repeatedly until a pass updates no rows or ``--max-iterations`` passes
    have run, commits once after the loop, and prints the report again.

    The database connection is closed on every exit path; if an exception
    escapes before the commit, closing the connection discards the
    uncommitted work.

    Returns:
        0 on success, 2 when psycopg2 is not installed.
    """
    args = parse_args()
    try:
        import psycopg2
    except ImportError:
        print("error: psycopg2 not installed", file=sys.stderr)
        return 2

    conn = psycopg2.connect(args.db_url)
    try:
        conn.autocommit = False
        cur = conn.cursor()
        print("=" * 60)
        print(" Atom citation inheritance — source_citation via parent")
        print(f" Mode: {'DRY-RUN' if args.dry_run else 'APPLY'}")
        print("=" * 60)
        cur.execute(SQL_REPORT)
        print_bucket(cur.fetchall(), "Controls without article (need citation)")

        if args.dry_run:
            # Same join and filters as SQL_INHERIT, but only counted.
            cur.execute(
                "SET search_path TO compliance, public; "
                f"SELECT COUNT(*) FROM canonical_controls cc "
                f"JOIN canonical_controls p ON cc.parent_control_uuid = p.id "
                f"WHERE {_NEEDS} AND {_PARENT_HAS} AND cc.license_rule IS DISTINCT FROM 3;"
            )
            print(f"\n## First inherit-pass would fill: {cur.fetchone()[0]} rows")
            print("\nNo writes performed. Use --apply to execute.")
            conn.rollback()
            return 0

        total = 0
        converged = False
        for i in range(1, args.max_iterations + 1):
            # Each pass pushes citations one level further down the
            # parent -> child chain; a pass with no updates means done.
            cur.execute(SQL_INHERIT)
            updated = cur.rowcount
            total += updated
            print(f"\n iteration {i}: {updated} rows inherited")
            if updated == 0:
                converged = True
                break
        # All passes run in one transaction and are committed together.
        conn.commit()
        print(f"\n✓ Total atoms inherited: {total}")
        if not converged:
            # The cap was reached while rows were still being filled (or no
            # pass ran at all); deeper levels may remain unfilled.
            print(
                f"warning: stopped after {args.max_iterations} iteration(s) "
                "without reaching a fixed point; rerun or raise --max-iterations",
                file=sys.stderr,
            )
        cur.execute(SQL_REPORT)
        print_bucket(cur.fetchall(), "Remaining without article")
        return 0
    finally:
        # Release the server connection on success, dry-run and error alike.
        conn.close()
if __name__ == "__main__":
raise SystemExit(main())