feat(controls): Zitierfaehigkeit — Embedding-Re-Link + Atom-Vererbung
citation_backfill Tier-1 von totem sha256-Hash auf Semantik-Suche gegen die re-ingestierten, article_label-tragenden Chunks umgestellt (Fundstelle aus article_label); rag_client reicht article_label durch (additiv, Default-Feld). NEU: scripts/atom_citation_inheritance.py vererbt source_citation parent->atom (license_rule != 3), iterativ. macmini-Apply verifiziert: Zitierfaehigkeit 6.9%->61.3% (+171.765 Atome), Stichprobe korrekt (Atom == Parent-Fundstelle). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,145 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Inherit source_citation from parent to atom controls.
|
||||
|
||||
Background
|
||||
==========
|
||||
|
||||
citation_backfill.py fills source_citation on the *source-bearing* controls
|
||||
(those with source_original_text — ~2-7 %) by re-linking them to the
|
||||
re-ingested, article_label-bearing chunks. The remaining ~93 % are "atom"
|
||||
controls (decompositions) that carry a parent_control_uuid but no own citation.
|
||||
They cite the SAME norm as their parent, so the citation can be inherited —
|
||||
no re-matching needed.
|
||||
|
||||
Self-written controls (license_rule = 3) are skipped (no external source).
|
||||
|
||||
Runs in idempotent iterations (atom -> master -> grandmaster) and prints
|
||||
per-stage counts before any write. Safe to rerun — only fills rows whose
|
||||
source_citation lacks an 'article'.
|
||||
|
||||
Usage::
|
||||
|
||||
    DATABASE_URL=postgresql://USER:PASSWORD@HOST:5432/breakpilot_db \\
        python3 scripts/atom_citation_inheritance.py --dry-run
    python3 scripts/atom_citation_inheritance.py \\
        --db-url postgresql://USER:PASSWORD@HOST:5432/breakpilot_db --apply
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Connection URL: taken from $DATABASE_URL when set, otherwise this local default.
DB_URL = os.getenv(
    "DATABASE_URL",
    "postgresql://breakpilot:breakpilot@localhost:5432/breakpilot_db",
)

# A row "needs" a citation when it has no article yet.
# The fragment refers to the target row through the alias ``cc``.
_NEEDS = (
    "(cc.source_citation IS NULL "
    " OR cc.source_citation->>'article' IS NULL "
    " OR cc.source_citation->>'article' = '')"
)

# A parent can supply one when it carries a real article.
# The fragment refers to the parent row through the alias ``p``; every query
# below joins the parent under exactly that alias so the fragment can be
# embedded verbatim instead of being rewritten with str.replace().
_PARENT_HAS = (
    "p.source_citation IS NOT NULL "
    "AND p.source_citation->>'article' IS NOT NULL "
    "AND p.source_citation->>'article' <> ''"
)

# Read-only report: counts the rows that still need a citation (excluding
# license_rule = 3), bucketed by whether a parent exists and whether that
# parent carries an article.
SQL_REPORT = f"""
SET search_path TO compliance, public;
SELECT
    CASE WHEN cc.parent_control_uuid IS NULL THEN 'no_parent'
         WHEN ({_PARENT_HAS}) THEN 'parent_has_article'
         ELSE 'parent_no_article' END AS bucket,
    COUNT(*) AS n
FROM canonical_controls cc
LEFT JOIN canonical_controls p ON cc.parent_control_uuid = p.id
WHERE {_NEEDS}
  AND cc.license_rule IS DISTINCT FROM 3
GROUP BY 1 ORDER BY 2 DESC;
"""

# One inheritance pass: copies the parent's source_citation onto every child
# that still needs one. Rows filled by a pass no longer match _NEEDS, so
# repeating the statement only ever reaches rows further down the hierarchy.
SQL_INHERIT = f"""
SET search_path TO compliance, public;
UPDATE canonical_controls cc
SET source_citation = p.source_citation, updated_at = NOW()
FROM canonical_controls p
WHERE cc.parent_control_uuid = p.id
  AND {_NEEDS}
  AND {_PARENT_HAS}
  AND cc.license_rule IS DISTINCT FROM 3;
"""
|
||||
|
||||
|
||||
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command line of the inheritance script.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the script entry point uses.

    Returns:
        Namespace with ``db_url``, ``max_iterations``, ``dry_run`` and
        ``apply``. Exactly one of ``dry_run`` / ``apply`` is true.

    Raises:
        SystemExit: raised by argparse on invalid arguments, including a
            ``--max-iterations`` value below 1.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring is multi-paragraph with a usage example; the
        # default formatter would re-wrap it into a single block.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--db-url", default=DB_URL,
                        help="Postgres URL (default: $DATABASE_URL)")
    parser.add_argument("--max-iterations", type=int, default=6,
                        help="Cap on inheritance iterations to avoid loops")

    # The mode must be chosen explicitly so a write never happens by default.
    mode = parser.add_mutually_exclusive_group(required=True)
    mode.add_argument("--dry-run", action="store_true")
    mode.add_argument("--apply", action="store_true")

    args = parser.parse_args(argv)
    if args.max_iterations < 1:
        # A cap of zero or less would turn --apply into a silent no-op.
        parser.error("--max-iterations must be >= 1")
    return args
|
||||
|
||||
|
||||
def print_bucket(rows: list[tuple[str, int]], label: str) -> None:
    """Print a small two-column table of bucket counts plus a TOTAL line.

    Args:
        rows: ``(bucket, n)`` pairs, e.g. the result rows of SQL_REPORT.
        label: Heading printed above the table.
    """
    print(f"\n## {label}")
    total = 0
    for bucket, n in rows:
        # Bucket name left-aligned in 20 columns, count right-aligned in 8.
        print(f" {bucket:20} {n:>8}")
        total += n
    print(f" {'TOTAL':20} {total:>8}")
|
||||
|
||||
|
||||
def main() -> int:
    """Report missing citations and, with ``--apply``, inherit them from parents.

    The bucketed count of controls without an article (SQL_REPORT) is always
    printed first. With ``--dry-run`` the number of rows the first inheritance
    pass would fill is printed and the transaction is rolled back. With
    ``--apply`` SQL_INHERIT is repeated until a pass updates no rows or
    ``--max-iterations`` passes have run; all passes are committed together,
    then the remaining buckets are printed.

    Returns:
        0 on success, 2 when psycopg2 is not installed.
    """
    args = parse_args()
    try:
        # Imported lazily so a missing driver yields a clean message and exit
        # code instead of a traceback at import time.
        import psycopg2
    except ImportError:
        print("error: psycopg2 not installed", file=sys.stderr)
        return 2

    conn = psycopg2.connect(args.db_url)
    try:
        conn.autocommit = False
        with conn.cursor() as cur:
            print("=" * 60)
            print(" Atom citation inheritance — source_citation via parent")
            print(f" Mode: {'DRY-RUN' if args.dry_run else 'APPLY'}")
            print("=" * 60)

            cur.execute(SQL_REPORT)
            print_bucket(cur.fetchall(), "Controls without article (need citation)")

            if args.dry_run:
                # Same join and predicates as SQL_INHERIT, but only counted.
                cur.execute(
                    "SET search_path TO compliance, public; "
                    "SELECT COUNT(*) FROM canonical_controls cc "
                    "JOIN canonical_controls p ON cc.parent_control_uuid = p.id "
                    f"WHERE {_NEEDS} AND {_PARENT_HAS} AND cc.license_rule IS DISTINCT FROM 3;"
                )
                print(f"\n## First inherit-pass would fill: {cur.fetchone()[0]} rows")
                print("\nNo writes performed. Use --apply to execute.")
                conn.rollback()
                return 0

            total = 0
            for i in range(1, args.max_iterations + 1):
                cur.execute(SQL_INHERIT)
                # SQL_INHERIT ends with the UPDATE, so rowcount is its count.
                updated = cur.rowcount
                total += updated
                print(f"\n iteration {i}: {updated} rows inherited")
                if updated == 0:
                    break
            else:
                # The loop never saw an empty pass, so deeper descendants may
                # still be inheritable; make that visible instead of silent.
                print(
                    f"warning: stopped after --max-iterations={args.max_iterations} "
                    "without reaching an empty pass; rerun to continue",
                    file=sys.stderr,
                )

            # All passes are committed together: either every pass lands or none.
            conn.commit()
            print(f"\n✓ Total atoms inherited: {total}")

            cur.execute(SQL_REPORT)
            print_bucket(cur.fetchall(), "Remaining without article")
            # The report opened a new read-only transaction; end it explicitly.
            conn.rollback()
            return 0
    finally:
        # Closing discards any uncommitted work, so an exception raised before
        # commit() leaves the database untouched.
        conn.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Use main()'s return code as the process exit status.
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user