de542633e2
citation_backfill Tier-1 von totem sha256-Hash auf Semantik-Suche gegen die re-ingestierten, article_label-tragenden Chunks umgestellt (Fundstelle aus article_label); rag_client reicht article_label durch (additiv, Default-Feld). NEU: scripts/atom_citation_inheritance.py vererbt source_citation parent->atom (license_rule != 3), iterativ. macmini-Apply verifiziert: Zitierfaehigkeit 6.9%->61.3% (+171.765 Atome), Stichprobe korrekt (Atom == Parent-Fundstelle). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
146 lines
4.5 KiB
Python
146 lines
4.5 KiB
Python
#!/usr/bin/env python3
|
|
"""Inherit source_citation from parent to atom controls.
|
|
|
|
Background
|
|
==========
|
|
|
|
citation_backfill.py fills source_citation on the *source-bearing* controls
|
|
(those with source_original_text — ~2-7 %) by re-linking them to the
|
|
re-ingested, article_label-bearing chunks. The remaining ~93 % are "atom"
|
|
controls (decompositions) that carry a parent_control_uuid but no own citation.
|
|
They cite the SAME norm as their parent, so the citation can be inherited —
|
|
no re-matching needed.
|
|
|
|
Self-written controls (license_rule = 3) are skipped (no external source).
|
|
|
|
Runs in idempotent iterations (atom -> master -> grandmaster) and prints
|
|
per-stage counts before any write. Safe to rerun — only fills rows whose
|
|
source_citation lacks an 'article'.
|
|
|
|
Usage::
|
|
|
|
    python3 scripts/atom_citation_inheritance.py \\
        --db-url postgresql://USER:PASSWORD@HOST:5432/breakpilot_db --dry-run
    python3 scripts/atom_citation_inheritance.py \\
        --db-url postgresql://USER:PASSWORD@HOST:5432/breakpilot_db --apply
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import os
|
|
import sys
|
|
|
|
# Connection URL used when --db-url is not given: $DATABASE_URL if set,
# otherwise the built-in fallback below.
# NOTE(review): the fallback embeds credentials — presumably a local
# development database; confirm it never matches a production secret.
DB_URL = os.getenv("DATABASE_URL", "postgresql://breakpilot:breakpilot@localhost:5432/breakpilot_db")

# Predicate on the child row (alias ``cc``): it "needs" a citation when it
# has no source_citation at all, or one whose 'article' is missing or empty.
_NEEDS = (
    "(cc.source_citation IS NULL "
    " OR cc.source_citation->>'article' IS NULL "
    " OR cc.source_citation->>'article' = '')"
)

# Predicate on the parent row (alias ``p``): it can supply a citation when
# its source_citation carries a non-empty 'article'.  Every query that uses
# this fragment must alias the parent table as ``p``.
_PARENT_HAS = (
    "p.source_citation IS NOT NULL "
    "AND p.source_citation->>'article' IS NOT NULL "
    "AND p.source_citation->>'article' <> ''"
)

# Read-only report: counts the controls that still need a citation
# (excluding license_rule = 3), bucketed by whether they have a parent and
# whether that parent carries an article.  The parent is aliased ``p`` so the
# shared predicate is used verbatim instead of being rewritten with
# str.replace, which would also rewrite any other "p." substring.
SQL_REPORT = f"""
SET search_path TO compliance, public;
SELECT
    CASE WHEN cc.parent_control_uuid IS NULL THEN 'no_parent'
         WHEN ({_PARENT_HAS}) THEN 'parent_has_article'
         ELSE 'parent_no_article' END AS bucket,
    COUNT(*) AS n
FROM canonical_controls cc
LEFT JOIN canonical_controls p ON cc.parent_control_uuid = p.id
WHERE {_NEEDS}
  AND cc.license_rule IS DISTINCT FROM 3
GROUP BY 1 ORDER BY 2 DESC;
"""

# One inheritance pass: copy the parent's source_citation onto every child
# that needs one and whose parent already has an article.  A single pass only
# reaches children of parents that are already filled, so the caller repeats
# it until no row is updated.
SQL_INHERIT = f"""
SET search_path TO compliance, public;
UPDATE canonical_controls cc
SET source_citation = p.source_citation, updated_at = NOW()
FROM canonical_controls p
WHERE cc.parent_control_uuid = p.id
  AND {_NEEDS}
  AND {_PARENT_HAS}
  AND cc.license_rule IS DISTINCT FROM 3;
"""
|
|
|
|
|
|
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the inheritance script.

    Args:
        argv: Argument list to parse.  ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the script entry point uses;
            passing an explicit list allows programmatic use.

    Returns:
        Namespace with ``db_url``, ``max_iterations``, ``dry_run`` and
        ``apply``.  Exactly one of ``--dry-run`` / ``--apply`` is required;
        argparse exits with an error otherwise.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring has headings and a literal usage block; the
        # default formatter would reflow it into a single paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--db-url", default=DB_URL,
                        help="Postgres URL (default: $DATABASE_URL)")
    parser.add_argument("--max-iterations", type=int, default=6,
                        help="Cap on inheritance iterations to avoid loops")
    # The mode must be chosen explicitly so a bare invocation never writes.
    mode = parser.add_mutually_exclusive_group(required=True)
    mode.add_argument("--dry-run", action="store_true")
    mode.add_argument("--apply", action="store_true")
    return parser.parse_args(argv)
|
|
|
|
|
|
def print_bucket(rows, label: str) -> None:
    """Print a titled two-column table of (bucket, count) rows plus a total.

    Args:
        rows: Iterable of ``(bucket, n)`` pairs, e.g. the result of
            ``cursor.fetchall()`` for the report query.
        label: Heading printed above the table.
    """
    line_fmt = " {:20} {:>8}"
    print(f"\n## {label}")
    running_sum = 0
    for name, count in rows:
        print(line_fmt.format(name, count))
        running_sum += count
    # The summary line uses the same column layout as the data rows.
    print(line_fmt.format("TOTAL", running_sum))
|
|
|
|
|
|
def main() -> int:
    """Print the citation-gap report and, with ``--apply``, run the inheritance.

    Flow:
      1. Print how many controls still lack an article, bucketed by whether
         their parent can supply one (``SQL_REPORT``).
      2. ``--dry-run``: count the rows the first pass would fill, roll back
         and return without writing.
      3. ``--apply``: execute ``SQL_INHERIT`` repeatedly until a pass updates
         nothing or ``--max-iterations`` is reached, commit once, then print
         the remaining gaps.

    The connection is always closed on exit; if an error occurs before the
    commit, the uncommitted work is discarded when the connection closes.

    Returns:
        0 on success, 2 when psycopg2 is not installed.
    """
    args = parse_args()
    try:
        import psycopg2
    except ImportError:
        print("error: psycopg2 not installed", file=sys.stderr)
        return 2

    conn = psycopg2.connect(args.db_url)
    try:
        # All passes run in one transaction so a failure leaves no partial fill.
        conn.autocommit = False
        with conn.cursor() as cur:
            print("=" * 60)
            print(" Atom citation inheritance — source_citation via parent")
            print(f" Mode: {'DRY-RUN' if args.dry_run else 'APPLY'}")
            print("=" * 60)

            cur.execute(SQL_REPORT)
            print_bucket(cur.fetchall(), "Controls without article (need citation)")

            if args.dry_run:
                # Same join and filters as SQL_INHERIT, but only counting.
                cur.execute(
                    "SET search_path TO compliance, public; "
                    "SELECT COUNT(*) FROM canonical_controls cc "
                    "JOIN canonical_controls p ON cc.parent_control_uuid = p.id "
                    f"WHERE {_NEEDS} AND {_PARENT_HAS} AND cc.license_rule IS DISTINCT FROM 3;"
                )
                print(f"\n## First inherit-pass would fill: {cur.fetchone()[0]} rows")
                print("\nNo writes performed. Use --apply to execute.")
                conn.rollback()
                return 0

            total = 0
            updated = 0
            for i in range(1, args.max_iterations + 1):
                # Each pass fills only children whose parent already has an
                # article, so newly filled rows become donors for the next pass.
                cur.execute(SQL_INHERIT)
                updated = cur.rowcount
                total += updated
                print(f"\n iteration {i}: {updated} rows inherited")
                if updated == 0:
                    break
            conn.commit()
            print(f"\n✓ Total atoms inherited: {total}")
            if updated:
                # The cap was hit while the last pass still filled rows, so
                # deeper levels may remain; make that visible.
                print(
                    f"\n note: stopped at --max-iterations={args.max_iterations} "
                    "while rows were still being filled; rerun or raise the cap"
                )

            cur.execute(SQL_REPORT)
            print_bucket(cur.fetchall(), "Remaining without article")
            return 0
    finally:
        # Closing discards any uncommitted transaction and frees the session.
        conn.close()
|
|
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|