docs: session handover — Block F complete, pipeline done, G-pre1 analysis
Session 03-05.05.2026:
- Block F1-F5 complete (DB migration of hardcoded dicts)
- Control Generation: 1,599 controls + 11,522 obligations + 1,147 atomics
- Production sync: 2,625 controls + 11,522 obligations synced
- G-pre1 analysis: 183k objects → 144k after normalize (needs hierarchical clustering)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,37 @@
|
||||
#!/usr/bin/env python3
"""G-pre1: Analyze unique objects and test normalization reduction.

Reads the distinct second ':'-separated field of
``generation_metadata->>'merge_group_hint'`` from ``canonical_controls``,
passes every non-blank value through ``normalize_object`` and prints:

* the number of unique raw objects,
* the number of distinct normalized values and the relative reduction,
* the 20 most frequent normalized values,
* how many normalized values map to exactly one / to two or more raw objects.
"""

from collections import Counter

from sqlalchemy import create_engine, text

# NOTE(review): the connection URL embeds credentials in source; consider
# loading it from the environment or a secrets store instead.
engine = create_engine(
    "postgresql://breakpilot:breakpilot123@postgres:5432/breakpilot_db",
    # Passes a search_path setting to the server so the unqualified table
    # name below can be resolved via the listed schemas.
    connect_args={"options": "-c search_path=compliance,public"},
)

# fetchall() materializes the result, so the connection is only needed for
# the duration of the query itself.
with engine.connect() as conn:
    rows = conn.execute(text("""
        SELECT DISTINCT
            split_part(generation_metadata->>'merge_group_hint', ':', 2) AS obj
        FROM canonical_controls
        WHERE generation_metadata->>'merge_group_hint' IS NOT NULL
          AND generation_metadata->>'merge_group_hint' != ''
    """)).fetchall()

# Drop NULL, empty and whitespace-only values (split_part yields '' when the
# hint has no second field).
objects = [r[0] for r in rows if r[0] and r[0].strip()]
print("Unique raw objects: %d" % len(objects))

# Imported at this point, as in the original script flow, so the raw count
# above is printed before the project-local package is needed.
from services.control_dedup import normalize_object

# Maps each normalized value to the number of raw objects that collapse to it.
norm_counts: Counter = Counter()
for obj in objects:
    norm_counts[normalize_object(obj)] += 1

print("After normalize_object(): %d unique" % len(norm_counts))

# Guard against an empty result set: without it the ratio below raises
# ZeroDivisionError when the query yields no usable objects.
if objects:
    reduction_pct = (1 - len(norm_counts) / len(objects)) * 100
else:
    reduction_pct = 0.0
print("Reduction: %.1f%%" % reduction_pct)
print()

print("Top 20 normalized objects:")
for token, count in norm_counts.most_common(20):
    print("  %5d  %s" % (count, token))
print()

group_sizes = list(norm_counts.values())
print("Singletons (only 1 raw object): %d" % sum(1 for size in group_sizes if size == 1))
print("Groups with 2+ members: %d" % sum(1 for size in group_sizes if size >= 2))
|
||||
Reference in New Issue
Block a user