0bad74a3bd
Session 03-05.05.2026: - Block F1-F5 complete (DB migration of hardcoded dicts) - Control Generation: 1,599 controls + 11,522 obligations + 1,147 atomics - Production sync: 2,625 controls + 11,522 obligations synced - G-pre1 analysis: 183k objects → 144k after normalize (needs hierarchical clustering) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
38 lines
1.4 KiB
Python
38 lines
1.4 KiB
Python
#!/usr/bin/env python3
"""G-pre1: Analyze unique objects and test normalization reduction.

Reads the distinct object part of ``merge_group_hint`` from the
``canonical_controls`` table, passes every value through
``services.control_dedup.normalize_object`` and prints:

* the number of unique raw objects,
* the number of unique values left after normalization and the reduction
  in percent,
* the 20 most common normalized values,
* how many normalized values have exactly one / at least two raw members.
"""

from collections import Counter
from typing import Callable, Hashable, Iterable, List

# NOTE(review): the password is embedded in source control. Consider loading
# this URL from the environment or a secrets store instead.
DATABASE_URL = "postgresql://breakpilot:breakpilot123@postgres:5432/breakpilot_db"

# Passed to the driver as connection options; sets search_path for the session.
CONNECT_ARGS = {"options": "-c search_path=compliance,public"}

# Selects the second ':'-separated field of merge_group_hint, de-duplicated by
# the database. Rows with a NULL or empty hint are excluded up front.
OBJECTS_QUERY = """
    SELECT DISTINCT
        split_part(generation_metadata->>'merge_group_hint', ':', 2) AS obj
    FROM canonical_controls
    WHERE generation_metadata->>'merge_group_hint' IS NOT NULL
      AND generation_metadata->>'merge_group_hint' != ''
"""

TOP_N = 20


def count_normalized(
    objects: Iterable[str],
    normalize: Callable[[str], Hashable],
) -> Counter:
    """Count how many raw objects collapse onto each normalized value.

    Args:
        objects: Raw object strings.
        normalize: Function mapping a raw object to its normalized form.

    Returns:
        A ``Counter`` keyed by normalized value; each count is the number of
        raw objects that produced that value.
    """
    return Counter(normalize(obj) for obj in objects)


def reduction_percent(raw_total: int, normalized_total: int) -> float:
    """Return by how many percent normalization shrank the set of objects.

    Args:
        raw_total: Number of unique raw objects.
        normalized_total: Number of unique normalized values.

    Returns:
        ``(1 - normalized_total / raw_total) * 100``, or ``0.0`` when
        ``raw_total`` is zero (nothing to reduce) instead of raising
        ``ZeroDivisionError``.
    """
    if not raw_total:
        return 0.0
    return (1 - normalized_total / raw_total) * 100


def fetch_raw_objects() -> List[str]:
    """Query the database and return the non-blank unique raw objects."""
    # Imported here so the pure helpers above stay importable on machines
    # without the database stack installed.
    from sqlalchemy import create_engine, text

    engine = create_engine(DATABASE_URL, connect_args=CONNECT_ARGS)
    with engine.connect() as conn:
        rows = conn.execute(text(OBJECTS_QUERY)).fetchall()

    # split_part() can yield an empty or whitespace-only field; drop those.
    return [row[0] for row in rows if row[0] and row[0].strip()]


def main() -> None:
    """Run the analysis and print the report to stdout."""
    objects = fetch_raw_objects()
    print(f"Unique raw objects: {len(objects)}")

    # Kept after the query so a database failure surfaces first, matching the
    # order of the original script.
    from services.control_dedup import normalize_object

    norm_counts = count_normalized(objects, normalize_object)

    print(f"After normalize_object(): {len(norm_counts)} unique")
    print(f"Reduction: {reduction_percent(len(objects), len(norm_counts)):.1f}%")
    print()
    print("Top 20 normalized objects:")
    for token, count in norm_counts.most_common(TOP_N):
        print(f" {count:5d} {token}")
    print()

    singletons = sum(1 for members in norm_counts.values() if members == 1)
    groups = sum(1 for members in norm_counts.values() if members >= 2)
    print(f"Singletons (only 1 raw object): {singletons}")
    print(f"Groups with 2+ members: {groups}")


if __name__ == "__main__":
    main()