#!/usr/bin/env python3
"""G-pre1: Analyze unique objects and test normalization reduction.

Reads the distinct second ``:``-separated field of
``generation_metadata->>'merge_group_hint'`` from ``canonical_controls``,
passes every value through ``normalize_object()`` and prints how many raw
objects collapse onto each normalized token.
"""

from collections import Counter

# NOTE(review): the connection URL embeds credentials in source; consider
# loading it from the environment or a secrets store instead.
DATABASE_URL = "postgresql://breakpilot:breakpilot123@postgres:5432/breakpilot_db"
CONNECT_ARGS = {"options": "-c search_path=compliance,public"}

RAW_OBJECTS_SQL = """
    SELECT DISTINCT
        split_part(generation_metadata->>'merge_group_hint', ':', 2) AS obj
    FROM canonical_controls
    WHERE generation_metadata->>'merge_group_hint' IS NOT NULL
      AND generation_metadata->>'merge_group_hint' != ''
"""


def fetch_raw_objects():
    """Return the distinct, non-blank raw object strings from the database.

    Rows whose object part is NULL, empty, or whitespace-only are dropped.
    """
    # Imported lazily so the pure helpers below stay importable on machines
    # without a database driver installed.
    from sqlalchemy import create_engine, text

    engine = create_engine(DATABASE_URL, connect_args=CONNECT_ARGS)
    with engine.connect() as conn:
        rows = conn.execute(text(RAW_OBJECTS_SQL)).fetchall()
    return [row[0] for row in rows if row[0] and row[0].strip()]


def count_normalized(objects, normalize):
    """Count how many raw objects map onto each normalized token.

    Args:
        objects: Iterable of raw object strings.
        normalize: Callable mapping a raw object to its normalized token.

    Returns:
        A ``Counter`` keyed by normalized token.
    """
    return Counter(normalize(obj) for obj in objects)


def reduction_percent(raw_total, normalized_total):
    """Return the percentage by which normalization shrank the object set.

    Returns 0.0 when there are no raw objects, instead of dividing by zero.
    """
    if raw_total == 0:
        return 0.0
    return (1 - normalized_total / raw_total) * 100


def build_report(raw_total, norm_counts):
    """Build the report lines that follow the raw-object count.

    Args:
        raw_total: Number of unique raw objects that were normalized.
        norm_counts: ``Counter`` of normalized token -> raw objects merged.

    Returns:
        A list of output lines (empty strings stand for blank lines).
    """
    lines = [
        "After normalize_object(): %d unique" % len(norm_counts),
        "Reduction: %.1f%%" % reduction_percent(raw_total, len(norm_counts)),
        "",
        "Top 20 normalized objects:",
    ]
    lines.extend(
        " %5d %s" % (count, token)
        for token, count in norm_counts.most_common(20)
    )
    group_sizes = list(norm_counts.values())
    lines.extend([
        "",
        "Singletons (only 1 raw object): %d"
        % sum(1 for size in group_sizes if size == 1),
        "Groups with 2+ members: %d"
        % sum(1 for size in group_sizes if size >= 2),
    ])
    return lines


def main():
    """Query the raw objects, normalize them, and print the reduction report."""
    objects = fetch_raw_objects()
    print("Unique raw objects: %d" % len(objects))

    # Project import kept after the query and first print, matching the
    # order in which the original script executed these steps.
    from services.control_dedup import normalize_object

    norm_counts = count_normalized(objects, normalize_object)
    print("\n".join(build_report(len(objects), norm_counts)))


if __name__ == "__main__":
    main()