#!/usr/bin/env python3
"""G-pre1: Analyze unique objects and test normalization reduction."""
from collections import Counter
from sqlalchemy import create_engine, text

# Database handle used by the query below. The libpq "options" string sets
# search_path to "compliance,public" for every connection from this engine.
# NOTE(review): the credentials are embedded in the URL literal — confirm this
# is acceptable for wherever this script is run.
_DB_URL = "postgresql://breakpilot:breakpilot123@postgres:5432/breakpilot_db"
_CONNECT_ARGS = {"options": "-c search_path=compliance,public"}

engine = create_engine(_DB_URL, connect_args=_CONNECT_ARGS)

# Fetch the distinct second ':'-separated field of each control's
# merge_group_hint, skipping controls whose hint is NULL or empty.
with engine.connect() as conn:
    hint_query = text("""
        SELECT DISTINCT
            split_part(generation_metadata->>'merge_group_hint', ':', 2) AS obj
        FROM canonical_controls
        WHERE generation_metadata->>'merge_group_hint' IS NOT NULL
          AND generation_metadata->>'merge_group_hint' != ''
    """)
    rows = conn.execute(hint_query).fetchall()

# Keep only values that are non-empty after stripping whitespace; the values
# themselves are stored unstripped.
objects = []
for row in rows:
    raw_object = row[0]
    if raw_object and raw_object.strip():
        objects.append(raw_object)

print("Unique raw objects: %d" % len(objects))

from services.control_dedup import normalize_object

# Bucket every raw object under its normalized token; each count is the
# number of raw objects that collapse onto that token.
norm_counts: Counter = Counter(normalize_object(obj) for obj in objects)

print("After normalize_object(): %d unique" % len(norm_counts))

# With no raw objects there is nothing to reduce; report 0.0% instead of
# raising ZeroDivisionError on the ratio.
if objects:
    reduction_pct = (1 - len(norm_counts) / len(objects)) * 100
else:
    reduction_pct = 0.0
print("Reduction: %.1f%%" % reduction_pct)

print()
print("Top 20 normalized objects:")
for token, count in norm_counts.most_common(20):
    print("  %5d  %s" % (count, token))

print()
# Group-size summary: tokens backed by exactly one raw object vs. tokens that
# merged two or more raw objects.
group_sizes = list(norm_counts.values())
print("Singletons (only 1 raw object): %d" % sum(1 for size in group_sizes if size == 1))
print("Groups with 2+ members: %d" % sum(1 for size in group_sizes if size >= 2))