"""
Step 4: Preamble vs. Article dedup.
If a preamble control covers the same topic as an article control
(same regulation, similar title), mark the preamble as duplicate.
Article controls always take priority.
"""
import os
import re
import psycopg2
import urllib.parse

# Function words that carry no topical meaning and are dropped before titles
# are compared. The list mixes German and English vocabulary; words that exist
# in both languages ('in', 'an') simply collapse in the union.
_GERMAN_STOPWORDS = {
    'der', 'die', 'das', 'den', 'dem', 'des',
    'ein', 'eine', 'einer', 'eines',
    'und', 'oder', 'für', 'von', 'zu', 'mit', 'auf', 'in', 'an', 'bei',
    'nach', 'über', 'unter', 'durch', 'als', 'aus', 'zur', 'zum', 'im',
    'am', 'um',
    'ist', 'sind', 'wird', 'werden', 'hat', 'haben', 'kann', 'können',
}

_ENGLISH_STOPWORDS = {
    'the', 'and', 'or', 'for', 'of', 'to', 'with', 'on', 'in', 'at', 'by',
    'is', 'are', 'be', 'was', 'were', 'been', 'has', 'have', 'had',
    'a', 'an', 'not', 'no', 'from',
}

STOPWORDS = _GERMAN_STOPWORDS | _ENGLISH_STOPWORDS

def tokenize(title):
    """Return the distinct, lowercased, non-stopword words of *title*.

    Args:
        title: Title text to split into words. ``None`` or an empty string
            is treated as a title without any words.

    Returns:
        A set of word tokens (runs of word characters) taken from the
        lowercased title, with every entry of ``STOPWORDS`` removed.
    """
    if not title:
        # Titles are read from a database column and arrive as None when the
        # column is NULL; report "no tokens" instead of failing on .lower().
        return set()
    words = set(re.findall(r'\w+', title.lower()))
    return words - STOPWORDS

def jaccard(a, b):
    """Compute the Jaccard index of two word sets.

    Returns the size of the intersection divided by the size of the union,
    or 0.0 when either set is empty.
    """
    if a and b:
        shared = a & b
        combined = a | b
        return len(shared) / len(combined)
    return 0.0

# Open the database connection described by DATABASE_URL (a KeyError is
# raised if the variable is not set).
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)

# urlparse() returns the user, password and path exactly as written in the
# URL, i.e. still percent-encoded. Decode them so credentials containing
# reserved characters (e.g. "p%40ss" for "p@ss") reach the server intact.
db_user = (
    urllib.parse.unquote(parsed.username)
    if parsed.username is not None else None
)
db_password = (
    urllib.parse.unquote(parsed.password)
    if parsed.password is not None else None
)
db_name = urllib.parse.unquote(parsed.path.lstrip('/'))

conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,  # 5432 when no port given
    user=db_user, password=db_password,
    dbname=db_name,
    # Resolve unqualified table names in the compliance schema first.
    options="-c search_path=compliance,public"
)
cur = conn.cursor()

# Load every control that is not already flagged 'duplicate' or 'too_close'
# and whose source citation carries an article_type. Each row is
# (id, control_id, title, source, article, article_type, release_state).
active_controls_sql = """
    SELECT id, control_id, title,
           source_citation->>'source' as source,
           source_citation->>'article' as article,
           source_citation->>'article_type' as article_type,
           release_state
    FROM compliance.canonical_controls
    WHERE release_state NOT IN ('duplicate', 'too_close')
    AND source_citation->>'article_type' IS NOT NULL
    ORDER BY source_citation->>'source', control_id
"""
cur.execute(active_controls_sql)
controls = cur.fetchall()
print(f"Active controls with article_type: {len(controls)}")

# Bucket the rows by their citation source (column 3). Rows whose source is
# missing or empty all land in the "(null)" bucket.
by_source = {}
for row in controls:
    bucket_key = row[3] if row[3] else "(null)"
    if bucket_key in by_source:
        by_source[bucket_key].append(row)
    else:
        by_source[bucket_key] = [row]

# For each source: find preamble controls that duplicate article controls.
# Row layout (from the SELECT above): 1=control_id, 2=title, 3=source,
# 4=article, 5=article_type.
total_dupes = 0
dupe_pairs = []  # (preamble_row, best_article_row, jaccard_score)

# Largest groups first, so the sources with the most controls are reported
# at the top of the output.
for source, ctrls in sorted(by_source.items(), key=lambda x: -len(x[1])):
    articles = [c for c in ctrls if c[5] == 'article']
    preambles = [c for c in ctrls if c[5] == 'preamble']

    # Nothing to compare unless the source has both kinds of control.
    if not preambles or not articles:
        continue

    # Precompute tokens for article controls
    article_tokens = [(c, tokenize(c[2])) for c in articles]

    # Pairs found for this source only. Keeping them local avoids re-scanning
    # the global list and also works for the "(null)" bucket, whose rows do
    # not carry the bucket name in their source column.
    source_pairs = []
    for p_ctrl in preambles:
        p_tokens = tokenize(p_ctrl[2])
        if not p_tokens:
            # Title consists of stopwords only (or is empty): cannot match.
            continue

        # Find best matching article control; on ties the first one wins.
        best_match = None
        best_score = 0
        for a_ctrl, a_tokens in article_tokens:
            score = jaccard(p_tokens, a_tokens)
            if score > best_score:
                best_score = score
                best_match = a_ctrl

        # Threshold: 0.40 similarity → likely same topic
        if best_score >= 0.40 and best_match:
            source_pairs.append((p_ctrl, best_match, best_score))

    source_dupes = len(source_pairs)
    if source_dupes > 0:
        print(f"\n{source}: {source_dupes} preamble duplicates (of {len(preambles)} preambles, {len(articles)} articles)")
        # Show first 3 pairs
        for p, a, score in source_pairs[:3]:
            print(f"  PREAMBLE {p[1]}: {p[2][:60]}")
            print(f"  ARTICLE  {a[1]}: {a[2][:60]}")
            print(f"  Jaccard: {score:.2f}  ({p[4]} vs {a[4]})")
            print()

    dupe_pairs.extend(source_pairs)
    total_dupes += source_dupes

# Summary: how many preamble controls exist across all sources and how many
# of them were matched to an article control.
preamble_total = 0
for group in by_source.values():
    preamble_total += sum(1 for row in group if row[5] == 'preamble')

banner = '=' * 60
print(f"\n{banner}")
print("SUMMARY")
print(banner)
print(f"  Total preamble controls checked: {preamble_total}")
print(f"  Preamble duplicates found: {total_dupes}")
print(f"  Unique preamble controls: {preamble_total - total_dupes}")

# Preview only — rows are updated solely when APPLY=1 is set in the
# environment. The try/finally releases the connection even if reporting or
# an UPDATE raises; the single commit happens after the loop and psycopg2
# discards uncommitted work on close, so a failed apply changes nothing.
try:
    if dupe_pairs:
        print(f"\n=== DRY RUN: Would mark {total_dupes} as duplicate ===")
        # Score distribution
        scores = [s for _, _, s in dupe_pairs]
        print(f"  Score range: {min(scores):.2f} - {max(scores):.2f}")
        print(f"  Score >= 0.50: {sum(1 for s in scores if s >= 0.50)}")
        print(f"  Score 0.40-0.49: {sum(1 for s in scores if 0.40 <= s < 0.50)}")

        # Ask for confirmation
        print(f"\n  Apply? Set APPLY=1 env var to mark duplicates.")
        if os.environ.get('APPLY') == '1':
            print(f"\n  Applying {total_dupes} duplicate marks...")
            applied = 0
            for p_ctrl, _a_ctrl, _score in dupe_pairs:
                # The release_state guard keeps the update a no-op for rows
                # that were flagged by something else in the meantime.
                cur.execute("""
                    UPDATE compliance.canonical_controls
                    SET release_state = 'duplicate',
                        updated_at = now()
                    WHERE id = %s
                    AND release_state NOT IN ('duplicate', 'too_close')
                """, (str(p_ctrl[0]),))
                if cur.rowcount > 0:
                    applied += 1
            conn.commit()
            print(f"  Applied: {applied} marked as duplicate")
        else:
            # Show the highest-scoring pairs for review. The listing is
            # capped, and says so when pairs were left out.
            preview_limit = 30
            ranked = sorted(dupe_pairs, key=lambda x: -x[2])
            print(f"\n=== All {total_dupes} pairs ===")
            for p, a, score in ranked[:preview_limit]:
                print(f"  [{score:.2f}] {p[1]:10s} ({p[4]}) → {a[1]:10s} ({a[4]})")
                print(f"         P: {p[2][:65]}")
                print(f"         A: {a[2][:65]}")
            hidden = len(ranked) - preview_limit
            if hidden > 0:
                print(f"  ... {hidden} more pairs not shown")
finally:
    conn.close()