breakpilot-compliance/scripts/cleanup-qdrant-duplicates.py

#!/usr/bin/env python3
"""
Qdrant Duplicate Cleanup
Removes redundant/duplicate chunks from Qdrant collections.

Targets:
1. bp_compliance_recht — entire collection (100% subset of bp_compliance_ce)
2. bp_compliance_gesetze — old versions where _komplett exists
3. bp_compliance_gesetze — BGB section extracts (subset of bgb_komplett)
4. bp_compliance_gesetze — AT law duplicates (renamed copies)
5. bp_compliance_gesetze — stubs (1 chunk placeholders)
6. bp_compliance_gesetze — EU regulations already in bp_compliance_ce
7. bp_compliance_gesetze — dual-naming duplicates (keep newer/longer version)
8. bp_compliance_datenschutz — EDPB/WP duplicate ingestions

Run with --dry-run to preview deletions without executing.
"""

import argparse
import json
import sys
import time
import requests

# ─────────────────────────────────────────────────────────────────────────────
# Config — targets BOTH local and production Qdrant
# ─────────────────────────────────────────────────────────────────────────────

TARGETS = {
    "local": {
        "url": "http://macmini:6333",
        "api_key": None,
    },
    "production": {
        "url": "https://qdrant-dev.breakpilot.ai",
        "api_key": "z9cKbT74vl1aKPD1QGIlKWfET47VH93u",
    },
}

# ─────────────────────────────────────────────────────────────────────────────
# Deletion plan — regulation_ids to remove per collection
# ─────────────────────────────────────────────────────────────────────────────

# 1. bp_compliance_recht: DELETE ENTIRE COLLECTION
#    All 9 regulation_ids are already in bp_compliance_ce with same or more chunks

# 2. bp_compliance_gesetze: old versions (keep _komplett)
GESETZE_OLD_VERSIONS = [
    "ao",       # ao_komplett has 9,669 chunks vs ao's 1,752
    "bdsg",     # bdsg_2018_komplett has 1,056 vs bdsg's 389
    "egbgb",    # egbgb_komplett has 1,412 vs egbgb's 269
    "hgb",      # hgb_komplett has 11,363 vs hgb's 1,937
]

# 3. bp_compliance_gesetze: BGB section extracts (subset of bgb_komplett 4,024 chunks)
GESETZE_BGB_EXTRACTS = [
    "bgb_agb",         # 94 chunks
    "bgb_digital",     # 42 chunks
    "bgb_fernabsatz",  # 71 chunks
    "bgb_kaufrecht",   # 147 chunks
    "bgb_widerruf",    # 50 chunks
]

# 4. bp_compliance_gesetze: AT law duplicates (renamed copies with identical chunks)
GESETZE_AT_DUPLICATES = [
    "at_abgb_agb",  # 2,521 chunks = exact copy of at_abgb
    "at_bao_ret",    # 2,246 chunks = exact copy of at_bao
    "at_ugb_ret",    # 2,828 chunks = exact copy of at_ugb
]

# 5. bp_compliance_gesetze: stubs (1 chunk, incomplete ingestions)
GESETZE_STUBS = [
    "de_uwg",    # 1 chunk (uwg has 157)
    "de_pangv",  # 1 chunk (pangv has 99)
    "de_bsig",   # 1 chunk (standalone stub)
]

# 6. bp_compliance_gesetze: EU regulations already fully in bp_compliance_ce
#    CE has equal or more chunks for all of these
GESETZE_EU_CROSS_COLLECTION = [
    "eu_2016_679",   # GDPR: 423 in both
    "eu_2024_1689",  # AI Act: 726 in both
    "eu_2024_2847",  # CRA: 429 in gesetze, 1365 in CE
    "eu_2022_2555",  # NIS2: 344 in gesetze, 342 in CE (near-identical)
    "eu_2023_1230",  # Machinery: 395 in gesetze, 1271 in CE
]

# 7. bp_compliance_gesetze: dual-naming (keep the longer/newer version)
GESETZE_DUAL_NAMING = [
    "tkg",             # 1,391 chunks — de_tkg has 1,631 (keep de_tkg)
    "ustg",            # 915 chunks — de_ustg_ret has 1,071 (keep de_ustg_ret)
    "ddg_5",           # 40 chunks — ddg has 189 (section extract)
    "egbgb_widerruf",  # 36 chunks — egbgb_komplett has 1,412 (section extract)
]

# 8. bp_compliance_datenschutz: EDPB/WP duplicate ingestions (keep the longer-named version)
DATENSCHUTZ_DUPLICATES = [
    "edpb_rtbf_05_2019",           # 111 chunks — edpb_right_to_be_forgotten_05_2019 has 111 (keep long name)
    "edpb_vva_02_2021",            # 273 chunks — edpb_virtual_voice_assistant_02_2021 has 273 (keep long name)
    "edpb_01_2020",                # 337 chunks — edpb_transfers_01_2020 has 337 (keep long name)
    "wp242_portability",           # 141 chunks — wp242_right_portability has 141 (keep long name)
    "wp250_breach",                # 201 chunks — wp251_data_breach has 201 (keep long name)
    "wp244_profiling",             # 247 chunks — wp251_profiling has 247 (keep long name)
    "edpb_legitimate_interest",    # 672 chunks — edpb_legitimate_interest_01_2024 has 336 (keep dated version)
]

# ─────────────────────────────────────────────────────────────────────────────
# All gesetze deletions combined
# ─────────────────────────────────────────────────────────────────────────────

ALL_GESETZE_DELETIONS = (
    GESETZE_OLD_VERSIONS
    + GESETZE_BGB_EXTRACTS
    + GESETZE_AT_DUPLICATES
    + GESETZE_STUBS
    + GESETZE_EU_CROSS_COLLECTION
    + GESETZE_DUAL_NAMING
)


def log(msg):
    print(f"\033[0;32m[OK]\033[0m {msg}")

def warn(msg):
    print(f"\033[1;33m[WARN]\033[0m {msg}")

def info(msg):
    print(f"\033[0;36m[INFO]\033[0m {msg}")

def fail(msg):
    print(f"\033[0;31m[FAIL]\033[0m {msg}")


def make_session(target_config):
    """Create a requests session for the given target."""
    s = requests.Session()
    s.headers.update({"Content-Type": "application/json"})
    if target_config["api_key"]:
        s.headers.update({"api-key": target_config["api_key"]})
    s.timeout = 60
    return s


def count_by_regulation_id(session, url, collection, regulation_id):
    """Count points in a collection matching a regulation_id."""
    resp = session.post(
        f"{url}/collections/{collection}/points/count",
        json={
            "filter": {
                "must": [
                    {"key": "regulation_id", "match": {"value": regulation_id}}
                ]
            },
            "exact": True,
        },
    )
    if resp.status_code == 200:
        return resp.json().get("result", {}).get("count", 0)
    return -1


def count_collection(session, url, collection):
    """Get total point count for a collection."""
    resp = session.get(f"{url}/collections/{collection}")
    if resp.status_code == 200:
        return resp.json().get("result", {}).get("points_count", 0)
    return -1


def delete_by_regulation_id(session, url, collection, regulation_id, dry_run=True):
    """Delete all points in a collection matching a regulation_id."""
    count = count_by_regulation_id(session, url, collection, regulation_id)
    if count <= 0:
        if count == 0:
            info(f"  {collection}/{regulation_id}: 0 chunks (already clean)")
        else:
            warn(f"  {collection}/{regulation_id}: count failed")
        return 0

    if dry_run:
        info(f"  {collection}/{regulation_id}: {count} chunks (would delete)")
        return count

    resp = session.post(
        f"{url}/collections/{collection}/points/delete",
        json={
            "filter": {
                "must": [
                    {"key": "regulation_id", "match": {"value": regulation_id}}
                ]
            }
        },
    )
    if resp.status_code == 200:
        log(f"  {collection}/{regulation_id}: {count} chunks deleted")
        return count
    else:
        warn(f"  {collection}/{regulation_id}: delete failed ({resp.status_code}: {resp.text[:200]})")
        return 0


def delete_collection(session, url, collection, dry_run=True):
    """Delete an entire collection."""
    count = count_collection(session, url, collection)
    if count < 0:
        warn(f"  {collection}: not found or error")
        return 0

    if dry_run:
        info(f"  {collection}: {count} chunks total (would delete collection)")
        return count

    resp = session.delete(f"{url}/collections/{collection}")
    if resp.status_code == 200:
        log(f"  {collection}: deleted ({count} chunks)")
        return count
    else:
        warn(f"  {collection}: delete failed ({resp.status_code}: {resp.text[:200]})")
        return 0


def run_cleanup(target_name, target_config, dry_run=True):
    """Run the full cleanup for a single Qdrant target."""
    url = target_config["url"]
    session = make_session(target_config)

    print(f"\n{'='*60}")
    print(f"Target: {target_name} ({url})")
    print(f"Mode: {'DRY RUN' if dry_run else 'LIVE DELETE'}")
    print(f"{'='*60}")

    # Check connectivity
    try:
        resp = session.get(f"{url}/collections")
        resp.raise_for_status()
        collections = [c["name"] for c in resp.json().get("result", {}).get("collections", [])]
        info(f"Connected. Collections: {len(collections)}")
    except Exception as e:
        warn(f"Cannot connect to {url}: {e}")
        return

    total_deleted = 0

    # ── Step 1: Delete bp_compliance_recht ──
    print(f"\n--- Step 1: Delete bp_compliance_recht (100% subset of CE) ---")
    if "bp_compliance_recht" in collections:
        total_deleted += delete_collection(session, url, "bp_compliance_recht", dry_run)
    else:
        info("  bp_compliance_recht: not found (already deleted)")

    # ── Step 2: Delete old versions in gesetze ──
    print(f"\n--- Step 2: Delete old versions in bp_compliance_gesetze ---")
    print(f"    (ao, bdsg, egbgb, hgb — _komplett versions exist)")
    if "bp_compliance_gesetze" in collections:
        for reg_id in GESETZE_OLD_VERSIONS:
            total_deleted += delete_by_regulation_id(
                session, url, "bp_compliance_gesetze", reg_id, dry_run
            )
            time.sleep(0.2)

    # ── Step 3: Delete BGB section extracts ──
    print(f"\n--- Step 3: Delete BGB section extracts (bgb_komplett covers all) ---")
    if "bp_compliance_gesetze" in collections:
        for reg_id in GESETZE_BGB_EXTRACTS:
            total_deleted += delete_by_regulation_id(
                session, url, "bp_compliance_gesetze", reg_id, dry_run
            )
            time.sleep(0.2)

    # ── Step 4: Delete AT law duplicates ──
    print(f"\n--- Step 4: Delete Austrian law duplicates ---")
    if "bp_compliance_gesetze" in collections:
        for reg_id in GESETZE_AT_DUPLICATES:
            total_deleted += delete_by_regulation_id(
                session, url, "bp_compliance_gesetze", reg_id, dry_run
            )
            time.sleep(0.2)

    # ── Step 5: Delete stubs ──
    print(f"\n--- Step 5: Delete stub entries (1-chunk placeholders) ---")
    if "bp_compliance_gesetze" in collections:
        for reg_id in GESETZE_STUBS:
            total_deleted += delete_by_regulation_id(
                session, url, "bp_compliance_gesetze", reg_id, dry_run
            )
            time.sleep(0.2)

    # ── Step 6: Delete EU cross-collection duplicates from gesetze ──
    print(f"\n--- Step 6: Delete EU regulations from gesetze (keep CE) ---")
    if "bp_compliance_gesetze" in collections:
        for reg_id in GESETZE_EU_CROSS_COLLECTION:
            total_deleted += delete_by_regulation_id(
                session, url, "bp_compliance_gesetze", reg_id, dry_run
            )
            time.sleep(0.2)

    # ── Step 7: Delete dual-naming duplicates in gesetze ──
    print(f"\n--- Step 7: Delete dual-naming duplicates in gesetze ---")
    if "bp_compliance_gesetze" in collections:
        for reg_id in GESETZE_DUAL_NAMING:
            total_deleted += delete_by_regulation_id(
                session, url, "bp_compliance_gesetze", reg_id, dry_run
            )
            time.sleep(0.2)

    # ── Step 8: Delete EDPB/WP duplicates in datenschutz ──
    print(f"\n--- Step 8: Delete EDPB/WP duplicate ingestions ---")
    if "bp_compliance_datenschutz" in collections:
        for reg_id in DATENSCHUTZ_DUPLICATES:
            total_deleted += delete_by_regulation_id(
                session, url, "bp_compliance_datenschutz", reg_id, dry_run
            )
            time.sleep(0.2)

    # ── Summary ──
    print(f"\n{'='*60}")
    action = "would be deleted" if dry_run else "deleted"
    print(f"Total chunks {action}: {total_deleted:,}")
    print(f"{'='*60}\n")


def main():
    parser = argparse.ArgumentParser(description="Qdrant duplicate chunk cleanup")
    parser.add_argument(
        "--dry-run",
        action="store_true",
        default=False,
        help="Preview deletions without executing (default: false)",
    )
    parser.add_argument(
        "--target",
        choices=["local", "production", "both"],
        default="both",
        help="Which Qdrant instance to clean (default: both)",
    )
    args = parser.parse_args()

    if not args.dry_run:
        print("\n" + "!" * 60)
        print("  WARNING: LIVE DELETE MODE — chunks will be permanently removed!")
        print("!" * 60)
        answer = input("  Type 'DELETE' to confirm: ")
        if answer != "DELETE":
            print("  Aborted.")
            sys.exit(0)

    targets = (
        TARGETS.items()
        if args.target == "both"
        else [(args.target, TARGETS[args.target])]
    )

    for name, config in targets:
        run_cleanup(name, config, dry_run=args.dry_run)


if __name__ == "__main__":
    main()