Files
breakpilot-compliance/scripts/cleanup-qdrant-duplicates.py
Benjamin Admin 4f6bc8f6f6
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 37s
CI/CD / test-python-backend-compliance (push) Successful in 39s
CI/CD / test-python-document-crawler (push) Successful in 26s
CI/CD / test-python-dsms-gateway (push) Successful in 23s
CI/CD / validate-canonical-controls (push) Successful in 12s
CI/CD / Deploy (push) Has been skipped
feat(training+controls): interactive video pipeline, training blocks, control generator, CE libraries
Interactive Training Videos (CP-TRAIN):
- DB migration 022: training_checkpoints + checkpoint_progress tables
- NarratorScript generation via Anthropic (AI Teacher persona, German)
- TTS batch synthesis + interactive video pipeline (slides + checkpoint slides + FFmpeg)
- 4 new API endpoints: generate-interactive, interactive-manifest, checkpoint submit, checkpoint progress
- InteractiveVideoPlayer component (HTML5 Video, quiz overlay, seek protection, progress tracking)
- Learner portal integration with automatic completion on all checkpoints passed
- 30 new tests (handler validation + grading logic + manifest/progress + seek protection)

Training Blocks:
- Block generator, block store, block config CRUD + preview/generate endpoints
- Migration 021: training_blocks schema

Control Generator + Canonical Library:
- Control generator routes + service enhancements
- Canonical control library helpers, sidebar entry
- Citation backfill service + tests
- CE libraries data (hazard, protection, evidence, lifecycle, components)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 21:41:48 +01:00

359 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Qdrant Duplicate Cleanup
Removes redundant/duplicate chunks from Qdrant collections.
Targets:
1. bp_compliance_recht — entire collection (100% subset of bp_compliance_ce)
2. bp_compliance_gesetze — old versions where _komplett exists
3. bp_compliance_gesetze — BGB section extracts (subset of bgb_komplett)
4. bp_compliance_gesetze — AT law duplicates (renamed copies)
5. bp_compliance_gesetze — stubs (1 chunk placeholders)
6. bp_compliance_gesetze — EU regulations already in bp_compliance_ce
7. bp_compliance_gesetze — dual-naming duplicates (keep newer/longer version)
8. bp_compliance_datenschutz — EDPB/WP duplicate ingestions
Run with --dry-run to preview deletions without executing.
"""
import argparse
import json
import sys
import time
import requests
# ─────────────────────────────────────────────────────────────────────────────
# Config — targets BOTH local and production Qdrant
# ─────────────────────────────────────────────────────────────────────────────
TARGETS = {
"local": {
"url": "http://macmini:6333",
"api_key": None,
},
"production": {
"url": "https://qdrant-dev.breakpilot.ai",
"api_key": "z9cKbT74vl1aKPD1QGIlKWfET47VH93u",
},
}
# ─────────────────────────────────────────────────────────────────────────────
# Deletion plan — regulation_ids to remove per collection
# ─────────────────────────────────────────────────────────────────────────────
# 1. bp_compliance_recht: DELETE ENTIRE COLLECTION
# All 9 regulation_ids are already in bp_compliance_ce with same or more chunks
# 2. bp_compliance_gesetze: old versions (keep _komplett)
GESETZE_OLD_VERSIONS = [
"ao", # ao_komplett has 9,669 chunks vs ao's 1,752
"bdsg", # bdsg_2018_komplett has 1,056 vs bdsg's 389
"egbgb", # egbgb_komplett has 1,412 vs egbgb's 269
"hgb", # hgb_komplett has 11,363 vs hgb's 1,937
]
# 3. bp_compliance_gesetze: BGB section extracts (subset of bgb_komplett 4,024 chunks)
GESETZE_BGB_EXTRACTS = [
"bgb_agb", # 94 chunks
"bgb_digital", # 42 chunks
"bgb_fernabsatz", # 71 chunks
"bgb_kaufrecht", # 147 chunks
"bgb_widerruf", # 50 chunks
]
# 4. bp_compliance_gesetze: AT law duplicates (renamed copies with identical chunks)
GESETZE_AT_DUPLICATES = [
"at_abgb_agb", # 2,521 chunks = exact copy of at_abgb
"at_bao_ret", # 2,246 chunks = exact copy of at_bao
"at_ugb_ret", # 2,828 chunks = exact copy of at_ugb
]
# 5. bp_compliance_gesetze: stubs (1 chunk, incomplete ingestions)
GESETZE_STUBS = [
"de_uwg", # 1 chunk (uwg has 157)
"de_pangv", # 1 chunk (pangv has 99)
"de_bsig", # 1 chunk (standalone stub)
]
# 6. bp_compliance_gesetze: EU regulations already fully in bp_compliance_ce
# CE has equal or more chunks for all of these
GESETZE_EU_CROSS_COLLECTION = [
"eu_2016_679", # GDPR: 423 in both
"eu_2024_1689", # AI Act: 726 in both
"eu_2024_2847", # CRA: 429 in gesetze, 1365 in CE
"eu_2022_2555", # NIS2: 344 in gesetze, 342 in CE (near-identical)
"eu_2023_1230", # Machinery: 395 in gesetze, 1271 in CE
]
# 7. bp_compliance_gesetze: dual-naming (keep the longer/newer version)
GESETZE_DUAL_NAMING = [
"tkg", # 1,391 chunks — de_tkg has 1,631 (keep de_tkg)
"ustg", # 915 chunks — de_ustg_ret has 1,071 (keep de_ustg_ret)
"ddg_5", # 40 chunks — ddg has 189 (section extract)
"egbgb_widerruf", # 36 chunks — egbgb_komplett has 1,412 (section extract)
]
# 8. bp_compliance_datenschutz: EDPB/WP duplicate ingestions (keep the longer-named version)
DATENSCHUTZ_DUPLICATES = [
"edpb_rtbf_05_2019", # 111 chunks — edpb_right_to_be_forgotten_05_2019 has 111 (keep long name)
"edpb_vva_02_2021", # 273 chunks — edpb_virtual_voice_assistant_02_2021 has 273 (keep long name)
"edpb_01_2020", # 337 chunks — edpb_transfers_01_2020 has 337 (keep long name)
"wp242_portability", # 141 chunks — wp242_right_portability has 141 (keep long name)
"wp250_breach", # 201 chunks — wp251_data_breach has 201 (keep long name)
"wp244_profiling", # 247 chunks — wp251_profiling has 247 (keep long name)
"edpb_legitimate_interest", # 672 chunks — edpb_legitimate_interest_01_2024 has 336 (keep dated version)
]
# ─────────────────────────────────────────────────────────────────────────────
# All gesetze deletions combined
# ─────────────────────────────────────────────────────────────────────────────
ALL_GESETZE_DELETIONS = (
GESETZE_OLD_VERSIONS
+ GESETZE_BGB_EXTRACTS
+ GESETZE_AT_DUPLICATES
+ GESETZE_STUBS
+ GESETZE_EU_CROSS_COLLECTION
+ GESETZE_DUAL_NAMING
)
def log(msg):
print(f"\033[0;32m[OK]\033[0m {msg}")
def warn(msg):
print(f"\033[1;33m[WARN]\033[0m {msg}")
def info(msg):
print(f"\033[0;36m[INFO]\033[0m {msg}")
def fail(msg):
print(f"\033[0;31m[FAIL]\033[0m {msg}")
def make_session(target_config):
"""Create a requests session for the given target."""
s = requests.Session()
s.headers.update({"Content-Type": "application/json"})
if target_config["api_key"]:
s.headers.update({"api-key": target_config["api_key"]})
s.timeout = 60
return s
def count_by_regulation_id(session, url, collection, regulation_id):
"""Count points in a collection matching a regulation_id."""
resp = session.post(
f"{url}/collections/{collection}/points/count",
json={
"filter": {
"must": [
{"key": "regulation_id", "match": {"value": regulation_id}}
]
},
"exact": True,
},
)
if resp.status_code == 200:
return resp.json().get("result", {}).get("count", 0)
return -1
def count_collection(session, url, collection):
"""Get total point count for a collection."""
resp = session.get(f"{url}/collections/{collection}")
if resp.status_code == 200:
return resp.json().get("result", {}).get("points_count", 0)
return -1
def delete_by_regulation_id(session, url, collection, regulation_id, dry_run=True):
"""Delete all points in a collection matching a regulation_id."""
count = count_by_regulation_id(session, url, collection, regulation_id)
if count <= 0:
if count == 0:
info(f" {collection}/{regulation_id}: 0 chunks (already clean)")
else:
warn(f" {collection}/{regulation_id}: count failed")
return 0
if dry_run:
info(f" {collection}/{regulation_id}: {count} chunks (would delete)")
return count
resp = session.post(
f"{url}/collections/{collection}/points/delete",
json={
"filter": {
"must": [
{"key": "regulation_id", "match": {"value": regulation_id}}
]
}
},
)
if resp.status_code == 200:
log(f" {collection}/{regulation_id}: {count} chunks deleted")
return count
else:
warn(f" {collection}/{regulation_id}: delete failed ({resp.status_code}: {resp.text[:200]})")
return 0
def delete_collection(session, url, collection, dry_run=True):
"""Delete an entire collection."""
count = count_collection(session, url, collection)
if count < 0:
warn(f" {collection}: not found or error")
return 0
if dry_run:
info(f" {collection}: {count} chunks total (would delete collection)")
return count
resp = session.delete(f"{url}/collections/{collection}")
if resp.status_code == 200:
log(f" {collection}: deleted ({count} chunks)")
return count
else:
warn(f" {collection}: delete failed ({resp.status_code}: {resp.text[:200]})")
return 0
def run_cleanup(target_name, target_config, dry_run=True):
"""Run the full cleanup for a single Qdrant target."""
url = target_config["url"]
session = make_session(target_config)
print(f"\n{'='*60}")
print(f"Target: {target_name} ({url})")
print(f"Mode: {'DRY RUN' if dry_run else 'LIVE DELETE'}")
print(f"{'='*60}")
# Check connectivity
try:
resp = session.get(f"{url}/collections")
resp.raise_for_status()
collections = [c["name"] for c in resp.json().get("result", {}).get("collections", [])]
info(f"Connected. Collections: {len(collections)}")
except Exception as e:
warn(f"Cannot connect to {url}: {e}")
return
total_deleted = 0
# ── Step 1: Delete bp_compliance_recht ──
print(f"\n--- Step 1: Delete bp_compliance_recht (100% subset of CE) ---")
if "bp_compliance_recht" in collections:
total_deleted += delete_collection(session, url, "bp_compliance_recht", dry_run)
else:
info(" bp_compliance_recht: not found (already deleted)")
# ── Step 2: Delete old versions in gesetze ──
print(f"\n--- Step 2: Delete old versions in bp_compliance_gesetze ---")
print(f" (ao, bdsg, egbgb, hgb — _komplett versions exist)")
if "bp_compliance_gesetze" in collections:
for reg_id in GESETZE_OLD_VERSIONS:
total_deleted += delete_by_regulation_id(
session, url, "bp_compliance_gesetze", reg_id, dry_run
)
time.sleep(0.2)
# ── Step 3: Delete BGB section extracts ──
print(f"\n--- Step 3: Delete BGB section extracts (bgb_komplett covers all) ---")
if "bp_compliance_gesetze" in collections:
for reg_id in GESETZE_BGB_EXTRACTS:
total_deleted += delete_by_regulation_id(
session, url, "bp_compliance_gesetze", reg_id, dry_run
)
time.sleep(0.2)
# ── Step 4: Delete AT law duplicates ──
print(f"\n--- Step 4: Delete Austrian law duplicates ---")
if "bp_compliance_gesetze" in collections:
for reg_id in GESETZE_AT_DUPLICATES:
total_deleted += delete_by_regulation_id(
session, url, "bp_compliance_gesetze", reg_id, dry_run
)
time.sleep(0.2)
# ── Step 5: Delete stubs ──
print(f"\n--- Step 5: Delete stub entries (1-chunk placeholders) ---")
if "bp_compliance_gesetze" in collections:
for reg_id in GESETZE_STUBS:
total_deleted += delete_by_regulation_id(
session, url, "bp_compliance_gesetze", reg_id, dry_run
)
time.sleep(0.2)
# ── Step 6: Delete EU cross-collection duplicates from gesetze ──
print(f"\n--- Step 6: Delete EU regulations from gesetze (keep CE) ---")
if "bp_compliance_gesetze" in collections:
for reg_id in GESETZE_EU_CROSS_COLLECTION:
total_deleted += delete_by_regulation_id(
session, url, "bp_compliance_gesetze", reg_id, dry_run
)
time.sleep(0.2)
# ── Step 7: Delete dual-naming duplicates in gesetze ──
print(f"\n--- Step 7: Delete dual-naming duplicates in gesetze ---")
if "bp_compliance_gesetze" in collections:
for reg_id in GESETZE_DUAL_NAMING:
total_deleted += delete_by_regulation_id(
session, url, "bp_compliance_gesetze", reg_id, dry_run
)
time.sleep(0.2)
# ── Step 8: Delete EDPB/WP duplicates in datenschutz ──
print(f"\n--- Step 8: Delete EDPB/WP duplicate ingestions ---")
if "bp_compliance_datenschutz" in collections:
for reg_id in DATENSCHUTZ_DUPLICATES:
total_deleted += delete_by_regulation_id(
session, url, "bp_compliance_datenschutz", reg_id, dry_run
)
time.sleep(0.2)
# ── Summary ──
print(f"\n{'='*60}")
action = "would be deleted" if dry_run else "deleted"
print(f"Total chunks {action}: {total_deleted:,}")
print(f"{'='*60}\n")
def main():
parser = argparse.ArgumentParser(description="Qdrant duplicate chunk cleanup")
parser.add_argument(
"--dry-run",
action="store_true",
default=False,
help="Preview deletions without executing (default: false)",
)
parser.add_argument(
"--target",
choices=["local", "production", "both"],
default="both",
help="Which Qdrant instance to clean (default: both)",
)
args = parser.parse_args()
if not args.dry_run:
print("\n" + "!" * 60)
print(" WARNING: LIVE DELETE MODE — chunks will be permanently removed!")
print("!" * 60)
answer = input(" Type 'DELETE' to confirm: ")
if answer != "DELETE":
print(" Aborted.")
sys.exit(0)
targets = (
TARGETS.items()
if args.target == "both"
else [(args.target, TARGETS[args.target])]
)
for name, config in targets:
run_cleanup(name, config, dry_run=args.dry_run)
if __name__ == "__main__":
main()