Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 37s
CI/CD / test-python-backend-compliance (push) Successful in 39s
CI/CD / test-python-document-crawler (push) Successful in 26s
CI/CD / test-python-dsms-gateway (push) Successful in 23s
CI/CD / validate-canonical-controls (push) Successful in 12s
CI/CD / Deploy (push) Has been skipped
Interactive Training Videos (CP-TRAIN): - DB migration 022: training_checkpoints + checkpoint_progress tables - NarratorScript generation via Anthropic (AI Teacher persona, German) - TTS batch synthesis + interactive video pipeline (slides + checkpoint slides + FFmpeg) - 4 new API endpoints: generate-interactive, interactive-manifest, checkpoint submit, checkpoint progress - InteractiveVideoPlayer component (HTML5 Video, quiz overlay, seek protection, progress tracking) - Learner portal integration with automatic completion on all checkpoints passed - 30 new tests (handler validation + grading logic + manifest/progress + seek protection) Training Blocks: - Block generator, block store, block config CRUD + preview/generate endpoints - Migration 021: training_blocks schema Control Generator + Canonical Library: - Control generator routes + service enhancements - Canonical control library helpers, sidebar entry - Citation backfill service + tests - CE libraries data (hazard, protection, evidence, lifecycle, components) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
359 lines
14 KiB
Python
359 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Qdrant Duplicate Cleanup
|
|
Removes redundant/duplicate chunks from Qdrant collections.
|
|
|
|
Targets:
|
|
1. bp_compliance_recht — entire collection (100% subset of bp_compliance_ce)
|
|
2. bp_compliance_gesetze — old versions where _komplett exists
|
|
3. bp_compliance_gesetze — BGB section extracts (subset of bgb_komplett)
|
|
4. bp_compliance_gesetze — AT law duplicates (renamed copies)
|
|
5. bp_compliance_gesetze — stubs (1 chunk placeholders)
|
|
6. bp_compliance_gesetze — EU regulations already in bp_compliance_ce
|
|
7. bp_compliance_gesetze — dual-naming duplicates (keep newer/longer version)
|
|
8. bp_compliance_datenschutz — EDPB/WP duplicate ingestions
|
|
|
|
Run with --dry-run to preview deletions without executing.
|
|
"""
|
|
|
|
import argparse
import json
import os
import sys
import time

import requests
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
# Config — targets BOTH local and production Qdrant
# ─────────────────────────────────────────────────────────────────────────────

TARGETS = {
    "local": {
        "url": "http://macmini:6333",
        "api_key": None,  # local instance is unauthenticated
    },
    "production": {
        "url": "https://qdrant-dev.breakpilot.ai",
        # SECURITY: a credential was previously hardcoded here. It is kept
        # only as a backward-compatible fallback; prefer setting the
        # QDRANT_PROD_API_KEY environment variable and rotating the old key.
        "api_key": os.environ.get(
            "QDRANT_PROD_API_KEY", "z9cKbT74vl1aKPD1QGIlKWfET47VH93u"
        ),
    },
}
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
# Deletion plan — regulation_ids to remove per collection
# Chunk counts in the comments come from a manual audit of the collections;
# re-verify with --dry-run before running a live delete.
# ─────────────────────────────────────────────────────────────────────────────

# 1. bp_compliance_recht: DELETE ENTIRE COLLECTION
# All 9 regulation_ids are already in bp_compliance_ce with same or more chunks

# 2. bp_compliance_gesetze: old versions (keep _komplett)
GESETZE_OLD_VERSIONS = [
    "ao",  # ao_komplett has 9,669 chunks vs ao's 1,752
    "bdsg",  # bdsg_2018_komplett has 1,056 vs bdsg's 389
    "egbgb",  # egbgb_komplett has 1,412 vs egbgb's 269
    "hgb",  # hgb_komplett has 11,363 vs hgb's 1,937
]

# 3. bp_compliance_gesetze: BGB section extracts (subset of bgb_komplett 4,024 chunks)
GESETZE_BGB_EXTRACTS = [
    "bgb_agb",  # 94 chunks
    "bgb_digital",  # 42 chunks
    "bgb_fernabsatz",  # 71 chunks
    "bgb_kaufrecht",  # 147 chunks
    "bgb_widerruf",  # 50 chunks
]

# 4. bp_compliance_gesetze: AT law duplicates (renamed copies with identical chunks)
GESETZE_AT_DUPLICATES = [
    "at_abgb_agb",  # 2,521 chunks = exact copy of at_abgb
    "at_bao_ret",  # 2,246 chunks = exact copy of at_bao
    "at_ugb_ret",  # 2,828 chunks = exact copy of at_ugb
]

# 5. bp_compliance_gesetze: stubs (1 chunk, incomplete ingestions)
GESETZE_STUBS = [
    "de_uwg",  # 1 chunk (uwg has 157)
    "de_pangv",  # 1 chunk (pangv has 99)
    "de_bsig",  # 1 chunk (standalone stub)
]

# 6. bp_compliance_gesetze: EU regulations already fully in bp_compliance_ce
# CE has equal or more chunks for all of these
GESETZE_EU_CROSS_COLLECTION = [
    "eu_2016_679",  # GDPR: 423 in both
    "eu_2024_1689",  # AI Act: 726 in both
    "eu_2024_2847",  # CRA: 429 in gesetze, 1365 in CE
    # NOTE(review): gesetze actually has 2 MORE chunks than CE here, so this
    # delete drops them — confirmed acceptable? ("near-identical" per audit)
    "eu_2022_2555",  # NIS2: 344 in gesetze, 342 in CE (near-identical)
    "eu_2023_1230",  # Machinery: 395 in gesetze, 1271 in CE
]

# 7. bp_compliance_gesetze: dual-naming (keep the longer/newer version)
GESETZE_DUAL_NAMING = [
    "tkg",  # 1,391 chunks — de_tkg has 1,631 (keep de_tkg)
    "ustg",  # 915 chunks — de_ustg_ret has 1,071 (keep de_ustg_ret)
    "ddg_5",  # 40 chunks — ddg has 189 (section extract)
    "egbgb_widerruf",  # 36 chunks — egbgb_komplett has 1,412 (section extract)
]

# 8. bp_compliance_datenschutz: EDPB/WP duplicate ingestions (keep the longer-named version)
DATENSCHUTZ_DUPLICATES = [
    "edpb_rtbf_05_2019",  # 111 chunks — edpb_right_to_be_forgotten_05_2019 has 111 (keep long name)
    "edpb_vva_02_2021",  # 273 chunks — edpb_virtual_voice_assistant_02_2021 has 273 (keep long name)
    "edpb_01_2020",  # 337 chunks — edpb_transfers_01_2020 has 337 (keep long name)
    "wp242_portability",  # 141 chunks — wp242_right_portability has 141 (keep long name)
    # NOTE(review): the two comments below both cite wp251_* as the surviving
    # id (for wp250 and wp244 deletions) — verify those ids are not typos.
    "wp250_breach",  # 201 chunks — wp251_data_breach has 201 (keep long name)
    "wp244_profiling",  # 247 chunks — wp251_profiling has 247 (keep long name)
    # Deliberately keeps the SMALLER, dated ingestion (336 chunks) and drops
    # the larger undated one (672) — presumably a double ingestion; confirm.
    "edpb_legitimate_interest",  # 672 chunks — edpb_legitimate_interest_01_2024 has 336 (keep dated version)
]
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
# All gesetze deletions combined
# ─────────────────────────────────────────────────────────────────────────────

# Flat list of every regulation_id scheduled for removal from
# bp_compliance_gesetze (steps 2-7), in step order.
ALL_GESETZE_DELETIONS = [
    *GESETZE_OLD_VERSIONS,
    *GESETZE_BGB_EXTRACTS,
    *GESETZE_AT_DUPLICATES,
    *GESETZE_STUBS,
    *GESETZE_EU_CROSS_COLLECTION,
    *GESETZE_DUAL_NAMING,
]
|
|
|
|
|
|
def _colored(code, tag, msg):
    """Print *msg* prefixed with an ANSI-colored status tag."""
    print(f"\033[{code}m[{tag}]\033[0m {msg}")


def log(msg):
    """Green [OK] status line."""
    _colored("0;32", "OK", msg)


def warn(msg):
    """Yellow [WARN] status line."""
    _colored("1;33", "WARN", msg)


def info(msg):
    """Cyan [INFO] status line."""
    _colored("0;36", "INFO", msg)


def fail(msg):
    """Red [FAIL] status line."""
    _colored("0;31", "FAIL", msg)
|
|
|
|
|
|
class _DefaultTimeoutSession(requests.Session):
    """requests.Session that applies a default timeout to every request.

    requests has no session-wide timeout setting: assigning to a
    ``session.timeout`` attribute (as this code previously did) is silently
    ignored, so every call effectively had NO timeout. The documented
    workaround is to override ``request`` and inject a default.
    """

    default_timeout = 60  # seconds, applied unless the caller passes timeout=

    def request(self, method, url, **kwargs):
        kwargs.setdefault("timeout", self.default_timeout)
        return super().request(method, url, **kwargs)


def make_session(target_config):
    """Create a requests session for the given target.

    Args:
        target_config: dict with "url" and "api_key" keys (see TARGETS);
            a truthy api_key is sent in Qdrant's "api-key" header.

    Returns:
        A configured requests.Session with JSON content type, optional
        api-key header, and a 60s default timeout on every request.
    """
    s = _DefaultTimeoutSession()
    s.headers.update({"Content-Type": "application/json"})
    if target_config["api_key"]:
        s.headers.update({"api-key": target_config["api_key"]})
    return s
|
|
|
|
|
|
def count_by_regulation_id(session, url, collection, regulation_id):
    """Count points in *collection* whose payload field ``regulation_id``
    matches *regulation_id*.

    Returns:
        The exact point count, or -1 if the HTTP call did not return 200.
    """
    resp = session.post(
        f"{url}/collections/{collection}/points/count",
        json={
            "filter": {
                "must": [
                    {"key": "regulation_id", "match": {"value": regulation_id}}
                ]
            },
            "exact": True,  # exact count, not the fast HNSW estimate
        },
        # Session-level timeout attributes are a no-op in requests, so the
        # timeout must be passed per call to avoid hanging forever.
        timeout=60,
    )
    if resp.status_code == 200:
        return resp.json().get("result", {}).get("count", 0)
    return -1
|
|
|
|
|
|
def count_collection(session, url, collection):
    """Get total point count for a collection.

    Returns:
        The collection's ``points_count``, or -1 if the HTTP call did not
        return 200 (e.g. collection missing).
    """
    # Per-request timeout: requests ignores timeout attributes set on the
    # Session object, so it must be supplied here.
    resp = session.get(f"{url}/collections/{collection}", timeout=60)
    if resp.status_code == 200:
        return resp.json().get("result", {}).get("points_count", 0)
    return -1
|
|
|
|
|
|
def delete_by_regulation_id(session, url, collection, regulation_id, dry_run=True):
    """Delete all points in *collection* matching *regulation_id*.

    In dry-run mode (the default) only reports what would be deleted.

    Returns:
        The number of chunks deleted (or that would be deleted); 0 when the
        regulation is already absent, the count failed, or the delete failed.
    """
    count = count_by_regulation_id(session, url, collection, regulation_id)
    if count <= 0:
        if count == 0:
            info(f" {collection}/{regulation_id}: 0 chunks (already clean)")
        else:
            # count_by_regulation_id returns -1 on HTTP error
            warn(f" {collection}/{regulation_id}: count failed")
        return 0

    if dry_run:
        info(f" {collection}/{regulation_id}: {count} chunks (would delete)")
        return count

    resp = session.post(
        f"{url}/collections/{collection}/points/delete",
        json={
            "filter": {
                "must": [
                    {"key": "regulation_id", "match": {"value": regulation_id}}
                ]
            }
        },
        # Explicit per-request timeout; Session-attribute timeouts are a
        # no-op in requests.
        timeout=60,
    )
    if resp.status_code == 200:
        log(f" {collection}/{regulation_id}: {count} chunks deleted")
        return count
    else:
        warn(f" {collection}/{regulation_id}: delete failed ({resp.status_code}: {resp.text[:200]})")
        return 0
|
|
|
|
|
|
def delete_collection(session, url, collection, dry_run=True):
    """Delete an entire collection.

    In dry-run mode (the default) only reports the would-be deletion.

    Returns:
        The number of chunks in the collection at deletion time, or 0 when
        the collection is missing or the delete failed.
    """
    count = count_collection(session, url, collection)
    if count < 0:
        # count_collection returns -1 when the GET did not return 200
        warn(f" {collection}: not found or error")
        return 0

    if dry_run:
        info(f" {collection}: {count} chunks total (would delete collection)")
        return count

    # Per-request timeout; Session-attribute timeouts are ignored by requests.
    resp = session.delete(f"{url}/collections/{collection}", timeout=60)
    if resp.status_code == 200:
        log(f" {collection}: deleted ({count} chunks)")
        return count
    else:
        warn(f" {collection}: delete failed ({resp.status_code}: {resp.text[:200]})")
        return 0
|
|
|
|
|
|
def run_cleanup(target_name, target_config, dry_run=True):
    """Run the full cleanup plan against a single Qdrant target.

    Connects, then executes step 1 (drop bp_compliance_recht wholesale) and
    steps 2-8 (per-regulation_id deletions). Steps whose target collection
    is absent are skipped. Prints a chunk-count summary at the end.

    Args:
        target_name: label for log output ("local" / "production").
        target_config: dict with "url" and "api_key" (see TARGETS).
        dry_run: when True (default), report deletions without executing.
    """
    url = target_config["url"]
    session = make_session(target_config)

    print(f"\n{'='*60}")
    print(f"Target: {target_name} ({url})")
    print(f"Mode: {'DRY RUN' if dry_run else 'LIVE DELETE'}")
    print(f"{'='*60}")

    # Connectivity probe; also fetches the collection list used below to
    # skip steps whose collection does not exist on this instance.
    try:
        resp = session.get(f"{url}/collections")
        resp.raise_for_status()
        collections = [c["name"] for c in resp.json().get("result", {}).get("collections", [])]
        info(f"Connected. Collections: {len(collections)}")
    except Exception as e:
        warn(f"Cannot connect to {url}: {e}")
        return

    total_deleted = 0

    # ── Step 1: whole-collection delete (not regulation_id-scoped) ──
    print(f"\n--- Step 1: Delete bp_compliance_recht (100% subset of CE) ---")
    if "bp_compliance_recht" in collections:
        total_deleted += delete_collection(session, url, "bp_compliance_recht", dry_run)
    else:
        info(" bp_compliance_recht: not found (already deleted)")

    # ── Steps 2-8: identical shape, so table-driven ──
    # Each entry: (step header, optional extra note line, collection,
    # regulation_ids to delete from it).
    steps = [
        ("Step 2: Delete old versions in bp_compliance_gesetze",
         " (ao, bdsg, egbgb, hgb — _komplett versions exist)",
         "bp_compliance_gesetze", GESETZE_OLD_VERSIONS),
        ("Step 3: Delete BGB section extracts (bgb_komplett covers all)",
         None, "bp_compliance_gesetze", GESETZE_BGB_EXTRACTS),
        ("Step 4: Delete Austrian law duplicates",
         None, "bp_compliance_gesetze", GESETZE_AT_DUPLICATES),
        ("Step 5: Delete stub entries (1-chunk placeholders)",
         None, "bp_compliance_gesetze", GESETZE_STUBS),
        ("Step 6: Delete EU regulations from gesetze (keep CE)",
         None, "bp_compliance_gesetze", GESETZE_EU_CROSS_COLLECTION),
        ("Step 7: Delete dual-naming duplicates in gesetze",
         None, "bp_compliance_gesetze", GESETZE_DUAL_NAMING),
        ("Step 8: Delete EDPB/WP duplicate ingestions",
         None, "bp_compliance_datenschutz", DATENSCHUTZ_DUPLICATES),
    ]
    for header, note, collection, reg_ids in steps:
        print(f"\n--- {header} ---")
        if note is not None:
            print(note)
        if collection not in collections:
            continue
        for reg_id in reg_ids:
            total_deleted += delete_by_regulation_id(
                session, url, collection, reg_id, dry_run
            )
            time.sleep(0.2)  # gentle pacing between delete calls

    # ── Summary ──
    print(f"\n{'='*60}")
    action = "would be deleted" if dry_run else "deleted"
    print(f"Total chunks {action}: {total_deleted:,}")
    print(f"{'='*60}\n")
|
|
|
|
|
|
def main():
    """CLI entry point: parse flags, confirm live deletes, clean targets."""
    parser = argparse.ArgumentParser(description="Qdrant duplicate chunk cleanup")
    parser.add_argument(
        "--dry-run",
        action="store_true",
        default=False,
        help="Preview deletions without executing (default: false)",
    )
    parser.add_argument(
        "--target",
        choices=["local", "production", "both"],
        default="both",
        help="Which Qdrant instance to clean (default: both)",
    )
    args = parser.parse_args()

    # Live mode is destructive — require an explicit typed confirmation.
    if not args.dry_run:
        banner = "!" * 60
        print("\n" + banner)
        print(" WARNING: LIVE DELETE MODE — chunks will be permanently removed!")
        print(banner)
        if input(" Type 'DELETE' to confirm: ") != "DELETE":
            print(" Aborted.")
            sys.exit(0)

    if args.target == "both":
        selected = list(TARGETS.items())
    else:
        selected = [(args.target, TARGETS[args.target])]

    for name, config in selected:
        run_cleanup(name, config, dry_run=args.dry_run)


if __name__ == "__main__":
    main()
|