feat(training+controls): interactive video pipeline, training blocks, control generator, CE libraries
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 37s
CI/CD / test-python-backend-compliance (push) Successful in 39s
CI/CD / test-python-document-crawler (push) Successful in 26s
CI/CD / test-python-dsms-gateway (push) Successful in 23s
CI/CD / validate-canonical-controls (push) Successful in 12s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 37s
CI/CD / test-python-backend-compliance (push) Successful in 39s
CI/CD / test-python-document-crawler (push) Successful in 26s
CI/CD / test-python-dsms-gateway (push) Successful in 23s
CI/CD / validate-canonical-controls (push) Successful in 12s
CI/CD / Deploy (push) Has been skipped
Interactive Training Videos (CP-TRAIN): - DB migration 022: training_checkpoints + checkpoint_progress tables - NarratorScript generation via Anthropic (AI Teacher persona, German) - TTS batch synthesis + interactive video pipeline (slides + checkpoint slides + FFmpeg) - 4 new API endpoints: generate-interactive, interactive-manifest, checkpoint submit, checkpoint progress - InteractiveVideoPlayer component (HTML5 Video, quiz overlay, seek protection, progress tracking) - Learner portal integration with automatic completion on all checkpoints passed - 30 new tests (handler validation + grading logic + manifest/progress + seek protection) Training Blocks: - Block generator, block store, block config CRUD + preview/generate endpoints - Migration 021: training_blocks schema Control Generator + Canonical Library: - Control generator routes + service enhancements - Canonical control library helpers, sidebar entry - Citation backfill service + tests - CE libraries data (hazard, protection, evidence, lifecycle, components) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
358
scripts/cleanup-qdrant-duplicates.py
Normal file
358
scripts/cleanup-qdrant-duplicates.py
Normal file
@@ -0,0 +1,358 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Qdrant Duplicate Cleanup
|
||||
Removes redundant/duplicate chunks from Qdrant collections.
|
||||
|
||||
Targets:
|
||||
1. bp_compliance_recht — entire collection (100% subset of bp_compliance_ce)
|
||||
2. bp_compliance_gesetze — old versions where _komplett exists
|
||||
3. bp_compliance_gesetze — BGB section extracts (subset of bgb_komplett)
|
||||
4. bp_compliance_gesetze — AT law duplicates (renamed copies)
|
||||
5. bp_compliance_gesetze — stubs (1 chunk placeholders)
|
||||
6. bp_compliance_gesetze — EU regulations already in bp_compliance_ce
|
||||
7. bp_compliance_gesetze — dual-naming duplicates (keep newer/longer version)
|
||||
8. bp_compliance_datenschutz — EDPB/WP duplicate ingestions
|
||||
|
||||
Run with --dry-run to preview deletions without executing.
|
||||
"""
|
||||
|
||||
import argparse
import json
import os
import sys
import time

import requests
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
# Config — targets BOTH local and production Qdrant
# ─────────────────────────────────────────────────────────────────────────────

# Qdrant endpoints this script may clean.  The production API key can be
# supplied via the QDRANT_PROD_API_KEY environment variable; the literal
# fallback keeps existing invocations working, but the committed secret
# should be rotated and removed from source control.
TARGETS = {
    "local": {
        "url": "http://macmini:6333",
        "api_key": None,  # local instance runs without authentication
    },
    "production": {
        "url": "https://qdrant-dev.breakpilot.ai",
        # SECURITY: hard-coded credential retained only as a backward-
        # compatible fallback — prefer setting QDRANT_PROD_API_KEY.
        "api_key": os.environ.get(
            "QDRANT_PROD_API_KEY", "z9cKbT74vl1aKPD1QGIlKWfET47VH93u"
        ),
    },
}
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
# Deletion plan — regulation_ids to remove per collection
# ─────────────────────────────────────────────────────────────────────────────

# 1. bp_compliance_recht: the whole collection is removed — all 9 of its
#    regulation_ids already live in bp_compliance_ce with >= chunk counts.

# 2. bp_compliance_gesetze: superseded ingestions (a *_komplett version exists)
GESETZE_OLD_VERSIONS = [
    "ao",     # ao_komplett: 9,669 chunks vs 1,752 here
    "bdsg",   # bdsg_2018_komplett: 1,056 vs 389
    "egbgb",  # egbgb_komplett: 1,412 vs 269
    "hgb",    # hgb_komplett: 11,363 vs 1,937
]

# 3. bp_compliance_gesetze: BGB section extracts, all subsets of
#    bgb_komplett (4,024 chunks)
GESETZE_BGB_EXTRACTS = [
    "bgb_agb",         # 94 chunks
    "bgb_digital",     # 42 chunks
    "bgb_fernabsatz",  # 71 chunks
    "bgb_kaufrecht",   # 147 chunks
    "bgb_widerruf",    # 50 chunks
]

# 4. bp_compliance_gesetze: Austrian-law duplicates (renamed copies with
#    identical chunks)
GESETZE_AT_DUPLICATES = [
    "at_abgb_agb",  # 2,521 chunks = exact copy of at_abgb
    "at_bao_ret",   # 2,246 chunks = exact copy of at_bao
    "at_ugb_ret",   # 2,828 chunks = exact copy of at_ugb
]

# 5. bp_compliance_gesetze: one-chunk stubs left by incomplete ingestions
GESETZE_STUBS = [
    "de_uwg",    # 1 chunk (uwg has 157)
    "de_pangv",  # 1 chunk (pangv has 99)
    "de_bsig",   # 1 chunk (standalone stub)
]

# 6. bp_compliance_gesetze: EU regulations fully covered by bp_compliance_ce
#    (CE holds equal or more chunks for every id below)
GESETZE_EU_CROSS_COLLECTION = [
    "eu_2016_679",   # GDPR: 423 in both
    "eu_2024_1689",  # AI Act: 726 in both
    "eu_2024_2847",  # CRA: 429 in gesetze, 1365 in CE
    "eu_2022_2555",  # NIS2: 344 in gesetze, 342 in CE (near-identical)
    "eu_2023_1230",  # Machinery: 395 in gesetze, 1271 in CE
]

# 7. bp_compliance_gesetze: dual-naming pairs — drop the shorter/older twin
GESETZE_DUAL_NAMING = [
    "tkg",             # 1,391 chunks — de_tkg has 1,631 (keep de_tkg)
    "ustg",            # 915 chunks — de_ustg_ret has 1,071 (keep de_ustg_ret)
    "ddg_5",           # 40 chunks — ddg has 189 (section extract)
    "egbgb_widerruf",  # 36 chunks — egbgb_komplett has 1,412 (section extract)
]

# 8. bp_compliance_datenschutz: EDPB/WP papers ingested twice under two names
#    (keep the longer-named version)
DATENSCHUTZ_DUPLICATES = [
    "edpb_rtbf_05_2019",         # 111 chunks — edpb_right_to_be_forgotten_05_2019 has 111 (keep long name)
    "edpb_vva_02_2021",          # 273 chunks — edpb_virtual_voice_assistant_02_2021 has 273 (keep long name)
    "edpb_01_2020",              # 337 chunks — edpb_transfers_01_2020 has 337 (keep long name)
    "wp242_portability",         # 141 chunks — wp242_right_portability has 141 (keep long name)
    "wp250_breach",              # 201 chunks — wp251_data_breach has 201 (keep long name)
    "wp244_profiling",           # 247 chunks — wp251_profiling has 247 (keep long name)
    "edpb_legitimate_interest",  # 672 chunks — edpb_legitimate_interest_01_2024 has 336 (keep dated version)
]

# ─────────────────────────────────────────────────────────────────────────────
# All gesetze deletions combined
# ─────────────────────────────────────────────────────────────────────────────

# Every id scheduled for removal from bp_compliance_gesetze, in step order.
ALL_GESETZE_DELETIONS = [
    *GESETZE_OLD_VERSIONS,
    *GESETZE_BGB_EXTRACTS,
    *GESETZE_AT_DUPLICATES,
    *GESETZE_STUBS,
    *GESETZE_EU_CROSS_COLLECTION,
    *GESETZE_DUAL_NAMING,
]
|
||||
|
||||
|
||||
def log(msg):
    """Print *msg* to stdout behind a green ``[OK]`` tag (ANSI-colored)."""
    tag = "\033[0;32m[OK]\033[0m"
    print(f"{tag} {msg}")
|
||||
|
||||
def warn(msg):
    """Print *msg* to stdout behind a yellow ``[WARN]`` tag (ANSI-colored)."""
    tag = "\033[1;33m[WARN]\033[0m"
    print(f"{tag} {msg}")
|
||||
|
||||
def info(msg):
    """Print *msg* to stdout behind a cyan ``[INFO]`` tag (ANSI-colored)."""
    tag = "\033[0;36m[INFO]\033[0m"
    print(f"{tag} {msg}")
|
||||
|
||||
def fail(msg):
    """Print *msg* to stdout behind a red ``[FAIL]`` tag (ANSI-colored)."""
    tag = "\033[0;31m[FAIL]\033[0m"
    print(f"{tag} {msg}")
|
||||
|
||||
|
||||
def make_session(target_config):
    """Create a pre-configured ``requests.Session`` for a Qdrant target.

    Sets the JSON content type, attaches the ``api-key`` header when the
    target defines one, and enforces a 60-second timeout on every request
    issued through the session.
    """
    s = requests.Session()
    s.headers.update({"Content-Type": "application/json"})
    if target_config["api_key"]:
        s.headers.update({"api-key": target_config["api_key"]})

    # BUG FIX: requests does not honor a plain ``session.timeout`` attribute,
    # so the previous ``s.timeout = 60`` never limited anything.  Wrap
    # Session.request so the timeout is actually applied; explicit per-call
    # timeouts still take precedence via setdefault.
    _request = s.request

    def _request_with_timeout(method, url, **kwargs):
        kwargs.setdefault("timeout", 60)
        return _request(method, url, **kwargs)

    s.request = _request_with_timeout
    return s
|
||||
|
||||
|
||||
def count_by_regulation_id(session, url, collection, regulation_id):
    """Return the exact number of points in *collection* whose payload
    ``regulation_id`` equals *regulation_id*, or -1 when the request fails."""
    payload = {
        "filter": {
            "must": [
                {"key": "regulation_id", "match": {"value": regulation_id}}
            ]
        },
        "exact": True,  # ask Qdrant for a precise count, not an estimate
    }
    resp = session.post(f"{url}/collections/{collection}/points/count", json=payload)
    if resp.status_code != 200:
        return -1
    return resp.json().get("result", {}).get("count", 0)
|
||||
|
||||
|
||||
def count_collection(session, url, collection):
    """Return the total ``points_count`` of *collection*, or -1 on error."""
    resp = session.get(f"{url}/collections/{collection}")
    if resp.status_code != 200:
        return -1
    return resp.json().get("result", {}).get("points_count", 0)
|
||||
|
||||
|
||||
def delete_by_regulation_id(session, url, collection, regulation_id, dry_run=True):
    """Remove all points in *collection* whose payload matches *regulation_id*.

    Returns the number of chunks affected; 0 on no-op or failure.  With
    ``dry_run=True`` nothing is sent — the would-be deletion count is
    reported and returned instead.
    """
    count = count_by_regulation_id(session, url, collection, regulation_id)

    # Nothing to delete, or the count itself failed — report and bail out.
    if count == 0:
        info(f" {collection}/{regulation_id}: 0 chunks (already clean)")
        return 0
    if count < 0:
        warn(f" {collection}/{regulation_id}: count failed")
        return 0

    if dry_run:
        info(f" {collection}/{regulation_id}: {count} chunks (would delete)")
        return count

    payload = {
        "filter": {
            "must": [
                {"key": "regulation_id", "match": {"value": regulation_id}}
            ]
        }
    }
    resp = session.post(f"{url}/collections/{collection}/points/delete", json=payload)
    if resp.status_code != 200:
        warn(f" {collection}/{regulation_id}: delete failed ({resp.status_code}: {resp.text[:200]})")
        return 0
    log(f" {collection}/{regulation_id}: {count} chunks deleted")
    return count
|
||||
|
||||
|
||||
def delete_collection(session, url, collection, dry_run=True):
    """Drop an entire Qdrant collection.

    Returns the number of chunks removed (or that would be removed in
    dry-run mode); 0 when the collection is missing or the delete fails.
    """
    count = count_collection(session, url, collection)
    if count < 0:
        warn(f" {collection}: not found or error")
        return 0

    if dry_run:
        info(f" {collection}: {count} chunks total (would delete collection)")
        return count

    resp = session.delete(f"{url}/collections/{collection}")
    if resp.status_code != 200:
        warn(f" {collection}: delete failed ({resp.status_code}: {resp.text[:200]})")
        return 0
    log(f" {collection}: deleted ({count} chunks)")
    return count
|
||||
|
||||
|
||||
def run_cleanup(target_name, target_config, dry_run=True):
    """Run the full duplicate cleanup against a single Qdrant target.

    Connects, then executes the 8-step deletion plan defined by the
    module-level constants.  With ``dry_run=True`` every step only reports
    what it would delete.  Steps whose collection does not exist on this
    target are silently skipped.
    """
    url = target_config["url"]
    session = make_session(target_config)

    print(f"\n{'='*60}")
    print(f"Target: {target_name} ({url})")
    print(f"Mode: {'DRY RUN' if dry_run else 'LIVE DELETE'}")
    print(f"{'='*60}")

    # Check connectivity and fetch the collection list up front; every step
    # below consults it before doing any work.
    try:
        resp = session.get(f"{url}/collections")
        resp.raise_for_status()
        collections = [c["name"] for c in resp.json().get("result", {}).get("collections", [])]
        info(f"Connected. Collections: {len(collections)}")
    except Exception as e:
        warn(f"Cannot connect to {url}: {e}")
        return

    total_deleted = 0

    def _purge(collection, reg_ids):
        """Delete each regulation_id from *collection* (if present), throttled."""
        removed = 0
        if collection in collections:
            for reg_id in reg_ids:
                removed += delete_by_regulation_id(
                    session, url, collection, reg_id, dry_run
                )
                time.sleep(0.2)  # throttle so we don't hammer the server
        return removed

    # ── Step 1: the whole bp_compliance_recht collection is redundant ──
    print("\n--- Step 1: Delete bp_compliance_recht (100% subset of CE) ---")
    if "bp_compliance_recht" in collections:
        total_deleted += delete_collection(session, url, "bp_compliance_recht", dry_run)
    else:
        info(" bp_compliance_recht: not found (already deleted)")

    # ── Steps 2-8: per-regulation deletions.  One (title, extra-line,
    # collection, id-list) tuple per step replaces six copy-pasted loops.
    steps = [
        ("Step 2: Delete old versions in bp_compliance_gesetze",
         " (ao, bdsg, egbgb, hgb — _komplett versions exist)",
         "bp_compliance_gesetze", GESETZE_OLD_VERSIONS),
        ("Step 3: Delete BGB section extracts (bgb_komplett covers all)",
         None, "bp_compliance_gesetze", GESETZE_BGB_EXTRACTS),
        ("Step 4: Delete Austrian law duplicates",
         None, "bp_compliance_gesetze", GESETZE_AT_DUPLICATES),
        ("Step 5: Delete stub entries (1-chunk placeholders)",
         None, "bp_compliance_gesetze", GESETZE_STUBS),
        ("Step 6: Delete EU regulations from gesetze (keep CE)",
         None, "bp_compliance_gesetze", GESETZE_EU_CROSS_COLLECTION),
        ("Step 7: Delete dual-naming duplicates in gesetze",
         None, "bp_compliance_gesetze", GESETZE_DUAL_NAMING),
        ("Step 8: Delete EDPB/WP duplicate ingestions",
         None, "bp_compliance_datenschutz", DATENSCHUTZ_DUPLICATES),
    ]
    for title, extra, collection, reg_ids in steps:
        print(f"\n--- {title} ---")
        if extra is not None:
            print(extra)
        total_deleted += _purge(collection, reg_ids)

    # ── Summary ──
    print(f"\n{'='*60}")
    action = "would be deleted" if dry_run else "deleted"
    print(f"Total chunks {action}: {total_deleted:,}")
    print(f"{'='*60}\n")
|
||||
|
||||
|
||||
def main():
    """Parse CLI options, confirm live deletions, and clean each target."""
    parser = argparse.ArgumentParser(description="Qdrant duplicate chunk cleanup")
    parser.add_argument("--dry-run", action="store_true", default=False,
                        help="Preview deletions without executing (default: false)")
    parser.add_argument("--target", choices=["local", "production", "both"],
                        default="both",
                        help="Which Qdrant instance to clean (default: both)")
    args = parser.parse_args()

    # Live mode is destructive — demand an explicit typed confirmation.
    if not args.dry_run:
        print("\n" + "!" * 60)
        print(" WARNING: LIVE DELETE MODE — chunks will be permanently removed!")
        print("!" * 60)
        answer = input(" Type 'DELETE' to confirm: ")
        if answer != "DELETE":
            print(" Aborted.")
            sys.exit(0)

    if args.target == "both":
        selected = list(TARGETS.items())
    else:
        selected = [(args.target, TARGETS[args.target])]

    for name, config in selected:
        run_cleanup(name, config, dry_run=args.dry_run)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user