feat(control-pipeline): incremental dedup + ENISA CRA ingestion
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 43s
CI / test-python-voice (push) Successful in 33s
CI / test-bqas (push) Successful in 37s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 43s
CI / test-python-voice (push) Successful in 33s
CI / test-bqas (push) Successful in 37s
BatchDedup since-Parameter (services/batch_dedup_runner.py + api):
- Neuer Parameter 'since: datetime' beschraenkt die SQL-Abfragen von Phase 1 und Phase 2 auf created_at >= since.
- Der Phase-2-Checkpoint wird beim scoped Lauf geloescht (verhindert das Ueberspringen neuer Atomics, deren control_id alphabetisch vor dem stale last_id liegt).
- 6-13x schneller fuer nachgeschobene Dokumente (19k statt 172k Atomics).
- Doku: control-pipeline/docs/incremental-dedup.md.

Neue Scripts:
- gpre1_object_groups_incremental.py: Append neuer Objects an object_groups via bge-m3 nearest-neighbor (threshold default 0.85, empfehlbar 0.78 fuer breiteres Synonym-Matching). Pure INSERT/UPDATE, kein DELETE.
- gpre2_master_controls_incremental.py: Non-destructive Master-Controls-Update. Existing MCs bleiben unangetastet (UUIDs + master_control_id bleiben erhalten); es werden nur neue Members appended und neue MCs fuer Object-Groups angelegt, die jetzt min-phases erreichen.
- ingest_enisa_cra.py: Ingestion der 8 CRA-relevanten ENISA-Dokumente (Standards Mapping, EUCC-Implementation, NIS2 TIG, SRP FAQ, EUCC Eval Methodology, CVD Policies, Threat Landscape 2025). chunk_strategy=legal, requirement_strength=guidance|consultation_draft|evidentiary.

Quelldaten: legal-sources/enisa/enisa_cra_single_reporting_platform_faq.html (PDFs sind .gitignore-gefiltert).

Ergebnis dieser Pipeline-Iteration:
- 1.296 neue CRA-Controls + 19.652 atomare Children
- +362 neue Master-Controls, 10.017 existing erweitert
- Total: 13.950 MCs, 620 CRA-MCs (vorher 566), 1.304 CRA-Atomics (vorher 841)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -22,6 +22,7 @@ import json
|
||||
import logging
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
|
||||
from sqlalchemy import text
|
||||
|
||||
@@ -108,24 +109,37 @@ class BatchDedupRunner:
|
||||
self._progress_phase = ""
|
||||
self._progress_count = 0
|
||||
self._progress_total = 0
|
||||
self._since = None # set by run() when scoped run requested
|
||||
|
||||
async def run(
|
||||
self,
|
||||
dry_run: bool = False,
|
||||
hint_filter: str = None,
|
||||
since: datetime = None,
|
||||
) -> dict:
|
||||
"""Run the full batch dedup pipeline.
|
||||
|
||||
Args:
|
||||
dry_run: If True, compute stats but don't modify DB/Qdrant.
|
||||
hint_filter: If set, only process groups matching this hint prefix.
|
||||
since: If set, only process controls with created_at >= since.
|
||||
Useful for incremental dedup after single-document ingestion.
|
||||
|
||||
Returns:
|
||||
Stats dict with counts.
|
||||
"""
|
||||
start = time.monotonic()
|
||||
logger.info("BatchDedup starting (dry_run=%s, hint_filter=%s)",
|
||||
dry_run, hint_filter)
|
||||
logger.info("BatchDedup starting (dry_run=%s, hint_filter=%s, since=%s)",
|
||||
dry_run, hint_filter, since)
|
||||
|
||||
# Scoped runs reset checkpoint to avoid skipping new controls whose
|
||||
# control_id sorts before the stale last_id of a previous full run.
|
||||
self._since = since
|
||||
if since and not dry_run:
|
||||
self.db.execute(text(
|
||||
"DELETE FROM canonical_generation_jobs WHERE status = 'dedup_phase2_checkpoint'"
|
||||
))
|
||||
self.db.commit()
|
||||
|
||||
if not dry_run:
|
||||
await ensure_qdrant_collection(collection=self.collection)
|
||||
@@ -133,7 +147,7 @@ class BatchDedupRunner:
|
||||
# Phase 1: Intra-group dedup (same merge_group_hint)
|
||||
# Optimization: skip singleton groups (they're automatically masters)
|
||||
self._progress_phase = "phase1"
|
||||
groups = self._load_merge_groups(hint_filter)
|
||||
groups = self._load_merge_groups(hint_filter, since)
|
||||
self._progress_total = self.stats["total_controls"]
|
||||
|
||||
multi_groups = [(h, c) for h, c in groups if len(c) > 1]
|
||||
@@ -171,7 +185,7 @@ class BatchDedupRunner:
|
||||
logger.info("BatchDedup completed in %.1fs: %s", elapsed, self.stats)
|
||||
return self.stats
|
||||
|
||||
def _load_merge_groups(self, hint_filter: str = None) -> list:
|
||||
def _load_merge_groups(self, hint_filter: str = None, since: datetime = None) -> list:
|
||||
"""Load all Pass 0b controls grouped by merge_group_hint, largest first."""
|
||||
conditions = [
|
||||
"decomposition_method = 'pass0b'",
|
||||
@@ -184,6 +198,10 @@ class BatchDedupRunner:
|
||||
conditions.append("generation_metadata->>'merge_group_hint' LIKE :hf")
|
||||
params["hf"] = f"{hint_filter}%"
|
||||
|
||||
if since:
|
||||
conditions.append("created_at >= :since")
|
||||
params["since"] = since
|
||||
|
||||
where = " AND ".join(conditions)
|
||||
rows = self.db.execute(text(f"""
|
||||
SELECT id::text, control_id, title, objective,
|
||||
@@ -335,13 +353,15 @@ class BatchDedupRunner:
|
||||
"""
|
||||
logger.info("BatchDedup Phase 2: Cross-group pass starting...")
|
||||
|
||||
# Count total
|
||||
total_row = self.db.execute(text("""
|
||||
# Count total — respect scoped run if since is set
|
||||
since_clause = " AND created_at >= :since" if self._since else ""
|
||||
params = {"since": self._since} if self._since else {}
|
||||
total_row = self.db.execute(text(f"""
|
||||
SELECT COUNT(*) FROM canonical_controls
|
||||
WHERE decomposition_method = 'pass0b'
|
||||
AND release_state != 'duplicate'
|
||||
AND release_state != 'deprecated'
|
||||
""")).fetchone()
|
||||
AND release_state != 'deprecated'{since_clause}
|
||||
"""), params).fetchone()
|
||||
total = total_row[0] if total_row else 0
|
||||
|
||||
self._progress_total = total
|
||||
@@ -360,13 +380,16 @@ class BatchDedupRunner:
|
||||
last_control_id = checkpoint_row[0] if checkpoint_row else ""
|
||||
|
||||
if last_control_id:
|
||||
skip_row = self.db.execute(text("""
|
||||
skip_params = {"last_id": last_control_id}
|
||||
if self._since:
|
||||
skip_params["since"] = self._since
|
||||
skip_row = self.db.execute(text(f"""
|
||||
SELECT COUNT(*) FROM canonical_controls
|
||||
WHERE decomposition_method = 'pass0b'
|
||||
AND release_state != 'duplicate'
|
||||
AND release_state != 'deprecated'
|
||||
AND control_id <= :last_id
|
||||
"""), {"last_id": last_control_id}).fetchone()
|
||||
AND control_id <= :last_id{since_clause}
|
||||
"""), skip_params).fetchone()
|
||||
skipped = skip_row[0] if skip_row else 0
|
||||
self._progress_count = skipped
|
||||
logger.info("BatchDedup Cross-group: RESUMING from %s (skipping %d already processed)",
|
||||
@@ -382,17 +405,20 @@ class BatchDedupRunner:
|
||||
total, last_control_id or "beginning")
|
||||
|
||||
while True:
|
||||
rows = self.db.execute(text("""
|
||||
page_params = {"last_id": last_control_id, "page_size": DB_PAGE}
|
||||
if self._since:
|
||||
page_params["since"] = self._since
|
||||
rows = self.db.execute(text(f"""
|
||||
SELECT id::text, control_id, title,
|
||||
generation_metadata->>'merge_group_hint' as merge_group_hint
|
||||
FROM canonical_controls
|
||||
WHERE decomposition_method = 'pass0b'
|
||||
AND release_state != 'duplicate'
|
||||
AND release_state != 'deprecated'
|
||||
AND control_id > :last_id
|
||||
AND control_id > :last_id{since_clause}
|
||||
ORDER BY control_id
|
||||
LIMIT :page_size
|
||||
"""), {"last_id": last_control_id, "page_size": DB_PAGE}).fetchall()
|
||||
"""), page_params).fetchall()
|
||||
|
||||
if not rows:
|
||||
break
|
||||
|
||||
Reference in New Issue
Block a user