feat(control-pipeline): incremental dedup + ENISA CRA ingestion
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 43s
CI / test-python-voice (push) Successful in 33s
CI / test-bqas (push) Successful in 37s

BatchDedup since-Parameter (services/batch_dedup_runner.py + api):
- Neuer Parameter 'since: datetime' beschraenkt die SQL-Abfragen von Phase 1 + Phase 2 auf created_at >= since.
- Phase-2-Checkpoint wird beim scoped Lauf geloescht (verhindert, dass neue Atomics
  uebersprungen werden, deren control_id alphabetisch vor dem stale last_id liegt).
- 6-13x schneller fuer nachgeschobene Dokumente (19k statt 172k Atomics).
- Doku: control-pipeline/docs/incremental-dedup.md.

Neue Scripts:
- gpre1_object_groups_incremental.py: Append neuer Objects an object_groups via
  bge-m3 nearest-neighbor (threshold default 0.85, empfehlbar 0.78 fuer breiteres
  Synonym-Matching). Pure INSERT/UPDATE, kein DELETE.
- gpre2_master_controls_incremental.py: Non-destructive Master-Controls-Update.
  Existing MCs unangetastet (UUIDs + master_control_id bleiben), nur neue Members
  appended + neue MCs fuer Object-Groups, die jetzt min-phases erreichen.
- ingest_enisa_cra.py: Ingestion der 8 CRA-relevanten ENISA-Dokumente
  (Standards Mapping, EUCC-Implementation, NIS2 TIG, SRP FAQ, EUCC Eval Methodology,
  CVD Policies, Threat Landscape 2025). chunk_strategy=legal,
  requirement_strength=guidance|consultation_draft|evidentiary.

Quelldaten: legal-sources/enisa/enisa_cra_single_reporting_platform_faq.html
(PDFs sind .gitignore-gefiltert).

Ergebnis dieser Pipeline-Iteration:
- 1.296 neue CRA-Controls + 19.652 atomare Children
- +362 neue Master-Controls, 10.017 existing erweitert
- Total: 13.950 MCs, 620 CRA-MCs (vorher 566), 1.304 CRA-Atomics (vorher 841)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-18 18:21:46 +02:00
parent 47d7beeb52
commit 9783657da3
7 changed files with 1895 additions and 15 deletions
+40 -14
View File
@@ -22,6 +22,7 @@ import json
import logging
import time
from collections import defaultdict
from datetime import datetime
from sqlalchemy import text
@@ -108,24 +109,37 @@ class BatchDedupRunner:
self._progress_phase = ""
self._progress_count = 0
self._progress_total = 0
self._since = None # set by run() when scoped run requested
async def run(
self,
dry_run: bool = False,
hint_filter: str = None,
since: datetime = None,
) -> dict:
"""Run the full batch dedup pipeline.
Args:
dry_run: If True, compute stats but don't modify DB/Qdrant.
hint_filter: If set, only process groups matching this hint prefix.
since: If set, only process controls with created_at >= since.
Useful for incremental dedup after single-document ingestion.
Returns:
Stats dict with counts.
"""
start = time.monotonic()
logger.info("BatchDedup starting (dry_run=%s, hint_filter=%s)",
dry_run, hint_filter)
logger.info("BatchDedup starting (dry_run=%s, hint_filter=%s, since=%s)",
dry_run, hint_filter, since)
# Scoped runs reset checkpoint to avoid skipping new controls whose
# control_id sorts before the stale last_id of a previous full run.
self._since = since
if since and not dry_run:
self.db.execute(text(
"DELETE FROM canonical_generation_jobs WHERE status = 'dedup_phase2_checkpoint'"
))
self.db.commit()
if not dry_run:
await ensure_qdrant_collection(collection=self.collection)
@@ -133,7 +147,7 @@ class BatchDedupRunner:
# Phase 1: Intra-group dedup (same merge_group_hint)
# Optimization: skip singleton groups (they're automatically masters)
self._progress_phase = "phase1"
groups = self._load_merge_groups(hint_filter)
groups = self._load_merge_groups(hint_filter, since)
self._progress_total = self.stats["total_controls"]
multi_groups = [(h, c) for h, c in groups if len(c) > 1]
@@ -171,7 +185,7 @@ class BatchDedupRunner:
logger.info("BatchDedup completed in %.1fs: %s", elapsed, self.stats)
return self.stats
def _load_merge_groups(self, hint_filter: str = None) -> list:
def _load_merge_groups(self, hint_filter: str = None, since: datetime = None) -> list:
"""Load all Pass 0b controls grouped by merge_group_hint, largest first."""
conditions = [
"decomposition_method = 'pass0b'",
@@ -184,6 +198,10 @@ class BatchDedupRunner:
conditions.append("generation_metadata->>'merge_group_hint' LIKE :hf")
params["hf"] = f"{hint_filter}%"
if since:
conditions.append("created_at >= :since")
params["since"] = since
where = " AND ".join(conditions)
rows = self.db.execute(text(f"""
SELECT id::text, control_id, title, objective,
@@ -335,13 +353,15 @@ class BatchDedupRunner:
"""
logger.info("BatchDedup Phase 2: Cross-group pass starting...")
# Count total
total_row = self.db.execute(text("""
# Count total — respect scoped run if since is set
since_clause = " AND created_at >= :since" if self._since else ""
params = {"since": self._since} if self._since else {}
total_row = self.db.execute(text(f"""
SELECT COUNT(*) FROM canonical_controls
WHERE decomposition_method = 'pass0b'
AND release_state != 'duplicate'
AND release_state != 'deprecated'
""")).fetchone()
AND release_state != 'deprecated'{since_clause}
"""), params).fetchone()
total = total_row[0] if total_row else 0
self._progress_total = total
@@ -360,13 +380,16 @@ class BatchDedupRunner:
last_control_id = checkpoint_row[0] if checkpoint_row else ""
if last_control_id:
skip_row = self.db.execute(text("""
skip_params = {"last_id": last_control_id}
if self._since:
skip_params["since"] = self._since
skip_row = self.db.execute(text(f"""
SELECT COUNT(*) FROM canonical_controls
WHERE decomposition_method = 'pass0b'
AND release_state != 'duplicate'
AND release_state != 'deprecated'
AND control_id <= :last_id
"""), {"last_id": last_control_id}).fetchone()
AND control_id <= :last_id{since_clause}
"""), skip_params).fetchone()
skipped = skip_row[0] if skip_row else 0
self._progress_count = skipped
logger.info("BatchDedup Cross-group: RESUMING from %s (skipping %d already processed)",
@@ -382,17 +405,20 @@ class BatchDedupRunner:
total, last_control_id or "beginning")
while True:
rows = self.db.execute(text("""
page_params = {"last_id": last_control_id, "page_size": DB_PAGE}
if self._since:
page_params["since"] = self._since
rows = self.db.execute(text(f"""
SELECT id::text, control_id, title,
generation_metadata->>'merge_group_hint' as merge_group_hint
FROM canonical_controls
WHERE decomposition_method = 'pass0b'
AND release_state != 'duplicate'
AND release_state != 'deprecated'
AND control_id > :last_id
AND control_id > :last_id{since_clause}
ORDER BY control_id
LIMIT :page_size
"""), {"last_id": last_control_id, "page_size": DB_PAGE}).fetchall()
"""), page_params).fetchall()
if not rows:
break