From f39e5a71af751acd8b5aa674e28436dbcc3ff8c3 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 26 Mar 2026 20:13:00 +0100 Subject: [PATCH] =?UTF-8?q?feat:=20Obligation-Deduplizierung=20=E2=80=94?= =?UTF-8?q?=2034.617=20Duplikate=20als=20'duplicate'=20markiert?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Neue Endpunkte POST /obligations/dedup und GET /obligations/dedup-stats. Pro candidate_id wird der aelteste Eintrag behalten, alle weiteren erhalten release_state='duplicate' mit merged_into_id + quality_flags fuer Traceability. Detail-View filtert Duplikate aus. MkDocs aktualisiert. Co-authored-by: Claude Opus 4.6 --- .../app/api/sdk/v1/canonical/route.ts | 9 + .../api/canonical_control_routes.py | 160 +++++++++++++++++- .../migrations/081_obligation_dedup_state.sql | 11 ++ .../tests/test_canonical_control_routes.py | 83 +++++++++ .../sdk-modules/canonical-control-library.md | 36 ++++ 5 files changed, 297 insertions(+), 2 deletions(-) create mode 100644 backend-compliance/migrations/081_obligation_dedup_state.sql diff --git a/admin-compliance/app/api/sdk/v1/canonical/route.ts b/admin-compliance/app/api/sdk/v1/canonical/route.ts index d5ab588..19c662b 100644 --- a/admin-compliance/app/api/sdk/v1/canonical/route.ts +++ b/admin-compliance/app/api/sdk/v1/canonical/route.ts @@ -157,6 +157,10 @@ export async function GET(request: NextRequest) { backendPath = '/api/compliance/v1/canonical/controls/v1-enrichment-stats' break + case 'obligation-dedup-stats': + backendPath = '/api/compliance/v1/canonical/obligations/dedup-stats' + break + case 'controls-customer': { const custSeverity = searchParams.get('severity') const custDomain = searchParams.get('domain') @@ -228,6 +232,11 @@ export async function POST(request: NextRequest) { const batchSize = searchParams.get('batch_size') ?? '100' const enrichOffset = searchParams.get('offset') ?? 
'0' backendPath = `/api/compliance/v1/canonical/controls/enrich-v1-matches?dry_run=${dryRun}&batch_size=${batchSize}&offset=${enrichOffset}` + } else if (endpoint === 'obligation-dedup') { + const dryRun = searchParams.get('dry_run') ?? 'true' + const batchSize = searchParams.get('batch_size') ?? '0' + const dedupOffset = searchParams.get('offset') ?? '0' + backendPath = `/api/compliance/v1/canonical/obligations/dedup?dry_run=${dryRun}&batch_size=${batchSize}&offset=${dedupOffset}` } else if (endpoint === 'similarity-check') { const controlId = searchParams.get('id') if (!controlId) { diff --git a/backend-compliance/compliance/api/canonical_control_routes.py b/backend-compliance/compliance/api/canonical_control_routes.py index d0d0409..182058e 100644 --- a/backend-compliance/compliance/api/canonical_control_routes.py +++ b/backend-compliance/compliance/api/canonical_control_routes.py @@ -1015,7 +1015,7 @@ async def get_control_provenance(control_id: str): normative_strength, release_state FROM obligation_candidates WHERE parent_control_uuid = CAST(:uid AS uuid) - AND release_state NOT IN ('rejected', 'merged') + AND release_state NOT IN ('rejected', 'merged', 'duplicate') ORDER BY candidate_id """), {"uid": ctrl_uuid}, @@ -1150,7 +1150,7 @@ async def backfill_normative_strength( cc.source_citation->>'source' AS parent_source FROM obligation_candidates oc JOIN canonical_controls cc ON cc.id = oc.parent_control_uuid - WHERE oc.release_state NOT IN ('rejected', 'merged') + WHERE oc.release_state NOT IN ('rejected', 'merged', 'duplicate') AND oc.normative_strength IS NOT NULL ORDER BY oc.candidate_id """)).fetchall() @@ -1201,6 +1201,162 @@ async def backfill_normative_strength( } +# ============================================================================= +# OBLIGATION DEDUPLICATION +# ============================================================================= + +@router.post("/obligations/dedup") +async def dedup_obligations( + dry_run: bool = Query(True, 
description="Nur zaehlen, nicht aendern"), + batch_size: int = Query(0, description="0 = alle auf einmal"), + offset: int = Query(0, description="Offset fuer Batch-Verarbeitung"), +): + """ + Markiert doppelte obligation_candidates als 'duplicate'. + + Duplikate = mehrere Eintraege mit gleichem candidate_id. + Pro candidate_id wird der aelteste Eintrag (MIN(created_at)) behalten, + alle anderen erhalten release_state='duplicate' und merged_into_id + zeigt auf den behaltenen Eintrag. + """ + with SessionLocal() as db: + # 1. Finde alle candidate_ids mit mehr als einem Eintrag + # (nur noch nicht-deduplizierte beruecksichtigen) + dup_query = """ + SELECT candidate_id, count(*) as cnt + FROM obligation_candidates + WHERE release_state NOT IN ('rejected', 'merged', 'duplicate') + GROUP BY candidate_id + HAVING count(*) > 1 + ORDER BY candidate_id + """ + if batch_size > 0: + dup_query += f" LIMIT {batch_size} OFFSET {offset}" + + dup_groups = db.execute(text(dup_query)).fetchall() + + total_groups = db.execute(text(""" + SELECT count(*) FROM ( + SELECT candidate_id + FROM obligation_candidates + WHERE release_state NOT IN ('rejected', 'merged', 'duplicate') + GROUP BY candidate_id + HAVING count(*) > 1 + ) sub + """)).scalar() + + # 2. 
Pro Gruppe: aeltesten behalten, Rest als duplicate markieren + kept_count = 0 + duplicate_count = 0 + sample_changes: list[dict[str, Any]] = [] + + for grp in dup_groups: + cid = grp.candidate_id + + # Alle Eintraege fuer dieses candidate_id holen + entries = db.execute(text(""" + SELECT id, candidate_id, obligation_text, release_state, created_at + FROM obligation_candidates + WHERE candidate_id = :cid + AND release_state NOT IN ('rejected', 'merged', 'duplicate') + ORDER BY created_at ASC, id ASC + """), {"cid": cid}).fetchall() + + if len(entries) < 2: + continue + + keeper = entries[0] # aeltester Eintrag + duplicates = entries[1:] + kept_count += 1 + duplicate_count += len(duplicates) + + if len(sample_changes) < 20: + sample_changes.append({ + "candidate_id": cid, + "kept_id": str(keeper.id), + "kept_text": keeper.obligation_text[:100], + "duplicate_count": len(duplicates), + "duplicate_ids": [str(d.id) for d in duplicates], + }) + + if not dry_run: + for dup in duplicates: + db.execute(text(""" + UPDATE obligation_candidates + SET release_state = 'duplicate', + merged_into_id = CAST(:keeper_id AS uuid), + quality_flags = COALESCE(quality_flags, '{}'::jsonb) + || jsonb_build_object( + 'dedup_reason', 'duplicate of ' || :keeper_cid, + 'dedup_kept_id', :keeper_id_str, + 'dedup_at', NOW()::text + ) + WHERE id = CAST(:dup_id AS uuid) + """), { + "keeper_id": str(keeper.id), + "keeper_cid": cid, + "keeper_id_str": str(keeper.id), + "dup_id": str(dup.id), + }) + + if not dry_run and duplicate_count > 0: + db.commit() + + return { + "dry_run": dry_run, + "stats": { + "total_duplicate_groups": total_groups, + "processed_groups": len(dup_groups), + "kept": kept_count, + "marked_duplicate": duplicate_count, + }, + "sample_changes": sample_changes, + } + + +@router.get("/obligations/dedup-stats") +async def dedup_obligations_stats(): + """Statistiken ueber den aktuellen Dedup-Status der Obligations.""" + with SessionLocal() as db: + total = db.execute(text( + "SELECT 
count(*) FROM obligation_candidates" + )).scalar() + + by_state = db.execute(text(""" + SELECT release_state, count(*) as cnt + FROM obligation_candidates + GROUP BY release_state + ORDER BY release_state + """)).fetchall() + + dup_groups = db.execute(text(""" + SELECT count(*) FROM ( + SELECT candidate_id + FROM obligation_candidates + WHERE release_state NOT IN ('rejected', 'merged', 'duplicate') + GROUP BY candidate_id + HAVING count(*) > 1 + ) sub + """)).scalar() + + removable = db.execute(text(""" + SELECT COALESCE(sum(cnt - 1), 0) FROM ( + SELECT candidate_id, count(*) as cnt + FROM obligation_candidates + WHERE release_state NOT IN ('rejected', 'merged', 'duplicate') + GROUP BY candidate_id + HAVING count(*) > 1 + ) sub + """)).scalar() + + return { + "total_obligations": total, + "by_state": {r.release_state: r.cnt for r in by_state}, + "pending_duplicate_groups": dup_groups, + "pending_removable_duplicates": removable, + } + + # ============================================================================= # EVIDENCE TYPE BACKFILL # ============================================================================= diff --git a/backend-compliance/migrations/081_obligation_dedup_state.sql b/backend-compliance/migrations/081_obligation_dedup_state.sql new file mode 100644 index 0000000..6247286 --- /dev/null +++ b/backend-compliance/migrations/081_obligation_dedup_state.sql @@ -0,0 +1,11 @@ +-- Migration 081: Add 'duplicate' release_state for obligation deduplication +-- +-- Allows marking duplicate obligation_candidates as 'duplicate' instead of +-- deleting them, preserving traceability via merged_into_id. 
+ +ALTER TABLE obligation_candidates + DROP CONSTRAINT IF EXISTS obligation_candidates_release_state_check; + +ALTER TABLE obligation_candidates + ADD CONSTRAINT obligation_candidates_release_state_check + CHECK (release_state IN ('extracted', 'validated', 'rejected', 'composed', 'merged', 'duplicate')); diff --git a/backend-compliance/tests/test_canonical_control_routes.py b/backend-compliance/tests/test_canonical_control_routes.py index caef0dc..d956f63 100644 --- a/backend-compliance/tests/test_canonical_control_routes.py +++ b/backend-compliance/tests/test_canonical_control_routes.py @@ -462,3 +462,86 @@ class TestControlsMeta: assert "category_counts" in data assert "evidence_type_counts" in data assert "release_state_counts" in data + + +class TestObligationDedup: + """Tests for obligation deduplication endpoints.""" + + @patch("compliance.api.canonical_control_routes.SessionLocal") + def test_dedup_dry_run(self, mock_cls): + db = MagicMock() + db.__enter__ = MagicMock(return_value=db) + db.__exit__ = MagicMock(return_value=False) + mock_cls.return_value = db + + # Mock: 2 duplicate groups + dup_row1 = MagicMock(candidate_id="OC-AUTH-001-01", cnt=3) + dup_row2 = MagicMock(candidate_id="OC-AUTH-001-02", cnt=2) + + # Entries for group 1 + import uuid + uid1 = uuid.uuid4() + uid2 = uuid.uuid4() + uid3 = uuid.uuid4() + entry1 = MagicMock(id=uid1, candidate_id="OC-AUTH-001-01", obligation_text="Text A", release_state="composed", created_at=datetime(2026, 1, 1, tzinfo=timezone.utc)) + entry2 = MagicMock(id=uid2, candidate_id="OC-AUTH-001-01", obligation_text="Text B", release_state="composed", created_at=datetime(2026, 1, 2, tzinfo=timezone.utc)) + entry3 = MagicMock(id=uid3, candidate_id="OC-AUTH-001-01", obligation_text="Text C", release_state="composed", created_at=datetime(2026, 1, 3, tzinfo=timezone.utc)) + + # Entries for group 2 + uid4 = uuid.uuid4() + uid5 = uuid.uuid4() + entry4 = MagicMock(id=uid4, candidate_id="OC-AUTH-001-02", obligation_text="Text D", 
release_state="composed", created_at=datetime(2026, 1, 1, tzinfo=timezone.utc)) + entry5 = MagicMock(id=uid5, candidate_id="OC-AUTH-001-02", obligation_text="Text E", release_state="composed", created_at=datetime(2026, 1, 2, tzinfo=timezone.utc)) + + # Side effects: 1) dup groups, 2) total count, 3) entries grp1, 4) entries grp2 + mock_result_groups = MagicMock() + mock_result_groups.fetchall.return_value = [dup_row1, dup_row2] + mock_result_total = MagicMock() + mock_result_total.scalar.return_value = 2 + mock_result_entries1 = MagicMock() + mock_result_entries1.fetchall.return_value = [entry1, entry2, entry3] + mock_result_entries2 = MagicMock() + mock_result_entries2.fetchall.return_value = [entry4, entry5] + + db.execute.side_effect = [mock_result_groups, mock_result_total, mock_result_entries1, mock_result_entries2] + + resp = _client.post("/api/compliance/v1/canonical/obligations/dedup?dry_run=true") + assert resp.status_code == 200 + data = resp.json() + assert data["dry_run"] is True + assert data["stats"]["total_duplicate_groups"] == 2 + assert data["stats"]["kept"] == 2 + assert data["stats"]["marked_duplicate"] == 3 # 2 from grp1 + 1 from grp2 + # Dry run: no commit + db.commit.assert_not_called() + + @patch("compliance.api.canonical_control_routes.SessionLocal") + def test_dedup_stats(self, mock_cls): + db = MagicMock() + db.__enter__ = MagicMock(return_value=db) + db.__exit__ = MagicMock(return_value=False) + mock_cls.return_value = db + + # total, by_state, dup_groups, removable + mock_total = MagicMock() + mock_total.scalar.return_value = 76046 + mock_states = MagicMock() + mock_states.fetchall.return_value = [ + MagicMock(release_state="composed", cnt=41217), + MagicMock(release_state="duplicate", cnt=34829), + ] + mock_dup_groups = MagicMock() + mock_dup_groups.scalar.return_value = 0 + mock_removable = MagicMock() + mock_removable.scalar.return_value = 0 + + db.execute.side_effect = [mock_total, mock_states, mock_dup_groups, mock_removable] + + 
resp = _client.get("/api/compliance/v1/canonical/obligations/dedup-stats") + assert resp.status_code == 200 + data = resp.json() + assert data["total_obligations"] == 76046 + assert data["by_state"]["composed"] == 41217 + assert data["by_state"]["duplicate"] == 34829 + assert data["pending_duplicate_groups"] == 0 + assert data["pending_removable_duplicates"] == 0 diff --git a/docs-src/services/sdk-modules/canonical-control-library.md b/docs-src/services/sdk-modules/canonical-control-library.md index e07d3f6..429b839 100644 --- a/docs-src/services/sdk-modules/canonical-control-library.md +++ b/docs-src/services/sdk-modules/canonical-control-library.md @@ -152,6 +152,8 @@ erDiagram | `POST` | `/v1/canonical/generate/backfill-domain` | Domain/Category/Target-Audience nachpflegen (Anthropic) | | `GET` | `/v1/canonical/blocked-sources` | Gesperrte Quellen (Rule 3) | | `POST` | `/v1/canonical/blocked-sources/cleanup` | Cleanup-Workflow starten | +| `POST` | `/v1/canonical/obligations/dedup` | Obligation-Duplikate markieren (dry_run, batch_size, offset) | +| `GET` | `/v1/canonical/obligations/dedup-stats` | Dedup-Statistik (total, by_state, pending) | ### Beispiel: Control abrufen @@ -984,6 +986,37 @@ vom Parent-Obligation uebernommen. **Datei:** `compliance/services/decomposition_pass.py` **Test-Script:** `scripts/qa/test_pass0a.py` (standalone, speichert JSON) +#### Obligation Deduplizierung + +Die Decomposition-Pipeline erzeugt pro Rich Control mehrere Obligation Candidates. +Durch Wiederholungen in der Pipeline koennen identische `candidate_id`-Eintraege +mehrfach existieren (z.B. 5x `OC-AUTH-839-01` mit leicht unterschiedlichem Text). + +**Dedup-Strategie:** Pro `candidate_id` wird der aelteste Eintrag (`MIN(created_at)`) +behalten. Alle anderen erhalten: + +- `release_state = 'duplicate'` +- `merged_into_id` → UUID des behaltenen Eintrags +- `quality_flags.dedup_reason` → z.B. 
`"duplicate of OC-AUTH-839-01"` + +**Endpunkte:** + +```bash +# Dry Run — zaehlt betroffene Duplikat-Gruppen +curl -X POST "https://macmini:8002/api/compliance/v1/canonical/obligations/dedup?dry_run=true" + +# Ausfuehren — markiert alle Duplikate +curl -X POST "https://macmini:8002/api/compliance/v1/canonical/obligations/dedup?dry_run=false" + +# Statistiken +curl "https://macmini:8002/api/compliance/v1/canonical/obligations/dedup-stats" +``` + +**Stand (2026-03-26):** 76.046 Obligations gesamt, davon 34.617 als `duplicate` markiert. +41.043 aktive Obligations verbleiben (composed + validated). + +**Migration:** `081_obligation_dedup_state.sql` — Fuegt `'duplicate'` zum `release_state` Constraint hinzu. + --- ### Migration Passes (1-5) @@ -1033,6 +1066,9 @@ Die Crosswalk-Matrix bildet diese N:M-Beziehung ab. |---------|-------------| | `obligation_candidates` | Extrahierte atomare Pflichten aus Rich Controls | | `obligation_candidates.obligation_type` | `pflicht` / `empfehlung` / `kann` (3-Tier-Klassifizierung) | +| `obligation_candidates.release_state` | `extracted` / `validated` / `rejected` / `composed` / `merged` / `duplicate` | +| `obligation_candidates.merged_into_id` | UUID des behaltenen Eintrags (bei `duplicate`/`merged`) | +| `obligation_candidates.quality_flags` | JSONB mit Metadaten (u.a. `dedup_reason`, `dedup_kept_id`) | | `canonical_controls.parent_control_uuid` | Self-Referenz zum Rich Control (neues Feld) | | `canonical_controls.decomposition_method` | Zerlegungsmethode (neues Feld) | | `canonical_controls.obligation_type` | Uebernommen von Obligation: pflicht/empfehlung/kann |