feat: Obligation-Deduplizierung — 34.617 Duplikate als 'duplicate' markiert
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 33s
CI/CD / test-python-backend-compliance (push) Successful in 35s
CI/CD / test-python-document-crawler (push) Successful in 30s
CI/CD / test-python-dsms-gateway (push) Successful in 20s
CI/CD / validate-canonical-controls (push) Successful in 13s
CI/CD / Deploy (push) Successful in 3s

Neue Endpunkte POST /obligations/dedup und GET /obligations/dedup-stats.
Pro candidate_id wird der aelteste Eintrag behalten, alle weiteren erhalten
release_state='duplicate' mit merged_into_id + quality_flags fuer Traceability.
Detail-View filtert Duplikate aus. MkDocs aktualisiert.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-26 20:13:00 +01:00
parent ac42a0aaa0
commit f39e5a71af
5 changed files with 297 additions and 2 deletions

View File

@@ -1015,7 +1015,7 @@ async def get_control_provenance(control_id: str):
normative_strength, release_state
FROM obligation_candidates
WHERE parent_control_uuid = CAST(:uid AS uuid)
AND release_state NOT IN ('rejected', 'merged')
AND release_state NOT IN ('rejected', 'merged', 'duplicate')
ORDER BY candidate_id
"""),
{"uid": ctrl_uuid},
@@ -1150,7 +1150,7 @@ async def backfill_normative_strength(
cc.source_citation->>'source' AS parent_source
FROM obligation_candidates oc
JOIN canonical_controls cc ON cc.id = oc.parent_control_uuid
WHERE oc.release_state NOT IN ('rejected', 'merged')
WHERE oc.release_state NOT IN ('rejected', 'merged', 'duplicate')
AND oc.normative_strength IS NOT NULL
ORDER BY oc.candidate_id
""")).fetchall()
@@ -1201,6 +1201,162 @@ async def backfill_normative_strength(
}
# =============================================================================
# OBLIGATION DEDUPLICATION
# =============================================================================
@router.post("/obligations/dedup")
async def dedup_obligations(
    dry_run: bool = Query(True, description="Nur zaehlen, nicht aendern"),
    batch_size: int = Query(0, description="0 = alle auf einmal"),
    offset: int = Query(0, description="Offset fuer Batch-Verarbeitung"),
):
    """
    Mark duplicate obligation_candidates as 'duplicate'.

    Duplicates are multiple rows sharing the same candidate_id. Per
    candidate_id the oldest row (MIN(created_at), ties broken by id) is
    kept; every other row gets release_state='duplicate', merged_into_id
    pointing at the kept row, and quality_flags metadata for traceability.

    Args:
        dry_run: when True, only count and report — no rows are changed.
        batch_size: number of duplicate groups processed per call (0 = all).
        offset: group offset for batched processing.

    Returns:
        dict with the dry_run flag, aggregate stats, and up to 20 sample changes.
    """
    with SessionLocal() as db:
        # 1. Find all candidate_ids with more than one active entry
        #    (rows already rejected/merged/deduplicated are ignored).
        dup_query = """
            SELECT candidate_id, count(*) as cnt
            FROM obligation_candidates
            WHERE release_state NOT IN ('rejected', 'merged', 'duplicate')
            GROUP BY candidate_id
            HAVING count(*) > 1
            ORDER BY candidate_id
        """
        params: dict[str, Any] = {}
        if batch_size > 0:
            # Bind LIMIT/OFFSET as parameters instead of f-string splicing:
            # keeps the SQL parameterized and guards against a negative offset.
            dup_query += " LIMIT :limit OFFSET :offset"
            params = {"limit": batch_size, "offset": max(offset, 0)}
        dup_groups = db.execute(text(dup_query), params).fetchall()
        total_groups = db.execute(text("""
            SELECT count(*) FROM (
                SELECT candidate_id
                FROM obligation_candidates
                WHERE release_state NOT IN ('rejected', 'merged', 'duplicate')
                GROUP BY candidate_id
                HAVING count(*) > 1
            ) sub
        """)).scalar()
        # 2. Per group: keep the oldest entry, mark the rest as duplicates.
        kept_count = 0
        duplicate_count = 0
        sample_changes: list[dict[str, Any]] = []
        for grp in dup_groups:
            cid = grp.candidate_id
            # Fetch all active entries for this candidate_id, oldest first.
            entries = db.execute(text("""
                SELECT id, candidate_id, obligation_text, release_state, created_at
                FROM obligation_candidates
                WHERE candidate_id = :cid
                AND release_state NOT IN ('rejected', 'merged', 'duplicate')
                ORDER BY created_at ASC, id ASC
            """), {"cid": cid}).fetchall()
            if len(entries) < 2:
                # Group may have shrunk since the aggregate query ran.
                continue
            keeper = entries[0]  # oldest entry wins
            duplicates = entries[1:]
            kept_count += 1
            duplicate_count += len(duplicates)
            if len(sample_changes) < 20:
                sample_changes.append({
                    "candidate_id": cid,
                    "kept_id": str(keeper.id),
                    "kept_text": keeper.obligation_text[:100],
                    "duplicate_count": len(duplicates),
                    "duplicate_ids": [str(d.id) for d in duplicates],
                })
            if not dry_run:
                for dup in duplicates:
                    # Single :keeper_id bind reused for both the uuid cast and
                    # the jsonb payload (previously bound twice under two names).
                    db.execute(text("""
                        UPDATE obligation_candidates
                        SET release_state = 'duplicate',
                            merged_into_id = CAST(:keeper_id AS uuid),
                            quality_flags = COALESCE(quality_flags, '{}'::jsonb)
                                || jsonb_build_object(
                                    'dedup_reason', 'duplicate of ' || :keeper_cid,
                                    'dedup_kept_id', :keeper_id,
                                    'dedup_at', NOW()::text
                                )
                        WHERE id = CAST(:dup_id AS uuid)
                    """), {
                        "keeper_id": str(keeper.id),
                        "keeper_cid": cid,
                        "dup_id": str(dup.id),
                    })
        if not dry_run and duplicate_count > 0:
            db.commit()
        return {
            "dry_run": dry_run,
            "stats": {
                "total_duplicate_groups": total_groups,
                "processed_groups": len(dup_groups),
                "kept": kept_count,
                "marked_duplicate": duplicate_count,
            },
            "sample_changes": sample_changes,
        }
@router.get("/obligations/dedup-stats")
async def dedup_obligations_stats():
    """
    Statistics about the current dedup state of the obligations.

    Returns:
        dict with the total row count, a per-release_state breakdown, the
        number of candidate_id groups that still contain duplicates, and
        how many rows could still be marked 'duplicate' (cnt - 1 per group).
    """
    with SessionLocal() as db:
        total = db.execute(text(
            "SELECT count(*) FROM obligation_candidates"
        )).scalar()
        by_state = db.execute(text("""
            SELECT release_state, count(*) as cnt
            FROM obligation_candidates
            GROUP BY release_state
            ORDER BY release_state
        """)).fetchall()
        # Both aggregates come from the identical duplicate-group subquery,
        # so compute them in one round trip instead of running it twice.
        row = db.execute(text("""
            SELECT count(*) AS groups, COALESCE(sum(cnt - 1), 0) AS removable
            FROM (
                SELECT candidate_id, count(*) as cnt
                FROM obligation_candidates
                WHERE release_state NOT IN ('rejected', 'merged', 'duplicate')
                GROUP BY candidate_id
                HAVING count(*) > 1
            ) sub
        """)).fetchone()
        return {
            "total_obligations": total,
            "by_state": {r.release_state: r.cnt for r in by_state},
            "pending_duplicate_groups": row.groups,
            "pending_removable_duplicates": row.removable,
        }
# =============================================================================
# EVIDENCE TYPE BACKFILL
# =============================================================================