feat: Obligation-Deduplizierung — 34.617 Duplikate als 'duplicate' markiert
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 33s
CI/CD / test-python-backend-compliance (push) Successful in 35s
CI/CD / test-python-document-crawler (push) Successful in 30s
CI/CD / test-python-dsms-gateway (push) Successful in 20s
CI/CD / validate-canonical-controls (push) Successful in 13s
CI/CD / Deploy (push) Successful in 3s
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 33s
CI/CD / test-python-backend-compliance (push) Successful in 35s
CI/CD / test-python-document-crawler (push) Successful in 30s
CI/CD / test-python-dsms-gateway (push) Successful in 20s
CI/CD / validate-canonical-controls (push) Successful in 13s
CI/CD / Deploy (push) Successful in 3s
Neue Endpunkte POST /obligations/dedup und GET /obligations/dedup-stats. Pro candidate_id wird der aelteste Eintrag behalten, alle weiteren erhalten release_state='duplicate' mit merged_into_id + quality_flags fuer Traceability. Detail-View filtert Duplikate aus. MkDocs aktualisiert. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1015,7 +1015,7 @@ async def get_control_provenance(control_id: str):
|
||||
normative_strength, release_state
|
||||
FROM obligation_candidates
|
||||
WHERE parent_control_uuid = CAST(:uid AS uuid)
|
||||
AND release_state NOT IN ('rejected', 'merged')
|
||||
AND release_state NOT IN ('rejected', 'merged', 'duplicate')
|
||||
ORDER BY candidate_id
|
||||
"""),
|
||||
{"uid": ctrl_uuid},
|
||||
@@ -1150,7 +1150,7 @@ async def backfill_normative_strength(
|
||||
cc.source_citation->>'source' AS parent_source
|
||||
FROM obligation_candidates oc
|
||||
JOIN canonical_controls cc ON cc.id = oc.parent_control_uuid
|
||||
WHERE oc.release_state NOT IN ('rejected', 'merged')
|
||||
WHERE oc.release_state NOT IN ('rejected', 'merged', 'duplicate')
|
||||
AND oc.normative_strength IS NOT NULL
|
||||
ORDER BY oc.candidate_id
|
||||
""")).fetchall()
|
||||
@@ -1201,6 +1201,162 @@ async def backfill_normative_strength(
|
||||
}
|
||||
|
||||
|
||||
# =============================================================================
# OBLIGATION DEDUPLICATION
# =============================================================================


@router.post("/obligations/dedup")
async def dedup_obligations(
    dry_run: bool = Query(True, description="Nur zaehlen, nicht aendern"),
    batch_size: int = Query(0, description="0 = alle auf einmal"),
    offset: int = Query(0, description="Offset fuer Batch-Verarbeitung"),
):
    """Mark duplicate obligation_candidates rows as 'duplicate'.

    Duplicates are multiple rows sharing the same candidate_id. Within each
    group the oldest row (MIN(created_at), tie-broken by id) is kept; every
    other row gets release_state='duplicate', merged_into_id pointing at the
    kept row, and quality_flags entries recording why/when for traceability.

    Args:
        dry_run: When True (default) only count and report, change nothing.
        batch_size: Number of duplicate groups to process per call; 0 means
            process all groups at once.
        offset: Group offset for batched processing (used with batch_size).

    Returns:
        dict with dry_run flag, stats (total/processed groups, kept,
        marked_duplicate) and up to 20 sample_changes for inspection.
    """
    with SessionLocal() as db:
        # 1. Find every candidate_id that still has more than one live row
        #    (rows already rejected/merged/deduplicated are ignored).
        dup_query = """
            SELECT candidate_id, count(*) as cnt
            FROM obligation_candidates
            WHERE release_state NOT IN ('rejected', 'merged', 'duplicate')
            GROUP BY candidate_id
            HAVING count(*) > 1
            ORDER BY candidate_id
        """
        # Use bound parameters for LIMIT/OFFSET instead of f-string
        # interpolation — defense in depth even though FastAPI coerces
        # these to int.
        dup_params: dict[str, Any] = {}
        if batch_size > 0:
            dup_query += " LIMIT :limit OFFSET :offset"
            dup_params = {"limit": batch_size, "offset": offset}

        dup_groups = db.execute(text(dup_query), dup_params).fetchall()

        # Total number of duplicate groups, independent of batching, so the
        # caller can track overall progress.
        total_groups = db.execute(text("""
            SELECT count(*) FROM (
                SELECT candidate_id
                FROM obligation_candidates
                WHERE release_state NOT IN ('rejected', 'merged', 'duplicate')
                GROUP BY candidate_id
                HAVING count(*) > 1
            ) sub
        """)).scalar()

        # 2. Per group: keep the oldest row, mark the rest as duplicates.
        kept_count = 0
        duplicate_count = 0
        sample_changes: list[dict[str, Any]] = []

        for grp in dup_groups:
            cid = grp.candidate_id

            # Fetch all live rows for this candidate_id, oldest first
            # (id as deterministic tie-breaker for equal created_at).
            entries = db.execute(text("""
                SELECT id, candidate_id, obligation_text, release_state, created_at
                FROM obligation_candidates
                WHERE candidate_id = :cid
                  AND release_state NOT IN ('rejected', 'merged', 'duplicate')
                ORDER BY created_at ASC, id ASC
            """), {"cid": cid}).fetchall()

            if len(entries) < 2:
                # Group shrank between the two queries (concurrent change)
                # or was never a real duplicate — nothing to do.
                continue

            keeper = entries[0]  # oldest row wins
            duplicates = entries[1:]
            kept_count += 1
            duplicate_count += len(duplicates)

            if len(sample_changes) < 20:
                sample_changes.append({
                    "candidate_id": cid,
                    "kept_id": str(keeper.id),
                    "kept_text": keeper.obligation_text[:100],
                    "duplicate_count": len(duplicates),
                    "duplicate_ids": [str(d.id) for d in duplicates],
                })

            if not dry_run:
                for dup in duplicates:
                    # :keeper_id is bound once and used both as uuid (CAST)
                    # and as text inside the quality_flags JSON.
                    db.execute(text("""
                        UPDATE obligation_candidates
                        SET release_state = 'duplicate',
                            merged_into_id = CAST(:keeper_id AS uuid),
                            quality_flags = COALESCE(quality_flags, '{}'::jsonb)
                                || jsonb_build_object(
                                    'dedup_reason', 'duplicate of ' || :keeper_cid,
                                    'dedup_kept_id', :keeper_id,
                                    'dedup_at', NOW()::text
                                )
                        WHERE id = CAST(:dup_id AS uuid)
                    """), {
                        "keeper_id": str(keeper.id),
                        "keeper_cid": cid,
                        "dup_id": str(dup.id),
                    })

        if not dry_run and duplicate_count > 0:
            db.commit()

        return {
            "dry_run": dry_run,
            "stats": {
                "total_duplicate_groups": total_groups,
                "processed_groups": len(dup_groups),
                "kept": kept_count,
                "marked_duplicate": duplicate_count,
            },
            "sample_changes": sample_changes,
        }
|
||||
|
||||
|
||||
@router.get("/obligations/dedup-stats")
async def dedup_obligations_stats():
    """Report the current deduplication status of obligation_candidates.

    Returns the total row count, a per-release_state breakdown, the number
    of candidate_id groups that still contain duplicates, and how many rows
    a dedup run would mark as 'duplicate'.
    """
    with SessionLocal() as session:
        # Overall row count across all states.
        total_rows = session.execute(text(
            "SELECT count(*) FROM obligation_candidates"
        )).scalar()

        # Breakdown by release_state, sorted for stable output.
        state_rows = session.execute(text("""
            SELECT release_state, count(*) as cnt
            FROM obligation_candidates
            GROUP BY release_state
            ORDER BY release_state
        """)).fetchall()
        state_breakdown = {row.release_state: row.cnt for row in state_rows}

        # candidate_id groups that still hold more than one live row.
        pending_groups = session.execute(text("""
            SELECT count(*) FROM (
                SELECT candidate_id
                FROM obligation_candidates
                WHERE release_state NOT IN ('rejected', 'merged', 'duplicate')
                GROUP BY candidate_id
                HAVING count(*) > 1
            ) sub
        """)).scalar()

        # Rows a dedup run would mark: every group keeps one row, so the
        # removable count is sum(group_size - 1) over all duplicate groups.
        pending_removable = session.execute(text("""
            SELECT COALESCE(sum(cnt - 1), 0) FROM (
                SELECT candidate_id, count(*) as cnt
                FROM obligation_candidates
                WHERE release_state NOT IN ('rejected', 'merged', 'duplicate')
                GROUP BY candidate_id
                HAVING count(*) > 1
            ) sub
        """)).scalar()

        return {
            "total_obligations": total_rows,
            "by_state": state_breakdown,
            "pending_duplicate_groups": pending_groups,
            "pending_removable_duplicates": pending_removable,
        }
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# EVIDENCE TYPE BACKFILL
|
||||
# =============================================================================
|
||||
|
||||
Reference in New Issue
Block a user