feat(coverage): Korpus-Dokumente gruppiert nach Art + Herausgeber-Familie
CI / dep-audit (push) Has been skipped
CI / test-python-backend (push) Successful in 27s
CI / test-python-document-crawler (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / build-sha-integrity (push) Successful in 14s
CI / validate-canonical-controls (push) Successful in 10s
CI / loc-budget (push) Successful in 25s
CI / go-lint (push) Has been skipped
CI / detect-changes (push) Successful in 19s
CI / python-lint (push) Has been skipped
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 3m8s
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped

Die "Korpus-Dokumente"-Tabelle wird nach Dokument-Art geordnet
(Gesetze & Verordnungen → Behörden-Leitfäden → Standards & Best Practice →
Rechtsprechung) mit Zwischenüberschriften, und je Herausgeber-Familie
zusammengefasst (alle DSK, alle EDPB, alle OWASP/NIST/ENISA gemeinsam).
Deterministischer Kategorisierer (categorizeCorpusDoc) + Grouper
(groupCorpusDocs), pure + unit-getestet.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-16 12:20:10 +02:00
parent 9e9d780902
commit 8a0097f5da
3 changed files with 193 additions and 15 deletions
@@ -46,6 +46,97 @@ export interface CorpusOverview {
totals: { documents: number; catalog_sources: number }
}
// --- Korpus-Dokumente: gruppieren nach Art (Gesetz/Leitfaden/Standard/Urteil)
// + Herausgeber-Familie (DSK, EDPB, OWASP, NIST …). Deterministisch, pure. ---
/** Descriptor of a document category (kind of corpus document). */
interface DocCat {
  /** Stable machine key; used as the bucket key when grouping documents. */
  key: string
  /** Human-readable label of the category. */
  label: string
  /** Ascending sort position of the category. */
  order: number
}

const CAT_LAW: DocCat = { key: 'law', label: 'Gesetze & Verordnungen', order: 1 }
const CAT_GUIDANCE: DocCat = {
  key: 'guidance',
  label: 'Behörden-Leitfäden & Orientierungshilfen',
  order: 2,
}
const CAT_STANDARD: DocCat = {
  key: 'standard',
  label: 'Standards & Best Practice',
  order: 3,
}
const CAT_COURT: DocCat = { key: 'court', label: 'Rechtsprechung', order: 4 }

/**
 * Reports whether `acronym` occurs in `upper` at a position that is not
 * directly preceded by a letter.
 *
 * A plain substring test is too greedy for short acronyms: "NIST" is contained
 * in "MINISTERIUM" and "ADMINISTRATIVE". Only the left edge is checked so that
 * compounds such as "NISTIR" keep matching.
 *
 * @param upper   Already upper-cased text to search.
 * @param acronym Upper-case acronym to look for.
 */
function hasLeadingBoundaryAcronym(upper: string, acronym: string): boolean {
  for (
    let at = upper.indexOf(acronym);
    at !== -1;
    at = upper.indexOf(acronym, at + 1)
  ) {
    // charAt(-1) yields '' at the start of the string, which counts as a boundary.
    const prev = upper.charAt(at - 1)
    // A cased letter changes between lower and upper case; digits, spaces,
    // punctuation and '' do not.
    if (prev.toLowerCase() === prev.toUpperCase()) return true
  }
  return false
}

/**
 * Maps a corpus document's source string to its category and publisher family.
 *
 * Matching is case-insensitive and ignores surrounding whitespace. Rules are
 * evaluated in a fixed order (standards, then authority guidance, then case
 * law); the first hit wins and anything unmatched is treated as a law or
 * regulation. Publisher acronyms only match when they are not glued to a
 * preceding letter, so e.g. "Bundesministerium ..." is not mistaken for NIST.
 *
 * @param src Source/regulation name of the document; an empty or missing value
 *            falls through to the default category.
 * @returns The category descriptor and the family name to group the document under.
 */
export function categorizeCorpusDoc(src: string): { cat: DocCat; family: string } {
  const u = (src || '').trim().toUpperCase()
  const has = (acronym: string): boolean => hasLeadingBoundaryAcronym(u, acronym)

  // Standards & best practice (technical families)
  if (has('OWASP')) return { cat: CAT_STANDARD, family: 'OWASP' }
  if (has('NIST')) return { cat: CAT_STANDARD, family: 'NIST' }
  if (has('CISA')) return { cat: CAT_STANDARD, family: 'CISA' }
  if (has('OECD')) return { cat: CAT_STANDARD, family: 'OECD' }
  if (has('ENISA')) return { cat: CAT_STANDARD, family: 'ENISA' }

  // Authority guidance (data-protection supervisors + EU Commission guides)
  if (u.startsWith('DSK'))
    return { cat: CAT_GUIDANCE, family: 'DSK (Datenschutzkonferenz)' }
  if (has('EDPB')) return { cat: CAT_GUIDANCE, family: 'EDPB' }
  if (has('EDPS')) return { cat: CAT_GUIDANCE, family: 'EDPS' }
  if (has('WP29'))
    return { cat: CAT_GUIDANCE, family: 'WP29 (Art.-29-Gruppe)' }
  if (has('BFDI')) return { cat: CAT_GUIDANCE, family: 'BfDI' }
  if (u.includes('EU MACHINERY GUIDE') || u.includes('EU BLUE GUIDE'))
    return { cat: CAT_GUIDANCE, family: 'EU-Kommission (Guides)' }

  // Case law
  if (u.startsWith('BGH') || u.startsWith('BVGER') || u.startsWith('EUGH'))
    return { cat: CAT_COURT, family: 'Rechtsprechung' }

  // Default: law / regulation / directive
  return { cat: CAT_LAW, family: 'Gesetze & Verordnungen' }
}
/**
 * All corpus documents that belong to one publisher family within a category,
 * as produced by groupCorpusDocs.
 */
export interface CorpusFamilyGroup {
/** Family name as returned by categorizeCorpusDoc. */
family: string
/** Sum of `atom_count` over all documents in `docs`. */
total: number
/** Documents of this family; groupCorpusDocs orders them by `atom_count`, largest first. */
docs: CorpusDoc[]
}
/**
 * One document category together with its family groups, as produced by
 * groupCorpusDocs. `key`, `label` and `order` are copied from the category
 * descriptor returned by categorizeCorpusDoc.
 */
export interface CorpusCatGroup {
/** Machine key of the category. */
key: string
/** Human-readable label of the category. */
label: string
/** Sort position; groupCorpusDocs returns categories in ascending `order`. */
order: number
/** Sum of the `total` values of all entries in `families`. */
total: number
/** Family groups of this category; groupCorpusDocs orders them by `total`, largest first. */
families: CorpusFamilyGroup[]
}
/**
 * Buckets corpus documents by category and, inside each category, by
 * publisher family.
 *
 * Output ordering:
 * - categories ascending by their `order` value,
 * - families within a category descending by summed `atom_count`,
 * - documents within a family descending by `atom_count`.
 *
 * The input array and its elements are not modified.
 *
 * @param docs Documents to group; each is classified via categorizeCorpusDoc
 *             using its `source_regulation`.
 * @returns One entry per category that has at least one document.
 */
export function groupCorpusDocs(docs: CorpusDoc[]): CorpusCatGroup[] {
  const buckets = new Map<
    string,
    { cat: DocCat; fam: Map<string, CorpusDoc[]> }
  >()

  // Pass 1: distribute documents into category -> family buckets.
  for (const doc of docs) {
    const { cat, family } = categorizeCorpusDoc(doc.source_regulation)

    let bucket = buckets.get(cat.key)
    if (bucket === undefined) {
      bucket = { cat, fam: new Map<string, CorpusDoc[]>() }
      buckets.set(cat.key, bucket)
    }

    const members = bucket.fam.get(family)
    if (members === undefined) {
      bucket.fam.set(family, [doc])
    } else {
      members.push(doc)
    }
  }

  // Pass 2: turn buckets into sorted, totalled groups.
  const grouped: CorpusCatGroup[] = []
  for (const { cat, fam } of buckets.values()) {
    const families: CorpusFamilyGroup[] = []
    for (const [family, members] of fam) {
      // Sort a copy so the bucket (and thus the caller's objects' order) stays untouched.
      const sortedDocs = members
        .slice()
        .sort((left, right) => right.atom_count - left.atom_count)
      let familyTotal = 0
      for (const member of members) familyTotal = familyTotal + member.atom_count
      families.push({ family, docs: sortedDocs, total: familyTotal })
    }
    families.sort((left, right) => right.total - left.total)

    let catTotal = 0
    for (const group of families) catTotal = catTotal + group.total

    grouped.push({
      key: cat.key,
      label: cat.label,
      order: cat.order,
      total: catTotal,
      families,
    })
  }

  grouped.sort((left, right) => left.order - right.order)
  return grouped
}
export const USE_CASE_GROUP_LABELS: Record<string, string> = {
document: 'Dokument-Compliance',
security: 'Security',