feat(coverage): Korpus-Dokumente gruppiert nach Art + Herausgeber-Familie
CI / dep-audit (push) Has been skipped
CI / test-python-backend (push) Successful in 27s
CI / test-python-document-crawler (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / build-sha-integrity (push) Successful in 14s
CI / validate-canonical-controls (push) Successful in 10s
CI / loc-budget (push) Successful in 25s
CI / go-lint (push) Has been skipped
CI / detect-changes (push) Successful in 19s
CI / python-lint (push) Has been skipped
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 3m8s
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped

Die "Korpus-Dokumente"-Tabelle wird nach Dokument-Art geordnet
(Gesetze & Verordnungen → Behörden-Leitfäden → Standards & Best Practice →
Rechtsprechung) mit Zwischenüberschriften, und je Herausgeber-Familie
zusammengefasst (alle DSK, alle EDPB, alle OWASP/NIST/ENISA gemeinsam).
Deterministischer Kategorisierer (categorizeCorpusDoc) + Grouper
(groupCorpusDocs), pure + unit-getestet.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-16 12:20:10 +02:00
parent 9e9d780902
commit 8a0097f5da
3 changed files with 193 additions and 15 deletions
@@ -8,10 +8,21 @@ import {
splitByTier,
severityBadgeClass,
addresseeLabel,
categorizeCorpusDoc,
groupCorpusDocs,
type UseCaseRow,
type ControlItem,
type CorpusDoc,
} from './_helpers'
// Test fixture factory: returns a minimal CorpusDoc for the source name `src`
// that carries `n` atoms (1 when omitted). The remaining fields are fixed
// placeholder values.
const doc = (src: string, n = 1): CorpusDoc => {
  const fixture: CorpusDoc = {
    source_regulation: src,
    license_rule: 1,
    license_tier: 't',
    atom_count: n,
    use_case: null,
  }
  return fixture
}
const ctrl = (over: Partial<ControlItem>): ControlItem => ({
id: 'id',
title: 'T',
@@ -108,6 +119,46 @@ describe('coverage helpers', () => {
expect(addresseeLabel('unbekannt_neu')).toBe('unbekannt_neu')
})
it('categorizes corpus docs by type + issuer family', () => {
  // Statutes and regulations are expected in the 'law' category.
  const statutes = ['DSGVO (EU) 2016/679', 'Medizinprodukteverordnung (EU) 2017/745 (MDR)']
  for (const statute of statutes) {
    expect(categorizeCorpusDoc(statute).cat.key).toBe('law')
  }

  // Sources for which both the category key and the issuer family are pinned.
  const pinned: Array<[source: string, key: string, family: string]> = [
    ['DSK OH Telemedien', 'guidance', 'DSK (Datenschutzkonferenz)'],
    ['EDPB Fines Calculation', 'guidance', 'EDPB'],
    ['OWASP Top 10 (2021)', 'standard', 'OWASP'],
  ]
  for (const [source, key, family] of pinned) {
    expect(categorizeCorpusDoc(source)).toMatchObject({ cat: { key }, family })
  }

  // Sources for which only the issuer family is pinned.
  const nist = categorizeCorpusDoc('NIST SP 800-53 Rev. 5')
  expect(nist.family).toBe('NIST')
  const enisa = categorizeCorpusDoc('ENISA NIS2 Security Measures')
  expect(enisa.family).toBe('ENISA')

  // Court decisions.
  const ruling = categorizeCorpusDoc('BGH I ZR 7/16')
  expect(ruling.cat.key).toBe('court')
})
it('groups corpus docs: laws → guidance → standards → court, families clustered', () => {
  // Input is deliberately not in category order so the ordering is exercised.
  const input: CorpusDoc[] = [
    doc('OWASP Top 10', 10),
    doc('DSGVO (EU) 2016/679', 50),
    doc('DSK OH Telemedien', 5),
    doc('EDPB Fines', 8),
    doc('NIST SP 800-53', 20),
    doc('DSK OH Direktwerbung', 3),
    doc('BGH I ZR 7/16', 1),
  ]
  const groups = groupCorpusDocs(input)
  const categoryOf = (key: string) => groups.find((group) => group.key === key)!

  expect(groups.map(({ key }) => key)).toEqual(['law', 'guidance', 'standard', 'court'])

  // Both DSK documents end up in a single family bucket.
  const dskFamily = categoryOf('guidance').families.find(({ family }) =>
    family.startsWith('DSK'),
  )!
  expect(dskFamily.docs.length).toBe(2)

  // NIST (20) is listed before OWASP (10): families are ordered by size, descending.
  const standardFamilies = categoryOf('standard').families.map(({ family }) => family)
  expect(standardFamilies).toEqual(['NIST', 'OWASP'])
})
it('splitByTier separates core (relevant) from review', () => {
const { core, review } = splitByTier([
ctrl({ id: 'a', relevant: true }),
@@ -46,6 +46,97 @@ export interface CorpusOverview {
totals: { documents: number; catalog_sources: number }
}
// --- Corpus documents: group by document type (law / guidance / standard /
// court ruling) and issuer family (DSK, EDPB, OWASP, NIST …). Deterministic, pure. ---

/** Display category a corpus document is filed under. */
interface DocCat {
  /** Stable identifier: 'law', 'guidance', 'standard' or 'court'. */
  key: string
  /** Heading text for the category. */
  label: string
  /** Sort position; lower values come first. */
  order: number
}

const CAT_LAW: DocCat = { key: 'law', label: 'Gesetze & Verordnungen', order: 1 }
const CAT_GUIDANCE: DocCat = {
  key: 'guidance',
  label: 'Behörden-Leitfäden & Orientierungshilfen',
  order: 2,
}
const CAT_STANDARD: DocCat = {
  key: 'standard',
  label: 'Standards & Best Practice',
  order: 3,
}
const CAT_COURT: DocCat = { key: 'court', label: 'Rechtsprechung', order: 4 }

// Matches a single letter of any script; used for the acronym boundary check.
const LETTER = /\p{L}/u

/**
 * Reports whether `acronym` occurs in `text` at the start of a token, i.e. not
 * directly preceded by a letter.
 *
 * A plain substring test is too loose for short acronyms: 'NIST' is contained
 * in "MINISTERIUM" and "ADMINISTRATIVE". Only the left side is checked on
 * purpose, so identifiers that extend an acronym (e.g. "NISTIR 8259A") are
 * still recognised.
 */
function hasAcronym(text: string, acronym: string): boolean {
  for (let at = text.indexOf(acronym); at !== -1; at = text.indexOf(acronym, at + 1)) {
    // charAt(-1) yields '' for a match at index 0, which is not a letter.
    if (!LETTER.test(text.charAt(at - 1))) return true
  }
  return false
}

/**
 * Classifies a corpus source name into a display category and issuer family.
 *
 * Matching is case-insensitive and ignores surrounding whitespace. Rules are
 * evaluated in order (standards, then guidance, then court rulings); anything
 * that matches no rule is treated as a law/regulation.
 *
 * @param src Source name of the document; a falsy value is treated as ''.
 * @returns The category constant and the family label to group the document under.
 */
export function categorizeCorpusDoc(src: string): { cat: DocCat; family: string } {
  const u = (src || '').trim().toUpperCase()

  // Standards & best practice (technical issuer families).
  if (hasAcronym(u, 'OWASP')) return { cat: CAT_STANDARD, family: 'OWASP' }
  if (hasAcronym(u, 'NIST')) return { cat: CAT_STANDARD, family: 'NIST' }
  if (hasAcronym(u, 'CISA')) return { cat: CAT_STANDARD, family: 'CISA' }
  if (hasAcronym(u, 'OECD')) return { cat: CAT_STANDARD, family: 'OECD' }
  if (hasAcronym(u, 'ENISA')) return { cat: CAT_STANDARD, family: 'ENISA' }

  // Authority guidance (data-protection supervisors + EU Commission guides).
  if (u.startsWith('DSK')) {
    return { cat: CAT_GUIDANCE, family: 'DSK (Datenschutzkonferenz)' }
  }
  if (hasAcronym(u, 'EDPB')) return { cat: CAT_GUIDANCE, family: 'EDPB' }
  if (hasAcronym(u, 'EDPS')) return { cat: CAT_GUIDANCE, family: 'EDPS' }
  if (hasAcronym(u, 'WP29')) {
    return { cat: CAT_GUIDANCE, family: 'WP29 (Art.-29-Gruppe)' }
  }
  if (hasAcronym(u, 'BFDI')) return { cat: CAT_GUIDANCE, family: 'BfDI' }
  if (u.includes('EU MACHINERY GUIDE') || u.includes('EU BLUE GUIDE')) {
    return { cat: CAT_GUIDANCE, family: 'EU-Kommission (Guides)' }
  }

  // Court rulings are recognised by the court abbreviation at the start.
  if (u.startsWith('BGH') || u.startsWith('BVGER') || u.startsWith('EUGH')) {
    return { cat: CAT_COURT, family: 'Rechtsprechung' }
  }

  // Default: law / regulation / directive.
  return { cat: CAT_LAW, family: 'Gesetze & Verordnungen' }
}
/** One issuer family inside a category, as produced by groupCorpusDocs. */
export interface CorpusFamilyGroup {
/** Family label returned by categorizeCorpusDoc (e.g. 'EDPB', 'NIST'). */
family: string
/** Sum of atom_count over all documents in this family. */
total: number
/** Documents of this family; groupCorpusDocs orders them by atom_count, descending. */
docs: CorpusDoc[]
}
/** One document category with its issuer families, as produced by groupCorpusDocs. */
export interface CorpusCatGroup {
/** Category identifier, copied from the category's key. */
key: string
/** Category heading text, copied from the category's label. */
label: string
/** Sort position of the category; groupCorpusDocs returns groups in ascending order. */
order: number
/** Sum of the totals of all families in this category. */
total: number
/** Families of this category; groupCorpusDocs orders them by total, descending. */
families: CorpusFamilyGroup[]
}
/**
 * Buckets corpus documents first by category and then by issuer family, so
 * that e.g. all DSK, all EDPB and all OWASP/NIST documents sit together under
 * their category heading.
 *
 * Ordering of the result:
 * - categories ascending by `order` (laws → guidance → standards → court),
 * - families within a category descending by summed atom_count,
 * - documents within a family descending by atom_count.
 *
 * The input array and its elements are not modified.
 *
 * @param docs Corpus documents to group.
 * @returns One entry per category that has at least one document.
 */
export function groupCorpusDocs(docs: CorpusDoc[]): CorpusCatGroup[] {
  const buckets = new Map<string, { cat: DocCat; fam: Map<string, CorpusDoc[]> }>()

  // Pass 1: drop every document into its category/family bucket.
  docs.forEach((entry) => {
    const { cat, family } = categorizeCorpusDoc(entry.source_regulation)
    let bucket = buckets.get(cat.key)
    if (bucket === undefined) {
      bucket = { cat, fam: new Map() }
      buckets.set(cat.key, bucket)
    }
    const members = bucket.fam.get(family)
    if (members === undefined) {
      bucket.fam.set(family, [entry])
    } else {
      members.push(entry)
    }
  })

  // Pass 2: turn the buckets into sorted, totalled groups.
  const grouped: CorpusCatGroup[] = []
  for (const { cat, fam } of buckets.values()) {
    const families: CorpusFamilyGroup[] = []
    for (const [family, members] of fam) {
      // Sort a copy so the bucket (and thus the caller's objects' order) stays untouched.
      const ranked = members.slice().sort((x, y) => y.atom_count - x.atom_count)
      let familyTotal = 0
      for (const member of members) familyTotal += member.atom_count
      families.push({ family, docs: ranked, total: familyTotal })
    }
    families.sort((x, y) => y.total - x.total)

    let categoryTotal = 0
    for (const entry of families) categoryTotal += entry.total

    grouped.push({
      key: cat.key,
      label: cat.label,
      order: cat.order,
      total: categoryTotal,
      families,
    })
  }

  grouped.sort((x, y) => x.order - y.order)
  return grouped
}
export const USE_CASE_GROUP_LABELS: Record<string, string> = {
document: 'Dokument-Compliance',
security: 'Security',
+51 -15
View File
@@ -1,3 +1,4 @@
import { Fragment } from 'react'
import Link from 'next/link'
import {
type UseCaseRow,
@@ -5,6 +6,7 @@ import {
licenseTierBadgeClass,
commercialBadgeClass,
groupUseCases,
groupCorpusDocs,
} from './_helpers'
const BACKEND_URL =
@@ -163,22 +165,56 @@ export default async function CoveragePage() {
</tr>
</thead>
<tbody className="divide-y divide-gray-100 bg-white">
{(corpus?.documents ?? []).map((d) => (
<tr key={d.source_regulation}>
<td className="px-4 py-2 text-gray-900">{d.source_regulation}</td>
<td className="px-4 py-2">
<span
className={`rounded px-2 py-0.5 text-xs font-medium ${licenseTierBadgeClass(d.license_rule)}`}
title={d.license_tier}
{groupCorpusDocs(corpus?.documents ?? []).map((cat) => (
<Fragment key={cat.key}>
<tr className="bg-gray-100">
<td
colSpan={4}
className="px-4 py-2 text-sm font-semibold text-gray-800"
>
Tier {d.license_rule ?? '?'}
</span>
</td>
<td className="px-4 py-2 text-right">{d.atom_count.toLocaleString('de-DE')}</td>
<td className="px-4 py-2 font-mono text-xs text-gray-600">
{d.use_case ?? <span className="text-amber-600"> ungemappt</span>}
</td>
</tr>
{cat.label}{' '}
<span className="font-normal text-gray-500">
({cat.families.reduce((s, f) => s + f.docs.length, 0)} Quellen ·{' '}
{cat.total.toLocaleString('de-DE')} Pflichten)
</span>
</td>
</tr>
{cat.families.map((fam) => (
<Fragment key={cat.key + fam.family}>
<tr className="bg-gray-50">
<td
colSpan={4}
className="px-4 py-1 pl-8 text-xs font-medium uppercase tracking-wide text-gray-500"
>
{fam.family}
</td>
</tr>
{fam.docs.map((d) => (
<tr key={d.source_regulation}>
<td className="px-4 py-2 pl-8 text-gray-900">
{d.source_regulation}
</td>
<td className="px-4 py-2">
<span
className={`rounded px-2 py-0.5 text-xs font-medium ${licenseTierBadgeClass(d.license_rule)}`}
title={d.license_tier}
>
Tier {d.license_rule ?? '?'}
</span>
</td>
<td className="px-4 py-2 text-right">
{d.atom_count.toLocaleString('de-DE')}
</td>
<td className="px-4 py-2 font-mono text-xs text-gray-600">
{d.use_case ?? (
<span className="text-amber-600"> ungemappt</span>
)}
</td>
</tr>
))}
</Fragment>
))}
</Fragment>
))}
</tbody>
</table>