From 8c42fefa774c0efd9ff1b5896a52c260bc636002 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sat, 28 Feb 2026 17:46:11 +0100 Subject: [PATCH] feat(rag): add QA Split-View Chunk-Browser for ingestion verification New ChunkBrowserQA component replaces inline chunk browser with: - Document sidebar with live chunk counts per regulation (batched Qdrant count API) - Sequential chunk navigation with arrow keys (1/N through all chunks of a document) - Overlap display showing previous/next chunk boundaries (amber-highlighted) - Split-view with original PDF via iframe (estimated page from chunk index) - Adjustable chunks-per-page ratio for PDF page estimation Extracts REGULATIONS_IN_RAG and REGULATION_INFO to shared rag-constants.ts. Co-Authored-By: Claude Opus 4.6 --- .../ai/rag/components/ChunkBrowserQA.tsx | 552 ++++++++++++++++++ .../ai/rag/components/rag-pdf-mapping.ts | 110 ++++ admin-lehrer/app/(admin)/ai/rag/page.tsx | 359 +----------- .../app/(admin)/ai/rag/rag-constants.ts | 222 +++++++ admin-lehrer/app/api/legal-corpus/route.ts | 26 + admin-lehrer/package-lock.json | 463 ++++++++++++++- 6 files changed, 1377 insertions(+), 355 deletions(-) create mode 100644 admin-lehrer/app/(admin)/ai/rag/components/ChunkBrowserQA.tsx create mode 100644 admin-lehrer/app/(admin)/ai/rag/components/rag-pdf-mapping.ts create mode 100644 admin-lehrer/app/(admin)/ai/rag/rag-constants.ts diff --git a/admin-lehrer/app/(admin)/ai/rag/components/ChunkBrowserQA.tsx b/admin-lehrer/app/(admin)/ai/rag/components/ChunkBrowserQA.tsx new file mode 100644 index 0000000..84171b1 --- /dev/null +++ b/admin-lehrer/app/(admin)/ai/rag/components/ChunkBrowserQA.tsx @@ -0,0 +1,552 @@ +'use client' + +import React, { useState, useEffect, useCallback, useRef } from 'react' +import { RAG_PDF_MAPPING } from './rag-pdf-mapping' +import { REGULATIONS_IN_RAG, REGULATION_INFO } from '../rag-constants' + +interface ChunkBrowserQAProps { + apiProxy: string +} + +type RegGroupKey = 'eu_regulation' | 'eu_directive' | 'de_law' | 'at_law' | 'ch_law' | 'national_law' | 'bsi_standard' | 'eu_guideline' | 'international_standard' | 'other' + +const GROUP_LABELS: Record = { + eu_regulation: 'EU Verordnungen', + eu_directive: 'EU Richtlinien', + de_law: 'DE Gesetze', + at_law: 'AT Gesetze', + ch_law: 'CH Gesetze', + national_law: 'Nationale Gesetze (EU)', + bsi_standard: 'BSI Standards', + eu_guideline: 'EDPB / Guidelines', + international_standard: 'Internationale Standards', + other: 'Sonstige', +} + +const GROUP_ORDER: RegGroupKey[] = [ + 'eu_regulation', 'eu_directive', 'de_law', 'at_law', 'ch_law', + 'national_law', 'bsi_standard', 'eu_guideline', 'international_standard', 'other', +] + +const COLLECTIONS = [ + 'bp_compliance_gesetze', + 'bp_compliance_ce', + 'bp_compliance_datenschutz', +] + +export function ChunkBrowserQA({ apiProxy }: ChunkBrowserQAProps) { + // Filter-Sidebar + const [selectedRegulation, setSelectedRegulation] = useState(null) + const [regulationCounts, setRegulationCounts] = useState>({}) + const [filterSearch, setFilterSearch] = useState('') + const [countsLoading, setCountsLoading] = useState(false) + + // Dokument-Chunks (sequenziell) + const [docChunks, setDocChunks] = useState[]>([]) + const [docChunkIndex, setDocChunkIndex] = useState(0) + const [docTotalChunks, setDocTotalChunks] = useState(0) + const [docLoading, setDocLoading] = useState(false) + const docChunksRef = useRef(docChunks) + docChunksRef.current = docChunks + + // Split-View + const [splitViewActive, setSplitViewActive] = useState(true) + const [chunksPerPage, setChunksPerPage] = useState(6) + + // Collection + const [collection, setCollection] = useState('bp_compliance_gesetze') + + // Sidebar collapsed groups + const [collapsedGroups, setCollapsedGroups] = useState>(new Set()) + + // Build grouped regulations for sidebar + const regulationsInCollection = Object.entries(REGULATIONS_IN_RAG) + .filter(([, info]) => info.collection === collection) + .map(([code]) => code) + + const groupedRegulations = React.useMemo(() => { + const groups: Record = { + eu_regulation: [], eu_directive: [], de_law: [], at_law: [], ch_law: [], + national_law: [], bsi_standard: [], eu_guideline: [], international_standard: [], other: [], + } + for (const code of regulationsInCollection) { + const reg = REGULATION_INFO.find(r => r.code === code) + const type = (reg?.type || 'other') as RegGroupKey + const groupKey = type in groups ? type : 'other' + groups[groupKey].push({ + code, + name: reg?.name || code, + type: reg?.type || 'unknown', + }) + } + return groups + }, [regulationsInCollection.join(',')]) + + // Load regulation counts for current collection + const loadRegulationCounts = useCallback(async (col: string) => { + const codes = Object.entries(REGULATIONS_IN_RAG) + .filter(([, info]) => info.collection === col) + .map(([code]) => code) + if (codes.length === 0) return + + setCountsLoading(true) + try { + const params = new URLSearchParams({ + action: 'regulation-counts-batch', + collection: col, + codes: codes.join(','), + }) + const res = await fetch(`${apiProxy}?${params}`) + if (res.ok) { + const data = await res.json() + setRegulationCounts(prev => ({ ...prev, ...data.counts })) + } + } catch (error) { + console.error('Failed to load regulation counts:', error) + } finally { + setCountsLoading(false) + } + }, [apiProxy]) + + // Load all chunks for a regulation (paginated scroll) + const loadDocumentChunks = useCallback(async (regulationCode: string) => { + const ragInfo = REGULATIONS_IN_RAG[regulationCode] + if (!ragInfo) return + + setDocLoading(true) + setDocChunks([]) + setDocChunkIndex(0) + setDocTotalChunks(0) + + const allChunks: Record[] = [] + let offset: string | null = null + + try { + // Paginated scroll, 100 at a time + let safety = 0 + do { + const params = new URLSearchParams({ + action: 'scroll', + collection: ragInfo.collection, + limit: '100', + filter_key: 'regulation_code', + filter_value: regulationCode, + }) + if (offset) params.append('offset', offset) + + const res = await fetch(`${apiProxy}?${params}`) + if (!res.ok) break + + const data = await res.json() + const chunks = data.chunks || [] + allChunks.push(...chunks) + offset = data.next_offset || null + safety++ + } while (offset && safety < 200) // safety limit ~20k chunks + + // Sort by chunk_index + allChunks.sort((a, b) => { + const ai = Number(a.chunk_index ?? a.chunk_id ?? 0) + const bi = Number(b.chunk_index ?? b.chunk_id ?? 0) + return ai - bi + }) + + setDocChunks(allChunks) + setDocTotalChunks(allChunks.length) + setDocChunkIndex(0) + } catch (error) { + console.error('Failed to load document chunks:', error) + } finally { + setDocLoading(false) + } + }, [apiProxy]) + + // Initial load + useEffect(() => { + loadRegulationCounts(collection) + }, [collection, loadRegulationCounts]) + + // Current chunk + const currentChunk = docChunks[docChunkIndex] || null + const prevChunk = docChunkIndex > 0 ? docChunks[docChunkIndex - 1] : null + const nextChunk = docChunkIndex < docChunks.length - 1 ? docChunks[docChunkIndex + 1] : null + + // PDF page estimation + const estimatePdfPage = (chunkIndex: number): number => { + const mapping = selectedRegulation ? RAG_PDF_MAPPING[selectedRegulation] : null + const cpp = mapping?.chunksPerPage || chunksPerPage + return Math.floor(chunkIndex / cpp) + 1 + } + + const pdfPage = currentChunk ? estimatePdfPage(docChunkIndex) : 1 + const pdfMapping = selectedRegulation ? RAG_PDF_MAPPING[selectedRegulation] : null + const pdfUrl = pdfMapping ? `/rag-originals/${pdfMapping.filename}#page=${pdfPage}` : null + + // Handlers + const handleSelectRegulation = (code: string) => { + setSelectedRegulation(code) + loadDocumentChunks(code) + } + + const handleCollectionChange = (col: string) => { + setCollection(col) + setSelectedRegulation(null) + setDocChunks([]) + setDocChunkIndex(0) + setDocTotalChunks(0) + setRegulationCounts({}) + } + + const handlePrev = () => { + if (docChunkIndex > 0) setDocChunkIndex(i => i - 1) + } + + const handleNext = () => { + if (docChunkIndex < docChunks.length - 1) setDocChunkIndex(i => i + 1) + } + + const handleKeyDown = useCallback((e: KeyboardEvent) => { + if (e.key === 'ArrowLeft' || e.key === 'ArrowUp') { + e.preventDefault() + setDocChunkIndex(i => Math.max(0, i - 1)) + } else if (e.key === 'ArrowRight' || e.key === 'ArrowDown') { + e.preventDefault() + setDocChunkIndex(i => Math.min(docChunksRef.current.length - 1, i + 1)) + } + }, []) + + useEffect(() => { + if (selectedRegulation && docChunks.length > 0) { + window.addEventListener('keydown', handleKeyDown) + return () => window.removeEventListener('keydown', handleKeyDown) + } + }, [selectedRegulation, docChunks.length, handleKeyDown]) + + const toggleGroup = (group: string) => { + setCollapsedGroups(prev => { + const next = new Set(prev) + if (next.has(group)) next.delete(group) + else next.add(group) + return next + }) + } + + // Get text content from a chunk + const getChunkText = (chunk: Record | null): string => { + if (!chunk) return '' + return String(chunk.text || chunk.content || chunk.chunk_text || '') + } + + // Overlap extraction + const getOverlapPrev = (): string => { + if (!prevChunk) return '' + const text = getChunkText(prevChunk) + return text.length > 150 ? '...' + text.slice(-150) : text + } + + const getOverlapNext = (): string => { + if (!nextChunk) return '' + const text = getChunkText(nextChunk) + return text.length > 150 ? text.slice(0, 150) + '...' : text + } + + // Filter sidebar items + const filteredRegulations = React.useMemo(() => { + if (!filterSearch.trim()) return groupedRegulations + const term = filterSearch.toLowerCase() + const filtered: typeof groupedRegulations = { + eu_regulation: [], eu_directive: [], de_law: [], at_law: [], ch_law: [], + national_law: [], bsi_standard: [], eu_guideline: [], international_standard: [], other: [], + } + for (const [group, items] of Object.entries(groupedRegulations)) { + filtered[group as RegGroupKey] = items.filter( + r => r.code.toLowerCase().includes(term) || r.name.toLowerCase().includes(term) + ) + } + return filtered + }, [groupedRegulations, filterSearch]) + + // Regulation name lookup + const getRegName = (code: string): string => { + const reg = REGULATION_INFO.find(r => r.code === code) + return reg?.name || code + } + + return ( +
+ {/* Header bar */} +
+
+
+ + +
+ + {selectedRegulation && ( + <> +
+ + QA-Modus: {selectedRegulation} — {getRegName(selectedRegulation)} + +
+
+ + Chunk {docChunkIndex + 1} / {docTotalChunks} + + + +
+
+ + + +
+ + )} +
+
+ + {/* Main content: Sidebar + Content */} +
+ {/* Sidebar */} +
+
+ setFilterSearch(e.target.value)} + placeholder="Suche..." + className="w-full px-2 py-1.5 border rounded-lg text-sm focus:ring-2 focus:ring-teal-500" + /> + {countsLoading && ( +
Counts werden geladen...
+ )} +
+
+ {GROUP_ORDER.map(group => { + const items = filteredRegulations[group] + if (items.length === 0) return null + const isCollapsed = collapsedGroups.has(group) + return ( +
+ + {!isCollapsed && items.map(reg => { + const count = regulationCounts[reg.code] ?? REGULATIONS_IN_RAG[reg.code]?.chunks ?? 0 + const isSelected = selectedRegulation === reg.code + return ( + + ) + })} +
+ ) + })} +
+
+ + {/* Content area */} + {!selectedRegulation ? ( +
+
+
🔍
+

Waehle ein Dokument in der Sidebar, um die QA-Ansicht zu starten.

+

Pfeiltasten navigieren zwischen Chunks.

+
+
+ ) : docLoading ? ( +
+
+
+

Chunks werden geladen...

+

+ {selectedRegulation}: {REGULATIONS_IN_RAG[selectedRegulation]?.chunks || '?'} Chunks erwartet +

+
+
+ ) : ( +
+ {/* Chunk-Text Panel */} +
+
+ Chunk-Text + + Index: {docChunkIndex} / {docTotalChunks - 1} + +
+
+ {/* Overlap from previous chunk */} + {prevChunk && ( +
+
↑ Overlap (vorheriger Chunk #{docChunkIndex - 1})
+

{getOverlapPrev()}

+
+ )} + + {/* Current chunk text */} + {currentChunk && ( +
+ {getChunkText(currentChunk)} +
+ )} + + {/* Overlap from next chunk */} + {nextChunk && ( +
+
↓ Overlap (naechster Chunk #{docChunkIndex + 1})
+

{getOverlapNext()}

+
+ )} + + {/* Metadata */} + {currentChunk && ( +
+
Metadaten
+
+ {Object.entries(currentChunk) + .filter(([k]) => !['text', 'content', 'chunk_text', 'id'].includes(k)) + .map(([k, v]) => ( +
+ {k}: + {String(v)} +
+ ))} +
+
+ )} +
+ + {/* Bottom nav */} +
+ +
+ { + const v = parseInt(e.target.value, 10) + if (!isNaN(v) && v >= 0 && v < docTotalChunks) setDocChunkIndex(v) + }} + className="w-20 px-2 py-1 border rounded text-xs text-center" + /> + / {docTotalChunks - 1} +
+ +
+
+ + {/* PDF-Viewer Panel */} + {splitViewActive && ( +
+
+ Original-PDF + + Seite ~{pdfPage} + {pdfMapping?.totalPages ? ` / ${pdfMapping.totalPages}` : ''} + +
+
+ {pdfUrl ? ( +