Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 6b9c7984b4 | |||
| e646091ba2 | |||
| 069b855b49 | |||
| 01af9b56a6 | |||
| 017c9b3c12 |
@@ -24,6 +24,20 @@ describe('advisor-rag', () => {
|
|||||||
expect(out).toEqual([{ content: 'Art. 35 DSGVO ...', source: 'DSGVO', score: 0.91 }])
|
expect(out).toEqual([{ content: 'Art. 35 DSGVO ...', source: 'DSGVO', score: 0.91 }])
|
||||||
})
|
})
|
||||||
|
|
||||||
|
it('haengt article/paragraph an die Quelle an (Fallback ohne article_label)', () => {
|
||||||
|
const out = mod.mapSdkResults([
|
||||||
|
{ text: 'Pflicht zur Benennung ...', regulation_short: 'BDSG', article: '§ 38', paragraph: '(1)', score: 0.8 },
|
||||||
|
])
|
||||||
|
expect(out).toEqual([{ content: 'Pflicht zur Benennung ...', source: 'BDSG § 38 (1)', score: 0.8 }])
|
||||||
|
})
|
||||||
|
|
||||||
|
it('nutzt article_label direkt, wenn vorhanden (druckbare Fundstelle)', () => {
|
||||||
|
const out = mod.mapSdkResults([
|
||||||
|
{ text: 'x', regulation_short: 'BDSG', article: '38', paragraph: '1', sub: 'Satz 2', article_label: 'BDSG § 38 Abs. 1', score: 0.9 },
|
||||||
|
])
|
||||||
|
expect(out[0].source).toBe('BDSG § 38 Abs. 1')
|
||||||
|
})
|
||||||
|
|
||||||
it('faellt auf regulation_name/code zurueck und filtert leere Inhalte', () => {
|
it('faellt auf regulation_name/code zurueck und filtert leere Inhalte', () => {
|
||||||
const out = mod.mapSdkResults([
|
const out = mod.mapSdkResults([
|
||||||
{ text: '', regulation_short: 'X' },
|
{ text: '', regulation_short: 'X' },
|
||||||
|
|||||||
@@ -7,6 +7,7 @@
|
|||||||
* — damit profitiert der Advisor vom reicheren Embedding.
|
* — damit profitiert der Advisor vom reicheren Embedding.
|
||||||
*
|
*
|
||||||
* Fehler je Collection werden geschluckt (graceful: Antwort ohne diesen Treffer).
|
* Fehler je Collection werden geschluckt (graceful: Antwort ohne diesen Treffer).
|
||||||
|
* Fundstellen via article_label sind live ab dem Prod-Re-Ingest 2026-06.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
const SDK_URL =
|
const SDK_URL =
|
||||||
@@ -31,6 +32,12 @@ interface SdkRagResult {
|
|||||||
regulation_code?: string
|
regulation_code?: string
|
||||||
regulation_name?: string
|
regulation_name?: string
|
||||||
regulation_short?: string
|
regulation_short?: string
|
||||||
|
article_label?: string
|
||||||
|
article?: string
|
||||||
|
paragraph?: string
|
||||||
|
sub?: string
|
||||||
|
citation_style?: string
|
||||||
|
is_recital?: boolean
|
||||||
category?: string
|
category?: string
|
||||||
source_url?: string
|
source_url?: string
|
||||||
score?: number
|
score?: number
|
||||||
@@ -47,7 +54,15 @@ export function mapSdkResults(results: SdkRagResult[] | undefined): ScoredPassag
|
|||||||
return (results || [])
|
return (results || [])
|
||||||
.map((r) => ({
|
.map((r) => ({
|
||||||
content: r.text || '',
|
content: r.text || '',
|
||||||
source: r.regulation_short || r.regulation_name || r.regulation_code || 'Unbekannt',
|
// Fundstelle: article_label ist die fertig formatierte, druckbare Quelle aus der
|
||||||
|
// Ingestion ("BDSG § 38 Abs. 1"); Fallback baut sie aus den strukturierten Feldern
|
||||||
|
// (bzw. alt-ingestierte Chunks ohne Legal-Metadaten). Siehe rag_reingest_spec.md §2/§7.
|
||||||
|
source:
|
||||||
|
(r.article_label && r.article_label.trim()) ||
|
||||||
|
[r.regulation_short || r.regulation_name || r.regulation_code, r.article, r.paragraph, r.sub]
|
||||||
|
.filter(Boolean)
|
||||||
|
.join(' ') ||
|
||||||
|
'Unbekannt',
|
||||||
score: typeof r.score === 'number' ? r.score : 0,
|
score: typeof r.score === 'number' ? r.score : 0,
|
||||||
}))
|
}))
|
||||||
.filter((p) => p.content)
|
.filter((p) => p.content)
|
||||||
|
|||||||
@@ -5,6 +5,7 @@
|
|||||||
* Frueher: bp-core-rag-service:8097 — der existiert auf prod NICHT (nur macmini/dev),
|
* Frueher: bp-core-rag-service:8097 — der existiert auf prod NICHT (nur macmini/dev),
|
||||||
* dadurch lieferte die Drafting-Engine dort keinen RAG-Kontext. Die ai-sdk embeddet
|
* dadurch lieferte die Drafting-Engine dort keinen RAG-Kontext. Die ai-sdk embeddet
|
||||||
* mit bge-m3 und ist prod-erreichbar. Genutzt von draft-, chat- und vendor-review-Routes.
|
* mit bge-m3 und ist prod-erreichbar. Genutzt von draft-, chat- und vendor-review-Routes.
|
||||||
|
* Fundstellen via article_label sind live ab dem Prod-Re-Ingest 2026-06.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
const SDK_URL =
|
const SDK_URL =
|
||||||
@@ -18,6 +19,10 @@ interface SdkRagResult {
|
|||||||
regulation_code?: string
|
regulation_code?: string
|
||||||
regulation_name?: string
|
regulation_name?: string
|
||||||
regulation_short?: string
|
regulation_short?: string
|
||||||
|
article_label?: string
|
||||||
|
article?: string
|
||||||
|
paragraph?: string
|
||||||
|
sub?: string
|
||||||
// Rueckwaerts-kompatibel, falls eine Quelle noch das alte rag-service-Format liefert:
|
// Rueckwaerts-kompatibel, falls eine Quelle noch das alte rag-service-Format liefert:
|
||||||
content?: string
|
content?: string
|
||||||
source_name?: string
|
source_name?: string
|
||||||
@@ -56,12 +61,13 @@ export async function queryRAG(query: string, topK = 3, collection?: string): Pr
|
|||||||
|
|
||||||
return results
|
return results
|
||||||
.map((r, i) => {
|
.map((r, i) => {
|
||||||
|
const base =
|
||||||
|
r.regulation_short || r.regulation_name || r.regulation_code || r.source_name || r.source_code
|
||||||
|
// article_label = fertig formatierte Fundstelle aus der Ingestion ("BDSG § 38 Abs. 1");
|
||||||
|
// Fallback baut sie aus den strukturierten Feldern. Siehe rag_reingest_spec.md §2/§7.
|
||||||
const source =
|
const source =
|
||||||
r.regulation_short ||
|
(r.article_label && r.article_label.trim()) ||
|
||||||
r.regulation_name ||
|
[base, r.article, r.paragraph, r.sub].filter(Boolean).join(' ') ||
|
||||||
r.regulation_code ||
|
|
||||||
r.source_name ||
|
|
||||||
r.source_code ||
|
|
||||||
'Unbekannt'
|
'Unbekannt'
|
||||||
const content = r.text || r.content || ''
|
const content = r.text || r.content || ''
|
||||||
return `[Quelle ${i + 1}: ${source}]\n${content}`
|
return `[Quelle ${i + 1}: ${source}]\n${content}`
|
||||||
|
|||||||
@@ -95,12 +95,29 @@ func (c *LegalRAGClient) searchInternal(ctx context.Context, collection string,
|
|||||||
|
|
||||||
results := make([]LegalSearchResult, len(hits))
|
results := make([]LegalSearchResult, len(hits))
|
||||||
for i, hit := range hits {
|
for i, hit := range hits {
|
||||||
|
// Legal-Metadaten nach rag_reingest_spec.md §2: bevorzugt die normalisierten Felder
|
||||||
|
// (article_label/regulation_code/article/...); Fallback auf alte Feldnamen, solange der
|
||||||
|
// Korpus noch nicht re-ingestiert ist (regulation_id, section="§ 38").
|
||||||
|
regCode := getString(hit.Payload, "regulation_code")
|
||||||
|
if regCode == "" {
|
||||||
|
regCode = getString(hit.Payload, "regulation_id")
|
||||||
|
}
|
||||||
|
article := getString(hit.Payload, "article")
|
||||||
|
if article == "" {
|
||||||
|
article = getString(hit.Payload, "section")
|
||||||
|
}
|
||||||
results[i] = LegalSearchResult{
|
results[i] = LegalSearchResult{
|
||||||
Text: getString(hit.Payload, "chunk_text"),
|
Text: getString(hit.Payload, "chunk_text"),
|
||||||
RegulationCode: getString(hit.Payload, "regulation_id"),
|
RegulationCode: regCode,
|
||||||
RegulationName: getString(hit.Payload, "regulation_name_de"),
|
RegulationName: getString(hit.Payload, "regulation_name_de"),
|
||||||
RegulationShort: getString(hit.Payload, "regulation_short"),
|
RegulationShort: getString(hit.Payload, "regulation_short"),
|
||||||
Category: getString(hit.Payload, "category"),
|
Category: getString(hit.Payload, "category"),
|
||||||
|
ArticleLabel: getString(hit.Payload, "article_label"),
|
||||||
|
Article: article,
|
||||||
|
Paragraph: getString(hit.Payload, "paragraph"),
|
||||||
|
Sub: getString(hit.Payload, "sub"),
|
||||||
|
IsRecital: getBool(hit.Payload, "is_recital"),
|
||||||
|
CitationStyle: getString(hit.Payload, "citation_style"),
|
||||||
Pages: getIntSlice(hit.Payload, "pages"),
|
Pages: getIntSlice(hit.Payload, "pages"),
|
||||||
SourceURL: getString(hit.Payload, "source"),
|
SourceURL: getString(hit.Payload, "source"),
|
||||||
Score: hit.Score,
|
Score: hit.Score,
|
||||||
|
|||||||
@@ -191,6 +191,13 @@ func (c *LegalRAGClient) ScrollDocumentIndex(ctx context.Context, collection str
|
|||||||
|
|
||||||
// Helper functions
|
// Helper functions
|
||||||
|
|
||||||
|
func getBool(m map[string]interface{}, key string) bool {
|
||||||
|
if v, ok := m[key].(bool); ok {
|
||||||
|
return v
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
func getString(m map[string]interface{}, key string) string {
|
func getString(m map[string]interface{}, key string) string {
|
||||||
if v, ok := m[key]; ok {
|
if v, ok := m[key]; ok {
|
||||||
if s, ok := v.(string); ok {
|
if s, ok := v.(string); ok {
|
||||||
|
|||||||
@@ -3,6 +3,8 @@ package ucca
|
|||||||
import "time"
|
import "time"
|
||||||
|
|
||||||
// LegalSearchResult represents a single search result from the compliance corpus.
|
// LegalSearchResult represents a single search result from the compliance corpus.
|
||||||
|
// Legal-Zitatfelder (article_label/article/paragraph/...) sind live ab dem
|
||||||
|
// Prod-Re-Ingest 2026-06 (siehe docs-src/development/rag_reingest_spec.md §2).
|
||||||
type LegalSearchResult struct {
|
type LegalSearchResult struct {
|
||||||
Text string `json:"text"`
|
Text string `json:"text"`
|
||||||
RegulationCode string `json:"regulation_code"`
|
RegulationCode string `json:"regulation_code"`
|
||||||
@@ -10,7 +12,11 @@ type LegalSearchResult struct {
|
|||||||
RegulationShort string `json:"regulation_short"`
|
RegulationShort string `json:"regulation_short"`
|
||||||
Category string `json:"category"`
|
Category string `json:"category"`
|
||||||
Article string `json:"article,omitempty"`
|
Article string `json:"article,omitempty"`
|
||||||
|
ArticleLabel string `json:"article_label,omitempty"`
|
||||||
Paragraph string `json:"paragraph,omitempty"`
|
Paragraph string `json:"paragraph,omitempty"`
|
||||||
|
Sub string `json:"sub,omitempty"`
|
||||||
|
IsRecital bool `json:"is_recital,omitempty"`
|
||||||
|
CitationStyle string `json:"citation_style,omitempty"`
|
||||||
Pages []int `json:"pages,omitempty"`
|
Pages []int `json:"pages,omitempty"`
|
||||||
SourceURL string `json:"source_url"`
|
SourceURL string `json:"source_url"`
|
||||||
Score float64 `json:"score"`
|
Score float64 `json:"score"`
|
||||||
|
|||||||
@@ -33,6 +33,12 @@ type RegulatoryNewsFilter struct {
|
|||||||
// GetRegulatoryNews scans all v2 obligations for upcoming deadlines
|
// GetRegulatoryNews scans all v2 obligations for upcoming deadlines
|
||||||
// and returns formatted news items sorted by urgency.
|
// and returns formatted news items sorted by urgency.
|
||||||
func GetRegulatoryNews(regulations map[string]*V2RegulationFile, filter RegulatoryNewsFilter) []RegulatoryNewsItem {
|
func GetRegulatoryNews(regulations map[string]*V2RegulationFile, filter RegulatoryNewsFilter) []RegulatoryNewsItem {
|
||||||
|
return GetRegulatoryNewsAt(regulations, filter, time.Now().UTC())
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetRegulatoryNewsAt is GetRegulatoryNews with an injectable reference time so the
|
||||||
|
// upcoming-deadline window is deterministic in tests (no time-bomb once a deadline passes).
|
||||||
|
func GetRegulatoryNewsAt(regulations map[string]*V2RegulationFile, filter RegulatoryNewsFilter, now time.Time) []RegulatoryNewsItem {
|
||||||
if filter.HorizonDays <= 0 {
|
if filter.HorizonDays <= 0 {
|
||||||
filter.HorizonDays = 365
|
filter.HorizonDays = 365
|
||||||
}
|
}
|
||||||
@@ -40,7 +46,7 @@ func GetRegulatoryNews(regulations map[string]*V2RegulationFile, filter Regulato
|
|||||||
filter.Limit = 5
|
filter.Limit = 5
|
||||||
}
|
}
|
||||||
|
|
||||||
today := time.Now().UTC().Truncate(24 * time.Hour)
|
today := now.UTC().Truncate(24 * time.Hour)
|
||||||
horizon := today.AddDate(0, 0, filter.HorizonDays)
|
horizon := today.AddDate(0, 0, filter.HorizonDays)
|
||||||
var items []RegulatoryNewsItem
|
var items []RegulatoryNewsItem
|
||||||
|
|
||||||
|
|||||||
@@ -174,7 +174,10 @@ func TestGetRegulatoryNews_FromRealFiles(t *testing.T) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
t.Skipf("could not load v2 regulations: %v", err)
|
t.Skipf("could not load v2 regulations: %v", err)
|
||||||
}
|
}
|
||||||
items := GetRegulatoryNews(regs, RegulatoryNewsFilter{Limit: 20, HorizonDays: 730})
|
// Fixed reference date so the test is deterministic regardless of the wall clock:
|
||||||
|
// VBR-OBL-001 (deadline 2026-06-19) must fall within [ref, ref+730d].
|
||||||
|
ref := time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC)
|
||||||
|
items := GetRegulatoryNewsAt(regs, RegulatoryNewsFilter{Limit: 20, HorizonDays: 730}, ref)
|
||||||
// Should find at least the Widerrufsbutton obligation
|
// Should find at least the Widerrufsbutton obligation
|
||||||
found := false
|
found := false
|
||||||
for _, item := range items {
|
for _, item := range items {
|
||||||
|
|||||||
@@ -0,0 +1,130 @@
|
|||||||
|
# RAG Re-Ingest-Spezifikation — Zitierfähige Chunks (Cross-Session-Vertrag)
|
||||||
|
|
||||||
|
> **Status:** v1 — Vertrag zwischen **core-Session** (Ingestion/Pipeline) und **compliance-Session** (Consumer: ai-sdk/Advisor).
|
||||||
|
> **Datum:** 2026-06-19
|
||||||
|
> **Ziel:** Chunks neu ingestieren mit sauberem Text + vollständigen **Legal-Metadaten**, damit Controls/Findings **zitierfähig** werden ("BDSG § 38 Abs. 1", "Art. 13 Abs. 1 lit. c DSGVO") — **ohne** die teuren Pass 0a/0b/Dedup.
|
||||||
|
|
||||||
|
## 0. Kernentscheidung (Frage 1): Re-Link statt Regenerieren
|
||||||
|
|
||||||
|
Controls hängen **an `control_uuid`, nicht an Chunks** (`canonical_controls` hat keine `chunk_id`/`chunk_hash`-Spalte; ebenso atom_classification, control_classification, doc_check_controls). **Ein Chunk-Re-Ingest bricht keine Control.** Das einzige stale-werdende Artefakt ist das Idempotenz-Ledger `compliance.canonical_processed_chunks` (Key `sha256(text)+collection+document_version`) — relevant nur für *zukünftige* Generierung.
|
||||||
|
|
||||||
|
→ **Pass 0a/0b/Dedup entfallen.** Stattdessen werden Controls per **Textabgleich** an die neuen Chunks **re-gelinkt** (Zitat-Anreicherung). Reichweite:
|
||||||
|
- **7 %** der 315.914 Controls haben `source_original_text` (Re-Link-Anker) → direkter Abgleich.
|
||||||
|
- **~93 % Atome** erben das Zitat über `parent_control_uuid`.
|
||||||
|
- **self-written** brauchen kein Chunk-Zitat (eigene Bibliothek).
|
||||||
|
- **unmatched Reste** → billiges per-Control-LLM-Zitat (Tier-3), keine Regenerierung.
|
||||||
|
|
||||||
|
## 1. Ingestion-Anforderungen (Frage 2)
|
||||||
|
|
||||||
|
1. **`chunk_strategy="legal"` EXPLIZIT setzen.** (Korrektur zur Historie: `recursive` ist inzwischen ein Alias für `chunk_text_legal`, `embedding-service/main.py:1079/1093`, live verifiziert — ein Upload mit `recursive` lieferte `section:"§ 38"`, `paragraph:"(1)"`, `paragraph_num:1`. Trotzdem `legal` explizit setzen und sich nicht auf den Alias verlassen.)
|
||||||
|
2. **Deterministische Chunk-ID** (heute random) = `sha1(regulation_code|article|paragraph|chunk_index|document_version)` → stabiler Re-Link + Alt/Neu-Koexistenz.
|
||||||
|
3. **`chunk_hash` IN die Payload** schreiben (heute nur im PG-Ledger) = `sha256(normalisierter chunk_text)`.
|
||||||
|
4. **Echte `document_version`** (heute hardcoded `"1.0"`) → Re-Chunk kollidiert sonst im Ledger.
|
||||||
|
5. **Upload-before-delete** je Collection (alte Chunks erst nach Verify löschen).
|
||||||
|
|
||||||
|
## 2. PAYLOAD-FELD-VERTRAG (verbindlich — Consumer liest GENAU diese Namen)
|
||||||
|
|
||||||
|
Die ai-sdk/Advisor liest die **consumer-facing** Felder. Ingestion füllt alle.
|
||||||
|
|
||||||
|
| Feld | Typ | Consumer | Beschreibung / Beispiel |
|
||||||
|
|---|---|---|---|
|
||||||
|
| `article_label` | string | **JA (Anzeige)** | **Fertig formatiert, direkt druckbar.** "BDSG § 38 Abs. 1" · "Art. 13 Abs. 1 lit. c DSGVO". Ingestion formatiert (kennt §- vs Art.-Stil). |
|
||||||
|
| `regulation_code` | string | **JA** | Kurzcode, UPPERCASE: `BDSG`,`DSGVO`,`TTDSG`,`DDG`,`CRA`,`NIS2` |
|
||||||
|
| `citation_style` | enum | **JA** | `paragraph` (§-Gesetze) \| `article` (EU-Verordnungen) — steuert §/Art.-Rendering, falls Consumer selbst formatiert |
|
||||||
|
| `article` | string | **JA** | bare Nummer: `"38"` bzw. `"13"` (egal ob § oder Art.) |
|
||||||
|
| `paragraph` | string | **JA** | bare Absatz: `"1"` (nicht `"(1)"`) |
|
||||||
|
| `sub` | string\|null | **JA** | feinste Granularität: `"lit. c"` · `"Satz 2"` · `"Nr. 3"` |
|
||||||
|
| `is_recital` | bool | **JA** | Erwägungsgrund vs operativer Artikel |
|
||||||
|
| `regulation_name` | string | optional | Volltext: "Bundesdatenschutzgesetz" |
|
||||||
|
| `page` | int\|null | optional | Seite (PDF-Quellen) |
|
||||||
|
| `chunk_text` | string | **JA** | sauberer Text (keine Soft-Hyphens/OCR-Reste) |
|
||||||
|
| `section_header` | string | optional | Kontext-Überschrift, **separat** (nicht inline im chunk_text) |
|
||||||
|
| `chunk_id` | string | — | deterministisch (s. §1.2) |
|
||||||
|
| `chunk_hash` | string | — | sha256(normalisierter Text) |
|
||||||
|
| `document_id`, `document_version`, `chunk_index` | — | — | Identität/Versionierung |
|
||||||
|
| `doc_type`, `use_case[]`, `source_type`, `license`, `bundesland`, `year` | — | Scope | Routing/Scope (source_type: gesetz/leitlinie/urteil) |
|
||||||
|
|
||||||
|
**Verbindlich:** `article_label` ist der bevorzugte Anzeige-Pfad (Ingestion ownt die Zitat-Formatierung, weil sie die Regulierung kennt). Die strukturierten Teile (`regulation_code`/`article`/`paragraph`/`sub`) sind zusätzlich für Filtern/Gruppieren da.
|
||||||
|
|
||||||
|
## 3. Normalisierung alt → neu (core-seitiger Transform)
|
||||||
|
|
||||||
|
Der legal-Chunker emittiert heute uneinheitlich; Ingestion normalisiert:
|
||||||
|
|
||||||
|
| heute (raw payload) | → neu |
|
||||||
|
|---|---|
|
||||||
|
| `section` = `"§ 38"` / `"Artikel 13"` | `citation_style` (§→`paragraph`, Art/Artikel→`article`) + `article`=`"38"`/`"13"` |
|
||||||
|
| `section_title` | `section_header` |
|
||||||
|
| `paragraph`=`"(1)"` / `paragraph_num`=1 | `paragraph`=`"1"` |
|
||||||
|
| (lit./Satz/Nr. aus `_PARAGRAPH_RE`) | `sub` (best-effort) |
|
||||||
|
| `regulation_id`/`regulation_short` (extra_metadata) | `regulation_code` (UPPERCASE) |
|
||||||
|
| — | `article_label` (formatiert aus regulation_code+style+article+paragraph+sub) |
|
||||||
|
|
||||||
|
`article_label`-Formatierung:
|
||||||
|
- `paragraph`-Stil: `"{regulation_code} § {article} Abs. {paragraph}"` (+ `" {sub}"`)
|
||||||
|
- `article`-Stil: `"Art. {article} Abs. {paragraph} {sub} {regulation_code}"`
|
||||||
|
|
||||||
|
## 4. Control-Re-Link (core)
|
||||||
|
|
||||||
|
Erweiterung von `control-pipeline/services/citation_backfill.py` (heute 3-Tier, Tier-1 = `sha256(source_original_text)` → Chunk-Hash-Index): Tier-1 bricht beim Re-Chunk (neuer Text → neuer Hash) → **Fuzzy/Embedding-Alignment** ergänzen (`source_original_text` ↔ neue Chunk-Texte, Substring + Cosine). Präzedenzfall: PDF-QA-Matcher (~52 % Trefferquote). Der Re-Link füllt `canonical_controls.source_citation = {regulation_code, article, paragraph, sub, page, source_type, license, url}`. Atome erben über `parent_control_uuid`. `doc_check_controls` danach re-derivieren — sie werden damit automatisch zitierfähig (`derive_doc_check_controls.py` liest `source_citation->>'article'/'source'`).
|
||||||
|
|
||||||
|
**Beim künftigen Generieren IMMER `source_original_text` setzen** (das fehlende Feld ist der Grund, warum heute nur 7 % der Controls direkt re-linkbar sind).
|
||||||
|
|
||||||
|
## 5. Pipeline-Reihenfolge
|
||||||
|
|
||||||
|
1. Re-Ingest je Collection: `strategy="legal"`, deterministische IDs, neue `document_version`, normalisierte Payload + `chunk_hash`.
|
||||||
|
2. Verify: Zähler alt/neu + Stichprobe `article`/`paragraph`/`article_label` befüllt.
|
||||||
|
3. Re-Link (`citation_backfill` erweitert) → `source_citation`; Atome erben.
|
||||||
|
4. Reste → Tier-3-LLM-Zitat.
|
||||||
|
5. Alte Chunks löschen.
|
||||||
|
6. *(optional, nur für künftige Generierung)* Ledger `canonical_processed_chunks` neu aufbauen.
|
||||||
|
|
||||||
|
## 6. Arbeitsteilung + Akzeptanz
|
||||||
|
|
||||||
|
**core-Session:** Ingest-Spec umsetzen, `citation_backfill` Hash→Fuzzy→Embedding, Payload-Normalisierung (§3), deterministische IDs/`chunk_hash`/`document_version`, AGG-Lücke.
|
||||||
|
|
||||||
|
**compliance-Session (Consumer):** ai-sdk `legal_rag_client.go` + Advisor/Drafting auf die **§2-Feldnamen** ummappen (kein Deploy vor Pin — jetzt gepinnt), Prod-Qdrant-Verify, **6-Fragen-Re-Test** (sind §38 BDSG / AGG / CRA Art. 14 grounded zitiert?).
|
||||||
|
|
||||||
|
**Akzeptanzkriterium:** Advisor rendert für eine Beispielfrage eine echte Fundstelle aus `article_label` (z. B. "BDSG § 38 Abs. 1"), nicht nur "Quelle: BDSG".
|
||||||
|
|
||||||
|
## 7. Consumer-Detailabschnitt (compliance-Session — implementiert 2026-06-19)
|
||||||
|
|
||||||
|
Status: Code steht + getestet (17 Tests grün, tsc sauber). **Deploy gekoppelt an den Re-Ingest** (liest bis dahin leere Felder → graceful, Advisor zeigt wie heute "Quelle: BDSG").
|
||||||
|
|
||||||
|
### 7.1 ai-sdk: Payload → Response (`internal/ucca/legal_rag_client.go` `searchInternal`, Struct `legal_rag_types.go`)
|
||||||
|
|
||||||
|
| Response-Feld (JSON) | gelesen aus Payload | Fallback (alt-Korpus) |
|
||||||
|
|---|---|---|
|
||||||
|
| `article_label` | `article_label` | — (leer → Consumer baut selbst) |
|
||||||
|
| `regulation_code` | `regulation_code` | → `regulation_id` |
|
||||||
|
| `article` | `article` | → `section` ("§ 38") |
|
||||||
|
| `paragraph` | `paragraph` | — |
|
||||||
|
| `sub` | `sub` | — |
|
||||||
|
| `citation_style` | `citation_style` | — |
|
||||||
|
| `is_recital` | `is_recital` (bool) | — |
|
||||||
|
| `text` | `chunk_text` | — |
|
||||||
|
| `regulation_name` | `regulation_name_de` | — |
|
||||||
|
| `regulation_short` | `regulation_short` | — |
|
||||||
|
| `category`,`pages`,`source_url`,`score` | `category`,`pages`,`source`,(score) | — |
|
||||||
|
|
||||||
|
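→ `/sdk/v1/rag/search` liefert diese Felder snake_case. Neuer Bool-Helfer `getBool` ergänzt. Offen: §2 nennt `page` (int) und `regulation_name`, gelesen werden aktuell `pages` (int[]) und `regulation_name_de` — Feldnamen mit der Ingestion abgleichen.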
|
||||||
|
|
||||||
|
### 7.2 Advisor + Drafting: Fundstellen-Format
|
||||||
|
|
||||||
|
Beide Konsumenten (`admin-compliance/lib/sdk/agents/advisor-rag.ts`, `.../drafting-engine/rag-query.ts`) bilden die Quellenzeile so:
|
||||||
|
|
||||||
|
```
|
||||||
|
source = article_label?.trim() // bevorzugt: druckbar aus Ingestion
|
||||||
|
|| [regulation_short || regulation_name || regulation_code, article, paragraph, sub]
|
||||||
|
.filter(Boolean).join(' ') // Fallback: strukturiert zusammensetzen
|
||||||
|
|| 'Unbekannt'
|
||||||
|
Ausgabe je Treffer: [Quelle N: {source}]\n{text}
|
||||||
|
```
|
||||||
|
|
||||||
|
→ Der Advisor **druckt `article_label` direkt** (kein §-vs-Art-Ableiten); `citation_style` nur nötig, falls wir später selbst formatieren. Erfüllt das Akzeptanzkriterium (§6): "BDSG § 38 Abs. 1" statt nur "Quelle: BDSG".
|
||||||
|
|
||||||
|
### 7.3 Deploy-Kopplung
|
||||||
|
Code ist additiv/safe (neue Felder leer bis Re-Ingest). **Kein Solo-Deploy** — geht mit dem Re-Ingest-Go-Live live, danach **6-Fragen-Re-Test** auf prod (§38/AGG/CRA Art. 14 grounded zitiert?).
|
||||||
|
|
||||||
|
## 8. Offen (core)
|
||||||
|
- **AGG-Lücke**: Quelle (§ 15 Abs. 4 — Bewerberdaten-Frist) zu spezifizieren + ingestieren.
|
||||||
|
- Soft-Hyphen/OCR-Normalisierung des `chunk_text` — Regelsatz definieren.
|
||||||
Reference in New Issue
Block a user