017c9b3c12
ai-sdk (legal_rag_client/scroll/types) liest die gepinnten Spec-Felder
article_label/regulation_code/article/paragraph/sub/citation_style/is_recital
mit Fallback auf alt-ingestierte Chunks (regulation_id, section); neuer getBool-Helfer.
Advisor + Drafting-Engine bilden die Quellenzeile primaer aus article_label
("BDSG § 38 Abs. 1"), sonst aus den strukturierten Feldern. 17 Tests gruen, tsc sauber.
Vertrag: docs-src/development/rag_reingest_spec.md (§2/§7). Deploy an den Re-Ingest
gekoppelt — neue Felder sind bis dahin leer (graceful Fallback).
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
243 lines
6.5 KiB
Go
243 lines
6.5 KiB
Go
package ucca
|
|
|
|
import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"strconv"
)
|
|
|
|
// ScrollChunks iterates over all chunks in a Qdrant collection using the scroll API.
|
|
// Pass an empty offset to start from the beginning. Returns chunks, next offset ID, and error.
|
|
func (c *LegalRAGClient) ScrollChunks(ctx context.Context, collection string, offset string, limit int) ([]ScrollChunkResult, string, error) {
|
|
scrollReq := qdrantScrollRequest{
|
|
Limit: limit,
|
|
WithPayload: true,
|
|
WithVectors: false,
|
|
}
|
|
if offset != "" {
|
|
var offsetInt uint64
|
|
if _, err := fmt.Sscanf(offset, "%d", &offsetInt); err == nil {
|
|
scrollReq.Offset = offsetInt
|
|
} else {
|
|
scrollReq.Offset = offset
|
|
}
|
|
}
|
|
|
|
jsonBody, err := json.Marshal(scrollReq)
|
|
if err != nil {
|
|
return nil, "", fmt.Errorf("failed to marshal scroll request: %w", err)
|
|
}
|
|
|
|
url := fmt.Sprintf("%s/collections/%s/points/scroll", c.qdrantURL, collection)
|
|
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(jsonBody))
|
|
if err != nil {
|
|
return nil, "", fmt.Errorf("failed to create scroll request: %w", err)
|
|
}
|
|
req.Header.Set("Content-Type", "application/json")
|
|
if c.qdrantAPIKey != "" {
|
|
req.Header.Set("api-key", c.qdrantAPIKey)
|
|
}
|
|
|
|
resp, err := c.httpClient.Do(req)
|
|
if err != nil {
|
|
return nil, "", fmt.Errorf("scroll request failed: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
body, _ := io.ReadAll(resp.Body)
|
|
return nil, "", fmt.Errorf("qdrant returned %d: %s", resp.StatusCode, string(body))
|
|
}
|
|
|
|
var scrollResp qdrantScrollResponse
|
|
if err := json.NewDecoder(resp.Body).Decode(&scrollResp); err != nil {
|
|
return nil, "", fmt.Errorf("failed to decode scroll response: %w", err)
|
|
}
|
|
|
|
chunks := make([]ScrollChunkResult, len(scrollResp.Result.Points))
|
|
for i, pt := range scrollResp.Result.Points {
|
|
pointID := ""
|
|
if pt.ID != nil {
|
|
pointID = fmt.Sprintf("%v", pt.ID)
|
|
}
|
|
|
|
chunks[i] = ScrollChunkResult{
|
|
ID: pointID,
|
|
Text: getString(pt.Payload, "text"),
|
|
RegulationCode: getString(pt.Payload, "regulation_code"),
|
|
RegulationName: getString(pt.Payload, "regulation_name"),
|
|
RegulationShort: getString(pt.Payload, "regulation_short"),
|
|
Category: getString(pt.Payload, "category"),
|
|
Article: getString(pt.Payload, "article"),
|
|
Paragraph: getString(pt.Payload, "paragraph"),
|
|
SourceURL: getString(pt.Payload, "source_url"),
|
|
}
|
|
|
|
if chunks[i].Text == "" {
|
|
chunks[i].Text = getString(pt.Payload, "chunk_text")
|
|
}
|
|
if chunks[i].RegulationCode == "" {
|
|
chunks[i].RegulationCode = getString(pt.Payload, "regulation_id")
|
|
}
|
|
if chunks[i].RegulationName == "" {
|
|
chunks[i].RegulationName = getString(pt.Payload, "regulation_name_de")
|
|
}
|
|
if chunks[i].SourceURL == "" {
|
|
chunks[i].SourceURL = getString(pt.Payload, "source")
|
|
}
|
|
}
|
|
|
|
nextOffset := ""
|
|
if scrollResp.Result.NextPageOffset != nil {
|
|
switch v := scrollResp.Result.NextPageOffset.(type) {
|
|
case float64:
|
|
nextOffset = fmt.Sprintf("%.0f", v)
|
|
case string:
|
|
nextOffset = v
|
|
default:
|
|
nextOffset = fmt.Sprintf("%v", v)
|
|
}
|
|
}
|
|
|
|
return chunks, nextOffset, nil
|
|
}
|
|
|
|
// ScrollDocumentIndex scrolls through all chunks in a collection using minimal
|
|
// payload (no text/vectors) and returns a deduplicated list of documents.
|
|
func (c *LegalRAGClient) ScrollDocumentIndex(ctx context.Context, collection string) ([]CEDocumentInfo, error) {
|
|
includeFields := []string{"regulation_id", "regulation_name_de", "regulation_name_en", "category", "source", "source_org"}
|
|
|
|
// regulation_id → aggregated info
|
|
docMap := make(map[string]*CEDocumentInfo)
|
|
var offset interface{}
|
|
batchLimit := 500
|
|
|
|
for {
|
|
reqBody := map[string]interface{}{
|
|
"limit": batchLimit,
|
|
"with_payload": map[string]interface{}{"include": includeFields},
|
|
"with_vectors": false,
|
|
}
|
|
if offset != nil {
|
|
reqBody["offset"] = offset
|
|
}
|
|
|
|
jsonBody, err := json.Marshal(reqBody)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to marshal scroll request: %w", err)
|
|
}
|
|
|
|
url := fmt.Sprintf("%s/collections/%s/points/scroll", c.qdrantURL, collection)
|
|
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(jsonBody))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create scroll request: %w", err)
|
|
}
|
|
req.Header.Set("Content-Type", "application/json")
|
|
if c.qdrantAPIKey != "" {
|
|
req.Header.Set("api-key", c.qdrantAPIKey)
|
|
}
|
|
|
|
resp, err := c.httpClient.Do(req)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("scroll request failed: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
body, _ := io.ReadAll(resp.Body)
|
|
return nil, fmt.Errorf("qdrant returned %d: %s", resp.StatusCode, string(body))
|
|
}
|
|
|
|
var scrollResp qdrantScrollResponse
|
|
if err := json.NewDecoder(resp.Body).Decode(&scrollResp); err != nil {
|
|
return nil, fmt.Errorf("failed to decode scroll response: %w", err)
|
|
}
|
|
|
|
for _, pt := range scrollResp.Result.Points {
|
|
regID := getString(pt.Payload, "regulation_id")
|
|
if regID == "" {
|
|
continue
|
|
}
|
|
if existing, ok := docMap[regID]; ok {
|
|
existing.ChunkCount++
|
|
continue
|
|
}
|
|
docMap[regID] = &CEDocumentInfo{
|
|
RegulationID: regID,
|
|
NameDE: getString(pt.Payload, "regulation_name_de"),
|
|
NameEN: getString(pt.Payload, "regulation_name_en"),
|
|
Category: getString(pt.Payload, "category"),
|
|
SourceURL: getString(pt.Payload, "source"),
|
|
SourceOrg: getString(pt.Payload, "source_org"),
|
|
ChunkCount: 1,
|
|
}
|
|
}
|
|
|
|
if scrollResp.Result.NextPageOffset == nil {
|
|
break
|
|
}
|
|
offset = scrollResp.Result.NextPageOffset
|
|
}
|
|
|
|
docs := make([]CEDocumentInfo, 0, len(docMap))
|
|
for _, d := range docMap {
|
|
docs = append(docs, *d)
|
|
}
|
|
return docs, nil
|
|
}
|
|
|
|
// Helper functions
|
|
|
|
// getBool returns the boolean stored under key in m. It yields false when the
// key is absent or the stored value is not a bool.
func getBool(m map[string]interface{}, key string) bool {
	raw, present := m[key]
	if !present {
		return false
	}
	flag, isBool := raw.(bool)
	return isBool && flag
}
|
|
|
|
// getString returns the string stored under key in m. It yields "" when the
// key is absent or the stored value is not a string.
func getString(m map[string]interface{}, key string) string {
	text, isString := m[key].(string)
	if !isString {
		return ""
	}
	return text
}
|
|
|
|
// getIntSlice returns the integers stored under key in m.
//
// The value must be a []interface{}; only its float64 elements are kept, each
// converted with int(), and all other elements are skipped. The result is nil
// when the key is absent or the value is not a []interface{}, and an empty
// non-nil slice when the list holds no float64 elements.
func getIntSlice(m map[string]interface{}, key string) []int {
	items, isList := m[key].([]interface{})
	if !isList {
		return nil
	}

	ints := make([]int, 0, len(items))
	for idx := range items {
		num, isNumber := items[idx].(float64)
		if !isNumber {
			continue
		}
		ints = append(ints, int(num))
	}
	return ints
}
|
|
|
|
// contains reports whether item is equal to at least one element of slice.
// A nil or empty slice never contains anything.
func contains(slice []string, item string) bool {
	for idx := 0; idx < len(slice); idx++ {
		if slice[idx] == item {
			return true
		}
	}
	return false
}
|
|
|
|
// truncateText shortens text to at most maxLen bytes and appends "..." when
// something was cut off. Text that already fits is returned unchanged.
//
// The cut is moved back to the nearest rune boundary at or before maxLen, so a
// multi-byte UTF-8 character is never split in half; the kept prefix can
// therefore be a few bytes shorter than maxLen. A maxLen of zero or less keeps
// nothing of a non-fitting text and returns just "...".
func truncateText(text string, maxLen int) string {
	if len(text) <= maxLen {
		return text
	}

	// Ranging over a string yields the byte index at which each rune starts,
	// so the last index that does not exceed maxLen is a safe place to cut.
	cut := 0
	for i := range text {
		if i > maxLen {
			break
		}
		cut = i
	}
	return text[:cut] + "..."
}
|