fix(rag): Arithmetic error, dedup auth, EGBGB timeout
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 41s
CI/CD / test-python-backend-compliance (push) Successful in 41s
CI/CD / test-python-document-crawler (push) Successful in 27s
CI/CD / test-python-dsms-gateway (push) Successful in 21s
CI/CD / deploy-hetzner (push) Successful in 19s

- collection_count() returns 0 (not ?) on failure — fixes arithmetic error
- Pass QDRANT_API_KEY to ingestion container for dedup checks
- Include api-key header in collection_count() and dedup scroll queries
- Lower large-file threshold to 256KB (EGBGB 310KB was timing out)
- More targeted EGBGB XML extraction (Art. 246a + Anlage only)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-12 12:05:07 +01:00
parent cf60c39658
commit 57f390190d
2 changed files with 26 additions and 11 deletions

View File

@@ -75,6 +75,7 @@ jobs:
-e "WORK_DIR=/tmp/rag-ingestion" \ -e "WORK_DIR=/tmp/rag-ingestion" \
-e "RAG_URL=http://bp-core-rag-service:8097/api/v1/documents/upload" \ -e "RAG_URL=http://bp-core-rag-service:8097/api/v1/documents/upload" \
-e "QDRANT_URL=https://qdrant-dev.breakpilot.ai" \ -e "QDRANT_URL=https://qdrant-dev.breakpilot.ai" \
-e "QDRANT_API_KEY=z9cKbT74vl1aKPD1QGIlKWfET47VH93u" \
-e "SDK_URL=http://bp-compliance-ai-sdk:8090" \ -e "SDK_URL=http://bp-compliance-ai-sdk:8090" \
alpine:3.19 \ alpine:3.19 \
sh -c " sh -c "

View File

@@ -68,8 +68,10 @@ upload_file() {
local reg_id local reg_id
reg_id=$(echo "$metadata_json" | python3 -c "import sys,json; print(json.load(sys.stdin).get('regulation_id',''))" 2>/dev/null || echo "") reg_id=$(echo "$metadata_json" | python3 -c "import sys,json; print(json.load(sys.stdin).get('regulation_id',''))" 2>/dev/null || echo "")
if [[ -n "$reg_id" && -n "${QDRANT_URL:-}" ]]; then if [[ -n "$reg_id" && -n "${QDRANT_URL:-}" ]]; then
local qdrant_auth=""
[[ -n "${QDRANT_API_KEY:-}" ]] && qdrant_auth="-H api-key:${QDRANT_API_KEY}"
local existing local existing
existing=$(curl -sk --max-time 5 -X POST "${QDRANT_URL}/collections/${collection}/points/scroll" \ existing=$(curl -sk --max-time 5 $qdrant_auth -X POST "${QDRANT_URL}/collections/${collection}/points/scroll" \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
-d "{\"filter\":{\"must\":[{\"key\":\"regulation_id\",\"match\":{\"value\":\"$reg_id\"}}]},\"limit\":1}" \ -d "{\"filter\":{\"must\":[{\"key\":\"regulation_id\",\"match\":{\"value\":\"$reg_id\"}}]},\"limit\":1}" \
2>/dev/null | python3 -c "import sys,json; r=json.load(sys.stdin).get('result',{}); print(len(r.get('points',[])))" 2>/dev/null || echo "0") 2>/dev/null | python3 -c "import sys,json; r=json.load(sys.stdin).get('result',{}); print(len(r.get('points',[])))" 2>/dev/null || echo "0")
@@ -92,7 +94,7 @@ upload_file() {
# Use longer timeout for large files (>500KB) # Use longer timeout for large files (>256KB)
local curl_opts="$CURL_OPTS" local curl_opts="$CURL_OPTS"
if [[ "$filesize" -gt 384000 ]]; then if [[ "$filesize" -gt 256000 ]]; then
curl_opts="$CURL_OPTS_LARGE" curl_opts="$CURL_OPTS_LARGE"
log " (large file, using extended timeout)" log " (large file, using extended timeout)"
fi fi
@@ -241,8 +243,17 @@ concat_bundestag_gesetz() {
collection_count() { collection_count() {
local col="$1" local col="$1"
curl -s "${QDRANT_URL}/collections/${col}" 2>/dev/null \ local qdrant_auth=""
| python3 -c "import sys,json; print(json.load(sys.stdin)['result']['points_count'])" 2>/dev/null || echo "?" [[ -n "${QDRANT_API_KEY:-}" ]] && qdrant_auth="-H api-key:${QDRANT_API_KEY}"
local count
count=$(curl -sk --max-time 10 $qdrant_auth "${QDRANT_URL}/collections/${col}" 2>/dev/null \
| python3 -c "import sys,json; print(json.load(sys.stdin)['result']['points_count'])" 2>/dev/null) || count=""
# Ensure numeric output (0 on failure)
if [[ "$count" =~ ^[0-9]+$ ]]; then
echo "$count"
else
echo "0"
fi
} }
# ============================================================================= # =============================================================================
@@ -1032,12 +1043,15 @@ root = tree.getroot()
text_parts = [] text_parts = []
for norm in root.iter(): for norm in root.iter():
if norm.tag.endswith('norm'): if norm.tag.endswith('norm'):
# Capture all text — EGBGB is not too large full_text = ' '.join(t.strip() for t in norm.itertext() if t.strip())
parts = [t.strip() for t in norm.itertext() if t.strip()] # Only capture Art. 246/246a/246b/246c and Anlage (Muster-Widerrufsbelehrung)
if any('246' in p or 'Anlage' in p or 'Widerruf' in p or 'Muster' in p for p in parts): if any(kw in full_text for kw in ['Art. 246', 'Artikel 246', '§ 246', 'Anlage 1', 'Anlage 2', 'Widerrufsbelehrung', 'Widerrufsformular']):
parts = [t.strip() for t in norm.itertext() if t.strip()]
text_parts.extend(parts) text_parts.extend(parts)
# Limit output to avoid timeout (max 100KB)
output = 'EGBGB - Informationspflichten und Muster-Widerrufsbelehrung (Art. 246a + Anlage 1+2)\n\n' + '\n'.join(text_parts)
with open('$WORK_DIR/pdfs/EGBGB_Widerruf.txt', 'w') as f: with open('$WORK_DIR/pdfs/EGBGB_Widerruf.txt', 'w') as f:
f.write('EGBGB - Informationspflichten und Muster-Widerrufsbelehrung\n\n' + '\n'.join(text_parts)) f.write(output[:100000])
" 2>/dev/null " 2>/dev/null
if [[ -f "$WORK_DIR/pdfs/EGBGB_Widerruf.txt" && $(wc -c < "$WORK_DIR/pdfs/EGBGB_Widerruf.txt") -gt 100 ]]; then if [[ -f "$WORK_DIR/pdfs/EGBGB_Widerruf.txt" && $(wc -c < "$WORK_DIR/pdfs/EGBGB_Widerruf.txt") -gt 100 ]]; then
upload_file "$WORK_DIR/pdfs/EGBGB_Widerruf.txt" "$col" "compliance" "legal_reference" "2025" \ upload_file "$WORK_DIR/pdfs/EGBGB_Widerruf.txt" "$col" "compliance" "legal_reference" "2025" \
@@ -1051,7 +1065,7 @@ with open('$WORK_DIR/pdfs/EGBGB_Widerruf.txt', 'w') as f:
local after local after
after=$(collection_count "$col") after=$(collection_count "$col")
local diff=$(( ${after:-0} - ${before:-0} )) 2>/dev/null || diff="?" local diff=$(( after - before ))
log "Collection $col: ${before:-?} → ${after:-?} chunks (+${diff})" log "Collection $col: ${before:-?} → ${after:-?} chunks (+${diff})"
# ========================================================================= # =========================================================================
@@ -1112,7 +1126,7 @@ with open('$WORK_DIR/pdfs/EGBGB_Widerruf.txt', 'w') as f:
done done
after=$(collection_count "$col") after=$(collection_count "$col")
local diff=$(( ${after:-0} - ${before:-0} )) 2>/dev/null || diff="?" local diff=$(( after - before ))
log "Collection $col: ${before:-?} → ${after:-?} chunks (+${diff})" log "Collection $col: ${before:-?} → ${after:-?} chunks (+${diff})"
# ========================================================================= # =========================================================================
@@ -1154,7 +1168,7 @@ with open('$WORK_DIR/pdfs/EGBGB_Widerruf.txt', 'w') as f:
fi fi
after=$(collection_count "$col") after=$(collection_count "$col")
local diff=$(( ${after:-0} - ${before:-0} )) 2>/dev/null || diff="?" local diff=$(( after - before ))
log "Collection $col: ${before:-?} → ${after:-?} chunks (+${diff})" log "Collection $col: ${before:-?} → ${after:-?} chunks (+${diff})"
# ========================================================================= # =========================================================================