fix(rag): Arithmetic error, dedup auth, EGBGB timeout
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 41s
CI/CD / test-python-backend-compliance (push) Successful in 41s
CI/CD / test-python-document-crawler (push) Successful in 27s
CI/CD / test-python-dsms-gateway (push) Successful in 21s
CI/CD / deploy-hetzner (push) Successful in 19s
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 41s
CI/CD / test-python-backend-compliance (push) Successful in 41s
CI/CD / test-python-document-crawler (push) Successful in 27s
CI/CD / test-python-dsms-gateway (push) Successful in 21s
CI/CD / deploy-hetzner (push) Successful in 19s
- collection_count() returns 0 (not "?") on failure — fixes arithmetic error
- Pass QDRANT_API_KEY to ingestion container for dedup checks
- Include api-key header in collection_count() and dedup scroll queries
- Lower large-file threshold to 256KB (EGBGB 310KB was timing out)
- More targeted EGBGB XML extraction (Art. 246a + Anlage only)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -75,6 +75,7 @@ jobs:
|
||||
-e "WORK_DIR=/tmp/rag-ingestion" \
|
||||
-e "RAG_URL=http://bp-core-rag-service:8097/api/v1/documents/upload" \
|
||||
-e "QDRANT_URL=https://qdrant-dev.breakpilot.ai" \
|
||||
-e "QDRANT_API_KEY=z9cKbT74vl1aKPD1QGIlKWfET47VH93u" \
|
||||
-e "SDK_URL=http://bp-compliance-ai-sdk:8090" \
|
||||
alpine:3.19 \
|
||||
sh -c "
|
||||
|
||||
@@ -68,8 +68,10 @@ upload_file() {
|
||||
local reg_id
|
||||
reg_id=$(echo "$metadata_json" | python3 -c "import sys,json; print(json.load(sys.stdin).get('regulation_id',''))" 2>/dev/null || echo "")
|
||||
if [[ -n "$reg_id" && -n "${QDRANT_URL:-}" ]]; then
|
||||
local qdrant_auth=""
|
||||
[[ -n "${QDRANT_API_KEY:-}" ]] && qdrant_auth="-H api-key:${QDRANT_API_KEY}"
|
||||
local existing
|
||||
existing=$(curl -sk --max-time 5 -X POST "${QDRANT_URL}/collections/${collection}/points/scroll" \
|
||||
existing=$(curl -sk --max-time 5 $qdrant_auth -X POST "${QDRANT_URL}/collections/${collection}/points/scroll" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{\"filter\":{\"must\":[{\"key\":\"regulation_id\",\"match\":{\"value\":\"$reg_id\"}}]},\"limit\":1}" \
|
||||
2>/dev/null | python3 -c "import sys,json; r=json.load(sys.stdin).get('result',{}); print(len(r.get('points',[])))" 2>/dev/null || echo "0")
|
||||
@@ -92,7 +94,7 @@ upload_file() {
|
||||
|
||||
# Use longer timeout for large files (>256KB)
|
||||
local curl_opts="$CURL_OPTS"
|
||||
if [[ "$filesize" -gt 384000 ]]; then
|
||||
if [[ "$filesize" -gt 256000 ]]; then
|
||||
curl_opts="$CURL_OPTS_LARGE"
|
||||
log " (large file, using extended timeout)"
|
||||
fi
|
||||
@@ -241,8 +243,17 @@ concat_bundestag_gesetz() {
|
||||
|
||||
collection_count() {
|
||||
local col="$1"
|
||||
curl -s "${QDRANT_URL}/collections/${col}" 2>/dev/null \
|
||||
| python3 -c "import sys,json; print(json.load(sys.stdin)['result']['points_count'])" 2>/dev/null || echo "?"
|
||||
local qdrant_auth=""
|
||||
[[ -n "${QDRANT_API_KEY:-}" ]] && qdrant_auth="-H api-key:${QDRANT_API_KEY}"
|
||||
local count
|
||||
count=$(curl -sk --max-time 10 $qdrant_auth "${QDRANT_URL}/collections/${col}" 2>/dev/null \
|
||||
| python3 -c "import sys,json; print(json.load(sys.stdin)['result']['points_count'])" 2>/dev/null) || count=""
|
||||
# Ensure numeric output (0 on failure)
|
||||
if [[ "$count" =~ ^[0-9]+$ ]]; then
|
||||
echo "$count"
|
||||
else
|
||||
echo "0"
|
||||
fi
|
||||
}
|
||||
|
||||
# =============================================================================
|
||||
@@ -1032,12 +1043,15 @@ root = tree.getroot()
|
||||
text_parts = []
|
||||
for norm in root.iter():
|
||||
if norm.tag.endswith('norm'):
|
||||
# Capture all text — EGBGB is not too large
|
||||
parts = [t.strip() for t in norm.itertext() if t.strip()]
|
||||
if any('246' in p or 'Anlage' in p or 'Widerruf' in p or 'Muster' in p for p in parts):
|
||||
full_text = ' '.join(t.strip() for t in norm.itertext() if t.strip())
|
||||
# Only capture Art. 246/246a/246b/246c and Anlage (Muster-Widerrufsbelehrung)
|
||||
if any(kw in full_text for kw in ['Art. 246', 'Artikel 246', '§ 246', 'Anlage 1', 'Anlage 2', 'Widerrufsbelehrung', 'Widerrufsformular']):
|
||||
parts = [t.strip() for t in norm.itertext() if t.strip()]
|
||||
text_parts.extend(parts)
|
||||
# Limit output to avoid timeout (max 100KB)
|
||||
output = 'EGBGB - Informationspflichten und Muster-Widerrufsbelehrung (Art. 246a + Anlage 1+2)\n\n' + '\n'.join(text_parts)
|
||||
with open('$WORK_DIR/pdfs/EGBGB_Widerruf.txt', 'w') as f:
|
||||
f.write('EGBGB - Informationspflichten und Muster-Widerrufsbelehrung\n\n' + '\n'.join(text_parts))
|
||||
f.write(output[:100000])
|
||||
" 2>/dev/null
|
||||
if [[ -f "$WORK_DIR/pdfs/EGBGB_Widerruf.txt" && $(wc -c < "$WORK_DIR/pdfs/EGBGB_Widerruf.txt") -gt 100 ]]; then
|
||||
upload_file "$WORK_DIR/pdfs/EGBGB_Widerruf.txt" "$col" "compliance" "legal_reference" "2025" \
|
||||
@@ -1051,7 +1065,7 @@ with open('$WORK_DIR/pdfs/EGBGB_Widerruf.txt', 'w') as f:
|
||||
|
||||
local after
|
||||
after=$(collection_count "$col")
|
||||
local diff=$(( ${after:-0} - ${before:-0} )) 2>/dev/null || diff="?"
|
||||
local diff=$(( after - before ))
|
||||
log "Collection $col: ${before:-?} → ${after:-?} chunks (+${diff})"
|
||||
|
||||
# =========================================================================
|
||||
@@ -1112,7 +1126,7 @@ with open('$WORK_DIR/pdfs/EGBGB_Widerruf.txt', 'w') as f:
|
||||
done
|
||||
|
||||
after=$(collection_count "$col")
|
||||
local diff=$(( ${after:-0} - ${before:-0} )) 2>/dev/null || diff="?"
|
||||
local diff=$(( after - before ))
|
||||
log "Collection $col: ${before:-?} → ${after:-?} chunks (+${diff})"
|
||||
|
||||
# =========================================================================
|
||||
@@ -1154,7 +1168,7 @@ with open('$WORK_DIR/pdfs/EGBGB_Widerruf.txt', 'w') as f:
|
||||
fi
|
||||
|
||||
after=$(collection_count "$col")
|
||||
local diff=$(( ${after:-0} - ${before:-0} )) 2>/dev/null || diff="?"
|
||||
local diff=$(( after - before ))
|
||||
log "Collection $col: ${before:-?} → ${after:-?} chunks (+${diff})"
|
||||
|
||||
# =========================================================================
|
||||
|
||||
Reference in New Issue
Block a user