Authenticate Qdrant requests in the legal-corpus ingestion (reviewed patch)

Workflow (.gitea/workflows/rag-ingest.yaml)
  * QDRANT_API_KEY is passed into the ingestion container from the Actions
    secret of the same name. An earlier revision of this patch carried the
    key as a literal; that key must be treated as leaked: rotate it in
    Qdrant and store the new value as the QDRANT_API_KEY secret.

Script (scripts/ingest-legal-corpus.sh)
  * upload_file / collection_count: the optional "api-key" header is built
    as a bash array and expanded with ${arr[@]+"${arr[@]}"} so the key is
    never word-split or glob-expanded, and an empty array stays safe under
    "set -u" on older bash releases.
  * upload_file: the large-file threshold drops to 256000 bytes and the
    comment above it is updated to match.
  * collection_count: always prints a number (0 on any failure), which lets
    the callers compute the chunk delta with plain arithmetic.
  * EGBGB extraction: only norms mentioning Art. 246 or the withdrawal
    annexes are kept; the text of each norm is collected once and reused for
    both the keyword test and the output; output is capped at 100,000
    characters.

NOTE(review): indentation and blank context lines in the hunks below were
reconstructed from the hunk line counts; verify them against the tree before
applying.
NOTE(review): curl still runs with -k (TLS verification disabled) on requests
that now carry the API key; confirm this is acceptable for qdrant-dev.
NOTE(review): because collection_count reports 0 on failure, a failed count
makes the logged delta misleading; confirm that trade-off is intended.

diff --git a/.gitea/workflows/rag-ingest.yaml b/.gitea/workflows/rag-ingest.yaml
index b7c4769..612ff88 100644
--- a/.gitea/workflows/rag-ingest.yaml
+++ b/.gitea/workflows/rag-ingest.yaml
@@ -75,6 +75,7 @@ jobs:
             -e "WORK_DIR=/tmp/rag-ingestion" \
             -e "RAG_URL=http://bp-core-rag-service:8097/api/v1/documents/upload" \
             -e "QDRANT_URL=https://qdrant-dev.breakpilot.ai" \
+            -e "QDRANT_API_KEY=${{ secrets.QDRANT_API_KEY }}" \
             -e "SDK_URL=http://bp-compliance-ai-sdk:8090" \
             alpine:3.19 \
             sh -c "
diff --git a/scripts/ingest-legal-corpus.sh b/scripts/ingest-legal-corpus.sh
index 7a9e26b..2bda2cd 100755
--- a/scripts/ingest-legal-corpus.sh
+++ b/scripts/ingest-legal-corpus.sh
@@ -68,8 +68,10 @@ upload_file() {
     local reg_id
     reg_id=$(echo "$metadata_json" | python3 -c "import sys,json; print(json.load(sys.stdin).get('regulation_id',''))" 2>/dev/null || echo "")
     if [[ -n "$reg_id" && -n "${QDRANT_URL:-}" ]]; then
+        local -a qdrant_auth=()
+        [[ -n "${QDRANT_API_KEY:-}" ]] && qdrant_auth=(-H "api-key: ${QDRANT_API_KEY}")
         local existing
-        existing=$(curl -sk --max-time 5 -X POST "${QDRANT_URL}/collections/${collection}/points/scroll" \
+        existing=$(curl -sk --max-time 5 ${qdrant_auth[@]+"${qdrant_auth[@]}"} -X POST "${QDRANT_URL}/collections/${collection}/points/scroll" \
             -H "Content-Type: application/json" \
             -d "{\"filter\":{\"must\":[{\"key\":\"regulation_id\",\"match\":{\"value\":\"$reg_id\"}}]},\"limit\":1}" \
             2>/dev/null | python3 -c "import sys,json; r=json.load(sys.stdin).get('result',{}); print(len(r.get('points',[])))" 2>/dev/null || echo "0")
@@ -92,7 +94,7 @@ upload_file() {

-    # Use longer timeout for large files (>500KB)
+    # Use longer timeout for large files (>256KB)
     local curl_opts="$CURL_OPTS"
-    if [[ "$filesize" -gt 384000 ]]; then
+    if [[ "$filesize" -gt 256000 ]]; then
         curl_opts="$CURL_OPTS_LARGE"
         log " (large file, using extended timeout)"
     fi
@@ -241,8 +243,17 @@ concat_bundestag_gesetz() {

 collection_count() {
     local col="$1"
-    curl -s "${QDRANT_URL}/collections/${col}" 2>/dev/null \
-        | python3 -c "import sys,json; print(json.load(sys.stdin)['result']['points_count'])" 2>/dev/null || echo "?"
+    local -a qdrant_auth=()
+    [[ -n "${QDRANT_API_KEY:-}" ]] && qdrant_auth=(-H "api-key: ${QDRANT_API_KEY}")
+    local count
+    count=$(curl -sk --max-time 10 ${qdrant_auth[@]+"${qdrant_auth[@]}"} "${QDRANT_URL}/collections/${col}" 2>/dev/null \
+        | python3 -c "import sys,json; print(json.load(sys.stdin)['result']['points_count'])" 2>/dev/null) || count=""
+    # Ensure numeric output (0 on failure)
+    if [[ "$count" =~ ^[0-9]+$ ]]; then
+        echo "$count"
+    else
+        echo "0"
+    fi
 }

 # =============================================================================
@@ -1032,12 +1043,15 @@ root = tree.getroot()
 text_parts = []
 for norm in root.iter():
     if norm.tag.endswith('norm'):
-        # Capture all text — EGBGB is not too large
         parts = [t.strip() for t in norm.itertext() if t.strip()]
-        if any('246' in p or 'Anlage' in p or 'Widerruf' in p or 'Muster' in p for p in parts):
+        full_text = ' '.join(parts)
+        # Only capture Art. 246/246a/246b/246c and Anlage (Muster-Widerrufsbelehrung)
+        if any(kw in full_text for kw in ['Art. 246', 'Artikel 246', '§ 246', 'Anlage 1', 'Anlage 2', 'Widerrufsbelehrung', 'Widerrufsformular']):
             text_parts.extend(parts)
+# Limit output to avoid timeout (max 100,000 characters)
+output = 'EGBGB - Informationspflichten und Muster-Widerrufsbelehrung (Art. 246a + Anlage 1+2)\n\n' + '\n'.join(text_parts)
 with open('$WORK_DIR/pdfs/EGBGB_Widerruf.txt', 'w') as f:
-    f.write('EGBGB - Informationspflichten und Muster-Widerrufsbelehrung\n\n' + '\n'.join(text_parts))
+    f.write(output[:100000])
 " 2>/dev/null
     if [[ -f "$WORK_DIR/pdfs/EGBGB_Widerruf.txt" && $(wc -c < "$WORK_DIR/pdfs/EGBGB_Widerruf.txt") -gt 100 ]]; then
         upload_file "$WORK_DIR/pdfs/EGBGB_Widerruf.txt" "$col" "compliance" "legal_reference" "2025" \
@@ -1051,7 +1065,7 @@ with open('$WORK_DIR/pdfs/EGBGB_Widerruf.txt', 'w') as f:

     local after
     after=$(collection_count "$col")
-    local diff=$(( ${after:-0} - ${before:-0} )) 2>/dev/null || diff="?"
+    local diff=$(( after - before ))
     log "Collection $col: ${before:-?} → ${after:-?} chunks (+${diff})"

     # =========================================================================
@@ -1112,7 +1126,7 @@ with open('$WORK_DIR/pdfs/EGBGB_Widerruf.txt', 'w') as f:
     done

     after=$(collection_count "$col")
-    local diff=$(( ${after:-0} - ${before:-0} )) 2>/dev/null || diff="?"
+    local diff=$(( after - before ))
     log "Collection $col: ${before:-?} → ${after:-?} chunks (+${diff})"

     # =========================================================================
@@ -1154,7 +1168,7 @@ with open('$WORK_DIR/pdfs/EGBGB_Widerruf.txt', 'w') as f:
     fi

     after=$(collection_count "$col")
-    local diff=$(( ${after:-0} - ${before:-0} )) 2>/dev/null || diff="?"
+    local diff=$(( after - before ))
     log "Collection $col: ${before:-?} → ${after:-?} chunks (+${diff})"

     # =========================================================================