diff --git a/scripts/ingest-legal-corpus.sh b/scripts/ingest-legal-corpus.sh index 843c17f..7dc5ec5 100755 --- a/scripts/ingest-legal-corpus.sh +++ b/scripts/ingest-legal-corpus.sh @@ -19,6 +19,7 @@ QDRANT_URL="${QDRANT_URL:-http://localhost:6333}" SDK_URL="${SDK_URL:-https://localhost:8093}" DB_URL="${DB_URL:-postgresql://localhost:5432/breakpilot?search_path=compliance,core,public}" CURL_OPTS="-sk --connect-timeout 10 --max-time 300" +CURL_OPTS_LARGE="-sk --connect-timeout 10 --max-time 900" # Counters UPLOADED=0 @@ -60,7 +61,7 @@ upload_file() { if [[ ! -f "$file" ]]; then warn "File not found: $file" FAILED=$((FAILED + 1)) - return 1 + return 0 # Don't abort script fi local filesize @@ -68,13 +69,20 @@ upload_file() { if [[ "$filesize" -lt 100 ]]; then warn "File too small (${filesize}B), skipping: $label" SKIPPED=$((SKIPPED + 1)) - return 1 + return 0 # Don't abort script fi log "Uploading: $label → $collection ($(( filesize / 1024 ))KB)" + # Use longer timeout for large files (>500KB) + local curl_opts="$CURL_OPTS" + if [[ "$filesize" -gt 512000 ]]; then + curl_opts="$CURL_OPTS_LARGE" + log " (large file, using extended timeout)" + fi + local response - response=$(curl $CURL_OPTS -X POST "$RAG_URL" \ + response=$(curl $curl_opts -X POST "$RAG_URL" \ -F "file=@${file}" \ -F "collection=${collection}" \ -F "data_type=${data_type}" \ @@ -98,9 +106,9 @@ upload_file() { UPLOADED=$((UPLOADED + 1)) else fail "Upload failed: $label" - fail "Response: $response" + fail "Response: ${response:0:200}" FAILED=$((FAILED + 1)) - return 1 + return 0 # Don't abort script on individual upload failure fi }