# Gitea Actions — RAG Legal Corpus Ingestion
#
# Manually triggered workflow that ingests legal texts into Qdrant.
# Trigger: Gitea UI → Actions → "RAG Ingestion" → Run
#
# Phases: gesetze, eu, templates, datenschutz, verbraucherschutz, dach,
#         security, verify, version, all
#         ("download" is also accepted and runs only the download phase)
#
# Prerequisites:
#   - The RAG service and Qdrant must be running on Coolify.
#   - The BreakPilot services must be deployed (ci.yaml deploy-coolify).
#   - The repository/organisation secret QDRANT_API_KEY must be configured.
#     NOTE(review): the key used to be committed in this file in plain text;
#     treat that key as leaked and rotate it.

name: RAG Ingestion

on:
  workflow_dispatch:
    inputs:
      phase:
        description: 'Ingestion Phase (gesetze, eu, templates, datenschutz, verbraucherschutz, dach, security, verify, version, all)'
        required: true
        default: 'verbraucherschutz'

jobs:
  ingest:
    runs-on: docker
    container: docker:27-cli
    steps:
      - name: Setup
        run: |
          apk add --no-cache git curl bash > /dev/null 2>&1

      - name: Checkout
        run: |
          git clone --depth 1 --branch main "${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}.git" .

      - name: Run Ingestion
        env:
          # Pass user input and the secret through the environment instead of
          # expanding them into the script text, so neither can inject shell
          # code and the secret never appears on a command line.
          PHASE: ${{ github.event.inputs.phase }}
          QDRANT_API_KEY: ${{ secrets.QDRANT_API_KEY }}
        run: |
          set -euo pipefail

          # Reject empty or oddly shaped phase names early with a clear message.
          case "${PHASE:-}" in
            '' | *[!a-z0-9_-]*)
              echo "FEHLER: Ungueltige Phase: '${PHASE:-}'"
              exit 1
              ;;
          esac

          if [ -z "${QDRANT_API_KEY:-}" ]; then
            echo "FEHLER: Secret QDRANT_API_KEY ist nicht gesetzt."
            exit 1
          fi

          echo "=== RAG Ingestion: Phase ${PHASE} ==="
          echo ""

          # Show which BreakPilot containers are running (informational only).
          echo "--- BreakPilot Container ---"
          docker ps --filter name=bp- --format "{{.Names}}: {{.Status}}" 2>/dev/null || true
          echo ""

          # Print the first network a container is attached to.
          # The template emits one network name per line; without the line
          # break, a container on several networks would yield one
          # concatenated (and therefore invalid) network name.
          first_network() {
            docker inspect "$1" \
              --format '{{range $k, $v := .NetworkSettings.Networks}}{{println $k}}{{end}}' \
              2>/dev/null | head -n 1
          }

          # Find the network the bp services run in; fall back to the
          # compliance backend when the RAG service container is not found.
          BP_NETWORK=$(first_network bp-core-rag-service) || BP_NETWORK=""
          if [ -z "$BP_NETWORK" ]; then
            BP_NETWORK=$(first_network bp-compliance-backend) || BP_NETWORK=""
          fi

          if [ -z "$BP_NETWORK" ]; then
            echo "FEHLER: Keine BreakPilot-Container gefunden."
            echo "Bitte zuerst deployen (CI/CD Pipeline oder manuell)."
            echo ""
            echo "Verfuegbare Container:"
            docker ps --format " {{.Names}}" 2>/dev/null || true
            echo ""
            echo "Verfuegbare Netzwerke:"
            docker network ls --format " {{.Name}}" 2>/dev/null || true
            exit 1
          fi

          echo "BreakPilot Netzwerk: $BP_NETWORK"
          echo ""

          # Create the ingestion container (do not start it yet), then copy
          # the scripts from this checkout into it with docker cp. This way
          # the run ALWAYS uses the latest scripts from Git, independent of
          # the deploy directory on the host.
          #
          # PHASE and QDRANT_API_KEY are forwarded by name (-e VAR) from this
          # step's environment. The inner script is single-quoted so that
          # nothing is expanded by the outer shell.
          #
          # The work directories are listed explicitly: the container's sh is
          # busybox ash, which does not perform {a,b,c} brace expansion.
          CONTAINER_ID=$(docker create \
            --network "$BP_NETWORK" \
            -e "WORK_DIR=/tmp/rag-ingestion" \
            -e "RAG_URL=http://bp-core-rag-service:8097/api/v1/documents/upload" \
            -e "QDRANT_URL=https://qdrant-dev.breakpilot.ai" \
            -e QDRANT_API_KEY \
            -e "SDK_URL=http://bp-compliance-ai-sdk:8090" \
            -e PHASE \
            alpine:3.19 \
            sh -c '
              apk add --no-cache curl bash coreutils git python3 unzip > /dev/null 2>&1
              mkdir -p /tmp/rag-ingestion/pdfs /tmp/rag-ingestion/repos /tmp/rag-ingestion/texts
              mkdir -p /workspace/scripts
              cp -r /workspace_scripts/* /workspace/scripts/ 2>/dev/null || true
              cd /workspace
              if [ "$PHASE" = "all" ]; then
                bash scripts/ingest-legal-corpus.sh
              elif [ "$PHASE" = "download" ]; then
                bash scripts/ingest-legal-corpus.sh --only download
              else
                echo "=== Running download phase first ==="
                bash scripts/ingest-legal-corpus.sh --only download
                echo ""
                echo "=== Running phase: $PHASE ==="
                bash scripts/ingest-legal-corpus.sh --only "$PHASE"
              fi
            ')

          # Always remove the container, also when a later command fails
          # under `set -e` (e.g. docker cp), so no container is left behind.
          cleanup() {
            docker rm -f "${CONTAINER_ID}" > /dev/null 2>&1 || true
          }
          trap cleanup EXIT

          echo "Container: $CONTAINER_ID"

          # Copy the scripts into a staging directory inside the container;
          # the container's entry script moves them to /workspace/scripts.
          docker cp scripts "${CONTAINER_ID}:/workspace_scripts"
          echo "Scripts kopiert (aus Git-Checkout)"

          # Start the container and stream its output; remember a failure
          # instead of aborting so the summary below is still printed.
          EXITCODE=0
          docker start -a "${CONTAINER_ID}" || EXITCODE=$?

          echo ""
          echo "=== Ingestion abgeschlossen ==="

          # Exit with the container's original exit code.
          exit "${EXITCODE}"