feat: add RAG corpus versioning and source policy backend
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-ai-compliance (push) Successful in 34s
CI / test-python-backend-compliance (push) Successful in 32s
CI / test-python-document-crawler (push) Successful in 23s
CI / test-python-dsms-gateway (push) Successful in 18s

Part 1 — RAG Corpus Versioning:
- New DB table compliance_corpus_versions (migration 017)
- Go CorpusVersionStore with CRUD operations
- Assessment struct extended with corpus_version_id
- API endpoints: GET /rag/corpus-status, /rag/corpus-versions/:collection
- RAG routes (search, regulations) now registered in main.go
- Ingestion script registers corpus versions after each run
- Frontend staleness badge in SDK sidebar

Part 3 — Source Policy Backend:
- New FastAPI router with CRUD for allowed sources, PII rules,
  operations matrix, audit trail, stats, and compliance report
- SQLAlchemy models for all source policy tables (migration 001)
- Frontend API base corrected from edu-search:8088/8089 to
  backend-compliance:8002/api

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-02 07:58:08 +01:00
parent 187dbf1b77
commit a228b3b528
15 changed files with 2020 additions and 11 deletions

971
scripts/ingest-legal-corpus.sh Executable file
View File

@@ -0,0 +1,971 @@
#!/usr/bin/env bash
# =============================================================================
# BreakPilot Compliance — RAG Legal Corpus Ingestion
#
# Downloads 23 freely licensed legal sources and ingests them into Qdrant
# via the Core RAG API (port 8097).
#
# Run on the Mac Mini:
#   ~/rag-ingestion/ingest-legal-corpus.sh [--skip-download] [--only PHASE]
#
# Phases: download, gesetze, eu, templates, datenschutz, verify, version
# =============================================================================
set -euo pipefail

# --- Configuration -----------------------------------------------------------
# All endpoints are env-overridable so the script also works against
# non-default deployments (previously RAG_URL/QDRANT_URL were hard-coded).
WORK_DIR="${WORK_DIR:-$HOME/rag-ingestion}"
RAG_URL="${RAG_URL:-https://localhost:8097/api/v1/documents/upload}"
QDRANT_URL="${QDRANT_URL:-http://localhost:6333}"
SDK_URL="${SDK_URL:-https://localhost:8093}"
DB_URL="${DB_URL:-postgresql://localhost:5432/breakpilot?search_path=compliance,core,public}"
# Deliberately a flat string: call sites expand it UNQUOTED so each option
# becomes its own curl argument. Never put values containing spaces in here.
CURL_OPTS="-sk --connect-timeout 10 --max-time 300"

# Counters (mutated by upload_file, reported in the final summary)
UPLOADED=0
FAILED=0
SKIPPED=0

# --- CLI Args ----------------------------------------------------------------
SKIP_DOWNLOAD=false
ONLY_PHASE=""
while [[ $# -gt 0 ]]; do
  case $1 in
    --skip-download) SKIP_DOWNLOAD=true; shift ;;
    # ${2:?…} aborts with a clear message if --only is given without a value
    # (a bare "$2" would otherwise die with an opaque set -u error).
    --only) ONLY_PHASE="${2:?--only requires a phase argument}"; shift 2 ;;
    -h|--help)
      echo "Usage: $0 [--skip-download] [--only PHASE]"
      echo "Phases: download, gesetze, eu, templates, datenschutz, verify, version"
      exit 0
      ;;
    *) echo "Unknown option: $1"; exit 1 ;;
  esac
done
# --- Helpers -----------------------------------------------------------------
# Timestamped logging. log/ok go to stdout; warn/fail go to stderr.
_ts()  { date '+%H:%M:%S'; }
log()  { printf '[%s] %s\n'   "$(_ts)" "$*"; }
ok()   { printf '[%s] ✓ %s\n' "$(_ts)" "$*"; }
warn() { printf '[%s] ⚠ %s\n' "$(_ts)" "$*" >&2; }
fail() { printf '[%s] ✗ %s\n' "$(_ts)" "$*" >&2; }
#######################################
# Upload a single file to the RAG ingestion API.
# Globals:   RAG_URL, CURL_OPTS (read); UPLOADED, FAILED, SKIPPED (written)
# Arguments: $1 file path, $2 collection, $3 data_type, $4 use_case,
#            $5 year, $6 metadata JSON string,
#            $7 display label (optional; defaults to the file's basename)
# Returns:   0 on acknowledged upload; 1 if the file is missing/too small
#            or the API response contains no chunk/vector count
#######################################
upload_file() {
  local file="$1"
  local collection="$2"
  local data_type="$3"
  local use_case="$4"
  local year="$5"
  local metadata_json="$6"
  local label="${7:-$(basename "$file")}"

  if [[ ! -f "$file" ]]; then
    warn "File not found: $file"
    FAILED=$((FAILED + 1))
    return 1
  fi

  # Portable size lookup: BSD stat (macOS) first, GNU stat second.
  local filesize
  filesize=$(stat -f%z "$file" 2>/dev/null || stat -c%s "$file" 2>/dev/null || echo 0)
  # Anything under 100 bytes is almost certainly a failed download.
  if [[ "$filesize" -lt 100 ]]; then
    warn "File too small (${filesize}B), skipping: $label"
    SKIPPED=$((SKIPPED + 1))
    return 1
  fi

  log "Uploading: $label → $collection ($(( filesize / 1024 ))KB)"
  local response
  # shellcheck disable=SC2086 — CURL_OPTS is intentionally word-split
  response=$(curl $CURL_OPTS -X POST "$RAG_URL" \
    -F "file=@${file}" \
    -F "collection=${collection}" \
    -F "data_type=${data_type}" \
    -F "use_case=${use_case}" \
    -F "year=${year}" \
    -F "chunk_strategy=recursive" \
    -F "chunk_size=512" \
    -F "chunk_overlap=50" \
    -F "metadata_json=${metadata_json}" \
    2>/dev/null) || true

  # The API reports either chunks_count (document pipeline) or
  # vectors_indexed (direct indexing); accept both shapes.
  if echo "$response" | grep -q '"chunks_count"'; then
    local chunks
    chunks=$(echo "$response" | python3 -c "import sys,json; print(json.load(sys.stdin).get('chunks_count',0))" 2>/dev/null || echo "?")
    ok "$label → $chunks chunks"
    UPLOADED=$((UPLOADED + 1))
  elif echo "$response" | grep -q '"vectors_indexed"'; then
    local vectors
    vectors=$(echo "$response" | python3 -c "import sys,json; print(json.load(sys.stdin).get('vectors_indexed',0))" 2>/dev/null || echo "?")
    ok "$label → $vectors vectors"
    UPLOADED=$((UPLOADED + 1))
  else
    fail "Upload failed: $label"
    fail "Response: $response"
    FAILED=$((FAILED + 1))
    return 1
  fi
}
# Shallow-clone a Git repository unless the target directory already exists.
# Arguments: $1 repository URL, $2 target directory
# Returns:   0 if already present or cloned successfully, 1 on clone failure
clone_repo() {
  local src_url="$1"
  local dest_dir="$2"
  if [[ -d "$dest_dir" ]]; then
    log "Repo exists: $dest_dir (skipping clone)"
  else
    log "Cloning: $src_url"
    if ! git clone --depth 1 "$src_url" "$dest_dir" 2>/dev/null; then
      warn "Clone failed: $src_url"
      return 1
    fi
  fi
}
# Download a file over HTTP(S) unless the target already exists.
# Globals:   CURL_OPTS (read)
# Arguments: $1 URL, $2 target path
# Returns:   0 if already present or downloaded, 1 on download failure
download_pdf() {
  local src_url="$1"
  local dest="$2"
  if [[ -f "$dest" ]]; then
    log "PDF exists: $(basename "$dest") (skipping)"
    return 0
  fi
  log "Downloading: $(basename "$dest")"
  # shellcheck disable=SC2086 — CURL_OPTS is intentionally word-split
  if ! curl $CURL_OPTS -L "$src_url" -o "$dest" 2>/dev/null; then
    warn "Download failed: $src_url"
    return 1
  fi
}
# Extract plain text from a gesetze-im-internet.de HTML page.
# Arguments: $1 URL, $2 output text file, $3 human-readable label
# Returns:   0 on success or if the output already exists, 1 on failure.
# NOTE(review): the Python program below is passed as a shell string; its
# indentation was reconstructed — confirm against the deployed copy.
extract_gesetz_html() {
  local url="$1"
  local output="$2"
  local label="$3"
  if [[ -f "$output" ]]; then
    log "Text exists: $(basename "$output") (skipping)"
    return 0
  fi
  log "Extracting: $label from gesetze-im-internet.de"
  # Fetch the page and strip markup with a minimal stdlib HTML parser:
  # script/style/nav/header/footer content is dropped, block-level end
  # tags become newlines.
  curl $CURL_OPTS -L "$url" 2>/dev/null \
    | python3 -c "
import sys, codecs
# gesetze-im-internet.de uses ISO-8859-1 encoding
sys.stdin = codecs.getreader('iso-8859-1')(sys.stdin.buffer)
from html.parser import HTMLParser
class TextExtractor(HTMLParser):
    def __init__(self):
        super().__init__()
        self.text = []
        self.in_content = False
        self.skip = False
    def handle_starttag(self, tag, attrs):
        attrs_dict = dict(attrs)
        if tag == 'div' and 'jnhtml' in attrs_dict.get('class', ''):
            self.in_content = True
        if tag in ('script', 'style', 'nav', 'header', 'footer'):
            self.skip = True
    def handle_endtag(self, tag):
        if tag in ('script', 'style', 'nav', 'header', 'footer'):
            self.skip = False
        if tag in ('p', 'div', 'br', 'h1', 'h2', 'h3', 'h4', 'li'):
            self.text.append('\n')
    def handle_data(self, data):
        if not self.skip:
            self.text.append(data)
parser = TextExtractor()
parser.feed(sys.stdin.read())
print(''.join(parser.text).strip())
" > "$output" || {
    warn "Extraction failed: $label"
    return 1
  }
}
# Concatenate the per-section Markdown files of one law from the
# bundestag/gesetze repo into a single text file, separated by "---" rules.
# Arguments: $1 law directory, $2 output file, $3 human-readable label
# Returns:   0 always — a missing directory is warned about but not fatal
#            (callers in phase_download already guard with [[ -d ]]).
concat_bundestag_gesetz() {
  local gesetz_dir="$1"
  local output="$2"
  local label="$3"
  if [[ ! -d "$gesetz_dir" ]]; then
    warn "Gesetz directory not found: $gesetz_dir"
    return 0
  fi
  log "Concatenating: $label"
  {
    echo "# $label"
    echo ""
    # Lexicographic sort gives a stable (not strictly numeric) file order.
    # IFS= + read -r preserves leading whitespace and backslashes in paths.
    find "$gesetz_dir" -name "*.md" -type f | sort | while IFS= read -r f; do
      cat "$f"
      echo ""
      echo "---"
      echo ""
    done
  } > "$output"
}
# Print the number of points in a Qdrant collection, or "?" when the
# collection (or Qdrant itself) is unreachable / returns unparseable JSON.
collection_count() {
  local name="$1"
  local payload
  payload=$(curl -s "${QDRANT_URL}/collections/${name}" 2>/dev/null) || payload=""
  printf '%s' "$payload" \
    | python3 -c "import sys,json; print(json.load(sys.stdin)['result']['points_count'])" 2>/dev/null \
    || echo "?"
}
# =============================================================================
# PHASE A: Downloads
# =============================================================================
#######################################
# Fetch all raw source material: EUR-Lex PDFs, single statute sections from
# gesetze-im-internet.de, template Git repos, and EDPB/EDPS guidance PDFs,
# then pre-extract text from the bundestag/gesetze repo.
# Globals:   WORK_DIR (read) — results land in pdfs/, repos/, texts/
# NOTE(review): the first download_pdf/clone_repo calls below are not
# guarded with "|| true"; under set -e a single failed fetch aborts the
# whole script — confirm whether that is intended.
#######################################
phase_download() {
  log "=========================================="
  log "PHASE A: Downloads (PDFs + Git-Repos)"
  log "=========================================="
  mkdir -p "$WORK_DIR"/{pdfs,repos,texts}
  # --- A1: EUR-Lex PDFs ---
  log "--- EUR-Lex PDFs ---"
  download_pdf \
    "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32022R2065" \
    "$WORK_DIR/pdfs/dsa_2022_2065.pdf"
  download_pdf \
    "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32002L0058" \
    "$WORK_DIR/pdfs/eprivacy_2002_58.pdf"
  download_pdf \
    "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32021D0914" \
    "$WORK_DIR/pdfs/scc_2021_914.pdf"
  # --- A2: German statutes (individual sections) ---
  log "--- Deutsche Gesetze (Einzelparagraphen) ---"
  extract_gesetz_html \
    "https://www.gesetze-im-internet.de/ddg/__5.html" \
    "$WORK_DIR/texts/ddg_5.txt" \
    "DDG § 5 (Impressum)"
  # TDDDG is still published under its old name "ttdsg" on gesetze-im-internet.de
  extract_gesetz_html \
    "https://www.gesetze-im-internet.de/ttdsg/__25.html" \
    "$WORK_DIR/texts/tdddg_25.txt" \
    "TDDDG § 25 (Cookies)"
  extract_gesetz_html \
    "https://www.gesetze-im-internet.de/urhg/__5.html" \
    "$WORK_DIR/texts/urhg_5.txt" \
    "UrhG § 5 (Amtliche Werke)"
  # EGBGB Art. 246a § 1 (contains the reference to the model withdrawal notice)
  extract_gesetz_html \
    "https://www.gesetze-im-internet.de/bgbeg/art_246a__1.html" \
    "$WORK_DIR/texts/egbgb_widerruf.txt" \
    "EGBGB Muster-Widerrufsbelehrung"
  # --- A3: Git repos ---
  log "--- Git-Repos ---"
  clone_repo "https://github.com/bundestag/gesetze.git" \
    "$WORK_DIR/repos/bundestag-gesetze"
  clone_repo "https://github.com/github/site-policy.git" \
    "$WORK_DIR/repos/github-site-policy"
  clone_repo "https://github.com/opengovfoundation/site-policy.git" \
    "$WORK_DIR/repos/opengov-site-policy"
  clone_repo "https://github.com/creativecommons/cc-legal-tools-data.git" \
    "$WORK_DIR/repos/cc-legal-tools"
  clone_repo "https://github.com/oprvc/oprvc.github.io.git" \
    "$WORK_DIR/repos/oprvc"
  clone_repo "https://github.com/webflorist/privacy-policy-text.git" \
    "$WORK_DIR/repos/webflorist"
  clone_repo "https://github.com/Tempest-Solutions-Company/privacy-policy-generator.git" \
    "$WORK_DIR/repos/tempest-privacy" || true
  clone_repo "https://github.com/Tempest-Solutions-Company/terms-of-service-generator.git" \
    "$WORK_DIR/repos/tempest-tos" || true
  clone_repo "https://github.com/Tempest-Solutions-Company/cookie-banner-consent-solution.git" \
    "$WORK_DIR/repos/tempest-cookie" || true
  clone_repo "https://github.com/orestbida/cookieconsent.git" \
    "$WORK_DIR/repos/cookieconsent" || true
  # CommonPaper keeps a separate repo per contract type
  clone_repo "https://github.com/CommonPaper/CSA.git" \
    "$WORK_DIR/repos/common-paper-csa" || true
  clone_repo "https://github.com/CommonPaper/SLA.git" \
    "$WORK_DIR/repos/common-paper-sla" || true
  clone_repo "https://github.com/CommonPaper/PSA.git" \
    "$WORK_DIR/repos/common-paper-psa" || true
  # OpenCode.de (data-use clause templates) - try HTTPS
  clone_repo "https://gitlab.opencode.de/wernerth/datennutzungsklauseln-muster.git" \
    "$WORK_DIR/repos/datennutzungsklauseln" || true
  # --- A4: EDPB/EDPS PDFs (verified URLs) ---
  log "--- EDPB/EDPS Guidance PDFs ---"
  # EDPB Guidelines 05/2020 on Consent
  download_pdf \
    "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_202005_consent_en.pdf" \
    "$WORK_DIR/pdfs/edpb_consent_guidelines.pdf"
  # EDPB Guidelines 4/2019 Data Protection by Design and Default
  download_pdf \
    "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_201904_dataprotection_by_design_and_by_default_v2.0_en.pdf" \
    "$WORK_DIR/pdfs/edpb_privacy_by_design.pdf"
  # EDPB Guidelines 03/2022 Dark Patterns
  download_pdf \
    "https://www.edpb.europa.eu/system/files/2023-02/edpb_03-2022_guidelines_on_deceptive_design_patterns_in_social_media_platform_interfaces_v2_en_0.pdf" \
    "$WORK_DIR/pdfs/edpb_dark_patterns.pdf"
  # EDPB Guidelines 8/2020 Social Media Targeting
  download_pdf \
    "https://www.edpb.europa.eu/system/files/2021-04/edpb_guidelines_082020_on_the_targeting_of_social_media_users_en.pdf" \
    "$WORK_DIR/pdfs/edpb_social_media_targeting.pdf"
  # EDPB Cookie Banner Taskforce Report (Jan 2023)
  download_pdf \
    "https://www.edpb.europa.eu/system/files/2023-01/edpb_20230118_report_cookie_banner_taskforce_en.pdf" \
    "$WORK_DIR/pdfs/edpb_cookie_banner_taskforce.pdf"
  # EDPB Guidelines 2/2023 ePrivacy Art. 5(3) Technical Scope
  download_pdf \
    "https://www.edpb.europa.eu/system/files/2024-10/edpb_guidelines_202302_technical_scope_art_53_eprivacydirective_v2_en_0.pdf" \
    "$WORK_DIR/pdfs/edpb_eprivacy_art53.pdf"
  # EDPB Guidelines 1/2024 Legitimate Interest
  download_pdf \
    "https://www.edpb.europa.eu/system/files/2024-10/edpb_guidelines_202401_legitimateinterest_en.pdf" \
    "$WORK_DIR/pdfs/edpb_legitimate_interest.pdf"
  # EDPB DPO Coordinated Enforcement Report 2024
  download_pdf \
    "https://www.edpb.europa.eu/system/files/2024-01/edpb_report_20240116_cef_dpo_en.pdf" \
    "$WORK_DIR/pdfs/edpb_dpo_report.pdf"
  # EDPS GenAI Orientations (June 2024)
  download_pdf \
    "https://www.edps.europa.eu/system/files/2024-06/24-06-03_genai_orientations_en.pdf" \
    "$WORK_DIR/pdfs/edps_generative_ai.pdf"
  # EDPS Digital Ethics Report (2018)
  download_pdf \
    "https://edps.europa.eu/sites/edp/files/publication/18-01-25_eag_report_en.pdf" \
    "$WORK_DIR/pdfs/edps_digital_ethics.pdf"
  # --- A5: Text extraction from repos ---
  log "--- Text-Extraktion aus Repos ---"
  # bundestag/gesetze: available laws (the repo is partly outdated);
  # DDG, TDDDG, EGBGB are missing there — only BGB, UrhG, TMG exist.
  local -a bundestag_gesetze=(
    "b/bgb:BGB"
    "u/urhg:UrhG"
    "t/tmg:TMG"
  )
  for entry in "${bundestag_gesetze[@]}"; do
    # entry format: "<repo-subdir>:<label>"
    local path="${entry%%:*}"
    local label="${entry##*:}"
    local gesetz_dir="$WORK_DIR/repos/bundestag-gesetze/$path"
    if [[ -d "$gesetz_dir" ]]; then
      local name
      name=$(echo "$label" | tr '[:upper:]' '[:lower:]')
      concat_bundestag_gesetz "$gesetz_dir" \
        "$WORK_DIR/texts/bundestag_${name}_komplett.txt" \
        "$label (komplett)"
    else
      warn "Bundestag Gesetz nicht gefunden: $gesetz_dir"
    fi
  done
  log "Download phase complete."
}
# =============================================================================
# PHASE B: Deutsche Gesetze → bp_compliance_gesetze
# =============================================================================
#######################################
# Ingest German statutes into bp_compliance_gesetze: single sections from
# gesetze-im-internet.de (B1) plus full-text laws from bundestag/gesetze (B2).
# "|| true" on each upload keeps one failed document from aborting the whole
# run under set -e; upload_file already counts and reports failures.
#######################################
phase_gesetze() {
  log "=========================================="
  log "PHASE B: Deutsche Gesetze → bp_compliance_gesetze"
  log "=========================================="
  local col="bp_compliance_gesetze"
  local before
  before=$(collection_count "$col")
  log "Collection $col: $before chunks (before)"

  # B1: single sections
  upload_file "$WORK_DIR/texts/ddg_5.txt" "$col" "compliance" "legal_reference" "2024" \
    '{"regulation_id":"ddg_5","regulation_name_de":"Digitale-Dienste-Gesetz § 5","category":"impressum","license":"public_law","source":"gesetze-im-internet.de"}' \
    "DDG § 5 (Impressumspflicht)" || true
  upload_file "$WORK_DIR/texts/tdddg_25.txt" "$col" "compliance" "legal_reference" "2024" \
    '{"regulation_id":"tdddg_25","regulation_name_de":"TDDDG § 25","category":"cookies","license":"public_law","source":"gesetze-im-internet.de"}' \
    "TDDDG § 25 (Cookies/Endgeraetezugriff)" || true
  upload_file "$WORK_DIR/texts/urhg_5.txt" "$col" "compliance" "legal_reference" "2024" \
    '{"regulation_id":"urhg_5","regulation_name_de":"UrhG § 5","category":"urheberrecht","license":"public_law","source":"gesetze-im-internet.de"}' \
    "UrhG § 5 (Amtliche Werke)" || true
  upload_file "$WORK_DIR/texts/egbgb_widerruf.txt" "$col" "compliance" "legal_reference" "2024" \
    '{"regulation_id":"egbgb_widerruf","regulation_name_de":"EGBGB Muster-Widerrufsbelehrung","category":"widerruf","license":"public_law","source":"gesetze-im-internet.de"}' \
    "EGBGB Muster-Widerrufsbelehrung" || true

  # B2: full-text laws from the bundestag/gesetze repo,
  # entry format: "<id>:<short label>:<full name>"
  local -a bundestag_upload=(
    "bgb:BGB:Buergerliches Gesetzbuch"
    "urhg:UrhG:Urheberrechtsgesetz"
    "tmg:TMG:Telemediengesetz"
  )
  local entry gesetz rest label fullname file
  for entry in "${bundestag_upload[@]}"; do
    gesetz="${entry%%:*}"
    rest="${entry#*:}"
    label="${rest%%:*}"
    fullname="${rest#*:}"
    file="$WORK_DIR/texts/bundestag_${gesetz}_komplett.txt"
    if [[ -f "$file" ]]; then
      upload_file "$file" "$col" "compliance" "legal_reference" "2024" \
        "{\"regulation_id\":\"${gesetz}_komplett\",\"regulation_name_de\":\"$fullname ($label komplett)\",\"category\":\"volltext\",\"license\":\"unlicense\",\"source\":\"github.com/bundestag/gesetze\"}" \
        "$label komplett (Bundestag-Repo)" || true
    fi
  done

  local after
  after=$(collection_count "$col")
  log "Collection $col: $before → $after chunks"
}
# =============================================================================
# PHASE C: EU-Rechtstexte → bp_compliance_ce
# =============================================================================
#######################################
# Ingest the three EUR-Lex PDFs (DSA, ePrivacy directive, SCC decision)
# into bp_compliance_ce. "|| true" keeps one failed upload from aborting
# the run under set -e; upload_file already counts failures.
#######################################
phase_eu() {
  log "=========================================="
  log "PHASE C: EU-Rechtstexte → bp_compliance_ce"
  log "=========================================="
  local col="bp_compliance_ce"
  local before
  before=$(collection_count "$col")
  log "Collection $col: $before chunks (before)"

  upload_file "$WORK_DIR/pdfs/dsa_2022_2065.pdf" "$col" "compliance_ce" "legal_reference" "2022" \
    '{"regulation_id":"eu_2022_2065","regulation_name_de":"Digital Services Act (DSA)","regulation_name_en":"Digital Services Act","regulation_short":"DSA","category":"plattformregulierung","celex":"32022R2065","source":"eur-lex","license":"public_law"}' \
    "Digital Services Act (EU) 2022/2065" || true
  upload_file "$WORK_DIR/pdfs/eprivacy_2002_58.pdf" "$col" "compliance_ce" "legal_reference" "2002" \
    '{"regulation_id":"eu_2002_58","regulation_name_de":"ePrivacy-Richtlinie","regulation_name_en":"ePrivacy Directive","regulation_short":"ePrivacy","category":"datenschutz","celex":"32002L0058","source":"eur-lex","license":"public_law"}' \
    "ePrivacy-Richtlinie 2002/58/EC" || true
  upload_file "$WORK_DIR/pdfs/scc_2021_914.pdf" "$col" "compliance_ce" "legal_reference" "2021" \
    '{"regulation_id":"eu_2021_914","regulation_name_de":"Standardvertragsklauseln (SCC)","regulation_name_en":"Standard Contractual Clauses","regulation_short":"SCC","category":"datentransfer","celex":"32021D0914","source":"eur-lex","license":"public_law"}' \
    "Standardvertragsklauseln (EU) 2021/914" || true

  local after
  after=$(collection_count "$col")
  log "Collection $col: $before → $after chunks"
}
# =============================================================================
# PHASE D: Templates/Textbausteine → bp_legal_templates
# =============================================================================
#######################################
# Ingest ToS/privacy/cookie/contract templates from the cloned repos into
# bp_legal_templates.
#
# Every file loop feeds `while read` via process substitution
# (done < <(find …)) instead of `find | while …`, so the loop body runs in
# THIS shell: the UPLOADED/FAILED/SKIPPED counters incremented inside
# upload_file survive the loop (a pipeline would run the body in a subshell
# and silently lose them). "|| true" on each upload keeps a single failed
# document from aborting the run under set -e.
#######################################
phase_templates() {
  log "=========================================="
  log "PHASE D: Templates → bp_legal_templates"
  log "=========================================="
  local col="bp_legal_templates"
  local before
  before=$(collection_count "$col")
  log "Collection $col: $before chunks (before)"
  local repo f fname doc_type entry license_dir

  # --- D1: GitHub Site Policy (CC0) ---
  repo="$WORK_DIR/repos/github-site-policy"
  if [[ -d "$repo" ]]; then
    log "--- GitHub Site Policy ---"
    while IFS= read -r f; do
      fname=$(basename "$f" .md)
      # Classify the document type from the file name.
      doc_type="policy"
      case "$fname" in
        *terms*|*tos*|*service*) doc_type="tos" ;;
        *privacy*|*data*) doc_type="privacy_policy" ;;
        *dmca*|*copyright*) doc_type="dmca" ;;
        *acceptable*|*use*) doc_type="acceptable_use" ;;
      esac
      upload_file "$f" "$col" "legal_template" "template" "2024" \
        "{\"source_id\":\"github_site_policy\",\"doc_type\":\"$doc_type\",\"license\":\"cc0\",\"source\":\"github.com/github/site-policy\",\"filename\":\"$fname\"}" \
        "GitHub: $fname" || true
    done < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" -not -name "CONTRIBUTING.md" | sort)
  fi

  # --- D2: OpenGov Site Policy (CC0) ---
  repo="$WORK_DIR/repos/opengov-site-policy"
  if [[ -d "$repo" ]]; then
    log "--- OpenGov Site Policy ---"
    while IFS= read -r f; do
      fname=$(basename "$f" .md)
      doc_type="policy"
      case "$fname" in
        *terms*|*tos*) doc_type="tos" ;;
        *privacy*) doc_type="privacy_policy" ;;
      esac
      upload_file "$f" "$col" "legal_template" "template" "2024" \
        "{\"source_id\":\"opengov_site_policy\",\"doc_type\":\"$doc_type\",\"license\":\"cc0\",\"source\":\"github.com/opengovfoundation/site-policy\",\"filename\":\"$fname\"}" \
        "OpenGov: $fname" || true
    done < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" -not -name "CONTRIBUTING.md" | sort)
  fi

  # --- D3: Creative Commons Legal Tools (CC0) ---
  repo="$WORK_DIR/repos/cc-legal-tools"
  if [[ -d "$repo" ]]; then
    log "--- CC Legal Tools (ausgewaehlte Lizenztexte) ---"
    # Only ingest the main license deeds (DE legalcode where available, else EN)
    for license_dir in "$repo"/legalcode/de/CC0_1.0 "$repo"/legalcode/de/CC-BY_4.0 "$repo"/legalcode/de/CC-BY-SA_4.0; do
      if [[ -d "$license_dir" ]]; then
        while IFS= read -r f; do
          fname=$(basename "$f")
          upload_file "$f" "$col" "legal_template" "template" "2024" \
            "{\"source_id\":\"cc_legal_tools\",\"doc_type\":\"license_text\",\"license\":\"cc0\",\"source\":\"github.com/creativecommons/cc-legal-tools-data\",\"filename\":\"$fname\"}" \
            "CC License: $fname" || true
        done < <(find "$license_dir" -name "*.html" -o -name "*.txt" -o -name "*.md" 2>/dev/null | head -3)
      fi
    done
    # Fallback: try top-level legalcode files
    while IFS= read -r f; do
      fname=$(basename "$f")
      upload_file "$f" "$col" "legal_template" "template" "2024" \
        "{\"source_id\":\"cc_legal_tools\",\"doc_type\":\"license_text\",\"license\":\"cc0\",\"source\":\"github.com/creativecommons/cc-legal-tools-data\",\"filename\":\"$fname\"}" \
        "CC License: $fname" || true
    done < <(find "$repo"/legalcode -maxdepth 2 -name "*4.0*legalcode*de*" -type f 2>/dev/null | head -5)
  fi

  # --- D4: opr.vc DSGVO model texts (CC0) ---
  repo="$WORK_DIR/repos/oprvc"
  if [[ -d "$repo" ]]; then
    log "--- opr.vc DSGVO-Mustertexte ---"
    # Look for German privacy/DSGVO content
    while IFS= read -r f; do
      fname=$(basename "$f")
      upload_file "$f" "$col" "legal_template" "template" "2024" \
        "{\"source_id\":\"oprvc\",\"doc_type\":\"privacy_policy\",\"license\":\"cc0\",\"source\":\"github.com/oprvc/oprvc.github.io\",\"filename\":\"$fname\"}" \
        "opr.vc: $fname" || true
    done < <(find "$repo" \( -name "*.md" -o -name "*.html" -o -name "*.txt" \) \
      -not -path "*/.git/*" -not -name "README.md" 2>/dev/null \
      | grep -iE "(datenschutz|privacy|dsgvo|gdpr|impressum)" \
      | head -20)
    # If no topic-specific files were found, fall back to the first markdown files
    if [[ $(find "$repo" \( -name "*.md" -o -name "*.html" \) -not -path "*/.git/*" -not -name "README.md" | grep -ciE "(datenschutz|privacy|dsgvo|gdpr)" 2>/dev/null) -eq 0 ]]; then
      while IFS= read -r f; do
        fname=$(basename "$f")
        upload_file "$f" "$col" "legal_template" "template" "2024" \
          "{\"source_id\":\"oprvc\",\"doc_type\":\"privacy_policy\",\"license\":\"cc0\",\"source\":\"github.com/oprvc/oprvc.github.io\",\"filename\":\"$fname\"}" \
          "opr.vc: $fname" || true
      done < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" | head -10)
    fi
  fi

  # --- D5: webflorist/privacy-policy-text (MIT) ---
  repo="$WORK_DIR/repos/webflorist"
  if [[ -d "$repo" ]]; then
    log "--- webflorist Privacy Policy Text ---"
    # Look for JSON/text building blocks (German)
    while IFS= read -r f; do
      fname=$(basename "$f")
      upload_file "$f" "$col" "legal_template" "template" "2024" \
        "{\"source_id\":\"webflorist\",\"doc_type\":\"privacy_policy\",\"license\":\"mit\",\"source\":\"github.com/webflorist/privacy-policy-text\",\"filename\":\"$fname\"}" \
        "webflorist: $fname" || true
    done < <(find "$repo" \( -name "*.json" -o -name "*.txt" -o -name "*.md" -o -name "*.php" \) \
      -not -path "*/.git/*" -not -path "*/node_modules/*" -not -name "package*.json" \
      -not -name "composer.json" -not -name "README.md" 2>/dev/null \
      | head -20)
  fi

  # --- D6: Tempest Privacy Policy Generator (MIT) ---
  repo="$WORK_DIR/repos/tempest-privacy"
  if [[ -d "$repo" ]]; then
    log "--- Tempest Privacy Policy Generator ---"
    while IFS= read -r f; do
      fname=$(basename "$f")
      upload_file "$f" "$col" "legal_template" "template" "2024" \
        "{\"source_id\":\"tempest_privacy\",\"doc_type\":\"privacy_policy\",\"license\":\"mit\",\"source\":\"github.com/Tempest-Solutions-Company/privacy-policy-generator\",\"filename\":\"$fname\"}" \
        "Tempest Privacy: $fname" || true
    done < <(find "$repo" \( -name "*.md" -o -name "*.txt" -o -name "*.html" -o -name "*.json" \) \
      -not -path "*/.git/*" -not -path "*/node_modules/*" \
      -not -name "package*.json" -not -name "README.md" 2>/dev/null \
      | head -15)
  fi

  # --- D7: Tempest Terms of Service Generator (MIT) ---
  repo="$WORK_DIR/repos/tempest-tos"
  if [[ -d "$repo" ]]; then
    log "--- Tempest Terms of Service Generator ---"
    while IFS= read -r f; do
      fname=$(basename "$f")
      upload_file "$f" "$col" "legal_template" "template" "2024" \
        "{\"source_id\":\"tempest_tos\",\"doc_type\":\"tos\",\"license\":\"mit\",\"source\":\"github.com/Tempest-Solutions-Company/terms-of-service-generator\",\"filename\":\"$fname\"}" \
        "Tempest ToS: $fname" || true
    done < <(find "$repo" \( -name "*.md" -o -name "*.txt" -o -name "*.html" -o -name "*.json" \) \
      -not -path "*/.git/*" -not -path "*/node_modules/*" \
      -not -name "package*.json" -not -name "README.md" 2>/dev/null \
      | head -15)
  fi

  # --- D8: Tempest Cookie Banner (MIT) ---
  repo="$WORK_DIR/repos/tempest-cookie"
  if [[ -d "$repo" ]]; then
    log "--- Tempest Cookie Banner ---"
    while IFS= read -r f; do
      fname=$(basename "$f")
      upload_file "$f" "$col" "legal_template" "template" "2024" \
        "{\"source_id\":\"tempest_cookie\",\"doc_type\":\"cookie_banner\",\"license\":\"mit\",\"source\":\"github.com/Tempest-Solutions-Company/cookie-banner-consent-solution\",\"filename\":\"$fname\"}" \
        "Tempest Cookie: $fname" || true
    done < <(find "$repo" \( -name "*.md" -o -name "*.txt" -o -name "*.html" -o -name "*.json" \) \
      -not -path "*/.git/*" -not -path "*/node_modules/*" \
      -not -name "package*.json" -not -name "README.md" 2>/dev/null \
      | head -15)
  fi

  # --- D9: CookieConsent (orestbida) - UI Strings (MIT) ---
  repo="$WORK_DIR/repos/cookieconsent"
  if [[ -d "$repo" ]]; then
    log "--- CookieConsent UI Strings ---"
    # Look for translation/language files
    while IFS= read -r f; do
      fname=$(basename "$f")
      upload_file "$f" "$col" "legal_template" "template" "2024" \
        "{\"source_id\":\"cookieconsent\",\"doc_type\":\"cookie_consent\",\"license\":\"mit\",\"source\":\"github.com/orestbida/cookieconsent\",\"filename\":\"$fname\"}" \
        "CookieConsent: $fname" || true
    done < <(find "$repo" -path "*/translations/*" -o -path "*/languages/*" -o -path "*/i18n/*" -o -path "*/locales/*" 2>/dev/null \
      | grep -iE "\.(json|js|ts)$" | head -10)
    # Also check for example configs
    while IFS= read -r f; do
      fname=$(basename "$f")
      upload_file "$f" "$col" "legal_template" "template" "2024" \
        "{\"source_id\":\"cookieconsent\",\"doc_type\":\"cookie_consent\",\"license\":\"mit\",\"source\":\"github.com/orestbida/cookieconsent\",\"filename\":\"$fname\"}" \
        "CookieConsent Docs: $fname" || true
    done < <(find "$repo" -name "*.md" -path "*/docs/*" 2>/dev/null | head -5)
  fi

  # --- D10: Common Paper (CC BY 4.0) ---
  log "--- Common Paper Standards ---"
  # entry format: "<repo dir>:<doc_type>:<label>"
  local -a cp_repos=(
    "common-paper-csa:saas_contract:CSA"
    "common-paper-sla:sla:SLA"
    "common-paper-psa:psa:PSA"
  )
  local cp_dir rest cp_doc_type cp_label
  for entry in "${cp_repos[@]}"; do
    cp_dir="${entry%%:*}"
    rest="${entry#*:}"
    cp_doc_type="${rest%%:*}"
    cp_label="${rest#*:}"
    repo="$WORK_DIR/repos/$cp_dir"
    if [[ -d "$repo" ]]; then
      while IFS= read -r f; do
        fname=$(basename "$f" .md)
        upload_file "$f" "$col" "legal_template" "template" "2024" \
          "{\"source_id\":\"common_paper\",\"doc_type\":\"$cp_doc_type\",\"license\":\"cc_by_4\",\"attribution\":\"Common Paper Inc., licensed under CC BY 4.0\",\"source\":\"github.com/CommonPaper/$cp_label\",\"filename\":\"$fname\"}" \
          "CommonPaper $cp_label: $fname" || true
      done < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" \
        -not -name "CONTRIBUTING.md" -not -name "CHANGELOG.md" -not -name "CODE_OF_CONDUCT.md" 2>/dev/null \
        | head -10)
    fi
  done

  # --- D11: Datennutzungsklauseln (CC BY 4.0) ---
  repo="$WORK_DIR/repos/datennutzungsklauseln"
  if [[ -d "$repo" ]]; then
    log "--- Datennutzungsklauseln ---"
    while IFS= read -r f; do
      fname=$(basename "$f" .md)
      upload_file "$f" "$col" "legal_template" "template" "2024" \
        "{\"source_id\":\"datennutzungsklauseln\",\"doc_type\":\"data_clause\",\"license\":\"cc_by_4\",\"attribution\":\"OpenCode.de, lizenziert unter CC BY 4.0\",\"source\":\"gitlab.opencode.de/wernerth/datennutzungsklauseln-muster\",\"filename\":\"$fname\"}" \
        "Datennutzungsklausel: $fname" || true
    done < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" 2>/dev/null \
      | head -15)
  fi

  local after
  after=$(collection_count "$col")
  log "Collection $col: $before → $after chunks"
}
# =============================================================================
# PHASE E: Datenschutz-Guidance → bp_compliance_datenschutz
# =============================================================================
#######################################
# Ingest the downloaded EDPB/EDPS guidance PDFs into
# bp_compliance_datenschutz. The human-readable name is derived from the
# file name: edpb_<name>.pdf → "<name>" with underscores turned into spaces.
# "|| true" keeps a single failed upload from aborting the run under set -e.
#######################################
phase_datenschutz() {
  log "=========================================="
  log "PHASE E: Datenschutz-Guidance → bp_compliance_datenschutz"
  log "=========================================="
  local col="bp_compliance_datenschutz"
  local before
  before=$(collection_count "$col")
  log "Collection $col: $before chunks (before)"
  local pdf stem

  # EDPB Guidelines ([[ -f ]] guards against the literal unexpanded glob
  # when no edpb_*.pdf files exist, since nullglob is not set)
  for pdf in "$WORK_DIR"/pdfs/edpb_*.pdf; do
    if [[ -f "$pdf" ]]; then
      stem=$(basename "$pdf" .pdf)
      local guideline_name="${stem#edpb_}"
      guideline_name="${guideline_name//_/ }"
      upload_file "$pdf" "$col" "compliance_datenschutz" "guidance" "2024" \
        "{\"source_id\":\"edpb\",\"doc_type\":\"guidance\",\"guideline_name\":\"$guideline_name\",\"license\":\"reuse_notice\",\"attribution\":\"European Data Protection Board (EDPB)\",\"source\":\"edpb.europa.eu\"}" \
        "EDPB: $guideline_name" || true
    fi
  done

  # EDPS Guidance
  for pdf in "$WORK_DIR"/pdfs/edps_*.pdf; do
    if [[ -f "$pdf" ]]; then
      stem=$(basename "$pdf" .pdf)
      local guidance_name="${stem#edps_}"
      guidance_name="${guidance_name//_/ }"
      upload_file "$pdf" "$col" "compliance_datenschutz" "guidance" "2024" \
        "{\"source_id\":\"edps\",\"doc_type\":\"guidance\",\"guidance_name\":\"$guidance_name\",\"license\":\"reuse_notice\",\"attribution\":\"European Data Protection Supervisor (EDPS)\",\"source\":\"edps.europa.eu\"}" \
        "EDPS: $guidance_name" || true
    fi
  done

  local after
  after=$(collection_count "$col")
  log "Collection $col: $before → $after chunks"
}
# =============================================================================
# PHASE F: Verifizierung
# =============================================================================
# Run one test search against the RAG search endpoint and pretty-print the
# top results (score, regulation code, content snippet).
# Globals:   CURL_OPTS (read)
# Arguments: $1 label shown in the log, $2 query sent to the API,
#            $3 collection name shown in the log
_test_search() {
  local label="$1"
  local query="$2"
  local col="$3"
  log "Suche: '$label' in $col"
  # shellcheck disable=SC2086 — CURL_OPTS is intentionally word-split
  curl $CURL_OPTS -X POST "https://localhost:8097/api/v1/search" \
    -H 'Content-Type: application/json' \
    -d "{\"query\":\"$query\",\"regulation_codes\":null,\"limit\":3,\"min_score\":0.5}" 2>/dev/null \
    | python3 -c "
import sys,json
try:
    data = json.load(sys.stdin)
    results = data.get('results', [])
    print(f' Treffer: {len(results)}')
    for r in results[:3]:
        print(f'   [{r.get(\"score\",0):.3f}] {r.get(\"regulation_code\",\"?\")} - {r.get(\"content\",\"\")[:80]}...')
except: print(' (parse error)')
" 2>/dev/null || echo " (search failed)"
}

#######################################
# Verify the ingestion: print per-collection chunk counts, then run one
# smoke-test search per collection via _test_search (the three previously
# duplicated curl|python blocks collapsed into the helper above).
#######################################
phase_verify() {
  log "=========================================="
  log "PHASE F: Verifizierung"
  log "=========================================="
  echo ""
  echo "=== Collection Stats ==="
  local col count
  for col in bp_compliance_gesetze bp_compliance_ce bp_legal_templates bp_compliance_datenschutz; do
    count=$(collection_count "$col")
    printf "  %-30s %s chunks\n" "$col" "$count"
  done
  echo ""
  echo "=== Test-Suchen ==="
  _test_search "Impressumspflicht digitale Dienste" "Impressumspflicht digitale Dienste" "bp_compliance_gesetze"
  _test_search "Cookie Einwilligung" "Cookie Einwilligung ePrivacy" "bp_compliance_ce"
  _test_search "Privacy Policy Template GDPR" "Privacy Policy Template GDPR" "bp_legal_templates"
  echo ""
}
# =============================================================================
# PHASE G: Corpus Version Registration
# =============================================================================
#######################################
# Register one row per non-empty Qdrant collection in the
# compliance_corpus_versions table. Versions are date-scoped
# ("YYYY-MM-DD.N"); N is 1 + the number of versions already
# registered today for that collection.
# Globals:
#   DB_URL     (read) — PostgreSQL connection string for psql
#   QDRANT_URL (read) — used to fetch collection info for the digest
#   UPLOADED   (read) — NOTE(review): this is the script-wide upload
#                       counter, not per-collection; every row written
#                       in one run gets the same documents_count —
#                       confirm this is intended
# Outputs:
#   progress via log/ok/warn; failures are non-fatal (warn + continue)
#######################################
phase_register_version() {
  log "=========================================="
  log "PHASE G: Corpus Version Registration"
  log "=========================================="
  local today
  today=$(date '+%Y-%m-%d')
  local col count existing_count seq version regs digest
  for col in bp_compliance_gesetze bp_compliance_ce bp_legal_templates bp_compliance_datenschutz; do
    count=$(collection_count "$col")
    # Skip empty collections; also skip anything non-numeric ("?" or
    # garbage) so $count can never break the SQL below.
    if ! [[ "$count" =~ ^[0-9]+$ ]] || (( count == 0 )); then
      warn "Skipping version for $col (count=$count)"
      continue
    fi
    # Determine next version number for today (count of existing
    # "today.*" versions for this collection, plus one).
    existing_count=$(psql "$DB_URL" -tAc \
      "SELECT COUNT(*) FROM compliance_corpus_versions WHERE collection_name='$col' AND version LIKE '${today}.%'" \
      2>/dev/null || echo "0")
    # psql may emit an empty string or noise when the DB is flaky;
    # fall back to 0 so the arithmetic below cannot abort the script.
    [[ "$existing_count" =~ ^[0-9]+$ ]] || existing_count=0
    seq=$((existing_count + 1))
    version="${today}.${seq}"
    # Regulations covered, as a Postgres array literal, keyed by
    # collection (fixed values — safe to interpolate into SQL).
    regs=""
    case "$col" in
      bp_compliance_ce)
        regs='{eu_2022_2065,eu_2002_58,eu_2021_914}'
        ;;
      bp_compliance_gesetze)
        regs='{ddg_5,tdddg_25,urhg_5,egbgb_widerruf,bgb_komplett,urhg_komplett,tmg_komplett}'
        ;;
      bp_legal_templates)
        regs='{github_site_policy,opengov_site_policy,cc_legal_tools,common_paper,webflorist,tempest,cookieconsent}'
        ;;
      bp_compliance_datenschutz)
        regs='{edpb_consent,edpb_privacy_by_design,edpb_dark_patterns,edpb_social_media,edpb_cookie_banner,edps_generative_ai,edps_digital_ethics}'
        ;;
    esac
    # Fingerprint the Qdrant collection info (sha256 of the sorted
    # JSON, truncated to 32 hex chars); empty string if unreachable.
    digest=$(curl -s "${QDRANT_URL}/collections/${col}" 2>/dev/null \
      | python3 -c "import sys,json,hashlib; d=json.load(sys.stdin); print(hashlib.sha256(json.dumps(d.get('result',{}), sort_keys=True).encode()).hexdigest()[:32])" \
      2>/dev/null || echo "")
    log "Registering version $version for $col ($count chunks)"
    # Explicit if/else instead of `cmd && ok || warn`: with the old
    # form, warn would also fire if ok itself returned non-zero.
    if psql "$DB_URL" -c "
      INSERT INTO compliance_corpus_versions
      (version, collection_name, documents_count, chunks_count, regulations, digest, ingestion_source, created_by)
      VALUES
      ('${version}', '${col}', ${UPLOADED}, ${count}, '${regs}', '${digest}', 'ingest-legal-corpus.sh', 'system')
    " 2>/dev/null; then
      ok "Version $version registered for $col"
    else
      warn "Version registration failed for $col (DB not available?)"
    fi
  done
}
# =============================================================================
# MAIN
# =============================================================================
#######################################
# Entry point: print the run configuration, verify that the RAG API
# and Qdrant are reachable, then run either a single phase
# (--only PHASE) or the full pipeline, and print a summary.
# Globals:
#   WORK_DIR, RAG_URL, QDRANT_URL, CURL_OPTS (read)
#   ONLY_PHASE, SKIP_DOWNLOAD (read) — set by argument parsing
#   UPLOADED, FAILED, SKIPPED (read) — counters maintained by phases
# Returns:
#   exits 1 on unreachable services, unknown phase, or failed uploads
#######################################
main() {
  log "=========================================="
  log "BreakPilot Legal Corpus Ingestion"
  log "=========================================="
  log "Work dir: $WORK_DIR"
  log "RAG API: $RAG_URL"
  log "Qdrant: $QDRANT_URL"
  echo ""

  # Preflight: a bare POST to the upload endpoint should answer with a
  # validation error mentioning the missing "file" field.
  # shellcheck disable=SC2086 — CURL_OPTS holds multiple flags
  if curl $CURL_OPTS "$RAG_URL" -X POST 2>/dev/null | grep -q "file"; then
    ok "RAG API reachable"
  else
    fail "RAG API not reachable at $RAG_URL"
    exit 1
  fi

  # Preflight: Qdrant must list its collections.
  if curl -s "$QDRANT_URL/collections" >/dev/null 2>&1; then
    ok "Qdrant reachable"
  else
    fail "Qdrant not reachable at $QDRANT_URL"
    exit 1
  fi
  echo ""

  if [[ -n "$ONLY_PHASE" ]]; then
    # Single-phase mode: dispatch exactly one phase and nothing else.
    case "$ONLY_PHASE" in
      download)    phase_download ;;
      gesetze)     phase_gesetze ;;
      eu)          phase_eu ;;
      templates)   phase_templates ;;
      datenschutz) phase_datenschutz ;;
      verify)      phase_verify ;;
      version)     phase_register_version ;;
      *)
        fail "Unknown phase: $ONLY_PHASE"
        exit 1
        ;;
    esac
  else
    # Full pipeline; the download phase can be skipped via flag.
    if [[ "$SKIP_DOWNLOAD" == "true" ]]; then
      log "Skipping download phase (--skip-download)"
    else
      phase_download
    fi
    echo ""
    phase_gesetze
    echo ""
    phase_eu
    echo ""
    phase_templates
    echo ""
    phase_datenschutz
    echo ""
    phase_verify
    echo ""
    phase_register_version
  fi

  # Final summary of the counters the phases maintained.
  echo ""
  log "=========================================="
  log "ERGEBNIS"
  log "=========================================="
  log "Uploaded: $UPLOADED"
  log "Failed: $FAILED"
  log "Skipped: $SKIPPED"
  log "=========================================="
  if (( FAILED > 0 )); then
    warn "$FAILED uploads fehlgeschlagen!"
    exit 1
  fi
  ok "Ingestion abgeschlossen!"
}
main "$@"