Files
breakpilot-compliance/scripts/ingest-legal-corpus.sh
Benjamin Admin 999cc81c78
Some checks failed
CI/CD / go-lint (push) Has been cancelled
CI/CD / python-lint (push) Has been cancelled
CI/CD / nodejs-lint (push) Has been cancelled
CI/CD / test-go-ai-compliance (push) Has been cancelled
CI/CD / test-python-backend-compliance (push) Has been cancelled
CI/CD / test-python-document-crawler (push) Has been cancelled
CI/CD / test-python-dsms-gateway (push) Has been cancelled
CI/CD / deploy-hetzner (push) Has been cancelled
feat(rag): Phase J — Security Guidelines & Standards (NIST, OWASP, ENISA)
Add phase_security() with 15 documents across 3 sub-phases:
- J1: 7 NIST standards (SP 800-53, 800-218, 800-63, 800-207, 8259A/B, AI RMF)
- J2: 6 OWASP projects (Top 10, API Security, ASVS, MASVS, SAMM, Mobile Top 10)
- J3: 2 ENISA guides (Procurement Hospitals, Cloud Security SMEs)

All documents are commercially licensed (Public Domain / CC BY / CC BY-SA).
Wire up 'security' phase in dispatcher and workflow yaml.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 16:14:44 +01:00

2032 lines
93 KiB
Bash
Executable File

#!/usr/bin/env bash
# =============================================================================
# BreakPilot Compliance — RAG Legal Corpus Ingestion
#
# Laedt 23 freie Rechtsquellen herunter und ingestiert sie in Qdrant
# via die Core RAG-API (Port 8097).
#
# Ausfuehrung auf dem Mac Mini:
# ~/rag-ingestion/ingest-legal-corpus.sh [--skip-download] [--only PHASE]
#
# Phasen: download, gesetze, eu, templates, datenschutz, verify
# =============================================================================
set -euo pipefail

# --- Configuration -----------------------------------------------------------
# All endpoints and paths can be overridden via environment variables.
WORK_DIR="${WORK_DIR:-$HOME/rag-ingestion}"
RAG_URL="${RAG_URL:-https://localhost:8097/api/v1/documents/upload}"
QDRANT_URL="${QDRANT_URL:-http://localhost:6333}"
SDK_URL="${SDK_URL:-https://localhost:8093}"
DB_URL="${DB_URL:-postgresql://localhost:5432/breakpilot?search_path=compliance,core,public}"
# Shared curl flag sets (-k: local services use self-signed certificates).
CURL_OPTS="-sk --connect-timeout 10 --max-time 300"
CURL_OPTS_LARGE="-sk --connect-timeout 10 --max-time 900"

# Counters (tallied by upload_file, reported at the end of the run)
UPLOADED=0
FAILED=0
SKIPPED=0

# --- CLI Args ----------------------------------------------------------------
SKIP_DOWNLOAD=false
ONLY_PHASE=""
while [[ $# -gt 0 ]]; do
  case $1 in
    --skip-download) SKIP_DOWNLOAD=true; shift ;;
    # ${2:?} aborts with a clear message instead of an opaque
    # "unbound variable" error (set -u) when --only is given without a value.
    --only) ONLY_PHASE="${2:?--only requires a PHASE argument}"; shift 2 ;;
    -h|--help)
      echo "Usage: $0 [--skip-download] [--only PHASE]"
      echo "Phases: download, gesetze, eu, templates, datenschutz, verbraucherschutz, verify, version"
      exit 0
      ;;
    *) echo "Unknown option: $1"; exit 1 ;;
  esac
done
# --- Helpers -----------------------------------------------------------------
# Timestamped logging; warn/fail write to stderr so they survive redirection.
log()  { printf '[%s] %s\n'   "$(date '+%H:%M:%S')" "$*"; }
ok()   { printf '[%s] ✓ %s\n' "$(date '+%H:%M:%S')" "$*"; }
warn() { printf '[%s] ⚠ %s\n' "$(date '+%H:%M:%S')" "$*" >&2; }
fail() { printf '[%s] ✗ %s\n' "$(date '+%H:%M:%S')" "$*" >&2; }
upload_file() {
  # Upload one document to the Core RAG ingestion API.
  #   $1 file           path to the document
  #   $2 collection     target Qdrant collection
  #   $3 data_type      RAG "data_type" form field
  #   $4 use_case       RAG "use_case" form field
  #   $5 year           document year
  #   $6 metadata_json  extra metadata (JSON; may carry "regulation_id")
  #   $7 label          optional display label (default: file basename)
  # Always returns 0 so one bad document never aborts the run; outcomes are
  # tallied in the UPLOADED/FAILED/SKIPPED globals instead.
  local file="$1"
  local collection="$2"
  local data_type="$3"
  local use_case="$4"
  local year="$5"
  local metadata_json="$6"
  local label="${7:-$(basename "$file")}"
  if [[ ! -f "$file" ]]; then
    warn "File not found: $file"
    FAILED=$((FAILED + 1))
    return 0 # Don't abort script
  fi
  # Dedup check: skip the upload when this regulation_id already exists in
  # the target Qdrant collection.
  local reg_id
  reg_id=$(echo "$metadata_json" | python3 -c "import sys,json; print(json.load(sys.stdin).get('regulation_id',''))" 2>/dev/null || echo "")
  if [[ -n "$reg_id" && -n "${QDRANT_URL:-}" ]]; then
    local qdrant_auth=""
    [[ -n "${QDRANT_API_KEY:-}" ]] && qdrant_auth="-H api-key:${QDRANT_API_KEY}"
    local existing
    # $qdrant_auth is intentionally unquoted: it must split into "-H" + value.
    existing=$(curl -sk --max-time 5 $qdrant_auth -X POST "${QDRANT_URL}/collections/${collection}/points/scroll" \
      -H "Content-Type: application/json" \
      -d "{\"filter\":{\"must\":[{\"key\":\"regulation_id\",\"match\":{\"value\":\"$reg_id\"}}]},\"limit\":1}" \
      2>/dev/null | python3 -c "import sys,json; r=json.load(sys.stdin).get('result',{}); print(len(r.get('points',[])))" 2>/dev/null || echo "0")
    # Validate numerically before comparing: a curl/python failure can leave
    # a non-numeric string, and -gt would raise an evaluation error on it.
    if [[ "$existing" =~ ^[0-9]+$ && "$existing" -gt 0 ]]; then
      log "⏭ Skip (already in Qdrant): $label [regulation_id=$reg_id]"
      SKIPPED=$((SKIPPED + 1))
      return 0
    fi
  fi
  local filesize
  # stat -f%z is BSD/macOS, stat -c%s is GNU/Linux.
  filesize=$(stat -f%z "$file" 2>/dev/null || stat -c%s "$file" 2>/dev/null || echo 0)
  if [[ "$filesize" -lt 100 ]]; then
    warn "File too small (${filesize}B), skipping: $label"
    SKIPPED=$((SKIPPED + 1))
    return 0 # Don't abort script
  fi
  log "Uploading: $label → $collection ($(( filesize / 1024 ))KB)"
  # Use longer timeout for large files (>250KB)
  local curl_opts="$CURL_OPTS"
  if [[ "$filesize" -gt 256000 ]]; then
    curl_opts="$CURL_OPTS_LARGE"
    log " (large file, using extended timeout)"
  fi
  local response
  # $curl_opts is intentionally unquoted: it is a list of flags.
  response=$(curl $curl_opts -X POST "$RAG_URL" \
    -F "file=@${file}" \
    -F "collection=${collection}" \
    -F "data_type=${data_type}" \
    -F "use_case=${use_case}" \
    -F "year=${year}" \
    -F "chunk_strategy=recursive" \
    -F "chunk_size=512" \
    -F "chunk_overlap=50" \
    -F "metadata_json=${metadata_json}" \
    2>/dev/null) || true
  # The API reports either "chunks_count" or "vectors_indexed" on success.
  if echo "$response" | grep -q '"chunks_count"'; then
    local chunks
    chunks=$(echo "$response" | python3 -c "import sys,json; print(json.load(sys.stdin).get('chunks_count',0))" 2>/dev/null || echo "?")
    ok "$label → $chunks chunks"
    UPLOADED=$((UPLOADED + 1))
  elif echo "$response" | grep -q '"vectors_indexed"'; then
    local vectors
    vectors=$(echo "$response" | python3 -c "import sys,json; print(json.load(sys.stdin).get('vectors_indexed',0))" 2>/dev/null || echo "?")
    ok "$label → $vectors vectors"
    UPLOADED=$((UPLOADED + 1))
  else
    fail "Upload failed: $label"
    fail "Response: ${response:0:200}"
    FAILED=$((FAILED + 1))
    return 0 # Don't abort script on individual upload failure
  fi
}
clone_repo() {
  # Shallow-clone a git repository unless the target directory already exists.
  # Returns 1 when the clone fails (callers decide whether that is fatal).
  local repo_url="$1"
  local dest="$2"
  if [[ -d "$dest" ]]; then
    log "Repo exists: $dest (skipping clone)"
    return 0
  fi
  log "Cloning: $repo_url"
  if ! git clone --depth 1 "$repo_url" "$dest" 2>/dev/null; then
    warn "Clone failed: $repo_url"
    return 1
  fi
}
download_pdf() {
  # Fetch a PDF to $2 unless it is already present. Responses under 1000
  # bytes are treated as HTML error pages and discarded. Always returns 0.
  local src="$1"
  local dest="$2"
  if [[ -f "$dest" ]]; then
    log "PDF exists: $(basename "$dest") (skipping)"
    return 0
  fi
  log "Downloading: $(basename "$dest")"
  # $CURL_OPTS is intentionally unquoted (list of flags).
  if ! curl $CURL_OPTS -L "$src" -o "$dest" 2>/dev/null; then
    warn "Download failed: $src"
    rm -f "$dest"
    return 0
  fi
  # Verify file is not empty/too small (e.g. HTML error page)
  local bytes
  bytes=$(stat -f%z "$dest" 2>/dev/null || stat -c%s "$dest" 2>/dev/null || echo 0)
  if (( bytes < 1000 )); then
    warn "Download too small (${bytes}B), likely error page: $(basename "$dest")"
    rm -f "$dest"
  fi
}
# Extract plain text from a gesetze-im-internet.de statute HTML page.
#   $1 url     page URL on gesetze-im-internet.de
#   $2 output  target text file (skipped when it already exists)
#   $3 label   human-readable name used in log messages
# On extraction failure the partial output file is removed; always returns 0.
extract_gesetz_html() {
  local url="$1"
  local output="$2"
  local label="$3"
  if [[ -f "$output" ]]; then
    log "Text exists: $(basename "$output") (skipping)"
    return 0
  fi
  log "Extracting: $label from gesetze-im-internet.de"
  # $CURL_OPTS is intentionally unquoted (list of flags). The inline Python
  # strips tags/scripts and inserts newlines at block-level elements.
  curl $CURL_OPTS -L "$url" 2>/dev/null \
    | python3 -c "
import sys, codecs
# gesetze-im-internet.de uses ISO-8859-1 encoding
sys.stdin = codecs.getreader('iso-8859-1')(sys.stdin.buffer)
from html.parser import HTMLParser
class TextExtractor(HTMLParser):
    def __init__(self):
        super().__init__()
        self.text = []
        self.in_content = False
        self.skip = False
    def handle_starttag(self, tag, attrs):
        attrs_dict = dict(attrs)
        if tag == 'div' and 'jnhtml' in attrs_dict.get('class', ''):
            self.in_content = True
        if tag in ('script', 'style', 'nav', 'header', 'footer'):
            self.skip = True
    def handle_endtag(self, tag):
        if tag in ('script', 'style', 'nav', 'header', 'footer'):
            self.skip = False
        if tag in ('p', 'div', 'br', 'h1', 'h2', 'h3', 'h4', 'li'):
            self.text.append('\n')
    def handle_data(self, data):
        if not self.skip:
            self.text.append(data)
parser = TextExtractor()
parser.feed(sys.stdin.read())
print(''.join(parser.text).strip())
" > "$output" || {
    warn "Extraction failed: $label"
    rm -f "$output"
    return 0
  }
}
# Concatenate all Markdown paragraph files of one law from the
# bundestag/gesetze repo into a single text file with a title header.
#   $1 gesetz_dir  law directory inside the repo checkout
#   $2 output      target text file
#   $3 label       law name used as the document title
concat_bundestag_gesetz() {
  local src_dir="$1"
  local out_file="$2"
  local title="$3"
  if [[ ! -d "$src_dir" ]]; then
    warn "Gesetz directory not found: $src_dir"
    return 0
  fi
  log "Concatenating: $title"
  {
    printf '# %s\n\n' "$title"
    # Lexicographic order keeps the paragraph (§) files in sequence.
    find "$src_dir" -name "*.md" -type f | sort | while read -r md; do
      cat "$md"
      printf '\n---\n\n'
    done
  } > "$out_file"
}
collection_count() {
  # Print the points_count of a Qdrant collection; prints "0" on any failure
  # (unreachable server, missing collection, unparsable response).
  local name="$1"
  local auth_opt=""
  [[ -n "${QDRANT_API_KEY:-}" ]] && auth_opt="-H api-key:${QDRANT_API_KEY}"
  local points
  # $auth_opt intentionally unquoted so it splits into "-H" + value.
  points=$(curl -sk --max-time 10 $auth_opt "${QDRANT_URL}/collections/${name}" 2>/dev/null \
    | python3 -c "import sys,json; print(json.load(sys.stdin)['result']['points_count'])" 2>/dev/null) || points=""
  # Emit only a clean integer; anything else collapses to "0".
  case "$points" in
    *[!0-9]* | '') echo "0" ;;
    *) echo "$points" ;;
  esac
}
# =============================================================================
# PHASE A: Downloads
# =============================================================================
# Fetches all raw source material into $WORK_DIR:
#   A1  EUR-Lex PDFs (DSA, ePrivacy directive, SCC decision)
#   A2  single statute paragraphs scraped from gesetze-im-internet.de
#   A3  template/license git repositories (shallow clones)
#   A4  EDPB/EDPS guidance PDFs (verified URLs)
#   A5  full-text laws concatenated from the bundestag/gesetze repo
# Individual failures are tolerated (|| true); later phases simply skip
# files that turn out to be missing.
phase_download() {
  log "=========================================="
  log "PHASE A: Downloads (PDFs + Git-Repos)"
  log "=========================================="
  mkdir -p "$WORK_DIR"/{pdfs,repos,texts}
  # --- A1: EUR-Lex PDFs ---
  log "--- EUR-Lex PDFs ---"
  download_pdf \
    "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32022R2065" \
    "$WORK_DIR/pdfs/dsa_2022_2065.pdf" || true
  download_pdf \
    "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32002L0058" \
    "$WORK_DIR/pdfs/eprivacy_2002_58.pdf" || true
  download_pdf \
    "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32021D0914" \
    "$WORK_DIR/pdfs/scc_2021_914.pdf" || true
  # --- A2: German statutes (single paragraphs) ---
  log "--- Deutsche Gesetze (Einzelparagraphen) ---"
  extract_gesetz_html \
    "https://www.gesetze-im-internet.de/ddg/__5.html" \
    "$WORK_DIR/texts/ddg_5.txt" \
    "DDG § 5 (Impressum)"
  # The TDDDG is still published under its old name "ttdsg" on
  # gesetze-im-internet.de.
  extract_gesetz_html \
    "https://www.gesetze-im-internet.de/ttdsg/__25.html" \
    "$WORK_DIR/texts/tdddg_25.txt" \
    "TDDDG § 25 (Cookies)"
  extract_gesetz_html \
    "https://www.gesetze-im-internet.de/urhg/__5.html" \
    "$WORK_DIR/texts/urhg_5.txt" \
    "UrhG § 5 (Amtliche Werke)"
  # EGBGB Art. 246a § 1 (contains the reference to the model withdrawal form)
  extract_gesetz_html \
    "https://www.gesetze-im-internet.de/bgbeg/art_246a__1.html" \
    "$WORK_DIR/texts/egbgb_widerruf.txt" \
    "EGBGB Muster-Widerrufsbelehrung"
  # --- A3: Git repos ---
  log "--- Git-Repos ---"
  clone_repo "https://github.com/bundestag/gesetze.git" \
    "$WORK_DIR/repos/bundestag-gesetze"
  clone_repo "https://github.com/github/site-policy.git" \
    "$WORK_DIR/repos/github-site-policy"
  clone_repo "https://github.com/opengovfoundation/site-policy.git" \
    "$WORK_DIR/repos/opengov-site-policy"
  clone_repo "https://github.com/creativecommons/cc-legal-tools-data.git" \
    "$WORK_DIR/repos/cc-legal-tools"
  clone_repo "https://github.com/oprvc/oprvc.github.io.git" \
    "$WORK_DIR/repos/oprvc"
  clone_repo "https://github.com/webflorist/privacy-policy-text.git" \
    "$WORK_DIR/repos/webflorist"
  clone_repo "https://github.com/Tempest-Solutions-Company/privacy-policy-generator.git" \
    "$WORK_DIR/repos/tempest-privacy" || true
  clone_repo "https://github.com/Tempest-Solutions-Company/terms-of-service-generator.git" \
    "$WORK_DIR/repos/tempest-tos" || true
  clone_repo "https://github.com/Tempest-Solutions-Company/cookie-banner-consent-solution.git" \
    "$WORK_DIR/repos/tempest-cookie" || true
  clone_repo "https://github.com/orestbida/cookieconsent.git" \
    "$WORK_DIR/repos/cookieconsent" || true
  # CommonPaper keeps a separate repo per contract type.
  clone_repo "https://github.com/CommonPaper/CSA.git" \
    "$WORK_DIR/repos/common-paper-csa" || true
  clone_repo "https://github.com/CommonPaper/SLA.git" \
    "$WORK_DIR/repos/common-paper-sla" || true
  clone_repo "https://github.com/CommonPaper/PSA.git" \
    "$WORK_DIR/repos/common-paper-psa" || true
  # OpenCode.de (data-use clause templates) - try HTTPS
  clone_repo "https://gitlab.opencode.de/wernerth/datennutzungsklauseln-muster.git" \
    "$WORK_DIR/repos/datennutzungsklauseln" || true
  # --- A4: EDPB/EDPS PDFs (verified URLs) ---
  log "--- EDPB/EDPS Guidance PDFs ---"
  # EDPB Guidelines 05/2020 on Consent
  download_pdf \
    "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_202005_consent_en.pdf" \
    "$WORK_DIR/pdfs/edpb_consent_guidelines.pdf"
  # EDPB Guidelines 4/2019 Data Protection by Design and Default
  download_pdf \
    "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_201904_dataprotection_by_design_and_by_default_v2.0_en.pdf" \
    "$WORK_DIR/pdfs/edpb_privacy_by_design.pdf"
  # EDPB Guidelines 03/2022 Dark Patterns
  download_pdf \
    "https://www.edpb.europa.eu/system/files/2023-02/edpb_03-2022_guidelines_on_deceptive_design_patterns_in_social_media_platform_interfaces_v2_en_0.pdf" \
    "$WORK_DIR/pdfs/edpb_dark_patterns.pdf"
  # EDPB Guidelines 8/2020 Social Media Targeting
  download_pdf \
    "https://www.edpb.europa.eu/system/files/2021-04/edpb_guidelines_082020_on_the_targeting_of_social_media_users_en.pdf" \
    "$WORK_DIR/pdfs/edpb_social_media_targeting.pdf"
  # EDPB Cookie Banner Taskforce Report (Jan 2023)
  download_pdf \
    "https://www.edpb.europa.eu/system/files/2023-01/edpb_20230118_report_cookie_banner_taskforce_en.pdf" \
    "$WORK_DIR/pdfs/edpb_cookie_banner_taskforce.pdf"
  # EDPB Guidelines 2/2023 ePrivacy Art. 5(3) Technical Scope
  download_pdf \
    "https://www.edpb.europa.eu/system/files/2024-10/edpb_guidelines_202302_technical_scope_art_53_eprivacydirective_v2_en_0.pdf" \
    "$WORK_DIR/pdfs/edpb_eprivacy_art53.pdf"
  # EDPB Guidelines 1/2024 Legitimate Interest
  download_pdf \
    "https://www.edpb.europa.eu/system/files/2024-10/edpb_guidelines_202401_legitimateinterest_en.pdf" \
    "$WORK_DIR/pdfs/edpb_legitimate_interest.pdf"
  # EDPB DPO Coordinated Enforcement Report 2024
  download_pdf \
    "https://www.edpb.europa.eu/system/files/2024-01/edpb_report_20240116_cef_dpo_en.pdf" \
    "$WORK_DIR/pdfs/edpb_dpo_report.pdf"
  # EDPS GenAI Orientations (June 2024)
  download_pdf \
    "https://www.edps.europa.eu/system/files/2024-06/24-06-03_genai_orientations_en.pdf" \
    "$WORK_DIR/pdfs/edps_generative_ai.pdf"
  # EDPS Digital Ethics Report (2018)
  download_pdf \
    "https://edps.europa.eu/sites/edp/files/publication/18-01-25_eag_report_en.pdf" \
    "$WORK_DIR/pdfs/edps_digital_ethics.pdf"
  # --- A5: Text extraction from repos ---
  log "--- Text-Extraktion aus Repos ---"
  # bundestag/gesetze: laws available there (the repo is partially outdated).
  # DDG, TDDDG, EGBGB are missing from the repo - only BGB, UrhG, TMG exist.
  # Entry format: <repo-subdir>:<label>
  local -a bundestag_gesetze=(
    "b/bgb:BGB"
    "u/urhg:UrhG"
    "t/tmg:TMG"
  )
  for entry in "${bundestag_gesetze[@]}"; do
    local path="${entry%%:*}"
    local label="${entry##*:}"
    local gesetz_dir="$WORK_DIR/repos/bundestag-gesetze/$path"
    if [[ -d "$gesetz_dir" ]]; then
      local name
      name=$(echo "$label" | tr '[:upper:]' '[:lower:]')
      concat_bundestag_gesetz "$gesetz_dir" \
        "$WORK_DIR/texts/bundestag_${name}_komplett.txt" \
        "$label (komplett)"
    else
      warn "Bundestag Gesetz nicht gefunden: $gesetz_dir"
    fi
  done
  log "Download phase complete."
}
# =============================================================================
# PHASE B: Deutsche Gesetze → bp_compliance_gesetze
# =============================================================================
# Ingests German statutes: the single paragraphs extracted in phase A plus
# the full-text laws concatenated from the bundestag/gesetze repo.
phase_gesetze() {
  log "=========================================="
  log "PHASE B: Deutsche Gesetze → bp_compliance_gesetze"
  log "=========================================="
  local col="bp_compliance_gesetze"
  local before
  before=$(collection_count "$col")
  log "Collection $col: $before chunks (before)"
  # B1: Single paragraphs (upload_file tolerates missing files itself)
  upload_file "$WORK_DIR/texts/ddg_5.txt" "$col" "compliance" "legal_reference" "2024" \
    '{"regulation_id":"ddg_5","regulation_name_de":"Digitale-Dienste-Gesetz § 5","category":"impressum","license":"public_law","source":"gesetze-im-internet.de"}' \
    "DDG § 5 (Impressumspflicht)"
  upload_file "$WORK_DIR/texts/tdddg_25.txt" "$col" "compliance" "legal_reference" "2024" \
    '{"regulation_id":"tdddg_25","regulation_name_de":"TDDDG § 25","category":"cookies","license":"public_law","source":"gesetze-im-internet.de"}' \
    "TDDDG § 25 (Cookies/Endgeraetezugriff)"
  upload_file "$WORK_DIR/texts/urhg_5.txt" "$col" "compliance" "legal_reference" "2024" \
    '{"regulation_id":"urhg_5","regulation_name_de":"UrhG § 5","category":"urheberrecht","license":"public_law","source":"gesetze-im-internet.de"}' \
    "UrhG § 5 (Amtliche Werke)"
  upload_file "$WORK_DIR/texts/egbgb_widerruf.txt" "$col" "compliance" "legal_reference" "2024" \
    '{"regulation_id":"egbgb_widerruf","regulation_name_de":"EGBGB Muster-Widerrufsbelehrung","category":"widerruf","license":"public_law","source":"gesetze-im-internet.de"}' \
    "EGBGB Muster-Widerrufsbelehrung"
  # B2: Full-text laws from the bundestag/gesetze repo.
  # Entry format: <file-key>:<short-label>:<full-name>
  local -a bundestag_upload=(
    "bgb:BGB:Buergerliches Gesetzbuch"
    "urhg:UrhG:Urheberrechtsgesetz"
    "tmg:TMG:Telemediengesetz"
  )
  for entry in "${bundestag_upload[@]}"; do
    local gesetz="${entry%%:*}"
    local rest="${entry#*:}"
    local label="${rest%%:*}"
    local fullname="${rest#*:}"
    local file="$WORK_DIR/texts/bundestag_${gesetz}_komplett.txt"
    if [[ -f "$file" ]]; then
      upload_file "$file" "$col" "compliance" "legal_reference" "2024" \
        "{\"regulation_id\":\"${gesetz}_komplett\",\"regulation_name_de\":\"$fullname ($label komplett)\",\"category\":\"volltext\",\"license\":\"unlicense\",\"source\":\"github.com/bundestag/gesetze\"}" \
        "$label komplett (Bundestag-Repo)"
    fi
  done
  local after
  after=$(collection_count "$col")
  # "before → after" makes the chunk growth of this phase visible in the log.
  log "Collection $col: $before → $after chunks"
}
# =============================================================================
# PHASE C: EU-Rechtstexte → bp_compliance_ce
# =============================================================================
# Ingests the three EUR-Lex PDFs downloaded in phase A (DSA, ePrivacy, SCC).
phase_eu() {
  log "=========================================="
  log "PHASE C: EU-Rechtstexte → bp_compliance_ce"
  log "=========================================="
  local col="bp_compliance_ce"
  local before
  before=$(collection_count "$col")
  log "Collection $col: $before chunks (before)"
  upload_file "$WORK_DIR/pdfs/dsa_2022_2065.pdf" "$col" "compliance_ce" "legal_reference" "2022" \
    '{"regulation_id":"eu_2022_2065","regulation_name_de":"Digital Services Act (DSA)","regulation_name_en":"Digital Services Act","regulation_short":"DSA","category":"plattformregulierung","celex":"32022R2065","source":"eur-lex","license":"public_law"}' \
    "Digital Services Act (EU) 2022/2065"
  upload_file "$WORK_DIR/pdfs/eprivacy_2002_58.pdf" "$col" "compliance_ce" "legal_reference" "2002" \
    '{"regulation_id":"eu_2002_58","regulation_name_de":"ePrivacy-Richtlinie","regulation_name_en":"ePrivacy Directive","regulation_short":"ePrivacy","category":"datenschutz","celex":"32002L0058","source":"eur-lex","license":"public_law"}' \
    "ePrivacy-Richtlinie 2002/58/EC"
  upload_file "$WORK_DIR/pdfs/scc_2021_914.pdf" "$col" "compliance_ce" "legal_reference" "2021" \
    '{"regulation_id":"eu_2021_914","regulation_name_de":"Standardvertragsklauseln (SCC)","regulation_name_en":"Standard Contractual Clauses","regulation_short":"SCC","category":"datentransfer","celex":"32021D0914","source":"eur-lex","license":"public_law"}' \
    "Standardvertragsklauseln (EU) 2021/914"
  local after
  after=$(collection_count "$col")
  # "before → after" makes the chunk growth of this phase visible in the log.
  log "Collection $col: $before → $after chunks"
}
# =============================================================================
# PHASE D: Templates/Textbausteine → bp_legal_templates
# =============================================================================
# Ingests policy/contract templates from the repos cloned in phase A.
# NOTE: every file loop uses process substitution (done < <(find …)) instead
# of "find … | while read"; the pipe form ran upload_file in a subshell,
# silently discarding its UPLOADED/FAILED/SKIPPED counter updates.
phase_templates() {
  log "=========================================="
  log "PHASE D: Templates → bp_legal_templates"
  log "=========================================="
  local col="bp_legal_templates"
  local before
  before=$(collection_count "$col")
  log "Collection $col: $before chunks (before)"
  # --- D1: GitHub Site Policy (CC0) ---
  local repo="$WORK_DIR/repos/github-site-policy"
  if [[ -d "$repo" ]]; then
    log "--- GitHub Site Policy ---"
    while read -r f; do
      local basename
      basename=$(basename "$f" .md)
      # Classify the document type from the file name.
      local doc_type="policy"
      case "$basename" in
        *terms*|*tos*|*service*) doc_type="tos" ;;
        *privacy*|*data*) doc_type="privacy_policy" ;;
        *dmca*|*copyright*) doc_type="dmca" ;;
        *acceptable*|*use*) doc_type="acceptable_use" ;;
      esac
      upload_file "$f" "$col" "legal_template" "template" "2024" \
        "{\"source_id\":\"github_site_policy\",\"doc_type\":\"$doc_type\",\"license\":\"cc0\",\"source\":\"github.com/github/site-policy\",\"filename\":\"$basename\"}" \
        "GitHub: $basename"
    done < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" -not -name "CONTRIBUTING.md" | sort)
  fi
  # --- D2: OpenGov Site Policy (CC0) ---
  repo="$WORK_DIR/repos/opengov-site-policy"
  if [[ -d "$repo" ]]; then
    log "--- OpenGov Site Policy ---"
    while read -r f; do
      local basename
      basename=$(basename "$f" .md)
      local doc_type="policy"
      case "$basename" in
        *terms*|*tos*) doc_type="tos" ;;
        *privacy*) doc_type="privacy_policy" ;;
      esac
      upload_file "$f" "$col" "legal_template" "template" "2024" \
        "{\"source_id\":\"opengov_site_policy\",\"doc_type\":\"$doc_type\",\"license\":\"cc0\",\"source\":\"github.com/opengovfoundation/site-policy\",\"filename\":\"$basename\"}" \
        "OpenGov: $basename"
    done < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" -not -name "CONTRIBUTING.md" | sort)
  fi
  # --- D3: Creative Commons Legal Tools (CC0) ---
  repo="$WORK_DIR/repos/cc-legal-tools"
  if [[ -d "$repo" ]]; then
    log "--- CC Legal Tools (ausgewaehlte Lizenztexte) ---"
    # Only ingest the main license deeds (DE legalcode where available, else EN)
    for license_dir in "$repo"/legalcode/de/CC0_1.0 "$repo"/legalcode/de/CC-BY_4.0 "$repo"/legalcode/de/CC-BY-SA_4.0; do
      if [[ -d "$license_dir" ]]; then
        while read -r f; do
          local basename
          basename=$(basename "$f")
          upload_file "$f" "$col" "legal_template" "template" "2024" \
            "{\"source_id\":\"cc_legal_tools\",\"doc_type\":\"license_text\",\"license\":\"cc0\",\"source\":\"github.com/creativecommons/cc-legal-tools-data\",\"filename\":\"$basename\"}" \
            "CC License: $basename"
        done < <(find "$license_dir" -name "*.html" -o -name "*.txt" -o -name "*.md" 2>/dev/null | head -3)
      fi
    done
    # Fallback: try top-level legalcode files
    while read -r f; do
      local basename
      basename=$(basename "$f")
      upload_file "$f" "$col" "legal_template" "template" "2024" \
        "{\"source_id\":\"cc_legal_tools\",\"doc_type\":\"license_text\",\"license\":\"cc0\",\"source\":\"github.com/creativecommons/cc-legal-tools-data\",\"filename\":\"$basename\"}" \
        "CC License: $basename"
    done < <(find "$repo"/legalcode -maxdepth 2 -name "*4.0*legalcode*de*" -type f 2>/dev/null | head -5)
  fi
  # --- D4: opr.vc DSGVO sample texts (CC0) ---
  repo="$WORK_DIR/repos/oprvc"
  if [[ -d "$repo" ]]; then
    log "--- opr.vc DSGVO-Mustertexte ---"
    # Look for German privacy/DSGVO content
    while read -r f; do
      local basename
      basename=$(basename "$f")
      upload_file "$f" "$col" "legal_template" "template" "2024" \
        "{\"source_id\":\"oprvc\",\"doc_type\":\"privacy_policy\",\"license\":\"cc0\",\"source\":\"github.com/oprvc/oprvc.github.io\",\"filename\":\"$basename\"}" \
        "opr.vc: $basename"
    done < <(find "$repo" \( -name "*.md" -o -name "*.html" -o -name "*.txt" \) \
               -not -path "*/.git/*" -not -name "README.md" 2>/dev/null \
             | grep -iE "(datenschutz|privacy|dsgvo|gdpr|impressum)" \
             | head -20)
    # If no specific files found, try all markdown files
    if [[ $(find "$repo" \( -name "*.md" -o -name "*.html" \) -not -path "*/.git/*" -not -name "README.md" | grep -ciE "(datenschutz|privacy|dsgvo|gdpr)" 2>/dev/null) -eq 0 ]]; then
      while read -r f; do
        local basename
        basename=$(basename "$f")
        upload_file "$f" "$col" "legal_template" "template" "2024" \
          "{\"source_id\":\"oprvc\",\"doc_type\":\"privacy_policy\",\"license\":\"cc0\",\"source\":\"github.com/oprvc/oprvc.github.io\",\"filename\":\"$basename\"}" \
          "opr.vc: $basename"
      done < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" | head -10)
    fi
  fi
  # --- D5: webflorist/privacy-policy-text (MIT) ---
  repo="$WORK_DIR/repos/webflorist"
  if [[ -d "$repo" ]]; then
    log "--- webflorist Privacy Policy Text ---"
    # Look for JSON/text building blocks (German)
    while read -r f; do
      local basename
      basename=$(basename "$f")
      upload_file "$f" "$col" "legal_template" "template" "2024" \
        "{\"source_id\":\"webflorist\",\"doc_type\":\"privacy_policy\",\"license\":\"mit\",\"source\":\"github.com/webflorist/privacy-policy-text\",\"filename\":\"$basename\"}" \
        "webflorist: $basename"
    done < <(find "$repo" \( -name "*.json" -o -name "*.txt" -o -name "*.md" -o -name "*.php" \) \
               -not -path "*/.git/*" -not -path "*/node_modules/*" -not -name "package*.json" \
               -not -name "composer.json" -not -name "README.md" 2>/dev/null \
             | head -20)
  fi
  # --- D6: Tempest Privacy Policy Generator (MIT) ---
  repo="$WORK_DIR/repos/tempest-privacy"
  if [[ -d "$repo" ]]; then
    log "--- Tempest Privacy Policy Generator ---"
    while read -r f; do
      local basename
      basename=$(basename "$f")
      upload_file "$f" "$col" "legal_template" "template" "2024" \
        "{\"source_id\":\"tempest_privacy\",\"doc_type\":\"privacy_policy\",\"license\":\"mit\",\"source\":\"github.com/Tempest-Solutions-Company/privacy-policy-generator\",\"filename\":\"$basename\"}" \
        "Tempest Privacy: $basename"
    done < <(find "$repo" \( -name "*.md" -o -name "*.txt" -o -name "*.html" -o -name "*.json" \) \
               -not -path "*/.git/*" -not -path "*/node_modules/*" \
               -not -name "package*.json" -not -name "README.md" 2>/dev/null \
             | head -15)
  fi
  # --- D7: Tempest Terms of Service Generator (MIT) ---
  repo="$WORK_DIR/repos/tempest-tos"
  if [[ -d "$repo" ]]; then
    log "--- Tempest Terms of Service Generator ---"
    while read -r f; do
      local basename
      basename=$(basename "$f")
      upload_file "$f" "$col" "legal_template" "template" "2024" \
        "{\"source_id\":\"tempest_tos\",\"doc_type\":\"tos\",\"license\":\"mit\",\"source\":\"github.com/Tempest-Solutions-Company/terms-of-service-generator\",\"filename\":\"$basename\"}" \
        "Tempest ToS: $basename"
    done < <(find "$repo" \( -name "*.md" -o -name "*.txt" -o -name "*.html" -o -name "*.json" \) \
               -not -path "*/.git/*" -not -path "*/node_modules/*" \
               -not -name "package*.json" -not -name "README.md" 2>/dev/null \
             | head -15)
  fi
  # --- D8: Tempest Cookie Banner (MIT) ---
  repo="$WORK_DIR/repos/tempest-cookie"
  if [[ -d "$repo" ]]; then
    log "--- Tempest Cookie Banner ---"
    while read -r f; do
      local basename
      basename=$(basename "$f")
      upload_file "$f" "$col" "legal_template" "template" "2024" \
        "{\"source_id\":\"tempest_cookie\",\"doc_type\":\"cookie_banner\",\"license\":\"mit\",\"source\":\"github.com/Tempest-Solutions-Company/cookie-banner-consent-solution\",\"filename\":\"$basename\"}" \
        "Tempest Cookie: $basename"
    done < <(find "$repo" \( -name "*.md" -o -name "*.txt" -o -name "*.html" -o -name "*.json" \) \
               -not -path "*/.git/*" -not -path "*/node_modules/*" \
               -not -name "package*.json" -not -name "README.md" 2>/dev/null \
             | head -15)
  fi
  # --- D9: CookieConsent (orestbida) - UI Strings (MIT) ---
  repo="$WORK_DIR/repos/cookieconsent"
  if [[ -d "$repo" ]]; then
    log "--- CookieConsent UI Strings ---"
    # Look for translation/language files
    while read -r f; do
      local basename
      basename=$(basename "$f")
      upload_file "$f" "$col" "legal_template" "template" "2024" \
        "{\"source_id\":\"cookieconsent\",\"doc_type\":\"cookie_consent\",\"license\":\"mit\",\"source\":\"github.com/orestbida/cookieconsent\",\"filename\":\"$basename\"}" \
        "CookieConsent: $basename"
    done < <(find "$repo" -path "*/translations/*" -o -path "*/languages/*" -o -path "*/i18n/*" -o -path "*/locales/*" 2>/dev/null \
             | grep -iE "\.(json|js|ts)$" | head -10)
    # Also check for example configs
    while read -r f; do
      local basename
      basename=$(basename "$f")
      upload_file "$f" "$col" "legal_template" "template" "2024" \
        "{\"source_id\":\"cookieconsent\",\"doc_type\":\"cookie_consent\",\"license\":\"mit\",\"source\":\"github.com/orestbida/cookieconsent\",\"filename\":\"$basename\"}" \
        "CookieConsent Docs: $basename"
    done < <(find "$repo" -name "*.md" -path "*/docs/*" 2>/dev/null | head -5)
  fi
  # --- D10: Common Paper (CC BY 4.0) ---
  log "--- Common Paper Standards ---"
  # Entry format: <repo-dir>:<doc_type>:<label>
  local -a cp_repos=(
    "common-paper-csa:saas_contract:CSA"
    "common-paper-sla:sla:SLA"
    "common-paper-psa:psa:PSA"
  )
  for entry in "${cp_repos[@]}"; do
    local cp_dir="${entry%%:*}"
    local rest="${entry#*:}"
    local cp_doc_type="${rest%%:*}"
    local cp_label="${rest#*:}"
    repo="$WORK_DIR/repos/$cp_dir"
    if [[ -d "$repo" ]]; then
      while read -r f; do
        local basename
        basename=$(basename "$f" .md)
        upload_file "$f" "$col" "legal_template" "template" "2024" \
          "{\"source_id\":\"common_paper\",\"doc_type\":\"$cp_doc_type\",\"license\":\"cc_by_4\",\"attribution\":\"Common Paper Inc., licensed under CC BY 4.0\",\"source\":\"github.com/CommonPaper/$cp_label\",\"filename\":\"$basename\"}" \
          "CommonPaper $cp_label: $basename"
      done < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" \
                 -not -name "CONTRIBUTING.md" -not -name "CHANGELOG.md" -not -name "CODE_OF_CONDUCT.md" 2>/dev/null \
               | head -10)
    fi
  done
  # --- D11: Datennutzungsklauseln (CC BY 4.0) ---
  repo="$WORK_DIR/repos/datennutzungsklauseln"
  if [[ -d "$repo" ]]; then
    log "--- Datennutzungsklauseln ---"
    while read -r f; do
      local basename
      basename=$(basename "$f" .md)
      upload_file "$f" "$col" "legal_template" "template" "2024" \
        "{\"source_id\":\"datennutzungsklauseln\",\"doc_type\":\"data_clause\",\"license\":\"cc_by_4\",\"attribution\":\"OpenCode.de, lizenziert unter CC BY 4.0\",\"source\":\"gitlab.opencode.de/wernerth/datennutzungsklauseln-muster\",\"filename\":\"$basename\"}" \
        "Datennutzungsklausel: $basename"
    done < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" 2>/dev/null \
             | head -15)
  fi
  local after
  after=$(collection_count "$col")
  # "before → after" makes the chunk growth of this phase visible in the log.
  log "Collection $col: $before → $after chunks"
}
# =============================================================================
# PHASE E: Datenschutz-Guidance → bp_compliance_datenschutz
# =============================================================================
# Uploads every EDPB/EDPS guidance PDF fetched in phase A. The guideline name
# in the metadata is derived from the file name (prefix stripped, "_" → " ").
phase_datenschutz() {
  log "=========================================="
  log "PHASE E: Datenschutz-Guidance → bp_compliance_datenschutz"
  log "=========================================="
  local col="bp_compliance_datenschutz"
  local before
  before=$(collection_count "$col")
  log "Collection $col: $before chunks (before)"
  # EDPB Guidelines
  for pdf in "$WORK_DIR"/pdfs/edpb_*.pdf; do
    if [[ -f "$pdf" ]]; then
      local basename
      basename=$(basename "$pdf" .pdf)
      # e.g. "edpb_dark_patterns" → "dark patterns"
      local guideline_name="${basename#edpb_}"
      guideline_name="${guideline_name//_/ }"
      upload_file "$pdf" "$col" "compliance_datenschutz" "guidance" "2024" \
        "{\"source_id\":\"edpb\",\"doc_type\":\"guidance\",\"guideline_name\":\"$guideline_name\",\"license\":\"reuse_notice\",\"attribution\":\"European Data Protection Board (EDPB)\",\"source\":\"edpb.europa.eu\"}" \
        "EDPB: $guideline_name"
    fi
  done
  # EDPS Guidance
  for pdf in "$WORK_DIR"/pdfs/edps_*.pdf; do
    if [[ -f "$pdf" ]]; then
      local basename
      basename=$(basename "$pdf" .pdf)
      local guidance_name="${basename#edps_}"
      guidance_name="${guidance_name//_/ }"
      upload_file "$pdf" "$col" "compliance_datenschutz" "guidance" "2024" \
        "{\"source_id\":\"edps\",\"doc_type\":\"guidance\",\"guidance_name\":\"$guidance_name\",\"license\":\"reuse_notice\",\"attribution\":\"European Data Protection Supervisor (EDPS)\",\"source\":\"edps.europa.eu\"}" \
        "EDPS: $guidance_name"
    fi
  done
  local after
  after=$(collection_count "$col")
  # "before → after" makes the chunk growth of this phase visible in the log.
  log "Collection $col: $before → $after chunks"
}
# =============================================================================
# PHASE H: Layer 1 Safe Core — Verbraucherschutz, EU-Recht, NIST
# =============================================================================
# ~60 Dokumente: EUR-Lex (CC BY 4.0), gesetze-im-internet.de (Public Domain),
# NIST (Public Domain), HLEG (CC BY 4.0)
# =============================================================================
phase_verbraucherschutz() {
log "=========================================="
log "PHASE H: Layer 1 Safe Core (~60 Dokumente)"
log "=========================================="
mkdir -p "$WORK_DIR"/{pdfs,texts}
# =========================================================================
# H1: Deutsche Gesetze → bp_compliance_gesetze
# Quelle: gesetze-im-internet.de (Public Domain, § 5 UrhG)
# =========================================================================
local col="bp_compliance_gesetze"
local before
before=$(collection_count "$col")
log "--- H1: Deutsche Gesetze → $col ($before chunks) ---"
# Verbraucherschutz-Gesetze
local -a de_gesetze=(
"pangv_2022/PAngV:PAngV:Preisangabenverordnung:verbraucherschutz"
"vsbg/VSBG:VSBG:Verbraucherstreitbeilegungsgesetz:verbraucherschutz"
"prodhaftg/ProdHaftG:ProdHaftG:Produkthaftungsgesetz:verbraucherschutz"
"verpackg/VerpackG:VerpackG:Verpackungsgesetz:verbraucherschutz"
"elektrog_2015/ElektroG:ElektroG:Elektro- und Elektronikgeraetegesetz:verbraucherschutz"
"battdg/BattDG:BattDG:Batteriegesetz:verbraucherschutz"
"bfsg/BFSG:BFSG:Barrierefreiheitsstaerkungsgesetz:verbraucherschutz"
"uwg_2004/UWG:UWG:Gesetz gegen den unlauteren Wettbewerb:verbraucherschutz"
# Datenschutz + IT
"bdsg_2018/BDSG:BDSG:Bundesdatenschutzgesetz:datenschutz"
"ddg/DDG:DDG:Digitale-Dienste-Gesetz:ecommerce"
"tkg_2021/TKG:TKG:Telekommunikationsgesetz:datenschutz"
# Handels-/Steuerrecht (Loeschfristen)
"hgb/HGB:HGB:Handelsgesetzbuch:aufbewahrung"
"ao_1977/AO:AO:Abgabenordnung:aufbewahrung"
# Gewerberecht
"gewo/GewO:GewO:Gewerbeordnung:gewerberecht"
)
for entry in "${de_gesetze[@]}"; do
local path="${entry%%:*}"
local rest="${entry#*:}"
local short="${rest%%:*}"
rest="${rest#*:}"
local fullname="${rest%%:*}"
local category="${rest#*:}"
local pdf_file="$WORK_DIR/pdfs/${short}.pdf"
download_pdf \
"https://www.gesetze-im-internet.de/${path}.pdf" \
"$pdf_file" || true
if [[ -f "$pdf_file" ]]; then
upload_file "$pdf_file" "$col" "compliance" "legal_reference" "2025" \
"{\"regulation_id\":\"${short,,}\",\"regulation_name_de\":\"$fullname ($short)\",\"category\":\"$category\",\"license\":\"public_domain_§5_UrhG\",\"source\":\"gesetze-im-internet.de\"}" \
"$short ($fullname)"
fi
done
# BGB in Teilen statt komplett (2.7MB PDF ist zu gross fuer CPU-Embeddings)
# gesetze-im-internet.de bietet XML-Download pro Gesetz
local bgb_xml="$WORK_DIR/pdfs/bgb_xml.zip"
curl -sL "https://www.gesetze-im-internet.de/bgb/xml.zip" -o "$bgb_xml" 2>/dev/null
if [[ -f "$bgb_xml" && $(stat -f%z "$bgb_xml" 2>/dev/null || stat -c%s "$bgb_xml" 2>/dev/null || echo 0) -gt 1000 ]]; then
local bgb_extract="$WORK_DIR/pdfs/bgb_xml"
mkdir -p "$bgb_extract"
unzip -qo "$bgb_xml" -d "$bgb_extract" 2>/dev/null || true
# Relevante BGB-Abschnitte als Text extrahieren und einzeln uploaden
# Die XML-Datei hat <norm> Elemente mit <metadaten><enbez>§ 305</enbez>
local bgb_xmlfile
bgb_xmlfile=$(find "$bgb_extract" -name "*.xml" | head -1)
if [[ -n "$bgb_xmlfile" ]]; then
# BGB Teil 1: AGB-Recht §§ 305-310
python3 -c "
import xml.etree.ElementTree as ET, sys, re
tree = ET.parse('$bgb_xmlfile')
root = tree.getroot()
ns = {'': root.tag.split('}')[0].lstrip('{') if '}' in root.tag else ''}
text_parts = []
capture = False
for norm in root.iter():
if norm.tag.endswith('norm'):
enbez = norm.find('.//' + ('{' + ns[''] + '}' if ns[''] else '') + 'enbez')
if enbez is not None and enbez.text:
num = re.search(r'§\s*(\d+)', enbez.text)
if num:
n = int(num.group(1))
capture = 305 <= n <= 310
else:
capture = False
if capture:
for t in norm.itertext():
text_parts.append(t.strip())
with open('$WORK_DIR/pdfs/BGB_AGB_305_310.txt', 'w') as f:
f.write('BGB AGB-Recht §§ 305-310\n\n' + '\n'.join(p for p in text_parts if p))
" 2>/dev/null
if [[ -f "$WORK_DIR/pdfs/BGB_AGB_305_310.txt" && $(wc -c < "$WORK_DIR/pdfs/BGB_AGB_305_310.txt") -gt 100 ]]; then
upload_file "$WORK_DIR/pdfs/BGB_AGB_305_310.txt" "$col" "compliance" "legal_reference" "2025" \
'{"regulation_id":"bgb_agb","regulation_name_de":"BGB AGB-Recht (§§ 305-310)","category":"vertragsrecht","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \
"BGB AGB-Recht §§ 305-310"
fi
# BGB Teil 2: Fernabsatzrecht §§ 312-312k
python3 -c "
import xml.etree.ElementTree as ET, sys, re
tree = ET.parse('$bgb_xmlfile')
root = tree.getroot()
text_parts = []
capture = False
for norm in root.iter():
if norm.tag.endswith('norm'):
enbez = norm.find('.//' + ('{' + root.tag.split('}')[0].lstrip('{') + '}' if '}' in root.tag else '') + 'enbez')
if enbez is not None and enbez.text:
if re.search(r'§\s*312', enbez.text):
capture = True
elif re.search(r'§\s*31[3-9]|§\s*32', enbez.text):
capture = False
else:
if capture and not any(norm.itertext()):
capture = False
if capture:
for t in norm.itertext():
text_parts.append(t.strip())
with open('$WORK_DIR/pdfs/BGB_Fernabsatz_312.txt', 'w') as f:
f.write('BGB Fernabsatzrecht §§ 312-312k\n\n' + '\n'.join(p for p in text_parts if p))
" 2>/dev/null
if [[ -f "$WORK_DIR/pdfs/BGB_Fernabsatz_312.txt" && $(wc -c < "$WORK_DIR/pdfs/BGB_Fernabsatz_312.txt") -gt 100 ]]; then
upload_file "$WORK_DIR/pdfs/BGB_Fernabsatz_312.txt" "$col" "compliance" "legal_reference" "2025" \
'{"regulation_id":"bgb_fernabsatz","regulation_name_de":"BGB Fernabsatzrecht (§§ 312-312k)","category":"verbraucherschutz","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \
"BGB Fernabsatzrecht §§ 312-312k"
fi
# BGB Teil 3: Kaufrecht + Gewährleistung §§ 433-480
python3 -c "
import xml.etree.ElementTree as ET, sys, re
tree = ET.parse('$bgb_xmlfile')
root = tree.getroot()
text_parts = []
capture = False
for norm in root.iter():
if norm.tag.endswith('norm'):
enbez = norm.find('.//' + ('{' + root.tag.split('}')[0].lstrip('{') + '}' if '}' in root.tag else '') + 'enbez')
if enbez is not None and enbez.text:
num = re.search(r'§\s*(\d+)', enbez.text)
if num:
n = int(num.group(1))
capture = 433 <= n <= 480
else:
capture = False
if capture:
for t in norm.itertext():
text_parts.append(t.strip())
with open('$WORK_DIR/pdfs/BGB_Kaufrecht_433_480.txt', 'w') as f:
f.write('BGB Kaufrecht §§ 433-480\n\n' + '\n'.join(p for p in text_parts if p))
" 2>/dev/null
if [[ -f "$WORK_DIR/pdfs/BGB_Kaufrecht_433_480.txt" && $(wc -c < "$WORK_DIR/pdfs/BGB_Kaufrecht_433_480.txt") -gt 100 ]]; then
upload_file "$WORK_DIR/pdfs/BGB_Kaufrecht_433_480.txt" "$col" "compliance" "legal_reference" "2025" \
'{"regulation_id":"bgb_kaufrecht","regulation_name_de":"BGB Kaufrecht + Gewaehrleistung (§§ 433-480)","category":"verbraucherschutz","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \
"BGB Kaufrecht §§ 433-480"
fi
# BGB Teil 4: Widerrufsrecht §§ 355-361
python3 -c "
import xml.etree.ElementTree as ET, sys, re
tree = ET.parse('$bgb_xmlfile')
root = tree.getroot()
text_parts = []
capture = False
for norm in root.iter():
if norm.tag.endswith('norm'):
enbez = norm.find('.//' + ('{' + root.tag.split('}')[0].lstrip('{') + '}' if '}' in root.tag else '') + 'enbez')
if enbez is not None and enbez.text:
num = re.search(r'§\s*(\d+)', enbez.text)
if num:
n = int(num.group(1))
capture = 355 <= n <= 361
else:
capture = False
if capture:
for t in norm.itertext():
text_parts.append(t.strip())
with open('$WORK_DIR/pdfs/BGB_Widerruf_355_361.txt', 'w') as f:
f.write('BGB Widerrufsrecht §§ 355-361\n\n' + '\n'.join(p for p in text_parts if p))
" 2>/dev/null
if [[ -f "$WORK_DIR/pdfs/BGB_Widerruf_355_361.txt" && $(wc -c < "$WORK_DIR/pdfs/BGB_Widerruf_355_361.txt") -gt 100 ]]; then
upload_file "$WORK_DIR/pdfs/BGB_Widerruf_355_361.txt" "$col" "compliance" "legal_reference" "2025" \
'{"regulation_id":"bgb_widerruf","regulation_name_de":"BGB Widerrufsrecht (§§ 355-361)","category":"verbraucherschutz","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \
"BGB Widerrufsrecht §§ 355-361"
fi
# BGB Teil 5: Digitale Produkte §§ 327-327u
python3 -c "
import xml.etree.ElementTree as ET, sys, re
tree = ET.parse('$bgb_xmlfile')
root = tree.getroot()
text_parts = []
capture = False
for norm in root.iter():
if norm.tag.endswith('norm'):
enbez = norm.find('.//' + ('{' + root.tag.split('}')[0].lstrip('{') + '}' if '}' in root.tag else '') + 'enbez')
if enbez is not None and enbez.text:
if re.search(r'§\s*327', enbez.text):
capture = True
elif re.search(r'§\s*328', enbez.text):
capture = False
if capture:
for t in norm.itertext():
text_parts.append(t.strip())
with open('$WORK_DIR/pdfs/BGB_Digital_327.txt', 'w') as f:
f.write('BGB Digitale Produkte §§ 327-327u\n\n' + '\n'.join(p for p in text_parts if p))
" 2>/dev/null
if [[ -f "$WORK_DIR/pdfs/BGB_Digital_327.txt" && $(wc -c < "$WORK_DIR/pdfs/BGB_Digital_327.txt") -gt 100 ]]; then
upload_file "$WORK_DIR/pdfs/BGB_Digital_327.txt" "$col" "compliance" "legal_reference" "2025" \
'{"regulation_id":"bgb_digital","regulation_name_de":"BGB Digitale Produkte (§§ 327-327u)","category":"verbraucherschutz","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \
"BGB Digitale Produkte §§ 327-327u"
fi
else
warn "BGB XML file not found in archive"
fi
else
warn "BGB XML download failed"
fi
# EGBGB — fetched as XML instead of PDF (the BGBEG.pdf download was empty)
local egbgb_xml="$WORK_DIR/pdfs/bgbeg_xml.zip"
curl -sL "https://www.gesetze-im-internet.de/bgbeg/xml.zip" -o "$egbgb_xml" 2>/dev/null
# Guard against error pages: require > 1000 bytes (stat -f%z = macOS/BSD,
# stat -c%s = GNU/Linux fallback).
if [[ -f "$egbgb_xml" && $(stat -f%z "$egbgb_xml" 2>/dev/null || stat -c%s "$egbgb_xml" 2>/dev/null || echo 0) -gt 1000 ]]; then
  local egbgb_extract="$WORK_DIR/pdfs/egbgb_xml"
  mkdir -p "$egbgb_extract"
  unzip -qo "$egbgb_xml" -d "$egbgb_extract" 2>/dev/null || true
  local egbgb_xmlfile
  egbgb_xmlfile=$(find "$egbgb_extract" -name "*.xml" | head -1)
  if [[ -n "$egbgb_xmlfile" ]]; then
    # Art. 246a EGBGB (distance-selling information duties) plus Annexes 1+2
    # (model withdrawal instruction and withdrawal form).  The inline Python
    # keyword-matches each <norm> element's full text and keeps matching norms.
    python3 -c "
import xml.etree.ElementTree as ET
tree = ET.parse('$egbgb_xmlfile')
root = tree.getroot()
text_parts = []
for norm in root.iter():
    if norm.tag.endswith('norm'):
        full_text = ' '.join(t.strip() for t in norm.itertext() if t.strip())
        # Only capture Art. 246/246a/246b/246c and Anlage (Muster-Widerrufsbelehrung)
        if any(kw in full_text for kw in ['Art. 246', 'Artikel 246', '§ 246', 'Anlage 1', 'Anlage 2', 'Widerrufsbelehrung', 'Widerrufsformular']):
            parts = [t.strip() for t in norm.itertext() if t.strip()]
            text_parts.extend(parts)
# Limit output to avoid timeout (max 100KB)
output = 'EGBGB - Informationspflichten und Muster-Widerrufsbelehrung (Art. 246a + Anlage 1+2)\n\n' + '\n'.join(text_parts)
with open('$WORK_DIR/pdfs/EGBGB_Widerruf.txt', 'w') as f:
    f.write(output[:100000])
" 2>/dev/null
    # Upload only if the extraction produced meaningful text (> 100 bytes).
    if [[ -f "$WORK_DIR/pdfs/EGBGB_Widerruf.txt" && $(wc -c < "$WORK_DIR/pdfs/EGBGB_Widerruf.txt") -gt 100 ]]; then
      upload_file "$WORK_DIR/pdfs/EGBGB_Widerruf.txt" "$col" "compliance" "legal_reference" "2025" \
        '{"regulation_id":"egbgb","regulation_name_de":"EGBGB (Muster-Widerrufsbelehrung, Art. 246a + Anlage 1+2)","category":"verbraucherschutz","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \
        "EGBGB Muster-Widerrufsbelehrung"
    fi
  fi
else
  warn "EGBGB XML download failed"
fi
local after
after=$(collection_count "$col")
local diff=$(( after - before ))
log "Collection $col: ${before:-?}${after:-?} chunks (+${diff})"
# =========================================================================
# H2: EU law → bp_compliance_ce
# Source: EUR-Lex (CC BY 4.0, reuse permitted)
# URL pattern: /legal-content/DE/TXT/PDF/?uri=CELEX:{id}
# =========================================================================
col="bp_compliance_ce"
before=$(collection_count "$col")
log "--- H2: EU-Recht → $col ($before chunks) ---"
# Entry format (colon-separated, no field contains a colon):
#   CELEX_ID:filename:short:name_de:name_en:category:year
local -a eu_gesetze=(
  # --- Data protection ---
  "32016R0679:DSGVO_2016_679:DSGVO:Datenschutz-Grundverordnung:General Data Protection Regulation:datenschutz:2016"
  # --- Consumer protection (core set) ---
  "32011L0083:Consumer_Rights_2011_83:CRD:Verbraucherrechte-Richtlinie:Consumer Rights Directive:verbraucherschutz:2011"
  "32019L0770:Digital_Content_2019_770:DCD:Richtlinie digitale Inhalte:Digital Content Directive:verbraucherschutz:2019"
  "32019L0771:Sale_of_Goods_2019_771:SGD:Warenkauf-Richtlinie:Sale of Goods Directive:verbraucherschutz:2019"
  "32000L0031:ECommerce_2000_31:ECD:E-Commerce-Richtlinie:E-Commerce Directive:ecommerce:2000"
  "31993L0013:Unfair_Terms_93_13:UCTD:Klausel-Richtlinie:Unfair Contract Terms Directive:verbraucherschutz:1993"
  "32005L0029:Unfair_Practices_2005_29:UCPD:Richtlinie unlautere Geschaeftspraktiken:Unfair Commercial Practices Directive:verbraucherschutz:2005"
  "31998L0006:Price_Indication_98_6:PID:Preisangaben-Richtlinie:Price Indication Directive:verbraucherschutz:1998"
  "32019L2161:Omnibus_2019_2161:OMN:Omnibus-Richtlinie (Modernisierung Verbraucherschutz):Omnibus Directive:verbraucherschutz:2019"
  # --- Platform regulation ---
  "32022R1925:DMA_2022_1925:DMA:Digital Markets Act:Digital Markets Act:plattformregulierung:2022"
  # --- AI + security ---
  "32024R1689:AI_Act_2024_1689:AI_Act:KI-Verordnung:Artificial Intelligence Act:ki_regulierung:2024"
  "32022L2555:NIS2_2022_2555:NIS2:NIS-2-Richtlinie:NIS2 Directive:it_sicherheit:2022"
  # --- Product safety + liability ---
  "32023R0988:GPSR_2023_988:GPSR:Allgemeine Produktsicherheitsverordnung:General Product Safety Regulation:produktsicherheit:2023"
  "31985L0374:Product_Liability_85_374:PLD:Produkthaftungs-Richtlinie:Product Liability Directive:produkthaftung:1985"
  "32023R1542:Batterie_VO_2023_1542:BattVO:Batterieverordnung:Battery Regulation:produktsicherheit:2023"
  # --- Data transfer ---
  # SCC already ingested in Phase C — not duplicated here.
)
# Ingest every EU act from the eu_gesetze catalogue.
for entry in "${eu_gesetze[@]}"; do
  IFS=':' read -r celex filename short name_de name_en category year <<< "$entry"
  # BUGFIX: was "$WORK_DIR/pdfs/$(unknown).pdf" — the command substitution
  # failed (no such command), expanded to "", and every EU act was written
  # to the same file "$WORK_DIR/pdfs/.pdf". Use the parsed filename field.
  local pdf_file="$WORK_DIR/pdfs/${filename}.pdf"
  # The AI Act PDF is only published under an OJ URI, not a CELEX URI.
  if [[ "$celex" == "32024R1689" ]]; then
    download_pdf \
      "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=OJ:L_202401689" \
      "$pdf_file" || true
  else
    download_pdf \
      "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:${celex}" \
      "$pdf_file" || true
  fi
  if [[ -f "$pdf_file" ]]; then
    upload_file "$pdf_file" "$col" "compliance_ce" "legal_reference" "$year" \
      "{\"regulation_id\":\"${short,,}\",\"regulation_name_de\":\"$name_de\",\"regulation_name_en\":\"$name_en\",\"regulation_short\":\"$short\",\"celex\":\"$celex\",\"category\":\"$category\",\"license\":\"CC_BY_4.0\",\"source\":\"eur-lex\"}" \
      "$short — $name_de"
  fi
done
after=$(collection_count "$col")
local diff=$(( after - before ))
log "Collection $col: ${before:-?}${after:-?} chunks (+${diff})"
# =========================================================================
# H3: NIST Security Frameworks → bp_compliance_security
# Quelle: nist.gov (Public Domain, US Government Work)
# =========================================================================
col="bp_compliance_datenschutz"
before=$(collection_count "$col")
log "--- H3: NIST + Ethics → $col ($before chunks) ---"
# NIST Cybersecurity Framework 2.0
download_pdf \
"https://nvlpubs.nist.gov/nistpubs/CSWP/NIST.CSWP.29.pdf" \
"$WORK_DIR/pdfs/NIST_CSF_2.0.pdf" || true
if [[ -f "$WORK_DIR/pdfs/NIST_CSF_2.0.pdf" ]]; then
upload_file "$WORK_DIR/pdfs/NIST_CSF_2.0.pdf" "$col" "compliance_datenschutz" "guidance" "2024" \
'{"source_id":"nist","doc_type":"framework","guideline_name":"NIST Cybersecurity Framework 2.0","license":"public_domain_us_gov","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \
"NIST Cybersecurity Framework 2.0"
fi
# NIST Privacy Framework 1.0
download_pdf \
"https://nvlpubs.nist.gov/nistpubs/CSWP/NIST.CSWP.01162020.pdf" \
"$WORK_DIR/pdfs/NIST_Privacy_Framework.pdf" || true
if [[ -f "$WORK_DIR/pdfs/NIST_Privacy_Framework.pdf" ]]; then
upload_file "$WORK_DIR/pdfs/NIST_Privacy_Framework.pdf" "$col" "compliance_datenschutz" "guidance" "2020" \
'{"source_id":"nist","doc_type":"framework","guideline_name":"NIST Privacy Framework 1.0","license":"public_domain_us_gov","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \
"NIST Privacy Framework 1.0"
fi
# HLEG Ethics Guidelines for Trustworthy AI
download_pdf \
"https://op.europa.eu/en/publication-detail/-/publication/d3988569-0434-11ea-8c1f-01aa75ed71a1/language-en/format-PDF" \
"$WORK_DIR/pdfs/hleg_trustworthy_ai.pdf" || true
if [[ -f "$WORK_DIR/pdfs/hleg_trustworthy_ai.pdf" ]]; then
upload_file "$WORK_DIR/pdfs/hleg_trustworthy_ai.pdf" "$col" "compliance_datenschutz" "guidance" "2019" \
'{"source_id":"hleg","doc_type":"ethics_guidelines","guideline_name":"Ethics Guidelines for Trustworthy AI","license":"CC_BY_4.0","attribution":"High-Level Expert Group on AI (HLEG)","source":"op.europa.eu"}' \
"HLEG Ethics Guidelines Trustworthy AI"
fi
after=$(collection_count "$col")
local diff=$(( after - before ))
log "Collection $col: ${before:-?}${after:-?} chunks (+${diff})"
# =========================================================================
# Summary
# =========================================================================
echo ""
log "Phase H abgeschlossen."
log "Naechste Schritte (TODO — separate Phasen):"
log " Layer 2: Nationale Gesetze EU/EWR (FR, ES, IT, AT, NL, UK) — Portal-Recherche noetig"
log " Layer 3: DPA Guidance (CNIL, AEPD, Garante, AP, IMY) — Einzel-URLs recherchieren"
log " Layer 4: OWASP Top 10, offene Security-Frameworks"
log " Layer 5: EuGH + BGH Leitentscheidungen"
}
# =============================================================================
# PHASE I: DACH-Erweiterung (Quellenkatalog)
# DSK-Templates, Gerichtsentscheidungen, fehlende Gesetze
# Lizenzen: §5 UrhG (DE), §7 UrhG (AT), Art.5 URG (CH), DL-DE/BY-2.0, CC BY 4.0
# =============================================================================
# -----------------------------------------------------------------------------
# html_to_text IN_HTML OUT_TXT TITLE
# Strip markup from a downloaded HTML page and write "TITLE\n\n<plain text>"
# to OUT_TXT. Arguments are passed to Python via argv instead of being
# interpolated into the source (the previous inline `python3 -c` blocks broke
# if $WORK_DIR contained quotes or spaces).
# -----------------------------------------------------------------------------
html_to_text() {
  python3 - "$1" "$2" "$3" <<'PY' 2>/dev/null
import re
import sys

in_path, out_path, title = sys.argv[1:4]
with open(in_path) as f:
    html = f.read()
text = re.sub(r'<[^>]+>', ' ', html)       # drop tags
text = re.sub(r'\s+', ' ', text).strip()   # collapse whitespace
with open(out_path, 'w') as f:
    f.write(title + '\n\n' + text)
PY
}

# -----------------------------------------------------------------------------
# ingest_court_decision COLLECTION URL BASENAME TITLE YEAR METADATA_JSON LABEL
# Download a court decision as HTML into $WORK_DIR/texts/BASENAME.html,
# convert it to plain text and upload it as case_law. Silently skipped when
# the download is missing or smaller than 500 bytes (likely an error page).
# -----------------------------------------------------------------------------
ingest_court_decision() {
  local target_col=$1 url=$2 base=$3 title=$4 year=$5 meta=$6 label=$7
  local html_file="$WORK_DIR/texts/${base}.html"
  local txt_file="$WORK_DIR/texts/${base}.txt"
  curl -sL "$url" -o "$html_file" 2>/dev/null || true
  if [[ -f "$html_file" && $(wc -c < "$html_file") -gt 500 ]]; then
    html_to_text "$html_file" "$txt_file" "$title"
    upload_file "$txt_file" "$target_col" "compliance_datenschutz" "case_law" "$year" \
      "$meta" "$label"
  fi
}

# -----------------------------------------------------------------------------
# PHASE I: DACH extension — DSK templates, court decisions, missing statutes.
# Licenses: §5 UrhG (DE), §7 UrhG (AT), Art. 5 URG (CH), DL-DE/BY-2.0, CC BY 4.0
# -----------------------------------------------------------------------------
phase_dach() {
  log "=========================================="
  log "PHASE I: DACH-Erweiterung"
  log "=========================================="

  # ===========================================================================
  # I1: Missing DE statutes → bp_compliance_gesetze
  # ===========================================================================
  local col="bp_compliance_gesetze"
  local before
  before=$(collection_count "$col")
  log "--- I1: Fehlende DE-Gesetze → $col ($before chunks) ---"

  # UStG (German VAT act) — retention / invoicing obligations
  download_pdf "https://www.gesetze-im-internet.de/ustg_1980/UStG.pdf" "$WORK_DIR/pdfs/UStG.pdf" || true
  if [[ -f "$WORK_DIR/pdfs/UStG.pdf" ]]; then
    upload_file "$WORK_DIR/pdfs/UStG.pdf" "$col" "compliance" "legal_reference" "2025" \
      '{"regulation_id":"ustg","regulation_name_de":"Umsatzsteuergesetz (UStG)","category":"steuerrecht","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \
      "UStG (Umsatzsteuergesetz)"
  fi

  # MStV § 18 — Interstate Media Treaty (imprint duty). Not available as PDF
  # on gesetze-im-internet.de, hence fetched from the Bavarian portal; require
  # > 1000 bytes to reject error pages (stat -f%z BSD / -c%s GNU fallback).
  curl -sL "https://www.gesetze-bayern.de/Content/Pdf/MStV" -o "$WORK_DIR/pdfs/MStV.pdf" 2>/dev/null || true
  if [[ -f "$WORK_DIR/pdfs/MStV.pdf" && $(stat -f%z "$WORK_DIR/pdfs/MStV.pdf" 2>/dev/null || stat -c%s "$WORK_DIR/pdfs/MStV.pdf" 2>/dev/null || echo 0) -gt 1000 ]]; then
    upload_file "$WORK_DIR/pdfs/MStV.pdf" "$col" "compliance" "legal_reference" "2025" \
      '{"regulation_id":"mstv","regulation_name_de":"Medienstaatsvertrag (MStV)","category":"medienrecht","license":"public_domain_§5_UrhG","source":"gesetze-bayern.de"}' \
      "MStV (Medienstaatsvertrag)"
  else
    warn "MStV PDF download failed — skipping"
  fi

  local after
  after=$(collection_count "$col")
  local diff=$(( after - before ))
  log "Collection $col: ${before:-?}→${after:-?} chunks (+${diff})"

  # ===========================================================================
  # I2: DSK templates & BfDI → bp_legal_templates
  # License: DL-DE/BY-2.0 (commercial use allowed with attribution)
  # ===========================================================================
  col="bp_legal_templates"
  before=$(collection_count "$col")
  log "--- I2: DSK/BfDI Templates → $col ($before chunks) ---"

  # DSK model record of processing activities (Art. 30 GDPR)
  download_pdf \
    "https://www.datenschutzkonferenz-online.de/media/ah/201802_ah_muster_verantwortliche.pdf" \
    "$WORK_DIR/pdfs/DSK_Muster_VVT.pdf" || true
  if [[ -f "$WORK_DIR/pdfs/DSK_Muster_VVT.pdf" ]]; then
    upload_file "$WORK_DIR/pdfs/DSK_Muster_VVT.pdf" "$col" "compliance_template" "template" "2018" \
      '{"regulation_id":"dsk_muster_vvt","regulation_name_de":"DSK Muster-VVT (Art. 30 DSGVO)","category":"datenschutz","license":"DL-DE/BY-2.0","attribution":"Datenschutzkonferenz (DSK)","source":"datenschutzkonferenz-online.de"}' \
      "DSK Muster-VVT"
  fi

  # DSK short paper no. 5 — data protection impact assessment (DSFA)
  download_pdf \
    "https://www.datenschutzkonferenz-online.de/media/kp/dsk_kpnr_5.pdf" \
    "$WORK_DIR/pdfs/DSK_KP5_DSFA.pdf" || true
  if [[ -f "$WORK_DIR/pdfs/DSK_KP5_DSFA.pdf" ]]; then
    upload_file "$WORK_DIR/pdfs/DSK_KP5_DSFA.pdf" "$col" "compliance_template" "guidance" "2018" \
      '{"regulation_id":"dsk_kp5_dsfa","regulation_name_de":"DSK Kurzpapier Nr. 5 Datenschutz-Folgenabschaetzung","category":"datenschutz","license":"DL-DE/BY-2.0","attribution":"Datenschutzkonferenz (DSK)","source":"datenschutzkonferenz-online.de"}' \
      "DSK Kurzpapier Nr. 5 DSFA"
  fi

  # BfDI example record of processing activities (incl. deletion periods)
  download_pdf \
    "https://www.bfdi.bund.de/SharedDocs/Downloads/DE/DokumenteBfDI/AccessForAll/2023/2023_Verzeichnis-Verarbeitungst%C3%A4tigkeiten.pdf?__blob=publicationFile&v=2" \
    "$WORK_DIR/pdfs/BfDI_Beispiel_VVT.pdf" || true
  if [[ -f "$WORK_DIR/pdfs/BfDI_Beispiel_VVT.pdf" ]]; then
    upload_file "$WORK_DIR/pdfs/BfDI_Beispiel_VVT.pdf" "$col" "compliance_template" "template" "2023" \
      '{"regulation_id":"bfdi_beispiel_vvt","regulation_name_de":"BfDI Beispiel-VVT mit Loeschfristen","category":"datenschutz","license":"DL-DE/BY-2.0","attribution":"Bundesbeauftragter fuer den Datenschutz und die Informationsfreiheit (BfDI)","source":"bfdi.bund.de"}' \
      "BfDI Beispiel-VVT"
  fi

  after=$(collection_count "$col")
  diff=$(( after - before ))
  log "Collection $col: ${before:-?}→${after:-?} chunks (+${diff})"

  # ===========================================================================
  # I3: BSI IT-Grundschutz → bp_compliance_gesetze (license: CC BY-SA 4.0)
  # ===========================================================================
  col="bp_compliance_gesetze"
  before=$(collection_count "$col")
  log "--- I3: BSI OSCAL → $col ($before chunks) ---"

  # BSI IT-Grundschutz compendium (PDF, edition 2024)
  download_pdf \
    "https://www.bsi.bund.de/SharedDocs/Downloads/DE/BSI/Grundschutz/Kompendium/IT_Grundschutz_Kompendium_Edition2024.pdf?__blob=publicationFile&v=4" \
    "$WORK_DIR/pdfs/BSI_Grundschutz_2024.pdf" || true
  if [[ -f "$WORK_DIR/pdfs/BSI_Grundschutz_2024.pdf" ]]; then
    upload_file "$WORK_DIR/pdfs/BSI_Grundschutz_2024.pdf" "$col" "compliance" "guidance" "2024" \
      '{"regulation_id":"bsi_grundschutz_2024","regulation_name_de":"BSI IT-Grundschutz Kompendium 2024","category":"informationssicherheit","license":"CC_BY-SA_4.0","attribution":"Bundesamt fuer Sicherheit in der Informationstechnik (BSI)","source":"bsi.bund.de"}' \
      "BSI IT-Grundschutz Kompendium 2024"
  fi

  after=$(collection_count "$col")
  diff=$(( after - before ))
  log "Collection $col: ${before:-?}→${after:-?} chunks (+${diff})"

  # ===========================================================================
  # I4: Court decisions → bp_compliance_datenschutz
  # Licenses: §5 UrhG (DE), §7 UrhG (AT), Art. 5 URG (CH) — official works
  # ===========================================================================
  col="bp_compliance_datenschutz"
  before=$(collection_count "$col")
  log "--- I4: Gerichtsentscheidungen → $col ($before chunks) ---"

  # DE: LG Bonn 29 OWi 1/20 (1&1 administrative fine, Art. 32 GDPR)
  ingest_court_decision "$col" \
    "https://www.justiz.nrw.de/nrwe/lgs/bonn/lg_bonn/j2020/29OWi1_20_Urteil_20201111.html" \
    "LG_Bonn_29OWi1_20" \
    "LG Bonn, Urteil vom 11.11.2020, 29 OWi 1/20 (1&1 Telecom)" \
    "2020" \
    '{"regulation_id":"lg_bonn_29owi1_20","doc_type":"court_decision","court":"LG Bonn","case_number":"29 OWi 1/20","date":"2020-11-11","topic":"Art. 32 DSGVO Authentifizierung Bussgeld","country":"DE","license":"public_domain_§5_UrhG","source":"justiz.nrw.de"}' \
    "LG Bonn 29 OWi 1/20 (Art. 32 Bussgeld)"

  # DE: BGH I ZR 7/16 (Planet49 cookie consent)
  ingest_court_decision "$col" \
    "https://www.bundesgerichtshof.de/SharedDocs/Pressemitteilungen/DE/2020/2020067.html" \
    "BGH_Planet49" \
    "BGH, Urteil vom 28.05.2020, I ZR 7/16 (Planet49 Cookie-Einwilligung)" \
    "2020" \
    '{"regulation_id":"bgh_i_zr_7_16","doc_type":"court_decision","court":"BGH","case_number":"I ZR 7/16","date":"2020-05-28","topic":"Cookie-Einwilligung Opt-in Planet49","country":"DE","license":"public_domain_§5_UrhG","source":"bundesgerichtshof.de"}' \
    "BGH I ZR 7/16 (Planet49 Cookie-Einwilligung)"

  # DE: BGH press release 218/2024 (Art. 82 GDPR damages)
  ingest_court_decision "$col" \
    "https://www.bundesgerichtshof.de/SharedDocs/Pressemitteilungen/DE/2024/2024218.html" \
    "BGH_Art82_2024" \
    "BGH, Pressemitteilung 218/2024, Art. 82 DSGVO Schadensersatz" \
    "2024" \
    '{"regulation_id":"bgh_art82_2024_218","doc_type":"court_decision","court":"BGH","date":"2024-11-18","topic":"Art. 82 DSGVO immaterieller Schadensersatz Bemessung","country":"DE","license":"public_domain_§5_UrhG","source":"bundesgerichtshof.de"}' \
    "BGH PM 218/2024 (Art. 82 Schadensersatz)"

  # DE: BGH VI ZR 396/24 (Art. 82 follow-up ruling)
  ingest_court_decision "$col" \
    "https://www.bundesgerichtshof.de/SharedDocs/Entscheidungen/DE/2025/2025-11-11-VIZR396_24.html" \
    "BGH_VI_ZR_396_24" \
    "BGH, Urteil vom 11.11.2025, VI ZR 396/24 (Art. 82 DSGVO Konkretisierung)" \
    "2025" \
    '{"regulation_id":"bgh_vi_zr_396_24","doc_type":"court_decision","court":"BGH","case_number":"VI ZR 396/24","date":"2025-11-11","topic":"Art. 82 DSGVO Darlegungslast Schadensbemessung","country":"DE","license":"public_domain_§5_UrhG","source":"bundesgerichtshof.de"}' \
    "BGH VI ZR 396/24 (Art. 82 Konkretisierung)"

  # AT: OGH 6 Ob 70/24y (protective purpose / causation)
  ingest_court_decision "$col" \
    "https://www.ogh.gv.at/entscheidungen/entscheidungen-ogh/datenschutzrecht-zum-schutzzweck-der-datenschutz-grundverordnung-dsgvo-und-des-datenschutzgesetzes-dsg/" \
    "OGH_6Ob70_24y" \
    "OGH, 6 Ob 70/24y, 15.05.2024 (Schutzzweck DSGVO/DSG)" \
    "2024" \
    '{"regulation_id":"ogh_6ob70_24y","doc_type":"court_decision","court":"OGH","case_number":"6 Ob 70/24y","date":"2024-05-15","topic":"Schutzzweck DSGVO Kausalitaet Schadensersatz","country":"AT","license":"public_domain_§7_UrhG_AT","source":"ogh.gv.at"}' \
    "OGH 6 Ob 70/24y (Schutzzweck DSGVO)"

  # AT: OGH 6 Ob 102/24d (Art. 15 data origin + Art. 82, CJEU referral)
  ingest_court_decision "$col" \
    "https://www.ogh.gv.at/entscheidungen/vorabentscheidungsersuchen-eugh/vorabentscheidungsersuchen-zum-auskunftsrecht-nach-art-15-dsgvo-und-dem-recht-auf-schadenersatz-nach-art-82-dsgvo/" \
    "OGH_6Ob102_24d" \
    "OGH, 6 Ob 102/24d, 18.02.2025 (EuGH-Vorlage Art. 15 + Art. 82)" \
    "2025" \
    '{"regulation_id":"ogh_6ob102_24d","doc_type":"court_decision","court":"OGH","case_number":"6 Ob 102/24d","date":"2025-02-18","topic":"Art. 15 Auskunftsrecht Herkunft Art. 82 Schadensersatz EuGH-Vorlage","country":"AT","license":"public_domain_§7_UrhG_AT","source":"ogh.gv.at"}' \
    "OGH 6 Ob 102/24d (Art. 15 + Art. 82 EuGH-Vorlage)"

  # CH: BVGer B-915/2022 (BVGE 2024 IV/2 — DSG access vs. file inspection).
  # Published as PDF, so it bypasses the HTML helper.
  download_pdf \
    "https://jurispub.admin.ch/publiws/download?decisionId=ed176fe0-fb98-425f-8ded-ca9da49c816b" \
    "$WORK_DIR/pdfs/BVGE_2024_IV_2.pdf" || true
  if [[ -f "$WORK_DIR/pdfs/BVGE_2024_IV_2.pdf" ]]; then
    upload_file "$WORK_DIR/pdfs/BVGE_2024_IV_2.pdf" "$col" "compliance_datenschutz" "case_law" "2024" \
      '{"regulation_id":"bvge_2024_iv_2","doc_type":"court_decision","court":"BVGer","case_number":"B-915/2022","date":"2024-04-03","topic":"DSG-Auskunft vs. Akteneinsicht CH","country":"CH","license":"public_domain_Art5_URG","source":"jurispub.admin.ch"}' \
      "BVGer B-915/2022 (DSG-Auskunft vs. Akteneinsicht)"
  fi

  # CH: BGer 1C 562/2024 (data blocking / anonymisation)
  ingest_court_decision "$col" \
    "https://relevancy.bger.ch/cgi-bin/JumpCGI?id=13.01.2025_1C_562%2F2024" \
    "BGer_1C_562_2024" \
    "BGer, 1C 562/2024, 13.01.2025 (Datensperre/Anonymisierung)" \
    "2025" \
    '{"regulation_id":"bger_1c_562_2024","doc_type":"court_decision","court":"BGer","case_number":"1C 562/2024","date":"2025-01-13","topic":"Datensperre Anonymisierung DSG","country":"CH","license":"public_domain_Art5_URG","source":"bger.ch"}' \
    "BGer 1C 562/2024 (Datensperre/Anonymisierung)"

  after=$(collection_count "$col")
  diff=$(( after - before ))
  log "Collection $col: ${before:-?}→${after:-?} chunks (+${diff})"

  echo ""
  log "Phase I abgeschlossen."
}
# =============================================================================
# PHASE J: Security Guidelines & Standards
# Nur lizenzkompatible Dokumente (Public Domain / CC BY / CC BY-SA)
# =============================================================================
#######################################
# Phase J: ingest commercially-licensed security standards (NIST public
# domain, OWASP CC BY / CC BY-SA, ENISA CC BY 4.0) into Qdrant via the
# RAG API. NIST/OWASP go to bp_compliance_datenschutz, ENISA to
# bp_compliance_ce. Best-effort: every download/clone is allowed to fail
# without aborting the run.
# Globals:  WORK_DIR (read), upload counters via upload_file
# Outputs:  progress to stdout via log/ok/warn
#######################################
phase_security() {
  log "=========================================="
  log "PHASE J: Security Guidelines & Standards"
  log "=========================================="
  mkdir -p "$WORK_DIR"/{pdfs,texts,repos}
  # =========================================================================
  # J1: NIST Standards (Public Domain, US Government Work)
  # → bp_compliance_datenschutz (hat bereits NIST CSF + Privacy Framework)
  # =========================================================================
  local col="bp_compliance_datenschutz"
  local before
  before=$(collection_count "$col")
  # collection_count can return "?" when Qdrant is unreachable; normalize to
  # a number so the $(( after - before )) arithmetic below cannot abort the
  # script under `set -e`.
  [[ "$before" =~ ^[0-9]+$ ]] || before=0
  log "--- J1: NIST Security Standards → $col ($before chunks) ---"
  # NIST SP 800-53 Rev. 5 — Security and Privacy Controls (GROSS: ~490 Seiten)
  download_pdf \
    "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-53r5.pdf" \
    "$WORK_DIR/pdfs/NIST_SP_800_53r5.pdf" || true
  if [[ -f "$WORK_DIR/pdfs/NIST_SP_800_53r5.pdf" ]]; then
    upload_file "$WORK_DIR/pdfs/NIST_SP_800_53r5.pdf" "$col" "compliance_datenschutz" "guidance" "2020" \
      '{"regulation_id":"nist_sp800_53r5","source_id":"nist","doc_type":"controls_catalog","guideline_name":"NIST SP 800-53 Rev. 5 Security and Privacy Controls","license":"public_domain_us_gov","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \
      "NIST SP 800-53 Rev. 5 (Security & Privacy Controls)"
  fi
  # NIST SP 800-218 — Secure Software Development Framework (SSDF)
  download_pdf \
    "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-218.pdf" \
    "$WORK_DIR/pdfs/NIST_SP_800_218.pdf" || true
  if [[ -f "$WORK_DIR/pdfs/NIST_SP_800_218.pdf" ]]; then
    upload_file "$WORK_DIR/pdfs/NIST_SP_800_218.pdf" "$col" "compliance_datenschutz" "guidance" "2022" \
      '{"regulation_id":"nist_sp800_218","source_id":"nist","doc_type":"framework","guideline_name":"NIST SP 800-218 Secure Software Development Framework (SSDF)","license":"public_domain_us_gov","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \
      "NIST SP 800-218 SSDF"
  fi
  # NIST SP 800-63-3 — Digital Identity Guidelines
  download_pdf \
    "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-63-3.pdf" \
    "$WORK_DIR/pdfs/NIST_SP_800_63_3.pdf" || true
  if [[ -f "$WORK_DIR/pdfs/NIST_SP_800_63_3.pdf" ]]; then
    upload_file "$WORK_DIR/pdfs/NIST_SP_800_63_3.pdf" "$col" "compliance_datenschutz" "guidance" "2017" \
      '{"regulation_id":"nist_sp800_63_3","source_id":"nist","doc_type":"guidelines","guideline_name":"NIST SP 800-63-3 Digital Identity Guidelines","license":"public_domain_us_gov","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \
      "NIST SP 800-63-3 (Digital Identity)"
  fi
  # NIST SP 800-207 — Zero Trust Architecture
  download_pdf \
    "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-207.pdf" \
    "$WORK_DIR/pdfs/NIST_SP_800_207.pdf" || true
  if [[ -f "$WORK_DIR/pdfs/NIST_SP_800_207.pdf" ]]; then
    upload_file "$WORK_DIR/pdfs/NIST_SP_800_207.pdf" "$col" "compliance_datenschutz" "guidance" "2020" \
      '{"regulation_id":"nist_sp800_207","source_id":"nist","doc_type":"architecture","guideline_name":"NIST SP 800-207 Zero Trust Architecture","license":"public_domain_us_gov","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \
      "NIST SP 800-207 (Zero Trust Architecture)"
  fi
  # NISTIR 8259A — IoT Device Cybersecurity Core Baseline
  download_pdf \
    "https://nvlpubs.nist.gov/nistpubs/ir/2020/NIST.IR.8259A.pdf" \
    "$WORK_DIR/pdfs/NISTIR_8259A.pdf" || true
  if [[ -f "$WORK_DIR/pdfs/NISTIR_8259A.pdf" ]]; then
    upload_file "$WORK_DIR/pdfs/NISTIR_8259A.pdf" "$col" "compliance_datenschutz" "guidance" "2020" \
      '{"regulation_id":"nistir_8259a","source_id":"nist","doc_type":"baseline","guideline_name":"NISTIR 8259A IoT Device Cybersecurity Core Baseline","license":"public_domain_us_gov","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \
      "NISTIR 8259A (IoT Core Baseline)"
  fi
  # NISTIR 8259B — IoT Non-Technical Supporting Capability Core Baseline
  download_pdf \
    "https://nvlpubs.nist.gov/nistpubs/ir/2020/NIST.IR.8259B.pdf" \
    "$WORK_DIR/pdfs/NISTIR_8259B.pdf" || true
  if [[ -f "$WORK_DIR/pdfs/NISTIR_8259B.pdf" ]]; then
    upload_file "$WORK_DIR/pdfs/NISTIR_8259B.pdf" "$col" "compliance_datenschutz" "guidance" "2020" \
      '{"regulation_id":"nistir_8259b","source_id":"nist","doc_type":"baseline","guideline_name":"NISTIR 8259B IoT Non-Technical Supporting Capability Core Baseline","license":"public_domain_us_gov","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \
      "NISTIR 8259B (IoT Non-Technical Baseline)"
  fi
  # NIST AI RMF 1.0 — AI Risk Management Framework
  download_pdf \
    "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.100-1.pdf" \
    "$WORK_DIR/pdfs/NIST_AI_RMF_100_1.pdf" || true
  if [[ -f "$WORK_DIR/pdfs/NIST_AI_RMF_100_1.pdf" ]]; then
    upload_file "$WORK_DIR/pdfs/NIST_AI_RMF_100_1.pdf" "$col" "compliance_datenschutz" "guidance" "2023" \
      '{"regulation_id":"nist_ai_rmf","source_id":"nist","doc_type":"framework","guideline_name":"NIST AI Risk Management Framework (AI RMF) 1.0","license":"public_domain_us_gov","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \
      "NIST AI RMF 1.0 (AI Risk Management)"
  fi
  local after
  after=$(collection_count "$col")
  [[ "$after" =~ ^[0-9]+$ ]] || after=0
  local diff=$(( after - before ))
  log "Collection $col: ${before} → ${after} chunks (+${diff})"
  # =========================================================================
  # J2: OWASP Security Standards (CC BY / CC BY-SA)
  # → bp_compliance_datenschutz
  # Strategie: GitHub Repos klonen, Markdown extrahieren, als Text uploaden
  # =========================================================================
  before=$(collection_count "$col")
  [[ "$before" =~ ^[0-9]+$ ]] || before=0
  log "--- J2: OWASP Standards → $col ($before chunks) ---"
  # --- J2a: OWASP Top 10 (2021 stable, 2025 in development) ---
  local owasp_top10_repo="$WORK_DIR/repos/owasp-top10"
  if [[ ! -d "$owasp_top10_repo" ]]; then
    git clone --depth 1 "https://github.com/OWASP/Top10.git" "$owasp_top10_repo" 2>/dev/null || true
  fi
  if [[ -d "$owasp_top10_repo" ]]; then
    # Concatenate all Top 10 documents (2021 edition, stable)
    local top10_dir="$owasp_top10_repo/2021/docs"
    if [[ -d "$top10_dir" ]]; then
      find "$top10_dir" -name "*.md" -not -name "index.md" | sort | while read -r f; do
        echo "---"
        echo "# $(basename "$f" .md)"
        cat "$f"
        echo ""
      done > "$WORK_DIR/texts/OWASP_Top10_2021.txt" 2>/dev/null
      if [[ -f "$WORK_DIR/texts/OWASP_Top10_2021.txt" && $(wc -c < "$WORK_DIR/texts/OWASP_Top10_2021.txt") -gt 1000 ]]; then
        upload_file "$WORK_DIR/texts/OWASP_Top10_2021.txt" "$col" "compliance_datenschutz" "guidance" "2021" \
          '{"regulation_id":"owasp_top10_2021","source_id":"owasp","doc_type":"risk_catalog","guideline_name":"OWASP Top 10 Web Application Security Risks (2021)","license":"CC_BY_3.0","attribution":"OWASP Foundation","source":"github.com/OWASP/Top10"}' \
          "OWASP Top 10 (2021)"
      fi
    fi
  fi
  # --- J2b: OWASP API Security Top 10 (2023) ---
  local owasp_api_repo="$WORK_DIR/repos/owasp-api-security"
  if [[ ! -d "$owasp_api_repo" ]]; then
    git clone --depth 1 "https://github.com/OWASP/API-Security.git" "$owasp_api_repo" 2>/dev/null || true
  fi
  if [[ -d "$owasp_api_repo" ]]; then
    local api_dir="$owasp_api_repo/editions/2023/en"
    if [[ -d "$api_dir" ]]; then
      find "$api_dir" -name "*.md" | sort | while read -r f; do
        echo "---"
        echo "# $(basename "$f" .md)"
        cat "$f"
        echo ""
      done > "$WORK_DIR/texts/OWASP_API_Security_2023.txt" 2>/dev/null
      if [[ -f "$WORK_DIR/texts/OWASP_API_Security_2023.txt" && $(wc -c < "$WORK_DIR/texts/OWASP_API_Security_2023.txt") -gt 1000 ]]; then
        upload_file "$WORK_DIR/texts/OWASP_API_Security_2023.txt" "$col" "compliance_datenschutz" "guidance" "2023" \
          '{"regulation_id":"owasp_api_top10_2023","source_id":"owasp","doc_type":"risk_catalog","guideline_name":"OWASP API Security Top 10 (2023)","license":"CC_BY-SA_4.0","attribution":"OWASP Foundation","source":"github.com/OWASP/API-Security"}' \
          "OWASP API Security Top 10 (2023)"
      fi
    fi
  fi
  # --- J2c: OWASP ASVS (Application Security Verification Standard) ---
  local owasp_asvs_repo="$WORK_DIR/repos/owasp-asvs"
  if [[ ! -d "$owasp_asvs_repo" ]]; then
    git clone --depth 1 "https://github.com/OWASP/ASVS.git" "$owasp_asvs_repo" 2>/dev/null || true
  fi
  if [[ -d "$owasp_asvs_repo" ]]; then
    # ASVS 5.0 or 4.0 — check which is available
    local asvs_dir=""
    local candidate
    for candidate in "$owasp_asvs_repo/5.0/en" "$owasp_asvs_repo/4.0/en"; do
      if [[ -d "$candidate" ]]; then asvs_dir="$candidate"; break; fi
    done
    if [[ -n "$asvs_dir" ]]; then
      find "$asvs_dir" -name "*.md" | sort | while read -r f; do
        echo "---"
        echo "# $(basename "$f" .md)"
        cat "$f"
        echo ""
      done > "$WORK_DIR/texts/OWASP_ASVS.txt" 2>/dev/null
      if [[ -f "$WORK_DIR/texts/OWASP_ASVS.txt" && $(wc -c < "$WORK_DIR/texts/OWASP_ASVS.txt") -gt 1000 ]]; then
        upload_file "$WORK_DIR/texts/OWASP_ASVS.txt" "$col" "compliance_datenschutz" "guidance" "2024" \
          '{"regulation_id":"owasp_asvs","source_id":"owasp","doc_type":"verification_standard","guideline_name":"OWASP Application Security Verification Standard (ASVS)","license":"CC_BY-SA_4.0","attribution":"OWASP Foundation","source":"github.com/OWASP/ASVS"}' \
          "OWASP ASVS (Application Security Verification Standard)"
      fi
    fi
  fi
  # --- J2d: OWASP MASVS (Mobile Application Security Verification Standard) ---
  local owasp_masvs_repo="$WORK_DIR/repos/owasp-masvs"
  if [[ ! -d "$owasp_masvs_repo" ]]; then
    git clone --depth 1 "https://github.com/OWASP/owasp-masvs.git" "$owasp_masvs_repo" 2>/dev/null || true
  fi
  if [[ -d "$owasp_masvs_repo" ]]; then
    # MASVS v2 structure. Use a string check instead of `find | head | grep -q`:
    # under pipefail, head(1) closing the pipe early can SIGPIPE find and make
    # the whole pipeline (and thus the condition) spuriously fail.
    local masvs_dir=""
    for candidate in "$owasp_masvs_repo/Document" "$owasp_masvs_repo/document" "$owasp_masvs_repo"; do
      if [[ -d "$candidate" && -n "$(find "$candidate" -maxdepth 2 -name "*.md" 2>/dev/null | head -n 1)" ]]; then
        masvs_dir="$candidate"; break
      fi
    done
    if [[ -n "$masvs_dir" ]]; then
      find "$masvs_dir" -name "*.md" -not -name "README.md" -not -name "CONTRIBUTING.md" -not -path "*/.github/*" | sort | while read -r f; do
        echo "---"
        echo "# $(basename "$f" .md)"
        cat "$f"
        echo ""
      done > "$WORK_DIR/texts/OWASP_MASVS.txt" 2>/dev/null
      if [[ -f "$WORK_DIR/texts/OWASP_MASVS.txt" && $(wc -c < "$WORK_DIR/texts/OWASP_MASVS.txt") -gt 1000 ]]; then
        upload_file "$WORK_DIR/texts/OWASP_MASVS.txt" "$col" "compliance_datenschutz" "guidance" "2024" \
          '{"regulation_id":"owasp_masvs","source_id":"owasp","doc_type":"verification_standard","guideline_name":"OWASP Mobile Application Security Verification Standard (MASVS)","license":"CC_BY-SA_4.0","attribution":"OWASP Foundation","source":"github.com/OWASP/owasp-masvs"}' \
          "OWASP MASVS (Mobile Security Verification)"
      fi
    fi
  fi
  # --- J2e: OWASP SAMM (Software Assurance Maturity Model) ---
  local owasp_samm_repo="$WORK_DIR/repos/owasp-samm"
  if [[ ! -d "$owasp_samm_repo" ]]; then
    git clone --depth 1 "https://github.com/OWASP/samm.git" "$owasp_samm_repo" 2>/dev/null || true
  fi
  if [[ -d "$owasp_samm_repo" ]]; then
    # SAMM model content (same pipefail-safe directory probe as above)
    local samm_dir=""
    for candidate in "$owasp_samm_repo/Website/content" "$owasp_samm_repo/model" "$owasp_samm_repo"; do
      if [[ -d "$candidate" && -n "$(find "$candidate" -maxdepth 3 -name "*.md" 2>/dev/null | head -n 1)" ]]; then
        samm_dir="$candidate"; break
      fi
    done
    if [[ -n "$samm_dir" ]]; then
      find "$samm_dir" -name "*.md" -not -name "README.md" -not -name "CONTRIBUTING.md" -not -path "*/.github/*" | sort | while read -r f; do
        echo "---"
        echo "# $(basename "$f" .md)"
        cat "$f"
        echo ""
      done > "$WORK_DIR/texts/OWASP_SAMM.txt" 2>/dev/null
      if [[ -f "$WORK_DIR/texts/OWASP_SAMM.txt" && $(wc -c < "$WORK_DIR/texts/OWASP_SAMM.txt") -gt 1000 ]]; then
        upload_file "$WORK_DIR/texts/OWASP_SAMM.txt" "$col" "compliance_datenschutz" "guidance" "2020" \
          '{"regulation_id":"owasp_samm","source_id":"owasp","doc_type":"maturity_model","guideline_name":"OWASP Software Assurance Maturity Model (SAMM) v2","license":"CC_BY-SA_4.0","attribution":"OWASP Foundation","source":"github.com/OWASP/samm"}' \
          "OWASP SAMM v2 (Software Assurance Maturity Model)"
      fi
    fi
  fi
  # --- J2f: OWASP Mobile Top 10 ---
  local owasp_mobile_repo="$WORK_DIR/repos/owasp-mobile-top10"
  if [[ ! -d "$owasp_mobile_repo" ]]; then
    # Primary: community mirror; fallback: official OWASP project-page repo.
    # (Previously the fallback retried the identical URL — copy-paste bug.)
    git clone --depth 1 "https://github.com/niccolopetti/owasp_mobile_top_10.git" "$owasp_mobile_repo" 2>/dev/null || \
      git clone --depth 1 "https://github.com/OWASP/www-project-mobile-top-10.git" "$owasp_mobile_repo" 2>/dev/null || true
  fi
  # Fallback: direkt von der OWASP-Seite als HTML holen
  if [[ ! -d "$owasp_mobile_repo" || -z "$(find "$owasp_mobile_repo" -name "*.md" 2>/dev/null | head -n 1)" ]]; then
    curl -sL "https://owasp.org/www-project-mobile-top-10/" \
      -o "$WORK_DIR/texts/OWASP_Mobile_Top10_raw.html" 2>/dev/null || true
    if [[ -f "$WORK_DIR/texts/OWASP_Mobile_Top10_raw.html" && $(wc -c < "$WORK_DIR/texts/OWASP_Mobile_Top10_raw.html") -gt 1000 ]]; then
      python3 -c "
import re
with open('$WORK_DIR/texts/OWASP_Mobile_Top10_raw.html') as f:
    html = f.read()
text = re.sub(r'<[^>]+>', ' ', html)
text = re.sub(r'\s+', ' ', text).strip()
with open('$WORK_DIR/texts/OWASP_Mobile_Top10.txt', 'w') as f:
    f.write('OWASP Mobile Top 10\n\n' + text[:100000])
" 2>/dev/null
    fi
  else
    find "$owasp_mobile_repo" -name "*.md" -not -name "README.md" | sort | while read -r f; do
      echo "---"
      cat "$f"
      echo ""
    done > "$WORK_DIR/texts/OWASP_Mobile_Top10.txt" 2>/dev/null
  fi
  if [[ -f "$WORK_DIR/texts/OWASP_Mobile_Top10.txt" && $(wc -c < "$WORK_DIR/texts/OWASP_Mobile_Top10.txt") -gt 500 ]]; then
    upload_file "$WORK_DIR/texts/OWASP_Mobile_Top10.txt" "$col" "compliance_datenschutz" "guidance" "2024" \
      '{"regulation_id":"owasp_mobile_top10","source_id":"owasp","doc_type":"risk_catalog","guideline_name":"OWASP Mobile Top 10","license":"CC_BY-SA","attribution":"OWASP Foundation","source":"owasp.org"}' \
      "OWASP Mobile Top 10"
  fi
  after=$(collection_count "$col")
  [[ "$after" =~ ^[0-9]+$ ]] || after=0
  diff=$(( after - before ))
  log "Collection $col (OWASP): ${before} → ${after} chunks (+${diff})"
  # =========================================================================
  # J3: ENISA Guidelines (CC BY 4.0)
  # → bp_compliance_ce (EU-Content)
  # =========================================================================
  col="bp_compliance_ce"
  before=$(collection_count "$col")
  [[ "$before" =~ ^[0-9]+$ ]] || before=0
  log "--- J3: ENISA Guidelines → $col ($before chunks) ---"
  # ENISA Procurement Guidelines for Cybersecurity in Hospitals
  download_pdf \
    "https://www.enisa.europa.eu/publications/good-practices-for-the-security-of-healthcare-services/@@download/fullReport" \
    "$WORK_DIR/pdfs/ENISA_Procurement_Hospitals.pdf" || true
  # Fallback URL if the above doesn't work
  if [[ ! -f "$WORK_DIR/pdfs/ENISA_Procurement_Hospitals.pdf" ]]; then
    download_pdf \
      "https://www.enisa.europa.eu/publications/procurement-guidelines-for-cybersecurity-in-hospitals/@@download/fullReport" \
      "$WORK_DIR/pdfs/ENISA_Procurement_Hospitals.pdf" || true
  fi
  if [[ -f "$WORK_DIR/pdfs/ENISA_Procurement_Hospitals.pdf" ]]; then
    upload_file "$WORK_DIR/pdfs/ENISA_Procurement_Hospitals.pdf" "$col" "compliance_ce" "guidance" "2024" \
      '{"regulation_id":"enisa_procurement_hospitals","source_id":"enisa","doc_type":"procurement_guidelines","guideline_name":"ENISA Procurement Guidelines for Cybersecurity in Hospitals","license":"CC_BY_4.0","attribution":"European Union Agency for Cybersecurity (ENISA)","source":"enisa.europa.eu"}' \
      "ENISA Procurement Guidelines Hospitals"
  fi
  # ENISA Cloud Security Guide for SMEs
  download_pdf \
    "https://www.enisa.europa.eu/publications/cloud-security-guide-for-smes/@@download/fullReport" \
    "$WORK_DIR/pdfs/ENISA_Cloud_Security_SMEs.pdf" || true
  if [[ ! -f "$WORK_DIR/pdfs/ENISA_Cloud_Security_SMEs.pdf" ]]; then
    download_pdf \
      "https://www.enisa.europa.eu/publications/cloud-security-guide-for-small-and-medium-sized-enterprises/@@download/fullReport" \
      "$WORK_DIR/pdfs/ENISA_Cloud_Security_SMEs.pdf" || true
  fi
  if [[ -f "$WORK_DIR/pdfs/ENISA_Cloud_Security_SMEs.pdf" ]]; then
    upload_file "$WORK_DIR/pdfs/ENISA_Cloud_Security_SMEs.pdf" "$col" "compliance_ce" "guidance" "2015" \
      '{"regulation_id":"enisa_cloud_smes","source_id":"enisa","doc_type":"security_guide","guideline_name":"ENISA Cloud Security Guide for SMEs","license":"CC_BY_4.0","attribution":"European Union Agency for Cybersecurity (ENISA)","source":"enisa.europa.eu"}' \
      "ENISA Cloud Security Guide SMEs"
  fi
  after=$(collection_count "$col")
  [[ "$after" =~ ^[0-9]+$ ]] || after=0
  diff=$(( after - before ))
  log "Collection $col (ENISA): ${before} → ${after} chunks (+${diff})"
  # =========================================================================
  # Summary
  # =========================================================================
  echo ""
  log "Phase J abgeschlossen."
  log "Ingestiert: NIST (7 Standards), OWASP (6 Projekte), ENISA (2 Guides)"
  log ""
  log "NICHT ingestiert (lizenzrechtlich ausgeschlossen):"
  log " BSI TR-03161-1/2/3, TR-03109, TR-03116, TR-03125 (Lizenz unklar)"
  log " BSI C5:2020, Standards 200-1/2/3/4 (Lizenz unklar)"
  log " BSI Grundschutz 2023 (Alle Rechte vorbehalten)"
  log " ETSI EN 303 645 + TS/TR (Copyright, Reproduktion verboten)"
  log " ISO/IEC 27001, 27002, 27701 (paywalled)"
  log " ISO/SAE 21434 (paywalled)"
  log " UN R155, R156 (non-commercial only)"
  log " IEC 62304, 81001-5-1 (paywalled + KI-Verbot)"
  log " CSA CCM/CAIQ (non-commercial)"
  log " CIS Controls v8.1 (CC BY-NC-ND)"
  log " MDCG 2019-16 (Lizenz unklar)"
}
# =============================================================================
# PHASE F: Verifizierung
# =============================================================================
#######################################
# Phase F: sanity-check the ingested corpus. Prints chunk counts for all
# four collections, then runs five representative semantic test searches
# against the RAG search endpoint. Best-effort: failures are printed, never
# fatal. The five original copy-pasted search stanzas are collapsed into
# one helper.
# Globals:  CURL_OPTS (read, intentionally word-split)
# Outputs:  stats and search hits to stdout
#######################################
phase_verify() {
  log "=========================================="
  log "PHASE F: Verifizierung"
  log "=========================================="
  echo ""
  echo "=== Collection Stats ==="
  local col
  for col in bp_compliance_gesetze bp_compliance_ce bp_legal_templates bp_compliance_datenschutz; do
    local count
    count=$(collection_count "$col")
    printf " %-30s %s chunks\n" "$col" "$count"
  done
  echo ""
  echo "=== Test-Suchen ==="
  # Helper: run one test search and pretty-print up to three hits.
  #   $1 - short description (log line only)
  #   $2 - collection name (log line only; the API searches across corpora)
  #   $3 - query string (internal constant; embedded into the JSON body)
  _test_search() {
    local desc=$1 col=$2 query=$3
    log "Suche: '${desc}' in ${col}"
    # shellcheck disable=SC2086 — CURL_OPTS is an option string, word-split on purpose
    curl $CURL_OPTS -X POST "https://localhost:8097/api/v1/search" \
      -H 'Content-Type: application/json' \
      -d "{\"query\":\"${query}\",\"regulation_codes\":null,\"limit\":3,\"min_score\":0.5}" 2>/dev/null \
      | python3 -c "
import sys, json
try:
    data = json.load(sys.stdin)
    results = data.get('results', [])
    print(f' Treffer: {len(results)}')
    for r in results[:3]:
        print(f' [{r.get(\"score\",0):.3f}] {r.get(\"regulation_code\",\"?\")} - {r.get(\"content\",\"\")[:80]}...')
except Exception:
    print(' (parse error)')
" 2>/dev/null || echo " (search failed)"
  }
  _test_search "Impressumspflicht digitale Dienste" "bp_compliance_gesetze" \
    "Impressumspflicht digitale Dienste"
  _test_search "Cookie Einwilligung" "bp_compliance_ce" \
    "Cookie Einwilligung ePrivacy"
  _test_search "Widerrufsbelehrung Fernabsatz" "bp_compliance_gesetze" \
    "Widerrufsbelehrung Fernabsatz Widerrufsfrist"
  _test_search "AI Act Hochrisiko Konformitaet" "bp_compliance_ce" \
    "AI Act Hochrisiko Konformitaetsbewertung"
  _test_search "Privacy Policy Template GDPR" "bp_legal_templates" \
    "Privacy Policy Template GDPR"
  echo ""
}
# =============================================================================
# PHASE G: Corpus Version Registration
# =============================================================================
#######################################
# Phase G: record one corpus-version row per non-empty collection in
# Postgres (compliance_corpus_versions). Version format: YYYY-MM-DD.N where
# N is the next sequence number for today. DB failures only warn.
# Globals:  DB_URL, QDRANT_URL, UPLOADED (read)
#######################################
phase_register_version() {
  log "=========================================="
  log "PHASE G: Corpus Version Registration"
  log "=========================================="
  local today
  today=$(date '+%Y-%m-%d')
  local col
  for col in bp_compliance_gesetze bp_compliance_ce bp_legal_templates bp_compliance_datenschutz; do
    local count
    count=$(collection_count "$col")
    if [[ "$count" == "?" || "$count" == "0" ]]; then
      warn "Skipping version for $col (count=$count)"
      continue
    fi
    # Determine next version number for today
    local existing_count
    existing_count=$(psql "$DB_URL" -tAc \
      "SELECT COUNT(*) FROM compliance_corpus_versions WHERE collection_name='$col' AND version LIKE '${today}.%'" \
      2>/dev/null || echo "0")
    # Guard: psql may emit warnings or empty output; a non-numeric value
    # would make $(( existing_count + 1 )) abort the script under `set -e`.
    [[ "$existing_count" =~ ^[0-9]+$ ]] || existing_count=0
    local seq=$((existing_count + 1))
    local version="${today}.${seq}"
    # Get regulations list based on collection (PostgreSQL array literal)
    local regs=""
    case "$col" in
      bp_compliance_ce)
        regs='{eu_2022_2065,eu_2002_58,eu_2021_914}'
        ;;
      bp_compliance_gesetze)
        regs='{ddg_5,tdddg_25,urhg_5,egbgb_widerruf,bgb_komplett,urhg_komplett,tmg_komplett}'
        ;;
      bp_legal_templates)
        regs='{github_site_policy,opengov_site_policy,cc_legal_tools,common_paper,webflorist,tempest,cookieconsent}'
        ;;
      bp_compliance_datenschutz)
        regs='{edpb_consent,edpb_privacy_by_design,edpb_dark_patterns,edpb_social_media,edpb_cookie_banner,edps_generative_ai,edps_digital_ethics}'
        ;;
      *)
        # Defensive default: '' is not a valid PG array literal, '{}' is.
        regs='{}'
        ;;
    esac
    # Compute digest from Qdrant collection info
    local digest
    digest=$(curl -s "${QDRANT_URL}/collections/${col}" 2>/dev/null \
      | python3 -c "import sys,json,hashlib; d=json.load(sys.stdin); print(hashlib.sha256(json.dumps(d.get('result',{}), sort_keys=True).encode()).hexdigest()[:32])" \
      2>/dev/null || echo "")
    log "Registering version $version for $col ($count chunks)"
    # NOTE(review): UPLOADED is the run-wide upload counter, not a
    # per-collection document count — confirm this is intended for
    # documents_count.
    psql "$DB_URL" -c "
      INSERT INTO compliance_corpus_versions
        (version, collection_name, documents_count, chunks_count, regulations, digest, ingestion_source, created_by)
      VALUES
        ('${version}', '${col}', ${UPLOADED}, ${count}, '${regs}', '${digest}', 'ingest-legal-corpus.sh', 'system')
    " 2>/dev/null && ok "Version $version registered for $col" || warn "Version registration failed for $col (DB not available?)"
  done
}
# =============================================================================
# MAIN
# =============================================================================
#######################################
# Entry point: preflight-check the RAG API and Qdrant, then run either a
# single phase (--only PHASE) or the full ingestion pipeline, and print a
# final upload summary. Exits 1 on unreachable services, unknown phase,
# or any failed uploads.
# Globals:  WORK_DIR RAG_URL QDRANT_URL ONLY_PHASE SKIP_DOWNLOAD
#           UPLOADED FAILED SKIPPED CURL_OPTS (all read)
#######################################
main() {
  log "=========================================="
  log "BreakPilot Legal Corpus Ingestion"
  log "=========================================="
  log "Work dir: $WORK_DIR"
  log "RAG API: $RAG_URL"
  log "Qdrant: $QDRANT_URL"
  echo ""
  # Preflight: a bare POST to the upload endpoint yields a validation error
  # mentioning the missing "file" field — used here as a liveness probe.
  # shellcheck disable=SC2086 — CURL_OPTS is word-split on purpose
  if ! curl $CURL_OPTS -X POST "$RAG_URL" 2>/dev/null | grep -q "file"; then
    fail "RAG API not reachable at $RAG_URL"
    exit 1
  fi
  ok "RAG API reachable"
  if ! curl -s "$QDRANT_URL/collections" >/dev/null 2>&1; then
    fail "Qdrant not reachable at $QDRANT_URL"
    exit 1
  fi
  ok "Qdrant reachable"
  echo ""
  if [[ -n "$ONLY_PHASE" ]]; then
    # Single-phase mode: resolve the phase name to its handler, then run it.
    local handler=""
    case "$ONLY_PHASE" in
      download)          handler=phase_download ;;
      gesetze)           handler=phase_gesetze ;;
      eu)                handler=phase_eu ;;
      templates)         handler=phase_templates ;;
      datenschutz)       handler=phase_datenschutz ;;
      verbraucherschutz) handler=phase_verbraucherschutz ;;
      dach)              handler=phase_dach ;;
      security)          handler=phase_security ;;
      verify)            handler=phase_verify ;;
      version)           handler=phase_register_version ;;
      *) fail "Unknown phase: $ONLY_PHASE"; exit 1 ;;
    esac
    "$handler"
  else
    if [[ "$SKIP_DOWNLOAD" != "true" ]]; then
      phase_download
    else
      log "Skipping download phase (--skip-download)"
    fi
    # Full pipeline: blank line before each phase, fixed order.
    local fn
    for fn in phase_gesetze phase_eu phase_templates phase_datenschutz \
              phase_verbraucherschutz phase_dach phase_security \
              phase_verify phase_register_version; do
      echo ""
      "$fn"
    done
  fi
  # Summary
  echo ""
  log "=========================================="
  log "ERGEBNIS"
  log "=========================================="
  log "Uploaded: $UPLOADED"
  log "Failed: $FAILED"
  log "Skipped: $SKIPPED"
  log "=========================================="
  if (( FAILED > 0 )); then
    warn "$FAILED uploads fehlgeschlagen!"
    exit 1
  fi
  ok "Ingestion abgeschlossen!"
}
# Entry point: forward all CLI arguments to main.
main "$@"