#!/usr/bin/env bash
# =============================================================================
# BreakPilot Compliance — RAG Legal Corpus Ingestion
#
# Downloads 23 freely available legal sources and ingests them into Qdrant
# via the Core RAG API (port 8097).
#
# Run on the Mac Mini:
#   ~/rag-ingestion/ingest-legal-corpus.sh [--skip-download] [--only PHASE]
#
# Phases: download, gesetze, eu, templates, datenschutz, verify, version
#
# Environment overrides: WORK_DIR, SDK_URL, DB_URL
# =============================================================================
set -euo pipefail

# --- Configuration -----------------------------------------------------------
WORK_DIR="${WORK_DIR:-$HOME/rag-ingestion}"
RAG_URL="https://localhost:8097/api/v1/documents/upload"
RAG_SEARCH_URL="https://localhost:8097/api/v1/search"
QDRANT_URL="http://localhost:6333"
SDK_URL="${SDK_URL:-https://localhost:8093}"
DB_URL="${DB_URL:-postgresql://localhost:5432/breakpilot?search_path=compliance,core,public}"

# curl flags are arrays so they expand to separate words without relying on
# word splitting of an unquoted string.
# CURL_OPTS is used for public downloads and keeps TLS verification enabled.
CURL_OPTS=(-s --connect-timeout 10 --max-time 300)
# CURL_LOCAL_OPTS adds -k and is used only for the https://localhost API
# (presumably a self-signed certificate — confirm).
CURL_LOCAL_OPTS=(-sk --connect-timeout 10 --max-time 300)

# Counters (updated by upload_file, reported by main)
UPLOADED=0
FAILED=0
SKIPPED=0

# --- CLI Args ----------------------------------------------------------------
SKIP_DOWNLOAD=false
ONLY_PHASE=""

while [[ $# -gt 0 ]]; do
  case "$1" in
    --skip-download)
      SKIP_DOWNLOAD=true
      shift
      ;;
    --only)
      # Without this check `set -u` would die with an opaque "unbound variable".
      if [[ $# -lt 2 ]]; then
        echo "Option --only requires a PHASE argument" >&2
        exit 1
      fi
      ONLY_PHASE="$2"
      shift 2
      ;;
    -h|--help)
      echo "Usage: $0 [--skip-download] [--only PHASE]"
      echo "Phases: download, gesetze, eu, templates, datenschutz, verify, version"
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done

# --- Helpers -----------------------------------------------------------------
# Timestamped loggers; warn/fail write to stderr.
log()  { echo "[$(date '+%H:%M:%S')] $*"; }
ok()   { echo "[$(date '+%H:%M:%S')] ✓ $*"; }
warn() { echo "[$(date '+%H:%M:%S')] ⚠ $*" >&2; }
fail() { echo "[$(date '+%H:%M:%S')] ✗ $*" >&2; }

#######################################
# Backslash-escape backslashes and double quotes so a value can be embedded
# inside a double-quoted JSON string or a quoted curl -F filename.
# Arguments: $1 - raw value
# Outputs:   escaped value on stdout (no trailing newline)
#######################################
_escape_dq() {
  local value="$1"
  local backslash='\' quote='"'
  value=${value//"$backslash"/"$backslash$backslash"}
  value=${value//"$quote"/"$backslash$quote"}
  printf '%s' "$value"
}

#######################################
# Upload one file to the RAG API and update the global counters.
# Globals:   RAG_URL, CURL_LOCAL_OPTS (read); UPLOADED, FAILED, SKIPPED (written)
# Arguments: $1 file path
#            $2 target collection
#            $3 data_type form field
#            $4 use_case form field
#            $5 year form field
#            $6 metadata JSON string
#            $7 optional display label (defaults to the file's basename)
# Returns:   0 when the response contains "chunks_count" or "vectors_indexed";
#            1 when the file is missing, smaller than 100 bytes, or the upload
#            is not confirmed. Callers that want to keep going under `set -e`
#            must append `|| true`.
#######################################
upload_file() {
  local file="$1"
  local collection="$2"
  local data_type="$3"
  local use_case="$4"
  local year="$5"
  local metadata_json="$6"
  local label="${7:-$(basename "$file")}"

  if [[ ! -f "$file" ]]; then
    warn "File not found: $file"
    FAILED=$((FAILED + 1))
    return 1
  fi

  # BSD stat (macOS) first, GNU stat second.
  local filesize
  filesize=$(stat -f%z "$file" 2>/dev/null || stat -c%s "$file" 2>/dev/null || echo 0)
  if [[ "$filesize" -lt 100 ]]; then
    warn "File too small (${filesize}B), skipping: $label"
    SKIPPED=$((SKIPPED + 1))
    return 1
  fi

  log "Uploading: $label → $collection ($(( filesize / 1024 ))KB)"

  # The file name is quoted so ',' and ';' in paths are not parsed by curl;
  # text fields use --form-string so their content is sent literally.
  local response
  response=$(curl "${CURL_LOCAL_OPTS[@]}" -X POST "$RAG_URL" \
    -F "file=@\"$(_escape_dq "$file")\"" \
    --form-string "collection=${collection}" \
    --form-string "data_type=${data_type}" \
    --form-string "use_case=${use_case}" \
    --form-string "year=${year}" \
    --form-string "chunk_strategy=recursive" \
    --form-string "chunk_size=512" \
    --form-string "chunk_overlap=50" \
    --form-string "metadata_json=${metadata_json}" \
    2>/dev/null) || true

  # Pattern match instead of `echo | grep -q`: with pipefail the latter can
  # report a false negative when grep exits before echo finished writing.
  if [[ "$response" == *'"chunks_count"'* ]]; then
    local chunks
    chunks=$(printf '%s' "$response" \
      | python3 -c "import sys,json; print(json.load(sys.stdin).get('chunks_count',0))" 2>/dev/null \
      || echo "?")
    ok "$label → $chunks chunks"
    UPLOADED=$((UPLOADED + 1))
  elif [[ "$response" == *'"vectors_indexed"'* ]]; then
    local vectors
    vectors=$(printf '%s' "$response" \
      | python3 -c "import sys,json; print(json.load(sys.stdin).get('vectors_indexed',0))" 2>/dev/null \
      || echo "?")
    ok "$label → $vectors vectors"
    UPLOADED=$((UPLOADED + 1))
  else
    fail "Upload failed: $label"
    fail "Response: $response"
    FAILED=$((FAILED + 1))
    return 1
  fi
}

#######################################
# Upload every path read from stdin (one per line) as a legal template.
# Failures are counted by upload_file and never abort the loop.
# Arguments: $1 target collection
#            $2 label prefix (label becomes "<prefix>: <name>")
#            $3 metadata JSON members without braces; "filename" is appended
#            $4 optional suffix stripped from the basename (e.g. ".md")
#######################################
_upload_template_stream() {
  local collection="$1"
  local label_prefix="$2"
  local meta_fields="$3"
  local strip_suffix="${4:-}"
  local path name

  while IFS= read -r path; do
    [[ -n "$path" ]] || continue
    if [[ -n "$strip_suffix" ]]; then
      name=$(basename "$path" "$strip_suffix")
    else
      name=$(basename "$path")
    fi
    # </dev/null keeps child processes from consuming the path list on stdin.
    upload_file "$path" "$collection" "legal_template" "template" "2024" \
      "{${meta_fields},\"filename\":\"$(_escape_dq "$name")\"}" \
      "${label_prefix}: ${name}" </dev/null || true
  done
}

#######################################
# Shallow-clone a git repository unless the target directory already exists.
# Arguments: $1 repository URL, $2 target directory
# Returns:   0 on success or when the directory exists, 1 when the clone fails
#######################################
clone_repo() {
  local url="$1"
  local target="$2"

  if [[ -d "$target" ]]; then
    log "Repo exists: $target (skipping clone)"
    return 0
  fi

  log "Cloning: $url"
  if ! git clone --depth 1 "$url" "$target" 2>/dev/null; then
    warn "Clone failed: $url"
    return 1
  fi
}

#######################################
# Download a file unless the target already exists.
# The download goes to "<target>.part" and is renamed only on success, and
# --fail turns HTTP errors into failures, so neither a truncated file nor an
# HTTP error page is later mistaken for an existing document.
# Arguments: $1 URL, $2 target path
# Returns:   0 on success or when the file exists, 1 when the download fails
#######################################
download_pdf() {
  local url="$1"
  local target="$2"
  local partial="${target}.part"

  if [[ -f "$target" ]]; then
    log "PDF exists: $(basename "$target") (skipping)"
    return 0
  fi

  log "Downloading: $(basename "$target")"
  if ! curl "${CURL_OPTS[@]}" --fail -L "$url" -o "$partial" 2>/dev/null; then
    rm -f -- "$partial"
    warn "Download failed: $url"
    return 1
  fi
  mv -- "$partial" "$target"
}

#######################################
# Fetch a gesetze-im-internet.de HTML page and store its text content.
# The embedded Python keeps all text outside script/style/nav/header/footer
# elements and inserts a newline after block-level end tags.
# Arguments: $1 URL, $2 output text file, $3 label used in log messages
# Returns:   0 on success or when the output exists, 1 when fetch/parse fails
#######################################
extract_gesetz_html() {
  local url="$1"
  local output="$2"
  local label="$3"
  local partial="${output}.part"

  if [[ -f "$output" ]]; then
    log "Text exists: $(basename "$output") (skipping)"
    return 0
  fi

  log "Extracting: $label from gesetze-im-internet.de"
  # PYTHONIOENCODING pins the written file to UTF-8 regardless of the locale.
  # With pipefail a curl failure fails the whole pipeline; the partial file is
  # then removed so the next run retries instead of skipping.
  if ! curl "${CURL_OPTS[@]}" --fail -L "$url" 2>/dev/null \
    | PYTHONIOENCODING=utf-8 python3 -c "
import sys, codecs
# gesetze-im-internet.de uses ISO-8859-1 encoding
sys.stdin = codecs.getreader('iso-8859-1')(sys.stdin.buffer)
from html.parser import HTMLParser

class TextExtractor(HTMLParser):
    def __init__(self):
        super().__init__()
        self.text = []
        self.skip = False

    def handle_starttag(self, tag, attrs):
        if tag in ('script', 'style', 'nav', 'header', 'footer'):
            self.skip = True

    def handle_endtag(self, tag):
        if tag in ('script', 'style', 'nav', 'header', 'footer'):
            self.skip = False
        if tag in ('p', 'div', 'br', 'h1', 'h2', 'h3', 'h4', 'li'):
            self.text.append('\n')

    def handle_data(self, data):
        if not self.skip:
            self.text.append(data)

parser = TextExtractor()
parser.feed(sys.stdin.read())
print(''.join(parser.text).strip())
" > "$partial"; then
    rm -f -- "$partial"
    warn "Extraction failed: $label"
    return 1
  fi
  mv -- "$partial" "$output"
}

#######################################
# Concatenate all Markdown files of one law from the bundestag/gesetze repo.
# Arguments: $1 law directory, $2 output file, $3 label (becomes the heading)
# Returns:   0, also when the directory is missing (only a warning is logged)
#######################################
concat_bundestag_gesetz() {
  local gesetz_dir="$1"
  local output="$2"
  local label="$3"

  if [[ ! -d "$gesetz_dir" ]]; then
    warn "Gesetz directory not found: $gesetz_dir"
    return 0
  fi

  log "Concatenating: $label"
  {
    echo "# $label"
    echo ""
    # Files are emitted in the lexical order of their paths.
    find "$gesetz_dir" -name "*.md" -type f | sort | while IFS= read -r f; do
      cat "$f"
      echo ""
      echo "---"
      echo ""
    done
  } > "$output"
}

#######################################
# Print the points_count of a Qdrant collection, or "?" when unavailable.
# Arguments: $1 collection name
#######################################
collection_count() {
  local col="$1"
  curl -s "${QDRANT_URL}/collections/${col}" 2>/dev/null \
    | python3 -c "import sys,json; print(json.load(sys.stdin)['result']['points_count'])" 2>/dev/null \
    || echo "?"
}

# =============================================================================
# PHASE A: Downloads
# =============================================================================
# Fetches PDFs, single statute paragraphs and git repos into $WORK_DIR.
# Calls without `|| true` abort the whole run on failure (set -e); only the
# repos explicitly guarded below are optional.
phase_download() {
  log "=========================================="
  log "PHASE A: Downloads (PDFs + Git-Repos)"
  log "=========================================="

  mkdir -p "$WORK_DIR"/{pdfs,repos,texts}

  # --- A1: EUR-Lex PDFs ---
  log "--- EUR-Lex PDFs ---"

  download_pdf \
    "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32022R2065" \
    "$WORK_DIR/pdfs/dsa_2022_2065.pdf"

  download_pdf \
    "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32002L0058" \
    "$WORK_DIR/pdfs/eprivacy_2002_58.pdf"

  download_pdf \
    "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32021D0914" \
    "$WORK_DIR/pdfs/scc_2021_914.pdf"

  # --- A2: German statutes (single paragraphs) ---
  log "--- Deutsche Gesetze (Einzelparagraphen) ---"

  extract_gesetz_html \
    "https://www.gesetze-im-internet.de/ddg/__5.html" \
    "$WORK_DIR/texts/ddg_5.txt" \
    "DDG § 5 (Impressum)"

  # On gesetze-im-internet.de the TDDDG is still published under "ttdsg".
  extract_gesetz_html \
    "https://www.gesetze-im-internet.de/ttdsg/__25.html" \
    "$WORK_DIR/texts/tdddg_25.txt" \
    "TDDDG § 25 (Cookies)"

  extract_gesetz_html \
    "https://www.gesetze-im-internet.de/urhg/__5.html" \
    "$WORK_DIR/texts/urhg_5.txt" \
    "UrhG § 5 (Amtliche Werke)"

  # EGBGB Art. 246a § 1 (contains the reference to the model withdrawal notice)
  extract_gesetz_html \
    "https://www.gesetze-im-internet.de/bgbeg/art_246a__1.html" \
    "$WORK_DIR/texts/egbgb_widerruf.txt" \
    "EGBGB Muster-Widerrufsbelehrung"

  # --- A3: Git repos ---
  log "--- Git-Repos ---"

  clone_repo "https://github.com/bundestag/gesetze.git" \
    "$WORK_DIR/repos/bundestag-gesetze"

  clone_repo "https://github.com/github/site-policy.git" \
    "$WORK_DIR/repos/github-site-policy"

  clone_repo "https://github.com/opengovfoundation/site-policy.git" \
    "$WORK_DIR/repos/opengov-site-policy"

  clone_repo "https://github.com/creativecommons/cc-legal-tools-data.git" \
    "$WORK_DIR/repos/cc-legal-tools"

  clone_repo "https://github.com/oprvc/oprvc.github.io.git" \
    "$WORK_DIR/repos/oprvc"

  clone_repo "https://github.com/webflorist/privacy-policy-text.git" \
    "$WORK_DIR/repos/webflorist"

  clone_repo "https://github.com/Tempest-Solutions-Company/privacy-policy-generator.git" \
    "$WORK_DIR/repos/tempest-privacy" || true

  clone_repo "https://github.com/Tempest-Solutions-Company/terms-of-service-generator.git" \
    "$WORK_DIR/repos/tempest-tos" || true

  clone_repo "https://github.com/Tempest-Solutions-Company/cookie-banner-consent-solution.git" \
    "$WORK_DIR/repos/tempest-cookie" || true

  clone_repo "https://github.com/orestbida/cookieconsent.git" \
    "$WORK_DIR/repos/cookieconsent" || true

  # Common Paper keeps a separate repo per contract type.
  clone_repo "https://github.com/CommonPaper/CSA.git" \
    "$WORK_DIR/repos/common-paper-csa" || true

  clone_repo "https://github.com/CommonPaper/SLA.git" \
    "$WORK_DIR/repos/common-paper-sla" || true

  clone_repo "https://github.com/CommonPaper/PSA.git" \
    "$WORK_DIR/repos/common-paper-psa" || true

  # OpenCode.de (data usage clauses) - try HTTPS
  clone_repo "https://gitlab.opencode.de/wernerth/datennutzungsklauseln-muster.git" \
    "$WORK_DIR/repos/datennutzungsklauseln" || true

  # --- A4: EDPB/EDPS PDFs (verified URLs) ---
  log "--- EDPB/EDPS Guidance PDFs ---"

  # EDPB Guidelines 05/2020 on Consent
  download_pdf \
    "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_202005_consent_en.pdf" \
    "$WORK_DIR/pdfs/edpb_consent_guidelines.pdf"

  # EDPB Guidelines 4/2019 Data Protection by Design and Default
  download_pdf \
    "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_201904_dataprotection_by_design_and_by_default_v2.0_en.pdf" \
    "$WORK_DIR/pdfs/edpb_privacy_by_design.pdf"

  # EDPB Guidelines 03/2022 Dark Patterns
  download_pdf \
    "https://www.edpb.europa.eu/system/files/2023-02/edpb_03-2022_guidelines_on_deceptive_design_patterns_in_social_media_platform_interfaces_v2_en_0.pdf" \
    "$WORK_DIR/pdfs/edpb_dark_patterns.pdf"

  # EDPB Guidelines 8/2020 Social Media Targeting
  download_pdf \
    "https://www.edpb.europa.eu/system/files/2021-04/edpb_guidelines_082020_on_the_targeting_of_social_media_users_en.pdf" \
    "$WORK_DIR/pdfs/edpb_social_media_targeting.pdf"

  # EDPB Cookie Banner Taskforce Report (Jan 2023)
  download_pdf \
    "https://www.edpb.europa.eu/system/files/2023-01/edpb_20230118_report_cookie_banner_taskforce_en.pdf" \
    "$WORK_DIR/pdfs/edpb_cookie_banner_taskforce.pdf"

  # EDPB Guidelines 2/2023 ePrivacy Art. 5(3) Technical Scope
  download_pdf \
    "https://www.edpb.europa.eu/system/files/2024-10/edpb_guidelines_202302_technical_scope_art_53_eprivacydirective_v2_en_0.pdf" \
    "$WORK_DIR/pdfs/edpb_eprivacy_art53.pdf"

  # EDPB Guidelines 1/2024 Legitimate Interest
  download_pdf \
    "https://www.edpb.europa.eu/system/files/2024-10/edpb_guidelines_202401_legitimateinterest_en.pdf" \
    "$WORK_DIR/pdfs/edpb_legitimate_interest.pdf"

  # EDPB DPO Coordinated Enforcement Report 2024
  download_pdf \
    "https://www.edpb.europa.eu/system/files/2024-01/edpb_report_20240116_cef_dpo_en.pdf" \
    "$WORK_DIR/pdfs/edpb_dpo_report.pdf"

  # EDPS GenAI Orientations (June 2024)
  download_pdf \
    "https://www.edps.europa.eu/system/files/2024-06/24-06-03_genai_orientations_en.pdf" \
    "$WORK_DIR/pdfs/edps_generative_ai.pdf"

  # EDPS Digital Ethics Report (2018)
  download_pdf \
    "https://edps.europa.eu/sites/edp/files/publication/18-01-25_eag_report_en.pdf" \
    "$WORK_DIR/pdfs/edps_digital_ethics.pdf"

  # --- A5: Text extraction from repos ---
  log "--- Text-Extraktion aus Repos ---"

  # bundestag/gesetze: the repo is partly outdated. DDG, TDDDG and EGBGB are
  # missing from it; only BGB, UrhG and TMG are available.
  local -a bundestag_gesetze=(
    "b/bgb:BGB"
    "u/urhg:UrhG"
    "t/tmg:TMG"
  )

  local entry path label gesetz_dir name
  for entry in "${bundestag_gesetze[@]}"; do
    path="${entry%%:*}"
    label="${entry##*:}"
    gesetz_dir="$WORK_DIR/repos/bundestag-gesetze/$path"
    if [[ -d "$gesetz_dir" ]]; then
      name=$(echo "$label" | tr '[:upper:]' '[:lower:]')
      concat_bundestag_gesetz "$gesetz_dir" \
        "$WORK_DIR/texts/bundestag_${name}_komplett.txt" \
        "$label (komplett)"
    else
      warn "Bundestag Gesetz nicht gefunden: $gesetz_dir"
    fi
  done

  log "Download phase complete."
}

# =============================================================================
# PHASE B: German statutes → bp_compliance_gesetze
# =============================================================================
# Every upload is guarded with `|| true`: upload_file already records the
# failure in FAILED/SKIPPED and main reports it at the end.
phase_gesetze() {
  log "=========================================="
  log "PHASE B: Deutsche Gesetze → bp_compliance_gesetze"
  log "=========================================="

  local col="bp_compliance_gesetze"
  local before
  before=$(collection_count "$col")
  log "Collection $col: $before chunks (before)"

  # B1: single paragraphs
  upload_file "$WORK_DIR/texts/ddg_5.txt" "$col" "compliance" "legal_reference" "2024" \
    '{"regulation_id":"ddg_5","regulation_name_de":"Digitale-Dienste-Gesetz § 5","category":"impressum","license":"public_law","source":"gesetze-im-internet.de"}' \
    "DDG § 5 (Impressumspflicht)" || true

  upload_file "$WORK_DIR/texts/tdddg_25.txt" "$col" "compliance" "legal_reference" "2024" \
    '{"regulation_id":"tdddg_25","regulation_name_de":"TDDDG § 25","category":"cookies","license":"public_law","source":"gesetze-im-internet.de"}' \
    "TDDDG § 25 (Cookies/Endgeraetezugriff)" || true

  upload_file "$WORK_DIR/texts/urhg_5.txt" "$col" "compliance" "legal_reference" "2024" \
    '{"regulation_id":"urhg_5","regulation_name_de":"UrhG § 5","category":"urheberrecht","license":"public_law","source":"gesetze-im-internet.de"}' \
    "UrhG § 5 (Amtliche Werke)" || true

  upload_file "$WORK_DIR/texts/egbgb_widerruf.txt" "$col" "compliance" "legal_reference" "2024" \
    '{"regulation_id":"egbgb_widerruf","regulation_name_de":"EGBGB Muster-Widerrufsbelehrung","category":"widerruf","license":"public_law","source":"gesetze-im-internet.de"}' \
    "EGBGB Muster-Widerrufsbelehrung" || true

  # B2: bundestag/gesetze (full texts); entries are "file-key:label:full name"
  local -a bundestag_upload=(
    "bgb:BGB:Buergerliches Gesetzbuch"
    "urhg:UrhG:Urheberrechtsgesetz"
    "tmg:TMG:Telemediengesetz"
  )

  local entry gesetz rest label fullname file
  for entry in "${bundestag_upload[@]}"; do
    gesetz="${entry%%:*}"
    rest="${entry#*:}"
    label="${rest%%:*}"
    fullname="${rest#*:}"
    file="$WORK_DIR/texts/bundestag_${gesetz}_komplett.txt"
    if [[ -f "$file" ]]; then
      upload_file "$file" "$col" "compliance" "legal_reference" "2024" \
        "{\"regulation_id\":\"${gesetz}_komplett\",\"regulation_name_de\":\"$fullname ($label komplett)\",\"category\":\"volltext\",\"license\":\"unlicense\",\"source\":\"github.com/bundestag/gesetze\"}" \
        "$label komplett (Bundestag-Repo)" || true
    fi
  done

  local after
  after=$(collection_count "$col")
  log "Collection $col: $before → $after chunks"
}

# =============================================================================
# PHASE C: EU legal texts → bp_compliance_ce
# =============================================================================
phase_eu() {
  log "=========================================="
  log "PHASE C: EU-Rechtstexte → bp_compliance_ce"
  log "=========================================="

  local col="bp_compliance_ce"
  local before
  before=$(collection_count "$col")
  log "Collection $col: $before chunks (before)"

  upload_file "$WORK_DIR/pdfs/dsa_2022_2065.pdf" "$col" "compliance_ce" "legal_reference" "2022" \
    '{"regulation_id":"eu_2022_2065","regulation_name_de":"Digital Services Act (DSA)","regulation_name_en":"Digital Services Act","regulation_short":"DSA","category":"plattformregulierung","celex":"32022R2065","source":"eur-lex","license":"public_law"}' \
    "Digital Services Act (EU) 2022/2065" || true

  upload_file "$WORK_DIR/pdfs/eprivacy_2002_58.pdf" "$col" "compliance_ce" "legal_reference" "2002" \
    '{"regulation_id":"eu_2002_58","regulation_name_de":"ePrivacy-Richtlinie","regulation_name_en":"ePrivacy Directive","regulation_short":"ePrivacy","category":"datenschutz","celex":"32002L0058","source":"eur-lex","license":"public_law"}' \
    "ePrivacy-Richtlinie 2002/58/EC" || true

  upload_file "$WORK_DIR/pdfs/scc_2021_914.pdf" "$col" "compliance_ce" "legal_reference" "2021" \
    '{"regulation_id":"eu_2021_914","regulation_name_de":"Standardvertragsklauseln (SCC)","regulation_name_en":"Standard Contractual Clauses","regulation_short":"SCC","category":"datentransfer","celex":"32021D0914","source":"eur-lex","license":"public_law"}' \
    "Standardvertragsklauseln (EU) 2021/914" || true

  local after
  after=$(collection_count "$col")
  log "Collection $col: $before → $after chunks"
}

# =============================================================================
# PHASE D: Templates / text building blocks → bp_legal_templates
# =============================================================================
# File lists are fed to the upload loops through process substitution rather
# than a pipeline, so that (a) the loops run in the current shell and the
# UPLOADED/FAILED/SKIPPED counters survive, and (b) a non-matching grep, an
# early-exiting head or a find on a missing directory cannot abort the run
# through pipefail + errexit.
phase_templates() {
  log "=========================================="
  log "PHASE D: Templates → bp_legal_templates"
  log "=========================================="

  local col="bp_legal_templates"
  local before
  before=$(collection_count "$col")
  log "Collection $col: $before chunks (before)"

  local f doc_name doc_type

  # --- D1: GitHub Site Policy (CC0) ---
  local repo="$WORK_DIR/repos/github-site-policy"
  if [[ -d "$repo" ]]; then
    log "--- GitHub Site Policy ---"
    while IFS= read -r f; do
      doc_name=$(basename "$f" .md)
      doc_type="policy"
      case "$doc_name" in
        *terms*|*tos*|*service*) doc_type="tos" ;;
        *privacy*|*data*) doc_type="privacy_policy" ;;
        *dmca*|*copyright*) doc_type="dmca" ;;
        *acceptable*|*use*) doc_type="acceptable_use" ;;
      esac
      upload_file "$f" "$col" "legal_template" "template" "2024" \
        "{\"source_id\":\"github_site_policy\",\"doc_type\":\"$doc_type\",\"license\":\"cc0\",\"source\":\"github.com/github/site-policy\",\"filename\":\"$(_escape_dq "$doc_name")\"}" \
        "GitHub: $doc_name" </dev/null || true
    done < <(find "$repo" -name "*.md" -not -path "*/.git/*" \
      -not -name "README.md" -not -name "CONTRIBUTING.md" | sort)
  fi

  # --- D2: OpenGov Site Policy (CC0) ---
  repo="$WORK_DIR/repos/opengov-site-policy"
  if [[ -d "$repo" ]]; then
    log "--- OpenGov Site Policy ---"
    while IFS= read -r f; do
      doc_name=$(basename "$f" .md)
      doc_type="policy"
      case "$doc_name" in
        *terms*|*tos*) doc_type="tos" ;;
        *privacy*) doc_type="privacy_policy" ;;
      esac
      upload_file "$f" "$col" "legal_template" "template" "2024" \
        "{\"source_id\":\"opengov_site_policy\",\"doc_type\":\"$doc_type\",\"license\":\"cc0\",\"source\":\"github.com/opengovfoundation/site-policy\",\"filename\":\"$(_escape_dq "$doc_name")\"}" \
        "OpenGov: $doc_name" </dev/null || true
    done < <(find "$repo" -name "*.md" -not -path "*/.git/*" \
      -not -name "README.md" -not -name "CONTRIBUTING.md" | sort)
  fi

  # --- D3: Creative Commons Legal Tools (CC0) ---
  repo="$WORK_DIR/repos/cc-legal-tools"
  if [[ -d "$repo" ]]; then
    log "--- CC Legal Tools (ausgewaehlte Lizenztexte) ---"
    local cc_meta='"source_id":"cc_legal_tools","doc_type":"license_text","license":"cc0","source":"github.com/creativecommons/cc-legal-tools-data"'
    local license_dir
    # Only the German legalcode directories of three licenses, max. 3 files each.
    for license_dir in "$repo"/legalcode/de/CC0_1.0 "$repo"/legalcode/de/CC-BY_4.0 "$repo"/legalcode/de/CC-BY-SA_4.0; do
      if [[ -d "$license_dir" ]]; then
        _upload_template_stream "$col" "CC License" "$cc_meta" \
          < <(find "$license_dir" -name "*.html" -o -name "*.txt" -o -name "*.md" 2>/dev/null | head -3)
      fi
    done
    # Additionally (always, not only as a fallback): German 4.0 legalcode
    # files near the top of the legalcode tree.
    _upload_template_stream "$col" "CC License" "$cc_meta" \
      < <(find "$repo"/legalcode -maxdepth 2 -name "*4.0*legalcode*de*" -type f 2>/dev/null | head -5)
  fi

  # --- D4: opr.vc GDPR sample texts (CC0) ---
  repo="$WORK_DIR/repos/oprvc"
  if [[ -d "$repo" ]]; then
    log "--- opr.vc DSGVO-Mustertexte ---"
    local oprvc_meta='"source_id":"oprvc","doc_type":"privacy_policy","license":"cc0","source":"github.com/oprvc/oprvc.github.io"'
    # Prefer files whose path hints at German privacy/GDPR content.
    local oprvc_matches
    oprvc_matches=$(find "$repo" \( -name "*.md" -o -name "*.html" -o -name "*.txt" \) \
      -not -path "*/.git/*" -not -name "README.md" 2>/dev/null \
      | grep -iE "(datenschutz|privacy|dsgvo|gdpr|impressum)" \
      | head -20) || true
    if [[ -n "$oprvc_matches" ]]; then
      _upload_template_stream "$col" "opr.vc" "$oprvc_meta" <<< "$oprvc_matches"
    else
      # Nothing specific found: fall back to the first Markdown files.
      _upload_template_stream "$col" "opr.vc" "$oprvc_meta" \
        < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" | head -10)
    fi
  fi

  # --- D5: webflorist/privacy-policy-text (MIT) ---
  repo="$WORK_DIR/repos/webflorist"
  if [[ -d "$repo" ]]; then
    log "--- webflorist Privacy Policy Text ---"
    # JSON/text building blocks (German)
    _upload_template_stream "$col" "webflorist" \
      '"source_id":"webflorist","doc_type":"privacy_policy","license":"mit","source":"github.com/webflorist/privacy-policy-text"' \
      < <(find "$repo" \( -name "*.json" -o -name "*.txt" -o -name "*.md" -o -name "*.php" \) \
        -not -path "*/.git/*" -not -path "*/node_modules/*" -not -name "package*.json" \
        -not -name "composer.json" -not -name "README.md" 2>/dev/null \
        | head -20)
  fi

  # --- D6: Tempest Privacy Policy Generator (MIT) ---
  repo="$WORK_DIR/repos/tempest-privacy"
  if [[ -d "$repo" ]]; then
    log "--- Tempest Privacy Policy Generator ---"
    _upload_template_stream "$col" "Tempest Privacy" \
      '"source_id":"tempest_privacy","doc_type":"privacy_policy","license":"mit","source":"github.com/Tempest-Solutions-Company/privacy-policy-generator"' \
      < <(find "$repo" \( -name "*.md" -o -name "*.txt" -o -name "*.html" -o -name "*.json" \) \
        -not -path "*/.git/*" -not -path "*/node_modules/*" \
        -not -name "package*.json" -not -name "README.md" 2>/dev/null \
        | head -15)
  fi

  # --- D7: Tempest Terms of Service Generator (MIT) ---
  repo="$WORK_DIR/repos/tempest-tos"
  if [[ -d "$repo" ]]; then
    log "--- Tempest Terms of Service Generator ---"
    _upload_template_stream "$col" "Tempest ToS" \
      '"source_id":"tempest_tos","doc_type":"tos","license":"mit","source":"github.com/Tempest-Solutions-Company/terms-of-service-generator"' \
      < <(find "$repo" \( -name "*.md" -o -name "*.txt" -o -name "*.html" -o -name "*.json" \) \
        -not -path "*/.git/*" -not -path "*/node_modules/*" \
        -not -name "package*.json" -not -name "README.md" 2>/dev/null \
        | head -15)
  fi

  # --- D8: Tempest Cookie Banner (MIT) ---
  repo="$WORK_DIR/repos/tempest-cookie"
  if [[ -d "$repo" ]]; then
    log "--- Tempest Cookie Banner ---"
    _upload_template_stream "$col" "Tempest Cookie" \
      '"source_id":"tempest_cookie","doc_type":"cookie_banner","license":"mit","source":"github.com/Tempest-Solutions-Company/cookie-banner-consent-solution"' \
      < <(find "$repo" \( -name "*.md" -o -name "*.txt" -o -name "*.html" -o -name "*.json" \) \
        -not -path "*/.git/*" -not -path "*/node_modules/*" \
        -not -name "package*.json" -not -name "README.md" 2>/dev/null \
        | head -15)
  fi

  # --- D9: CookieConsent (orestbida) - UI strings (MIT) ---
  repo="$WORK_DIR/repos/cookieconsent"
  if [[ -d "$repo" ]]; then
    log "--- CookieConsent UI Strings ---"
    local cookie_meta='"source_id":"cookieconsent","doc_type":"cookie_consent","license":"mit","source":"github.com/orestbida/cookieconsent"'
    # Translation/language files
    _upload_template_stream "$col" "CookieConsent" "$cookie_meta" \
      < <(find "$repo" -path "*/translations/*" -o -path "*/languages/*" \
        -o -path "*/i18n/*" -o -path "*/locales/*" 2>/dev/null \
        | grep -iE "\.(json|js|ts)$" | head -10)
    # Markdown files below a docs/ directory
    _upload_template_stream "$col" "CookieConsent Docs" "$cookie_meta" \
      < <(find "$repo" -name "*.md" -path "*/docs/*" 2>/dev/null | head -5)
  fi

  # --- D10: Common Paper (CC BY 4.0) ---
  log "--- Common Paper Standards ---"
  # Entries are "repo-dir:doc_type:label"
  local -a cp_repos=(
    "common-paper-csa:saas_contract:CSA"
    "common-paper-sla:sla:SLA"
    "common-paper-psa:psa:PSA"
  )
  local entry cp_dir rest cp_doc_type cp_label
  for entry in "${cp_repos[@]}"; do
    cp_dir="${entry%%:*}"
    rest="${entry#*:}"
    cp_doc_type="${rest%%:*}"
    cp_label="${rest#*:}"
    repo="$WORK_DIR/repos/$cp_dir"
    if [[ -d "$repo" ]]; then
      _upload_template_stream "$col" "CommonPaper $cp_label" \
        "\"source_id\":\"common_paper\",\"doc_type\":\"$cp_doc_type\",\"license\":\"cc_by_4\",\"attribution\":\"Common Paper Inc., licensed under CC BY 4.0\",\"source\":\"github.com/CommonPaper/$cp_label\"" \
        ".md" \
        < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" \
          -not -name "CONTRIBUTING.md" -not -name "CHANGELOG.md" \
          -not -name "CODE_OF_CONDUCT.md" 2>/dev/null \
          | head -10)
    fi
  done

  # --- D11: Data usage clauses (CC BY 4.0) ---
  repo="$WORK_DIR/repos/datennutzungsklauseln"
  if [[ -d "$repo" ]]; then
    log "--- Datennutzungsklauseln ---"
    _upload_template_stream "$col" "Datennutzungsklausel" \
      '"source_id":"datennutzungsklauseln","doc_type":"data_clause","license":"cc_by_4","attribution":"OpenCode.de, lizenziert unter CC BY 4.0","source":"gitlab.opencode.de/wernerth/datennutzungsklauseln-muster"' \
      ".md" \
      < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" 2>/dev/null \
        | head -15)
  fi

  local after
  after=$(collection_count "$col")
  log "Collection $col: $before → $after chunks"
}

# =============================================================================
# PHASE E: Data protection guidance → bp_compliance_datenschutz
# =============================================================================
# Uploads every edpb_*.pdf and edps_*.pdf found in $WORK_DIR/pdfs; the
# guideline name is derived from the file name (prefix removed, "_" → " ").
phase_datenschutz() {
  log "=========================================="
  log "PHASE E: Datenschutz-Guidance → bp_compliance_datenschutz"
  log "=========================================="

  local col="bp_compliance_datenschutz"
  local before
  before=$(collection_count "$col")
  log "Collection $col: $before chunks (before)"

  local pdf stem guideline_name guidance_name

  # EDPB Guidelines
  for pdf in "$WORK_DIR"/pdfs/edpb_*.pdf; do
    # An unmatched glob stays literal; the -f test filters it out.
    if [[ -f "$pdf" ]]; then
      stem=$(basename "$pdf" .pdf)
      guideline_name="${stem#edpb_}"
      guideline_name="${guideline_name//_/ }"
      upload_file "$pdf" "$col" "compliance_datenschutz" "guidance" "2024" \
        "{\"source_id\":\"edpb\",\"doc_type\":\"guidance\",\"guideline_name\":\"$(_escape_dq "$guideline_name")\",\"license\":\"reuse_notice\",\"attribution\":\"European Data Protection Board (EDPB)\",\"source\":\"edpb.europa.eu\"}" \
        "EDPB: $guideline_name" || true
    fi
  done

  # EDPS Guidance
  for pdf in "$WORK_DIR"/pdfs/edps_*.pdf; do
    if [[ -f "$pdf" ]]; then
      stem=$(basename "$pdf" .pdf)
      guidance_name="${stem#edps_}"
      guidance_name="${guidance_name//_/ }"
      upload_file "$pdf" "$col" "compliance_datenschutz" "guidance" "2024" \
        "{\"source_id\":\"edps\",\"doc_type\":\"guidance\",\"guidance_name\":\"$(_escape_dq "$guidance_name")\",\"license\":\"reuse_notice\",\"attribution\":\"European Data Protection Supervisor (EDPS)\",\"source\":\"edps.europa.eu\"}" \
        "EDPS: $guidance_name" || true
    fi
  done

  local after
  after=$(collection_count "$col")
  log "Collection $col: $before → $after chunks"
}

# =============================================================================
# PHASE F: Verification
# =============================================================================
#######################################
# Run one test search against the RAG API and print up to three hits.
# Prints " (search failed)" when the request or the result parsing fails.
# Arguments: $1 query text
#######################################
_search_probe() {
  local query="$1"
  local payload response

  payload="{\"query\":\"$(_escape_dq "$query")\",\"regulation_codes\":null,\"limit\":3,\"min_score\":0.5}"
  if ! response=$(curl "${CURL_LOCAL_OPTS[@]}" -X POST "$RAG_SEARCH_URL" \
    -H 'Content-Type: application/json' \
    -d "$payload" 2>/dev/null); then
    echo " (search failed)"
    return 0
  fi

  printf '%s' "$response" | python3 -c "
import sys,json
try:
    data = json.load(sys.stdin)
    results = data.get('results', [])
    print(f' Treffer: {len(results)}')
    for r in results[:3]:
        print(f' [{r.get(\"score\",0):.3f}] {r.get(\"regulation_code\",\"?\")} - {r.get(\"content\",\"\")[:80]}...')
except Exception:
    print(' (parse error)')
" 2>/dev/null || echo " (search failed)"
}

# Prints per-collection point counts and runs three sample searches.
phase_verify() {
  log "=========================================="
  log "PHASE F: Verifizierung"
  log "=========================================="

  echo ""
  echo "=== Collection Stats ==="
  local col count
  for col in bp_compliance_gesetze bp_compliance_ce bp_legal_templates bp_compliance_datenschutz; do
    count=$(collection_count "$col")
    printf " %-30s %s chunks\n" "$col" "$count"
  done

  echo ""
  echo "=== Test-Suchen ==="

  log "Suche: 'Impressumspflicht digitale Dienste' in bp_compliance_gesetze"
  _search_probe "Impressumspflicht digitale Dienste"

  log "Suche: 'Cookie Einwilligung' in bp_compliance_ce"
  _search_probe "Cookie Einwilligung ePrivacy"

  log "Suche: 'Privacy Policy Template GDPR' in bp_legal_templates"
  _search_probe "Privacy Policy Template GDPR"

  echo ""
}

# =============================================================================
# PHASE G: Corpus Version Registration
# =============================================================================
# Inserts one row per non-empty collection into compliance_corpus_versions.
# The version is "<YYYY-MM-DD>.<n>", n being the number of versions already
# registered today for that collection plus one.
# NOTE(review): documents_count is filled from the global UPLOADED counter,
# i.e. the total across all collections of this run (0 for `--only version`),
# not a per-collection figure — confirm whether that is intended.
phase_register_version() {
  log "=========================================="
  log "PHASE G: Corpus Version Registration"
  log "=========================================="

  local today
  today=$(date '+%Y-%m-%d')

  local col count existing_count seq version regs digest
  for col in bp_compliance_gesetze bp_compliance_ce bp_legal_templates bp_compliance_datenschutz; do
    count=$(collection_count "$col")

    # Skip unknown ("?"), empty or otherwise non-numeric counts; the value is
    # interpolated into SQL below.
    if [[ ! "$count" =~ ^[0-9]+$ || "$count" == "0" ]]; then
      warn "Skipping version for $col (count=$count)"
      continue
    fi

    # Determine next version number for today
    existing_count=$(psql "$DB_URL" -tAc \
      "SELECT COUNT(*) FROM compliance_corpus_versions WHERE collection_name='$col' AND version LIKE '${today}.%'" \
      2>/dev/null || echo "0")
    [[ "$existing_count" =~ ^[0-9]+$ ]] || existing_count=0
    seq=$((existing_count + 1))
    version="${today}.${seq}"

    # Regulation list per collection (Postgres array literal)
    regs=""
    case "$col" in
      bp_compliance_ce)
        regs='{eu_2022_2065,eu_2002_58,eu_2021_914}'
        ;;
      bp_compliance_gesetze)
        regs='{ddg_5,tdddg_25,urhg_5,egbgb_widerruf,bgb_komplett,urhg_komplett,tmg_komplett}'
        ;;
      bp_legal_templates)
        regs='{github_site_policy,opengov_site_policy,cc_legal_tools,common_paper,webflorist,tempest,cookieconsent}'
        ;;
      bp_compliance_datenschutz)
        regs='{edpb_consent,edpb_privacy_by_design,edpb_dark_patterns,edpb_social_media,edpb_cookie_banner,edps_generative_ai,edps_digital_ethics}'
        ;;
    esac

    # Digest: first 32 hex chars of the SHA-256 over Qdrant's collection info
    digest=$(curl -s "${QDRANT_URL}/collections/${col}" 2>/dev/null \
      | python3 -c "import sys,json,hashlib; d=json.load(sys.stdin); print(hashlib.sha256(json.dumps(d.get('result',{}), sort_keys=True).encode()).hexdigest()[:32])" \
      2>/dev/null || echo "")

    log "Registering version $version for $col ($count chunks)"

    if psql "$DB_URL" -c "
      INSERT INTO compliance_corpus_versions
        (version, collection_name, documents_count, chunks_count, regulations, digest, ingestion_source, created_by)
      VALUES
        ('${version}', '${col}', ${UPLOADED}, ${count}, '${regs}', '${digest}', 'ingest-legal-corpus.sh', 'system')
    " 2>/dev/null; then
      ok "Version $version registered for $col"
    else
      warn "Version registration failed for $col (DB not available?)"
    fi
  done
}

# =============================================================================
# MAIN
# =============================================================================
# Checks that the RAG API and Qdrant answer, runs the selected phase (or all
# phases in order) and prints the upload summary. Exits 1 when a service is
# unreachable, the phase name is unknown, or at least one upload failed.
main() {
  log "=========================================="
  log "BreakPilot Legal Corpus Ingestion"
  log "=========================================="
  log "Work dir: $WORK_DIR"
  log "RAG API: $RAG_URL"
  log "Qdrant: $QDRANT_URL"
  echo ""

  # Check RAG API is reachable: an empty POST is expected to produce a
  # response that mentions "file".
  local probe
  probe=$(curl "${CURL_LOCAL_OPTS[@]}" -X POST "$RAG_URL" 2>/dev/null) || probe=""
  if [[ "$probe" != *file* ]]; then
    fail "RAG API not reachable at $RAG_URL"
    exit 1
  fi
  ok "RAG API reachable"

  # Check Qdrant
  if ! curl -s "$QDRANT_URL/collections" >/dev/null 2>&1; then
    fail "Qdrant not reachable at $QDRANT_URL"
    exit 1
  fi
  ok "Qdrant reachable"
  echo ""

  # Run phases
  if [[ -n "$ONLY_PHASE" ]]; then
    case "$ONLY_PHASE" in
      download) phase_download ;;
      gesetze) phase_gesetze ;;
      eu) phase_eu ;;
      templates) phase_templates ;;
      datenschutz) phase_datenschutz ;;
      verify) phase_verify ;;
      version) phase_register_version ;;
      *)
        fail "Unknown phase: $ONLY_PHASE"
        exit 1
        ;;
    esac
  else
    if [[ "$SKIP_DOWNLOAD" != "true" ]]; then
      phase_download
    else
      log "Skipping download phase (--skip-download)"
    fi
    echo ""
    phase_gesetze
    echo ""
    phase_eu
    echo ""
    phase_templates
    echo ""
    phase_datenschutz
    echo ""
    phase_verify
    echo ""
    phase_register_version
  fi

  # Summary
  echo ""
  log "=========================================="
  log "ERGEBNIS"
  log "=========================================="
  log "Uploaded: $UPLOADED"
  log "Failed: $FAILED"
  log "Skipped: $SKIPPED"
  log "=========================================="

  if [[ $FAILED -gt 0 ]]; then
    warn "$FAILED uploads fehlgeschlagen!"
    exit 1
  fi

  ok "Ingestion abgeschlossen!"
}

main "$@"