- Add Qdrant dedup check in upload_file() — skip if regulation_id already exists - Split BGB (2.7MB) into 5 targeted parts via XML extraction: AGB §§305-310, Fernabsatz §§312-312k, Kaufrecht §§433-480, Widerruf §§355-361, Digitale Produkte §§327-327u - Lower large-file threshold 512KB→384KB (fixes GewO 432KB timeout) - Fix arithmetic syntax error when collection_count returns "?" - Replace EGBGB PDF (was empty) with XML extraction - Add unzip to Alpine container for XML archives Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1412 lines
59 KiB
Bash
Executable File
1412 lines
59 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# =============================================================================
|
|
# BreakPilot Compliance — RAG Legal Corpus Ingestion
|
|
#
|
|
# Laedt 23 freie Rechtsquellen herunter und ingestiert sie in Qdrant
|
|
# via die Core RAG-API (Port 8097).
|
|
#
|
|
# Ausfuehrung auf dem Mac Mini:
|
|
# ~/rag-ingestion/ingest-legal-corpus.sh [--skip-download] [--only PHASE]
|
|
#
|
|
# Phasen: download, gesetze, eu, templates, datenschutz, verbraucherschutz, verify, version
|
|
# =============================================================================
|
|
set -euo pipefail

# --- Configuration -----------------------------------------------------------
# Every setting below can be overridden from the environment; the text after
# ":-" is the default that is used when the variable is unset or empty.
WORK_DIR="${WORK_DIR:-$HOME/rag-ingestion}"
RAG_URL="${RAG_URL:-https://localhost:8097/api/v1/documents/upload}"
QDRANT_URL="${QDRANT_URL:-http://localhost:6333}"
SDK_URL="${SDK_URL:-https://localhost:8093}"
DB_URL="${DB_URL:-postgresql://localhost:5432/breakpilot?search_path=compliance,core,public}"
# curl option strings. They are expanded unquoted by their users so that they
# split into separate arguments.
CURL_OPTS="-sk --connect-timeout 10 --max-time 300"
CURL_OPTS_LARGE="-sk --connect-timeout 10 --max-time 900"

# Counters
UPLOADED=0
FAILED=0
SKIPPED=0

# --- CLI Args ----------------------------------------------------------------
SKIP_DOWNLOAD=false
ONLY_PHASE=""

#######################################
# Parse the command line into SKIP_DOWNLOAD and ONLY_PHASE.
# Globals:   SKIP_DOWNLOAD (written), ONLY_PHASE (written)
# Arguments: the script's command-line arguments
# Outputs:   usage text on stdout for -h/--help, error messages on stderr
# Returns:   0 on success; exits 0 after printing help and exits 1 on an
#            unknown option or when --only has no value
#######################################
parse_cli_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --skip-download)
        SKIP_DOWNLOAD=true
        shift
        ;;
      --only)
        # Without this check "$2" is unbound under `set -u` and the script
        # dies with an opaque "unbound variable" message.
        if [[ $# -lt 2 ]]; then
          echo "Option --only requires a PHASE argument" >&2
          exit 1
        fi
        ONLY_PHASE="$2"
        shift 2
        ;;
      -h|--help)
        echo "Usage: $0 [--skip-download] [--only PHASE]"
        echo "Phases: download, gesetze, eu, templates, datenschutz, verbraucherschutz, verify, version"
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
}

parse_cli_args "$@"
# The original inline loop consumed every argument; keep that post-condition.
set --
|
|
|
|
# --- Helpers -----------------------------------------------------------------
|
|
# Print an informational message to stdout, prefixed with the current HH:MM:SS.
log() { printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$*"; }
|
|
# Print a success message (check mark) to stdout, prefixed with HH:MM:SS.
ok() { printf '[%s] ✓ %s\n' "$(date '+%H:%M:%S')" "$*"; }
|
|
# Print a warning (warning sign) to stderr, prefixed with HH:MM:SS.
warn() { printf '[%s] ⚠ %s\n' "$(date '+%H:%M:%S')" "$*" >&2; }
|
|
# Print an error message (cross mark) to stderr, prefixed with HH:MM:SS.
fail() { printf '[%s] ✗ %s\n' "$(date '+%H:%M:%S')" "$*" >&2; }
|
|
|
|
#######################################
# Upload one document to the RAG API and update the run counters.
#
# The function never fails the script: every outcome (missing file, duplicate,
# file too small, upload error) is logged, counted and answered with status 0.
#
# Globals:
#   RAG_URL, CURL_OPTS, CURL_OPTS_LARGE (read)
#   QDRANT_URL (read; the dedup check is skipped when it is empty/unset)
#   UPLOADED, FAILED, SKIPPED (incremented)
# Arguments:
#   $1 - path of the file to upload
#   $2 - target collection
#   $3 - data_type form field
#   $4 - use_case form field
#   $5 - year form field
#   $6 - metadata as a JSON object string
#   $7 - label used in log output (default: basename of $1)
# Returns:
#   0 always
#######################################
upload_file() {
  local file="$1"
  local collection="$2"
  local data_type="$3"
  local use_case="$4"
  local year="$5"
  local metadata_json="$6"
  local label="${7:-$(basename "$file")}"

  if [[ ! -f "$file" ]]; then
    warn "File not found: $file"
    FAILED=$((FAILED + 1))
    return 0 # Don't abort script
  fi

  # Dedup check: skip the upload when a point with the same regulation_id is
  # already stored in the target Qdrant collection.
  local reg_id
  reg_id=$(printf '%s\n' "$metadata_json" | python3 -c "import sys,json; print(json.load(sys.stdin).get('regulation_id',''))" 2>/dev/null || echo "")
  if [[ -n "$reg_id" && -n "${QDRANT_URL:-}" ]]; then
    # Build the request body with json.dumps so that quotes or backslashes in
    # the id cannot produce an invalid (or different) filter.
    local filter_json
    filter_json=$(REG_ID="$reg_id" python3 -c 'import json,os; print(json.dumps({"filter":{"must":[{"key":"regulation_id","match":{"value":os.environ["REG_ID"]}}]},"limit":1}))' 2>/dev/null || echo "")
    local existing=0
    if [[ -n "$filter_json" ]]; then
      existing=$(curl -sk --max-time 5 -X POST "${QDRANT_URL}/collections/${collection}/points/scroll" \
        -H "Content-Type: application/json" \
        -d "$filter_json" \
        2>/dev/null | python3 -c "import sys,json; r=json.load(sys.stdin).get('result',{}); print(len(r.get('points',[])))" 2>/dev/null || echo "0")
    fi
    # Anything that is not a plain number (e.g. output of a half-failed
    # pipeline) counts as "not found", so the upload proceeds.
    if [[ "$existing" =~ ^[0-9]+$ && "$existing" -gt 0 ]]; then
      log "⏭ Skip (already in Qdrant): $label [regulation_id=$reg_id]"
      SKIPPED=$((SKIPPED + 1))
      return 0
    fi
  fi

  # BSD/macOS stat first, GNU stat second.
  local filesize
  filesize=$(stat -f%z "$file" 2>/dev/null || stat -c%s "$file" 2>/dev/null || echo 0)
  [[ "$filesize" =~ ^[0-9]+$ ]] || filesize=0
  if [[ "$filesize" -lt 100 ]]; then
    warn "File too small (${filesize}B), skipping: $label"
    SKIPPED=$((SKIPPED + 1))
    return 0 # Don't abort script
  fi

  log "Uploading: $label → $collection ($(( filesize / 1024 ))KB)"

  # Use the longer timeout for large files (more than 384000 bytes)
  local curl_opts="$CURL_OPTS"
  if [[ "$filesize" -gt 384000 ]]; then
    curl_opts="$CURL_OPTS_LARGE"
    log " (large file, using extended timeout)"
  fi

  # Literal fields go through --form-string: with -F curl would interpret a
  # leading '@' or '<' and any ';type=' inside the value instead of sending it.
  # $curl_opts is left unquoted on purpose so it splits into separate options.
  local response
  # shellcheck disable=SC2086
  response=$(curl $curl_opts -X POST "$RAG_URL" \
    -F "file=@${file}" \
    --form-string "collection=${collection}" \
    --form-string "data_type=${data_type}" \
    --form-string "use_case=${use_case}" \
    --form-string "year=${year}" \
    --form-string "chunk_strategy=recursive" \
    --form-string "chunk_size=512" \
    --form-string "chunk_overlap=50" \
    --form-string "metadata_json=${metadata_json}" \
    2>/dev/null) || true

  if echo "$response" | grep -q '"chunks_count"'; then
    local chunks
    chunks=$(echo "$response" | python3 -c "import sys,json; print(json.load(sys.stdin).get('chunks_count',0))" 2>/dev/null || echo "?")
    ok "$label → $chunks chunks"
    UPLOADED=$((UPLOADED + 1))
  elif echo "$response" | grep -q '"vectors_indexed"'; then
    local vectors
    vectors=$(echo "$response" | python3 -c "import sys,json; print(json.load(sys.stdin).get('vectors_indexed',0))" 2>/dev/null || echo "?")
    ok "$label → $vectors vectors"
    UPLOADED=$((UPLOADED + 1))
  else
    fail "Upload failed: $label"
    fail "Response: ${response:0:200}"
    FAILED=$((FAILED + 1))
    return 0 # Don't abort script on individual upload failure
  fi
}
|
|
|
|
#######################################
# Shallow-clone a git repository unless the target directory already exists.
# Globals:   none
# Arguments: $1 - repository URL
#            $2 - target directory
# Outputs:   progress via log; on failure a warning plus the last line of
#            git's error output (previously the cause was discarded)
# Returns:   0 if the directory exists or the clone succeeded, 1 otherwise
#######################################
clone_repo() {
  local url="$1"
  local target="$2"

  if [[ -d "$target" ]]; then
    log "Repo exists: $target (skipping clone)"
    return 0
  fi

  log "Cloning: $url"

  # Capture only stderr (fd 3 carries git's stdout through unchanged) so the
  # reason for a failed clone can be reported instead of being thrown away.
  local git_err=""
  if ! { git_err=$(git clone --depth 1 "$url" "$target" 2>&1 >&3); } 3>&1; then
    warn "Clone failed: $url"
    if [[ -n "$git_err" ]]; then
      warn "  git: ${git_err##*$'\n'}"
    fi
    return 1
  fi
}
|
|
|
|
#######################################
# Download a URL to a local file unless a non-empty copy already exists.
#
# The data is written to "<target>.part" and only renamed to <target> after
# curl succeeded. A failed or interrupted transfer therefore no longer leaves
# a partial/empty file behind that the next run would mistake for a finished
# download.
#
# Globals:   CURL_OPTS (read)
# Arguments: $1 - source URL
#            $2 - target file path
# Returns:   0 if the file exists or was downloaded, 1 if curl failed
#######################################
download_pdf() {
  local url="$1"
  local target="$2"

  # -s instead of -f: an empty leftover from an earlier failed run is
  # downloaded again instead of being reported as present.
  if [[ -s "$target" ]]; then
    log "PDF exists: $(basename "$target") (skipping)"
    return 0
  fi

  log "Downloading: $(basename "$target")"

  local partial="${target}.part"
  # $CURL_OPTS is left unquoted on purpose so it splits into separate options.
  # shellcheck disable=SC2086
  if ! curl $CURL_OPTS -L "$url" -o "$partial" 2>/dev/null; then
    rm -f -- "$partial"
    warn "Download failed: $url"
    return 1
  fi

  # curl may not create the output file for an empty response body.
  if [[ -e "$partial" ]]; then
    mv -f -- "$partial" "$target"
  fi
}
|
|
|
|
#######################################
# Fetch an HTML page from gesetze-im-internet.de and store its text content.
#
# The page is decoded as ISO-8859-1 and run through a small html.parser based
# extractor that drops script/style/nav/header/footer content and inserts a
# newline after block-level closing tags.
#
# The text is written to "<output>.part" and renamed only after the whole
# pipeline succeeded, so a failed download no longer leaves an empty output
# file that later runs would skip as "Text exists".
#
# Globals:   CURL_OPTS (read)
# Arguments: $1 - page URL
#            $2 - output text file
#            $3 - label for log messages
# Returns:   0 if the text exists or was extracted, 1 on failure
#            (detecting a curl failure relies on `set -o pipefail`, which the
#            script enables at the top)
#######################################
extract_gesetz_html() {
  local url="$1"
  local output="$2"
  local label="$3"

  # -s: an empty leftover from an earlier failed run is extracted again.
  if [[ -s "$output" ]]; then
    log "Text exists: $(basename "$output") (skipping)"
    return 0
  fi

  log "Extracting: $label from gesetze-im-internet.de"

  local partial="${output}.part"
  # $CURL_OPTS is left unquoted on purpose so it splits into separate options.
  # shellcheck disable=SC2086
  curl $CURL_OPTS -L "$url" 2>/dev/null \
    | python3 -c "
import sys, codecs

# gesetze-im-internet.de uses ISO-8859-1 encoding
sys.stdin = codecs.getreader('iso-8859-1')(sys.stdin.buffer)

from html.parser import HTMLParser

SKIPPED_TAGS = ('script', 'style', 'nav', 'header', 'footer')
BLOCK_TAGS = ('p', 'div', 'br', 'h1', 'h2', 'h3', 'h4', 'li')


class TextExtractor(HTMLParser):
    def __init__(self):
        super().__init__()
        self.text = []
        # NOTE(review): in_content is recorded but never consulted, so text
        # outside the jnhtml container is extracted as well - confirm intent.
        self.in_content = False
        self.skip = False

    def handle_starttag(self, tag, attrs):
        attrs_dict = dict(attrs)
        # A valueless class attribute is reported as None; treat it as empty.
        if tag == 'div' and 'jnhtml' in (attrs_dict.get('class') or ''):
            self.in_content = True
        if tag in SKIPPED_TAGS:
            self.skip = True

    def handle_endtag(self, tag):
        if tag in SKIPPED_TAGS:
            self.skip = False
        if tag in BLOCK_TAGS:
            self.text.append('\n')

    def handle_data(self, data):
        if not self.skip:
            self.text.append(data)


parser = TextExtractor()
parser.feed(sys.stdin.read())
print(''.join(parser.text).strip())
" > "$partial" || {
    rm -f -- "$partial"
    warn "Extraction failed: $label"
    return 1
  }

  mv -f -- "$partial" "$output"
}
|
|
|
|
# Build one text file for a law from the Markdown files of its directory in
# the bundestag/gesetze checkout.
#   $1 - directory holding the law's *.md files (searched recursively)
#   $2 - output file (overwritten)
#   $3 - label, written as the "# ..." heading on the first line
# A missing directory only produces a warning; the function still returns 0.
concat_bundestag_gesetz() {
  local gesetz_dir="$1" output="$2" label="$3"

  [[ -d "$gesetz_dir" ]] || {
    warn "Gesetz directory not found: $gesetz_dir"
    return 0
  }

  log "Concatenating: $label"

  {
    printf '%s\n\n' "# $label"
    # The file list is ordered by a plain sort of the paths; every file is
    # followed by a blank line, a "---" separator and another blank line.
    find "$gesetz_dir" -name "*.md" -type f | sort | while read -r md_file; do
      cat "$md_file"
      printf '\n---\n\n'
    done
  } > "$output"
}
|
|
|
|
#######################################
# Print the number of points stored in a Qdrant collection.
#
# Exactly one token is written to stdout: the count, or "?" when Qdrant cannot
# be reached, answers with something unexpected, or reports a non-numeric
# value. The request is bounded by a timeout so an unresponsive Qdrant cannot
# hang the run.
#
# Globals:   QDRANT_URL (read)
# Arguments: $1 - collection name
# Returns:   0 always
#######################################
collection_count() {
  local col="$1"
  local count=""

  count=$(curl -s --connect-timeout 10 --max-time 30 "${QDRANT_URL}/collections/${col}" 2>/dev/null \
    | python3 -c "import sys,json; print(json.load(sys.stdin)['result']['points_count'])" 2>/dev/null) || count=""

  if [[ "$count" =~ ^[0-9]+$ ]]; then
    echo "$count"
  else
    echo "?"
  fi
}
|
|
|
|
# =============================================================================
|
|
# PHASE A: Downloads
|
|
# =============================================================================
|
|
# Phase A: fetch all raw sources into "$WORK_DIR" (PDFs, single-paragraph law
# texts and git repositories) and then build one concatenated text file per
# law found in the bundestag/gesetze checkout.
#
# Sources are listed in "a|b|c" tables and processed in order. Calls without
# "|| true" stop the script on failure (set -e); the optional repositories are
# guarded with "|| true".
phase_download() {
  local spec rest

  log "=========================================="
  log "PHASE A: Downloads (PDFs + Git-Repos)"
  log "=========================================="

  mkdir -p "$WORK_DIR"/{pdfs,repos,texts}

  # --- A1: EUR-Lex PDFs ---
  log "--- EUR-Lex PDFs ---"
  # url|file name below $WORK_DIR/pdfs
  local -a eurlex_pdfs=(
    "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32022R2065|dsa_2022_2065.pdf"
    "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32002L0058|eprivacy_2002_58.pdf"
    "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32021D0914|scc_2021_914.pdf"
  )
  for spec in "${eurlex_pdfs[@]}"; do
    download_pdf "${spec%%|*}" "$WORK_DIR/pdfs/${spec##*|}"
  done

  # --- A2: German laws (single paragraphs) ---
  log "--- Deutsche Gesetze (Einzelparagraphen) ---"
  # url|file name below $WORK_DIR/texts|label
  # The TDDDG page is fetched from the "ttdsg" path on gesetze-im-internet.de.
  # EGBGB Art. 246a § 1 contains the reference to the model withdrawal notice.
  local -a law_pages=(
    "https://www.gesetze-im-internet.de/ddg/__5.html|ddg_5.txt|DDG § 5 (Impressum)"
    "https://www.gesetze-im-internet.de/ttdsg/__25.html|tdddg_25.txt|TDDDG § 25 (Cookies)"
    "https://www.gesetze-im-internet.de/urhg/__5.html|urhg_5.txt|UrhG § 5 (Amtliche Werke)"
    "https://www.gesetze-im-internet.de/bgbeg/art_246a__1.html|egbgb_widerruf.txt|EGBGB Muster-Widerrufsbelehrung"
  )
  for spec in "${law_pages[@]}"; do
    rest="${spec#*|}"
    extract_gesetz_html "${spec%%|*}" "$WORK_DIR/texts/${rest%%|*}" "${rest#*|}"
  done

  # --- A3: Git repositories ---
  log "--- Git-Repos ---"
  # url|directory name below $WORK_DIR/repos
  # A failed clone of one of these stops the script.
  local -a required_repos=(
    "https://github.com/bundestag/gesetze.git|bundestag-gesetze"
    "https://github.com/github/site-policy.git|github-site-policy"
    "https://github.com/opengovfoundation/site-policy.git|opengov-site-policy"
    "https://github.com/creativecommons/cc-legal-tools-data.git|cc-legal-tools"
    "https://github.com/oprvc/oprvc.github.io.git|oprvc"
    "https://github.com/webflorist/privacy-policy-text.git|webflorist"
  )
  for spec in "${required_repos[@]}"; do
    clone_repo "${spec%%|*}" "$WORK_DIR/repos/${spec##*|}"
  done

  # Clone failures of these are tolerated. CommonPaper keeps one repository
  # per contract type; the OpenCode.de clauses are fetched over HTTPS.
  local -a optional_repos=(
    "https://github.com/Tempest-Solutions-Company/privacy-policy-generator.git|tempest-privacy"
    "https://github.com/Tempest-Solutions-Company/terms-of-service-generator.git|tempest-tos"
    "https://github.com/Tempest-Solutions-Company/cookie-banner-consent-solution.git|tempest-cookie"
    "https://github.com/orestbida/cookieconsent.git|cookieconsent"
    "https://github.com/CommonPaper/CSA.git|common-paper-csa"
    "https://github.com/CommonPaper/SLA.git|common-paper-sla"
    "https://github.com/CommonPaper/PSA.git|common-paper-psa"
    "https://gitlab.opencode.de/wernerth/datennutzungsklauseln-muster.git|datennutzungsklauseln"
  )
  for spec in "${optional_repos[@]}"; do
    clone_repo "${spec%%|*}" "$WORK_DIR/repos/${spec##*|}" || true
  done

  # --- A4: EDPB/EDPS PDFs ---
  log "--- EDPB/EDPS Guidance PDFs ---"
  # url|file name below $WORK_DIR/pdfs, in this order:
  #   EDPB Guidelines 05/2020 on Consent
  #   EDPB Guidelines 4/2019 Data Protection by Design and Default
  #   EDPB Guidelines 03/2022 Dark Patterns
  #   EDPB Guidelines 8/2020 Social Media Targeting
  #   EDPB Cookie Banner Taskforce Report (Jan 2023)
  #   EDPB Guidelines 2/2023 ePrivacy Art. 5(3) Technical Scope
  #   EDPB Guidelines 1/2024 Legitimate Interest
  #   EDPB DPO Coordinated Enforcement Report 2024
  #   EDPS GenAI Orientations (June 2024)
  #   EDPS Digital Ethics Report (2018)
  local -a guidance_pdfs=(
    "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_202005_consent_en.pdf|edpb_consent_guidelines.pdf"
    "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_201904_dataprotection_by_design_and_by_default_v2.0_en.pdf|edpb_privacy_by_design.pdf"
    "https://www.edpb.europa.eu/system/files/2023-02/edpb_03-2022_guidelines_on_deceptive_design_patterns_in_social_media_platform_interfaces_v2_en_0.pdf|edpb_dark_patterns.pdf"
    "https://www.edpb.europa.eu/system/files/2021-04/edpb_guidelines_082020_on_the_targeting_of_social_media_users_en.pdf|edpb_social_media_targeting.pdf"
    "https://www.edpb.europa.eu/system/files/2023-01/edpb_20230118_report_cookie_banner_taskforce_en.pdf|edpb_cookie_banner_taskforce.pdf"
    "https://www.edpb.europa.eu/system/files/2024-10/edpb_guidelines_202302_technical_scope_art_53_eprivacydirective_v2_en_0.pdf|edpb_eprivacy_art53.pdf"
    "https://www.edpb.europa.eu/system/files/2024-10/edpb_guidelines_202401_legitimateinterest_en.pdf|edpb_legitimate_interest.pdf"
    "https://www.edpb.europa.eu/system/files/2024-01/edpb_report_20240116_cef_dpo_en.pdf|edpb_dpo_report.pdf"
    "https://www.edps.europa.eu/system/files/2024-06/24-06-03_genai_orientations_en.pdf|edps_generative_ai.pdf"
    "https://edps.europa.eu/sites/edp/files/publication/18-01-25_eag_report_en.pdf|edps_digital_ethics.pdf"
  )
  for spec in "${guidance_pdfs[@]}"; do
    download_pdf "${spec%%|*}" "$WORK_DIR/pdfs/${spec##*|}"
  done

  # --- A5: text extraction from the repositories ---
  log "--- Text-Extraktion aus Repos ---"
  # bundestag/gesetze: the repository is partly outdated. DDG, TDDDG and EGBGB
  # are missing there; only BGB, UrhG and TMG are taken from it.
  # Entries are "path inside the repo:label".
  local -a bundestag_gesetze=("b/bgb:BGB" "u/urhg:UrhG" "t/tmg:TMG")
  local law_path law_label law_dir law_slug
  for entry in "${bundestag_gesetze[@]}"; do
    law_path="${entry%%:*}"
    law_label="${entry##*:}"
    law_dir="$WORK_DIR/repos/bundestag-gesetze/$law_path"
    if [[ ! -d "$law_dir" ]]; then
      warn "Bundestag Gesetz nicht gefunden: $law_dir"
      continue
    fi
    # The output file name uses the lower-cased label.
    law_slug=$(echo "$law_label" | tr '[:upper:]' '[:lower:]')
    concat_bundestag_gesetz "$law_dir" \
      "$WORK_DIR/texts/bundestag_${law_slug}_komplett.txt" \
      "$law_label (komplett)"
  done

  log "Download phase complete."
}
|
|
|
|
# =============================================================================
|
|
# PHASE B: Deutsche Gesetze → bp_compliance_gesetze
|
|
# =============================================================================
|
|
# Phase B: upload the German law texts into the collection
# "bp_compliance_gesetze" and log the collection's point count before and
# after. Each upload is delegated to upload_file.
phase_gesetze() {
  log "=========================================="
  log "PHASE B: Deutsche Gesetze → bp_compliance_gesetze"
  log "=========================================="

  local col="bp_compliance_gesetze"
  local before after
  before=$(collection_count "$col")
  log "Collection $col: $before chunks (before)"

  # B1: single paragraphs — "file below $WORK_DIR/texts|metadata JSON|label"
  local -a paragraphs=(
    'ddg_5.txt|{"regulation_id":"ddg_5","regulation_name_de":"Digitale-Dienste-Gesetz § 5","category":"impressum","license":"public_law","source":"gesetze-im-internet.de"}|DDG § 5 (Impressumspflicht)'
    'tdddg_25.txt|{"regulation_id":"tdddg_25","regulation_name_de":"TDDDG § 25","category":"cookies","license":"public_law","source":"gesetze-im-internet.de"}|TDDDG § 25 (Cookies/Endgeraetezugriff)'
    'urhg_5.txt|{"regulation_id":"urhg_5","regulation_name_de":"UrhG § 5","category":"urheberrecht","license":"public_law","source":"gesetze-im-internet.de"}|UrhG § 5 (Amtliche Werke)'
    'egbgb_widerruf.txt|{"regulation_id":"egbgb_widerruf","regulation_name_de":"EGBGB Muster-Widerrufsbelehrung","category":"widerruf","license":"public_law","source":"gesetze-im-internet.de"}|EGBGB Muster-Widerrufsbelehrung'
  )
  local spec text_name meta title
  for spec in "${paragraphs[@]}"; do
    IFS='|' read -r text_name meta title <<< "$spec"
    upload_file "$WORK_DIR/texts/$text_name" "$col" "compliance" "legal_reference" "2024" \
      "$meta" \
      "$title"
  done

  # B2: complete laws from the bundestag/gesetze repository —
  # "file key:short label:full name". Files that were not produced are skipped.
  local -a bundestag_upload=(
    "bgb:BGB:Buergerliches Gesetzbuch"
    "urhg:UrhG:Urheberrechtsgesetz"
    "tmg:TMG:Telemediengesetz"
  )
  local law_key short_label full_name law_file
  for entry in "${bundestag_upload[@]}"; do
    IFS=':' read -r law_key short_label full_name <<< "$entry"
    law_file="$WORK_DIR/texts/bundestag_${law_key}_komplett.txt"
    [[ -f "$law_file" ]] || continue
    upload_file "$law_file" "$col" "compliance" "legal_reference" "2024" \
      "{\"regulation_id\":\"${law_key}_komplett\",\"regulation_name_de\":\"$full_name ($short_label komplett)\",\"category\":\"volltext\",\"license\":\"unlicense\",\"source\":\"github.com/bundestag/gesetze\"}" \
      "$short_label komplett (Bundestag-Repo)"
  done

  after=$(collection_count "$col")
  log "Collection $col: $before → $after chunks"
}
|
|
|
|
# =============================================================================
|
|
# PHASE C: EU-Rechtstexte → bp_compliance_ce
|
|
# =============================================================================
|
|
# Phase C: upload the EUR-Lex PDFs into the collection "bp_compliance_ce" and
# log the collection's point count before and after.
phase_eu() {
  log "=========================================="
  log "PHASE C: EU-Rechtstexte → bp_compliance_ce"
  log "=========================================="

  local col="bp_compliance_ce"
  local before after
  before=$(collection_count "$col")
  log "Collection $col: $before chunks (before)"

  # "file below $WORK_DIR/pdfs|year|metadata JSON|label"
  local -a eu_acts=(
    'dsa_2022_2065.pdf|2022|{"regulation_id":"eu_2022_2065","regulation_name_de":"Digital Services Act (DSA)","regulation_name_en":"Digital Services Act","regulation_short":"DSA","category":"plattformregulierung","celex":"32022R2065","source":"eur-lex","license":"public_law"}|Digital Services Act (EU) 2022/2065'
    'eprivacy_2002_58.pdf|2002|{"regulation_id":"eu_2002_58","regulation_name_de":"ePrivacy-Richtlinie","regulation_name_en":"ePrivacy Directive","regulation_short":"ePrivacy","category":"datenschutz","celex":"32002L0058","source":"eur-lex","license":"public_law"}|ePrivacy-Richtlinie 2002/58/EC'
    'scc_2021_914.pdf|2021|{"regulation_id":"eu_2021_914","regulation_name_de":"Standardvertragsklauseln (SCC)","regulation_name_en":"Standard Contractual Clauses","regulation_short":"SCC","category":"datentransfer","celex":"32021D0914","source":"eur-lex","license":"public_law"}|Standardvertragsklauseln (EU) 2021/914'
  )
  local spec pdf_name act_year meta title
  for spec in "${eu_acts[@]}"; do
    IFS='|' read -r pdf_name act_year meta title <<< "$spec"
    upload_file "$WORK_DIR/pdfs/$pdf_name" "$col" "compliance_ce" "legal_reference" "$act_year" \
      "$meta" \
      "$title"
  done

  after=$(collection_count "$col")
  log "Collection $col: $before → $after chunks"
}
|
|
|
|
# =============================================================================
|
|
# PHASE D: Templates/Textbausteine → bp_legal_templates
|
|
# =============================================================================
|
|
#######################################
# Upload every file whose path arrives on stdin (one path per line) as a
# legal template.
#
# The loop runs in the calling shell (callers feed it through process
# substitution), so the counters updated by upload_file are kept.
#
# Arguments:
#   $1 - target collection
#   $2 - metadata JSON object WITHOUT the closing brace; the helper appends
#        ,"filename":"<name>"}
#   $3 - label prefix; the label becomes "<prefix>: <name>"
#   $4 - optional suffix that is stripped from the file name (e.g. ".md")
#######################################
_upload_template_files() {
  local col="$1"
  local meta_head="$2"
  local label_prefix="$3"
  local strip_suffix="${4:-}"
  local f name

  while IFS= read -r f; do
    if [[ -n "$strip_suffix" ]]; then
      name=$(basename "$f" "$strip_suffix")
    else
      name=$(basename "$f")
    fi
    # stdin is detached so nothing inside upload_file can eat the path list.
    upload_file "$f" "$col" "legal_template" "template" "2024" \
      "${meta_head},\"filename\":\"$name\"}" \
      "${label_prefix}: $name" < /dev/null
  done
}

#######################################
# Phase D: upload template and text-module sources from the cloned
# repositories into the collection "bp_legal_templates".
#
# Every file list is read through process substitution instead of
# "find ... | while read". This fixes two defects of the piped form:
#   * the loop ran in a subshell, so the UPLOADED/FAILED/SKIPPED increments
#     made by upload_file were lost;
#   * under `set -eo pipefail` a grep without matches, a `head` that closed
#     the pipe early, or a failing find aborted the whole script — which also
#     made the opr.vc fallback unreachable.
#
# Globals:   WORK_DIR (read); counters are updated through upload_file
# Arguments: none
#######################################
phase_templates() {
  log "=========================================="
  log "PHASE D: Templates → bp_legal_templates"
  log "=========================================="

  local col="bp_legal_templates"
  local before
  before=$(collection_count "$col")
  log "Collection $col: $before chunks (before)"

  local f name doc_type

  # --- D1: GitHub Site Policy (CC0) ---
  local repo="$WORK_DIR/repos/github-site-policy"
  if [[ -d "$repo" ]]; then
    log "--- GitHub Site Policy ---"
    while IFS= read -r f; do
      name=$(basename "$f" .md)
      # The document type is derived from the file name; first match wins.
      doc_type="policy"
      case "$name" in
        *terms*|*tos*|*service*) doc_type="tos" ;;
        *privacy*|*data*) doc_type="privacy_policy" ;;
        *dmca*|*copyright*) doc_type="dmca" ;;
        *acceptable*|*use*) doc_type="acceptable_use" ;;
      esac
      upload_file "$f" "$col" "legal_template" "template" "2024" \
        "{\"source_id\":\"github_site_policy\",\"doc_type\":\"$doc_type\",\"license\":\"cc0\",\"source\":\"github.com/github/site-policy\",\"filename\":\"$name\"}" \
        "GitHub: $name" < /dev/null
    done < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" -not -name "CONTRIBUTING.md" | sort)
  fi

  # --- D2: OpenGov Site Policy (CC0) ---
  repo="$WORK_DIR/repos/opengov-site-policy"
  if [[ -d "$repo" ]]; then
    log "--- OpenGov Site Policy ---"
    while IFS= read -r f; do
      name=$(basename "$f" .md)
      doc_type="policy"
      case "$name" in
        *terms*|*tos*) doc_type="tos" ;;
        *privacy*) doc_type="privacy_policy" ;;
      esac
      upload_file "$f" "$col" "legal_template" "template" "2024" \
        "{\"source_id\":\"opengov_site_policy\",\"doc_type\":\"$doc_type\",\"license\":\"cc0\",\"source\":\"github.com/opengovfoundation/site-policy\",\"filename\":\"$name\"}" \
        "OpenGov: $name" < /dev/null
    done < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" -not -name "CONTRIBUTING.md" | sort)
  fi

  # --- D3: Creative Commons Legal Tools (CC0) ---
  repo="$WORK_DIR/repos/cc-legal-tools"
  if [[ -d "$repo" ]]; then
    log "--- CC Legal Tools (ausgewaehlte Lizenztexte) ---"
    local cc_meta='{"source_id":"cc_legal_tools","doc_type":"license_text","license":"cc0","source":"github.com/creativecommons/cc-legal-tools-data"'
    # Only the German legalcode directories of three licenses, at most three
    # files each.
    local license_dir
    for license_dir in "$repo"/legalcode/de/CC0_1.0 "$repo"/legalcode/de/CC-BY_4.0 "$repo"/legalcode/de/CC-BY-SA_4.0; do
      if [[ -d "$license_dir" ]]; then
        _upload_template_files "$col" "$cc_meta" "CC License" \
          < <(find "$license_dir" -name "*.html" -o -name "*.txt" -o -name "*.md" 2>/dev/null | head -3)
      fi
    done
    # Additionally: German 4.0 legalcode files near the top of legalcode/.
    _upload_template_files "$col" "$cc_meta" "CC License" \
      < <(find "$repo"/legalcode -maxdepth 2 -name "*4.0*legalcode*de*" -type f 2>/dev/null | head -5)
  fi

  # --- D4: opr.vc GDPR sample texts (CC0) ---
  repo="$WORK_DIR/repos/oprvc"
  if [[ -d "$repo" ]]; then
    log "--- opr.vc DSGVO-Mustertexte ---"
    local oprvc_meta='{"source_id":"oprvc","doc_type":"privacy_policy","license":"cc0","source":"github.com/oprvc/oprvc.github.io"'
    # Files whose path mentions privacy/GDPR/imprint terms (case-insensitive).
    _upload_template_files "$col" "$oprvc_meta" "opr.vc" \
      < <(find "$repo" \( -name "*.md" -o -name "*.html" -o -name "*.txt" \) \
            -not -path "*/.git/*" -not -name "README.md" 2>/dev/null \
            | grep -iE "(datenschutz|privacy|dsgvo|gdpr|impressum)" \
            | head -20)
    # If no specific files were found, fall back to the first Markdown files.
    if [[ $(find "$repo" \( -name "*.md" -o -name "*.html" \) -not -path "*/.git/*" -not -name "README.md" | grep -ciE "(datenschutz|privacy|dsgvo|gdpr)" 2>/dev/null) -eq 0 ]]; then
      _upload_template_files "$col" "$oprvc_meta" "opr.vc" \
        < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" | head -10)
    fi
  fi

  # --- D5: webflorist/privacy-policy-text (MIT) ---
  repo="$WORK_DIR/repos/webflorist"
  if [[ -d "$repo" ]]; then
    log "--- webflorist Privacy Policy Text ---"
    # JSON/text building blocks, excluding package metadata and dependencies.
    _upload_template_files "$col" \
      '{"source_id":"webflorist","doc_type":"privacy_policy","license":"mit","source":"github.com/webflorist/privacy-policy-text"' \
      "webflorist" \
      < <(find "$repo" \( -name "*.json" -o -name "*.txt" -o -name "*.md" -o -name "*.php" \) \
            -not -path "*/.git/*" -not -path "*/node_modules/*" -not -name "package*.json" \
            -not -name "composer.json" -not -name "README.md" 2>/dev/null \
            | head -20)
  fi

  # --- D6-D8: Tempest generators (MIT) ---
  # "repo dir|source_id|doc_type|GitHub repo name|label prefix|log title"
  local -a tempest_repos=(
    "tempest-privacy|tempest_privacy|privacy_policy|privacy-policy-generator|Tempest Privacy|Tempest Privacy Policy Generator"
    "tempest-tos|tempest_tos|tos|terms-of-service-generator|Tempest ToS|Tempest Terms of Service Generator"
    "tempest-cookie|tempest_cookie|cookie_banner|cookie-banner-consent-solution|Tempest Cookie|Tempest Cookie Banner"
  )
  local spec t_dir t_source_id t_doc_type t_repo_name t_label t_title
  for spec in "${tempest_repos[@]}"; do
    IFS='|' read -r t_dir t_source_id t_doc_type t_repo_name t_label t_title <<< "$spec"
    repo="$WORK_DIR/repos/$t_dir"
    if [[ -d "$repo" ]]; then
      log "--- $t_title ---"
      _upload_template_files "$col" \
        "{\"source_id\":\"$t_source_id\",\"doc_type\":\"$t_doc_type\",\"license\":\"mit\",\"source\":\"github.com/Tempest-Solutions-Company/$t_repo_name\"" \
        "$t_label" \
        < <(find "$repo" \( -name "*.md" -o -name "*.txt" -o -name "*.html" -o -name "*.json" \) \
              -not -path "*/.git/*" -not -path "*/node_modules/*" \
              -not -name "package*.json" -not -name "README.md" 2>/dev/null \
              | head -15)
    fi
  done

  # --- D9: CookieConsent (orestbida) - UI strings (MIT) ---
  repo="$WORK_DIR/repos/cookieconsent"
  if [[ -d "$repo" ]]; then
    log "--- CookieConsent UI Strings ---"
    local cookie_meta='{"source_id":"cookieconsent","doc_type":"cookie_consent","license":"mit","source":"github.com/orestbida/cookieconsent"'
    # Translation/language files.
    _upload_template_files "$col" "$cookie_meta" "CookieConsent" \
      < <(find "$repo" -path "*/translations/*" -o -path "*/languages/*" -o -path "*/i18n/*" -o -path "*/locales/*" 2>/dev/null \
            | grep -iE '\.(json|js|ts)$' | head -10)
    # Markdown files below docs/.
    _upload_template_files "$col" "$cookie_meta" "CookieConsent Docs" \
      < <(find "$repo" -name "*.md" -path "*/docs/*" 2>/dev/null | head -5)
  fi

  # --- D10: Common Paper (CC BY 4.0) ---
  log "--- Common Paper Standards ---"
  # "repo dir:doc_type:label"
  local -a cp_repos=(
    "common-paper-csa:saas_contract:CSA"
    "common-paper-sla:sla:SLA"
    "common-paper-psa:psa:PSA"
  )
  local entry cp_dir cp_doc_type cp_label
  for entry in "${cp_repos[@]}"; do
    IFS=':' read -r cp_dir cp_doc_type cp_label <<< "$entry"
    repo="$WORK_DIR/repos/$cp_dir"
    if [[ -d "$repo" ]]; then
      _upload_template_files "$col" \
        "{\"source_id\":\"common_paper\",\"doc_type\":\"$cp_doc_type\",\"license\":\"cc_by_4\",\"attribution\":\"Common Paper Inc., licensed under CC BY 4.0\",\"source\":\"github.com/CommonPaper/$cp_label\"" \
        "CommonPaper $cp_label" ".md" \
        < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" \
              -not -name "CONTRIBUTING.md" -not -name "CHANGELOG.md" -not -name "CODE_OF_CONDUCT.md" 2>/dev/null \
              | head -10)
    fi
  done

  # --- D11: Datennutzungsklauseln (CC BY 4.0) ---
  repo="$WORK_DIR/repos/datennutzungsklauseln"
  if [[ -d "$repo" ]]; then
    log "--- Datennutzungsklauseln ---"
    _upload_template_files "$col" \
      '{"source_id":"datennutzungsklauseln","doc_type":"data_clause","license":"cc_by_4","attribution":"OpenCode.de, lizenziert unter CC BY 4.0","source":"gitlab.opencode.de/wernerth/datennutzungsklauseln-muster"' \
      "Datennutzungsklausel" ".md" \
      < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" 2>/dev/null \
            | head -15)
  fi

  local after
  after=$(collection_count "$col")
  log "Collection $col: $before → $after chunks"
}
|
|
|
|
# =============================================================================
|
|
# PHASE E: Datenschutz-Guidance → bp_compliance_datenschutz
|
|
# =============================================================================
|
|
# Phase E: upload the downloaded EDPB and EDPS guidance PDFs into the
# collection "bp_compliance_datenschutz" and log the collection's point count
# before and after. The human-readable title of a document is its file name
# without the publisher prefix and ".pdf", with underscores turned into spaces.
phase_datenschutz() {
  log "=========================================="
  log "PHASE E: Datenschutz-Guidance → bp_compliance_datenschutz"
  log "=========================================="

  local col="bp_compliance_datenschutz"
  local before after
  before=$(collection_count "$col")
  log "Collection $col: $before chunks (before)"

  # "file prefix and source_id|metadata key for the title|attribution|source|label prefix"
  # EDPB guidelines are processed first, EDPS guidance second.
  local -a publishers=(
    "edpb|guideline_name|European Data Protection Board (EDPB)|edpb.europa.eu|EDPB"
    "edps|guidance_name|European Data Protection Supervisor (EDPS)|edps.europa.eu|EDPS"
  )
  local spec prefix title_key attribution site tag stem title
  for spec in "${publishers[@]}"; do
    IFS='|' read -r prefix title_key attribution site tag <<< "$spec"
    for pdf in "$WORK_DIR"/pdfs/"$prefix"_*.pdf; do
      # An unmatched glob stays literal; skip anything that is not a file.
      [[ -f "$pdf" ]] || continue
      stem=$(basename "$pdf" .pdf)
      title=${stem#${prefix}_}
      title=${title//_/ }
      upload_file "$pdf" "$col" "compliance_datenschutz" "guidance" "2024" \
        "{\"source_id\":\"$prefix\",\"doc_type\":\"guidance\",\"$title_key\":\"$title\",\"license\":\"reuse_notice\",\"attribution\":\"$attribution\",\"source\":\"$site\"}" \
        "$tag: $title"
    done
  done

  after=$(collection_count "$col")
  log "Collection $col: $before → $after chunks"
}
|
|
|
|
# =============================================================================
|
|
# PHASE H: Layer 1 Safe Core — Verbraucherschutz, EU-Recht, NIST
|
|
# =============================================================================
|
|
# ~60 Dokumente: EUR-Lex (CC BY 4.0), gesetze-im-internet.de (Public Domain),
|
|
# NIST (Public Domain), HLEG (CC BY 4.0)
|
|
# =============================================================================
|
|
# Print the chunk-count delta "after - before" used in the summary log lines.
# Prints "?" unless both arguments are plain non-negative integers, so a
# non-numeric count (e.g. "?") never reaches arithmetic expansion.
# Arguments: $1 - count before, $2 - count after
_vs_chunk_delta() {
    local before=$1 after=$2
    if [[ "$before" =~ ^[0-9]+$ && "$after" =~ ^[0-9]+$ ]]; then
        # 10# forces base 10 so a leading zero is not read as octal.
        printf '%s\n' "$(( 10#$after - 10#$before ))"
    else
        printf '?\n'
    fi
}

# Download a law's XML archive, unpack it and print the first *.xml path.
# Arguments: $1 - archive URL, $2 - local zip path, $3 - extraction directory
# Outputs:   path of the first *.xml file on stdout
# Returns:   0 on success; 1 if no usable archive is present (missing or
#            <= 1000 bytes); 2 if the extraction directory holds no *.xml
_vs_fetch_law_xml() {
    local url=$1 zip_path=$2 extract_dir=$3
    local zip_size=0 xml_file=""

    # Best effort: curl's status is ignored on purpose. A bare failing curl
    # would abort the script under `set -e`; the size check below reports a
    # failed download instead.
    curl -sL "$url" -o "$zip_path" 2>/dev/null || true

    [[ -f "$zip_path" ]] || return 1
    zip_size=$(wc -c < "$zip_path") || return 1
    (( zip_size > 1000 )) || return 1

    mkdir -p "$extract_dir"
    # stdout is silenced because this function's stdout carries the result.
    unzip -qo "$zip_path" -d "$extract_dir" >/dev/null 2>&1 || true

    # `|| true`: with pipefail, find may be killed by SIGPIPE once head exits.
    xml_file=$(find "$extract_dir" -name "*.xml" 2>/dev/null | head -n 1) || true
    [[ -n "$xml_file" ]] || return 2
    printf '%s\n' "$xml_file"
}

# Extract the text of all <norm> elements whose <enbez> "§ N" number lies in
# [lo, hi] (letter suffixes such as "§ 312a" count as 312) into a text file.
# A <norm> without <enbez> text inherits the capture state of the preceding
# norm; an <enbez> without a "§ N" number switches capturing off.
# Arguments: $1 - XML file, $2 - output text file, $3 - heading line,
#            $4 - first paragraph number, $5 - last paragraph number
# Returns:   python3's exit status
_vs_extract_bgb_range() {
    local xml_file=$1 out_file=$2 heading=$3 lo=$4 hi=$5
    # Paths and parameters travel as argv instead of being pasted into the
    # Python source, so quotes in a path cannot break the script.
    python3 - "$xml_file" "$out_file" "$heading" "$lo" "$hi" <<'PY' 2>/dev/null
import re
import sys
import xml.etree.ElementTree as ET

xml_path, out_path, heading = sys.argv[1:4]
lo, hi = int(sys.argv[4]), int(sys.argv[5])

root = ET.parse(xml_path).getroot()
# Namespace prefix "{uri}" of the root element, or "" when there is none.
ns = root.tag.split('}')[0] + '}' if '}' in root.tag else ''

text_parts = []
capture = False
for norm in root.iter():
    if not norm.tag.endswith('norm'):
        continue
    enbez = norm.find('.//' + ns + 'enbez')
    if enbez is not None and enbez.text:
        # \u00a7 is the section sign; the escape keeps this script ASCII-only.
        num = re.search(r'\u00a7\s*(\d+)', enbez.text)
        capture = bool(num) and lo <= int(num.group(1)) <= hi
    if capture:
        text_parts.extend(t.strip() for t in norm.itertext())

# Explicit UTF-8: the text contains non-ASCII characters. surrogateescape
# round-trips heading bytes if argv was decoded under a non-UTF-8 locale.
with open(out_path, 'w', encoding='utf-8', errors='surrogateescape') as f:
    f.write(heading + '\n\n' + '\n'.join(p for p in text_parts if p))
PY
}

# Extract every <norm> of the EGBGB XML that mentions "246", "Anlage",
# "Widerruf" or "Muster" in any of its text fragments into a text file.
# Arguments: $1 - XML file, $2 - output text file
# Returns:   python3's exit status
_vs_extract_egbgb() {
    local xml_file=$1 out_file=$2
    python3 - "$xml_file" "$out_file" <<'PY' 2>/dev/null
import sys
import xml.etree.ElementTree as ET

xml_path, out_path = sys.argv[1:3]
keywords = ('246', 'Anlage', 'Widerruf', 'Muster')

text_parts = []
for norm in ET.parse(xml_path).getroot().iter():
    if norm.tag.endswith('norm'):
        parts = [t.strip() for t in norm.itertext() if t.strip()]
        if any(k in p for p in parts for k in keywords):
            text_parts.extend(parts)

with open(out_path, 'w', encoding='utf-8') as f:
    f.write('EGBGB - Informationspflichten und Muster-Widerrufsbelehrung\n\n' + '\n'.join(text_parts))
PY
}

# Phase H: download and upload German statutes (PDF), five BGB excerpts and an
# EGBGB excerpt (extracted from XML archives), EU legislation from EUR-Lex,
# and the NIST / HLEG guidance documents. Logs chunk counts per collection.
# Globals:   WORK_DIR (read)
# Arguments: none
phase_verbraucherschutz() {
    log "=========================================="
    log "PHASE H: Layer 1 Safe Core (~60 Dokumente)"
    log "=========================================="

    mkdir -p "$WORK_DIR"/{pdfs,texts}

    local col before after diff
    local entry pdf_file txt_file fetch_rc
    local path short fullname category
    local part stem lo hi reg_id name_de label
    local celex filename name_en year
    local bgb_xmlfile egbgb_xmlfile

    # =========================================================================
    # H1: German statutes → bp_compliance_gesetze
    # Source: gesetze-im-internet.de (public domain, § 5 UrhG)
    # =========================================================================
    col="bp_compliance_gesetze"
    before=$(collection_count "$col")
    log "--- H1: Deutsche Gesetze → $col ($before chunks) ---"

    # Entry format: url_path:short:full_name:category
    local -a de_gesetze=(
        # Consumer protection
        "pangv_2022/PAngV:PAngV:Preisangabenverordnung:verbraucherschutz"
        "vsbg/VSBG:VSBG:Verbraucherstreitbeilegungsgesetz:verbraucherschutz"
        "prodhaftg/ProdHaftG:ProdHaftG:Produkthaftungsgesetz:verbraucherschutz"
        "verpackg/VerpackG:VerpackG:Verpackungsgesetz:verbraucherschutz"
        "elektrog_2015/ElektroG:ElektroG:Elektro- und Elektronikgeraetegesetz:verbraucherschutz"
        "battdg/BattDG:BattDG:Batteriegesetz:verbraucherschutz"
        "bfsg/BFSG:BFSG:Barrierefreiheitsstaerkungsgesetz:verbraucherschutz"
        "uwg_2004/UWG:UWG:Gesetz gegen den unlauteren Wettbewerb:verbraucherschutz"
        # Data protection + IT
        "bdsg_2018/BDSG:BDSG:Bundesdatenschutzgesetz:datenschutz"
        "ddg/DDG:DDG:Digitale-Dienste-Gesetz:ecommerce"
        "tkg_2021/TKG:TKG:Telekommunikationsgesetz:datenschutz"
        # Commercial / tax law (retention periods)
        "hgb/HGB:HGB:Handelsgesetzbuch:aufbewahrung"
        "ao_1977/AO:AO:Abgabenordnung:aufbewahrung"
        # Trade law
        "gewo/GewO:GewO:Gewerbeordnung:gewerberecht"
    )

    for entry in "${de_gesetze[@]}"; do
        IFS=':' read -r path short fullname category <<< "$entry"
        pdf_file="$WORK_DIR/pdfs/${short}.pdf"

        download_pdf \
            "https://www.gesetze-im-internet.de/${path}.pdf" \
            "$pdf_file"

        if [[ -f "$pdf_file" ]]; then
            upload_file "$pdf_file" "$col" "compliance" "legal_reference" "2025" \
                "{\"regulation_id\":\"${short,,}\",\"regulation_name_de\":\"$fullname ($short)\",\"category\":\"$category\",\"license\":\"public_domain_§5_UrhG\",\"source\":\"gesetze-im-internet.de\"}" \
                "$short ($fullname)"
        fi
    done

    # BGB is uploaded as excerpts extracted from the XML download instead of
    # the complete PDF.
    # Row format: file stem|first §|last §|regulation_id|name_de|category|label
    # The label doubles as the heading line of the generated text file.
    local -a bgb_parts=(
        "BGB_AGB_305_310|305|310|bgb_agb|BGB AGB-Recht (§§ 305-310)|vertragsrecht|BGB AGB-Recht §§ 305-310"
        "BGB_Fernabsatz_312|312|312|bgb_fernabsatz|BGB Fernabsatzrecht (§§ 312-312k)|verbraucherschutz|BGB Fernabsatzrecht §§ 312-312k"
        "BGB_Kaufrecht_433_480|433|480|bgb_kaufrecht|BGB Kaufrecht + Gewaehrleistung (§§ 433-480)|verbraucherschutz|BGB Kaufrecht §§ 433-480"
        "BGB_Widerruf_355_361|355|361|bgb_widerruf|BGB Widerrufsrecht (§§ 355-361)|verbraucherschutz|BGB Widerrufsrecht §§ 355-361"
        "BGB_Digital_327|327|327|bgb_digital|BGB Digitale Produkte (§§ 327-327u)|verbraucherschutz|BGB Digitale Produkte §§ 327-327u"
    )

    fetch_rc=0
    bgb_xmlfile=$(_vs_fetch_law_xml \
        "https://www.gesetze-im-internet.de/bgb/xml.zip" \
        "$WORK_DIR/pdfs/bgb_xml.zip" \
        "$WORK_DIR/pdfs/bgb_xml") || fetch_rc=$?

    case "$fetch_rc" in
        0)
            for part in "${bgb_parts[@]}"; do
                IFS='|' read -r stem lo hi reg_id name_de category label <<< "$part"
                txt_file="$WORK_DIR/pdfs/${stem}.txt"

                if ! _vs_extract_bgb_range "$bgb_xmlfile" "$txt_file" "$label" "$lo" "$hi"; then
                    # Do not upload a stale or partial file from an earlier run.
                    warn "BGB extraction failed: $label"
                    continue
                fi

                # Only upload when the excerpt holds more than the heading.
                if [[ -f "$txt_file" ]] && (( $(wc -c < "$txt_file") > 100 )); then
                    upload_file "$txt_file" "$col" "compliance" "legal_reference" "2025" \
                        "{\"regulation_id\":\"$reg_id\",\"regulation_name_de\":\"$name_de\",\"category\":\"$category\",\"license\":\"public_domain_§5_UrhG\",\"source\":\"gesetze-im-internet.de\"}" \
                        "$label"
                fi
            done
            ;;
        2) warn "BGB XML file not found in archive" ;;
        *) warn "BGB XML download failed" ;;
    esac

    # EGBGB: taken from the XML download rather than the PDF.
    fetch_rc=0
    egbgb_xmlfile=$(_vs_fetch_law_xml \
        "https://www.gesetze-im-internet.de/bgbeg/xml.zip" \
        "$WORK_DIR/pdfs/bgbeg_xml.zip" \
        "$WORK_DIR/pdfs/egbgb_xml") || fetch_rc=$?

    case "$fetch_rc" in
        0)
            # Art. 246a EGBGB (distance-selling information duties) plus
            # Anlage 1+2 (model withdrawal instructions).
            txt_file="$WORK_DIR/pdfs/EGBGB_Widerruf.txt"
            if ! _vs_extract_egbgb "$egbgb_xmlfile" "$txt_file"; then
                warn "EGBGB extraction failed"
            elif [[ -f "$txt_file" ]] && (( $(wc -c < "$txt_file") > 100 )); then
                upload_file "$txt_file" "$col" "compliance" "legal_reference" "2025" \
                    '{"regulation_id":"egbgb","regulation_name_de":"EGBGB (Muster-Widerrufsbelehrung, Art. 246a + Anlage 1+2)","category":"verbraucherschutz","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \
                    "EGBGB Muster-Widerrufsbelehrung"
            fi
            ;;
        2) warn "EGBGB XML file not found in archive" ;;
        *) warn "EGBGB XML download failed" ;;
    esac

    after=$(collection_count "$col")
    diff=$(_vs_chunk_delta "$before" "$after")
    log "Collection $col: ${before:-?} → ${after:-?} chunks (+${diff})"

    # =========================================================================
    # H2: EU law → bp_compliance_ce
    # Source: EUR-Lex (CC BY 4.0, reuse permitted)
    # URL pattern: /legal-content/DE/TXT/PDF/?uri=CELEX:{id}
    # =========================================================================
    col="bp_compliance_ce"
    before=$(collection_count "$col")
    log "--- H2: EU-Recht → $col ($before chunks) ---"

    # Entry format: CELEX_ID:filename:short:name_de:name_en:category:year
    local -a eu_gesetze=(
        # --- Data protection ---
        "32016R0679:DSGVO_2016_679:DSGVO:Datenschutz-Grundverordnung:General Data Protection Regulation:datenschutz:2016"
        # --- Consumer protection (core set) ---
        "32011L0083:Consumer_Rights_2011_83:CRD:Verbraucherrechte-Richtlinie:Consumer Rights Directive:verbraucherschutz:2011"
        "32019L0770:Digital_Content_2019_770:DCD:Richtlinie digitale Inhalte:Digital Content Directive:verbraucherschutz:2019"
        "32019L0771:Sale_of_Goods_2019_771:SGD:Warenkauf-Richtlinie:Sale of Goods Directive:verbraucherschutz:2019"
        "32000L0031:ECommerce_2000_31:ECD:E-Commerce-Richtlinie:E-Commerce Directive:ecommerce:2000"
        "31993L0013:Unfair_Terms_93_13:UCTD:Klausel-Richtlinie:Unfair Contract Terms Directive:verbraucherschutz:1993"
        "32005L0029:Unfair_Practices_2005_29:UCPD:Richtlinie unlautere Geschaeftspraktiken:Unfair Commercial Practices Directive:verbraucherschutz:2005"
        "31998L0006:Price_Indication_98_6:PID:Preisangaben-Richtlinie:Price Indication Directive:verbraucherschutz:1998"
        "32019L2161:Omnibus_2019_2161:OMN:Omnibus-Richtlinie (Modernisierung Verbraucherschutz):Omnibus Directive:verbraucherschutz:2019"
        # --- Platform regulation ---
        "32022R1925:DMA_2022_1925:DMA:Digital Markets Act:Digital Markets Act:plattformregulierung:2022"
        # --- AI + security ---
        "32024R1689:AI_Act_2024_1689:AI_Act:KI-Verordnung:Artificial Intelligence Act:ki_regulierung:2024"
        "32022L2555:NIS2_2022_2555:NIS2:NIS-2-Richtlinie:NIS2 Directive:it_sicherheit:2022"
        # --- Product safety + liability ---
        "32023R0988:GPSR_2023_988:GPSR:Allgemeine Produktsicherheitsverordnung:General Product Safety Regulation:produktsicherheit:2023"
        "31985L0374:Product_Liability_85_374:PLD:Produkthaftungs-Richtlinie:Product Liability Directive:produkthaftung:1985"
        "32023R1542:Batterie_VO_2023_1542:BattVO:Batterieverordnung:Battery Regulation:produktsicherheit:2023"
        # --- Data transfer ---
        # SCC are already covered in phase C; do not duplicate them here.
    )

    for entry in "${eu_gesetze[@]}"; do
        IFS=':' read -r celex filename short name_de name_en category year <<< "$entry"
        pdf_file="$WORK_DIR/pdfs/${filename}.pdf"

        # The AI Act is fetched via its OJ identifier instead of the CELEX id.
        if [[ "$celex" == "32024R1689" ]]; then
            download_pdf \
                "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=OJ:L_202401689" \
                "$pdf_file"
        else
            download_pdf \
                "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:${celex}" \
                "$pdf_file"
        fi

        if [[ -f "$pdf_file" ]]; then
            upload_file "$pdf_file" "$col" "compliance_ce" "legal_reference" "$year" \
                "{\"regulation_id\":\"${short,,}\",\"regulation_name_de\":\"$name_de\",\"regulation_name_en\":\"$name_en\",\"regulation_short\":\"$short\",\"celex\":\"$celex\",\"category\":\"$category\",\"license\":\"CC_BY_4.0\",\"source\":\"eur-lex\"}" \
                "$short — $name_de"
        fi
    done

    after=$(collection_count "$col")
    diff=$(_vs_chunk_delta "$before" "$after")
    log "Collection $col: ${before:-?} → ${after:-?} chunks (+${diff})"

    # =========================================================================
    # H3: NIST frameworks + HLEG ethics guidelines → bp_compliance_datenschutz
    # Sources: nist.gov (public domain, US government work), op.europa.eu
    # =========================================================================
    col="bp_compliance_datenschutz"
    before=$(collection_count "$col")
    log "--- H3: NIST + Ethics → $col ($before chunks) ---"

    # NIST Cybersecurity Framework 2.0
    pdf_file="$WORK_DIR/pdfs/NIST_CSF_2.0.pdf"
    download_pdf \
        "https://nvlpubs.nist.gov/nistpubs/CSWP/NIST.CSWP.29.pdf" \
        "$pdf_file"
    if [[ -f "$pdf_file" ]]; then
        upload_file "$pdf_file" "$col" "compliance_datenschutz" "guidance" "2024" \
            '{"source_id":"nist","doc_type":"framework","guideline_name":"NIST Cybersecurity Framework 2.0","license":"public_domain_us_gov","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \
            "NIST Cybersecurity Framework 2.0"
    fi

    # NIST Privacy Framework 1.0
    pdf_file="$WORK_DIR/pdfs/NIST_Privacy_Framework.pdf"
    download_pdf \
        "https://nvlpubs.nist.gov/nistpubs/CSWP/NIST.CSWP.01162020.pdf" \
        "$pdf_file"
    if [[ -f "$pdf_file" ]]; then
        upload_file "$pdf_file" "$col" "compliance_datenschutz" "guidance" "2020" \
            '{"source_id":"nist","doc_type":"framework","guideline_name":"NIST Privacy Framework 1.0","license":"public_domain_us_gov","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \
            "NIST Privacy Framework 1.0"
    fi

    # HLEG Ethics Guidelines for Trustworthy AI
    pdf_file="$WORK_DIR/pdfs/hleg_trustworthy_ai.pdf"
    download_pdf \
        "https://op.europa.eu/en/publication-detail/-/publication/d3988569-0434-11ea-8c1f-01aa75ed71a1/language-en/format-PDF" \
        "$pdf_file"
    if [[ -f "$pdf_file" ]]; then
        upload_file "$pdf_file" "$col" "compliance_datenschutz" "guidance" "2019" \
            '{"source_id":"hleg","doc_type":"ethics_guidelines","guideline_name":"Ethics Guidelines for Trustworthy AI","license":"CC_BY_4.0","attribution":"High-Level Expert Group on AI (HLEG)","source":"op.europa.eu"}' \
            "HLEG Ethics Guidelines Trustworthy AI"
    fi

    after=$(collection_count "$col")
    diff=$(_vs_chunk_delta "$before" "$after")
    log "Collection $col: ${before:-?} → ${after:-?} chunks (+${diff})"

    # =========================================================================
    # Summary
    # =========================================================================
    echo ""
    log "Phase H abgeschlossen."
    log "Naechste Schritte (TODO — separate Phasen):"
    log " Layer 2: Nationale Gesetze EU/EWR (FR, ES, IT, AT, NL, UK) — Portal-Recherche noetig"
    log " Layer 3: DPA Guidance (CNIL, AEPD, Garante, AP, IMY) — Einzel-URLs recherchieren"
    log " Layer 4: OWASP Top 10, offene Security-Frameworks"
    log " Layer 5: EuGH + BGH Leitentscheidungen"
}
|
|
|
|
# =============================================================================
|
|
# PHASE F: Verifizierung
|
|
# =============================================================================
|
|
# Run one test search against the RAG search endpoint and print a short
# summary (hit count plus score, regulation code and first 80 content
# characters of up to three hits).
# Globals:   CURL_OPTS (read)
# Arguments: $1 - description for the log line, $2 - search URL,
#            $3 - JSON request body
_verify_search() {
    local description=$1 search_url=$2 payload=$3

    log "Suche: $description"
    # CURL_OPTS is a whitespace-separated option string and must word-split.
    # shellcheck disable=SC2086
    curl $CURL_OPTS -X POST "$search_url" \
        -H 'Content-Type: application/json' \
        -d "$payload" 2>/dev/null \
        | python3 -c "
import sys,json
try:
    data = json.load(sys.stdin)
    results = data.get('results', [])
    print(f' Treffer: {len(results)}')
    for r in results[:3]:
        print(f' [{r.get(\"score\",0):.3f}] {r.get(\"regulation_code\",\"?\")} - {r.get(\"content\",\"\")[:80]}...')
except: print(' (parse error)')
" 2>/dev/null || echo " (search failed)"
}

# Phase F: print the chunk count of every corpus collection and run a fixed
# set of smoke-test searches.
# Globals:   RAG_URL (read), CURL_OPTS (read)
# Arguments: none
phase_verify() {
    log "=========================================="
    log "PHASE F: Verifizierung"
    log "=========================================="

    # Search on the same API that RAG_URL points at, so overriding RAG_URL
    # also redirects the verification. Falls back to the default endpoint when
    # RAG_URL does not end in /documents/upload.
    local search_url="https://localhost:8097/api/v1/search"
    if [[ "${RAG_URL:-}" == */documents/upload ]]; then
        search_url="${RAG_URL%/documents/upload}/search"
    fi

    echo ""
    echo "=== Collection Stats ==="
    local col count
    for col in bp_compliance_gesetze bp_compliance_ce bp_legal_templates bp_compliance_datenschutz; do
        count=$(collection_count "$col")
        printf " %-30s %s chunks\n" "$col" "$count"
    done

    echo ""
    echo "=== Test-Suchen ==="

    _verify_search "'Impressumspflicht digitale Dienste' in bp_compliance_gesetze" "$search_url" \
        '{"query":"Impressumspflicht digitale Dienste","regulation_codes":null,"limit":3,"min_score":0.5}'

    _verify_search "'Cookie Einwilligung' in bp_compliance_ce" "$search_url" \
        '{"query":"Cookie Einwilligung ePrivacy","regulation_codes":null,"limit":3,"min_score":0.5}'

    _verify_search "'Widerrufsbelehrung Fernabsatz' in bp_compliance_gesetze" "$search_url" \
        '{"query":"Widerrufsbelehrung Fernabsatz Widerrufsfrist","regulation_codes":null,"limit":3,"min_score":0.5}'

    _verify_search "'AI Act Hochrisiko Konformitaet' in bp_compliance_ce" "$search_url" \
        '{"query":"AI Act Hochrisiko Konformitaetsbewertung","regulation_codes":null,"limit":3,"min_score":0.5}'

    _verify_search "'Privacy Policy Template GDPR' in bp_legal_templates" "$search_url" \
        '{"query":"Privacy Policy Template GDPR","regulation_codes":null,"limit":3,"min_score":0.5}'

    echo ""
}
|
|
|
|
# =============================================================================
|
|
# PHASE G: Corpus Version Registration
|
|
# =============================================================================
|
|
# Phase G: insert one row per non-empty collection into
# compliance_corpus_versions, versioned as <YYYY-MM-DD>.<n> where n is the
# number of versions already recorded for that collection today plus one.
# Globals:   DB_URL, QDRANT_URL, UPLOADED (all read)
# Arguments: none
phase_register_version() {
    log "=========================================="
    log "PHASE G: Corpus Version Registration"
    log "=========================================="

    local today
    today=$(date '+%Y-%m-%d')

    local col count existing_count next_seq version regs digest
    for col in bp_compliance_gesetze bp_compliance_ce bp_legal_templates bp_compliance_datenschutz; do
        count=$(collection_count "$col")

        # Skip unknown ("?"), empty, otherwise non-numeric and zero counts;
        # only a positive integer may be interpolated into the INSERT below.
        if [[ ! "$count" =~ ^[0-9]+$ ]] || (( 10#$count == 0 )); then
            warn "Skipping version for $col (count=$count)"
            continue
        fi

        # Determine the next version number for today.
        existing_count=$(psql "$DB_URL" -tAc \
            "SELECT COUNT(*) FROM compliance_corpus_versions WHERE collection_name='$col' AND version LIKE '${today}.%'" \
            2>/dev/null || echo "0")
        # Guard the arithmetic against empty or unexpected psql output.
        [[ "$existing_count" =~ ^[0-9]+$ ]] || existing_count=0
        next_seq=$(( 10#$existing_count + 1 ))
        version="${today}.${next_seq}"

        # Regulations list (PostgreSQL array literal) for the collection.
        regs=""
        case "$col" in
            bp_compliance_ce)
                regs='{eu_2022_2065,eu_2002_58,eu_2021_914}'
                ;;
            bp_compliance_gesetze)
                regs='{ddg_5,tdddg_25,urhg_5,egbgb_widerruf,bgb_komplett,urhg_komplett,tmg_komplett}'
                ;;
            bp_legal_templates)
                regs='{github_site_policy,opengov_site_policy,cc_legal_tools,common_paper,webflorist,tempest,cookieconsent}'
                ;;
            bp_compliance_datenschutz)
                regs='{edpb_consent,edpb_privacy_by_design,edpb_dark_patterns,edpb_social_media,edpb_cookie_banner,edps_generative_ai,edps_digital_ethics}'
                ;;
        esac

        # Digest: first 32 hex chars of the SHA-256 over the "result" object
        # of Qdrant's collection info; empty string if that cannot be read.
        digest=$(curl -s "${QDRANT_URL}/collections/${col}" 2>/dev/null \
            | python3 -c "import sys,json,hashlib; d=json.load(sys.stdin); print(hashlib.sha256(json.dumps(d.get('result',{}), sort_keys=True).encode()).hexdigest()[:32])" \
            2>/dev/null || echo "")

        log "Registering version $version for $col ($count chunks)"

        # NOTE(review): documents_count is filled from the script-wide UPLOADED
        # counter, so every collection gets the same value — confirm intended.
        # if/else instead of `cmd && ok || warn`, so a failing ok() can never
        # trigger the failure warning as well.
        if psql "$DB_URL" -c "
            INSERT INTO compliance_corpus_versions
            (version, collection_name, documents_count, chunks_count, regulations, digest, ingestion_source, created_by)
            VALUES
            ('${version}', '${col}', ${UPLOADED}, ${count}, '${regs}', '${digest}', 'ingest-legal-corpus.sh', 'system')
        " 2>/dev/null; then
            ok "Version $version registered for $col"
        else
            warn "Version registration failed for $col (DB not available?)"
        fi
    done
}
|
|
|
|
# =============================================================================
|
|
# MAIN
|
|
# =============================================================================
|
|
# Entry point: check that the RAG API and Qdrant respond, run either the
# single phase selected via ONLY_PHASE or the full phase sequence, then print
# the upload counters.
# Globals:   WORK_DIR, RAG_URL, QDRANT_URL, CURL_OPTS, ONLY_PHASE,
#            SKIP_DOWNLOAD, UPLOADED, FAILED, SKIPPED (all read)
# Arguments: none are read
# Exits:     1 if a service does not respond, ONLY_PHASE is unknown, or
#            FAILED is greater than zero
main() {
    log "=========================================="
    log "BreakPilot Legal Corpus Ingestion"
    log "=========================================="
    log "Work dir: $WORK_DIR"
    log "RAG API: $RAG_URL"
    log "Qdrant: $QDRANT_URL"
    echo ""

    # Check RAG API is reachable: an argument-less POST is expected to answer
    # with a message mentioning "file". The response is captured first rather
    # than piped into `grep -q`; under pipefail an early-exiting grep can make
    # curl fail with SIGPIPE and flag a healthy API as unreachable.
    local probe
    # CURL_OPTS is a whitespace-separated option string and must word-split.
    # shellcheck disable=SC2086
    probe=$(curl $CURL_OPTS "$RAG_URL" -X POST 2>/dev/null) || probe=""
    if [[ "$probe" != *file* ]]; then
        fail "RAG API not reachable at $RAG_URL"
        exit 1
    fi
    ok "RAG API reachable"

    # Check Qdrant
    if ! curl -s "$QDRANT_URL/collections" >/dev/null 2>&1; then
        fail "Qdrant not reachable at $QDRANT_URL"
        exit 1
    fi
    ok "Qdrant reachable"
    echo ""

    # Run phases
    if [[ -n "$ONLY_PHASE" ]]; then
        case "$ONLY_PHASE" in
            download) phase_download ;;
            gesetze) phase_gesetze ;;
            eu) phase_eu ;;
            templates) phase_templates ;;
            datenschutz) phase_datenschutz ;;
            verbraucherschutz) phase_verbraucherschutz ;;
            verify) phase_verify ;;
            version) phase_register_version ;;
            *)
                fail "Unknown phase: $ONLY_PHASE"
                exit 1
                ;;
        esac
    else
        if [[ "$SKIP_DOWNLOAD" != "true" ]]; then
            phase_download
        else
            log "Skipping download phase (--skip-download)"
        fi
        echo ""
        phase_gesetze
        echo ""
        phase_eu
        echo ""
        phase_templates
        echo ""
        phase_datenschutz
        echo ""
        phase_verbraucherschutz
        echo ""
        phase_verify
        echo ""
        phase_register_version
    fi

    # Summary
    echo ""
    log "=========================================="
    log "ERGEBNIS"
    log "=========================================="
    log "Uploaded: $UPLOADED"
    log "Failed: $FAILED"
    log "Skipped: $SKIPPED"
    log "=========================================="

    if (( FAILED > 0 )); then
        warn "$FAILED uploads fehlgeschlagen!"
        exit 1
    fi

    ok "Ingestion abgeschlossen!"
}
|
|
|
|
# Script entry point: invoke main with the script's positional arguments.
main "$@"
|