From c88653b22145093861baff4859825839e8bafeef Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 12 Mar 2026 09:39:09 +0100 Subject: [PATCH] fix(rag): Dedup check, BGB split, GewO timeout, arithmetic fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add Qdrant dedup check in upload_file() — skip if regulation_id already exists - Split BGB (2.7MB) into 5 targeted parts via XML extraction: AGB §§305-310, Fernabsatz §§312-312k, Kaufrecht §§433-480, Widerruf §§355-361, Digitale Produkte §§327-327u - Lower large-file threshold 512KB→384KB (fixes GewO 432KB timeout) - Fix arithmetic syntax error when collection_count returns "?" - Replace EGBGB PDF (was empty) with XML extraction - Add unzip to Alpine container for XML archives Co-Authored-By: Claude Opus 4.6 --- .gitea/workflows/rag-ingest.yaml | 2 +- scripts/ingest-legal-corpus.sh | 239 ++++++++++++++++++++++++++++--- 2 files changed, 220 insertions(+), 21 deletions(-) diff --git a/.gitea/workflows/rag-ingest.yaml b/.gitea/workflows/rag-ingest.yaml index c7d50c5..b7c4769 100644 --- a/.gitea/workflows/rag-ingest.yaml +++ b/.gitea/workflows/rag-ingest.yaml @@ -78,7 +78,7 @@ jobs: -e "SDK_URL=http://bp-compliance-ai-sdk:8090" \ alpine:3.19 \ sh -c " - apk add --no-cache curl bash coreutils git python3 > /dev/null 2>&1 + apk add --no-cache curl bash coreutils git python3 unzip > /dev/null 2>&1 mkdir -p /tmp/rag-ingestion/{pdfs,repos,texts} cd /workspace if [ '${PHASE}' = 'all' ]; then diff --git a/scripts/ingest-legal-corpus.sh b/scripts/ingest-legal-corpus.sh index 7dc5ec5..7a9e26b 100755 --- a/scripts/ingest-legal-corpus.sh +++ b/scripts/ingest-legal-corpus.sh @@ -64,6 +64,22 @@ upload_file() { return 0 # Don't abort script fi + # Dedup-Check: Prüfe ob regulation_id bereits in Qdrant vorhanden ist + local reg_id + reg_id=$(echo "$metadata_json" | python3 -c "import sys,json; print(json.load(sys.stdin).get('regulation_id',''))" 2>/dev/null || echo "") + if [[ -n "$reg_id" && -n "${QDRANT_URL:-}" ]]; then + local existing + existing=$(curl -sk --max-time 5 -X POST "${QDRANT_URL}/collections/${collection}/points/scroll" \ + -H "Content-Type: application/json" \ + -d "{\"filter\":{\"must\":[{\"key\":\"regulation_id\",\"match\":{\"value\":\"$reg_id\"}}]},\"limit\":1}" \ + 2>/dev/null | python3 -c "import sys,json; r=json.load(sys.stdin).get('result',{}); print(len(r.get('points',[])))" 2>/dev/null || echo "0") + if [[ "$existing" -gt 0 ]] 2>/dev/null; then + log "⏭ Skip (already in Qdrant): $label [regulation_id=$reg_id]" + SKIPPED=$((SKIPPED + 1)) + return 0 + fi + fi + local filesize filesize=$(stat -f%z "$file" 2>/dev/null || stat -c%s "$file" 2>/dev/null || echo 0) if [[ "$filesize" -lt 100 ]]; then @@ -76,7 +92,7 @@ upload_file() { # Use longer timeout for large files (>500KB) local curl_opts="$CURL_OPTS" - if [[ "$filesize" -gt 512000 ]]; then + if [[ "$filesize" -gt 384000 ]]; then curl_opts="$CURL_OPTS_LARGE" log " (large file, using extended timeout)" fi @@ -833,29 +849,210 @@ phase_verbraucherschutz() { fi done - # BGB komplett (Fernabsatz, Digitale Inhalte, Kaufrecht, AGB-Recht) - download_pdf \ - "https://www.gesetze-im-internet.de/bgb/BGB.pdf" \ - "$WORK_DIR/pdfs/BGB_full.pdf" - if [[ -f "$WORK_DIR/pdfs/BGB_full.pdf" ]]; then - upload_file "$WORK_DIR/pdfs/BGB_full.pdf" "$col" "compliance" "legal_reference" "2025" \ - '{"regulation_id":"bgb_komplett","regulation_name_de":"BGB (komplett: AGB-Recht, Fernabsatz, Digitale Inhalte, Kaufrecht)","category":"vertragsrecht","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \ - "BGB komplett" + # BGB in Teilen statt komplett (2.7MB PDF ist zu gross fuer CPU-Embeddings) + # gesetze-im-internet.de bietet XML-Download pro Gesetz + local bgb_xml="$WORK_DIR/pdfs/bgb_xml.zip" + curl -sL "https://www.gesetze-im-internet.de/bgb/xml.zip" -o "$bgb_xml" 2>/dev/null + if [[ -f "$bgb_xml" && $(stat -f%z "$bgb_xml" 2>/dev/null || stat -c%s "$bgb_xml" 2>/dev/null || echo 0) -gt 1000 ]]; then + local bgb_extract="$WORK_DIR/pdfs/bgb_xml" + mkdir -p "$bgb_extract" + unzip -qo "$bgb_xml" -d "$bgb_extract" 2>/dev/null || true + + # Relevante BGB-Abschnitte als Text extrahieren und einzeln uploaden + # Die XML-Datei hat Elemente mit § 305 + local bgb_xmlfile + bgb_xmlfile=$(find "$bgb_extract" -name "*.xml" | head -1) + if [[ -n "$bgb_xmlfile" ]]; then + # BGB Teil 1: AGB-Recht §§ 305-310 + python3 -c " +import xml.etree.ElementTree as ET, sys, re +tree = ET.parse('$bgb_xmlfile') +root = tree.getroot() +ns = {'': root.tag.split('}')[0].lstrip('{') if '}' in root.tag else ''} +text_parts = [] +capture = False +for norm in root.iter(): + if norm.tag.endswith('norm'): + enbez = norm.find('.//' + ('{' + ns[''] + '}' if ns[''] else '') + 'enbez') + if enbez is not None and enbez.text: + num = re.search(r'§\s*(\d+)', enbez.text) + if num: + n = int(num.group(1)) + capture = 305 <= n <= 310 + else: + capture = False + if capture: + for t in norm.itertext(): + text_parts.append(t.strip()) +with open('$WORK_DIR/pdfs/BGB_AGB_305_310.txt', 'w') as f: + f.write('BGB AGB-Recht §§ 305-310\n\n' + '\n'.join(p for p in text_parts if p)) +" 2>/dev/null + if [[ -f "$WORK_DIR/pdfs/BGB_AGB_305_310.txt" && $(wc -c < "$WORK_DIR/pdfs/BGB_AGB_305_310.txt") -gt 100 ]]; then + upload_file "$WORK_DIR/pdfs/BGB_AGB_305_310.txt" "$col" "compliance" "legal_reference" "2025" \ + '{"regulation_id":"bgb_agb","regulation_name_de":"BGB AGB-Recht (§§ 305-310)","category":"vertragsrecht","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \ + "BGB AGB-Recht §§ 305-310" + fi + + # BGB Teil 2: Fernabsatzrecht §§ 312-312k + python3 -c " +import xml.etree.ElementTree as ET, sys, re +tree = ET.parse('$bgb_xmlfile') +root = tree.getroot() +text_parts = [] +capture = False +for norm in root.iter(): + if norm.tag.endswith('norm'): + enbez = norm.find('.//' + ('{' + root.tag.split('}')[0].lstrip('{') + '}' if '}' in root.tag else '') + 'enbez') + if enbez is not None and enbez.text: + if re.search(r'§\s*312', enbez.text): + capture = True + elif re.search(r'§\s*31[3-9]|§\s*32', enbez.text): + capture = False + else: + if capture and not any(norm.itertext()): + capture = False + if capture: + for t in norm.itertext(): + text_parts.append(t.strip()) +with open('$WORK_DIR/pdfs/BGB_Fernabsatz_312.txt', 'w') as f: + f.write('BGB Fernabsatzrecht §§ 312-312k\n\n' + '\n'.join(p for p in text_parts if p)) +" 2>/dev/null + if [[ -f "$WORK_DIR/pdfs/BGB_Fernabsatz_312.txt" && $(wc -c < "$WORK_DIR/pdfs/BGB_Fernabsatz_312.txt") -gt 100 ]]; then + upload_file "$WORK_DIR/pdfs/BGB_Fernabsatz_312.txt" "$col" "compliance" "legal_reference" "2025" \ + '{"regulation_id":"bgb_fernabsatz","regulation_name_de":"BGB Fernabsatzrecht (§§ 312-312k)","category":"verbraucherschutz","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \ + "BGB Fernabsatzrecht §§ 312-312k" + fi + + # BGB Teil 3: Kaufrecht + Gewährleistung §§ 433-480 + python3 -c " +import xml.etree.ElementTree as ET, sys, re +tree = ET.parse('$bgb_xmlfile') +root = tree.getroot() +text_parts = [] +capture = False +for norm in root.iter(): + if norm.tag.endswith('norm'): + enbez = norm.find('.//' + ('{' + root.tag.split('}')[0].lstrip('{') + '}' if '}' in root.tag else '') + 'enbez') + if enbez is not None and enbez.text: + num = re.search(r'§\s*(\d+)', enbez.text) + if num: + n = int(num.group(1)) + capture = 433 <= n <= 480 + else: + capture = False + if capture: + for t in norm.itertext(): + text_parts.append(t.strip()) +with open('$WORK_DIR/pdfs/BGB_Kaufrecht_433_480.txt', 'w') as f: + f.write('BGB Kaufrecht §§ 433-480\n\n' + '\n'.join(p for p in text_parts if p)) +" 2>/dev/null + if [[ -f "$WORK_DIR/pdfs/BGB_Kaufrecht_433_480.txt" && $(wc -c < "$WORK_DIR/pdfs/BGB_Kaufrecht_433_480.txt") -gt 100 ]]; then + upload_file "$WORK_DIR/pdfs/BGB_Kaufrecht_433_480.txt" "$col" "compliance" "legal_reference" "2025" \ + '{"regulation_id":"bgb_kaufrecht","regulation_name_de":"BGB Kaufrecht + Gewaehrleistung (§§ 433-480)","category":"verbraucherschutz","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \ + "BGB Kaufrecht §§ 433-480" + fi + + # BGB Teil 4: Widerrufsrecht §§ 355-361 + python3 -c " +import xml.etree.ElementTree as ET, sys, re +tree = ET.parse('$bgb_xmlfile') +root = tree.getroot() +text_parts = [] +capture = False +for norm in root.iter(): + if norm.tag.endswith('norm'): + enbez = norm.find('.//' + ('{' + root.tag.split('}')[0].lstrip('{') + '}' if '}' in root.tag else '') + 'enbez') + if enbez is not None and enbez.text: + num = re.search(r'§\s*(\d+)', enbez.text) + if num: + n = int(num.group(1)) + capture = 355 <= n <= 361 + else: + capture = False + if capture: + for t in norm.itertext(): + text_parts.append(t.strip()) +with open('$WORK_DIR/pdfs/BGB_Widerruf_355_361.txt', 'w') as f: + f.write('BGB Widerrufsrecht §§ 355-361\n\n' + '\n'.join(p for p in text_parts if p)) +" 2>/dev/null + if [[ -f "$WORK_DIR/pdfs/BGB_Widerruf_355_361.txt" && $(wc -c < "$WORK_DIR/pdfs/BGB_Widerruf_355_361.txt") -gt 100 ]]; then + upload_file "$WORK_DIR/pdfs/BGB_Widerruf_355_361.txt" "$col" "compliance" "legal_reference" "2025" \ + '{"regulation_id":"bgb_widerruf","regulation_name_de":"BGB Widerrufsrecht (§§ 355-361)","category":"verbraucherschutz","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \ + "BGB Widerrufsrecht §§ 355-361" + fi + + # BGB Teil 5: Digitale Produkte §§ 327-327u + python3 -c " +import xml.etree.ElementTree as ET, sys, re +tree = ET.parse('$bgb_xmlfile') +root = tree.getroot() +text_parts = [] +capture = False +for norm in root.iter(): + if norm.tag.endswith('norm'): + enbez = norm.find('.//' + ('{' + root.tag.split('}')[0].lstrip('{') + '}' if '}' in root.tag else '') + 'enbez') + if enbez is not None and enbez.text: + if re.search(r'§\s*327', enbez.text): + capture = True + elif re.search(r'§\s*328', enbez.text): + capture = False + if capture: + for t in norm.itertext(): + text_parts.append(t.strip()) +with open('$WORK_DIR/pdfs/BGB_Digital_327.txt', 'w') as f: + f.write('BGB Digitale Produkte §§ 327-327u\n\n' + '\n'.join(p for p in text_parts if p)) +" 2>/dev/null + if [[ -f "$WORK_DIR/pdfs/BGB_Digital_327.txt" && $(wc -c < "$WORK_DIR/pdfs/BGB_Digital_327.txt") -gt 100 ]]; then + upload_file "$WORK_DIR/pdfs/BGB_Digital_327.txt" "$col" "compliance" "legal_reference" "2025" \ + '{"regulation_id":"bgb_digital","regulation_name_de":"BGB Digitale Produkte (§§ 327-327u)","category":"verbraucherschutz","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \ + "BGB Digitale Produkte §§ 327-327u" + fi + else + warn "BGB XML file not found in archive" + fi + else + warn "BGB XML download failed" fi - # EGBGB (Muster-Widerrufsbelehrung Anlage 1+2) - download_pdf \ - "https://www.gesetze-im-internet.de/bgbeg/BGBEG.pdf" \ - "$WORK_DIR/pdfs/BGBEG.pdf" - if [[ -f "$WORK_DIR/pdfs/BGBEG.pdf" ]]; then - upload_file "$WORK_DIR/pdfs/BGBEG.pdf" "$col" "compliance" "legal_reference" "2025" \ - '{"regulation_id":"egbgb","regulation_name_de":"EGBGB (Muster-Widerrufsbelehrung, Informationspflichten)","category":"verbraucherschutz","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \ - "EGBGB (Muster-Widerrufsbelehrung)" + # EGBGB — XML statt PDF (BGBEG.pdf war leer) + local egbgb_xml="$WORK_DIR/pdfs/bgbeg_xml.zip" + curl -sL "https://www.gesetze-im-internet.de/bgbeg/xml.zip" -o "$egbgb_xml" 2>/dev/null + if [[ -f "$egbgb_xml" && $(stat -f%z "$egbgb_xml" 2>/dev/null || stat -c%s "$egbgb_xml" 2>/dev/null || echo 0) -gt 1000 ]]; then + local egbgb_extract="$WORK_DIR/pdfs/egbgb_xml" + mkdir -p "$egbgb_extract" + unzip -qo "$egbgb_xml" -d "$egbgb_extract" 2>/dev/null || true + local egbgb_xmlfile + egbgb_xmlfile=$(find "$egbgb_extract" -name "*.xml" | head -1) + if [[ -n "$egbgb_xmlfile" ]]; then + # Art. 246a EGBGB (Informationspflichten Fernabsatz) + Anlage 1+2 (Widerrufsbelehrung) + python3 -c " +import xml.etree.ElementTree as ET +tree = ET.parse('$egbgb_xmlfile') +root = tree.getroot() +text_parts = [] +for norm in root.iter(): + if norm.tag.endswith('norm'): + # Capture all text — EGBGB is not too large + parts = [t.strip() for t in norm.itertext() if t.strip()] + if any('246' in p or 'Anlage' in p or 'Widerruf' in p or 'Muster' in p for p in parts): + text_parts.extend(parts) +with open('$WORK_DIR/pdfs/EGBGB_Widerruf.txt', 'w') as f: + f.write('EGBGB - Informationspflichten und Muster-Widerrufsbelehrung\n\n' + '\n'.join(text_parts)) +" 2>/dev/null + if [[ -f "$WORK_DIR/pdfs/EGBGB_Widerruf.txt" && $(wc -c < "$WORK_DIR/pdfs/EGBGB_Widerruf.txt") -gt 100 ]]; then + upload_file "$WORK_DIR/pdfs/EGBGB_Widerruf.txt" "$col" "compliance" "legal_reference" "2025" \ + '{"regulation_id":"egbgb","regulation_name_de":"EGBGB (Muster-Widerrufsbelehrung, Art. 246a + Anlage 1+2)","category":"verbraucherschutz","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \ + "EGBGB Muster-Widerrufsbelehrung" + fi + fi + else + warn "EGBGB XML download failed" fi local after after=$(collection_count "$col") - log "Collection $col: $before → $after chunks (+$((after - before)))" + local diff=$(( ${after:-0} - ${before:-0} )) 2>/dev/null || diff="?" + log "Collection $col: ${before:-?} → ${after:-?} chunks (+${diff})" # ========================================================================= # H2: EU-Recht → bp_compliance_ce @@ -915,7 +1112,8 @@ phase_verbraucherschutz() { done after=$(collection_count "$col") - log "Collection $col: $before → $after chunks (+$((after - before)))" + local diff=$(( ${after:-0} - ${before:-0} )) 2>/dev/null || diff="?" + log "Collection $col: ${before:-?} → ${after:-?} chunks (+${diff})" # ========================================================================= # H3: NIST Security Frameworks → bp_compliance_security @@ -956,7 +1154,8 @@ phase_verbraucherschutz() { fi after=$(collection_count "$col") - log "Collection $col: $before → $after chunks (+$((after - before)))" + local diff=$(( ${after:-0} - ${before:-0} )) 2>/dev/null || diff="?" + log "Collection $col: ${before:-?} → ${after:-?} chunks (+${diff})" # ========================================================================= # Summary