From c88653b22145093861baff4859825839e8bafeef Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Thu, 12 Mar 2026 09:39:09 +0100
Subject: [PATCH] fix(rag): Dedup check, BGB split, GewO timeout, arithmetic
 fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add Qdrant dedup check in upload_file() — skip if regulation_id already exists
- Split BGB (2.7MB) into 5 targeted parts via XML extraction:
  AGB §§305-310, Fernabsatz §§312-312k, Kaufrecht §§433-480,
  Widerruf §§355-361, Digitale Produkte §§327-327u
- Lower large-file threshold 512KB→384KB (fixes GewO 432KB timeout)
- Fix arithmetic syntax error when collection_count returns "?"
- Replace EGBGB PDF (was empty) with XML extraction
- Add unzip to Alpine container for XML archives

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .gitea/workflows/rag-ingest.yaml |   2 +-
 scripts/ingest-legal-corpus.sh   | 239 ++++++++++++++++++++++++++++---
 2 files changed, 220 insertions(+), 21 deletions(-)

diff --git a/.gitea/workflows/rag-ingest.yaml b/.gitea/workflows/rag-ingest.yaml
index c7d50c5..b7c4769 100644
--- a/.gitea/workflows/rag-ingest.yaml
+++ b/.gitea/workflows/rag-ingest.yaml
@@ -78,7 +78,7 @@ jobs:
             -e "SDK_URL=http://bp-compliance-ai-sdk:8090" \
             alpine:3.19 \
             sh -c "
-              apk add --no-cache curl bash coreutils git python3 > /dev/null 2>&1
+              apk add --no-cache curl bash coreutils git python3 unzip > /dev/null 2>&1
               mkdir -p /tmp/rag-ingestion/{pdfs,repos,texts}
               cd /workspace
               if [ '${PHASE}' = 'all' ]; then
diff --git a/scripts/ingest-legal-corpus.sh b/scripts/ingest-legal-corpus.sh
index 7dc5ec5..7a9e26b 100755
--- a/scripts/ingest-legal-corpus.sh
+++ b/scripts/ingest-legal-corpus.sh
@@ -64,6 +64,22 @@ upload_file() {
     return 0  # Don't abort script
   fi
 
+  # Dedup-Check: Prüfe ob regulation_id bereits in Qdrant vorhanden ist
+  local reg_id
+  reg_id=$(echo "$metadata_json" | python3 -c "import sys,json; print(json.load(sys.stdin).get('regulation_id',''))" 2>/dev/null || echo "")
+  if [[ -n "$reg_id" && -n "${QDRANT_URL:-}" ]]; then
+    local existing
+    existing=$(curl -sk --max-time 5 -X POST "${QDRANT_URL}/collections/${collection}/points/scroll" \
+      -H "Content-Type: application/json" \
+      -d "{\"filter\":{\"must\":[{\"key\":\"regulation_id\",\"match\":{\"value\":\"$reg_id\"}}]},\"limit\":1}" \
+      2>/dev/null | python3 -c "import sys,json; r=json.load(sys.stdin).get('result',{}); print(len(r.get('points',[])))" 2>/dev/null || echo "0")
+    if [[ "$existing" -gt 0 ]] 2>/dev/null; then
+      log "⏭ Skip (already in Qdrant): $label [regulation_id=$reg_id]"
+      SKIPPED=$((SKIPPED + 1))
+      return 0
+    fi
+  fi
+
   local filesize
   filesize=$(stat -f%z "$file" 2>/dev/null || stat -c%s "$file" 2>/dev/null || echo 0)
   if [[ "$filesize" -lt 100 ]]; then
@@ -76,7 +92,7 @@ upload_file() {
 
-  # Use longer timeout for large files (>500KB)
+  # Use longer timeout for large files (>384KB; a 432KB file timed out under the old 512KB cutoff)
   local curl_opts="$CURL_OPTS"
-  if [[ "$filesize" -gt 512000 ]]; then
+  if [[ "$filesize" -gt 384000 ]]; then
     curl_opts="$CURL_OPTS_LARGE"
     log "  (large file, using extended timeout)"
   fi
@@ -833,29 +849,210 @@ phase_verbraucherschutz() {
     fi
   done
 
-  # BGB komplett (Fernabsatz, Digitale Inhalte, Kaufrecht, AGB-Recht)
-  download_pdf \
-    "https://www.gesetze-im-internet.de/bgb/BGB.pdf" \
-    "$WORK_DIR/pdfs/BGB_full.pdf"
-  if [[ -f "$WORK_DIR/pdfs/BGB_full.pdf" ]]; then
-    upload_file "$WORK_DIR/pdfs/BGB_full.pdf" "$col" "compliance" "legal_reference" "2025" \
-      '{"regulation_id":"bgb_komplett","regulation_name_de":"BGB (komplett: AGB-Recht, Fernabsatz, Digitale Inhalte, Kaufrecht)","category":"vertragsrecht","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \
-      "BGB komplett"
+  # BGB in Teilen statt komplett (2.7MB PDF ist zu gross fuer CPU-Embeddings)
+  # gesetze-im-internet.de bietet XML-Download pro Gesetz
+  local bgb_xml="$WORK_DIR/pdfs/bgb_xml.zip"
+  curl -fsL "https://www.gesetze-im-internet.de/bgb/xml.zip" -o "$bgb_xml" 2>/dev/null || true  # -f: never store an HTTP error page as the zip; || true: a failed download is reported by the warn below
+  if [[ -f "$bgb_xml" && $(stat -f%z "$bgb_xml" 2>/dev/null || stat -c%s "$bgb_xml" 2>/dev/null || echo 0) -gt 1000 ]]; then
+    local bgb_extract="$WORK_DIR/pdfs/bgb_xml"
+    mkdir -p "$bgb_extract"
+    unzip -qo "$bgb_xml" -d "$bgb_extract" 2>/dev/null || true
+
+    # Relevante BGB-Abschnitte als Text extrahieren und einzeln uploaden
+    # Die XML-Datei hat <norm> Elemente mit <metadaten><enbez>§ 305</enbez>
+    local bgb_xmlfile
+    bgb_xmlfile=$(find "$bgb_extract" -name "*.xml" | head -1)
+    if [[ -n "$bgb_xmlfile" ]]; then
+      # BGB Teil 1: AGB-Recht §§ 305-310
+      python3 -c "
+import xml.etree.ElementTree as ET, sys, re
+tree = ET.parse('$bgb_xmlfile')
+root = tree.getroot()
+ns = {'': root.tag.split('}')[0].lstrip('{') if '}' in root.tag else ''}
+text_parts = []
+capture = False
+for norm in root.iter():
+    if norm.tag.endswith('norm'):
+        enbez = norm.find('.//' + ('{' + ns[''] + '}' if ns[''] else '') + 'enbez')
+        if enbez is not None and enbez.text:
+            num = re.search(r'§\s*(\d+)', enbez.text)
+            if num:
+                n = int(num.group(1))
+                capture = 305 <= n <= 310
+        else:
+            capture = False
+        if capture:
+            for t in norm.itertext():
+                text_parts.append(t.strip())
+with open('$WORK_DIR/pdfs/BGB_AGB_305_310.txt', 'w') as f:
+    f.write('BGB AGB-Recht §§ 305-310\n\n' + '\n'.join(p for p in text_parts if p))
+" 2>/dev/null
+      if [[ -f "$WORK_DIR/pdfs/BGB_AGB_305_310.txt" && $(wc -c < "$WORK_DIR/pdfs/BGB_AGB_305_310.txt") -gt 100 ]]; then
+        upload_file "$WORK_DIR/pdfs/BGB_AGB_305_310.txt" "$col" "compliance" "legal_reference" "2025" \
+          '{"regulation_id":"bgb_agb","regulation_name_de":"BGB AGB-Recht (§§ 305-310)","category":"vertragsrecht","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \
+          "BGB AGB-Recht §§ 305-310"
+      fi
+
+      # BGB Teil 2: Fernabsatzrecht §§ 312-312k
+      python3 -c "
+import xml.etree.ElementTree as ET, sys, re
+tree = ET.parse('$bgb_xmlfile')
+root = tree.getroot()
+text_parts = []
+capture = False
+for norm in root.iter():
+    if norm.tag.endswith('norm'):
+        enbez = norm.find('.//' + ('{' + root.tag.split('}')[0].lstrip('{') + '}' if '}' in root.tag else '') + 'enbez')
+        if enbez is not None and enbez.text:
+            if re.search(r'§\s*312', enbez.text):
+                capture = True
+            elif re.search(r'§\s*31[3-9]|§\s*32', enbez.text):
+                capture = False
+        else:
+            if capture and not any(norm.itertext()):
+                capture = False
+        if capture:
+            for t in norm.itertext():
+                text_parts.append(t.strip())
+with open('$WORK_DIR/pdfs/BGB_Fernabsatz_312.txt', 'w') as f:
+    f.write('BGB Fernabsatzrecht §§ 312-312k\n\n' + '\n'.join(p for p in text_parts if p))
+" 2>/dev/null
+      if [[ -f "$WORK_DIR/pdfs/BGB_Fernabsatz_312.txt" && $(wc -c < "$WORK_DIR/pdfs/BGB_Fernabsatz_312.txt") -gt 100 ]]; then
+        upload_file "$WORK_DIR/pdfs/BGB_Fernabsatz_312.txt" "$col" "compliance" "legal_reference" "2025" \
+          '{"regulation_id":"bgb_fernabsatz","regulation_name_de":"BGB Fernabsatzrecht (§§ 312-312k)","category":"verbraucherschutz","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \
+          "BGB Fernabsatzrecht §§ 312-312k"
+      fi
+
+      # BGB Teil 3: Kaufrecht + Gewährleistung §§ 433-480
+      python3 -c "
+import xml.etree.ElementTree as ET, sys, re
+tree = ET.parse('$bgb_xmlfile')
+root = tree.getroot()
+text_parts = []
+capture = False
+for norm in root.iter():
+    if norm.tag.endswith('norm'):
+        enbez = norm.find('.//' + ('{' + root.tag.split('}')[0].lstrip('{') + '}' if '}' in root.tag else '') + 'enbez')
+        if enbez is not None and enbez.text:
+            num = re.search(r'§\s*(\d+)', enbez.text)
+            if num:
+                n = int(num.group(1))
+                capture = 433 <= n <= 480
+        else:
+            capture = False
+        if capture:
+            for t in norm.itertext():
+                text_parts.append(t.strip())
+with open('$WORK_DIR/pdfs/BGB_Kaufrecht_433_480.txt', 'w') as f:
+    f.write('BGB Kaufrecht §§ 433-480\n\n' + '\n'.join(p for p in text_parts if p))
+" 2>/dev/null
+      if [[ -f "$WORK_DIR/pdfs/BGB_Kaufrecht_433_480.txt" && $(wc -c < "$WORK_DIR/pdfs/BGB_Kaufrecht_433_480.txt") -gt 100 ]]; then
+        upload_file "$WORK_DIR/pdfs/BGB_Kaufrecht_433_480.txt" "$col" "compliance" "legal_reference" "2025" \
+          '{"regulation_id":"bgb_kaufrecht","regulation_name_de":"BGB Kaufrecht + Gewaehrleistung (§§ 433-480)","category":"verbraucherschutz","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \
+          "BGB Kaufrecht §§ 433-480"
+      fi
+
+      # BGB Teil 4: Widerrufsrecht §§ 355-361
+      python3 -c "
+import xml.etree.ElementTree as ET, sys, re
+tree = ET.parse('$bgb_xmlfile')
+root = tree.getroot()
+text_parts = []
+capture = False
+for norm in root.iter():
+    if norm.tag.endswith('norm'):
+        enbez = norm.find('.//' + ('{' + root.tag.split('}')[0].lstrip('{') + '}' if '}' in root.tag else '') + 'enbez')
+        if enbez is not None and enbez.text:
+            num = re.search(r'§\s*(\d+)', enbez.text)
+            if num:
+                n = int(num.group(1))
+                capture = 355 <= n <= 361
+        else:
+            capture = False
+        if capture:
+            for t in norm.itertext():
+                text_parts.append(t.strip())
+with open('$WORK_DIR/pdfs/BGB_Widerruf_355_361.txt', 'w') as f:
+    f.write('BGB Widerrufsrecht §§ 355-361\n\n' + '\n'.join(p for p in text_parts if p))
+" 2>/dev/null
+      if [[ -f "$WORK_DIR/pdfs/BGB_Widerruf_355_361.txt" && $(wc -c < "$WORK_DIR/pdfs/BGB_Widerruf_355_361.txt") -gt 100 ]]; then
+        upload_file "$WORK_DIR/pdfs/BGB_Widerruf_355_361.txt" "$col" "compliance" "legal_reference" "2025" \
+          '{"regulation_id":"bgb_widerruf","regulation_name_de":"BGB Widerrufsrecht (§§ 355-361)","category":"verbraucherschutz","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \
+          "BGB Widerrufsrecht §§ 355-361"
+      fi
+
+      # BGB Teil 5: Digitale Produkte §§ 327-327u
+      python3 -c "
+import xml.etree.ElementTree as ET, sys, re
+tree = ET.parse('$bgb_xmlfile')
+root = tree.getroot()
+text_parts = []
+capture = False
+for norm in root.iter():
+    if norm.tag.endswith('norm'):
+        enbez = norm.find('.//' + ('{' + root.tag.split('}')[0].lstrip('{') + '}' if '}' in root.tag else '') + 'enbez')
+        if enbez is not None and enbez.text:
+            if re.search(r'§\s*327', enbez.text):
+                capture = True
+            elif re.search(r'§\s*328', enbez.text):
+                capture = False
+        if capture:
+            for t in norm.itertext():
+                text_parts.append(t.strip())
+with open('$WORK_DIR/pdfs/BGB_Digital_327.txt', 'w') as f:
+    f.write('BGB Digitale Produkte §§ 327-327u\n\n' + '\n'.join(p for p in text_parts if p))
+" 2>/dev/null
+      if [[ -f "$WORK_DIR/pdfs/BGB_Digital_327.txt" && $(wc -c < "$WORK_DIR/pdfs/BGB_Digital_327.txt") -gt 100 ]]; then
+        upload_file "$WORK_DIR/pdfs/BGB_Digital_327.txt" "$col" "compliance" "legal_reference" "2025" \
+          '{"regulation_id":"bgb_digital","regulation_name_de":"BGB Digitale Produkte (§§ 327-327u)","category":"verbraucherschutz","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \
+          "BGB Digitale Produkte §§ 327-327u"
+      fi
+    else
+      warn "BGB XML file not found in archive"
+    fi
+  else
+    warn "BGB XML download failed"
   fi
 
-  # EGBGB (Muster-Widerrufsbelehrung Anlage 1+2)
-  download_pdf \
-    "https://www.gesetze-im-internet.de/bgbeg/BGBEG.pdf" \
-    "$WORK_DIR/pdfs/BGBEG.pdf"
-  if [[ -f "$WORK_DIR/pdfs/BGBEG.pdf" ]]; then
-    upload_file "$WORK_DIR/pdfs/BGBEG.pdf" "$col" "compliance" "legal_reference" "2025" \
-      '{"regulation_id":"egbgb","regulation_name_de":"EGBGB (Muster-Widerrufsbelehrung, Informationspflichten)","category":"verbraucherschutz","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \
-      "EGBGB (Muster-Widerrufsbelehrung)"
+  # EGBGB: extract from the XML archive instead of the PDF (BGBEG.pdf was empty)
+  local egbgb_xml="$WORK_DIR/pdfs/bgbeg_xml.zip" egbgb_extract="$WORK_DIR/pdfs/egbgb_xml" egbgb_xmlfile
+  # -f: never store an HTTP error page as the zip; || true: a failed download is reported by the warn below
+  curl -fsL "https://www.gesetze-im-internet.de/bgbeg/xml.zip" -o "$egbgb_xml" 2>/dev/null || true
+  if [[ -f "$egbgb_xml" && $(stat -f%z "$egbgb_xml" 2>/dev/null || stat -c%s "$egbgb_xml" 2>/dev/null || echo 0) -gt 1000 ]]; then
+    mkdir -p "$egbgb_extract"
+    unzip -qo "$egbgb_xml" -d "$egbgb_extract" 2>/dev/null || true
+    egbgb_xmlfile=$(find "$egbgb_extract" -name "*.xml" | head -1)
+    if [[ -n "$egbgb_xmlfile" ]]; then
+      # Keep only norms whose text mentions 246, Anlage, Widerruf or Muster (targets Art. 246a + Anlage 1+2)
+      python3 -c "
+import xml.etree.ElementTree as ET
+tree = ET.parse('$egbgb_xmlfile')
+root = tree.getroot()
+text_parts = []
+for norm in root.iter():
+    if norm.tag.endswith('norm'):
+        parts = [t.strip() for t in norm.itertext() if t.strip()]
+        if any('246' in p or 'Anlage' in p or 'Widerruf' in p or 'Muster' in p for p in parts):
+            text_parts.extend(parts)
+with open('$WORK_DIR/pdfs/EGBGB_Widerruf.txt', 'w') as f:
+    f.write('EGBGB - Informationspflichten und Muster-Widerrufsbelehrung\n\n' + '\n'.join(text_parts))
+" 2>/dev/null
+      if [[ -f "$WORK_DIR/pdfs/EGBGB_Widerruf.txt" && $(wc -c < "$WORK_DIR/pdfs/EGBGB_Widerruf.txt") -gt 100 ]]; then
+        upload_file "$WORK_DIR/pdfs/EGBGB_Widerruf.txt" "$col" "compliance" "legal_reference" "2025" \
+          '{"regulation_id":"egbgb","regulation_name_de":"EGBGB (Muster-Widerrufsbelehrung, Art. 246a + Anlage 1+2)","category":"verbraucherschutz","license":"public_domain_§5_UrhG","source":"gesetze-im-internet.de"}' \
+          "EGBGB Muster-Widerrufsbelehrung"
+      fi
+    else
+      warn "EGBGB XML file not found in archive"
+    fi
+  else
+    warn "EGBGB XML download failed"
   fi
 
   local after
   after=$(collection_count "$col")
-  log "Collection $col: $before → $after chunks (+$((after - before)))"
+  local diff="?"; if [[ "${before:-}" =~ ^[0-9]+$ && "${after:-}" =~ ^[0-9]+$ ]]; then diff=$((after - before)); fi  # non-numeric counts (e.g. "?") keep diff="?" instead of hitting a $(( )) syntax error
+  log "Collection $col: ${before:-?} → ${after:-?} chunks (+${diff})"
 
   # =========================================================================
   # H2: EU-Recht → bp_compliance_ce
@@ -915,7 +1112,8 @@ phase_verbraucherschutz() {
   done
 
   after=$(collection_count "$col")
-  log "Collection $col: $before → $after chunks (+$((after - before)))"
+  local diff="?"; if [[ "${before:-}" =~ ^[0-9]+$ && "${after:-}" =~ ^[0-9]+$ ]]; then diff=$((after - before)); fi  # non-numeric counts (e.g. "?") keep diff="?" instead of hitting a $(( )) syntax error
+  log "Collection $col: ${before:-?} → ${after:-?} chunks (+${diff})"
 
   # =========================================================================
   # H3: NIST Security Frameworks → bp_compliance_security
@@ -956,7 +1154,8 @@ phase_verbraucherschutz() {
   fi
 
   after=$(collection_count "$col")
-  log "Collection $col: $before → $after chunks (+$((after - before)))"
+  local diff="?"; if [[ "${before:-}" =~ ^[0-9]+$ && "${after:-}" =~ ^[0-9]+$ ]]; then diff=$((after - before)); fi  # non-numeric counts (e.g. "?") keep diff="?" instead of hitting a $(( )) syntax error
+  log "Collection $col: ${before:-?} → ${after:-?} chunks (+${diff})"
 
   # =========================================================================
   # Summary