5 Commits

Author SHA1 Message Date
Benjamin Admin
71267e2a8a test: add tests for compliance advisor IFRS prompt and ingestion script
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-ai-compliance (push) Successful in 33s
CI / test-python-backend-compliance (push) Successful in 26s
CI / test-python-document-crawler (push) Successful in 23s
CI / test-python-dsms-gateway (push) Successful in 18s
46 tests covering:
- COMPLIANCE_COLLECTIONS validation
- IFRS endorsement warning content (5 points, CELEX, EFRAG reference)
- Ingestion script structure (download_pdf, upload_file functions)
- IFRS/EFRAG/ENISA URLs and metadata validation
- Chunking config and verification phase

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-28 16:46:45 +01:00
Benjamin Admin
52a9ad2279 docs: add Industry Compliance Ingestion documentation
- Document all 10 industry compliance PDFs and their sources
- Cover ingestion script usage, phases, chunking config
- Document IFRS timeout workaround and endorsement warning
- Add license overview for all document sources

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-28 09:50:34 +01:00
Benjamin Admin
ee79a48e8e fix: add User-Agent header to PDF downloads
Some sites (ENISA) reject requests without User-Agent header.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-28 09:33:06 +01:00
Benjamin Admin
9026e392dc fix: ENISA URLs + increase curl timeout for large PDFs
- Update ENISA download URLs to new site structure (publications → sites/default/files)
- Increase curl max-time from 300s to 600s for IFRS PDFs (7.5-8.2MB)
- Update ENISA Secure by Design metadata (title changed)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-28 09:07:01 +01:00
Benjamin Admin
9496e758fc feat: EU-IFRS 2023/1803 + EFRAG Endorsement ingestion & system prompt
- Ingestion script: Add 3 new PDFs (IFRS DE/EN, EFRAG Endorsement Status)
  to ingest-industry-compliance.sh (7 → 10 documents total)
- System prompt: Add EU-IFRS and EFRAG to competence area, add mandatory
  IFRS endorsement warning section for all IFRS/IAS queries

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-28 01:56:04 +01:00
6 changed files with 842 additions and 0 deletions

View File

@@ -0,0 +1,167 @@
import { describe, it, expect } from 'vitest'
import { readFileSync } from 'fs'
import { resolve } from 'path'
/**
* Tests for the ingestion script ingest-industry-compliance.sh
* Validates script structure, URLs, metadata, and configuration.
*/
/** Location of the shell script under test, resolved relative to this test file. */
const SCRIPT_PATH = resolve(__dirname, '../../scripts/ingest-industry-compliance.sh')

/**
 * Reads a file as UTF-8 text.
 *
 * @param path - File to read.
 * @returns The file content, or an empty string when reading throws.
 */
function readTextOrEmpty(path: string): string {
  try {
    return readFileSync(path, 'utf-8')
  } catch {
    return ''
  }
}

/** Full text of the ingestion script; empty when the file could not be read. */
const scriptContent: string = readTextOrEmpty(SCRIPT_PATH)
/** One expectation about the text of the ingestion script. */
type ScriptCheck =
  | { kind: 'has'; title: string; text: string }
  | { kind: 'lacks'; title: string; text: string }
  | { kind: 'matches'; title: string; pattern: RegExp }

/** Builds a check that requires `text` to occur in the script. */
const has = (title: string, text: string): ScriptCheck => ({ kind: 'has', title, text })
/** Builds a check that requires `text` to be absent from the script. */
const lacks = (title: string, text: string): ScriptCheck => ({ kind: 'lacks', title, text })
/** Builds a check that requires `pattern` to match somewhere in the script. */
const matches = (title: string, pattern: RegExp): ScriptCheck => ({ kind: 'matches', title, pattern })

/** Nested suites (name + checks), listed in execution order. */
const SCRIPT_CHECK_GROUPS: ReadonlyArray<readonly [string, readonly ScriptCheck[]]> = [
  [
    'download_pdf function',
    [
      has('should define download_pdf function', 'download_pdf()'),
      has('should use User-Agent header for downloads', 'Mozilla/5.0'),
      matches('should follow redirects with -L flag', /curl.*-L/),
      has('should skip already downloaded files', '-f "$target"'),
    ],
  ],
  [
    'upload_file function',
    [
      has('should define upload_file function', 'upload_file()'),
      has('should use recursive chunk strategy', 'chunk_strategy=recursive'),
      has('should use chunk_size=512', 'chunk_size=512'),
      has('should use chunk_overlap=50', 'chunk_overlap=50'),
      has('should validate minimum file size', '"$filesize" -lt 100'),
    ],
  ],
  [
    'IFRS Downloads',
    [
      has(
        'should download IFRS DE from EUR-Lex',
        'https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32023R1803'
      ),
      has(
        'should download IFRS EN from EUR-Lex',
        'https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32023R1803'
      ),
      has('should save IFRS DE with correct filename', 'ifrs_regulation_2023_1803_de.pdf'),
      has('should save IFRS EN with correct filename', 'ifrs_regulation_2023_1803_en.pdf'),
    ],
  ],
  [
    'EFRAG Download',
    [
      has('should download EFRAG Endorsement Status Report', 'efrag.org'),
      has('should save as efrag_endorsement_status_2025.pdf', 'efrag_endorsement_status_2025.pdf'),
    ],
  ],
  [
    'ENISA Downloads',
    [
      has(
        'should download ENISA from new URL pattern',
        'enisa.europa.eu/sites/default/files/publications'
      ),
      lacks('should NOT use old Plone-style URLs', '@@download/fullReport'),
    ],
  ],
  [
    'IFRS Metadata',
    [
      has('should include CELEX number 32023R1803', '"celex":"32023R1803"'),
      has('should tag as regulation_short EU_IFRS', '"regulation_short":"EU_IFRS"'),
      has('should set category to rechnungslegung', '"category":"rechnungslegung"'),
      has('should include endorsement note', 'EU-endorsed IFRS'),
      has('should set license to public_law', '"license":"public_law"'),
    ],
  ],
  [
    'EFRAG Metadata',
    [
      has('should set source_id to efrag', '"source_id":"efrag"'),
      has('should include EFRAG attribution', 'European Financial Reporting Advisory Group'),
    ],
  ],
  [
    'Target Collections',
    [
      has('should reference bp_compliance_ce', 'bp_compliance_ce'),
      has('should reference bp_compliance_datenschutz', 'bp_compliance_datenschutz'),
    ],
  ],
  [
    'Verification Phase',
    [
      has('should have a phase_verify function', 'phase_verify'),
      has('should test search for IFRS', 'IFRS Rechnungslegung EU endorsed'),
      has('should test search for EFRAG', 'EFRAG endorsement status'),
    ],
  ],
  [
    'Curl Configuration',
    [has('should set max-time to 600 seconds', '--max-time 600')],
  ],
]

describe('Ingestion Script: ingest-industry-compliance.sh', () => {
  it('should exist and be non-empty', () => {
    expect(scriptContent.length).toBeGreaterThan(0)
  })

  // Register one nested suite per group and one test per check.
  for (const [groupName, checks] of SCRIPT_CHECK_GROUPS) {
    describe(groupName, () => {
      for (const check of checks) {
        it(check.title, () => {
          switch (check.kind) {
            case 'has':
              expect(scriptContent).toContain(check.text)
              break
            case 'lacks':
              expect(scriptContent).not.toContain(check.text)
              break
            case 'matches':
              expect(scriptContent).toMatch(check.pattern)
              break
          }
        })
      }
    })
  }
})

View File

@@ -0,0 +1,100 @@
import { describe, it, expect } from 'vitest'
/**
* Tests for Compliance Advisor system prompt:
* - COMPLIANCE_COLLECTIONS includes required collections
* - IFRS endorsement warning is present
* - EU-IFRS competence area is declared
*/
// Local replica of COMPLIANCE_COLLECTIONS from route.ts.
// NOTE(review): this is a copy, not an import — presumably it has to be kept in
// sync with route.ts by hand; confirm.
const COMPLIANCE_COLLECTIONS = [
  'bp_compliance_gesetze',
  'bp_compliance_ce',
  'bp_compliance_datenschutz',
  'bp_dsfa_corpus',
  'bp_compliance_recht',
  'bp_legal_templates',
] as const

// Local replica of the IFRS entries in the system prompt's competence list.
const IFRS_COMPETENCE: string[] = ['EU-IFRS (Verordnung 2023/1803)', 'EFRAG Endorsement Status']

// Local replica of the IFRS warning section: one intro line followed by five
// numbered points, joined with newlines (no trailing newline).
const IFRS_ENDORSEMENT_WARNING: string = [
  'Bei ALLEN Fragen zu IFRS/IAS-Standards MUSST du folgende Punkte beachten:',
  '1. Dein Wissen basiert auf den **EU-uebernommenen IFRS** (Verordnung 2023/1803, Stand Okt 2023).',
  '2. Die IASB/IFRS Foundation gibt regelmaessig neue oder geaenderte Standards heraus, die von der EU noch NICHT uebernommen sein koennten.',
  '3. Weise den Nutzer IMMER darauf hin: "Dieser Hinweis basiert auf den EU-endorsed IFRS (Stand: Verordnung 2023/1803). Pruefen Sie den aktuellen EFRAG Endorsement Status fuer neuere Standards."',
  '4. Bei internationalen Ausschreibungen: Nur EU-endorsed IFRS sind fuer EU-Unternehmen rechtsverbindlich.',
  '5. Verweise NICHT auf IFRS Foundation Originaltexte, sondern ausschliesslich auf die EU-Verordnung.',
].join('\n')
describe('Compliance Advisor System Prompt', () => {
  describe('COMPLIANCE_COLLECTIONS', () => {
    // [test title, collection that must be listed]
    const requiredCollections = [
      ['should include bp_compliance_ce for IFRS/CE documents', 'bp_compliance_ce'],
      ['should include bp_compliance_datenschutz for EFRAG/ENISA', 'bp_compliance_datenschutz'],
      ['should include bp_compliance_gesetze for laws', 'bp_compliance_gesetze'],
      ['should include bp_dsfa_corpus for DSFA', 'bp_dsfa_corpus'],
    ] as const

    for (const [title, collection] of requiredCollections) {
      it(title, () => {
        expect(COMPLIANCE_COLLECTIONS).toContain(collection)
      })
    }

    it('should have exactly 6 collections', () => {
      expect(COMPLIANCE_COLLECTIONS).toHaveLength(6)
    })
  })

  describe('IFRS Competence Area', () => {
    // [test title, index into IFRS_COMPETENCE, fragment expected in that entry]
    const competenceChecks = [
      ['should declare EU-IFRS Verordnung 2023/1803', 0, '2023/1803'],
      ['should declare EFRAG Endorsement Status', 1, 'EFRAG'],
    ] as const

    for (const [title, index, fragment] of competenceChecks) {
      it(title, () => {
        expect(IFRS_COMPETENCE[index]).toContain(fragment)
      })
    }
  })

  describe('IFRS Endorsement Warning', () => {
    // [test title, fragment that must occur in the warning text]
    const requiredFragments = [
      ['should mention Verordnung 2023/1803', 'Verordnung 2023/1803'],
      ['should warn about IASB/IFRS Foundation updates', 'IASB/IFRS Foundation'],
      ['should instruct to reference EFRAG status', 'EFRAG Endorsement Status'],
      ['should mention EU-endorsed IFRS only', 'EU-endorsed IFRS'],
      ['should warn against IFRS Foundation original texts', 'NICHT auf IFRS Foundation Originaltexte'],
      ['should mention international tenders requirement', 'internationalen Ausschreibungen'],
    ] as const

    for (const [title, fragment] of requiredFragments) {
      it(title, () => {
        expect(IFRS_ENDORSEMENT_WARNING).toContain(fragment)
      })
    }

    it('should have 5 numbered points', () => {
      // Counts lines that start with "<digits>." anywhere in the multi-line text.
      const numbered = IFRS_ENDORSEMENT_WARNING.match(/^\d+\./gm)
      expect(numbered).toHaveLength(5)
    })

    it('should reference Stand Okt 2023', () => {
      expect(IFRS_ENDORSEMENT_WARNING).toContain('Stand Okt 2023')
    })
  })
})

View File

@@ -59,6 +59,22 @@ offiziellen Quellen und gibst praxisnahe Hinweise.
- WP29/WP248 (Art.-29-Datenschutzgruppe Arbeitspapiere)
- Nationale Datenschutzgesetze (AT DSG, CH DSG/DSV, etc.)
- EU-Verordnungen (DORA, MiCA, Data Act, EHDS, PSD2, AMLR, etc.)
- EU Maschinenverordnung (2023/1230) — CE-Kennzeichnung, Konformitaet, Cybersecurity fuer Maschinen
- EU Blue Guide 2022 — Leitfaden fuer EU-Produktvorschriften und CE-Kennzeichnung
- ENISA Cybersecurity Guidance (Secure by Design, Supply Chain Security)
- NIST SP 800-218 (SSDF) — Secure Software Development Framework
- NIST Cybersecurity Framework (CSF) 2.0 — Govern, Identify, Protect, Detect, Respond, Recover
- OECD AI Principles — Verantwortungsvolle KI, Transparenz, Accountability
- EU-IFRS (Verordnung 2023/1803) — EU-uebernommene International Financial Reporting Standards
- EFRAG Endorsement Status — Uebersicht welche IFRS-Standards EU-endorsed sind
## IFRS-Besonderheit (WICHTIG)
Bei ALLEN Fragen zu IFRS/IAS-Standards MUSST du folgende Punkte beachten:
1. Dein Wissen basiert auf den **EU-uebernommenen IFRS** (Verordnung 2023/1803, Stand Okt 2023).
2. Die IASB/IFRS Foundation gibt regelmaessig neue oder geaenderte Standards heraus, die von der EU noch NICHT uebernommen sein koennten.
3. Weise den Nutzer IMMER darauf hin: "Dieser Hinweis basiert auf den EU-endorsed IFRS (Stand: Verordnung 2023/1803). Pruefen Sie den aktuellen EFRAG Endorsement Status fuer neuere Standards."
4. Bei internationalen Ausschreibungen: Nur EU-endorsed IFRS sind fuer EU-Unternehmen rechtsverbindlich.
5. Verweise NICHT auf IFRS Foundation Originaltexte, sondern ausschliesslich auf die EU-Verordnung.
## RAG-Nutzung
Nutze das gesamte RAG-Corpus fuer Kontext und Quellenangaben — ausgenommen sind

View File

@@ -0,0 +1,115 @@
# Industry Compliance Ingestion
## Uebersicht
Das Ingestion-Skript `scripts/ingest-industry-compliance.sh` laedt oeffentlich verfuegbare Industrie-Compliance-Dokumente herunter und ingestiert sie in Qdrant via die Core RAG-API (Port 8097).
**Ausfuehrung:** Mac Mini
**Speicherort:** `~/rag-ingestion/`
**RAG-API:** `https://localhost:8097/api/v1/documents/upload`
---
## Dokumente (10 PDFs)
| # | Dokument | Quelle | Collection | Chunks |
|---|----------|--------|------------|--------|
| 1 | EU Maschinenverordnung 2023/1230 | EUR-Lex | `bp_compliance_ce` | ~882 |
| 2 | EU Blue Guide 2022 | EUR-Lex | `bp_compliance_ce` | ~1600 |
| 3 | ENISA Advancing Software Security | enisa.europa.eu | `bp_compliance_datenschutz` | ~99 |
| 4 | ENISA Supply Chain Threat Landscape | enisa.europa.eu | `bp_compliance_datenschutz` | ~284 |
| 5 | NIST SP 800-218 (SSDF) | nist.gov | `bp_compliance_datenschutz` | ~242 |
| 6 | NIST Cybersecurity Framework 2.0 | nist.gov | `bp_compliance_datenschutz` | ~162 |
| 7 | OECD AI Principles | oecd.org | `bp_compliance_datenschutz` | ~76 |
| 8 | EU-IFRS Verordnung 2023/1803 (DE) | EUR-Lex | `bp_compliance_ce` | ~8942 |
| 9 | EU-IFRS Verordnung 2023/1803 (EN) | EUR-Lex | `bp_compliance_ce` | ~9000 |
| 10 | EFRAG Endorsement Status Report | efrag.org | `bp_compliance_datenschutz` | ~48 |
---
## Ausfuehrung
```bash
# Vollstaendig (Download + Upload + Verify)
bash ~/rag-ingestion/ingest-industry-compliance.sh
# Nur Downloads
bash ~/rag-ingestion/ingest-industry-compliance.sh --only download
# Nur CE-Collection uploaden
bash ~/rag-ingestion/ingest-industry-compliance.sh --only ce --skip-download
# Nur Datenschutz-Collection uploaden
bash ~/rag-ingestion/ingest-industry-compliance.sh --only datenschutz --skip-download
# Nur Verifizierung
bash ~/rag-ingestion/ingest-industry-compliance.sh --only verify
```
---
## Phasen
### Phase A: Downloads
- Laedt alle 10 PDFs nach `~/rag-ingestion/pdfs/`
- Ueberspringt bereits vorhandene Dateien
- User-Agent Header fuer ENISA-Kompatibilitaet
### Phase B: CE-Collection (`bp_compliance_ce`)
- EU-Rechtstexte (Maschinenverordnung, Blue Guide, IFRS)
- Metadata: CELEX-Nummer, Kategorie, Sprache
### Phase C: Datenschutz-Collection (`bp_compliance_datenschutz`)
- Frameworks und Guidance (ENISA, NIST, OECD, EFRAG)
- Metadata: Source-ID, Typ, Attribution
### Phase D: Verifizierung
- Collection-Counts pruefen
- Test-Suchen durchfuehren
---
## Chunking-Konfiguration
| Parameter | Wert |
|-----------|------|
| Strategie | `recursive` |
| Chunk-Groesse | 512 Token |
| Chunk-Overlap | 50 Token |
| Embedding-Modell | BGE-M3 (1024-dim) |
---
## IFRS-Besonderheit
Die IFRS-Verordnung (EU) 2023/1803 ist mit ~8MB sehr gross und erzeugt ~9000 Chunks. Der Upload dauert 10-15 Minuten wegen der sequenziellen Embedding-Erzeugung.
**Workaround fuer Timeout:**
```bash
# PDF in Container kopieren und von dort uploaden
docker cp ifrs_regulation_2023_1803_de.pdf bp-core-rag-service:/tmp/
docker exec -d bp-core-rag-service sh -c "curl -s --max-time 1800 -X POST http://localhost:8097/api/v1/documents/upload -F file=@/tmp/ifrs_regulation_2023_1803_de.pdf -F collection=bp_compliance_ce ..."
```
---
## Compliance Advisor Integration
Der System-Prompt in `admin-compliance/app/api/sdk/compliance-advisor/chat/route.ts` referenziert alle ingestierten Dokumente. Bei IFRS-Fragen wird ein spezieller Endorsement-Hinweis angezeigt:
> Dieser Hinweis basiert auf den EU-endorsed IFRS (Stand: Verordnung 2023/1803).
> Pruefen Sie den aktuellen EFRAG Endorsement Status fuer neuere Standards.
---
## Lizenzen
Alle Dokumente stehen unter oeffentlich nutzbaren Lizenzen:
| Quelle | Lizenz |
|--------|--------|
| EUR-Lex | Amtliches Werk der EU (Public Domain) |
| ENISA | EUPL/Reuse Notice |
| NIST | Public Domain (US Government) |
| OECD | Reuse Notice |
| EFRAG | Oeffentliches Dokument |

View File

@@ -74,6 +74,7 @@ nav:
- Document Crawler: services/sdk-modules/document-crawler.md
- Advisory Board: services/sdk-modules/advisory-board.md
- DSB Portal: services/sdk-modules/dsb-portal.md
- Industry Compliance Ingestion: services/sdk-modules/industry-compliance-ingestion.md
- Entwicklung:
- Testing: development/testing.md
- Dokumentation: development/documentation.md

View File

@@ -0,0 +1,443 @@
#!/usr/bin/env bash
# =============================================================================
# BreakPilot Compliance — Industry Compliance Ingestion
#
# Laedt 10 freie Industrie-Compliance-Dokumente herunter und ingestiert sie
# in Qdrant via die Core RAG-API (Port 8097).
#
# Dokumente:
# 1. EU Machinery Regulation 2023/1230 → bp_compliance_ce
# 2. EU Blue Guide 2022 → bp_compliance_ce
# 3. ENISA Secure by Design → bp_compliance_datenschutz
# 4. ENISA Supply Chain Security → bp_compliance_datenschutz
# 5. NIST SP 800-218 (SSDF) → bp_compliance_datenschutz
# 6. NIST Cybersecurity Framework 2.0 → bp_compliance_datenschutz
# 7. OECD AI Principles → bp_compliance_datenschutz
# 8. EU-IFRS Regulation 2023/1803 (DE) → bp_compliance_ce
# 9. EU-IFRS Regulation 2023/1803 (EN) → bp_compliance_ce
# 10. EFRAG Endorsement Status Report → bp_compliance_datenschutz
#
# Ausfuehrung auf dem Mac Mini:
# ~/rag-ingestion/ingest-industry-compliance.sh [--skip-download] [--only PHASE]
#
# Phasen: download, ce, datenschutz, verify
# =============================================================================
# Abort on errors, on use of unset variables, and on failures inside pipelines.
set -euo pipefail

# --- Configuration -----------------------------------------------------------
# Working directory; can be overridden through the WORK_DIR environment variable.
WORK_DIR="${WORK_DIR:-$HOME/rag-ingestion}"
RAG_URL="https://localhost:8097/api/v1/documents/upload"
QDRANT_URL="http://localhost:6333"
# Shared curl flags: silent, no TLS certificate verification (-k),
# 10s connect timeout, 600s total transfer time.
CURL_OPTS="-sk --connect-timeout 10 --max-time 600"

# Counters reported in the final summary
UPLOADED=0
FAILED=0
SKIPPED=0

# --- CLI Args ----------------------------------------------------------------
SKIP_DOWNLOAD=false
ONLY_PHASE=""
while [[ $# -gt 0 ]]; do
  case $1 in
    --skip-download) SKIP_DOWNLOAD=true; shift ;;
    --only)
      # With `set -u`, reading "$2" when --only is the last argument would abort
      # with an "unbound variable" message; report a usable error instead.
      if [[ $# -lt 2 ]]; then
        echo "Option --only requires a phase: download, ce, datenschutz, verify" >&2
        exit 1
      fi
      ONLY_PHASE="$2"; shift 2 ;;
    -h|--help)
      echo "Usage: $0 [--skip-download] [--only PHASE]"
      echo "Phases: download, ce, datenschutz, verify"
      exit 0
      ;;
    *) echo "Unknown option: $1"; exit 1 ;;
  esac
done
# --- Helpers -----------------------------------------------------------------
# Console helpers: print all arguments as one line prefixed with [HH:MM:SS].
# ok/warn/fail add a status glyph; warn and fail write to stderr.
log() {
  printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$*"
}
ok() {
  printf '[%s] ✓ %s\n' "$(date '+%H:%M:%S')" "$*"
}
warn() {
  printf '[%s] ⚠ %s\n' "$(date '+%H:%M:%S')" "$*" 1>&2
}
fail() {
  printf '[%s] ✗ %s\n' "$(date '+%H:%M:%S')" "$*" 1>&2
}
# Upload one local file to the RAG API ($RAG_URL) and update the global counters.
#
# Arguments:
#   $1 file           path of the file to upload
#   $2 collection     target collection name
#   $3 data_type      value of the data_type form field
#   $4 use_case       value of the use_case form field
#   $5 year           value of the year form field
#   $6 metadata_json  JSON string sent as the metadata_json form field
#   $7 label          optional display name for log output (default: basename of $1)
#
# Returns:
#   0  the response contains "chunks_count" or "vectors_indexed" (UPLOADED+1)
#   1  the file is missing (FAILED+1), smaller than 100 bytes (SKIPPED+1),
#      or the response contains neither key (FAILED+1)
#
# The script runs under `set -e`, so callers that want to continue after a
# failed upload must guard the call (e.g. `upload_file ... || true`).
upload_file() {
  local file="$1"
  local collection="$2"
  local data_type="$3"
  local use_case="$4"
  local year="$5"
  local metadata_json="$6"
  local label="${7:-$(basename "$file")}"

  if [[ ! -f "$file" ]]; then
    warn "File not found: $file"
    FAILED=$((FAILED + 1))
    return 1
  fi

  # BSD/macOS stat first, GNU stat as fallback; 0 when both fail.
  local filesize
  filesize=$(stat -f%z "$file" 2>/dev/null || stat -c%s "$file" 2>/dev/null || echo 0)
  if [[ "$filesize" -lt 100 ]]; then
    warn "File too small (${filesize}B), skipping: $label"
    SKIPPED=$((SKIPPED + 1))
    return 1
  fi

  log "Uploading: $label → $collection ($(( filesize / 1024 ))KB)"

  # A curl failure is tolerated here (|| true); the empty response then falls
  # through to the "Upload failed" branch below.
  # metadata_json is sent with --form-string so curl takes the JSON literally
  # and does not interpret ';', '@' or '<' inside the value.
  local response
  response=$(curl $CURL_OPTS -X POST "$RAG_URL" \
    -F "file=@${file}" \
    -F "collection=${collection}" \
    -F "data_type=${data_type}" \
    -F "use_case=${use_case}" \
    -F "year=${year}" \
    -F "chunk_strategy=recursive" \
    -F "chunk_size=512" \
    -F "chunk_overlap=50" \
    --form-string "metadata_json=${metadata_json}" \
    2>/dev/null) || true

  if echo "$response" | grep -q '"chunks_count"'; then
    local chunks
    chunks=$(echo "$response" | python3 -c "import sys,json; print(json.load(sys.stdin).get('chunks_count',0))" 2>/dev/null || echo "?")
    ok "$label → $chunks chunks"
    UPLOADED=$((UPLOADED + 1))
  elif echo "$response" | grep -q '"vectors_indexed"'; then
    local vectors
    vectors=$(echo "$response" | python3 -c "import sys,json; print(json.load(sys.stdin).get('vectors_indexed',0))" 2>/dev/null || echo "?")
    ok "$label → $vectors vectors"
    UPLOADED=$((UPLOADED + 1))
  else
    fail "Upload failed: $label"
    fail "Response: $response"
    FAILED=$((FAILED + 1))
    return 1
  fi
}
# Download a PDF unless the target file already exists.
#
# Arguments:
#   $1 url     source URL
#   $2 target  destination path
#
# Returns 0 when the target already exists or the download succeeded, 1 when
# curl failed (including HTTP status >= 400 because of --fail).
#
# The body is written to "<target>.part" and renamed only on success. Otherwise
# an HTTP error page or a truncated transfer would be stored under the final
# name and skipped as "exists" on every later run.
download_pdf() {
  local url="$1"
  local target="$2"
  local partial="${target}.part"

  if [[ -f "$target" ]]; then
    log "PDF exists: $(basename "$target") (skipping)"
    return 0
  fi

  log "Downloading: $(basename "$target")"
  # NOTE(review): CURL_OPTS contains -k, so TLS certificates of these public
  # download hosts are not verified — confirm whether that is intended.
  if ! curl $CURL_OPTS --fail -L -A 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)' "$url" -o "$partial" 2>/dev/null; then
    rm -f "$partial"
    warn "Download failed: $url"
    return 1
  fi

  mv -f "$partial" "$target"
}
# Print the points_count that Qdrant reports for a collection.
#
# Arguments:
#   $1  collection name
#
# Prints "?" when the request or the JSON extraction fails.
collection_count() {
  local name="$1"
  local extract_points="import sys,json; print(json.load(sys.stdin)['result']['points_count'])"

  curl -s "${QDRANT_URL}/collections/${name}" 2>/dev/null \
    | python3 -c "$extract_points" 2>/dev/null \
    || echo "?"
}
# =============================================================================
# PHASE A: Downloads (10 PDFs)
# =============================================================================
# Phase A: fetch all 10 source PDFs into "$WORK_DIR/pdfs".
#
# Every download is attempted even if an earlier one fails. The script runs
# under `set -e`, so an unguarded failing download_pdf call would abort the
# whole run at the first failure. Failures are tallied and reported as a
# warning. They are added to FAILED only for `--only download`; in any other
# run the missing file is counted by upload_file instead.
phase_download() {
  local dl_failed=0

  log "=========================================="
  log "PHASE A: Downloads (10 Industry Compliance PDFs)"
  log "=========================================="
  mkdir -p "$WORK_DIR/pdfs"

  # --- A1: EUR-Lex ---
  log "--- EUR-Lex: Machinery Regulation ---"
  download_pdf \
    "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32023R1230" \
    "$WORK_DIR/pdfs/machinery_regulation_2023_1230.pdf" || dl_failed=$((dl_failed + 1))

  # --- A2: EU Blue Guide 2022 ---
  log "--- EU Blue Guide 2022 ---"
  download_pdf \
    "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:52022XC0629(04)" \
    "$WORK_DIR/pdfs/blue_guide_2022.pdf" || dl_failed=$((dl_failed + 1))

  # --- A3: ENISA Publications ---
  log "--- ENISA Publications ---"
  download_pdf \
    "https://www.enisa.europa.eu/sites/default/files/publications/ENISA%20Report%20-%20Advancing%20Software%20Security%20in%20the%20EU.pdf" \
    "$WORK_DIR/pdfs/enisa_secure_by_design.pdf" || dl_failed=$((dl_failed + 1))
  download_pdf \
    "https://www.enisa.europa.eu/sites/default/files/publications/ENISA%20Threat%20Landscape%20for%20Supply%20Chain%20Attacks.pdf" \
    "$WORK_DIR/pdfs/enisa_supply_chain_security.pdf" || dl_failed=$((dl_failed + 1))

  # --- A4: NIST Publications ---
  log "--- NIST Publications ---"
  download_pdf \
    "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-218.pdf" \
    "$WORK_DIR/pdfs/nist_sp_800_218_ssdf.pdf" || dl_failed=$((dl_failed + 1))
  download_pdf \
    "https://nvlpubs.nist.gov/nistpubs/CSWP/NIST.CSWP.29.pdf" \
    "$WORK_DIR/pdfs/nist_csf_2_0.pdf" || dl_failed=$((dl_failed + 1))

  # --- A5: OECD AI Principles ---
  log "--- OECD AI Principles ---"
  download_pdf \
    "https://legalinstruments.oecd.org/api/print?ids=648&lang=en" \
    "$WORK_DIR/pdfs/oecd_ai_principles.pdf" || dl_failed=$((dl_failed + 1))

  # --- A6: EUR-Lex IFRS (DE + EN) ---
  log "--- EUR-Lex: EU-IFRS Regulation 2023/1803 ---"
  download_pdf \
    "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32023R1803" \
    "$WORK_DIR/pdfs/ifrs_regulation_2023_1803_de.pdf" || dl_failed=$((dl_failed + 1))
  download_pdf \
    "https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32023R1803" \
    "$WORK_DIR/pdfs/ifrs_regulation_2023_1803_en.pdf" || dl_failed=$((dl_failed + 1))

  # --- A7: EFRAG Endorsement Status Report ---
  log "--- EFRAG Endorsement Status Report ---"
  download_pdf \
    "https://www.efrag.org/sites/default/files/media/document/2025-12/EFRAG%20Endorsement%20Status%20Report%2018%20December%202025.pdf" \
    "$WORK_DIR/pdfs/efrag_endorsement_status_2025.pdf" || dl_failed=$((dl_failed + 1))

  if [[ "$dl_failed" -gt 0 ]]; then
    warn "$dl_failed download(s) failed"
    # No upload phase follows in download-only mode, so surface the failures
    # in the summary and the exit status here.
    if [[ "$ONLY_PHASE" == "download" ]]; then
      FAILED=$((FAILED + dl_failed))
    fi
  fi

  log "Downloads complete."
}
# =============================================================================
# PHASE B: EU-Rechtstexte → bp_compliance_ce
# =============================================================================
# Phase B: upload the EU legal texts (Machinery Regulation, Blue Guide,
# EU-IFRS DE/EN) into the bp_compliance_ce collection and log the collection
# count before and after.
#
# Each upload_file call is guarded with `|| true`: upload_file returns 1 on a
# missing, too-small or rejected file (and updates FAILED/SKIPPED itself). Under
# `set -e` an unguarded call would abort the script at the first failure, before
# the remaining documents and the final summary.
phase_ce() {
  log "=========================================="
  log "PHASE B: EU-Rechtstexte → bp_compliance_ce"
  log "=========================================="
  local col="bp_compliance_ce"
  local before
  before=$(collection_count "$col")
  log "Collection $col: $before chunks (before)"

  # 1. Machinery Regulation (EU) 2023/1230
  upload_file "$WORK_DIR/pdfs/machinery_regulation_2023_1230.pdf" "$col" "compliance_ce" "legal_reference" "2023" \
    '{"regulation_id":"eu_2023_1230","regulation_name_de":"Maschinenverordnung","regulation_name_en":"Machinery Regulation","regulation_short":"MACHINERY_REG","category":"maschinensicherheit","celex":"32023R1230","source":"eur-lex","license":"public_law"}' \
    "Maschinenverordnung (EU) 2023/1230" || true

  # 2. Blue Guide 2022
  upload_file "$WORK_DIR/pdfs/blue_guide_2022.pdf" "$col" "compliance_ce" "legal_reference" "2022" \
    '{"regulation_id":"eu_blue_guide_2022","regulation_name_de":"Leitfaden fuer die Umsetzung der Produktvorschriften (Blue Guide)","regulation_name_en":"Blue Guide on EU Product Rules","regulation_short":"BLUE_GUIDE","category":"produktregulierung","celex":"52022XC0629(04)","source":"eur-lex","license":"public_law"}' \
    "Blue Guide 2022 — EU-Produktvorschriften" || true

  # 8. EU-IFRS Regulation 2023/1803 (DE)
  upload_file "$WORK_DIR/pdfs/ifrs_regulation_2023_1803_de.pdf" "$col" "compliance_ce" "legal_reference" "2023" \
    '{"regulation_id":"eu_2023_1803","regulation_name_de":"IFRS-Uebernahmeverordnung","regulation_name_en":"IFRS Adoption Regulation","regulation_short":"EU_IFRS","category":"rechnungslegung","celex":"32023R1803","source":"eur-lex","license":"public_law","language":"de","endorsement_note":"Nur EU-endorsed IFRS. Neuere IASB-Standards sind moeglicherweise noch nicht uebernommen."}' \
    "EU-IFRS Regulation 2023/1803 (DE)" || true

  # 9. EU-IFRS Regulation 2023/1803 (EN)
  upload_file "$WORK_DIR/pdfs/ifrs_regulation_2023_1803_en.pdf" "$col" "compliance_ce" "legal_reference" "2023" \
    '{"regulation_id":"eu_2023_1803","regulation_name_de":"IFRS-Uebernahmeverordnung","regulation_name_en":"IFRS Adoption Regulation","regulation_short":"EU_IFRS","category":"rechnungslegung","celex":"32023R1803","source":"eur-lex","license":"public_law","language":"en","endorsement_note":"EU-endorsed IFRS only. Newer IASB standards may not yet be adopted by the EU."}' \
    "EU-IFRS Regulation 2023/1803 (EN)" || true

  local after
  after=$(collection_count "$col")
  log "Collection $col: $before → $after chunks"
}
# =============================================================================
# PHASE C: Frameworks/Guidance → bp_compliance_datenschutz
# =============================================================================
# Phase C: upload the framework/guidance documents (ENISA, NIST, OECD, EFRAG)
# into the bp_compliance_datenschutz collection and log the collection count
# before and after.
#
# Each upload_file call is guarded with `|| true`: upload_file returns 1 on a
# missing, too-small or rejected file (and updates FAILED/SKIPPED itself). Under
# `set -e` an unguarded call would abort the script at the first failure, before
# the remaining documents and the final summary.
phase_datenschutz() {
  log "=========================================="
  log "PHASE C: Frameworks/Guidance → bp_compliance_datenschutz"
  log "=========================================="
  local col="bp_compliance_datenschutz"
  local before
  before=$(collection_count "$col")
  log "Collection $col: $before chunks (before)"

  # 3. ENISA Secure by Design
  upload_file "$WORK_DIR/pdfs/enisa_secure_by_design.pdf" "$col" "compliance_datenschutz" "guidance" "2023" \
    '{"source_id":"enisa","doc_type":"guidance","guideline_name":"Advancing Software Security in the EU","license":"reuse_notice","attribution":"European Union Agency for Cybersecurity (ENISA)","source":"enisa.europa.eu"}' \
    "ENISA: Advancing Software Security in the EU" || true

  # 4. ENISA Supply Chain Security
  upload_file "$WORK_DIR/pdfs/enisa_supply_chain_security.pdf" "$col" "compliance_datenschutz" "guidance" "2021" \
    '{"source_id":"enisa","doc_type":"guidance","guideline_name":"Threat Landscape for Supply Chain Attacks","license":"reuse_notice","attribution":"European Union Agency for Cybersecurity (ENISA)","source":"enisa.europa.eu"}' \
    "ENISA: Supply Chain Security Threat Landscape" || true

  # 5. NIST SP 800-218 (SSDF)
  upload_file "$WORK_DIR/pdfs/nist_sp_800_218_ssdf.pdf" "$col" "compliance_datenschutz" "guidance" "2022" \
    '{"source_id":"nist","doc_type":"framework","guideline_name":"Secure Software Development Framework (SSDF) SP 800-218","license":"public_domain","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \
    "NIST SP 800-218 — Secure Software Development Framework" || true

  # 6. NIST Cybersecurity Framework 2.0
  upload_file "$WORK_DIR/pdfs/nist_csf_2_0.pdf" "$col" "compliance_datenschutz" "guidance" "2024" \
    '{"source_id":"nist","doc_type":"framework","guideline_name":"NIST Cybersecurity Framework (CSF) 2.0","license":"public_domain","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \
    "NIST Cybersecurity Framework 2.0" || true

  # 7. OECD AI Principles
  upload_file "$WORK_DIR/pdfs/oecd_ai_principles.pdf" "$col" "compliance_datenschutz" "guidance" "2024" \
    '{"source_id":"oecd","doc_type":"guidance","guideline_name":"OECD Recommendation on Artificial Intelligence (AI Principles)","license":"reuse_notice","attribution":"Organisation for Economic Co-operation and Development (OECD)","source":"oecd.org"}' \
    "OECD AI Principles (Recommendation on AI)" || true

  # 10. EFRAG Endorsement Status Report
  upload_file "$WORK_DIR/pdfs/efrag_endorsement_status_2025.pdf" "$col" "compliance_datenschutz" "guidance" "2025" \
    '{"source_id":"efrag","doc_type":"guidance","guideline_name":"EFRAG Endorsement Status Report (Dec 2025)","license":"reuse_notice","attribution":"European Financial Reporting Advisory Group (EFRAG)","source":"efrag.org"}' \
    "EFRAG Endorsement Status Report (Dec 2025)" || true

  local after
  after=$(collection_count "$col")
  log "Collection $col: $before → $after chunks"
}
# =============================================================================
# PHASE D: Verifizierung
# =============================================================================
# Run one test search against the RAG search endpoint and print the hit count
# plus score and first 80 content characters of up to three results.
#
# Arguments:
#   $1 query       search text (must not contain double quotes or backslashes,
#                  it is placed into the JSON body verbatim)
#   $2 collection  collection to search
#
# Prints " (parse error)" when the response cannot be processed and
# " (search failed)" when the pipeline exits non-zero; always returns 0.
_verify_search() {
  local query="$1"
  local collection="$2"
  local payload
  payload=$(printf '{"query":"%s","collection":"%s","top_k":3}' "$query" "$collection")

  log "Suche: '$query' in $collection"
  curl $CURL_OPTS -X POST "https://localhost:8097/api/v1/search" \
    -H 'Content-Type: application/json' \
    -d "$payload" 2>/dev/null \
    | python3 -c "
import sys,json
try:
    data = json.load(sys.stdin)
    results = data.get('results', [])
    print(f' Treffer: {len(results)}')
    for r in results[:3]:
        print(f' [{r.get(\"score\",0):.3f}] {r.get(\"content\",\"\")[:80]}...')
except Exception:
    print(' (parse error)')
" 2>/dev/null || echo " (search failed)"
}

# Phase D: print the point counts of both target collections and run one
# test search per ingested document family.
phase_verify() {
  log "=========================================="
  log "PHASE D: Verifizierung"
  log "=========================================="
  echo ""
  echo "=== Collection Stats ==="
  local col count
  for col in bp_compliance_ce bp_compliance_datenschutz; do
    count=$(collection_count "$col")
    printf " %-30s %s chunks\n" "$col" "$count"
  done
  echo ""
  echo "=== Test-Suchen ==="
  _verify_search "Maschinenverordnung CE-Kennzeichnung" "bp_compliance_ce"
  _verify_search "Supply Chain Cybersecurity ENISA" "bp_compliance_datenschutz"
  _verify_search "NIST Cybersecurity Framework Governance" "bp_compliance_datenschutz"
  _verify_search "OECD AI Principles transparency accountability" "bp_compliance_datenschutz"
  _verify_search "IFRS Rechnungslegung EU endorsed" "bp_compliance_ce"
  _verify_search "EFRAG endorsement status IFRS 18" "bp_compliance_datenschutz"
}
# =============================================================================
# MAIN
# =============================================================================
# Entry point: print the banner, run either the single phase selected with
# --only or the full pipeline, then print the counters and exit 1 if any
# failure was recorded.
log "============================================================"
log "BreakPilot Industry Compliance Ingestion"
log "Work dir: $WORK_DIR"
log "RAG API: $RAG_URL"
log "============================================================"

if [[ -z "$ONLY_PHASE" ]]; then
  # Full pipeline: download (unless skipped), both upload phases, verification.
  if [[ "$SKIP_DOWNLOAD" != "false" ]]; then
    log "(Skipping downloads)"
  else
    phase_download
  fi
  echo ""
  phase_ce
  echo ""
  phase_datenschutz
  echo ""
  phase_verify
else
  # Single phase requested on the command line.
  case "$ONLY_PHASE" in
    download)
      phase_download
      ;;
    ce)
      phase_ce
      ;;
    datenschutz)
      phase_datenschutz
      ;;
    verify)
      phase_verify
      ;;
    *)
      fail "Unknown phase: $ONLY_PHASE"
      exit 1
      ;;
  esac
fi

echo ""
log "============================================================"
log "DONE — Uploaded: $UPLOADED | Failed: $FAILED | Skipped: $SKIPPED"
log "============================================================"

# Kept as an if-statement: a trailing `test && exit 1` would make the script
# itself exit non-zero whenever the test is false.
if (( FAILED > 0 )); then
  exit 1
fi