Compare commits
5 Commits
0e932c0df8
...
71267e2a8a
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
71267e2a8a | ||
|
|
52a9ad2279 | ||
|
|
ee79a48e8e | ||
|
|
9026e392dc | ||
|
|
9496e758fc |
167
admin-compliance/__tests__/ingest-industry-compliance.test.ts
Normal file
167
admin-compliance/__tests__/ingest-industry-compliance.test.ts
Normal file
@@ -0,0 +1,167 @@
|
||||
import { describe, it, expect } from 'vitest'
|
||||
import { readFileSync } from 'fs'
|
||||
import { resolve } from 'path'
|
||||
|
||||
/**
|
||||
* Tests for the ingestion script ingest-industry-compliance.sh
|
||||
* Validates script structure, URLs, metadata, and configuration.
|
||||
*/
|
||||
|
||||
/** Absolute path of the ingestion script under test, resolved relative to this test file. */
const SCRIPT_PATH = resolve(__dirname, '../../scripts/ingest-industry-compliance.sh')

/**
 * Reads a file as UTF-8 text.
 *
 * Any read error (e.g. the file does not exist) is mapped to an empty string
 * so that the suite reports a failing "should exist and be non-empty" test
 * instead of crashing while the module is being loaded.
 */
function readScriptOrEmpty(path: string): string {
  try {
    return readFileSync(path, 'utf-8')
  } catch {
    return ''
  }
}

/** Full text of the ingestion script, or '' when it could not be read. */
const scriptContent: string = readScriptOrEmpty(SCRIPT_PATH)
|
||||
|
||||
/** A single textual expectation against the ingestion script. */
type ScriptCheck =
  | { title: string; contains: string }
  | { title: string; excludes: string }
  | { title: string; matches: RegExp }

/**
 * Test groups in reporting order. Each group becomes one nested `describe`
 * and each check becomes one `it` with the given title.
 */
const SCRIPT_CHECK_GROUPS: ReadonlyArray<{ group: string; checks: readonly ScriptCheck[] }> = [
  {
    group: 'download_pdf function',
    checks: [
      { title: 'should define download_pdf function', contains: 'download_pdf()' },
      { title: 'should use User-Agent header for downloads', contains: 'Mozilla/5.0' },
      { title: 'should follow redirects with -L flag', matches: /curl.*-L/ },
      { title: 'should skip already downloaded files', contains: '-f "$target"' },
    ],
  },
  {
    group: 'upload_file function',
    checks: [
      { title: 'should define upload_file function', contains: 'upload_file()' },
      { title: 'should use recursive chunk strategy', contains: 'chunk_strategy=recursive' },
      { title: 'should use chunk_size=512', contains: 'chunk_size=512' },
      { title: 'should use chunk_overlap=50', contains: 'chunk_overlap=50' },
      { title: 'should validate minimum file size', contains: '"$filesize" -lt 100' },
    ],
  },
  {
    group: 'IFRS Downloads',
    checks: [
      {
        title: 'should download IFRS DE from EUR-Lex',
        contains: 'https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32023R1803',
      },
      {
        title: 'should download IFRS EN from EUR-Lex',
        contains: 'https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32023R1803',
      },
      { title: 'should save IFRS DE with correct filename', contains: 'ifrs_regulation_2023_1803_de.pdf' },
      { title: 'should save IFRS EN with correct filename', contains: 'ifrs_regulation_2023_1803_en.pdf' },
    ],
  },
  {
    group: 'EFRAG Download',
    checks: [
      { title: 'should download EFRAG Endorsement Status Report', contains: 'efrag.org' },
      { title: 'should save as efrag_endorsement_status_2025.pdf', contains: 'efrag_endorsement_status_2025.pdf' },
    ],
  },
  {
    group: 'ENISA Downloads',
    checks: [
      {
        title: 'should download ENISA from new URL pattern',
        contains: 'enisa.europa.eu/sites/default/files/publications',
      },
      { title: 'should NOT use old Plone-style URLs', excludes: '@@download/fullReport' },
    ],
  },
  {
    group: 'IFRS Metadata',
    checks: [
      { title: 'should include CELEX number 32023R1803', contains: '"celex":"32023R1803"' },
      { title: 'should tag as regulation_short EU_IFRS', contains: '"regulation_short":"EU_IFRS"' },
      { title: 'should set category to rechnungslegung', contains: '"category":"rechnungslegung"' },
      { title: 'should include endorsement note', contains: 'EU-endorsed IFRS' },
      { title: 'should set license to public_law', contains: '"license":"public_law"' },
    ],
  },
  {
    group: 'EFRAG Metadata',
    checks: [
      { title: 'should set source_id to efrag', contains: '"source_id":"efrag"' },
      { title: 'should include EFRAG attribution', contains: 'European Financial Reporting Advisory Group' },
    ],
  },
  {
    group: 'Target Collections',
    checks: [
      { title: 'should reference bp_compliance_ce', contains: 'bp_compliance_ce' },
      { title: 'should reference bp_compliance_datenschutz', contains: 'bp_compliance_datenschutz' },
    ],
  },
  {
    group: 'Verification Phase',
    checks: [
      { title: 'should have a phase_verify function', contains: 'phase_verify' },
      { title: 'should test search for IFRS', contains: 'IFRS Rechnungslegung EU endorsed' },
      { title: 'should test search for EFRAG', contains: 'EFRAG endorsement status' },
    ],
  },
  {
    group: 'Curl Configuration',
    checks: [{ title: 'should set max-time to 600 seconds', contains: '--max-time 600' }],
  },
]

describe('Ingestion Script: ingest-industry-compliance.sh', () => {
  it('should exist and be non-empty', () => {
    expect(scriptContent.length).toBeGreaterThan(0)
  })

  // Register every textual check as its own test so failures are reported individually.
  for (const { group, checks } of SCRIPT_CHECK_GROUPS) {
    describe(group, () => {
      for (const check of checks) {
        it(check.title, () => {
          if ('contains' in check) {
            expect(scriptContent).toContain(check.contains)
          } else if ('excludes' in check) {
            expect(scriptContent).not.toContain(check.excludes)
          } else {
            expect(scriptContent).toMatch(check.matches)
          }
        })
      }
    })
  }
})
|
||||
@@ -0,0 +1,100 @@
|
||||
import { describe, it, expect } from 'vitest'
|
||||
|
||||
/**
|
||||
* Tests for Compliance Advisor system prompt:
|
||||
* - COMPLIANCE_COLLECTIONS includes required collections
|
||||
* - IFRS endorsement warning is present
|
||||
* - EU-IFRS competence area is declared
|
||||
*/
|
||||
|
||||
/**
 * Local copy of COMPLIANCE_COLLECTIONS as declared in route.ts.
 * NOTE(review): this is a replica, so the assertions below check this copy,
 * not the value actually used by route.ts — keep both in sync manually.
 */
const COMPLIANCE_COLLECTIONS = [
  'bp_compliance_gesetze',
  'bp_compliance_ce',
  'bp_compliance_datenschutz',
  'bp_dsfa_corpus',
  'bp_compliance_recht',
  'bp_legal_templates',
] as const

/** Local copy of the IFRS-related competence entries of the system prompt. */
const IFRS_COMPETENCE = ['EU-IFRS (Verordnung 2023/1803)', 'EFRAG Endorsement Status']

/**
 * Local copy of the IFRS endorsement section of the system prompt:
 * one introductory line followed by five numbered rules, newline-separated.
 */
const IFRS_ENDORSEMENT_WARNING = [
  'Bei ALLEN Fragen zu IFRS/IAS-Standards MUSST du folgende Punkte beachten:',
  '1. Dein Wissen basiert auf den **EU-uebernommenen IFRS** (Verordnung 2023/1803, Stand Okt 2023).',
  '2. Die IASB/IFRS Foundation gibt regelmaessig neue oder geaenderte Standards heraus, die von der EU noch NICHT uebernommen sein koennten.',
  '3. Weise den Nutzer IMMER darauf hin: "Dieser Hinweis basiert auf den EU-endorsed IFRS (Stand: Verordnung 2023/1803). Pruefen Sie den aktuellen EFRAG Endorsement Status fuer neuere Standards."',
  '4. Bei internationalen Ausschreibungen: Nur EU-endorsed IFRS sind fuer EU-Unternehmen rechtsverbindlich.',
  '5. Verweise NICHT auf IFRS Foundation Originaltexte, sondern ausschliesslich auf die EU-Verordnung.',
].join('\n')
|
||||
|
||||
describe('Compliance Advisor System Prompt', () => {
  describe('COMPLIANCE_COLLECTIONS', () => {
    // [collection, purpose suffix used in the test title]
    const requiredCollections: ReadonlyArray<readonly [string, string]> = [
      ['bp_compliance_ce', 'IFRS/CE documents'],
      ['bp_compliance_datenschutz', 'EFRAG/ENISA'],
      ['bp_compliance_gesetze', 'laws'],
      ['bp_dsfa_corpus', 'DSFA'],
    ]

    for (const [collection, purpose] of requiredCollections) {
      it(`should include ${collection} for ${purpose}`, () => {
        expect(COMPLIANCE_COLLECTIONS).toContain(collection)
      })
    }

    it('should have exactly 6 collections', () => {
      expect(COMPLIANCE_COLLECTIONS).toHaveLength(6)
    })
  })

  describe('IFRS Competence Area', () => {
    it('should declare EU-IFRS Verordnung 2023/1803', () => {
      const [euIfrs] = IFRS_COMPETENCE
      expect(euIfrs).toContain('2023/1803')
    })

    it('should declare EFRAG Endorsement Status', () => {
      const [, efrag] = IFRS_COMPETENCE
      expect(efrag).toContain('EFRAG')
    })
  })

  describe('IFRS Endorsement Warning', () => {
    // [test title, phrase that must appear in the warning]
    const requiredPhrases: ReadonlyArray<readonly [string, string]> = [
      ['should mention Verordnung 2023/1803', 'Verordnung 2023/1803'],
      ['should warn about IASB/IFRS Foundation updates', 'IASB/IFRS Foundation'],
      ['should instruct to reference EFRAG status', 'EFRAG Endorsement Status'],
      ['should mention EU-endorsed IFRS only', 'EU-endorsed IFRS'],
      ['should warn against IFRS Foundation original texts', 'NICHT auf IFRS Foundation Originaltexte'],
      ['should mention international tenders requirement', 'internationalen Ausschreibungen'],
    ]

    for (const [title, phrase] of requiredPhrases) {
      it(title, () => {
        expect(IFRS_ENDORSEMENT_WARNING).toContain(phrase)
      })
    }

    it('should have 5 numbered points', () => {
      // Counts lines that start with "<digits>." in multi-line mode.
      const numbered = IFRS_ENDORSEMENT_WARNING.match(/^\d+\./gm)
      expect(numbered).toHaveLength(5)
    })

    it('should reference Stand Okt 2023', () => {
      expect(IFRS_ENDORSEMENT_WARNING).toContain('Stand Okt 2023')
    })
  })
})
|
||||
@@ -59,6 +59,22 @@ offiziellen Quellen und gibst praxisnahe Hinweise.
|
||||
- WP29/WP248 (Art.-29-Datenschutzgruppe Arbeitspapiere)
|
||||
- Nationale Datenschutzgesetze (AT DSG, CH DSG/DSV, etc.)
|
||||
- EU-Verordnungen (DORA, MiCA, Data Act, EHDS, PSD2, AMLR, etc.)
|
||||
- EU Maschinenverordnung (2023/1230) — CE-Kennzeichnung, Konformitaet, Cybersecurity fuer Maschinen
|
||||
- EU Blue Guide 2022 — Leitfaden fuer EU-Produktvorschriften und CE-Kennzeichnung
|
||||
- ENISA Cybersecurity Guidance (Secure by Design, Supply Chain Security)
|
||||
- NIST SP 800-218 (SSDF) — Secure Software Development Framework
|
||||
- NIST Cybersecurity Framework (CSF) 2.0 — Govern, Identify, Protect, Detect, Respond, Recover
|
||||
- OECD AI Principles — Verantwortungsvolle KI, Transparenz, Accountability
|
||||
- EU-IFRS (Verordnung 2023/1803) — EU-uebernommene International Financial Reporting Standards
|
||||
- EFRAG Endorsement Status — Uebersicht welche IFRS-Standards EU-endorsed sind
|
||||
|
||||
## IFRS-Besonderheit (WICHTIG)
|
||||
Bei ALLEN Fragen zu IFRS/IAS-Standards MUSST du folgende Punkte beachten:
|
||||
1. Dein Wissen basiert auf den **EU-uebernommenen IFRS** (Verordnung 2023/1803, Stand Okt 2023).
|
||||
2. Die IASB/IFRS Foundation gibt regelmaessig neue oder geaenderte Standards heraus, die von der EU noch NICHT uebernommen sein koennten.
|
||||
3. Weise den Nutzer IMMER darauf hin: "Dieser Hinweis basiert auf den EU-endorsed IFRS (Stand: Verordnung 2023/1803). Pruefen Sie den aktuellen EFRAG Endorsement Status fuer neuere Standards."
|
||||
4. Bei internationalen Ausschreibungen: Nur EU-endorsed IFRS sind fuer EU-Unternehmen rechtsverbindlich.
|
||||
5. Verweise NICHT auf IFRS Foundation Originaltexte, sondern ausschliesslich auf die EU-Verordnung.
|
||||
|
||||
## RAG-Nutzung
|
||||
Nutze das gesamte RAG-Corpus fuer Kontext und Quellenangaben — ausgenommen sind
|
||||
|
||||
115
docs-src/services/sdk-modules/industry-compliance-ingestion.md
Normal file
115
docs-src/services/sdk-modules/industry-compliance-ingestion.md
Normal file
@@ -0,0 +1,115 @@
|
||||
# Industry Compliance Ingestion
|
||||
|
||||
## Uebersicht
|
||||
|
||||
Das Ingestion-Skript `scripts/ingest-industry-compliance.sh` laedt oeffentlich verfuegbare Industrie-Compliance-Dokumente herunter und ingestiert sie ueber die Core RAG-API (Port 8097) in Qdrant.
|
||||
|
||||
**Ausfuehrung:** Mac Mini
|
||||
**Speicherort:** `~/rag-ingestion/`
|
||||
**RAG-API:** `https://localhost:8097/api/v1/documents/upload`
|
||||
|
||||
---
|
||||
|
||||
## Dokumente (10 PDFs)
|
||||
|
||||
| # | Dokument | Quelle | Collection | Chunks |
|
||||
|---|----------|--------|------------|--------|
|
||||
| 1 | EU Maschinenverordnung 2023/1230 | EUR-Lex | `bp_compliance_ce` | ~882 |
|
||||
| 2 | EU Blue Guide 2022 | EUR-Lex | `bp_compliance_ce` | ~1600 |
|
||||
| 3 | ENISA Advancing Software Security | enisa.europa.eu | `bp_compliance_datenschutz` | ~99 |
|
||||
| 4 | ENISA Supply Chain Threat Landscape | enisa.europa.eu | `bp_compliance_datenschutz` | ~284 |
|
||||
| 5 | NIST SP 800-218 (SSDF) | nist.gov | `bp_compliance_datenschutz` | ~242 |
|
||||
| 6 | NIST Cybersecurity Framework 2.0 | nist.gov | `bp_compliance_datenschutz` | ~162 |
|
||||
| 7 | OECD AI Principles | oecd.org | `bp_compliance_datenschutz` | ~76 |
|
||||
| 8 | EU-IFRS Verordnung 2023/1803 (DE) | EUR-Lex | `bp_compliance_ce` | ~8942 |
|
||||
| 9 | EU-IFRS Verordnung 2023/1803 (EN) | EUR-Lex | `bp_compliance_ce` | ~9000 |
|
||||
| 10 | EFRAG Endorsement Status Report | efrag.org | `bp_compliance_datenschutz` | ~48 |
|
||||
|
||||
---
|
||||
|
||||
## Ausfuehrung
|
||||
|
||||
```bash
|
||||
# Vollstaendig (Download + Upload + Verify)
|
||||
bash ~/rag-ingestion/ingest-industry-compliance.sh
|
||||
|
||||
# Nur Downloads
|
||||
bash ~/rag-ingestion/ingest-industry-compliance.sh --only download
|
||||
|
||||
# Nur CE-Collection uploaden
|
||||
bash ~/rag-ingestion/ingest-industry-compliance.sh --only ce --skip-download
|
||||
|
||||
# Nur Datenschutz-Collection uploaden
|
||||
bash ~/rag-ingestion/ingest-industry-compliance.sh --only datenschutz --skip-download
|
||||
|
||||
# Nur Verifizierung
|
||||
bash ~/rag-ingestion/ingest-industry-compliance.sh --only verify
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Phasen
|
||||
|
||||
### Phase A: Downloads
|
||||
- Laedt alle 10 PDFs nach `~/rag-ingestion/pdfs/`
|
||||
- Ueberspringt bereits vorhandene Dateien
|
||||
- User-Agent Header fuer ENISA-Kompatibilitaet
|
||||
|
||||
### Phase B: CE-Collection (`bp_compliance_ce`)
|
||||
- EU-Rechtstexte (Maschinenverordnung, Blue Guide, IFRS)
|
||||
- Metadata: CELEX-Nummer, Kategorie, Sprache
|
||||
|
||||
### Phase C: Datenschutz-Collection (`bp_compliance_datenschutz`)
|
||||
- Frameworks und Guidance (ENISA, NIST, OECD, EFRAG)
|
||||
- Metadata: Source-ID, Typ, Attribution
|
||||
|
||||
### Phase D: Verifizierung
|
||||
- Collection-Counts pruefen
|
||||
- Test-Suchen durchfuehren
|
||||
|
||||
---
|
||||
|
||||
## Chunking-Konfiguration
|
||||
|
||||
| Parameter | Wert |
|
||||
|-----------|------|
|
||||
| Strategie | `recursive` |
|
||||
| Chunk-Groesse | 512 Token |
|
||||
| Chunk-Overlap | 50 Token |
|
||||
| Embedding-Modell | BGE-M3 (1024-dim) |
|
||||
|
||||
---
|
||||
|
||||
## IFRS-Besonderheit
|
||||
|
||||
Die IFRS-Verordnung (EU) 2023/1803 ist mit ~8MB sehr gross und erzeugt ~9000 Chunks. Der Upload dauert 10-15 Minuten wegen der sequenziellen Embedding-Erzeugung.
|
||||
|
||||
**Workaround fuer Timeout:**
|
||||
```bash
|
||||
# PDF in Container kopieren und von dort uploaden
|
||||
docker cp ifrs_regulation_2023_1803_de.pdf bp-core-rag-service:/tmp/
|
||||
docker exec -d bp-core-rag-service sh -c "curl -s --max-time 1800 -X POST http://localhost:8097/api/v1/documents/upload -F file=@/tmp/ifrs_regulation_2023_1803_de.pdf -F collection=bp_compliance_ce ..."
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Compliance Advisor Integration
|
||||
|
||||
Der System-Prompt in `admin-compliance/app/api/sdk/compliance-advisor/chat/route.ts` referenziert alle ingestierten Dokumente. Bei IFRS-Fragen wird ein spezieller Endorsement-Hinweis angezeigt:
|
||||
|
||||
> Dieser Hinweis basiert auf den EU-endorsed IFRS (Stand: Verordnung 2023/1803).
|
||||
> Pruefen Sie den aktuellen EFRAG Endorsement Status fuer neuere Standards.
|
||||
|
||||
---
|
||||
|
||||
## Lizenzen
|
||||
|
||||
Alle Dokumente stehen unter oeffentlich nutzbaren Lizenzen:
|
||||
|
||||
| Quelle | Lizenz |
|
||||
|--------|--------|
|
||||
| EUR-Lex | Amtliches Werk der EU (Public Domain) |
|
||||
| ENISA | EUPL/Reuse Notice |
|
||||
| NIST | Public Domain (US Government) |
|
||||
| OECD | Reuse Notice |
|
||||
| EFRAG | Oeffentliches Dokument |
|
||||
@@ -74,6 +74,7 @@ nav:
|
||||
- Document Crawler: services/sdk-modules/document-crawler.md
|
||||
- Advisory Board: services/sdk-modules/advisory-board.md
|
||||
- DSB Portal: services/sdk-modules/dsb-portal.md
|
||||
- Industry Compliance Ingestion: services/sdk-modules/industry-compliance-ingestion.md
|
||||
- Entwicklung:
|
||||
- Testing: development/testing.md
|
||||
- Dokumentation: development/documentation.md
|
||||
|
||||
443
scripts/ingest-industry-compliance.sh
Executable file
443
scripts/ingest-industry-compliance.sh
Executable file
@@ -0,0 +1,443 @@
|
||||
#!/usr/bin/env bash
|
||||
# =============================================================================
|
||||
# BreakPilot Compliance — Industry Compliance Ingestion
|
||||
#
|
||||
# Laedt 10 freie Industrie-Compliance-Dokumente herunter und ingestiert sie
|
||||
# in Qdrant via die Core RAG-API (Port 8097).
|
||||
#
|
||||
# Dokumente:
|
||||
# 1. EU Machinery Regulation 2023/1230 → bp_compliance_ce
|
||||
# 2. EU Blue Guide 2022 → bp_compliance_ce
|
||||
# 3. ENISA Secure by Design → bp_compliance_datenschutz
|
||||
# 4. ENISA Supply Chain Security → bp_compliance_datenschutz
|
||||
# 5. NIST SP 800-218 (SSDF) → bp_compliance_datenschutz
|
||||
# 6. NIST Cybersecurity Framework 2.0 → bp_compliance_datenschutz
|
||||
# 7. OECD AI Principles → bp_compliance_datenschutz
|
||||
# 8. EU-IFRS Regulation 2023/1803 (DE) → bp_compliance_ce
|
||||
# 9. EU-IFRS Regulation 2023/1803 (EN) → bp_compliance_ce
|
||||
# 10. EFRAG Endorsement Status Report → bp_compliance_datenschutz
|
||||
#
|
||||
# Ausfuehrung auf dem Mac Mini:
|
||||
# ~/rag-ingestion/ingest-industry-compliance.sh [--skip-download] [--only PHASE]
|
||||
#
|
||||
# Phasen: download, ce, datenschutz, verify
|
||||
# =============================================================================
|
||||
set -euo pipefail
|
||||
|
||||
# --- Configuration -----------------------------------------------------------
|
||||
# Working directory that holds the downloaded PDFs (override via environment).
WORK_DIR="${WORK_DIR:-$HOME/rag-ingestion}"
# Upload endpoint of the RAG API. Overridable like WORK_DIR, e.g. when the
# script is run from a place where the service is reachable under another URL.
RAG_URL="${RAG_URL:-https://localhost:8097/api/v1/documents/upload}"
# Base URL of the Qdrant HTTP API used to read collection point counts.
QDRANT_URL="${QDRANT_URL:-http://localhost:6333}"
# -s: silent, -k: skip TLS certificate verification
# (presumably because the local endpoint uses a self-signed certificate — confirm).
CURL_OPTS="-sk --connect-timeout 10 --max-time 600"

# Counters
UPLOADED=0
FAILED=0
SKIPPED=0
|
||||
|
||||
# --- CLI Args ----------------------------------------------------------------
|
||||
# Defaults: run every phase, including downloads.
SKIP_DOWNLOAD=false
ONLY_PHASE=""

# Parse command-line options; any unknown option terminates the script.
while [[ $# -gt 0 ]]; do
    case "$1" in
        --skip-download)
            SKIP_DOWNLOAD=true
            shift
            ;;
        --only)
            # Under `set -u` a bare trailing "--only" would otherwise die with
            # an opaque "unbound variable" error when "$2" is expanded.
            if [[ $# -lt 2 ]]; then
                echo "Option --only requires a PHASE argument" >&2
                echo "Phases: download, ce, datenschutz, verify" >&2
                exit 1
            fi
            ONLY_PHASE="$2"
            shift 2
            ;;
        -h|--help)
            echo "Usage: $0 [--skip-download] [--only PHASE]"
            echo "Phases: download, ce, datenschutz, verify"
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            exit 1
            ;;
    esac
done
|
||||
|
||||
# --- Helpers -----------------------------------------------------------------
|
||||
# Timestamped output helpers. log/ok write to stdout, warn/fail to stderr.
log() {
    printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$*"
}

ok() {
    printf '[%s] ✓ %s\n' "$(date '+%H:%M:%S')" "$*"
}

warn() {
    printf '[%s] ⚠ %s\n' "$(date '+%H:%M:%S')" "$*" >&2
}

fail() {
    printf '[%s] ✗ %s\n' "$(date '+%H:%M:%S')" "$*" >&2
}
|
||||
|
||||
# upload_file FILE COLLECTION DATA_TYPE USE_CASE YEAR METADATA_JSON [LABEL]
#
# POSTs FILE as multipart form data to $RAG_URL with recursive chunking
# (chunk_size=512, chunk_overlap=50) and the given metadata.
#
# LABEL is only used in log output and defaults to the file's basename.
#
# Updates the global counters:
#   UPLOADED  response contained "chunks_count" or "vectors_indexed"
#   SKIPPED   file is smaller than 100 bytes
#   FAILED    file is missing or the response had neither field
#
# Returns 0 on a successful upload, 1 otherwise.
upload_file() {
    local file="$1"
    local collection="$2"
    local data_type="$3"
    local use_case="$4"
    local year="$5"
    local metadata_json="$6"
    local label="${7:-$(basename "$file")}"

    if [[ ! -f "$file" ]]; then
        warn "File not found: $file"
        FAILED=$((FAILED + 1))
        return 1
    fi

    # BSD/macOS stat first, GNU stat as fallback, 0 if neither works.
    local filesize
    filesize=$(stat -f%z "$file" 2>/dev/null || stat -c%s "$file" 2>/dev/null || echo 0)
    if [[ "$filesize" -lt 100 ]]; then
        warn "File too small (${filesize}B), skipping: $label"
        SKIPPED=$((SKIPPED + 1))
        return 1
    fi

    log "Uploading: $label → $collection ($(( filesize / 1024 ))KB)"

    # metadata_json goes through --form-string so curl sends it literally;
    # with -F a ';' or a leading '@'/'<' in the value would be interpreted.
    # A curl failure is tolerated here and detected via the empty response.
    local response
    response=$(curl $CURL_OPTS -X POST "$RAG_URL" \
        -F "file=@${file}" \
        -F "collection=${collection}" \
        -F "data_type=${data_type}" \
        -F "use_case=${use_case}" \
        -F "year=${year}" \
        -F "chunk_strategy=recursive" \
        -F "chunk_size=512" \
        -F "chunk_overlap=50" \
        --form-string "metadata_json=${metadata_json}" \
        2>/dev/null) || true

    # Pattern matching instead of `echo | grep -q`: with `pipefail`, grep -q
    # exiting early can make the pipeline fail on large responses (SIGPIPE).
    local field unit
    if [[ "$response" == *'"chunks_count"'* ]]; then
        field="chunks_count"
        unit="chunks"
    elif [[ "$response" == *'"vectors_indexed"'* ]]; then
        field="vectors_indexed"
        unit="vectors"
    else
        fail "Upload failed: $label"
        fail "Response: $response"
        FAILED=$((FAILED + 1))
        return 1
    fi

    # Extract the numeric value; "?" if the response is not parseable JSON.
    local count
    count=$(printf '%s' "$response" \
        | python3 -c "import sys,json; print(json.load(sys.stdin).get(sys.argv[1],0))" "$field" 2>/dev/null \
        || echo "?")
    ok "$label → $count $unit"
    UPLOADED=$((UPLOADED + 1))
}
|
||||
|
||||
# download_pdf URL TARGET
#
# Downloads URL to TARGET unless TARGET already exists.
#
# The body is written to "TARGET.part" and only moved into place after curl
# succeeded with a non-empty result. Without this, a failed or interrupted
# transfer leaves a file at TARGET which every later run would skip as
# "exists". --fail makes HTTP error responses count as failures instead of
# storing the error page as the PDF.
#
# Returns 0 if TARGET exists afterwards, 1 if the download failed.
download_pdf() {
    local url="$1"
    local target="$2"

    if [[ -f "$target" ]]; then
        log "PDF exists: $(basename "$target") (skipping)"
        return 0
    fi

    local partial="${target}.part"

    log "Downloading: $(basename "$target")"
    if curl $CURL_OPTS -L --fail -A 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)' "$url" -o "$partial" 2>/dev/null \
        && [[ -s "$partial" ]] \
        && mv -f "$partial" "$target"; then
        return 0
    fi

    # Never leave a partial or empty file behind.
    rm -f "$partial"
    warn "Download failed: $url"
    return 1
}
|
||||
|
||||
# collection_count COLLECTION
#
# Prints result.points_count of the Qdrant collection as reported by
# ${QDRANT_URL}/collections/COLLECTION, or "?" when the request or the
# JSON extraction fails.
collection_count() {
    local collection="$1"
    local endpoint="${QDRANT_URL}/collections/${collection}"

    if ! curl -s "$endpoint" 2>/dev/null \
        | python3 -c "import sys,json; print(json.load(sys.stdin)['result']['points_count'])" 2>/dev/null; then
        echo "?"
    fi
}
|
||||
|
||||
# =============================================================================
|
||||
# PHASE A: Downloads (10 PDFs)
|
||||
# =============================================================================
|
||||
# Phase A: fetch all ten source PDFs into "$WORK_DIR/pdfs".
#
# Each download is best-effort: the script runs under `set -euo pipefail`,
# so without `|| true` the first download_pdf returning 1 would abort the
# whole run and skip every remaining document. download_pdf already emits
# a warning for the failed URL.
phase_download() {
    log "=========================================="
    log "PHASE A: Downloads (10 Industry Compliance PDFs)"
    log "=========================================="

    mkdir -p "$WORK_DIR/pdfs"

    # --- A1: EUR-Lex ---
    log "--- EUR-Lex: Machinery Regulation ---"
    download_pdf \
        "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32023R1230" \
        "$WORK_DIR/pdfs/machinery_regulation_2023_1230.pdf" || true

    # --- A2: EU Blue Guide 2022 ---
    log "--- EU Blue Guide 2022 ---"
    download_pdf \
        "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:52022XC0629(04)" \
        "$WORK_DIR/pdfs/blue_guide_2022.pdf" || true

    # --- A3: ENISA Publications ---
    log "--- ENISA Publications ---"
    download_pdf \
        "https://www.enisa.europa.eu/sites/default/files/publications/ENISA%20Report%20-%20Advancing%20Software%20Security%20in%20the%20EU.pdf" \
        "$WORK_DIR/pdfs/enisa_secure_by_design.pdf" || true

    download_pdf \
        "https://www.enisa.europa.eu/sites/default/files/publications/ENISA%20Threat%20Landscape%20for%20Supply%20Chain%20Attacks.pdf" \
        "$WORK_DIR/pdfs/enisa_supply_chain_security.pdf" || true

    # --- A4: NIST Publications ---
    log "--- NIST Publications ---"
    download_pdf \
        "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-218.pdf" \
        "$WORK_DIR/pdfs/nist_sp_800_218_ssdf.pdf" || true

    download_pdf \
        "https://nvlpubs.nist.gov/nistpubs/CSWP/NIST.CSWP.29.pdf" \
        "$WORK_DIR/pdfs/nist_csf_2_0.pdf" || true

    # --- A5: OECD AI Principles ---
    log "--- OECD AI Principles ---"
    download_pdf \
        "https://legalinstruments.oecd.org/api/print?ids=648&lang=en" \
        "$WORK_DIR/pdfs/oecd_ai_principles.pdf" || true

    # --- A6: EUR-Lex IFRS (DE + EN) ---
    log "--- EUR-Lex: EU-IFRS Regulation 2023/1803 ---"
    download_pdf \
        "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32023R1803" \
        "$WORK_DIR/pdfs/ifrs_regulation_2023_1803_de.pdf" || true

    download_pdf \
        "https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32023R1803" \
        "$WORK_DIR/pdfs/ifrs_regulation_2023_1803_en.pdf" || true

    # --- A7: EFRAG Endorsement Status Report ---
    log "--- EFRAG Endorsement Status Report ---"
    download_pdf \
        "https://www.efrag.org/sites/default/files/media/document/2025-12/EFRAG%20Endorsement%20Status%20Report%2018%20December%202025.pdf" \
        "$WORK_DIR/pdfs/efrag_endorsement_status_2025.pdf" || true

    log "Downloads complete."
}
|
||||
|
||||
# =============================================================================
|
||||
# PHASE B: EU-Rechtstexte → bp_compliance_ce
|
||||
# =============================================================================
|
||||
# Phase B: upload the EU legal texts (Machinery Regulation, Blue Guide,
# EU-IFRS DE/EN) into the bp_compliance_ce collection and log the
# collection's point count before and after.
#
# Uploads are best-effort: upload_file returns 1 on any failure (after
# updating FAILED/SKIPPED), and under `set -euo pipefail` that would abort
# the script at the first failing document. `|| true` lets the remaining
# documents be processed; failures stay visible through the counters and
# the messages upload_file prints.
phase_ce() {
    log "=========================================="
    log "PHASE B: EU-Rechtstexte → bp_compliance_ce"
    log "=========================================="

    local col="bp_compliance_ce"
    local before
    before=$(collection_count "$col")
    log "Collection $col: $before chunks (before)"

    # 1. Machinery Regulation (EU) 2023/1230
    upload_file "$WORK_DIR/pdfs/machinery_regulation_2023_1230.pdf" "$col" "compliance_ce" "legal_reference" "2023" \
        '{"regulation_id":"eu_2023_1230","regulation_name_de":"Maschinenverordnung","regulation_name_en":"Machinery Regulation","regulation_short":"MACHINERY_REG","category":"maschinensicherheit","celex":"32023R1230","source":"eur-lex","license":"public_law"}' \
        "Maschinenverordnung (EU) 2023/1230" || true

    # 2. Blue Guide 2022
    upload_file "$WORK_DIR/pdfs/blue_guide_2022.pdf" "$col" "compliance_ce" "legal_reference" "2022" \
        '{"regulation_id":"eu_blue_guide_2022","regulation_name_de":"Leitfaden fuer die Umsetzung der Produktvorschriften (Blue Guide)","regulation_name_en":"Blue Guide on EU Product Rules","regulation_short":"BLUE_GUIDE","category":"produktregulierung","celex":"52022XC0629(04)","source":"eur-lex","license":"public_law"}' \
        "Blue Guide 2022 — EU-Produktvorschriften" || true

    # 8. EU-IFRS Regulation 2023/1803 (DE)
    upload_file "$WORK_DIR/pdfs/ifrs_regulation_2023_1803_de.pdf" "$col" "compliance_ce" "legal_reference" "2023" \
        '{"regulation_id":"eu_2023_1803","regulation_name_de":"IFRS-Uebernahmeverordnung","regulation_name_en":"IFRS Adoption Regulation","regulation_short":"EU_IFRS","category":"rechnungslegung","celex":"32023R1803","source":"eur-lex","license":"public_law","language":"de","endorsement_note":"Nur EU-endorsed IFRS. Neuere IASB-Standards sind moeglicherweise noch nicht uebernommen."}' \
        "EU-IFRS Regulation 2023/1803 (DE)" || true

    # 9. EU-IFRS Regulation 2023/1803 (EN)
    upload_file "$WORK_DIR/pdfs/ifrs_regulation_2023_1803_en.pdf" "$col" "compliance_ce" "legal_reference" "2023" \
        '{"regulation_id":"eu_2023_1803","regulation_name_de":"IFRS-Uebernahmeverordnung","regulation_name_en":"IFRS Adoption Regulation","regulation_short":"EU_IFRS","category":"rechnungslegung","celex":"32023R1803","source":"eur-lex","license":"public_law","language":"en","endorsement_note":"EU-endorsed IFRS only. Newer IASB standards may not yet be adopted by the EU."}' \
        "EU-IFRS Regulation 2023/1803 (EN)" || true

    local after
    after=$(collection_count "$col")
    log "Collection $col: $before → $after chunks"
}
|
||||
|
||||
# =============================================================================
|
||||
# PHASE C: Frameworks/Guidance → bp_compliance_datenschutz
|
||||
# =============================================================================
|
||||
# Phase C: upload frameworks and guidance documents (ENISA, NIST, OECD,
# EFRAG) into the bp_compliance_datenschutz collection and log the
# collection's point count before and after.
#
# Uploads are best-effort: upload_file returns 1 on any failure (after
# updating FAILED/SKIPPED), and under `set -euo pipefail` that would abort
# the script at the first failing document. `|| true` lets the remaining
# documents be processed; failures stay visible through the counters and
# the messages upload_file prints.
phase_datenschutz() {
    log "=========================================="
    log "PHASE C: Frameworks/Guidance → bp_compliance_datenschutz"
    log "=========================================="

    local col="bp_compliance_datenschutz"
    local before
    before=$(collection_count "$col")
    log "Collection $col: $before chunks (before)"

    # 3. ENISA Secure by Design
    upload_file "$WORK_DIR/pdfs/enisa_secure_by_design.pdf" "$col" "compliance_datenschutz" "guidance" "2023" \
        '{"source_id":"enisa","doc_type":"guidance","guideline_name":"Advancing Software Security in the EU","license":"reuse_notice","attribution":"European Union Agency for Cybersecurity (ENISA)","source":"enisa.europa.eu"}' \
        "ENISA: Advancing Software Security in the EU" || true

    # 4. ENISA Supply Chain Security
    upload_file "$WORK_DIR/pdfs/enisa_supply_chain_security.pdf" "$col" "compliance_datenschutz" "guidance" "2021" \
        '{"source_id":"enisa","doc_type":"guidance","guideline_name":"Threat Landscape for Supply Chain Attacks","license":"reuse_notice","attribution":"European Union Agency for Cybersecurity (ENISA)","source":"enisa.europa.eu"}' \
        "ENISA: Supply Chain Security Threat Landscape" || true

    # 5. NIST SP 800-218 (SSDF)
    upload_file "$WORK_DIR/pdfs/nist_sp_800_218_ssdf.pdf" "$col" "compliance_datenschutz" "guidance" "2022" \
        '{"source_id":"nist","doc_type":"framework","guideline_name":"Secure Software Development Framework (SSDF) SP 800-218","license":"public_domain","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \
        "NIST SP 800-218 — Secure Software Development Framework" || true

    # 6. NIST Cybersecurity Framework 2.0
    upload_file "$WORK_DIR/pdfs/nist_csf_2_0.pdf" "$col" "compliance_datenschutz" "guidance" "2024" \
        '{"source_id":"nist","doc_type":"framework","guideline_name":"NIST Cybersecurity Framework (CSF) 2.0","license":"public_domain","attribution":"National Institute of Standards and Technology (NIST)","source":"nist.gov"}' \
        "NIST Cybersecurity Framework 2.0" || true

    # 7. OECD AI Principles
    upload_file "$WORK_DIR/pdfs/oecd_ai_principles.pdf" "$col" "compliance_datenschutz" "guidance" "2024" \
        '{"source_id":"oecd","doc_type":"guidance","guideline_name":"OECD Recommendation on Artificial Intelligence (AI Principles)","license":"reuse_notice","attribution":"Organisation for Economic Co-operation and Development (OECD)","source":"oecd.org"}' \
        "OECD AI Principles (Recommendation on AI)" || true

    # 10. EFRAG Endorsement Status Report
    upload_file "$WORK_DIR/pdfs/efrag_endorsement_status_2025.pdf" "$col" "compliance_datenschutz" "guidance" "2025" \
        '{"source_id":"efrag","doc_type":"guidance","guideline_name":"EFRAG Endorsement Status Report (Dec 2025)","license":"reuse_notice","attribution":"European Financial Reporting Advisory Group (EFRAG)","source":"efrag.org"}' \
        "EFRAG Endorsement Status Report (Dec 2025)" || true

    local after
    after=$(collection_count "$col")
    log "Collection $col: $before → $after chunks"
}
|
||||
|
||||
# =============================================================================
|
||||
# PHASE D: Verifizierung
|
||||
# =============================================================================
|
||||
# Runs one smoke-test query against the search endpoint and prints a summary.
#
# Arguments:
#   $1  query text sent as the "query" field
#   $2  collection name sent as the "collection" field
#
# Output: a log line announcing the search, then the hit count and up to
# three hits (score plus the first 80 characters of the content).
# Prints " (parse error)" if the response is not the expected JSON, and
# " (search failed)" if the pipeline exits non-zero.
#
# The JSON payload is built with printf and no escaping, so both arguments
# must be free of double quotes and backslashes. All callers in this file
# pass fixed literals.
#
# NOTE(review): the endpoint is hard-coded here while the main banner logs
# $RAG_URL — confirm both refer to the same service.
_verify_search() {
  local query="$1"
  local collection="$2"
  local payload
  payload=$(printf '{"query":"%s","collection":"%s","top_k":3}' "$query" "$collection")

  log "Suche: '$query' in $collection"

  # CURL_OPTS is left unquoted exactly as in the original call sites,
  # presumably so that it word-splits into several curl options.
  # shellcheck disable=SC2086
  curl $CURL_OPTS -X POST "https://localhost:8097/api/v1/search" \
    -H 'Content-Type: application/json' \
    -d "$payload" 2>/dev/null \
    | python3 -c "
import sys,json
try:
    data = json.load(sys.stdin)
    results = data.get('results', [])
    print(f' Treffer: {len(results)}')
    for r in results[:3]:
        print(f' [{r.get(\"score\",0):.3f}] {r.get(\"content\",\"\")[:80]}...')
except Exception:
    # Empty or non-JSON response (e.g. curl could not reach the service).
    print(' (parse error)')
" 2>/dev/null || echo " (search failed)"
}

# Phase D: prints the chunk count of both compliance collections and runs a
# fixed set of sample searches so an operator can eyeball the ingestion result.
# Takes no arguments; the search output is informational only.
phase_verify() {
  log "=========================================="
  log "PHASE D: Verifizierung"
  log "=========================================="

  echo ""
  echo "=== Collection Stats ==="
  # Declared local so the loop variable does not leak into the global scope.
  local col count
  for col in bp_compliance_ce bp_compliance_datenschutz; do
    count=$(collection_count "$col")
    printf " %-30s %s chunks\n" "$col" "$count"
  done

  echo ""
  echo "=== Test-Suchen ==="

  _verify_search "Maschinenverordnung CE-Kennzeichnung" "bp_compliance_ce"
  _verify_search "Supply Chain Cybersecurity ENISA" "bp_compliance_datenschutz"
  _verify_search "NIST Cybersecurity Framework Governance" "bp_compliance_datenschutz"
  _verify_search "OECD AI Principles transparency accountability" "bp_compliance_datenschutz"
  _verify_search "IFRS Rechnungslegung EU endorsed" "bp_compliance_ce"
  _verify_search "EFRAG endorsement status IFRS 18" "bp_compliance_datenschutz"
}
|
||||
|
||||
# =============================================================================
|
||||
# MAIN
|
||||
# =============================================================================
|
||||
# Entry point. Prints the run banner, then either executes the single phase
# named in ONLY_PHASE or, when ONLY_PHASE is empty, the whole pipeline in
# order: download (unless SKIP_DOWNLOAD is set to something other than
# "false"), CE ingestion, Datenschutz ingestion, verification.
# Ends with the upload counters and exits 1 when FAILED is greater than zero.
log "============================================================"
log "BreakPilot Industry Compliance Ingestion"
log "Work dir: $WORK_DIR"
log "RAG API: $RAG_URL"
log "============================================================"

case "$ONLY_PHASE" in
  "")
    # No single phase requested: run everything.
    if [[ "$SKIP_DOWNLOAD" != "false" ]]; then
      log "(Skipping downloads)"
    else
      phase_download
    fi

    echo ""
    phase_ce

    echo ""
    phase_datenschutz

    echo ""
    phase_verify
    ;;
  download)
    phase_download
    ;;
  ce)
    phase_ce
    ;;
  datenschutz)
    phase_datenschutz
    ;;
  verify)
    phase_verify
    ;;
  *)
    # Unrecognised phase name: report it and abort before the summary.
    fail "Unknown phase: $ONLY_PHASE"
    exit 1
    ;;
esac

echo ""
log "============================================================"
log "DONE — Uploaded: $UPLOADED | Failed: $FAILED | Skipped: $SKIPPED"
log "============================================================"

# Kept as an if-statement (not a `&&` list) so the script's own exit status
# stays 0 when nothing failed.
if [[ "$FAILED" -gt 0 ]]; then
  exit 1
fi
|
||||
Reference in New Issue
Block a user