Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 42s
CI/CD / test-python-backend-compliance (push) Successful in 1m38s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
Phase 1 (LLM Quality):
- Add format=json to all Ollama payloads (obligation_extractor, control_generator, citation_backfill)
- Add Chain-of-Thought analysis steps to Pass 0a/0b system prompts

Phase 2 (Retrieval Quality):
- Hybrid search via Qdrant Query API with RRF fusion + automatic text index (legal_rag.go)
- Fallback to dense-only search if Query API unavailable
- Cross-encoder re-ranking with BGE Reranker v2 (RERANK_ENABLED=false by default)
- CPU-only PyTorch dependency to keep Docker image small

Phase 3 (Data Layer):
- Cross-regulation dedup pass (threshold 0.95) links controls across regulations
- DedupResult.link_type field distinguishes dedup_merge vs cross_regulation
- Chunk size defaults updated 512/50 → 1024/128 for new ingestions only
- Existing collections and controls are NOT affected

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
168 lines
4.9 KiB
TypeScript
168 lines
4.9 KiB
TypeScript
import { describe, it, expect } from 'vitest'
|
|
import { readFileSync } from 'fs'
|
|
import { resolve } from 'path'
|
|
|
|
/**
 * Tests for the ingestion script ingest-industry-compliance.sh.
 * Validates script structure, URLs, metadata, and configuration.
 */
|
|
|
|
/** Location of the ingestion shell script, resolved relative to this test file. */
const SCRIPT_PATH = resolve(__dirname, '../../scripts/ingest-industry-compliance.sh')

/**
 * Reads the file at `path` as UTF-8 text.
 *
 * A read failure (e.g. the file is missing) is deliberately mapped to an
 * empty string instead of throwing at import time, so the failure surfaces
 * as an ordinary failing assertion on the content length.
 *
 * @param path - File to read.
 * @returns The file's text, or `''` when it cannot be read.
 */
function loadScript(path: string): string {
  try {
    return readFileSync(path, 'utf-8')
  } catch {
    return ''
  }
}

/** Full text of the ingestion script; `''` when the script could not be read. */
const scriptContent = loadScript(SCRIPT_PATH)
|
|
|
|
// Static text checks only: every test asserts that the script source held in
// `scriptContent` contains (or lacks) a given substring or pattern. The shell
// script itself is never executed.
describe('Ingestion Script: ingest-industry-compliance.sh', () => {
  // An empty `scriptContent` means the script could not be read.
  it('should exist and be non-empty', () => {
    expect(scriptContent.length).toBeGreaterThan(0)
  })

  describe('download_pdf function', () => {
    it('should define download_pdf function', () => {
      expect(scriptContent).toContain('download_pdf()')
    })

    it('should use User-Agent header for downloads', () => {
      expect(scriptContent).toContain('Mozilla/5.0')
    })

    it('should follow redirects with -L flag', () => {
      // Matches any single line that has `curl` followed later by `-L`.
      expect(scriptContent).toMatch(/curl.*-L/)
    })

    it('should skip already downloaded files', () => {
      expect(scriptContent).toContain('-f "$target"')
    })
  })

  describe('upload_file function', () => {
    it('should define upload_file function', () => {
      expect(scriptContent).toContain('upload_file()')
    })

    it('should use recursive chunk strategy', () => {
      expect(scriptContent).toContain('chunk_strategy=recursive')
    })

    it('should use chunk_size=1024', () => {
      expect(scriptContent).toContain('chunk_size=1024')
    })

    it('should use chunk_overlap=128', () => {
      expect(scriptContent).toContain('chunk_overlap=128')
    })

    it('should validate minimum file size', () => {
      expect(scriptContent).toContain('"$filesize" -lt 100')
    })
  })

  describe('IFRS Downloads', () => {
    it('should download IFRS DE from EUR-Lex', () => {
      expect(scriptContent).toContain(
        'https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32023R1803'
      )
    })

    it('should download IFRS EN from EUR-Lex', () => {
      expect(scriptContent).toContain(
        'https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32023R1803'
      )
    })

    it('should save IFRS DE with correct filename', () => {
      expect(scriptContent).toContain('ifrs_regulation_2023_1803_de.pdf')
    })

    it('should save IFRS EN with correct filename', () => {
      expect(scriptContent).toContain('ifrs_regulation_2023_1803_en.pdf')
    })
  })

  describe('EFRAG Download', () => {
    it('should download EFRAG Endorsement Status Report', () => {
      expect(scriptContent).toContain('efrag.org')
    })

    it('should save as efrag_endorsement_status_2025.pdf', () => {
      expect(scriptContent).toContain('efrag_endorsement_status_2025.pdf')
    })
  })

  describe('ENISA Downloads', () => {
    it('should download ENISA from new URL pattern', () => {
      expect(scriptContent).toContain('enisa.europa.eu/sites/default/files/publications')
    })

    it('should NOT use old Plone-style URLs', () => {
      expect(scriptContent).not.toContain('@@download/fullReport')
    })
  })

  describe('IFRS Metadata', () => {
    it('should include CELEX number 32023R1803', () => {
      expect(scriptContent).toContain('"celex":"32023R1803"')
    })

    it('should tag as regulation_short EU_IFRS', () => {
      expect(scriptContent).toContain('"regulation_short":"EU_IFRS"')
    })

    it('should set category to rechnungslegung', () => {
      expect(scriptContent).toContain('"category":"rechnungslegung"')
    })

    it('should include endorsement note', () => {
      expect(scriptContent).toContain('EU-endorsed IFRS')
    })

    it('should set license to public_law', () => {
      expect(scriptContent).toContain('"license":"public_law"')
    })
  })

  describe('EFRAG Metadata', () => {
    it('should set source_id to efrag', () => {
      expect(scriptContent).toContain('"source_id":"efrag"')
    })

    it('should include EFRAG attribution', () => {
      expect(scriptContent).toContain('European Financial Reporting Advisory Group')
    })
  })

  describe('Target Collections', () => {
    it('should reference bp_compliance_ce', () => {
      expect(scriptContent).toContain('bp_compliance_ce')
    })

    it('should reference bp_compliance_datenschutz', () => {
      expect(scriptContent).toContain('bp_compliance_datenschutz')
    })
  })

  describe('Verification Phase', () => {
    it('should have a phase_verify function', () => {
      expect(scriptContent).toContain('phase_verify')
    })

    it('should test search for IFRS', () => {
      expect(scriptContent).toContain('IFRS Rechnungslegung EU endorsed')
    })

    it('should test search for EFRAG', () => {
      expect(scriptContent).toContain('EFRAG endorsement status')
    })
  })

  describe('Curl Configuration', () => {
    it('should set max-time to 600 seconds', () => {
      expect(scriptContent).toContain('--max-time 600')
    })
  })
})
|