All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-ai-compliance (push) Successful in 33s
CI / test-python-backend-compliance (push) Successful in 26s
CI / test-python-document-crawler (push) Successful in 23s
CI / test-python-dsms-gateway (push) Successful in 18s
46 tests covering: - COMPLIANCE_COLLECTIONS validation - IFRS endorsement warning content (5 points, CELEX, EFRAG reference) - Ingestion script structure (download_pdf, upload_file functions) - IFRS/EFRAG/ENISA URLs and metadata validation - Chunking config and verification phase Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
168 lines
4.9 KiB
TypeScript
168 lines
4.9 KiB
TypeScript
import { describe, it, expect } from 'vitest'
|
|
import { readFileSync } from 'fs'
|
|
import { resolve } from 'path'
|
|
|
|
/**
|
|
* Tests for the ingestion script ingest-industry-compliance.sh
|
|
* Validates script structure, URLs, metadata, and configuration.
|
|
*/
|
|
|
|
// Location of the shell script under test, resolved from this file's directory.
const SCRIPT_PATH = resolve(__dirname, '../../scripts/ingest-industry-compliance.sh')

// Full text of the script. Starts as '' and is only overwritten when the read
// succeeds, so a missing or unreadable file leaves the empty-string fallback
// (presumably so the suite fails on assertions rather than at load time).
let scriptContent: string = ''
try {
  scriptContent = readFileSync(SCRIPT_PATH, { encoding: 'utf-8' })
} catch {
  // Read failed: keep the '' fallback assigned above.
}
|
|
|
|
/**
 * Static checks against the text of ingest-industry-compliance.sh.
 *
 * Every test inspects `scriptContent` (the script read as a string); nothing
 * is executed. `scriptContent` is '' when the script could not be read, so
 * positive `toContain` checks fail in that case, while negative checks need an
 * explicit non-empty guard to avoid passing vacuously.
 */
describe('Ingestion Script: ingest-industry-compliance.sh', () => {
  it('should exist and be non-empty', () => {
    expect(scriptContent.length).toBeGreaterThan(0)
  })

  // Download helper: definition, User-Agent string, redirect flag, skip check.
  describe('download_pdf function', () => {
    it('should define download_pdf function', () => {
      expect(scriptContent).toContain('download_pdf()')
    })

    it('should use User-Agent header for downloads', () => {
      expect(scriptContent).toContain('Mozilla/5.0')
    })

    it('should follow redirects with -L flag', () => {
      // `.` does not cross newlines, so `-L` must appear on the same line as `curl`.
      expect(scriptContent).toMatch(/curl.*-L/)
    })

    it('should skip already downloaded files', () => {
      expect(scriptContent).toContain('-f "$target"')
    })
  })

  // Upload helper: definition, chunking parameters, minimum file size check.
  describe('upload_file function', () => {
    it('should define upload_file function', () => {
      expect(scriptContent).toContain('upload_file()')
    })

    it('should use recursive chunk strategy', () => {
      expect(scriptContent).toContain('chunk_strategy=recursive')
    })

    it('should use chunk_size=512', () => {
      expect(scriptContent).toContain('chunk_size=512')
    })

    it('should use chunk_overlap=50', () => {
      expect(scriptContent).toContain('chunk_overlap=50')
    })

    it('should validate minimum file size', () => {
      expect(scriptContent).toContain('"$filesize" -lt 100')
    })
  })

  // EUR-Lex source URLs (DE and EN) and the local filenames they are saved under.
  describe('IFRS Downloads', () => {
    it('should download IFRS DE from EUR-Lex', () => {
      expect(scriptContent).toContain(
        'https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32023R1803'
      )
    })

    it('should download IFRS EN from EUR-Lex', () => {
      expect(scriptContent).toContain(
        'https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32023R1803'
      )
    })

    it('should save IFRS DE with correct filename', () => {
      expect(scriptContent).toContain('ifrs_regulation_2023_1803_de.pdf')
    })

    it('should save IFRS EN with correct filename', () => {
      expect(scriptContent).toContain('ifrs_regulation_2023_1803_en.pdf')
    })
  })

  describe('EFRAG Download', () => {
    it('should download EFRAG Endorsement Status Report', () => {
      expect(scriptContent).toContain('efrag.org')
    })

    it('should save as efrag_endorsement_status_2025.pdf', () => {
      expect(scriptContent).toContain('efrag_endorsement_status_2025.pdf')
    })
  })

  describe('ENISA Downloads', () => {
    it('should download ENISA from new URL pattern', () => {
      expect(scriptContent).toContain('enisa.europa.eu/sites/default/files/publications')
    })

    it('should NOT use old Plone-style URLs', () => {
      // A negative assertion is trivially true for '' (the value scriptContent
      // holds when the script could not be read), so require real content first.
      expect(scriptContent.length).toBeGreaterThan(0)
      expect(scriptContent).not.toContain('@@download/fullReport')
    })
  })

  // Metadata fragments are matched as exact JSON text without whitespace.
  describe('IFRS Metadata', () => {
    it('should include CELEX number 32023R1803', () => {
      expect(scriptContent).toContain('"celex":"32023R1803"')
    })

    it('should tag as regulation_short EU_IFRS', () => {
      expect(scriptContent).toContain('"regulation_short":"EU_IFRS"')
    })

    it('should set category to rechnungslegung', () => {
      expect(scriptContent).toContain('"category":"rechnungslegung"')
    })

    it('should include endorsement note', () => {
      expect(scriptContent).toContain('EU-endorsed IFRS')
    })

    it('should set license to public_law', () => {
      expect(scriptContent).toContain('"license":"public_law"')
    })
  })

  describe('EFRAG Metadata', () => {
    it('should set source_id to efrag', () => {
      expect(scriptContent).toContain('"source_id":"efrag"')
    })

    it('should include EFRAG attribution', () => {
      expect(scriptContent).toContain('European Financial Reporting Advisory Group')
    })
  })

  describe('Target Collections', () => {
    it('should reference bp_compliance_ce', () => {
      expect(scriptContent).toContain('bp_compliance_ce')
    })

    it('should reference bp_compliance_datenschutz', () => {
      expect(scriptContent).toContain('bp_compliance_datenschutz')
    })
  })

  describe('Verification Phase', () => {
    it('should have a phase_verify function', () => {
      expect(scriptContent).toContain('phase_verify')
    })

    it('should test search for IFRS', () => {
      expect(scriptContent).toContain('IFRS Rechnungslegung EU endorsed')
    })

    it('should test search for EFRAG', () => {
      expect(scriptContent).toContain('EFRAG endorsement status')
    })
  })

  describe('Curl Configuration', () => {
    it('should set max-time to 600 seconds', () => {
      expect(scriptContent).toContain('--max-time 600')
    })
  })
})
|