import { describe, it, expect } from 'vitest'
import { readFileSync } from 'fs'
import { resolve } from 'path'

/**
 * Static checks for the shell script `ingest-industry-compliance.sh`.
 *
 * The script is never executed here: it is read once as text and every test
 * asserts that a specific snippet (function name, URL, metadata fragment,
 * curl option, ...) is present in, or absent from, that text.
 */

/** Location of the script under test, resolved relative to this test file. */
const SCRIPT_PATH = resolve(__dirname, '../../scripts/ingest-industry-compliance.sh')

/**
 * Reads a file as UTF-8 text.
 *
 * @param path - File to read.
 * @returns The file content, or an empty string when reading throws. The
 *   fallback keeps module loading from throwing; the "should exist and be
 *   non-empty" test below then fails on the empty content.
 */
function loadScript(path: string): string {
  try {
    return readFileSync(path, 'utf-8')
  } catch {
    return ''
  }
}

/** Full text of the script, loaded once at module evaluation time. */
const scriptContent = loadScript(SCRIPT_PATH)

/** Asserts that the script text contains `needle` verbatim. */
const assertScriptHas = (needle: string): void => {
  expect(scriptContent).toContain(needle)
}

describe('Ingestion Script: ingest-industry-compliance.sh', () => {
  it('should exist and be non-empty', () => {
    expect(scriptContent.length).toBeGreaterThan(0)
  })

  describe('download_pdf function', () => {
    it('should define download_pdf function', () => {
      assertScriptHas('download_pdf()')
    })

    it('should use User-Agent header for downloads', () => {
      assertScriptHas('Mozilla/5.0')
    })

    it('should follow redirects with -L flag', () => {
      // `.` does not match newlines, so `curl` and `-L` must share one line.
      expect(scriptContent).toMatch(/curl.*-L/)
    })

    it('should skip already downloaded files', () => {
      assertScriptHas('-f "$target"')
    })
  })

  describe('upload_file function', () => {
    it('should define upload_file function', () => {
      assertScriptHas('upload_file()')
    })

    it('should use recursive chunk strategy', () => {
      assertScriptHas('chunk_strategy=recursive')
    })

    it('should use chunk_size=512', () => {
      assertScriptHas('chunk_size=512')
    })

    it('should use chunk_overlap=50', () => {
      assertScriptHas('chunk_overlap=50')
    })

    it('should validate minimum file size', () => {
      assertScriptHas('"$filesize" -lt 100')
    })
  })

  describe('IFRS Downloads', () => {
    it('should download IFRS DE from EUR-Lex', () => {
      assertScriptHas(
        'https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32023R1803'
      )
    })

    it('should download IFRS EN from EUR-Lex', () => {
      assertScriptHas(
        'https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32023R1803'
      )
    })

    it('should save IFRS DE with correct filename', () => {
      assertScriptHas('ifrs_regulation_2023_1803_de.pdf')
    })

    it('should save IFRS EN with correct filename', () => {
      assertScriptHas('ifrs_regulation_2023_1803_en.pdf')
    })
  })

  describe('EFRAG Download', () => {
    it('should download EFRAG Endorsement Status Report', () => {
      assertScriptHas('efrag.org')
    })

    it('should save as efrag_endorsement_status_2025.pdf', () => {
      assertScriptHas('efrag_endorsement_status_2025.pdf')
    })
  })

  describe('ENISA Downloads', () => {
    it('should download ENISA from new URL pattern', () => {
      assertScriptHas('enisa.europa.eu/sites/default/files/publications')
    })

    it('should NOT use old Plone-style URLs', () => {
      // Negative check: this fragment must not appear anywhere in the script.
      expect(scriptContent).not.toContain('@@download/fullReport')
    })
  })

  describe('IFRS Metadata', () => {
    // The needles are compact JSON fragments (no space after the colon), so
    // the script text has to contain them in exactly this form.
    it('should include CELEX number 32023R1803', () => {
      assertScriptHas('"celex":"32023R1803"')
    })

    it('should tag as regulation_short EU_IFRS', () => {
      assertScriptHas('"regulation_short":"EU_IFRS"')
    })

    it('should set category to rechnungslegung', () => {
      assertScriptHas('"category":"rechnungslegung"')
    })

    it('should include endorsement note', () => {
      assertScriptHas('EU-endorsed IFRS')
    })

    it('should set license to public_law', () => {
      assertScriptHas('"license":"public_law"')
    })
  })

  describe('EFRAG Metadata', () => {
    it('should set source_id to efrag', () => {
      assertScriptHas('"source_id":"efrag"')
    })

    it('should include EFRAG attribution', () => {
      assertScriptHas('European Financial Reporting Advisory Group')
    })
  })

  describe('Target Collections', () => {
    it('should reference bp_compliance_ce', () => {
      assertScriptHas('bp_compliance_ce')
    })

    it('should reference bp_compliance_datenschutz', () => {
      assertScriptHas('bp_compliance_datenschutz')
    })
  })

  describe('Verification Phase', () => {
    it('should have a phase_verify function', () => {
      assertScriptHas('phase_verify')
    })

    it('should test search for IFRS', () => {
      assertScriptHas('IFRS Rechnungslegung EU endorsed')
    })

    it('should test search for EFRAG', () => {
      assertScriptHas('EFRAG endorsement status')
    })
  })

  describe('Curl Configuration', () => {
    it('should set max-time to 600 seconds', () => {
      assertScriptHas('--max-time 600')
    })
  })
})