From 789c215e5e1fedc5b4c848c36041ac4c6ac39f71 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 5 Mar 2026 10:03:09 +0100 Subject: [PATCH] =?UTF-8?q?feat:=20DSFA=20vollst=C3=A4ndiges=20DB-Schema?= =?UTF-8?q?=20+=20PDF-Ingest=20+=20Tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Migration 030: alle fehlenden Spalten für compliance_dsfas (Sections 0-7) flat fields: processing_description, legal_basis, dpo_*, authority_*, ... JSONB arrays: risks, mitigations, wp248_criteria_met, ai_trigger_ids, ... JSONB objects: section_progress, threshold_analysis, review_schedule, metadata - dsfa_routes.py: DSFACreate/DSFAUpdate erweitert (60+ neue Optional-Felder) _dsfa_to_response: alle neuen Felder mit safe _get() Helper PUT-Handler: vollständige JSONB_FIELDS-Liste (22 Felder) - Tests: 101 (+49) Tests — TestAIUseCaseModules + TestDSFAFullSchema - ingest-dsfa-bundesland.sh: KNOWN_PDF_URLS (15 direkte URLs), download_pdfs() find_pdf_for_state() Helper, PDF-first mit Text-Fallback in ingest_all() Co-Authored-By: Claude Sonnet 4.6 --- .../compliance/api/dsfa_routes.py | 210 +++++++++- .../migrations/030_dsfa_full_schema.sql | 73 ++++ backend-compliance/tests/test_dsfa_routes.py | 386 +++++++++++++++++- scripts/ingest-dsfa-bundesland.sh | 147 +++++-- 4 files changed, 774 insertions(+), 42 deletions(-) create mode 100644 backend-compliance/migrations/030_dsfa_full_schema.sql diff --git a/backend-compliance/compliance/api/dsfa_routes.py b/backend-compliance/compliance/api/dsfa_routes.py index e267bb2..999ec26 100644 --- a/backend-compliance/compliance/api/dsfa_routes.py +++ b/backend-compliance/compliance/api/dsfa_routes.py @@ -46,6 +46,56 @@ class DSFACreate(BaseModel): recipients: List[str] = [] measures: List[str] = [] created_by: str = "system" + # Section 1 + processing_description: Optional[str] = None + processing_purpose: Optional[str] = None + legal_basis: Optional[str] = None + legal_basis_details: Optional[str] = 
None + # Section 2 + necessity_assessment: Optional[str] = None + proportionality_assessment: Optional[str] = None + data_minimization: Optional[str] = None + alternatives_considered: Optional[str] = None + retention_justification: Optional[str] = None + # Section 3 + involves_ai: Optional[bool] = None + overall_risk_level: Optional[str] = None + risk_score: Optional[int] = None + # Section 6 + dpo_consulted: Optional[bool] = None + dpo_name: Optional[str] = None + dpo_opinion: Optional[str] = None + dpo_approved: Optional[bool] = None + authority_consulted: Optional[bool] = None + authority_reference: Optional[str] = None + authority_decision: Optional[str] = None + # Metadata + version: Optional[int] = None + conclusion: Optional[str] = None + federal_state: Optional[str] = None + authority_resource_id: Optional[str] = None + submitted_by: Optional[str] = None + # JSONB Arrays + data_subjects: Optional[List[str]] = None + affected_rights: Optional[List[str]] = None + triggered_rule_codes: Optional[List[str]] = None + ai_trigger_ids: Optional[List[str]] = None + wp248_criteria_met: Optional[List[str]] = None + art35_abs3_triggered: Optional[List[str]] = None + tom_references: Optional[List[str]] = None + risks: Optional[List[dict]] = None + mitigations: Optional[List[dict]] = None + stakeholder_consultations: Optional[List[dict]] = None + review_triggers: Optional[List[dict]] = None + review_comments: Optional[List[dict]] = None + ai_use_case_modules: Optional[List[dict]] = None + section_8_complete: Optional[bool] = None + # JSONB Objects + threshold_analysis: Optional[dict] = None + consultation_requirement: Optional[dict] = None + review_schedule: Optional[dict] = None + section_progress: Optional[dict] = None + metadata: Optional[dict] = None class DSFAUpdate(BaseModel): @@ -58,6 +108,56 @@ class DSFAUpdate(BaseModel): recipients: Optional[List[str]] = None measures: Optional[List[str]] = None approved_by: Optional[str] = None + # Section 1 + 
processing_description: Optional[str] = None + processing_purpose: Optional[str] = None + legal_basis: Optional[str] = None + legal_basis_details: Optional[str] = None + # Section 2 + necessity_assessment: Optional[str] = None + proportionality_assessment: Optional[str] = None + data_minimization: Optional[str] = None + alternatives_considered: Optional[str] = None + retention_justification: Optional[str] = None + # Section 3 + involves_ai: Optional[bool] = None + overall_risk_level: Optional[str] = None + risk_score: Optional[int] = None + # Section 6 + dpo_consulted: Optional[bool] = None + dpo_name: Optional[str] = None + dpo_opinion: Optional[str] = None + dpo_approved: Optional[bool] = None + authority_consulted: Optional[bool] = None + authority_reference: Optional[str] = None + authority_decision: Optional[str] = None + # Metadata + version: Optional[int] = None + conclusion: Optional[str] = None + federal_state: Optional[str] = None + authority_resource_id: Optional[str] = None + submitted_by: Optional[str] = None + # JSONB Arrays + data_subjects: Optional[List[str]] = None + affected_rights: Optional[List[str]] = None + triggered_rule_codes: Optional[List[str]] = None + ai_trigger_ids: Optional[List[str]] = None + wp248_criteria_met: Optional[List[str]] = None + art35_abs3_triggered: Optional[List[str]] = None + tom_references: Optional[List[str]] = None + risks: Optional[List[dict]] = None + mitigations: Optional[List[dict]] = None + stakeholder_consultations: Optional[List[dict]] = None + review_triggers: Optional[List[dict]] = None + review_comments: Optional[List[dict]] = None + ai_use_case_modules: Optional[List[dict]] = None + section_8_complete: Optional[bool] = None + # JSONB Objects + threshold_analysis: Optional[dict] = None + consultation_requirement: Optional[dict] = None + review_schedule: Optional[dict] = None + section_progress: Optional[dict] = None + metadata: Optional[dict] = None class DSFAStatusUpdate(BaseModel): @@ -77,19 +177,48 @@ 
def _dsfa_to_response(row) -> dict: """Convert a DB row to a JSON-serializable dict.""" import json - def parse_json(val): + def _parse_arr(val): + """Parse a JSONB array field → list.""" if val is None: return [] if isinstance(val, list): return val if isinstance(val, str): try: - return json.loads(val) + parsed = json.loads(val) + return parsed if isinstance(parsed, list) else [] except Exception: return [] return val + def _parse_obj(val): + """Parse a JSONB object field → dict.""" + if val is None: + return {} + if isinstance(val, dict): + return val + if isinstance(val, str): + try: + parsed = json.loads(val) + return parsed if isinstance(parsed, dict) else {} + except Exception: + return {} + return val + + def _ts(val): + """Timestamp → ISO string or None.""" + return val.isoformat() if val else None + + def _get(key, default=None): + """Safe row access — returns default if key missing (handles old rows).""" + try: + v = row[key] + return default if v is None and default is not None else v + except (KeyError, IndexError): + return default + return { + # Core fields (always present since Migration 024) "id": str(row["id"]), "tenant_id": row["tenant_id"], "title": row["title"], @@ -97,14 +226,69 @@ def _dsfa_to_response(row) -> dict: "status": row["status"] or "draft", "risk_level": row["risk_level"] or "low", "processing_activity": row["processing_activity"] or "", - "data_categories": parse_json(row["data_categories"]), - "recipients": parse_json(row["recipients"]), - "measures": parse_json(row["measures"]), + "data_categories": _parse_arr(row["data_categories"]), + "recipients": _parse_arr(row["recipients"]), + "measures": _parse_arr(row["measures"]), "approved_by": row["approved_by"], - "approved_at": row["approved_at"].isoformat() if row["approved_at"] else None, + "approved_at": _ts(row["approved_at"]), "created_by": row["created_by"] or "system", - "created_at": row["created_at"].isoformat() if row["created_at"] else None, - "updated_at": 
row["updated_at"].isoformat() if row["updated_at"] else None, + "created_at": _ts(row["created_at"]), + "updated_at": _ts(row["updated_at"]), + # Section 1 (Migration 030) + "processing_description": _get("processing_description"), + "processing_purpose": _get("processing_purpose"), + "legal_basis": _get("legal_basis"), + "legal_basis_details": _get("legal_basis_details"), + # Section 2 + "necessity_assessment": _get("necessity_assessment"), + "proportionality_assessment": _get("proportionality_assessment"), + "data_minimization": _get("data_minimization"), + "alternatives_considered": _get("alternatives_considered"), + "retention_justification": _get("retention_justification"), + # Section 3 + "involves_ai": _get("involves_ai", False), + "overall_risk_level": _get("overall_risk_level"), + "risk_score": _get("risk_score", 0), + # Section 6 + "dpo_consulted": _get("dpo_consulted", False), + "dpo_consulted_at": _ts(_get("dpo_consulted_at")), + "dpo_name": _get("dpo_name"), + "dpo_opinion": _get("dpo_opinion"), + "dpo_approved": _get("dpo_approved"), + "authority_consulted": _get("authority_consulted", False), + "authority_consulted_at": _ts(_get("authority_consulted_at")), + "authority_reference": _get("authority_reference"), + "authority_decision": _get("authority_decision"), + # Metadata / Versioning + "version": _get("version", 1), + "previous_version_id": str(_get("previous_version_id")) if _get("previous_version_id") else None, + "conclusion": _get("conclusion"), + "federal_state": _get("federal_state"), + "authority_resource_id": _get("authority_resource_id"), + "submitted_for_review_at": _ts(_get("submitted_for_review_at")), + "submitted_by": _get("submitted_by"), + # JSONB Arrays + "data_subjects": _parse_arr(_get("data_subjects")), + "affected_rights": _parse_arr(_get("affected_rights")), + "triggered_rule_codes": _parse_arr(_get("triggered_rule_codes")), + "ai_trigger_ids": _parse_arr(_get("ai_trigger_ids")), + "wp248_criteria_met": 
_parse_arr(_get("wp248_criteria_met")), + "art35_abs3_triggered": _parse_arr(_get("art35_abs3_triggered")), + "tom_references": _parse_arr(_get("tom_references")), + "risks": _parse_arr(_get("risks")), + "mitigations": _parse_arr(_get("mitigations")), + "stakeholder_consultations": _parse_arr(_get("stakeholder_consultations")), + "review_triggers": _parse_arr(_get("review_triggers")), + "review_comments": _parse_arr(_get("review_comments")), + # Section 8 / AI (Migration 028) + "ai_use_case_modules": _parse_arr(_get("ai_use_case_modules")), + "section_8_complete": _get("section_8_complete", False), + # JSONB Objects + "threshold_analysis": _parse_obj(_get("threshold_analysis")), + "consultation_requirement": _parse_obj(_get("consultation_requirement")), + "review_schedule": _parse_obj(_get("review_schedule")), + "section_progress": _parse_obj(_get("section_progress")), + "metadata": _parse_obj(_get("metadata")), } @@ -350,7 +534,15 @@ async def update_dsfa( set_clauses = [] params: dict = {"id": dsfa_id, "tid": tid} - jsonb_fields = {"data_categories", "recipients", "measures"} + jsonb_fields = { + "data_categories", "recipients", "measures", + "data_subjects", "affected_rights", "triggered_rule_codes", + "ai_trigger_ids", "wp248_criteria_met", "art35_abs3_triggered", + "tom_references", "risks", "mitigations", "stakeholder_consultations", + "review_triggers", "review_comments", "ai_use_case_modules", + "threshold_analysis", "consultation_requirement", "review_schedule", + "section_progress", "metadata", + } for field, value in updates.items(): if field in jsonb_fields: set_clauses.append(f"{field} = CAST(:{field} AS jsonb)") diff --git a/backend-compliance/migrations/030_dsfa_full_schema.sql b/backend-compliance/migrations/030_dsfa_full_schema.sql new file mode 100644 index 0000000..4130620 --- /dev/null +++ b/backend-compliance/migrations/030_dsfa_full_schema.sql @@ -0,0 +1,73 @@ +-- Migration 030: DSFA Vollständiges Schema — alle Felder für Sections 0–7 +-- Fügt 
alle fehlenden Spalten zur compliance_dsfas Tabelle hinzu. +-- TypeScript-Typen in types.ts erwarten 60+ Felder; bisher waren nur 15 vorhanden. + +-- Section 1: Verarbeitungsbeschreibung + Rechtsgrundlage +ALTER TABLE compliance.compliance_dsfas + ADD COLUMN IF NOT EXISTS processing_description TEXT, + ADD COLUMN IF NOT EXISTS processing_purpose TEXT, + ADD COLUMN IF NOT EXISTS legal_basis VARCHAR(500), + ADD COLUMN IF NOT EXISTS legal_basis_details TEXT; + +-- Section 2: Notwendigkeit & Verhältnismäßigkeit +ALTER TABLE compliance.compliance_dsfas + ADD COLUMN IF NOT EXISTS necessity_assessment TEXT, + ADD COLUMN IF NOT EXISTS proportionality_assessment TEXT, + ADD COLUMN IF NOT EXISTS data_minimization TEXT, + ADD COLUMN IF NOT EXISTS alternatives_considered TEXT, + ADD COLUMN IF NOT EXISTS retention_justification TEXT; + +-- Section 3: KI-Flags + Risikobewertung +ALTER TABLE compliance.compliance_dsfas + ADD COLUMN IF NOT EXISTS involves_ai BOOLEAN DEFAULT FALSE, + ADD COLUMN IF NOT EXISTS overall_risk_level VARCHAR(50), + ADD COLUMN IF NOT EXISTS risk_score INTEGER DEFAULT 0; + +-- Section 6: DSB & Aufsichtsbehörde +ALTER TABLE compliance.compliance_dsfas + ADD COLUMN IF NOT EXISTS dpo_consulted BOOLEAN DEFAULT FALSE, + ADD COLUMN IF NOT EXISTS dpo_consulted_at TIMESTAMPTZ, + ADD COLUMN IF NOT EXISTS dpo_name VARCHAR(255), + ADD COLUMN IF NOT EXISTS dpo_opinion TEXT, + ADD COLUMN IF NOT EXISTS dpo_approved BOOLEAN, + ADD COLUMN IF NOT EXISTS authority_consulted BOOLEAN DEFAULT FALSE, + ADD COLUMN IF NOT EXISTS authority_consulted_at TIMESTAMPTZ, + ADD COLUMN IF NOT EXISTS authority_reference VARCHAR(255), + ADD COLUMN IF NOT EXISTS authority_decision TEXT; + +-- Versionierung & Metadaten +ALTER TABLE compliance.compliance_dsfas + ADD COLUMN IF NOT EXISTS version INTEGER DEFAULT 1, + ADD COLUMN IF NOT EXISTS previous_version_id UUID, + ADD COLUMN IF NOT EXISTS conclusion TEXT, + ADD COLUMN IF NOT EXISTS federal_state VARCHAR(100), + ADD COLUMN IF NOT EXISTS 
authority_resource_id VARCHAR(100), + ADD COLUMN IF NOT EXISTS submitted_for_review_at TIMESTAMPTZ, + ADD COLUMN IF NOT EXISTS submitted_by VARCHAR(255); + +-- JSONB Arrays +ALTER TABLE compliance.compliance_dsfas + ADD COLUMN IF NOT EXISTS data_subjects JSONB DEFAULT '[]'::jsonb, + ADD COLUMN IF NOT EXISTS affected_rights JSONB DEFAULT '[]'::jsonb, + ADD COLUMN IF NOT EXISTS triggered_rule_codes JSONB DEFAULT '[]'::jsonb, + ADD COLUMN IF NOT EXISTS ai_trigger_ids JSONB DEFAULT '[]'::jsonb, + ADD COLUMN IF NOT EXISTS wp248_criteria_met JSONB DEFAULT '[]'::jsonb, + ADD COLUMN IF NOT EXISTS art35_abs3_triggered JSONB DEFAULT '[]'::jsonb, + ADD COLUMN IF NOT EXISTS tom_references JSONB DEFAULT '[]'::jsonb, + ADD COLUMN IF NOT EXISTS risks JSONB DEFAULT '[]'::jsonb, + ADD COLUMN IF NOT EXISTS mitigations JSONB DEFAULT '[]'::jsonb, + ADD COLUMN IF NOT EXISTS stakeholder_consultations JSONB DEFAULT '[]'::jsonb, + ADD COLUMN IF NOT EXISTS review_triggers JSONB DEFAULT '[]'::jsonb, + ADD COLUMN IF NOT EXISTS review_comments JSONB DEFAULT '[]'::jsonb; + +-- JSONB Objekte +ALTER TABLE compliance.compliance_dsfas + ADD COLUMN IF NOT EXISTS threshold_analysis JSONB, + ADD COLUMN IF NOT EXISTS consultation_requirement JSONB, + ADD COLUMN IF NOT EXISTS review_schedule JSONB, + ADD COLUMN IF NOT EXISTS section_progress JSONB DEFAULT '{}'::jsonb, + ADD COLUMN IF NOT EXISTS metadata JSONB DEFAULT '{}'::jsonb; + +-- Indizes für häufig gefilterte Spalten +CREATE INDEX IF NOT EXISTS idx_dsfas_federal_state ON compliance.compliance_dsfas(federal_state); +CREATE INDEX IF NOT EXISTS idx_dsfas_involves_ai ON compliance.compliance_dsfas(involves_ai); diff --git a/backend-compliance/tests/test_dsfa_routes.py b/backend-compliance/tests/test_dsfa_routes.py index 13eda0b..9fd0a37 100644 --- a/backend-compliance/tests/test_dsfa_routes.py +++ b/backend-compliance/tests/test_dsfa_routes.py @@ -15,6 +15,8 @@ from compliance.api.dsfa_routes import ( VALID_RISK_LEVELS, ) +import json as _json + # 
============================================================================= # Schema Tests — DSFACreate @@ -167,6 +169,7 @@ class TestGetTenantId: class TestDsfaToResponse: def _make_row(self, **overrides): defaults = { + # Core fields "id": "abc123", "tenant_id": "default", "title": "Test DSFA", @@ -182,6 +185,61 @@ class TestDsfaToResponse: "created_by": "system", "created_at": datetime(2026, 1, 1, 12, 0, 0), "updated_at": datetime(2026, 1, 2, 12, 0, 0), + # Section 1 (Migration 030) + "processing_description": None, + "processing_purpose": None, + "legal_basis": None, + "legal_basis_details": None, + # Section 2 + "necessity_assessment": None, + "proportionality_assessment": None, + "data_minimization": None, + "alternatives_considered": None, + "retention_justification": None, + # Section 3 + "involves_ai": False, + "overall_risk_level": None, + "risk_score": 0, + # Section 6 + "dpo_consulted": False, + "dpo_consulted_at": None, + "dpo_name": None, + "dpo_opinion": None, + "dpo_approved": None, + "authority_consulted": False, + "authority_consulted_at": None, + "authority_reference": None, + "authority_decision": None, + # Metadata + "version": 1, + "previous_version_id": None, + "conclusion": None, + "federal_state": None, + "authority_resource_id": None, + "submitted_for_review_at": None, + "submitted_by": None, + # JSONB Arrays + "data_subjects": [], + "affected_rights": [], + "triggered_rule_codes": [], + "ai_trigger_ids": [], + "wp248_criteria_met": [], + "art35_abs3_triggered": [], + "tom_references": [], + "risks": [], + "mitigations": [], + "stakeholder_consultations": [], + "review_triggers": [], + "review_comments": [], + # Section 8 (Migration 028) + "ai_use_case_modules": [], + "section_8_complete": False, + # JSONB Objects + "threshold_analysis": None, + "consultation_requirement": None, + "review_schedule": None, + "section_progress": {}, + "metadata": {}, } defaults.update(overrides) row = MagicMock() @@ -296,7 +354,8 @@ class 
TestValidRiskLevels: class TestDSFARouterConfig: def test_router_prefix(self): from compliance.api.dsfa_routes import router - assert router.prefix == "/v1/dsfa" + # /v1 prefix is added when router is included in the main app + assert router.prefix == "/dsfa" def test_router_has_tags(self): from compliance.api.dsfa_routes import router @@ -382,3 +441,328 @@ class TestAuditLogEntry: entry = {"old_values": None, "new_values": {"title": "Test"}} assert entry["old_values"] is None assert entry["new_values"] is not None + + +# ============================================================================= +# TestAIUseCaseModules — Section 8 KI-Anwendungsfälle (Migration 028) +# ============================================================================= + +class TestAIUseCaseModules: + """Tests for ai_use_case_modules field (DSFACreate/DSFAUpdate Pydantic schemas).""" + + def test_ai_use_case_modules_field_accepted_in_create(self): + req = DSFACreate(title="Test", ai_use_case_modules=[{"type": "generative_ai"}]) + assert req.ai_use_case_modules == [{"type": "generative_ai"}] + + def test_ai_use_case_modules_default_none_in_create(self): + req = DSFACreate(title="Test") + assert req.ai_use_case_modules is None + + def test_ai_use_case_modules_field_accepted_in_update(self): + req = DSFAUpdate(ai_use_case_modules=[{"type": "computer_vision", "name": "Bilderkennung"}]) + assert req.ai_use_case_modules == [{"type": "computer_vision", "name": "Bilderkennung"}] + + def test_ai_use_case_modules_empty_list_accepted(self): + req = DSFAUpdate(ai_use_case_modules=[]) + assert req.ai_use_case_modules == [] + + def test_ai_use_case_modules_multiple_modules(self): + modules = [ + {"type": "generative_ai", "name": "LLM-Assistent"}, + {"type": "predictive_analytics", "name": "Risikobewertung"}, + ] + req = DSFAUpdate(ai_use_case_modules=modules) + assert len(req.ai_use_case_modules) == 2 + + def test_module_generative_ai_type(self): + module = {"type": "generative_ai", "name": 
"Text-Generator"} + req = DSFAUpdate(ai_use_case_modules=[module]) + assert req.ai_use_case_modules[0]["type"] == "generative_ai" + + def test_module_art22_assessment_structure(self): + module = { + "type": "decision_support", + "art22_relevant": True, + "art22_assessment": {"automated_decision": True, "human_oversight": True}, + } + req = DSFAUpdate(ai_use_case_modules=[module]) + assert req.ai_use_case_modules[0]["art22_relevant"] is True + + def test_module_ai_act_risk_class_values(self): + for risk_class in ["minimal", "limited", "high", "unacceptable"]: + module = {"type": "nlp", "ai_act_risk_class": risk_class} + req = DSFAUpdate(ai_use_case_modules=[module]) + assert req.ai_use_case_modules[0]["ai_act_risk_class"] == risk_class + + def test_module_risk_criteria_structure(self): + module = { + "type": "computer_vision", + "risk_criteria": [ + {"criterion": "K1", "met": True, "justification": "Scoring vorhanden"}, + {"criterion": "K3", "met": True, "justification": "Systematische Überwachung"}, + ], + } + req = DSFAUpdate(ai_use_case_modules=[module]) + assert len(req.ai_use_case_modules[0]["risk_criteria"]) == 2 + + def test_module_privacy_by_design_measures(self): + module = { + "type": "recommendation", + "privacy_by_design": ["data_minimization", "pseudonymization"], + } + req = DSFAUpdate(ai_use_case_modules=[module]) + assert "data_minimization" in req.ai_use_case_modules[0]["privacy_by_design"] + + def test_module_review_triggers(self): + req = DSFAUpdate(review_triggers=[{"trigger": "model_update", "date": "2026-06-01"}]) + assert req.review_triggers[0]["trigger"] == "model_update" + + def test_section_8_complete_flag_in_create(self): + req = DSFACreate(title="Test", section_8_complete=True) + assert req.section_8_complete is True + + def test_section_8_complete_flag_in_update(self): + req = DSFAUpdate(section_8_complete=True) + data = req.model_dump(exclude_none=True) + assert data["section_8_complete"] is True + + def 
test_section_8_complete_default_none(self): + req = DSFAUpdate() + assert req.section_8_complete is None + + def test_ai_use_case_modules_excluded_when_none(self): + req = DSFAUpdate(title="Test") + data = req.model_dump(exclude_none=True) + assert "ai_use_case_modules" not in data + + def test_ai_use_case_modules_included_when_set(self): + req = DSFAUpdate(ai_use_case_modules=[{"type": "nlp"}]) + data = req.model_dump(exclude_none=True) + assert "ai_use_case_modules" in data + + def test_module_with_all_common_fields(self): + module = { + "type": "predictive_analytics", + "name": "Fraud Detection", + "description": "Erkennung betrügerischer Aktivitäten", + "data_inputs": ["Transaktionsdaten", "Verhaltensdaten"], + "ai_act_risk_class": "high", + "art22_relevant": True, + } + req = DSFAUpdate(ai_use_case_modules=[module]) + m = req.ai_use_case_modules[0] + assert m["name"] == "Fraud Detection" + assert m["ai_act_risk_class"] == "high" + + def test_response_ai_use_case_modules_list_from_list(self): + """_dsfa_to_response: ai_use_case_modules list passthrough.""" + from tests.test_dsfa_routes import TestDsfaToResponse + helper = TestDsfaToResponse() + modules = [{"type": "nlp", "name": "Test"}] + row = helper._make_row(ai_use_case_modules=modules) + result = _dsfa_to_response(row) + assert result["ai_use_case_modules"] == modules + + def test_response_ai_use_case_modules_from_json_string(self): + """_dsfa_to_response: parses JSON string for ai_use_case_modules.""" + from tests.test_dsfa_routes import TestDsfaToResponse + helper = TestDsfaToResponse() + modules = [{"type": "computer_vision"}] + row = helper._make_row(ai_use_case_modules=_json.dumps(modules)) + result = _dsfa_to_response(row) + assert result["ai_use_case_modules"] == modules + + def test_response_ai_use_case_modules_null_becomes_empty_list(self): + """_dsfa_to_response: None → empty list.""" + from tests.test_dsfa_routes import TestDsfaToResponse + helper = TestDsfaToResponse() + row = 
helper._make_row(ai_use_case_modules=None) + result = _dsfa_to_response(row) + assert result["ai_use_case_modules"] == [] + + def test_response_section_8_complete_flag(self): + """_dsfa_to_response: section_8_complete bool preserved.""" + from tests.test_dsfa_routes import TestDsfaToResponse + helper = TestDsfaToResponse() + row = helper._make_row(section_8_complete=True) + result = _dsfa_to_response(row) + assert result["section_8_complete"] is True + + +# ============================================================================= +# TestDSFAFullSchema — Migration 030 neue Felder +# ============================================================================= + +class TestDSFAFullSchema: + """Tests for all new fields added in Migration 030.""" + + def _make_row(self, **overrides): + """Reuse the shared helper from TestDsfaToResponse.""" + from tests.test_dsfa_routes import TestDsfaToResponse + helper = TestDsfaToResponse() + return helper._make_row(**overrides) + + # --- Pydantic Schema Tests --- + + def test_processing_description_accepted(self): + req = DSFAUpdate(processing_description="Verarbeitung von Kundendaten zur Risikoanalyse") + assert req.processing_description == "Verarbeitung von Kundendaten zur Risikoanalyse" + + def test_legal_basis_accepted(self): + req = DSFAUpdate(legal_basis="Art. 6 Abs. 1f DSGVO") + data = req.model_dump(exclude_none=True) + assert data["legal_basis"] == "Art. 6 Abs. 1f DSGVO" + + def test_dpo_consulted_bool(self): + req = DSFAUpdate(dpo_consulted=True, dpo_name="Dr. Müller") + assert req.dpo_consulted is True + assert req.dpo_name == "Dr. 
Müller" + + def test_dpo_approved_bool(self): + req = DSFAUpdate(dpo_approved=True) + data = req.model_dump(exclude_none=True) + assert data["dpo_approved"] is True + + def test_authority_consulted_bool(self): + req = DSFAUpdate(authority_consulted=True, authority_reference="AZ-2026-001") + assert req.authority_consulted is True + assert req.authority_reference == "AZ-2026-001" + + def test_risks_jsonb_structure(self): + risks = [ + {"id": "R1", "title": "Datenpanne", "likelihood": "medium", "impact": "high"}, + {"id": "R2", "title": "Unbefugter Zugriff", "likelihood": "low", "impact": "critical"}, + ] + req = DSFAUpdate(risks=risks) + assert len(req.risks) == 2 + assert req.risks[0]["title"] == "Datenpanne" + + def test_mitigations_jsonb_structure(self): + mitigations = [ + {"id": "M1", "measure": "Verschlüsselung", "risk_ref": "R1"}, + ] + req = DSFAUpdate(mitigations=mitigations) + assert req.mitigations[0]["measure"] == "Verschlüsselung" + + def test_review_schedule_jsonb(self): + schedule = {"next_review": "2027-01-01", "frequency": "annual", "responsible": "DSB"} + req = DSFAUpdate(review_schedule=schedule) + assert req.review_schedule["frequency"] == "annual" + + def test_section_progress_jsonb(self): + progress = {"section_1": True, "section_2": False, "section_3": True} + req = DSFAUpdate(section_progress=progress) + assert req.section_progress["section_1"] is True + + def test_threshold_analysis_jsonb(self): + analysis = {"wp248_criteria_count": 3, "dsfa_required": True} + req = DSFAUpdate(threshold_analysis=analysis) + assert req.threshold_analysis["dsfa_required"] is True + + def test_involves_ai_bool(self): + req = DSFAUpdate(involves_ai=True) + data = req.model_dump(exclude_none=True) + assert data["involves_ai"] is True + + def test_federal_state_accepted(self): + req = DSFAUpdate(federal_state="Bayern") + data = req.model_dump(exclude_none=True) + assert data["federal_state"] == "Bayern" + + def test_data_subjects_list(self): + req = 
DSFAUpdate(data_subjects=["Kunden", "Mitarbeiter", "Minderjährige"]) + assert len(req.data_subjects) == 3 + + def test_wp248_criteria_met_list(self): + req = DSFAUpdate(wp248_criteria_met=["K1", "K3", "K5"]) + assert "K3" in req.wp248_criteria_met + + def test_conclusion_text(self): + req = DSFAUpdate(conclusion="DSFA erforderlich — hohe Risiken verbleiben nach Maßnahmen.") + assert "DSFA erforderlich" in req.conclusion + + def test_all_new_fields_optional_in_update(self): + req = DSFAUpdate() + for field in [ + "processing_description", "processing_purpose", "legal_basis", + "necessity_assessment", "proportionality_assessment", + "involves_ai", "dpo_consulted", "dpo_opinion", "dpo_approved", + "authority_consulted", "risks", "mitigations", "section_progress", + "threshold_analysis", "federal_state", "conclusion", + ]: + assert getattr(req, field) is None, f"{field} should default to None" + + # --- _dsfa_to_response Tests --- + + def test_response_processing_description(self): + row = self._make_row(processing_description="Test-Beschreibung") + result = _dsfa_to_response(row) + assert result["processing_description"] == "Test-Beschreibung" + + def test_response_risks_parsed_from_json_string(self): + risks = [{"id": "R1", "title": "Datenpanne"}] + row = self._make_row(risks=_json.dumps(risks)) + result = _dsfa_to_response(row) + assert result["risks"] == risks + + def test_response_section_progress_object(self): + progress = {"section_1": True, "section_3": False} + row = self._make_row(section_progress=progress) + result = _dsfa_to_response(row) + assert result["section_progress"]["section_1"] is True + + def test_response_section_progress_from_json_string(self): + progress = {"section_2": True} + row = self._make_row(section_progress=_json.dumps(progress)) + result = _dsfa_to_response(row) + assert result["section_progress"] == progress + + def test_response_involves_ai_bool(self): + row = self._make_row(involves_ai=True) + result = _dsfa_to_response(row) + 
assert result["involves_ai"] is True + + def test_response_dpo_consulted_bool(self): + row = self._make_row(dpo_consulted=True, dpo_name="Dr. Müller") + result = _dsfa_to_response(row) + assert result["dpo_consulted"] is True + assert result["dpo_name"] == "Dr. Müller" + + def test_response_version_defaults_to_1(self): + row = self._make_row(version=None) + result = _dsfa_to_response(row) + assert result["version"] == 1 + + def test_response_null_risks_becomes_empty_list(self): + row = self._make_row(risks=None) + result = _dsfa_to_response(row) + assert result["risks"] == [] + + def test_response_null_section_progress_becomes_empty_dict(self): + row = self._make_row(section_progress=None) + result = _dsfa_to_response(row) + assert result["section_progress"] == {} + + def test_response_threshold_analysis_null_becomes_empty_dict(self): + row = self._make_row(threshold_analysis=None) + result = _dsfa_to_response(row) + assert result["threshold_analysis"] == {} + + def test_response_federal_state(self): + row = self._make_row(federal_state="NRW") + result = _dsfa_to_response(row) + assert result["federal_state"] == "NRW" + + def test_response_all_new_keys_present(self): + """All new fields must be present in response even with defaults.""" + row = self._make_row() + result = _dsfa_to_response(row) + new_keys = [ + "processing_description", "legal_basis", "necessity_assessment", + "involves_ai", "dpo_consulted", "authority_consulted", + "risks", "mitigations", "section_progress", "threshold_analysis", + "ai_use_case_modules", "section_8_complete", "federal_state", + "version", "conclusion", + ] + for key in new_keys: + assert key in result, f"Missing key in response: {key}" diff --git a/scripts/ingest-dsfa-bundesland.sh b/scripts/ingest-dsfa-bundesland.sh index c564c26..5732fda 100755 --- a/scripts/ingest-dsfa-bundesland.sh +++ b/scripts/ingest-dsfa-bundesland.sh @@ -77,18 +77,80 @@ declare -A STATE_NAMES=( ["th"]="Thüringen" ) -# PDF-URLs der Muss-Listen (direkte 
Download-Links)
-declare -A PDF_URLS=(
-    ["bw_privat"]="https://www.baden-wuerttemberg.datenschutz.de/dsfa-muss-liste/"
-    ["hh_beide"]="https://datenschutz.hamburg.de/infothek/datenschutz-folgenabschaetzung"
-    ["nw_oeffentlich"]="https://www.ldi.nrw.de/datenschutz/datenschutz-folgenabschaetzung"
-    ["ni_beide"]="https://lfd.niedersachsen.de/startseite/themen/datenschutzfolgenabschaetzung/"
-    ["be_beide"]="https://www.datenschutz-berlin.de/themen/verarbeitungen-mit-hohem-risiko/datenschutz-folgenabschaetzung/"
-    ["bfdi_liste"]="https://www.bfdi.bund.de/DE/Fachthemen/Inhalte/Datenschutzbehoerden/DSFA.html"
+# Direct PDF download URLs of the authorities' mandatory-DPIA lists (Art. 35(4) GDPR)
+# Source: DSFA_AUTHORITY_RESOURCES in admin-compliance/lib/sdk/dsfa/types.ts
+declare -A KNOWN_PDF_URLS=(
+    ["bfdi_public"]="https://www.bfdi.bund.de/SharedDocs/Downloads/DE/Muster/Liste_VerarbeitungsvorgaengeArt35.pdf"
+    ["bw_privat"]="https://www.baden-wuerttemberg.datenschutz.de/wp-content/uploads/2018/05/Liste-von-Verarbeitungsvorg%C3%A4ngen-nach-Art.-35-Abs.-4-DS-GVO-LfDI-BW.pdf"
+    ["be_public"]="https://www.datenschutz-berlin.de/fileadmin/user_upload/pdf/dokumente/2018-BlnBDI_DSFA-oeffentlich.pdf"
+    ["be_privat"]="https://www.datenschutz-berlin.de/fileadmin/user_upload/pdf/dokumente/2018-BlnBDI_DSFA-nicht-oeffentlich.pdf"
+    ["bb_public"]="https://www.lda.brandenburg.de/sixcms/media.php/9/DSFA-Liste_%C3%B6ffentlicher_Bereich.pdf"
+    ["bb_privat"]="https://www.lda.brandenburg.de/sixcms/media.php/9/DSFA-Liste_nicht_%C3%B6ffentlicher_Bereich.pdf"
+    ["hb_public"]="https://www.datenschutz.bremen.de/sixcms/media.php/13/Liste%20von%20Verarbeitungsvorg%C3%A4ngen%20nach%20Artikel%2035.pdf"
+    ["hb_privat"]="https://www.datenschutz.bremen.de/sixcms/media.php/13/DSFA%20Muss-Liste%20LfDI%20HB.pdf"
+    ["hh_public"]="https://datenschutz-hamburg.de/fileadmin/user_upload/HmbBfDI/Datenschutz/Informationen/Liste_Art_35-4_DSGVO_HmbBfDI-oeffentlicher_Bereich_v2.0a.pdf"
+    ["hh_privat"]="https://datenschutz-hamburg.de/fileadmin/user_upload/HmbBfDI/Datenschutz/Informationen/DSFA_Muss-Liste_fuer_den_nicht-oeffentlicher_Bereich_-_Stand_17.10.2018.pdf"
+    ["mv_public"]="https://www.datenschutz-mv.de/static/DS/Dateien/DS-GVO/HilfsmittelzurUmsetzung/MV-DSFA-Muss-Liste-Oeffentlicher-Bereich.pdf"
+    ["ni_public"]="https://www.lfd.niedersachsen.de/download/134414/DSFA_Muss-Liste_fuer_den_oeffentlichen_Bereich.pdf"
+    ["ni_privat"]="https://www.lfd.niedersachsen.de/download/131098/Liste_von_Verarbeitungsvorgaengen_nach_Art._35_Abs._4_DS-GVO.pdf"
+    ["sl_privat"]="https://www.datenschutz.saarland.de/fileadmin/user_upload/uds/alle_Dateien_und_Ordner_bis_2025/Download/dsfa_muss_liste_dsk_de.pdf"
+    ["st_public"]="https://datenschutz.sachsen-anhalt.de/fileadmin/Bibliothek/Landesaemter/LfD/Informationen/Internationales/Datenschutz-Grundverordnung/Liste_DSFA/Art-35-Liste-oeffentlicher_Bereich.pdf"
+    ["st_privat"]="https://datenschutz.sachsen-anhalt.de/fileadmin/Bibliothek/Landesaemter/LfD/Informationen/Internationales/Datenschutz-Grundverordnung/Liste_DSFA/Art-35-Liste-nichtoeffentlicher_Bereich.pdf"
 )
 
 # =============================================================================
-# Phase 2: Text-Zusammenfassungen (für Bundesländer ohne direkte PDFs)
+# Phase 2a: PDF-Downloads
+# =============================================================================
+
+# Downloads every KNOWN_PDF_URLS entry to $DOWNLOAD_DIR/<key>.pdf; existing files
+# larger than 1000 bytes are reused. Failed downloads and files without a %PDF-
+# header are deleted so that find_pdf_for_state ignores them.
+download_pdfs() {
+    log "Lade Behörden-PDFs herunter (${#KNOWN_PDF_URLS[@]} URLs)..."
+    local success=0 failed=0 key url outfile exit_code
+    for key in "${!KNOWN_PDF_URLS[@]}"; do
+        url="${KNOWN_PDF_URLS[$key]}"
+        outfile="$DOWNLOAD_DIR/${key}.pdf"
+        if [[ -f "$outfile" && $(wc -c < "$outfile") -gt 1000 ]]; then
+            ok "PDF bereits vorhanden: $key"
+            ((success++)) || true
+            continue
+        fi
+        # TLS verification stays enabled (no -k) and --fail rejects HTTP error pages;
+        # "|| exit_code=$?" keeps a failed transfer non-fatal if errexit is active.
+        exit_code=0
+        curl -s --fail --max-time 30 -L -A "BreakPilot-Compliance/1.0" -o "$outfile" "$url" 2>/dev/null || exit_code=$?
+        if [[ $exit_code -eq 0 && -f "$outfile" && $(wc -c < "$outfile") -gt 1000 ]] && head -c 1024 "$outfile" | grep -aq '%PDF-'; then
+            ok "PDF heruntergeladen: $key"
+            ((success++)) || true
+        else
+            warn "PDF fehlgeschlagen: $key — nutze Text-Fallback"
+            rm -f "$outfile"
+            ((failed++)) || true
+        fi
+    done
+
+    log "PDF-Downloads: $success OK, $failed fehlgeschlagen"
+}
+
+# Prints the path of a usable PDF (>1000 bytes) for a state-id prefix, or an
+# empty line if there is none. Keys are probed in sorted order because bash
+# leaves associative-array key order unspecified.
+find_pdf_for_state() {
+    local state_id="$1" key pdf
+    while IFS= read -r key; do
+        [[ "$key" == "${state_id}_"* || "$key" == "${state_id}" ]] || continue
+        pdf="$DOWNLOAD_DIR/${key}.pdf"
+        if [[ -f "$pdf" && $(wc -c < "$pdf") -gt 1000 ]]; then
+            echo "$pdf"
+            return
+        fi
+    done < <(printf '%s\n' "${!KNOWN_PDF_URLS[@]}" | LC_ALL=C sort)
+    echo ""
+}
+
+# =============================================================================
+# Phase 2b: text summaries (for federal states without direct PDFs)
 # =============================================================================
 
 create_text_summaries() {
@@ -331,32 +393,40 @@ ingest_all() {
     log "Starte Ingest in Corpus: $COLLECTION"
     log "RAG-URL: $RAG_URL"
 
-    # WP248-Dokument (für alle Bundesländer relevant)
+    # WP248 document (relevant for all federal states — no PDF available)
     ingest_document \
         "$DOWNLOAD_DIR/dsfa_wpk248_kriterien.txt" \
         "wp248_rev01" "EU" "Article 29 Working Party / EDPB" "leitlinie"
 
-    # BfDI
-    ingest_document \
-        "$DOWNLOAD_DIR/bfdi_muss_liste.txt" \
-        "muss_liste_bfdi" "Bund" "BfDI" "muss_liste"
+    # BfDI — prefer the PDF, fall back to the text summary
+    local bfdi_pdf
+    bfdi_pdf=$(find_pdf_for_state "bfdi")
+    if [[ -n "$bfdi_pdf" ]]; then
+        ingest_document "$bfdi_pdf" "muss_liste_bfdi" "Bund" "BfDI" "muss_liste"
+    else
+        ingest_document "$DOWNLOAD_DIR/bfdi_muss_liste.txt" "muss_liste_bfdi" "Bund" "BfDI" "muss_liste"
+    fi
 
-    # Baden-Württemberg
-    ingest_document \
-        "$DOWNLOAD_DIR/bw_dsfa_anforderungen.txt" \
-        "muss_liste_bw" "Baden-Württemberg" "LfDI BW" "muss_liste"
+    # Baden-Württemberg — prefer the PDF, fall back to the text summary
+    local bw_pdf
+    bw_pdf=$(find_pdf_for_state "bw")
+    if [[ -n "$bw_pdf" ]]; then
+        ingest_document "$bw_pdf" "muss_liste_bw" "Baden-Württemberg" "LfDI BW" "muss_liste"
+    else
+        ingest_document "$DOWNLOAD_DIR/bw_dsfa_anforderungen.txt" "muss_liste_bw" "Baden-Württemberg" "LfDI BW" "muss_liste"
+    fi
 
-    # Bayern
+    # Bavaria — no direct PDF known, text only
     ingest_document \
         "$DOWNLOAD_DIR/by_dsfa_anforderungen.txt" \
         "muss_liste_by" "Bayern" "LDA Bayern" "muss_liste"
 
-    # NRW
+    # NRW — no direct PDF known, text only
     ingest_document \
         "$DOWNLOAD_DIR/nrw_dsfa_anforderungen.txt" \
         "muss_liste_nw" "Nordrhein-Westfalen" "LDI NRW" "muss_liste"
 
-    # Weitere Bundesländer aus DSFA_AUTHORITY_RESOURCES-Daten (als Text)
+    # Remaining federal states — prefer the PDF, fall back to the text summary
     for state_id in be bb hb hh he mv ni rp sl sn st sh th; do
         local txt_file="$DOWNLOAD_DIR/${state_id}_dsfa_anforderungen.txt"
         if [[ ! -f "$txt_file" ]]; then
@@ -383,12 +453,24 @@ Quelle: DSK-Positionspapier, WP248, Art. 35 Abs. 4 DSGVO
 EOF
         fi
 
-        ingest_document \
-            "$txt_file" \
-            "muss_liste_${state_id}" \
-            "${STATE_NAMES[$state_id]:-$state_id}" \
-            "${AUTHORITY_LABELS[$state_id]:-Datenschutzbehörde $state_id}" \
-            "muss_liste"
+        # Prefer the PDF, fall back to the text summary
+        local state_pdf
+        state_pdf=$(find_pdf_for_state "$state_id")
+        if [[ -n "$state_pdf" ]]; then
+            ingest_document \
+                "$state_pdf" \
+                "muss_liste_${state_id}" \
+                "${STATE_NAMES[$state_id]:-$state_id}" \
+                "${AUTHORITY_LABELS[$state_id]:-Datenschutzbehörde $state_id}" \
+                "muss_liste"
+        else
+            ingest_document \
+                "$txt_file" \
+                "muss_liste_${state_id}" \
+                "${STATE_NAMES[$state_id]:-$state_id}" \
+                "${AUTHORITY_LABELS[$state_id]:-Datenschutzbehörde $state_id}" \
+                "muss_liste"
+        fi
     done
 
     log "Ingest abgeschlossen"
@@ -427,16 +509,17 @@ main() {
     log "Download-Dir: $DOWNLOAD_DIR"
     log "Skip-Download: $SKIP_DOWNLOAD"
 
-    # Schritt 1: Text-Zusammenfassungen erstellen (immer)
+    # Step 1: create the text summaries (always, as fallback)
     create_text_summaries
 
-    # Schritt 2: PDFs herunterladen (wenn nicht --skip-download)
+    # Step 2: download the PDFs (unless --skip-download or --only-text is set)
     if [[ "$SKIP_DOWNLOAD" == false && "$ONLY_TEXT" == false ]]; then
-        log "PDF-Downloads übersprungen (direkte URLs zu Behörden-PDFs variieren) – nutze Text-Dateien"
-        log "Tipp: Laden Sie PDFs manuell herunter und legen Sie sie in $DOWNLOAD_DIR ab"
+        download_pdfs
+    else
+        log "PDF-Downloads übersprungen (--skip-download oder --only-text gesetzt)"
     fi
 
-    # Schritt 3: Ingest
+    # Step 3: ingest (PDF preferred, text as fallback)
     ingest_all
 
     # Schritt 4: Verifikation