feat: Control Library UI, dedup migration, QA tooling, docs
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
- Control Library: parent control display, ObligationTypeBadge, GenerationStrategyBadge variants, evidence string fallback - API: expose parent_control_uuid/id/title in canonical controls - Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility - Migration 074: control_parent_links + control_dedup_reviews tables - QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup, phase5 normalize, phase74 gap fill, sync_db, run_job - Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
524
scripts/qa/benchmark_llm_controls.py
Normal file
524
scripts/qa/benchmark_llm_controls.py
Normal file
@@ -0,0 +1,524 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Phase 7.4 Benchmark: Compare gpt-oss-120b vs Claude Sonnet for Control Generation.
|
||||
|
||||
Tests 5 representative gap articles from different sources.
|
||||
Measures: quality (JSON valid, fields complete), response time, cost estimate.
|
||||
|
||||
Usage:
|
||||
python3 benchmark_llm_controls.py
|
||||
"""
|
||||
import json
|
||||
import time
|
||||
import sys
|
||||
import os
|
||||
import requests
|
||||
from pathlib import Path
|
||||
|
||||
# ── Config ──────────────────────────────────────────────────────────
LITELLM_URL = "https://llm-dev.meghsakha.com"
LITELLM_MODEL = "gpt-oss-120b"
# SECURITY FIX: the key was previously hard-coded in source control.
# Read it from the environment, same as ANTHROPIC_API_KEY below.
LITELLM_API_KEY = os.environ.get("LITELLM_API_KEY", "")

ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
ANTHROPIC_MODEL = "claude-sonnet-4-6"
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")

# Directory containing the locally mirrored law/standard PDFs.
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))

try:
    import fitz  # PyMuPDF — optional; only needed for live PDF extraction
except ImportError:
    print("PyMuPDF not available, using pre-extracted texts")
    fitz = None
|
||||
|
||||
# ── Prompts (identical to control_generator.py) ─────────────────────

# German system prompt — roughly: "You are a security-compliance expert.
# Structure the given text as a practice-oriented security control. Answer
# ONLY with valid JSON; use a JSON array for multiple controls."
SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""

# Appended to every user prompt: requests applicability metadata
# (industries, company size, scope conditions) with a fixed value vocabulary.
APPLICABILITY_PROMPT = """- applicable_industries: Liste der Branchen fuer die dieses Control relevant ist.
Verwende ["all"] wenn der Control branchenuebergreifend gilt.
Moegliche Werte: "all", "Technologie / IT", "Finanzdienstleistungen", "Gesundheitswesen",
"Produktion / Industrie", "Energie", "Telekommunikation", "Oeffentlicher Dienst"
- applicable_company_size: Ab welcher Unternehmensgroesse gilt dieses Control?
Verwende ["all"] wenn keine Groessenbeschraenkung.
Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise"
- scope_conditions: null wenn keine besonderen Bedingungen, sonst:
{"requires_any": ["signal"], "description": "Erklaerung"}"""
|
||||
|
||||
|
||||
def build_prompt(source_name: str, article_label: str, article_text: str, license_type: str) -> str:
    """Build the German user prompt asking the LLM to structure a legal article
    as a security/compliance control.

    Args:
        source_name: Human-readable source title (e.g. "DSGVO (EU) 2016/679").
        article_label: Article/section reference (e.g. "Artikel 32", "§ 26").
        article_text: Extracted article text; truncated to 3000 chars here.
        license_type: License tag shown to the model (e.g. "EU_LAW").

    Returns:
        The complete user prompt (German), including APPLICABILITY_PROMPT.
    """
    return f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.
Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}).

WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.
Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.

Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet (AUTH/CRYP/NET/DATA/LOG/ACC/SEC/INC/AI/COMP/GOV)
- category: Inhaltliche Kategorie
- target_audience: Liste der Zielgruppen
- source_article: Artikel-Referenz (z.B. "Artikel 10", "§ 42")
- source_paragraph: Absatz-Referenz (z.B. "Absatz 5")
{APPLICABILITY_PROMPT}

Text: {article_text[:3000]}
Quelle: {source_name}, {article_label}"""
|
||||
|
||||
|
||||
# ── PDF Text Extraction ─────────────────────────────────────────────
|
||||
|
||||
def extract_article_text(pdf_file: str, article_label: str, doc_type: str) -> str:
    """Extract the text of a specific article/section from a PDF.

    Supported doc_type values:
      * "eu_regulation" — numbered "Artikel N" headings
      * "de_law"        — numbered "§ N" headings
      * "nist"          — literal control-family labels
      * anything else   — generic line search for article_label

    Returns the section text truncated to 3000 chars, "" when the PDF or
    PyMuPDF is unavailable (or article_label has no number where one is
    required), or a "[... nicht im PDF gefunden]" marker when the heading
    cannot be located.
    """
    import re

    path = PDF_DIR / pdf_file
    if not path.exists() or fitz is None:
        return ""

    # Read the whole document once; ensure the handle is closed even if a
    # page raises during text extraction (previously it leaked on error).
    doc = fitz.open(str(path))
    try:
        full_text = "".join(page.get_text() + "\n" for page in doc)
    finally:
        doc.close()

    def _numbered_section(heading_tpl: str, not_found_tpl: str) -> str:
        # Shared logic for numbered headings (eu_regulation / de_law, which
        # were previously duplicated): slice from heading N to heading N+1,
        # falling back to a 5000-char window when N+1 is absent.
        num_match = re.search(r'\d+', article_label)
        if not num_match:
            return ""
        num = int(num_match.group())
        match = re.search(heading_tpl.format(num), full_text)
        if not match:
            return not_found_tpl.format(num)
        start = match.start()
        next_match = re.search(heading_tpl.format(num + 1), full_text)
        end = next_match.start() if next_match else start + 5000
        return full_text[start:end].strip()[:3000]

    if doc_type == "eu_regulation":
        return _numbered_section(r'\nArtikel\s+{}\s*\n', "[Artikel {} nicht im PDF gefunden]")

    elif doc_type == "de_law":
        return _numbered_section(r'\n§\s+{}\b', "[§ {} nicht im PDF gefunden]")

    elif doc_type == "nist":
        # NIST control families are matched literally at the start of a line.
        match = re.search(rf'(?:^|\n)\s*{re.escape(article_label)}\b', full_text)
        if not match:
            return f"[{article_label} nicht im PDF gefunden]"
        return full_text[match.start():match.start() + 3000].strip()

    else:
        # Generic fallback: first line containing the label anywhere.
        match = re.search(rf'(?:^|\n).*{re.escape(article_label)}\b', full_text)
        if not match:
            return f"[{article_label} nicht im PDF gefunden]"
        return full_text[match.start():match.start() + 3000].strip()
|
||||
|
||||
|
||||
# ── API Calls ────────────────────────────────────────────────────────
|
||||
|
||||
def call_litellm(prompt: str, system_prompt: str) -> tuple:
    """Call the LiteLLM chat-completions API.

    Returns:
        (response_text, duration_seconds, error, usage) — error is None on
        success, otherwise a short description; usage is the raw "usage"
        dict from the response ({} on any failure).
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {LITELLM_API_KEY}",
    }
    payload = {
        "model": LITELLM_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.3,
        "max_tokens": 4096,
        "stream": False,
    }

    t0 = time.time()
    try:
        resp = requests.post(
            f"{LITELLM_URL}/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=180,
        )
        duration = time.time() - t0
        if resp.status_code != 200:
            # BUG FIX: this branch previously returned a 3-tuple, which broke
            # the 4-value unpacking at the call site on any non-200 response.
            return "", duration, f"HTTP {resp.status_code}: {resp.text[:200]}", {}
        data = resp.json()
        content = data["choices"][0]["message"]["content"]
        usage = data.get("usage", {})
        return content, duration, None, usage
    except Exception as e:
        # Network errors, timeouts, malformed JSON — reported as error string.
        return "", time.time() - t0, str(e), {}
|
||||
|
||||
|
||||
def call_anthropic(prompt: str, system_prompt: str) -> tuple:
    """Call the Anthropic Messages API.

    Returns:
        (response_text, duration_seconds, error, usage) — error is None on
        success, otherwise a short description; usage is the raw "usage"
        dict from the response ({} on any failure).
    """
    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    payload = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        "system": system_prompt,
        "messages": [{"role": "user", "content": prompt}],
    }

    t0 = time.time()
    try:
        resp = requests.post(ANTHROPIC_URL, headers=headers, json=payload, timeout=180)
        duration = time.time() - t0
        if resp.status_code != 200:
            return "", duration, f"HTTP {resp.status_code}: {resp.text[:200]}", {}
        data = resp.json()
        # The Messages API returns a list of content blocks; take the first text block.
        content = data["content"][0]["text"] if data.get("content") else ""
        usage = data.get("usage", {})
        return content, duration, None, usage
    except Exception as e:
        # Network errors, timeouts, malformed JSON — reported as error string.
        return "", time.time() - t0, str(e), {}
|
||||
|
||||
|
||||
# ── Quality Assessment ───────────────────────────────────────────────

# Fields a response must contain to count as a complete control.
REQUIRED_FIELDS = [
    "title", "objective", "rationale", "requirements",
    "test_procedure", "evidence", "severity", "domain",
]

# Optional enrichment fields that add to the quality score.
BONUS_FIELDS = [
    "tags", "category", "target_audience", "source_article",
    "applicable_industries", "applicable_company_size",
]


def assess_quality(raw_text: str) -> dict:
    """Assess the quality of a control-generation response.

    Parses raw_text as JSON (tolerating markdown code fences and surrounding
    prose; a JSON array is reduced to its first element) and scores the
    result 0-100 on field completeness and depth.

    Returns:
        A metrics dict. When parsing succeeds, "json_valid" is True and
        "parsed_data" holds the parsed object; otherwise all metrics are 0.
    """
    result = {
        "json_valid": False,
        "required_fields": 0,
        "required_total": len(REQUIRED_FIELDS),
        "bonus_fields": 0,
        "bonus_total": len(BONUS_FIELDS),
        "requirements_count": 0,
        "test_procedure_count": 0,
        "evidence_count": 0,
        "title_length": 0,
        "objective_length": 0,
        "score": 0,
    }

    # Strip a surrounding markdown code fence (```json ... ```), if present.
    text = raw_text.strip()
    if text.startswith("```"):
        lines = text.split("\n")
        text = "\n".join(lines[1:-1] if lines[-1].startswith("```") else lines[1:])

    try:
        data = json.loads(text)
        if isinstance(data, list):
            data = data[0] if data else {}
    except json.JSONDecodeError:
        # Fall back to the first {...} span inside surrounding prose.
        import re
        match = re.search(r'\{[\s\S]*\}', text)
        if match:
            try:
                data = json.loads(match.group())
            except json.JSONDecodeError:
                return result
        else:
            return result

    # ROBUSTNESS FIX: valid JSON that is not an object (e.g. a bare number,
    # string, or an array of scalars) previously crashed on .get().
    if not isinstance(data, dict):
        return result

    result["json_valid"] = True

    # Required fields must be non-trivial: strings longer than 2 chars or
    # non-empty lists.
    for f in REQUIRED_FIELDS:
        val = data.get(f)
        if val and (isinstance(val, str) and len(val) > 2 or isinstance(val, list) and len(val) > 0):
            result["required_fields"] += 1

    # Bonus fields only need to be a non-empty string or list.
    for f in BONUS_FIELDS:
        val = data.get(f)
        if val and (isinstance(val, str) and len(val) > 0 or isinstance(val, list) and len(val) > 0):
            result["bonus_fields"] += 1

    # Depth metrics — list lengths, tolerating wrong types.
    reqs = data.get("requirements", [])
    result["requirements_count"] = len(reqs) if isinstance(reqs, list) else 0
    tp = data.get("test_procedure", [])
    result["test_procedure_count"] = len(tp) if isinstance(tp, list) else 0
    ev = data.get("evidence", [])
    result["evidence_count"] = len(ev) if isinstance(ev, list) else 0
    # ROBUSTNESS FIX: a JSON null (or non-string) title/objective previously
    # crashed len(); treat anything non-string as length 0.
    title = data.get("title")
    result["title_length"] = len(title) if isinstance(title, str) else 0
    objective = data.get("objective")
    result["objective_length"] = len(objective) if isinstance(objective, str) else 0

    # Score: 0-100.
    #   20 — valid JSON; 40 — required-field coverage; 15 — bonus coverage;
    #   15 — up to 5 requirements; 9 — up to 3 test steps; 1 — long objective.
    score = 0
    score += 20 if result["json_valid"] else 0
    score += (result["required_fields"] / result["required_total"]) * 40
    score += (result["bonus_fields"] / result["bonus_total"]) * 15
    score += min(result["requirements_count"], 5) * 3  # max 15 for 5+ requirements
    score += min(result["test_procedure_count"], 3) * 3  # max 9 for 3+ tests
    score += 1 if result["objective_length"] > 50 else 0
    result["score"] = round(score, 1)

    result["parsed_data"] = data
    return result
|
||||
|
||||
|
||||
# ── Test Cases ───────────────────────────────────────────────────────

# Five representative gap articles, one per source, used as benchmark inputs.
# Each entry names the source document, the article/section to extract, the
# local PDF file under PDF_DIR, the extraction strategy (doc_type, see
# extract_article_text), and the license tag passed into the prompt.
TEST_CASES = [
    {
        "source": "DSGVO (EU) 2016/679",
        "article": "Artikel 32",
        "pdf": "dsgvo_2016_679.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Sicherheit der Verarbeitung — Kernthema Datenschutz",
    },
    {
        "source": "KI-Verordnung (EU) 2024/1689",
        "article": "Artikel 9",
        "pdf": "ai_act_2024_1689.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Risikomanagement für Hochrisiko-KI",
    },
    {
        "source": "NIS2-Richtlinie (EU) 2022/2555",
        "article": "Artikel 21",
        "pdf": "nis2_2022_2555.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Cybersicherheitsrisikomanagement — NIS2 Kernpflicht",
    },
    {
        "source": "Cyber Resilience Act (CRA)",
        "article": "Artikel 13",
        "pdf": "cra_2024_2847.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Pflichten der Hersteller",
    },
    {
        "source": "Bundesdatenschutzgesetz (BDSG)",
        "article": "§ 26",
        "pdf": "bdsg.pdf",
        "doc_type": "de_law",
        "license": "DE_LAW",
        "description": "Datenverarbeitung im Beschäftigungskontext",
    },
]
|
||||
|
||||
|
||||
# ── Main ─────────────────────────────────────────────────────────────

def main():
    """Run the benchmark over TEST_CASES and print a comparison summary.

    For each test case: extracts the article text from its PDF, sends the
    same prompt to both models, scores each response with assess_quality,
    prints a per-case comparison, and finally prints aggregate metrics plus
    a cost/time extrapolation. Full results are written to /tmp as JSON.
    """
    if not ANTHROPIC_API_KEY:
        print("ERROR: Set ANTHROPIC_API_KEY environment variable")
        sys.exit(1)

    print("=" * 80)
    print("LLM BENCHMARK: gpt-oss-120b vs Claude Sonnet 4.6")
    print("=" * 80)
    print(f" LiteLLM: {LITELLM_URL} / {LITELLM_MODEL}")
    print(f" Anthropic: {ANTHROPIC_MODEL}")
    print(f" Tests: {len(TEST_CASES)}")
    print()

    # Pre-check LiteLLM reachability before spending time on extraction.
    try:
        r = requests.get(f"{LITELLM_URL}/v1/models",
                         headers={"Authorization": f"Bearer {LITELLM_API_KEY}"}, timeout=10)
        print(f" LiteLLM OK: {r.status_code}")
    except Exception as e:
        print(f" LiteLLM ERROR: {e}")
        sys.exit(1)

    results = []

    for i, tc in enumerate(TEST_CASES):
        print(f"\n{'='*80}")
        print(f"TEST {i+1}/{len(TEST_CASES)}: {tc['source']} — {tc['article']}")
        print(f" {tc['description']}")
        print(f"{'='*80}")

        # Extract article text from PDF; a leading "[" marks the
        # "not found in PDF" sentinel from extract_article_text.
        article_text = extract_article_text(tc["pdf"], tc["article"], tc["doc_type"])
        if not article_text or article_text.startswith("["):
            print(f" WARNING: {article_text or 'Empty text'}")
            continue

        print(f" Text extracted: {len(article_text)} chars")
        print(f" First 120 chars: {article_text[:120].replace(chr(10), ' ')}...")

        prompt = build_prompt(tc["source"], tc["article"], article_text, tc["license"])

        # ── Call LiteLLM ──
        print(f"\n --- gpt-oss-120b ---")
        litellm_raw, litellm_time, litellm_err, litellm_usage = call_litellm(prompt, SYSTEM_PROMPT)
        if litellm_err:
            print(f" ERROR: {litellm_err}")
            # Minimal placeholder so the comparison/summary code below works.
            litellm_quality = {"json_valid": False, "score": 0}
        else:
            print(f" Time: {litellm_time:.1f}s")
            print(f" Tokens: {litellm_usage}")
            litellm_quality = assess_quality(litellm_raw)
            print(f" JSON valid: {litellm_quality['json_valid']}")
            print(f" Score: {litellm_quality['score']}/100")
            print(f" Required fields: {litellm_quality['required_fields']}/{litellm_quality['required_total']}")
            print(f" Requirements: {litellm_quality['requirements_count']}, "
                  f"Tests: {litellm_quality['test_procedure_count']}, "
                  f"Evidence: {litellm_quality['evidence_count']}")
            if litellm_quality.get("parsed_data"):
                d = litellm_quality["parsed_data"]
                print(f" Title: {d.get('title', 'N/A')}")

        # ── Call Anthropic ──
        print(f"\n --- Claude Sonnet 4.6 ---")
        anthropic_raw, anthropic_time, anthropic_err, anthropic_usage = call_anthropic(prompt, SYSTEM_PROMPT)
        if anthropic_err:
            print(f" ERROR: {anthropic_err}")
            anthropic_quality = {"json_valid": False, "score": 0}
        else:
            print(f" Time: {anthropic_time:.1f}s")
            print(f" Tokens: {anthropic_usage}")
            anthropic_quality = assess_quality(anthropic_raw)
            print(f" JSON valid: {anthropic_quality['json_valid']}")
            print(f" Score: {anthropic_quality['score']}/100")
            print(f" Required fields: {anthropic_quality['required_fields']}/{anthropic_quality['required_total']}")
            print(f" Requirements: {anthropic_quality['requirements_count']}, "
                  f"Tests: {anthropic_quality['test_procedure_count']}, "
                  f"Evidence: {anthropic_quality['evidence_count']}")
            if anthropic_quality.get("parsed_data"):
                d = anthropic_quality["parsed_data"]
                print(f" Title: {d.get('title', 'N/A')}")

        # Compare the two models on this test case (speed ratio and score).
        print(f"\n --- VERGLEICH ---")
        speed_ratio = litellm_time / anthropic_time if anthropic_time > 0 else 0
        print(f" Speed: 120b {litellm_time:.1f}s vs Sonnet {anthropic_time:.1f}s "
              f"({'120b ' + str(round(speed_ratio, 1)) + 'x langsamer' if speed_ratio > 1 else '120b schneller'})")
        print(f" Score: 120b {litellm_quality.get('score', 0)}/100 vs "
              f"Sonnet {anthropic_quality.get('score', 0)}/100")

        results.append({
            "test": f"{tc['source']} — {tc['article']}",
            "litellm": {
                "time": round(litellm_time, 1),
                "score": litellm_quality.get("score", 0),
                "json_valid": litellm_quality.get("json_valid", False),
                "requirements": litellm_quality.get("requirements_count", 0),
                "tests": litellm_quality.get("test_procedure_count", 0),
                "usage": litellm_usage,
                "raw": litellm_raw[:500] if litellm_raw else "",
            },
            "anthropic": {
                "time": round(anthropic_time, 1),
                "score": anthropic_quality.get("score", 0),
                "json_valid": anthropic_quality.get("json_valid", False),
                "requirements": anthropic_quality.get("requirements_count", 0),
                "tests": anthropic_quality.get("test_procedure_count", 0),
                "usage": anthropic_usage,
                "raw": anthropic_raw[:500] if anthropic_raw else "",
            },
        })

    # ── Summary ──────────────────────────────────────────────────────
    print(f"\n\n{'='*80}")
    print("ZUSAMMENFASSUNG")
    print(f"{'='*80}")

    if not results:
        print(" Keine Ergebnisse.")
        return

    litellm_scores = [r["litellm"]["score"] for r in results]
    anthropic_scores = [r["anthropic"]["score"] for r in results]
    litellm_times = [r["litellm"]["time"] for r in results]
    anthropic_times = [r["anthropic"]["time"] for r in results]

    print(f"\n {'Metrik':<30s} {'gpt-oss-120b':>15s} {'Claude Sonnet':>15s}")
    print(f" {'-'*30} {'-'*15} {'-'*15}")
    print(f" {'Avg Score (0-100)':<30s} {sum(litellm_scores)/len(litellm_scores):>13.1f} "
          f"{sum(anthropic_scores)/len(anthropic_scores):>13.1f}")
    print(f" {'Avg Time (s)':<30s} {sum(litellm_times)/len(litellm_times):>13.1f} "
          f"{sum(anthropic_times)/len(anthropic_times):>13.1f}")
    print(f" {'JSON Valid':<30s} {sum(1 for r in results if r['litellm']['json_valid']):>12d}/{len(results)} "
          f"{sum(1 for r in results if r['anthropic']['json_valid']):>12d}/{len(results)}")
    print(f" {'Avg Requirements':<30s} "
          f"{sum(r['litellm']['requirements'] for r in results)/len(results):>13.1f} "
          f"{sum(r['anthropic']['requirements'] for r in results)/len(results):>13.1f}")
    print(f" {'Avg Test Procedures':<30s} "
          f"{sum(r['litellm']['tests'] for r in results)/len(results):>13.1f} "
          f"{sum(r['anthropic']['tests'] for r in results)/len(results):>13.1f}")

    # Cost estimate
    # Claude Sonnet: ~$3/M input, ~$15/M output
    # gpt-oss-120b: self-hosted = $0 API cost (only compute)
    total_anthropic_input = sum(r["anthropic"]["usage"].get("input_tokens", 0) for r in results)
    total_anthropic_output = sum(r["anthropic"]["usage"].get("output_tokens", 0) for r in results)
    anthropic_cost = (total_anthropic_input * 3 + total_anthropic_output * 15) / 1_000_000

    print(f"\n Kostenvergleich (fuer {len(results)} Controls):")
    print(f" gpt-oss-120b: $0.00 (self-hosted)")
    print(f" Claude Sonnet: ${anthropic_cost:.4f} "
          f"({total_anthropic_input} input + {total_anthropic_output} output tokens)")

    # Extrapolate cost/time to the full backlog of 494 gap articles.
    if results:
        cost_per_control = anthropic_cost / len(results)
        print(f"\n Hochrechnung fuer 494 Luecken-Artikel:")
        print(f" gpt-oss-120b: $0.00")
        print(f" Claude Sonnet: ${cost_per_control * 494:.2f}")
        avg_time_120b = sum(litellm_times) / len(litellm_times)
        avg_time_sonnet = sum(anthropic_times) / len(anthropic_times)
        print(f" Zeit 120b: {avg_time_120b * 494 / 60:.0f} min ({avg_time_120b * 494 / 3600:.1f}h)")
        print(f" Zeit Sonnet: {avg_time_sonnet * 494 / 60:.0f} min ({avg_time_sonnet * 494 / 3600:.1f}h)")

    # Save full per-case results (including raw response snippets) as JSON.
    out_path = "/tmp/benchmark_llm_results.json"
    with open(out_path, 'w') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\n Detaillierte Ergebnisse: {out_path}")
|
||||
|
||||
|
||||
# Script entry point.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user