feat(cra): SBOM- + DAST-Findings aus dem Scanner-MCP konsumieren

Sharangs compliance-scanner-agent exponiert SBOM (sbom_vuln_report) + DAST (list_dast_findings) als eigene MCP-Tools (nicht via list_findings). Neuer fetch_all_findings(repo_id) zieht list_findings + SBOM + DAST in EINER MCP-Session und normalisiert ins Finding-Schema: - SBOM: ein Finding pro verwundbarem Paket (nicht pro CVE), cwe=CWE-1395 -> deterministisch CRA-AI-22 (robust gegen Paketnamen wie "sqlite"). - DAST: cwe/endpoint/vuln_type uebernommen -> Mapping via cwe/keywords. assess-from-scanner nutzt fetch_all_findings + liefert source.breakdown (code/sbom/dast). DAST hat im MCP keinen repo_id-Filter -> dast_repo_scoped:false (deployment-weit, transparent geflaggt). Echte MCP-Daten: Kitchenasty 58 code + 35 sbom + 81 dast -> 174 gemappt (Coverage 94,3%, alle 35 SBOM -> CRA-AI-22). Enthaelt zusaetzlich das Qdrant->Prod-Kopierскript (#42, verbatim macmini->prod). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-18 12:05:05 +02:00
parent 8f21650d74
commit 43e02f794a
4 changed files with 298 additions and 3 deletions
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""Verbatim copy of the IACE Qdrant knowledge-base collections to another Qdrant.
+
+There is no RAG/embedding service on prod, so the normal ingest_iace_kb.sh has no
+target there. Instead we copy the already-embedded points (id + vector + payload)
+1:1 from the source Qdrant (macmini) to the destination (prod). No re-embedding,
+no re-chunking → the destination is byte-identical and /sdk/v1/rag/search reads it
+the same way. Idempotent: same point ids → upsert overwrites, no duplicates.
+
+Usage (run on macmini; reads local Qdrant, writes prod Qdrant):
+    SRC_QDRANT=http://localhost:6333 \
+    DST_QDRANT=https://qdrant-dev.breakpilot.ai \
+    DST_QDRANT_KEY=<prod-api-key> \
+    python3 copy_iace_collections_to_prod.py
+"""
+import json
+import os
+import urllib.error
+import urllib.request
+
+SRC = os.environ.get("SRC_QDRANT", "http://localhost:6333").rstrip("/")
+DST = os.environ["DST_QDRANT"].rstrip("/")
+KEY = os.environ["DST_QDRANT_KEY"]
+COLLECTIONS = os.environ.get(
+    "COLLECTIONS", "bp_iace_accident_stats,bp_iace_safety_kb,bp_iace_failure_kb"
+).split(",")
+BATCH = 128
+
+
+def _req(method, url, body=None, key=None):
+    data = json.dumps(body).encode() if body is not None else None
+    r = urllib.request.Request(url, data=data, method=method)
+    r.add_header("Content-Type", "application/json")
+    if key:
+        r.add_header("api-key", key)
+    with urllib.request.urlopen(r, timeout=120) as resp:
+        return json.loads(resp.read())
+
+
+def _exists(base, col, key=None) -> bool:
+    try:
+        _req("GET", f"{base}/collections/{col}", key=key)
+        return True
+    except urllib.error.HTTPError as e:
+        if e.code == 404:
+            return False
+        raise
+
+
+def copy_collection(col: str) -> None:
+    src_cfg = _req("GET", f"{SRC}/collections/{col}")["result"]["config"]["params"]["vectors"]
+    size, dist = src_cfg["size"], src_cfg["distance"]
+    if _exists(DST, col, KEY):
+        print(f"  {col}: dst exists — upserting into it")
+    else:
+        _req("PUT", f"{DST}/collections/{col}", {"vectors": {"size": size, "distance": dist}}, KEY)
+        print(f"  {col}: created on dst ({size}d {dist})")
+
+    offset, total = None, 0
+    while True:
+        body = {"limit": BATCH, "with_vector": True, "with_payload": True}
+        if offset is not None:
+            body["offset"] = offset
+        res = _req("POST", f"{SRC}/collections/{col}/points/scroll", body)["result"]
+        pts = res.get("points", [])
+        if not pts:
+            break
+        upsert = [{"id": p["id"], "vector": p["vector"], "payload": p.get("payload", {})} for p in pts]
+        _req("PUT", f"{DST}/collections/{col}/points?wait=true", {"points": upsert}, KEY)
+        total += len(pts)
+        offset = res.get("next_page_offset")
+        if offset is None:
+            break
+
+    src_n = _req("POST", f"{SRC}/collections/{col}/points/count", {"exact": True})["result"]["count"]
+    dst_n = _req("POST", f"{DST}/collections/{col}/points/count", {"exact": True}, KEY)["result"]["count"]
+    flag = "OK" if dst_n >= src_n else "MISMATCH"
+    print(f"  {col}: copied {total} | src={src_n} dst={dst_n} [{flag}]")
+
+
+def main() -> None:
+    print(f"Copy IACE collections {SRC} -> {DST}")
+    for col in COLLECTIONS:
+        copy_collection(col.strip())
+    print("Done.")
+
+
+if __name__ == "__main__":
+    main()
@@ -21,7 +21,7 @@ from compliance.services.cra_applicability import (
    compute_verdict, compute_machinery_verdict, maturity as evidence_maturity, MACHINE_INTEGRATOR,
 )
 from compliance.services.cra_datasheet_extractor import extract_grenzen
-from compliance.services.scanner_mcp_client import fetch_findings
+from compliance.services.scanner_mcp_client import fetch_findings, fetch_all_findings
 from compliance.services.cra_snapshot_store import save_snapshot, list_snapshots, get_snapshot
 from compliance.services.cra_use_case_controls import enrich_findings_with_breadth
 from compliance.services.cra_component_findings import findings_from_components
@@ -121,10 +121,11 @@ async def assess_from_scanner(body: ScannerPullRequest):
    scan_type/cvss_score/file_path). Returns empty assessment if no scanner is
    configured — the frontend then keeps its demo scenario.
    """
-    findings = await fetch_findings(
+    pulled = await fetch_all_findings(
        repo_id=body.repo_id, severity=body.severity,
        base_url=body.scanner_url, token=body.token,
    )
+    findings = pulled.get("findings", [])
    payload = {
        "findings": findings,
        "weights": body.weights,
@@ -136,7 +137,8 @@ async def assess_from_scanner(body: ScannerPullRequest):
        enrich_findings_with_breadth(result.get("mapped", []), db)
    finally:
        db.close()
-    result["source"] = {"scanner": True, "pulled": len(findings)}
+    result["source"] = {"scanner": True, "pulled": len(findings),
+                        "breakdown": pulled.get("breakdown", {})}
    return result


@@ -57,3 +57,130 @@ async def fetch_findings(

    texts = [c.text for c in (result.content or []) if getattr(c, "type", "") == "text"]
    return parse_findings_text(texts[0]) if texts else []
+
+
+# --- SBOM + DAST consumption (Sharang's scanner exposes these as dedicated MCP
+# tools, not via list_findings) -------------------------------------------------
+
+_SEV_BY_RANK = {4: "critical", 3: "high", 2: "medium", 1: "low"}
+_SEV_RANK = {v: k for k, v in _SEV_BY_RANK.items()}
+
+
+def normalize_sbom_report(text: str) -> list:
+    """sbom_vuln_report -> one finding per VULNERABLE PACKAGE (not per CVE — a repo
+    can have hundreds of CVEs but ~dozens of packages). scan_type='dependency' so
+    the CRA mapper routes it to dependency-monitoring (CRA-AI-22)."""
+    try:
+        data = json.loads(text)
+    except (json.JSONDecodeError, TypeError):
+        return []
+    if not isinstance(data, dict):
+        return []
+    repo_id = data.get("repo_id", "")
+    out = []
+    for pkg in data.get("packages") or []:
+        vulns = pkg.get("vulnerabilities") or []
+        if not vulns:
+            continue
+        ids, seen, best = [], set(), 0
+        for v in vulns:
+            vid = v.get("id")
+            if vid and vid not in seen:
+                seen.add(vid)
+                ids.append(vid)
+            best = max(best, _SEV_RANK.get((v.get("severity") or "").lower(), 0))
+        name, ver = pkg.get("name", ""), pkg.get("version", "")
+        pm = pkg.get("package_manager", "") or ""
+        shown = ", ".join(ids[:8]) + (" …" if len(ids) > 8 else "")
+        out.append({
+            "id": f"sbom:{repo_id}:{name}@{ver}",
+            "repo_id": repo_id,
+            "title": f"Verwundbare Abhängigkeit: {name} {ver} ({len(ids)} Schwachstelle(n))",
+            "description": f"Abhängigkeit {name} {ver} ({pm}) mit bekannten Schwachstellen: {shown}.",
+            "scan_type": "dependency",
+            # CWE-1395 (Dependency on Vulnerable Third-Party Component) → the CWE
+            # path maps deterministically to CRA-AI-22, robust against package
+            # names that happen to contain keyword tokens (e.g. "sqlite" → "sql").
+            "cwe": "CWE-1395",
+            "severity": _SEV_BY_RANK.get(best, "medium"),
+            "location": f"{pm}:{name}@{ver}" if pm else f"{name}@{ver}",
+        })
+    return out
+
+
+def normalize_dast(text: str) -> list:
+    """list_dast_findings -> findings (carry cwe + endpoint + vuln_type so the CRA
+    mapper routes them via cwe/keywords). scan_type='dast'."""
+    out = []
+    for d in parse_findings_text(text):
+        if not isinstance(d, dict):
+            continue
+        out.append({
+            "id": d.get("_id") or d.get("id") or "",
+            "repo_id": d.get("repo_id") or "",
+            "title": d.get("title", ""),
+            "description": " ".join(x for x in [d.get("vuln_type", ""), d.get("description", "")] if x),
+            "scan_type": "dast",
+            "cwe": str(d.get("cwe", "") or ""),
+            "severity": (d.get("severity") or "").lower(),
+            "location": d.get("endpoint") or d.get("target_id") or "",
+            "exploited": bool(d.get("exploitable", False)),
+        })
+    return out
+
+
+async def _open_and_call(url: str, tok: str, calls: list) -> dict:
+    """Open ONE MCP session and run [(tool, params), ...] -> {tool: text}. A tool
+    that errors yields '' (best-effort; the assessment degrades, never breaks)."""
+    from mcp.client.session import ClientSession
+    from mcp.client.streamable_http import streamablehttp_client
+
+    headers = {"Authorization": f"Bearer {tok}"} if tok else None
+    out: dict = {}
+    async with streamablehttp_client(url, headers=headers) as (read, write, _):
+        async with ClientSession(read, write) as session:
+            await session.initialize()
+            for tool, params in calls:
+                try:
+                    res = await session.call_tool(tool, params)
+                    texts = [c.text for c in (res.content or []) if getattr(c, "type", "") == "text"]
+                    out[tool] = texts[0] if texts else ""
+                except Exception:
+                    out[tool] = ""
+    return out
+
+
+async def fetch_all_findings(
+    repo_id: Optional[str] = None,
+    severity: Optional[str] = None,
+    limit: int = 200,
+    base_url: Optional[str] = None,
+    token: Optional[str] = None,
+    include_dast: bool = True,
+) -> dict:
+    """Pull list_findings + SBOM-vulns + DAST in one MCP session and return a
+    unified finding list plus a per-source breakdown. SBOM is repo-scoped
+    (sbom_vuln_report requires repo_id); DAST has no repo_id filter in the MCP, so
+    it is deployment-wide (flagged in the breakdown). Returns {} on no config."""
+    url = (base_url or SCANNER_MCP_URL).rstrip("/")
+    tok = token or SCANNER_MCP_TOKEN
+    if not url:
+        return {"findings": [], "breakdown": {}}
+
+    calls = [("list_findings", {"limit": limit, **({"repo_id": repo_id} if repo_id else {}),
+                                **({"severity": severity} if severity else {})})]
+    if repo_id:
+        calls.append(("sbom_vuln_report", {"repo_id": repo_id}))
+    if include_dast:
+        calls.append(("list_dast_findings", {"limit": limit,
+                                             **({"severity": severity} if severity else {})}))
+
+    res = await _open_and_call(url, tok, calls)
+    code = parse_findings_text(res.get("list_findings", ""))
+    sbom = normalize_sbom_report(res.get("sbom_vuln_report", "")) if repo_id else []
+    dast = normalize_dast(res.get("list_dast_findings", "")) if include_dast else []
+    return {
+        "findings": code + sbom + dast,
+        "breakdown": {"code": len(code), "sbom": len(sbom), "dast": len(dast),
+                      "dast_repo_scoped": False},
+    }
@@ -0,0 +1,77 @@
+"""SBOM/DAST normalization from the scanner MCP -> CRA finding shape + mapping.
+
+Shapes pinned from the live MCP (sbom_vuln_report / list_dast_findings, 2026-06-18).
+"""
+import json
+
+from compliance.services.scanner_mcp_client import normalize_sbom_report, normalize_dast
+from compliance.services.cra_finding_mapper import ScannerFinding, map_finding
+
+SBOM = json.dumps({
+    "repo_id": "r1",
+    "vulnerable_packages_count": 1,
+    "total_vulnerabilities": 3,
+    "packages": [
+        {"name": "dompurify", "version": "3.3.3", "package_manager": "npm",
+         "license": "MIT", "vulnerabilities": [
+             {"id": "GHSA-39q2", "source": "osv", "severity": None},
+             {"id": "GHSA-39q2", "source": "osv", "severity": None},   # dup
+             {"id": "GHSA-76mc", "source": "osv", "severity": "high"}]},
+        {"name": "clean-pkg", "version": "1.0", "package_manager": "npm",
+         "vulnerabilities": []},   # no vulns -> skipped
+    ],
+})
+
+DAST = json.dumps([
+    {"_id": {"$oid": "abc123"}, "vuln_type": "security_misconfiguration",
+     "title": "SQL backup exposure: /backup.sql", "description": "Sensitive resource accessible.",
+     "severity": "high", "cwe": "CWE-16", "endpoint": "https://demo.x/backup.sql",
+     "method": "GET", "exploitable": True},
+])
+
+
+class TestSbom:
+    def test_one_finding_per_vulnerable_package(self):
+        out = normalize_sbom_report(SBOM)
+        assert len(out) == 1                      # clean-pkg skipped
+        f = out[0]
+        assert f["scan_type"] == "dependency"
+        assert f["cwe"] == "CWE-1395"
+        assert f["location"] == "npm:dompurify@3.3.3"
+        assert f["severity"] == "high"            # escalated from the one graded vuln
+        assert "GHSA-39q2" in f["description"] and "GHSA-76mc" in f["description"]
+
+    def test_maps_to_dependency_requirement_even_with_keyword_in_name(self):
+        # CWE path dominates → CRA-AI-22, not CRA-AI-20 from a "sql"-like name
+        out = normalize_sbom_report(json.dumps({
+            "repo_id": "r", "packages": [
+                {"name": "sqlite3", "version": "5.0", "package_manager": "npm",
+                 "vulnerabilities": [{"id": "CVE-x", "severity": "medium"}]}]}))
+        m = map_finding(ScannerFinding.from_dict(out[0]))
+        assert m.primary_requirement == "CRA-AI-22"
+
+    def test_bad_json(self):
+        assert normalize_sbom_report("not json") == []
+        assert normalize_sbom_report("{}") == []
+
+
+class TestDast:
+    def test_normalizes_dast_finding(self):
+        out = normalize_dast(DAST)
+        assert len(out) == 1
+        f = out[0]
+        assert f["scan_type"] == "dast"
+        assert f["cwe"] == "CWE-16"
+        assert f["location"] == "https://demo.x/backup.sql"
+        assert f["exploited"] is True
+        assert "security_misconfiguration" in f["description"]
+
+    def test_dast_maps_via_cwe(self):
+        out = normalize_dast(DAST)
+        m = map_finding(ScannerFinding.from_dict(out[0]))
+        assert m.primary_requirement == "CRA-AI-1"   # CWE-16 -> secure config
+        assert m.finding_id == "abc123"               # _id.$oid extracted
+
+    def test_empty(self):
+        assert normalize_dast("[]") == []
+        assert normalize_dast("not json") == []