diff --git a/ai-compliance-sdk/scripts/copy_iace_collections_to_prod.py b/ai-compliance-sdk/scripts/copy_iace_collections_to_prod.py new file mode 100644 index 00000000..25f21d37 --- /dev/null +++ b/ai-compliance-sdk/scripts/copy_iace_collections_to_prod.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +"""Verbatim copy of the IACE Qdrant knowledge-base collections to another Qdrant. + +There is no RAG/embedding service on prod, so the normal ingest_iace_kb.sh has no +target there. Instead we copy the already-embedded points (id + vector + payload) +1:1 from the source Qdrant (macmini) to the destination (prod). No re-embedding, +no re-chunking → the destination is byte-identical and /sdk/v1/rag/search reads it +the same way. Idempotent: same point ids → upsert overwrites, no duplicates. + +Usage (run on macmini; reads local Qdrant, writes prod Qdrant): + SRC_QDRANT=http://localhost:6333 \ + DST_QDRANT=https://qdrant-dev.breakpilot.ai \ + DST_QDRANT_KEY= \ + python3 copy_iace_collections_to_prod.py +""" +import json +import os +import urllib.error +import urllib.request + +SRC = os.environ.get("SRC_QDRANT", "http://localhost:6333").rstrip("/") +DST = os.environ["DST_QDRANT"].rstrip("/") +KEY = os.environ["DST_QDRANT_KEY"] +COLLECTIONS = os.environ.get( + "COLLECTIONS", "bp_iace_accident_stats,bp_iace_safety_kb,bp_iace_failure_kb" +).split(",") +BATCH = 128 + + +def _req(method, url, body=None, key=None): + data = json.dumps(body).encode() if body is not None else None + r = urllib.request.Request(url, data=data, method=method) + r.add_header("Content-Type", "application/json") + if key: + r.add_header("api-key", key) + with urllib.request.urlopen(r, timeout=120) as resp: + return json.loads(resp.read()) + + +def _exists(base, col, key=None) -> bool: + try: + _req("GET", f"{base}/collections/{col}", key=key) + return True + except urllib.error.HTTPError as e: + if e.code == 404: + return False + raise + + +def copy_collection(col: str) -> None: + src_cfg = _req("GET", f"{SRC}/collections/{col}")["result"]["config"]["params"]["vectors"] + size, dist = src_cfg["size"], src_cfg["distance"] + if _exists(DST, col, KEY): + print(f" {col}: dst exists — upserting into it") + else: + _req("PUT", f"{DST}/collections/{col}", {"vectors": {"size": size, "distance": dist}}, KEY) + print(f" {col}: created on dst ({size}d {dist})") + + offset, total = None, 0 + while True: + body = {"limit": BATCH, "with_vector": True, "with_payload": True} + if offset is not None: + body["offset"] = offset + res = _req("POST", f"{SRC}/collections/{col}/points/scroll", body)["result"] + pts = res.get("points", []) + if not pts: + break + upsert = [{"id": p["id"], "vector": p["vector"], "payload": p.get("payload", {})} for p in pts] + _req("PUT", f"{DST}/collections/{col}/points?wait=true", {"points": upsert}, KEY) + total += len(pts) + offset = res.get("next_page_offset") + if offset is None: + break + + src_n = _req("POST", f"{SRC}/collections/{col}/points/count", {"exact": True})["result"]["count"] + dst_n = _req("POST", f"{DST}/collections/{col}/points/count", {"exact": True}, KEY)["result"]["count"] + flag = "OK" if dst_n >= src_n else "MISMATCH" + print(f" {col}: copied {total} | src={src_n} dst={dst_n} [{flag}]") + + +def main() -> None: + print(f"Copy IACE collections {SRC} -> {DST}") + for col in COLLECTIONS: + copy_collection(col.strip()) + print("Done.") + + +if __name__ == "__main__": + main() diff --git a/backend-compliance/compliance/api/cra_assess_routes.py b/backend-compliance/compliance/api/cra_assess_routes.py index 179acd45..20176a38 100644 --- a/backend-compliance/compliance/api/cra_assess_routes.py +++ b/backend-compliance/compliance/api/cra_assess_routes.py @@ -21,7 +21,7 @@ from compliance.services.cra_applicability import ( compute_verdict, compute_machinery_verdict, maturity as evidence_maturity, MACHINE_INTEGRATOR, ) from compliance.services.cra_datasheet_extractor import extract_grenzen -from compliance.services.scanner_mcp_client import fetch_findings +from compliance.services.scanner_mcp_client import fetch_findings, fetch_all_findings from compliance.services.cra_snapshot_store import save_snapshot, list_snapshots, get_snapshot from compliance.services.cra_use_case_controls import enrich_findings_with_breadth from compliance.services.cra_component_findings import findings_from_components @@ -121,10 +121,11 @@ async def assess_from_scanner(body: ScannerPullRequest): scan_type/cvss_score/file_path). Returns empty assessment if no scanner is configured — the frontend then keeps its demo scenario. """ - findings = await fetch_findings( + pulled = await fetch_all_findings( repo_id=body.repo_id, severity=body.severity, base_url=body.scanner_url, token=body.token, ) + findings = pulled.get("findings", []) payload = { "findings": findings, "weights": body.weights, @@ -136,7 +137,8 @@ async def assess_from_scanner(body: ScannerPullRequest): enrich_findings_with_breadth(result.get("mapped", []), db) finally: db.close() - result["source"] = {"scanner": True, "pulled": len(findings)} + result["source"] = {"scanner": True, "pulled": len(findings), + "breakdown": pulled.get("breakdown", {})} return result diff --git a/backend-compliance/compliance/services/scanner_mcp_client.py b/backend-compliance/compliance/services/scanner_mcp_client.py index 0b5da97a..18babd15 100644 --- a/backend-compliance/compliance/services/scanner_mcp_client.py +++ b/backend-compliance/compliance/services/scanner_mcp_client.py @@ -57,3 +57,130 @@ async def fetch_findings( texts = [c.text for c in (result.content or []) if getattr(c, "type", "") == "text"] return parse_findings_text(texts[0]) if texts else [] + + +# --- SBOM + DAST consumption (Sharang's scanner exposes these as dedicated MCP +# tools, not via list_findings) ------------------------------------------------- + +_SEV_BY_RANK = {4: "critical", 3: "high", 2: "medium", 1: "low"} +_SEV_RANK = {v: k for k, v in _SEV_BY_RANK.items()} + + +def normalize_sbom_report(text: str) -> list: + """sbom_vuln_report -> one finding per VULNERABLE PACKAGE (not per CVE — a repo + can have hundreds of CVEs but ~dozens of packages). scan_type='dependency' so + the CRA mapper routes it to dependency-monitoring (CRA-AI-22).""" + try: + data = json.loads(text) + except (json.JSONDecodeError, TypeError): + return [] + if not isinstance(data, dict): + return [] + repo_id = data.get("repo_id", "") + out = [] + for pkg in data.get("packages") or []: + vulns = pkg.get("vulnerabilities") or [] + if not vulns: + continue + ids, seen, best = [], set(), 0 + for v in vulns: + vid = v.get("id") + if vid and vid not in seen: + seen.add(vid) + ids.append(vid) + best = max(best, _SEV_RANK.get((v.get("severity") or "").lower(), 0)) + name, ver = pkg.get("name", ""), pkg.get("version", "") + pm = pkg.get("package_manager", "") or "" + shown = ", ".join(ids[:8]) + (" …" if len(ids) > 8 else "") + out.append({ + "id": f"sbom:{repo_id}:{name}@{ver}", + "repo_id": repo_id, + "title": f"Verwundbare Abhängigkeit: {name} {ver} ({len(ids)} Schwachstelle(n))", + "description": f"Abhängigkeit {name} {ver} ({pm}) mit bekannten Schwachstellen: {shown}.", + "scan_type": "dependency", + # CWE-1395 (Dependency on Vulnerable Third-Party Component) → the CWE + # path maps deterministically to CRA-AI-22, robust against package + # names that happen to contain keyword tokens (e.g. "sqlite" → "sql"). + "cwe": "CWE-1395", + "severity": _SEV_BY_RANK.get(best, "medium"), + "location": f"{pm}:{name}@{ver}" if pm else f"{name}@{ver}", + }) + return out + + +def normalize_dast(text: str) -> list: + """list_dast_findings -> findings (carry cwe + endpoint + vuln_type so the CRA + mapper routes them via cwe/keywords). scan_type='dast'.""" + out = [] + for d in parse_findings_text(text): + if not isinstance(d, dict): + continue + out.append({ + "id": d.get("_id") or d.get("id") or "", + "repo_id": d.get("repo_id") or "", + "title": d.get("title", ""), + "description": " ".join(x for x in [d.get("vuln_type", ""), d.get("description", "")] if x), + "scan_type": "dast", + "cwe": str(d.get("cwe", "") or ""), + "severity": (d.get("severity") or "").lower(), + "location": d.get("endpoint") or d.get("target_id") or "", + "exploited": bool(d.get("exploitable", False)), + }) + return out + + +async def _open_and_call(url: str, tok: str, calls: list) -> dict: + """Open ONE MCP session and run [(tool, params), ...] -> {tool: text}. A tool + that errors yields '' (best-effort; the assessment degrades, never breaks).""" + from mcp.client.session import ClientSession + from mcp.client.streamable_http import streamablehttp_client + + headers = {"Authorization": f"Bearer {tok}"} if tok else None + out: dict = {} + async with streamablehttp_client(url, headers=headers) as (read, write, _): + async with ClientSession(read, write) as session: + await session.initialize() + for tool, params in calls: + try: + res = await session.call_tool(tool, params) + texts = [c.text for c in (res.content or []) if getattr(c, "type", "") == "text"] + out[tool] = texts[0] if texts else "" + except Exception: + out[tool] = "" + return out + + +async def fetch_all_findings( + repo_id: Optional[str] = None, + severity: Optional[str] = None, + limit: int = 200, + base_url: Optional[str] = None, + token: Optional[str] = None, + include_dast: bool = True, +) -> dict: + """Pull list_findings + SBOM-vulns + DAST in one MCP session and return a + unified finding list plus a per-source breakdown. SBOM is repo-scoped + (sbom_vuln_report requires repo_id); DAST has no repo_id filter in the MCP, so + it is deployment-wide (flagged in the breakdown). Returns {} on no config.""" + url = (base_url or SCANNER_MCP_URL).rstrip("/") + tok = token or SCANNER_MCP_TOKEN + if not url: + return {"findings": [], "breakdown": {}} + + calls = [("list_findings", {"limit": limit, **({"repo_id": repo_id} if repo_id else {}), + **({"severity": severity} if severity else {})})] + if repo_id: + calls.append(("sbom_vuln_report", {"repo_id": repo_id})) + if include_dast: + calls.append(("list_dast_findings", {"limit": limit, + **({"severity": severity} if severity else {})})) + + res = await _open_and_call(url, tok, calls) + code = parse_findings_text(res.get("list_findings", "")) + sbom = normalize_sbom_report(res.get("sbom_vuln_report", "")) if repo_id else [] + dast = normalize_dast(res.get("list_dast_findings", "")) if include_dast else [] + return { + "findings": code + sbom + dast, + "breakdown": {"code": len(code), "sbom": len(sbom), "dast": len(dast), + "dast_repo_scoped": False}, + } diff --git a/backend-compliance/tests/test_scanner_mcp_normalize.py b/backend-compliance/tests/test_scanner_mcp_normalize.py new file mode 100644 index 00000000..eb0c1bc8 --- /dev/null +++ b/backend-compliance/tests/test_scanner_mcp_normalize.py @@ -0,0 +1,77 @@ +"""SBOM/DAST normalization from the scanner MCP -> CRA finding shape + mapping. + +Shapes pinned from the live MCP (sbom_vuln_report / list_dast_findings, 2026-06-18). +""" +import json + +from compliance.services.scanner_mcp_client import normalize_sbom_report, normalize_dast +from compliance.services.cra_finding_mapper import ScannerFinding, map_finding + +SBOM = json.dumps({ + "repo_id": "r1", + "vulnerable_packages_count": 1, + "total_vulnerabilities": 3, + "packages": [ + {"name": "dompurify", "version": "3.3.3", "package_manager": "npm", + "license": "MIT", "vulnerabilities": [ + {"id": "GHSA-39q2", "source": "osv", "severity": None}, + {"id": "GHSA-39q2", "source": "osv", "severity": None}, # dup + {"id": "GHSA-76mc", "source": "osv", "severity": "high"}]}, + {"name": "clean-pkg", "version": "1.0", "package_manager": "npm", + "vulnerabilities": []}, # no vulns -> skipped + ], +}) + +DAST = json.dumps([ + {"_id": {"$oid": "abc123"}, "vuln_type": "security_misconfiguration", + "title": "SQL backup exposure: /backup.sql", "description": "Sensitive resource accessible.", + "severity": "high", "cwe": "CWE-16", "endpoint": "https://demo.x/backup.sql", + "method": "GET", "exploitable": True}, +]) + + +class TestSbom: + def test_one_finding_per_vulnerable_package(self): + out = normalize_sbom_report(SBOM) + assert len(out) == 1 # clean-pkg skipped + f = out[0] + assert f["scan_type"] == "dependency" + assert f["cwe"] == "CWE-1395" + assert f["location"] == "npm:dompurify@3.3.3" + assert f["severity"] == "high" # escalated from the one graded vuln + assert "GHSA-39q2" in f["description"] and "GHSA-76mc" in f["description"] + + def test_maps_to_dependency_requirement_even_with_keyword_in_name(self): + # CWE path dominates → CRA-AI-22, not CRA-AI-20 from a "sql"-like name + out = normalize_sbom_report(json.dumps({ + "repo_id": "r", "packages": [ + {"name": "sqlite3", "version": "5.0", "package_manager": "npm", + "vulnerabilities": [{"id": "CVE-x", "severity": "medium"}]}]})) + m = map_finding(ScannerFinding.from_dict(out[0])) + assert m.primary_requirement == "CRA-AI-22" + + def test_bad_json(self): + assert normalize_sbom_report("not json") == [] + assert normalize_sbom_report("{}") == [] + + +class TestDast: + def test_normalizes_dast_finding(self): + out = normalize_dast(DAST) + assert len(out) == 1 + f = out[0] + assert f["scan_type"] == "dast" + assert f["cwe"] == "CWE-16" + assert f["location"] == "https://demo.x/backup.sql" + assert f["exploited"] is True + assert "security_misconfiguration" in f["description"] + + def test_dast_maps_via_cwe(self): + out = normalize_dast(DAST) + m = map_finding(ScannerFinding.from_dict(out[0])) + assert m.primary_requirement == "CRA-AI-1" # CWE-16 -> secure config + assert m.finding_id == "abc123" # _id.$oid extracted + + def test_empty(self): + assert normalize_dast("[]") == [] + assert normalize_dast("not json") == []