feat(cra): SBOM- + DAST-Findings aus dem Scanner-MCP konsumieren
CI / detect-changes (push) Successful in 8s
CI / branch-name (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Successful in 6s
CI / validate-canonical-controls (push) Successful in 10s
CI / loc-budget (push) Successful in 20s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Successful in 1m4s
CI / iace-gt-coverage (push) Successful in 15s
CI / test-python-backend (push) Successful in 24s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped

Sharangs compliance-scanner-agent exponiert SBOM (sbom_vuln_report) + DAST
(list_dast_findings) als eigene MCP-Tools (nicht via list_findings). Neuer
fetch_all_findings(repo_id) zieht list_findings + SBOM + DAST in EINER
MCP-Session und normalisiert ins Finding-Schema:
- SBOM: ein Finding pro verwundbarem Paket (nicht pro CVE), cwe=CWE-1395
  -> deterministisch CRA-AI-22 (robust gegen Paketnamen wie "sqlite").
- DAST: cwe/endpoint/vuln_type uebernommen -> Mapping via cwe/keywords.
assess-from-scanner nutzt fetch_all_findings + liefert source.breakdown
(code/sbom/dast). DAST hat im MCP keinen repo_id-Filter -> dast_repo_scoped:false
(deployment-weit, transparent geflaggt).

Echte MCP-Daten: Kitchenasty 58 code + 35 sbom + 81 dast -> 174 gemappt
(Coverage 94,3%, alle 35 SBOM -> CRA-AI-22).

Enthaelt zusaetzlich das Qdrant->Prod-Kopierskript (#42, verbatim macmini->prod).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Benjamin Bönisch
2026-06-18 12:05:05 +02:00
parent 8f21650d74
commit 43e02f794a
4 changed files with 298 additions and 3 deletions
@@ -0,0 +1,89 @@
#!/usr/bin/env python3
"""Verbatim copy of the IACE Qdrant knowledge-base collections to another Qdrant.
There is no RAG/embedding service on prod, so the normal ingest_iace_kb.sh has no
target there. Instead we copy the already-embedded points (id + vector + payload)
1:1 from the source Qdrant (macmini) to the destination (prod). No re-embedding,
no re-chunking → the destination is byte-identical and /sdk/v1/rag/search reads it
the same way. Idempotent: same point ids → upsert overwrites, no duplicates.
Usage (run on macmini; reads local Qdrant, writes prod Qdrant):
SRC_QDRANT=http://localhost:6333 \
DST_QDRANT=https://qdrant-dev.breakpilot.ai \
DST_QDRANT_KEY=<prod-api-key> \
python3 copy_iace_collections_to_prod.py
"""
import json
import os
import urllib.error
import urllib.request
# Base URL of the source Qdrant; falls back to the local instance. The trailing
# "/" is removed so request paths can be appended with a plain f-string.
SRC = os.environ.get("SRC_QDRANT", "http://localhost:6333").rstrip("/")
# Base URL of the destination Qdrant. Required: a missing variable raises
# KeyError as soon as the module is loaded.
DST = os.environ["DST_QDRANT"].rstrip("/")
# API key for the destination (required). Requests against SRC are sent without it.
KEY = os.environ["DST_QDRANT_KEY"]
# Comma-separated collection names to copy; override with the COLLECTIONS env var.
COLLECTIONS = os.environ.get(
"COLLECTIONS", "bp_iace_accident_stats,bp_iace_safety_kb,bp_iace_failure_kb"
).split(",")
# Number of points requested per scroll call and written per upsert call.
BATCH = 128
def _req(method, url, body=None, key=None):
data = json.dumps(body).encode() if body is not None else None
r = urllib.request.Request(url, data=data, method=method)
r.add_header("Content-Type", "application/json")
if key:
r.add_header("api-key", key)
with urllib.request.urlopen(r, timeout=120) as resp:
return json.loads(resp.read())
def _exists(base, col, key=None) -> bool:
    """Return True if collection ``col`` can be fetched from ``base``.

    A 404 response means "does not exist" and yields False; every other HTTP
    error is re-raised unchanged.
    """
    try:
        _req("GET", f"{base}/collections/{col}", key=key)
    except urllib.error.HTTPError as err:
        if err.code != 404:
            raise
        return False
    return True
def _vectors_for_create(src_vectors: dict):
    """Return (create-config, human-readable label) for the source ``vectors`` params.

    Handles both the single unnamed vector form ``{"size", "distance"}`` and the
    named form ``{name: {"size", "distance"}, ...}``.
    """
    if isinstance(src_vectors.get("size"), int):
        size, dist = src_vectors["size"], src_vectors["distance"]
        return {"size": size, "distance": dist}, f"{size}d {dist}"
    config = {
        name: {"size": params["size"], "distance": params["distance"]}
        for name, params in src_vectors.items()
    }
    label = ", ".join(f"{name}: {c['size']}d {c['distance']}" for name, c in config.items())
    return config, label


def copy_collection(col: str) -> None:
    """Copy every point of collection ``col`` from SRC to DST and print a summary.

    The destination collection is created with the source's vector size and
    distance when it does not exist yet; otherwise points are upserted into the
    existing collection. Points are read with the scroll API in pages of
    ``BATCH`` and written with ``wait=true``. Finally the exact point counts of
    both sides are compared and reported (``MISMATCH`` if dst holds fewer points
    than src). HTTP errors from either side propagate.
    """
    if _exists(DST, col, KEY):
        print(f" {col}: dst exists — upserting into it")
    else:
        # Only needed for creation, so an existing destination never depends on
        # the shape of the source's vector config.
        src_vectors = _req("GET", f"{SRC}/collections/{col}")["result"]["config"]["params"]["vectors"]
        vectors, label = _vectors_for_create(src_vectors)
        _req("PUT", f"{DST}/collections/{col}", {"vectors": vectors}, KEY)
        print(f" {col}: created on dst ({label})")

    offset, total = None, 0
    while True:
        body = {"limit": BATCH, "with_vector": True, "with_payload": True}
        if offset is not None:
            body["offset"] = offset
        res = _req("POST", f"{SRC}/collections/{col}/points/scroll", body)["result"]
        pts = res.get("points", [])
        if not pts:
            break
        # Same id + vector + payload on both sides: re-running overwrites
        # instead of duplicating.
        upsert = [{"id": p["id"], "vector": p["vector"], "payload": p.get("payload", {})} for p in pts]
        _req("PUT", f"{DST}/collections/{col}/points?wait=true", {"points": upsert}, KEY)
        total += len(pts)
        offset = res.get("next_page_offset")
        if offset is None:
            # No further page announced by the source.
            break

    src_n = _req("POST", f"{SRC}/collections/{col}/points/count", {"exact": True})["result"]["count"]
    dst_n = _req("POST", f"{DST}/collections/{col}/points/count", {"exact": True}, KEY)["result"]["count"]
    # dst may legitimately hold more points than src (e.g. from an earlier run).
    flag = "OK" if dst_n >= src_n else "MISMATCH"
    print(f" {col}: copied {total} | src={src_n} dst={dst_n} [{flag}]")
def main() -> None:
    """Copy every configured collection from SRC to DST, one after another."""
    print(f"Copy IACE collections {SRC} -> {DST}")
    for raw_name in COLLECTIONS:
        col = raw_name.strip()
        if not col:
            # An empty entry (e.g. a trailing comma in COLLECTIONS) would build
            # a request URL without a collection name; skip it instead.
            continue
        copy_collection(col)
    print("Done.")


if __name__ == "__main__":
    main()
@@ -21,7 +21,7 @@ from compliance.services.cra_applicability import (
compute_verdict, compute_machinery_verdict, maturity as evidence_maturity, MACHINE_INTEGRATOR,
)
from compliance.services.cra_datasheet_extractor import extract_grenzen
from compliance.services.scanner_mcp_client import fetch_findings
from compliance.services.scanner_mcp_client import fetch_findings, fetch_all_findings
from compliance.services.cra_snapshot_store import save_snapshot, list_snapshots, get_snapshot
from compliance.services.cra_use_case_controls import enrich_findings_with_breadth
from compliance.services.cra_component_findings import findings_from_components
@@ -121,10 +121,11 @@ async def assess_from_scanner(body: ScannerPullRequest):
scan_type/cvss_score/file_path). Returns empty assessment if no scanner is
configured — the frontend then keeps its demo scenario.
"""
findings = await fetch_findings(
pulled = await fetch_all_findings(
repo_id=body.repo_id, severity=body.severity,
base_url=body.scanner_url, token=body.token,
)
findings = pulled.get("findings", [])
payload = {
"findings": findings,
"weights": body.weights,
@@ -136,7 +137,8 @@ async def assess_from_scanner(body: ScannerPullRequest):
enrich_findings_with_breadth(result.get("mapped", []), db)
finally:
db.close()
result["source"] = {"scanner": True, "pulled": len(findings)}
result["source"] = {"scanner": True, "pulled": len(findings),
"breakdown": pulled.get("breakdown", {})}
return result
@@ -57,3 +57,130 @@ async def fetch_findings(
texts = [c.text for c in (result.content or []) if getattr(c, "type", "") == "text"]
return parse_findings_text(texts[0]) if texts else []
# --- SBOM + DAST consumption (Sharang's scanner exposes these as dedicated MCP
# tools, not via list_findings) -------------------------------------------------
# Severity ranking used to pick the worst severity among a package's vulnerabilities.
_SEV_BY_RANK = {4: "critical", 3: "high", 2: "medium", 1: "low"}
_SEV_RANK = {v: k for k, v in _SEV_BY_RANK.items()}


def normalize_sbom_report(text: str) -> list:
    """Convert an ``sbom_vuln_report`` JSON text into findings.

    Emits one finding per VULNERABLE PACKAGE (not per CVE — a repo can have
    hundreds of CVEs but only dozens of packages). Each finding carries
    ``scan_type='dependency'`` and ``cwe='CWE-1395'``.

    Args:
        text: JSON text of the report; expected to be an object with
            ``repo_id`` and a ``packages`` list.

    Returns:
        A list of finding dicts. Empty when ``text`` is not valid JSON, is not
        a JSON object, or contains no package with vulnerabilities. Malformed
        entries (non-dict packages / vulnerabilities) are skipped rather than
        raising.
    """
    max_ids_shown = 8
    try:
        data = json.loads(text)
    except (json.JSONDecodeError, TypeError):
        return []
    if not isinstance(data, dict):
        return []
    packages = data.get("packages")
    if not isinstance(packages, list):
        return []

    repo_id = data.get("repo_id", "")
    out = []
    for pkg in packages:
        if not isinstance(pkg, dict):
            continue
        raw_vulns = pkg.get("vulnerabilities")
        if not isinstance(raw_vulns, list):
            continue
        vulns = [v for v in raw_vulns if isinstance(v, dict)]
        if not vulns:
            continue

        # De-duplicate ids (order-preserving) and track the worst known severity.
        ids, seen, best = [], set(), 0
        for v in vulns:
            vid = v.get("id")
            if vid:
                vid = str(vid)
                if vid not in seen:
                    seen.add(vid)
                    ids.append(vid)
            best = max(best, _SEV_RANK.get(str(v.get("severity") or "").lower(), 0))

        name, ver = pkg.get("name", ""), pkg.get("version", "")
        pm = pkg.get("package_manager", "") or ""
        shown = ", ".join(ids[:max_ids_shown])
        if len(ids) > max_ids_shown:
            # Make the truncation visible instead of silently dropping ids.
            shown += f" (+{len(ids) - max_ids_shown} weitere)"
        out.append({
            "id": f"sbom:{repo_id}:{name}@{ver}",
            "repo_id": repo_id,
            "title": f"Verwundbare Abhängigkeit: {name} {ver} ({len(ids)} Schwachstelle(n))",
            "description": f"Abhängigkeit {name} {ver} ({pm}) mit bekannten Schwachstellen: {shown}.",
            "scan_type": "dependency",
            # CWE-1395 (Dependency on Vulnerable Third-Party Component) → the CWE
            # path maps deterministically to CRA-AI-22, robust against package
            # names that happen to contain keyword tokens (e.g. "sqlite" → "sql").
            "cwe": "CWE-1395",
            # No graded vulnerability at all falls back to "medium".
            "severity": _SEV_BY_RANK.get(best, "medium"),
            "location": f"{pm}:{name}@{ver}" if pm else f"{name}@{ver}",
        })
    return out
def normalize_dast(text: str) -> list:
    """Convert ``list_dast_findings`` text into findings with ``scan_type='dast'``.

    The cwe, endpoint and vuln_type of each entry are carried over so the CRA
    mapper can route via cwe/keywords. Entries that are not dicts are ignored.
    """
    findings = []
    for raw in parse_findings_text(text):
        if isinstance(raw, dict):
            # vuln_type first, then the free-text description; empty parts dropped.
            desc_parts = [raw.get("vuln_type", ""), raw.get("description", "")]
            severity = raw.get("severity") or ""
            findings.append({
                "id": raw.get("_id") or raw.get("id") or "",
                "repo_id": raw.get("repo_id") or "",
                "title": raw.get("title", ""),
                "description": " ".join(part for part in desc_parts if part),
                "scan_type": "dast",
                "cwe": str(raw.get("cwe", "") or ""),
                "severity": severity.lower(),
                "location": raw.get("endpoint") or raw.get("target_id") or "",
                "exploited": bool(raw.get("exploitable", False)),
            })
    return findings
async def _open_and_call(url: str, tok: str, calls: list) -> dict:
    """Open ONE MCP session and run ``[(tool, params), ...]`` -> ``{tool: text}``.

    Best-effort per tool: a tool call that raises, or that comes back flagged
    as an error result, yields ``''`` for that tool and is logged as a warning,
    so the assessment degrades instead of breaking. Failures while opening or
    initializing the session itself are NOT caught here and propagate.

    Args:
        url: MCP endpoint URL.
        tok: Bearer token; when empty no Authorization header is sent.
        calls: Sequence of ``(tool_name, params_dict)`` pairs, run in order.

    Returns:
        Mapping of tool name to the first text content of its result ('' if
        the tool returned no text or failed).
    """
    import logging

    from mcp.client.session import ClientSession
    from mcp.client.streamable_http import streamablehttp_client

    log = logging.getLogger(__name__)
    headers = {"Authorization": f"Bearer {tok}"} if tok else None
    out: dict = {}
    async with streamablehttp_client(url, headers=headers) as (read, write, _):
        async with ClientSession(read, write) as session:
            await session.initialize()
            for tool, params in calls:
                try:
                    res = await session.call_tool(tool, params)
                    if getattr(res, "isError", False):
                        # Tool-level failure reported in-band: its text is an
                        # error message, not findings — do not pass it on.
                        log.warning("scanner MCP tool %r returned an error result", tool)
                        out[tool] = ""
                        continue
                    texts = [c.text for c in (res.content or []) if getattr(c, "type", "") == "text"]
                    out[tool] = texts[0] if texts else ""
                except Exception:
                    # Deliberate best-effort boundary: keep going with the other
                    # tools, but leave a trace so empty results are diagnosable.
                    log.warning("scanner MCP tool %r failed; treating as empty", tool, exc_info=True)
                    out[tool] = ""
    return out
async def fetch_all_findings(
    repo_id: Optional[str] = None,
    severity: Optional[str] = None,
    limit: int = 200,
    base_url: Optional[str] = None,
    token: Optional[str] = None,
    include_dast: bool = True,
) -> dict:
    """Pull list_findings + SBOM vulns + DAST in one MCP session.

    SBOM is repo-scoped (``sbom_vuln_report`` is only called when ``repo_id``
    is given). DAST is called without a ``repo_id`` filter, so it is
    deployment-wide; this is flagged as ``dast_repo_scoped: False`` in the
    breakdown. ``limit`` and ``severity`` are applied to list_findings and to
    DAST separately.

    Args:
        repo_id: Repository to scope code findings and the SBOM report to.
        severity: Optional severity filter forwarded to list_findings and DAST.
        limit: Maximum number of entries requested per tool.
        base_url: Scanner URL; falls back to ``SCANNER_MCP_URL``.
        token: Bearer token; falls back to ``SCANNER_MCP_TOKEN``.
        include_dast: When False, DAST is neither requested nor returned.

    Returns:
        ``{"findings": [...], "breakdown": {"code", "sbom", "dast",
        "dast_repo_scoped"}}``. When no scanner URL is configured:
        ``{"findings": [], "breakdown": {}}``.
    """
    # `or ""` keeps an unset (None) default URL on the "no config" path instead
    # of failing on .rstrip.
    url = (base_url or SCANNER_MCP_URL or "").rstrip("/")
    tok = token or SCANNER_MCP_TOKEN
    if not url:
        return {"findings": [], "breakdown": {}}

    severity_filter = {"severity": severity} if severity else {}
    list_params = {"limit": limit}
    if repo_id:
        list_params["repo_id"] = repo_id
    list_params.update(severity_filter)

    calls = [("list_findings", list_params)]
    if repo_id:
        calls.append(("sbom_vuln_report", {"repo_id": repo_id}))
    if include_dast:
        calls.append(("list_dast_findings", {"limit": limit, **severity_filter}))

    res = await _open_and_call(url, tok, calls)
    code = parse_findings_text(res.get("list_findings", ""))
    sbom = normalize_sbom_report(res.get("sbom_vuln_report", "")) if repo_id else []
    dast = normalize_dast(res.get("list_dast_findings", "")) if include_dast else []
    return {
        "findings": code + sbom + dast,
        "breakdown": {"code": len(code), "sbom": len(sbom), "dast": len(dast),
                      "dast_repo_scoped": False},
    }
@@ -0,0 +1,77 @@
"""SBOM/DAST normalization from the scanner MCP -> CRA finding shape + mapping.
Shapes pinned from the live MCP (sbom_vuln_report / list_dast_findings, 2026-06-18).
"""
import json
from compliance.services.scanner_mcp_client import normalize_sbom_report, normalize_dast
from compliance.services.cra_finding_mapper import ScannerFinding, map_finding
# SBOM fixture: one package with three vulnerability entries (one duplicated id,
# one graded "high", the others ungraded) plus one package without
# vulnerabilities.
SBOM = json.dumps({
"repo_id": "r1",
"vulnerable_packages_count": 1,
"total_vulnerabilities": 3,
"packages": [
{"name": "dompurify", "version": "3.3.3", "package_manager": "npm",
"license": "MIT", "vulnerabilities": [
{"id": "GHSA-39q2", "source": "osv", "severity": None},
{"id": "GHSA-39q2", "source": "osv", "severity": None}, # dup
{"id": "GHSA-76mc", "source": "osv", "severity": "high"}]},
{"name": "clean-pkg", "version": "1.0", "package_manager": "npm",
"vulnerabilities": []}, # no vulns -> skipped
],
})
# DAST fixture: a single finding whose "_id" is wrapped in a {"$oid": ...} object.
DAST = json.dumps([
{"_id": {"$oid": "abc123"}, "vuln_type": "security_misconfiguration",
"title": "SQL backup exposure: /backup.sql", "description": "Sensitive resource accessible.",
"severity": "high", "cwe": "CWE-16", "endpoint": "https://demo.x/backup.sql",
"method": "GET", "exploitable": True},
])
class TestSbom:
    """normalize_sbom_report: shape of the emitted findings and their CRA mapping."""

    def test_one_finding_per_vulnerable_package(self):
        out = normalize_sbom_report(SBOM)
        assert len(out) == 1  # clean-pkg skipped
        f = out[0]
        assert f["id"] == "sbom:r1:dompurify@3.3.3"
        assert f["scan_type"] == "dependency"
        assert f["cwe"] == "CWE-1395"
        assert f["location"] == "npm:dompurify@3.3.3"
        assert f["severity"] == "high"  # escalated from the one graded vuln
        assert "GHSA-39q2" in f["description"] and "GHSA-76mc" in f["description"]
        # The fixture lists GHSA-39q2 twice: it must be counted and shown once.
        assert "(2 Schwachstelle(n))" in f["title"]
        assert f["description"].count("GHSA-39q2") == 1

    def test_maps_to_dependency_requirement_even_with_keyword_in_name(self):
        # CWE path dominates → CRA-AI-22, not CRA-AI-20 from a "sql"-like name
        out = normalize_sbom_report(json.dumps({
            "repo_id": "r", "packages": [
                {"name": "sqlite3", "version": "5.0", "package_manager": "npm",
                 "vulnerabilities": [{"id": "CVE-x", "severity": "medium"}]}]}))
        m = map_finding(ScannerFinding.from_dict(out[0]))
        assert m.primary_requirement == "CRA-AI-22"

    def test_bad_json(self):
        assert normalize_sbom_report("not json") == []
        assert normalize_sbom_report("{}") == []
class TestDast:
    """normalize_dast: field carry-over and CWE-based CRA mapping."""

    def test_normalizes_dast_finding(self):
        findings = normalize_dast(DAST)
        assert len(findings) == 1
        finding = findings[0]
        assert finding["scan_type"] == "dast"
        assert finding["cwe"] == "CWE-16"
        assert finding["location"] == "https://demo.x/backup.sql"
        assert finding["exploited"] is True
        assert "security_misconfiguration" in finding["description"]

    def test_dast_maps_via_cwe(self):
        first = normalize_dast(DAST)[0]
        mapped = map_finding(ScannerFinding.from_dict(first))
        assert mapped.primary_requirement == "CRA-AI-1"  # CWE-16 -> secure config
        assert mapped.finding_id == "abc123"  # _id.$oid extracted

    def test_empty(self):
        assert normalize_dast("[]") == []
        assert normalize_dast("not json") == []