diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..7a9d74e --- /dev/null +++ b/.editorconfig @@ -0,0 +1,18 @@ +root = true + +[*] +charset = utf-8 +end_of_line = lf +indent_style = space +indent_size = 2 +insert_final_newline = true +trim_trailing_whitespace = true + +[*.go] +indent_style = tab + +[Makefile] +indent_style = tab + +[*.md] +trim_trailing_whitespace = false diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..e69de29 diff --git a/.gitea/issue_template/bug.md b/.gitea/issue_template/bug.md new file mode 100644 index 0000000..c54e86f --- /dev/null +++ b/.gitea/issue_template/bug.md @@ -0,0 +1,50 @@ +--- +name: Bug report +about: Something works incorrectly or breaks +labels: bug +--- + +## What happened + + + +## What I expected + + + +## Steps to reproduce + +1. +2. +3. + +## Environment + +- **Env:** dev / stage / prod +- **Tenant slug:** +- **Product:** +- **Release tag / commit SHA:** +- **Browser (if portal):** + +## Evidence + + + +``` + +``` + +**SigNoz trace:** + +## Blast radius + +- [ ] Affects a single tenant +- [ ] Affects multiple tenants +- [ ] Affects all tenants on this env +- [ ] Data loss or corruption risk +- [ ] Security / authz implication + +## Suspected cause (optional) + + diff --git a/.gitea/issue_template/feature.md b/.gitea/issue_template/feature.md new file mode 100644 index 0000000..00da226 --- /dev/null +++ b/.gitea/issue_template/feature.md @@ -0,0 +1,41 @@ +--- +name: Feature / change request +about: Propose a new capability or behavior change +labels: enhancement +--- + +## Problem + + + +## Proposed solution + + + +## Acceptance criteria + + + +- [ ] +- [ ] +- [ ] + +## Alternatives considered + + + +## Linked milestone + + + +M0.1 — or **new milestone needed** + +## Out of scope + + + +## Open questions + + diff --git a/.gitea/pull_request_template.md b/.gitea/pull_request_template.md new file mode 100644 index 0000000..8483f14 --- /dev/null +++ 
b/.gitea/pull_request_template.md @@ -0,0 +1,66 @@ + + +## What + + +- + +## Why + + + +Linked milestone: **M0.1** + + + +## How + + + +## Test plan + +- [ ] Unit tests added/updated +- [ ] Integration tests added/updated (real DB via testcontainers) +- [ ] Playwright e2e added/updated (only if user-facing flow changed) +- [ ] Manual smoke on stage after deploy +- [ ] Regression test added (only if this PR fixes a bug — must fail before the fix) + + + +## Risk + +**Blast radius:** + +**What could break:** +- + +**Rollback plan:** + + +## Checklist + +- [ ] Docs updated (or n/a — explain) +- [ ] Audit events emitted for state changes (or n/a) +- [ ] Secrets via Infisical, never in repo +- [ ] Migration is forward-only + idempotent (or no migration) +- [ ] Tenant scoping enforced on every DB query (or no DB access) +- [ ] OpenAPI spec updated (or no API change) +- [ ] `featureFlags.evaluate()` used for any toggleable behavior (or n/a) +- [ ] CHANGELOG entry under "Unreleased" (or n/a) + +## Screenshots / recordings + + + +--- + + diff --git a/.gitea/workflows/ci.yaml b/.gitea/workflows/ci.yaml new file mode 100644 index 0000000..dbd02aa --- /dev/null +++ b/.gitea/workflows/ci.yaml @@ -0,0 +1,31 @@ +# CI skeleton (TypeScript shape; no app code yet). +# Lights up to commitlint + gitleaks + trivy fs scan. Add lint/test/build jobs +# when this repo grows real package code. 
+name: ci + +on: + pull_request: + branches: [main] + push: + branches: [main] + +jobs: + shared: + runs-on: docker + steps: + - uses: actions/checkout@v4 + with: { fetch-depth: 0 } + + - name: commitlint (PR only) + if: github.event_name == 'pull_request' + uses: wagoid/commitlint-github-action@v6 + + - name: gitleaks + uses: gitleaks/gitleaks-action@v2 + + - name: trivy fs scan + uses: aquasecurity/trivy-action@master + with: + scan-type: fs + severity: HIGH,CRITICAL + exit-code: 1 diff --git a/.gitea/workflows/release.yaml b/.gitea/workflows/release.yaml new file mode 100644 index 0000000..80c7348 --- /dev/null +++ b/.gitea/workflows/release.yaml @@ -0,0 +1,85 @@ +# release.yaml — production release on git tag vX.Y.Z. +# Promotes the image already on stage to prod, gated by manual sign-off. +name: release + +on: + push: + tags: ['v*.*.*'] + +jobs: + promote: + runs-on: docker + environment: + name: production # Gitea Environments — requires sign-off per branch protection + url: https://yourplatform.com + steps: + - uses: actions/checkout@v4 + with: { fetch-depth: 0 } + + - name: extract version + id: v + run: echo "version=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT + + - name: verify stage soak (>= 24h on this image) + run: | + IMG=registry.yourplatform.com/${{ github.event.repository.name }}:env-stage + SOAK_SECONDS=$(orca image-age --env=stage --image $IMG) + if [ "$SOAK_SECONDS" -lt 86400 ]; then + echo "Stage soak only $SOAK_SECONDS s, < 24h. Aborting." 
+ exit 1 + fi + env: + ORCA_TOKEN: ${{ secrets.ORCA_STAGE_TOKEN }} + + - name: re-tag image as semver + env-prod + uses: docker/login-action@v3 + with: + registry: registry.yourplatform.com + username: ${{ secrets.REGISTRY_USER }} + password: ${{ secrets.REGISTRY_PASS }} + + - run: | + IMG=registry.yourplatform.com/${{ github.event.repository.name }} + docker pull $IMG:env-stage + docker tag $IMG:env-stage $IMG:v${{ steps.v.outputs.version }} + docker tag $IMG:env-stage $IMG:env-prod + docker push $IMG:v${{ steps.v.outputs.version }} + docker push $IMG:env-prod + + - name: deploy to prod + run: orca apply --env=prod --image-tag=v${{ steps.v.outputs.version }} + env: + ORCA_TOKEN: ${{ secrets.ORCA_PROD_TOKEN }} + + - name: post-deploy smoke + run: orca exec --env=prod smoke-runner + + - name: generate release notes from conventional commits + uses: orhun/git-cliff-action@v3 + with: + config: cliff.toml + args: --latest --strip header + env: + OUTPUT: RELEASE_NOTES.md + + - name: create Gitea release + run: | + curl -X POST -H "Authorization: token ${{ secrets.GITEA_TOKEN }}" \ + -H "Content-Type: application/json" \ + -d "$(jq -Rs '{tag_name:"v${{ steps.v.outputs.version }}", name:"v${{ steps.v.outputs.version }}", body:.}' < RELEASE_NOTES.md)" \ + https://gitea.meghsakha.com/api/v1/repos/${{ github.repository }}/releases + + rollback-on-failure: + needs: promote + if: failure() + runs-on: docker + steps: + - name: orca rollback prod + run: orca rollout undo ${{ github.event.repository.name }} --env=prod + env: + ORCA_TOKEN: ${{ secrets.ORCA_PROD_TOKEN }} + - name: page on-call + run: | + curl -X POST -H "Content-Type: application/json" \ + -d '{"text":"Release of ${{ github.event.repository.name }} ${{ github.ref }} FAILED. Rolled back. 
See Gitea Actions run."}' \ + ${{ secrets.ONCALL_WEBHOOK }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..376d6c0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,37 @@ +# OS +.DS_Store +Thumbs.db + +# Editors +.vscode/ +.idea/ +*.swp +*~ + +# Local secrets +.env +.env.local +.env.*.local + +# Build outputs +dist/ +build/ +out/ +target/ +coverage/ +*.log +*.tmp + +# Node +node_modules/ +.pnpm-store/ +.next/ +.turbo/ + +# Go +*.test +*.out +vendor/ + +# Rust +**/target/ diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..a143db9 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,25 @@ +# Changelog + +All notable changes to this repo. Format: [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). +Generated section is appended on release tag via `git-cliff` (see `.gitea/workflows/release.yaml`). + +## [Unreleased] + +### Added +- + +### Changed +- + +### Fixed +- + +### Removed +- + +### Security +- + +--- + + diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 0000000..8f9856c --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,35 @@ +# CODEOWNERS — auto-requests reviewers based on touched paths. +# Format: <@user-or-team> [<@user-or-team> ...] +# More specific patterns override less specific ones. +# See: https://docs.gitea.com/usage/code-owners +# +# This is the BASELINE — copy into the repo and tighten paths per service. 
+ +# Default — every PR gets at least Sharang +* @sharang + +# Architecture / specs / runbooks (touchy — both founders look) +/docs/ @sharang @benjamin_boenisch +*.md @sharang @benjamin_boenisch + +# Security-sensitive paths +/internal/auth/ @sharang +/internal/keycloak/ @sharang +/internal/api-keys/ @sharang +/middleware/auth/ @sharang + +# Schema and data migrations — irreversible, both founders look +/migrations/ @sharang @benjamin_boenisch +**/schema/ @sharang @benjamin_boenisch + +# Infra-as-code +/orca/ @sharang +/.gitea/workflows/ @sharang +/Dockerfile @sharang + +# Manifests (catalog metadata visible to customers) +/product.manifest.yaml @sharang @benjamin_boenisch + +# Frontend-only changes +/src/app/ @sharang +/src/components/ @sharang diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..d54d1a5 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,89 @@ +# Contributing + +Conventions are platform-wide. The full ruleset lives in [`platform/docs/IMPLEMENTATION_PLAN.md §1`](https://gitea.meghsakha.com/platform/docs/src/branch/main/IMPLEMENTATION_PLAN.md). This is the short version. + +## Branching + +- Trunk-based. `main` is always deployable. +- Branch from `main`. Name: `feat/`, `fix/`, `chore/`, `docs/`, `refactor/`. +- Max 5 days. Longer-lived branches get merge conflicts and stop being trusted. +- Never push directly to `main` (branch protection blocks it). + +## Commits + +[Conventional Commits](https://www.conventionalcommits.org/) — enforced by `commitlint` in CI. + +``` +<type>(<scope>)?: <subject> + +[optional body] + +[optional footer: BREAKING CHANGE: ..., Refs: M5.2] +``` + +Types: `feat`, `fix`, `chore`, `docs`, `refactor`, `test`, `perf`, `build`, `ci`. +Breaking change: append `!` (e.g. `feat!: drop /v0 endpoints`) and add `BREAKING CHANGE:` footer. 
+ +Examples: +``` +feat(api): add POST /v1/tenants/:id/cancel +fix(auth): reject JWT when org_id missing +docs: link runbook from README +refactor!: rename column tenant.kind → tenant.type +``` + +## Pull requests + +1. Open a PR against `main` using the template (`.gitea/pull_request_template.md` is auto-loaded). +2. Fill **every** section — the template is a checklist, not decoration. +3. Link the milestone in the body: `Linked milestone: M5.2`. +4. Wait for green CI + 1 approving review. **Do not self-merge.** +5. Squash-merge. The PR title becomes the commit message — keep it as a Conventional Commit. + +## Tests + +| Change type | Required tests | +|---|---| +| New API endpoint | unit + integration (testcontainers, real DB) | +| New user-facing flow | Playwright e2e against stage | +| Bug fix | regression test FIRST (must fail before fix) | +| IaC / Orca manifest | `orca validate` + dry-run plan in PR comment | +| Pure refactor | existing suite must stay green | + +**"Manually tested" is not acceptable** except for IaC, and even there the dry-run plan must be in the PR. + +## Secrets + +- Never commit secrets. `gitleaks` runs in CI and blocks merge. +- Local dev: `.env.local` (gitignored); template at `.env.example`. +- Stage / prod: Infisical machine identity at `/{env}/{service}/`. + +## Code style + +| Stack | Tools | +|---|---| +| Go | `go fmt`, `go vet`, `golangci-lint run` — all required clean | +| Rust | `cargo fmt --all`, `cargo clippy -- -D warnings` — both required | +| TypeScript | `pnpm lint`, `pnpm typecheck` — both required | +| Python | `ruff check`, `ruff format`, `mypy` — all required | + +CI runs these. Pre-commit hooks recommended (`.githooks/pre-commit` in this repo). + +## Audit + observability + +Any state-changing endpoint MUST emit an audit event to Tenant Registry `/audit` in the Retraced-shape schema. See [`PRODUCT_INTEGRATION_SPEC.md §8.4`](https://gitea.meghsakha.com/platform/docs/src/branch/main/PRODUCT_INTEGRATION_SPEC.md). 
+ +Any service ships OTel SDK from day one (`OTEL_EXPORTER_OTLP_ENDPOINT` injected by Orca). No `fmt.Println` / `console.log` in committed code. + +## Reviewer hat + +When reviewing, check in this order: +1. **Risk** — what could break in prod? Is the rollback clear? +2. **Tests** — do they actually exercise the change? +3. **Security** — secrets, authz, input validation, tenant scoping. +4. **Correctness** — does it do what the PR says it does? +5. **Style** — last; CI already caught the mechanical stuff. + +## Questions + +`#engineering` channel · `oncall@yourplatform.com` · or open a PR with a `[WIP]` prefix and ask in the description. diff --git a/COST_PLAN.md b/COST_PLAN.md new file mode 100644 index 0000000..cf38c60 --- /dev/null +++ b/COST_PLAN.md @@ -0,0 +1,258 @@ +# Cost Plan — SysEleven Infrastructure + +Companion to `INFRASTRUCTURE.md` and `IMPLEMENTATION_PLAN.md`. Pricing source: `SysEleven-Cloud-Services-Preisinformationen_01_26_v2.pdf` (effective 2026-01-20). All prices net EUR, exclusive of 19% VAT. Region: DUS2 + HAM1. + +--- + +## 1. TL;DR + +**Locked topology (2026-05-18):** 4 billable VMs — 1 stage + 3 prod — totalling **48 GiB-RAM**. See `INFRASTRUCTURE.md §1`. + +All four pricing modes, side by side, at the locked sizing: + +| Mode | Compute €/mo | Storage €/mo | Network €/mo | **Total net €/mo** | + 19% VAT | **Annual gross €** | +|---|---:|---:|---:|---:|---:|---:| +| **On-Demand** | 434.50 | 112 | 2.92 | **549.42** | 653.81 | **7,846** | +| **12-month commit** | 295.20 | 112 | 2.92 | **410.12** | 488.04 | **5,856** | +| **36-month no upfront** | 216.00 | 112 | 2.92 | **330.92** | 393.79 | **4,725** | +| **36-month upfront** | 192.00 | 112 | 2.92 | **306.92** | 365.23 | **4,383** | + +**36M upfront one-time payment**: €6,912 net at signing (compute only; storage + network still billed monthly). + +**Recommended cash plan for Year 1:** +1. Months 1–3: burn On-Demand (~€549/mo) while flavors get proven against real workload +2. 
Month 4 onward: sign 36M-upfront commit at proven size (~€307/mo) +3. Year-1 total infra: **€4,410 net / €5,248 gross** + one-time €6,912 upfront in Month 4 + +Growth tiers extend that same baseline (next 4 sections drill in). + +--- + +## 2. What to use / what to skip + +### Use from day one +| Service | Why | Cost | +|---|---|---| +| **OpenStack IaaS (m2 GP)** | Bread and butter. General-purpose 1:4 vCPU:RAM fits everything. | per VM, see §3 | +| **Block Storage (Ceph)** | 3x replicated, persistent. €0.10/GiB/mo. | per GiB | +| **Object Storage (S3)** | Backups, audit logs, demo seed bundles, export ZIPs. €0.02/GiB/mo. | per GiB | +| **Floating IP** | Public IPs for vm-edge (1) and stage (1). | €2.92/IP/mo | +| **VPN as a Service** | Inclusive. Use for ops access from our laptops. | €0 | +| **Self-Service Support** | Free. Adequate while we're shaking out the platform. | €0 | + +### Defer until clearly needed +| Service | When to add | Cost | +|---|---|---| +| **DNS Zones (DNSaaS)** | Never — we self-host PowerDNS on vm-edge per [[self-hosted-oss-first]] | €10/zone — skipped | +| **Load Balancer (Octavia)** | When we add a second vm-edge for HA (Tier D). Until then orca-proxy + Floating IP is enough. | €14.60–57.67/mo | +| **Business Support** | When MRR > €5k. Below that, Self-Service docs cover us. | €185/mo | +| **Priority Support** | Only if we sign an Enterprise contract that requires <1h response. | €545/mo | +| **DDoS Guard PLUS** | After first attack OR before launching anything customer-promoted. | €875/mo | +| **DBaaS PostgreSQL Cluster** | When tenant_registry Postgres becomes the bottleneck (200+ customers, see RISK-1 in INFRASTRUCTURE.md). | €213–426/mo per cluster (m2.small–medium, 36M upfront) | +| **MetaKube Core (managed K8s)** | We use Orca (the user's own product). MetaKube would compete with Orca, not complement it. Skip unless Orca is replaced. | €0 by design | +| **Managed VM (Business/Priority)** | Defeats Orca. 
We are the ones who manage VMs. | skipped — saves €1k+/mo | +| **Operational Support Platform** | €759–€1,479/mo. Massive overkill until late stage. | skipped | + +### GPU instances (separate concern) +LiteLLM today is a passthrough. If we ever self-host an inference model: +- **L40S (24 GB GPU RAM)**: €1,309/mo On-Demand, €1,086 (12M), €877 (24M) +- **H100 NVL (94 GB)**: €5,755/mo On-Demand, €4,637 (12M), €3,743 (24M) + +For now: route LLM calls through LiteLLM → external provider. Add GPU only if a customer pays for dedicated inference. + +--- + +## 3. Per-VM sizing — Locked topology (Tier A, 5 customers) + +Flavor mapping from `INFRASTRUCTURE.md §1` to SysEleven `m2` General Purpose (1 vCPU : 4 GiB RAM, 50 GiB ephemeral root included). + +### Compute — all four pricing modes side by side + +| VM | Env | Flavor | vCPU | RAM | On-Demand | 12M | 36M no-upfront | 36M upfront | +|---|---|---|---:|---:|---:|---:|---:|---:| +| stage | stage | m2.small | 2 | 8 GB | 72.42 | 49.20 | 36.00 | 32.00 | +| vm-edge | prod | m2.small | 2 | 8 GB | 72.42 | 49.20 | 36.00 | 32.00 | +| vm-control | prod | m2.medium | 4 | 16 GB | 144.83 | 98.40 | 72.00 | 64.00 | +| vm-data | prod | m2.medium | 4 | 16 GB | 144.83 | 98.40 | 72.00 | 64.00 | +| **TOTAL** | | | **12** | **48 GB** | **434.50** | **295.20** | **216.00** | **192.00** | + +**36M upfront one-time cost:** 192 × 36 = **€6,912 net** at signing (compute only; everything else billed monthly). + +**Reference per-GiB-RAM rates** (the linear model behind all numbers above): +| Mode | €/GiB-RAM/mo | +|---|---:| +| On-Demand | 9.05 | +| 12M commit | 6.15 | +| 36M no-upfront | 4.50 | +| 36M upfront | 4.00 | + +Any future sizing change can be sanity-checked as `RAM × rate`. 
+ +### Storage — Tier A steady state + +| Item | GiB | €/GiB/mo | €/mo | +|---|---:|---:|---:| +| stage block (ephemeral PG + Mongo + Qdrant in-VM) | +50 | 0.10 | 5.00 | +| vm-edge block (pg-keycloak + pg-infisical + Gitea repos) | +50 | 0.10 | 5.00 | +| vm-control block (MariaDB + Stalwart spool) | +250 | 0.10 | 25.00 | +| vm-data block (MongoDB + pg-app + Qdrant + MinIO) | +500 | 0.10 | 50.00 | +| Object storage — geo-redundant backups (DUS2↔HAM1) | ~500 | 0.0496 | 25.00 *(€12.50 first 6mo via launch discount)* | +| Object storage — seed bundles + exports + audit archive | ~100 | 0.02 | 2.00 | +| **Storage subtotal (steady state)** | | | **112.00** | +| **Storage subtotal (first 6 months)** | | | **99.50** | + +### Network + +| Item | €/mo | +|---|---:| +| 1 Floating IP (vm-edge — only public host in prod) | 2.92 | +| 1 Floating IP (stage — public for tester access) | 2.92 | +| PowerDNS (self-hosted on vm-edge) | 0 | +| Octavia Load Balancer (deferred to Tier D HA phase) | 0 | +| **Network subtotal** | **5.84** | + +> The TL;DR table in §1 assumes 1 Floating IP (€2.92). The tables from here on use **€5.84**, i.e. stage running with its own public IP (recommended) — a delta of €2.92/mo on every monthly total. 
+ +### Combined Tier A — four-mode summary + +| Mode | Compute | Storage | Network | **Total net €/mo** | + 19% VAT | **Annual gross €** | +|---|---:|---:|---:|---:|---:|---:| +| On-Demand | 434.50 | 112 | 5.84 | **552.34** | 657.28 | **7,887** | +| 12M commit | 295.20 | 112 | 5.84 | **413.04** | 491.52 | **5,898** | +| 36M no-upfront | 216.00 | 112 | 5.84 | **333.84** | 397.27 | **4,767** | +| 36M upfront | 192.00 | 112 | 5.84 | **309.84** | 368.71 | **4,425** | + +### Recommended cash plan — Year 1 (use this line in the pitch) + +| Months | Mode | €/mo (net) | Subtotal € | +|---|---|---:|---:| +| 1–3 (rightsizing window) | On-Demand | 552.34 | 1,657 | +| 4–12 (proven baseline) | 36M upfront | 309.84 | 2,789 | +| **Year-1 infra net** | | | **4,446** | +| + 19% VAT | | | **5,291** | +| + one-time 36M upfront in Month 4 | (compute)| | **6,912** | +| **Year-1 cash out (gross)** | | | **12,203** | + +### 3-year cumulative (full 36M commitment term) + +| Item | € | +|---|---:| +| Months 1–3 On-Demand (compute+storage+net) | 1,657 | +| Compute 36M upfront (paid Month 4) | 6,912 | +| Storage + network, 36 months × ~118 €/mo | 4,248 | +| **3-year infra net** | **12,817** | +| + 19% VAT | **15,252** | + +--- + +## 4. Growth tiers — what scales when + +### Tier A — Pilot (5 customers, first 6 months) +- **Locked topology**: 4 VMs (stage + vm-edge + vm-control + vm-data). See INFRASTRUCTURE.md §1. +- **Year 1 cash plan**: 3 months On-Demand → 36M upfront. ~€310/mo committed compute+storage+net + one-time €6,912. +- **Add**: Self-Service support (free). Skip LB, DNSaaS, DDoS, DBaaS, MetaKube, Managed Services. + +### Tier B — Early growth (50–200 customers, Year 1) +- **Vertical scale only.** Bump vm-data m2.medium → m2.large (+€64/mo for 36M upfront). +- **Add cold-standby vm-edge-spare** (€0 idle, only billed during a swap event). +- **Add Business Support** (€185/mo) once MRR > €5k. +- **Add LB Single Instance** (€14.60/mo) when we want zero-downtime portal deploys. 
+- **Add DDoS Guard PLUS** (€875/mo) before any marketing push. +- Estimated total: **~€1,100–1,400/mo + VAT**. + +### Tier C — Scale (500–1000 customers, Year 1–2) +- **Split vm-data** into vm-data + vm-data-db (move pg-app to its own VM; resolves RISK-1). + - Alternative: move pg-registry to DBaaS m2.small cluster (3 inst, 36M upfront): **€213/mo** +- **Split vm-control** into vm-control + vm-ops (ERPNext + MariaDB + Stalwart go to vm-ops): **+€64/mo** +- **HA edge**: second vm-edge, switch Floating IP → Load Balancer Double Instance (**€58/mo**). +- **Object storage growth:** audit logs, exports, demo backups → estimated 2 TB = **€40/mo**. +- Estimated total: **~€2,000–2,500/mo + VAT**. + +### Tier D — Full scale (2000 customers, Year 2–3) +- **3-node clusters** on hot paths: vm-control × 2, vm-data × 2. +- **Split vm-edge** into vm-edge + vm-identity + vm-secrets (back toward original 7-VM design). +- **DBaaS m2.medium cluster** (4V/16GB, 36M upfront): **€426/mo** for tenant_registry. +- **Keycloak HA cluster**: 2 vm-identity (m2.medium) + Postgres replica. +- **Priority Support** (€545/mo) becomes worth it. +- **Object storage:** ~5 TB = **€100/mo**. +- **DDoS Guard PREMIUM** (€2,200/mo) if traffic warrants — likely stays on PLUS. +- Estimated total: **€4,500–6,000/mo + VAT**. + +### Compute scaling cheat sheet (vs locked topology) + +| Tier | Customers | Topology delta from Tier A | Compute €/mo (36M upfront) | +|---|---:|---|---:| +| **A** | 5 | locked baseline: stage + 3 prod VMs (48 GiB) | **192** | +| **B** | 200 | + vm-data bumped m2.med → m2.large (+16 GiB) | **256** | +| **C** | 1000 | + split vm-data (+16 GiB), split vm-control (+16 GiB) | **384** | +| **D** | 2000 | + split vm-edge (1 → 3 VMs), HA clusters (~+90 GiB) | **~640** | + +The **€4/GiB-RAM/mo rate** (GP, 36M upfront) is the linear model — everything else (storage, network, support, DBaaS, DDoS) scales sub-linearly with customer count. Compute is never the bottleneck on the bill. 
+ +--- + +## 5. Cost per customer + +| Tier | Customers | Monthly infra net (€) | Per customer/month (€) | +|---|---:|---:|---:| +| A | 5 | 310 | **62.00** | +| B | 200 | 1,200 | **6.00** | +| C | 1000 | 2,300 | **2.30** | +| D | 2000 | 5,000 | **2.50** | + +At Tier A the per-customer cost is irrelevant — fixed costs dominate. From Tier B onward our gross margin on a Professional plan (assume €99/customer/month) is **~94%** infrastructure-only. Add LLM passthrough (LiteLLM) + Polar.sh fees (~5%) + on-call time, and we are still well above the 80% gross margin floor SaaS investors look for. + +**Break-even: ~4 paying customers at €99/mo covers Tier A infra (€310/mo net).** + +--- + +## 6. SysEleven services we explicitly skip and why + +| Service | Why skip | +|---|---| +| DNSaaS (€10/zone) | We self-host PowerDNS on vm-edge. €0 marginal cost since vm-edge exists anyway. | +| MetaKube Core | Orca already orchestrates our containers. MetaKube would mean abandoning Orca, which the user owns. | +| MetaKube Accelerator | Same — competes with Orca. | +| MetaKube Operator add-ons (ExternalDNS, Cert-Manager, Tideways, Velero etc. at €78–171/mo each) | We pick and roll our own per [[self-hosted-oss-first]]. | +| Managed VM (Business €128–142/mo per VM, Priority €164–182) | Defeats Orca. We are the operators. Saves ~€0.5k/mo at the locked 4 VMs (€1k+/mo at the original 7). | +| Operational Support Platform (€759–1,479/mo) | Massively over-specified for our scale. Buy individual Engineering Support days (€1,264/day) on demand if a real incident requires it. | +| DDoS Guard PREMIUM (€2,200) / ENTERPRISE (€4,800) | PLUS at €875/mo is enough for ≤500-customer scale. Upgrade if we see actual 1+ Tbps attacks. | +| Block Storage for Databases (€0.09 vs €0.10) | The €0.01/GiB difference saves ~€5/mo at our scale. Use it only on DBaaS cluster volumes (where SysEleven enforces it anyway). | + +--- + +## 7. 
Negotiation levers + +SysEleven publishes list prices but is open to commercial negotiation, especially as a German Mittelstand provider courting startups. Things worth asking for: + +1. **Startup credits.** Hetzner, OVH, and most EU clouds run startup-credit programs. Ask SysEleven for the equivalent before signing the 36M commit. Even €5–10k of credits = 6–12 months of Tier A infra free. +2. **EXIST / HTGF discount.** If we close the €1.5M raise (`project_breakpilot_fundraising`), SysEleven sometimes offers "Gründerförderung" pricing for HTGF-backed companies. +3. **Single-region discount.** We don't need DUS2 + HAM1 geo-redundancy at Tier A. Ask if single-region (DUS2 only) is cheaper. +4. **Object storage commitment.** 6-month 50%-off on geo-redundant storage applies anyway, but bulk commitments on regular S3 may unlock further pricing. +5. **Bundled support.** If we commit to 36M IaaS + Business Support, ask for support fee waiver in year 1. +6. **Move-in incentive.** Negotiate a setup/migration credit covering first 3 months of On-Demand burn. + +--- + +## 8. Open questions / things to validate + +- **Port 25 outbound from vm-ops.** Confirmed with SysEleven that outbound SMTP is allowed by default; if not, fall-back is to relay through Postal/Postmark for transactional only. +- **Region choice.** DUS2 vs HAM1 — DUS2 is the only region for L40S GPUs, HAM1 has A30. If we never self-host inference, region is purely a latency choice (DUS2 closer to most EU customers). +- **Geo-redundant Ceph backups.** Currently planning local block + S3 backup. Could also use SysEleven's geo-redundant S3 (DUS2 ↔ HAM1) for true DR. Cost: €0.05/GiB/mo vs €0.02 single-region. At 500GB backup that's €15/mo extra — buy it. +- **Egress traffic.** Fair Use policy — they reserve the right to bill if we exceed normal patterns. CERTifAI LLM passthrough could be heavy. Ask for clarification on what triggers metered billing. +- **VPN-as-a-Service inclusive.** Confirmed in the pricing doc. 
Use it for ops access — replaces our need to build IP-allowlists into Orca-Proxy for `erp.` and `git.`. + +--- + +## 9. Recommendation summary + +1. **Sign On-Demand for first 90 days.** Burn ~€552/mo while you find the right flavor for each VM. +2. **At Day 90, commit 36M upfront on proven baselines.** Cuts monthly to ~€310. +3. **Keep the 4 locked VMs separate (stage + vm-edge + vm-control + vm-data).** The saving from consolidating further is not worth losing failure isolation. +4. **Skip every Managed Service.** We have Orca. +5. **Add Business Support at €5k MRR, DDoS PLUS before any public marketing push.** +6. **Negotiate startup credits before signing.** Could be worth months of free infra. + +--- + +*End of document. Pricing snapshot 2026-01-20; re-check before signing commitments.* diff --git a/IMPLEMENTATION_PLAN.md b/IMPLEMENTATION_PLAN.md new file mode 100644 index 0000000..5595548 --- /dev/null +++ b/IMPLEMENTATION_PLAN.md @@ -0,0 +1,714 @@ +# Implementation Plan — Breakpilot Platform + +Companion to `PLATFORM_ARCHITECTURE.md`, `INFRASTRUCTURE.md`, and `PRODUCT_INTEGRATION_SPEC.md`. + +This is the build plan for an AI coding agent (Claude Code, executing PRs against the listed repos). Each milestone is sized to fit in 1–3 PRs, ships independently, and leaves the system in a working state. + +--- + +## 0. How to read this document + +- Milestones are named `M{phase}.{n}` and grouped by phase. +- Each milestone has: **Goal**, **Depends on**, **Repos/files**, **Deliverables**, **Acceptance**, **Tests**, **Gate**, **Effort** (S = ≤1 day, M = 2–4 days, L = ≥1 week). +- "Gate" is who/what approves the PR for merge. Standard is 1 human reviewer + green CI; some milestones add a manual sign-off. +- Phases are ordered; milestones within a phase can be parallelised where dependencies allow. +- The dependency graph at §11 is the source of truth — when in doubt, read it. + +--- + +## 1. 
Cross-cutting conventions (apply to every PR in every repo) + +### 1.1 Repo strategy +Polyrepo under a new Gitea org `gitea.meghsakha.com/platform/`. One repo per deployable unit. Existing product repos stay where they are. + +**Repos to create:** + +| Repo | Purpose | Created in | +|---|---|---| +| `platform/orca-platform` | IaC for VMs, Orca manifests, DNS, TLS, backups | M1.1 | +| `platform/tenant-registry` | Go service: tenant glue, audit, API keys | M4.1 | +| `platform/portal` | Next.js 15: customer area + backstage | M5.1 | +| `platform/docs` | Architecture, integration spec, this plan, runbooks | M0.1 | +| `platform/seed-data` | Demo tenant fixtures per product | M13.1 | +| `platform/design-tokens` | CSS variables / fonts (consumed by product web comps) | M5.1 | + +**Existing repos that get changes (no new repos):** +- `benjamin_boenisch/certifai` — M6.1 / M6.2 / M6.3 +- `benjamin_boenisch/breakpilot-compliance` — M7.1 / M7.2 + +### 1.2 Per-repo scaffolding (must exist before any feature work) +Every new repo lands in M0.1 with: + +``` +/README.md what this repo is, how to run, links to architecture +/CONTRIBUTING.md branch model, commit format, how to open a PR +/CODEOWNERS at least one mandatory reviewer (us) +/.gitea/ + /pull_request_template.md + /issue_template/ + bug.md + feature.md +/.gitea/workflows/ + ci.yaml fmt → lint → test → build (per-language details in M0.2) + release.yaml on tag: build image, push to registry +/CHANGELOG.md generated from conventional commits +/LICENSE MIT for portal/docs; Apache-2.0 for libraries +``` + +### 1.3 Branch + commit conventions +- **Trunk-based.** `main` is always deployable. Feature branches: `feat/`, `fix/`, `chore/`. Max lifetime 5 days. +- **Conventional Commits** (`feat:`, `fix:`, `chore:`, `docs:`, `refactor:`, `test:`, `breaking!:`). Enforced by `commitlint` in CI. +- **Squash-merge** to main. PR title becomes the commit message. +- **Direct push to main is blocked** by Gitea branch protection. 
+ +### 1.4 PR template (in every repo) +```markdown +## What +<1-3 bullets> + +## Why + + +## How + + +## Test plan +- [ ] unit +- [ ] integration (if API surface changed) +- [ ] e2e (if user-facing flow changed) +- [ ] manual smoke on stage + +## Risk + + +## Linked milestone +M{phase}.{n} +``` + +### 1.5 CI checks (required before merge, configured in M0.2) +Per language defaults: + +| Stack | Required checks | +|---|---| +| Go | `gofmt -l` (no diff), `go vet`, `golangci-lint`, `go test ./...`, `go build` | +| Rust | `cargo fmt --check`, `cargo clippy -- -D warnings`, `cargo test -j8` | +| TypeScript | `pnpm lint`, `pnpm typecheck`, `pnpm test`, `pnpm build` | +| Python | `ruff check`, `ruff format --check`, `mypy`, `pytest` | +| All | `commitlint`, image build, container scan (`trivy`), SBOM upload | + +### 1.6 Approval gates +- **Standard gate** (most milestones): 1 human reviewer approves + all CI checks green. Enforced by Gitea branch protection on `main`. +- **CODEOWNERS** auto-requests the right reviewer based on path. +- **Production-promotion gate** (release tags only): manual sign-off by `@sharang` on the release issue + stage soak ≥ 24h. +- **Security gate** (M2.x, M4.x, M14.x): security checklist in PR body completed. + +### 1.7 Versioning + release strategy +- **Semver** per repo. Container images carry **three tags**: `:sha-<short-sha>`, `:v1.4.2`, `:env-stage` / `:env-prod`. +- **Stage** auto-deploys on every merge to `main` (Gitea Actions → Orca apply against `stage` cluster). +- **Production** deploys only when a release tag `vX.Y.Z` is created. Tag creation requires the production-promotion gate. +- **Rollback**: `orca rollout undo <service>` flips back to previous image tag. RTO target ≤ 5 min for any single service. +- **Database migrations** are forward-only and run as an init container before the service starts. Migrations that delete columns require two releases (1: stop writing, 2: drop). 
+ +### 1.8 Environments +Three Orca clusters, all on the same hardware until volume justifies separation: + +| Env | Cluster name | Purpose | Data | Auto-deploy? | +|---|---|---|---|---| +| dev | local | Developer machine, docker-compose | fixtures | n/a | +| stage | `orca-stage` | Pre-prod validation | seeded demo + synthetic customers | yes (on merge to main) | +| prod | `orca-prod` | Live customer traffic | real | tag + gate | + +Domain pattern: +- dev: `*.localhost` (mkcert) +- stage: `*.stage.yourplatform.com` +- prod: `*.yourplatform.com` + +### 1.9 Observability + audit +- **SigNoz** (already running at `signoz.meghsakha.com`) for traces, logs, metrics. Every service ships OTel SDK from day one. +- **Audit events** in the Retraced-shape schema (PRODUCT_INTEGRATION_SPEC.md §8.4) emitted to Tenant Registry `/audit` from every service. Required for every state-changing endpoint. +- **Structured logs** (JSON) only. No `fmt.Println` / `console.log` in committed code; CI rejects. + +### 1.10 Secrets +- Infisical machine identity per service, path `/{env}/{service}/`. +- The only secret allowed in an Orca env file is the Keycloak DB URI (bootstrap exception — see INFRASTRUCTURE.md). +- CI scans for committed secrets via `gitleaks`. Failures block merge. + +### 1.11 Testing policy (mandatory; see also `feedback_testing_everything`) +- **Unit**: every non-trivial function. +- **Integration**: every API endpoint, against real Postgres/MongoDB via `testcontainers`. No mock databases. +- **E2E**: every user-facing flow has at least one Playwright spec running against stage post-deploy. +- **Regression**: when a bug is fixed, a failing test is added FIRST, then the fix. +- **No PR ships without tests.** "Manual tested" is not acceptable except for IaC. + +### 1.12 A/B testing (designed for, adopted later) +Every place where a future flag would gate behaviour MUST flow through a single `featureFlags.evaluate(tenantId, flagKey)` function. 
Initial implementation returns hard-coded values from `manifest.yaml`. Swap to Unleash/OpenFeature in M19.1 with zero call-site changes. + +--- + +## 2. Phase 0 — Foundations (M0.x – M3.x) + +**Goal:** Repos exist, CI works, infra is provisioned and observable, identity + secrets are usable. No customer-visible features yet. + +### M0.1 — Bootstrap repos and docs +- **Depends on:** nothing +- **Repos:** `platform/docs`, `platform/orca-platform`, `platform/portal`, `platform/tenant-registry`, `platform/design-tokens`, `platform/seed-data` +- **Deliverables:** create the Gitea `platform` org; for each repo add the §1.2 scaffolding; `platform/docs` ingests the existing `PLATFORM_ARCHITECTURE.md`, `INFRASTRUCTURE.md`, `PRODUCT_INTEGRATION_SPEC.md`, this plan. +- **Acceptance:** every repo has a working `README.md`, `CONTRIBUTING.md`, `CODEOWNERS`, PR template. +- **Tests:** n/a +- **Gate:** standard +- **Effort:** S + +### M0.2 — CI templates + branch protection +- **Depends on:** M0.1 +- **Repos:** all of the above +- **Deliverables:** `.gitea/workflows/ci.yaml` per repo (matching §1.5 by stack), Gitea branch protection on `main` (require PR, 1 review, status checks green, no direct push), `commitlint`, `gitleaks`, `trivy` configured. +- **Acceptance:** a deliberately-broken PR is rejected by every check; a clean PR is mergeable. +- **Tests:** smoke PR per repo demonstrating green CI. +- **Gate:** standard +- **Effort:** S + +### M0.3 — Self-hosted DNS + wildcard TLS +- **Depends on:** M1.2 (vm-edge must exist before PowerDNS lands) +- **Repos:** `platform/orca-platform` +- **Deliverables:** + - **PowerDNS Authoritative** on `vm-edge` (Orca-managed). PostgreSQL backend on same VM (small; ~100 records). + - At the registrar (Benjamin's account): set `ns1.yourplatform.com` and `ns2.yourplatform.com` glue records pointing at vm-edge public IP; delegate the domain to those NS. 
+ - Zone file committed in `orca-platform/dns/yourplatform.com.zone`; Orca syncs into PowerDNS on apply. + - Records: apex `yourplatform.com`, wildcards `*.yourplatform.com` + `*.stage.yourplatform.com`, plus `auth.`, `erp.`, `mcp.`, `cdn.`, `mail.`, `ns1.`, `ns2.`, SPF/DKIM/DMARC TXT records (for M3.2). + - Wildcard TLS via Let's Encrypt **DNS-01 against PowerDNS** (Lego's `--dns=pdns` provider); ACME credentials in Infisical at `/prod/orca-proxy/PDNS_API_KEY`. + - Orca-Proxy reloads the cert via watch on the secret file; renewal cron runs at 02:00 daily. +- **Acceptance:** `dig @1.1.1.1 anything.yourplatform.com` returns an answer; `curl https://anything.yourplatform.com` returns 404 from Orca-Proxy (no TLS error). +- **Tests:** ACME renewal dry-run; PowerDNS zone-diff check in CI; reach via stage and prod subdomains; cert expiry page wired to SigNoz alert. +- **Gate:** standard + manual DNS-delegation check by both founders (irreversible from registrar side without 24–48h propagation) +- **Effort:** M (was S — registrar delegation + PowerDNS adds setup time vs. Cloudflare) + +### M1.1 — `orca-platform` repo (IaC) +- **Depends on:** M0.1, M0.2 +- **Repos:** `platform/orca-platform` +- **Deliverables:** directory layout per `INFRASTRUCTURE.md`; one Orca manifest per VM × service; per-env overlays (`overlays/dev`, `overlays/stage`, `overlays/prod`); a `Makefile` with `make plan` / `make apply` per env. +- **Acceptance:** `make plan ENV=stage` produces a no-op diff once applied. +- **Tests:** `orca validate` runs in CI; PRs that break a manifest fail. 
+- **Gate:** standard +- **Effort:** M + +### M1.2 — Provision VMs (locked topology) +- **Depends on:** M1.1 (Orca manifest layout) +- **Repos:** `platform/orca-platform` +- **Deliverables:** the **4 VMs** from `INFRASTRUCTURE.md §1` provisioned on SysEleven (DUS2): + - **stage** (m2.small, public IP) — runs app-plane code only, calls prod KC + Stalwart + - **vm-edge** (m2.small, public IP) — Identity + Infra planes (orca-proxy, PowerDNS, Keycloak, pg-keycloak, Infisical, pg-infisical, Gitea) + - **vm-control** (m2.medium) — Control plane (portal, tenant-registry, ERPNext, Frappe HD, MariaDB, Stalwart) + - **vm-data** (m2.medium) — Data plane (CERTifAI, MongoDB, LiteLLM, compliance ×3, pg-app, Qdrant, MinIO) + - Private network 10.0.0.0/16 between all four. Public ingress only via vm-edge (and stage's own IP for tester access). + - SSH disabled; only `orca exec` for shell access. +- **Acceptance:** every VM reachable from Orca control plane; private-network connectivity verified; resource limits per service set in manifest per `INFRASTRUCTURE.md §6` co-tenant notes. +- **Tests:** cold-start sequence from `INFRASTRUCTURE.md §10 Scenario F` runs successfully on stage VMs. +- **Gate:** standard + manual sign-off (touches infra spend and 36M commitment decision) +- **Effort:** M +- **Cost impact:** see COST_PLAN.md §3. Initial run: ~€552/mo On-Demand, dropping to ~€310/mo after 36M-upfront commit in Month 4. + +### M1.3 — Backups, monitoring, on-call +- **Depends on:** M1.2 +- **Repos:** `platform/orca-platform` +- **Deliverables:** backup cron per VM per `INFRASTRUCTURE.md §3` (Postgres pg_dump, MinIO bucket replication); SigNoz OTel collector running on every VM; alert routing to `oncall@yourplatform.com`; restore runbook in `platform/docs/runbooks/restore.md`. +- **Acceptance:** restore drill on stage succeeds (script in `platform/orca-platform/scripts/restore-drill.sh`); SigNoz shows traces from a synthetic request. 
+- **Tests:** disaster-recovery exercise per failure scenario in `INFRASTRUCTURE.md §10` — at least Scenarios A, B, F validated on stage. +- **Gate:** standard + manual sign-off +- **Effort:** L + +### M2.1 — Keycloak deployment +- **Depends on:** M1.2, M1.3 +- **Repos:** `platform/orca-platform` +- **Deliverables:** Keycloak 26 on `vm-edge`, with its Postgres backing store (`pg-keycloak`) on the same VM per the locked M1.2 topology, exposed at `auth.yourplatform.com` and `auth.stage.yourplatform.com`. Realm import file in `orca-platform/keycloak/realm-export.json` (committed, source-of-truth). +- **Acceptance:** master admin login works; realm `breakpilot-prod` exists in both envs. +- **Tests:** automated realm-state diff in CI (`kcadm` against checked-in export). +- **Gate:** standard + security checklist +- **Effort:** M + +### M2.2 — Realm configuration: roles + protocol mappers + Organizations +- **Depends on:** M2.1 +- **Repos:** `platform/orca-platform` (realm config) +- **Deliverables:** Organizations feature enabled; realm roles `BREAKPILOT_ADMIN`, `SUPPORT_ENGINEER`, `SALES_REP`; org roles `IT_ADMIN`, `CXO`, `FINANCE`, `LEGAL`, `USER`; protocol mapper that calls Tenant Registry at token issuance for `products`, `plan`, `tenant_status` claims; SALES_REP guardrail policy (token only issuable with `org_id = demo`). +- **Acceptance:** a test user gets the expected JWT claims; a SALES_REP user cannot get a JWT for a non-demo org (verified by integration test). +- **Tests:** Keycloak integration suite in `platform/tenant-registry/test/keycloak_test.go`. +- **Gate:** standard + security checklist +- **Effort:** M + +### M3.1 — Infisical +- **Depends on:** M1.2 +- **Repos:** `platform/orca-platform` +- **Deliverables:** Infisical on `vm-edge` (alongside `pg-infisical`, per the locked M1.2 topology), machine identity per service, secret paths laid out per `PRODUCT_INTEGRATION_SPEC.md §9.4`. +- **Acceptance:** a stub service can read its secrets at startup; rotating a secret in Infisical UI is picked up on next pod start. 
+- **Tests:** smoke test container reads secrets. +- **Gate:** standard + security checklist +- **Effort:** S + +### M3.2 — Stalwart transactional email +- **Depends on:** M0.3 (needs DNS records under our control), M3.1 +- **Repos:** `platform/orca-platform` +- **Deliverables:** + - **Stalwart** on `vm-control` (Orca-managed); reachable at `mail.yourplatform.com`. + - DNS records added to the zone in M0.3: `mail` A record, MX → mail, SPF (`v=spf1 mx -all`), DKIM (Stalwart-generated public key), DMARC (`p=quarantine; rua=mailto:dmarc@yourplatform.com`), reverse DNS (PTR) configured at the cloud provider for the vm-control public IP — coordinate with vm-edge since outbound mail must egress from a host with a clean PTR. + - SMTP submission service account per platform sender: `noreply@`, `oncall@`, `support@`, `billing@`, `dmarc@`. + - Outbound queue and bounce handler; failed deliveries surface as audit events. + - Webhook receiver at `/inbound/postmaster` for bounce/complaint feedback loops (Gmail FBL, MS SNDS). + - **IP warming plan**: write a `platform/docs/runbooks/email-warming.md` documenting the 4–8 week ramp from low daily volumes; first 2 weeks of trial nudges (M12.2) explicitly throttled. +- **Acceptance:** test email from `noreply@yourplatform.com` to `parnerkarsharang@gmail.com` lands in inbox (not spam) on day 1; SPF/DKIM/DMARC all "pass" in Gmail's "show original" view; mail-tester.com score ≥ 9/10. +- **Tests:** automated daily mail-tester check (failure pages on-call); bounce-handling integration test. +- **Gate:** standard + security checklist + manual deliverability sign-off (DKIM keys are load-bearing) +- **Effort:** L (deliverability tuning is the long tail) + +**Phase 0 exit criteria:** +- Stage cluster boots cold from cron-driven nightly stop/start using only `INFRASTRUCTURE.md §5` ordering. +- A synthetic HTTPS request to `https://hello.stage.yourplatform.com` reaches a stub container. +- Restore drill on stage Postgres succeeds end-to-end. 
+ +--- + +## 3. Phase 1 — Control plane core (M4.x – M5.x) + +**Goal:** Tenant Registry stores tenants; the portal authenticates a user and resolves their tenant. No products surfaced yet. + +### M4.1 — Tenant Registry: schema + migrations +- **Depends on:** M1.2, M2.2 +- **Repos:** `platform/tenant-registry` +- **Deliverables:** Go service scaffold; `golang-migrate` migrations for `tenants`, `tenant_projects`, `tenant_products`, `tenant_idp_config`, `api_keys`, `audit_log` per `PLATFORM_ARCHITECTURE.md §5c`; the `tenant.status` enum + `tenant.kind` column from the lifecycle spec. +- **Acceptance:** `make migrate-up` on a fresh Postgres produces the documented schema. +- **Tests:** migration up/down round-trip via `testcontainers-go`. +- **Gate:** standard +- **Effort:** M + +### M4.2 — Tenant Registry: REST API +- **Depends on:** M4.1 +- **Repos:** `platform/tenant-registry` +- **Deliverables:** OpenAPI 3.1 spec at `/openapi.yaml`; endpoints `POST /tenants`, `GET /tenants/:id`, `POST /tenants/:id/activate`, `POST /tenants/:id/cancel`, `GET /catalog`, `POST /catalog/request`, `POST /catalog/trial-request`, `POST /api-keys`, `POST /internal/api-keys/verify`, `POST /audit`, `GET /audit`. +- **Acceptance:** every endpoint passes the OpenAPI contract test; returns documented errors for invalid input. +- **Tests:** integration tests against real Postgres for every endpoint. +- **Gate:** standard +- **Effort:** L + +### M4.3 — Tenant Registry: Keycloak adapter +- **Depends on:** M4.2, M2.2 +- **Repos:** `platform/tenant-registry` +- **Deliverables:** package `internal/keycloak` that creates orgs, invites IT_ADMIN users, sets realm roles, and serves the protocol-mapper claims endpoint (the URL Keycloak hits during token issuance from M2.2). +- **Acceptance:** creating a tenant via `POST /tenants` provisions a Keycloak org and one IT_ADMIN user; user receives invite email. +- **Tests:** integration test against the stage Keycloak. 
+- **Gate:** standard +- **Effort:** M + +### M5.1 — Portal scaffold: subdomain routing + OIDC login +- **Depends on:** M2.2, M4.3, M0.3 +- **Repos:** `platform/portal`, `platform/design-tokens` +- **Deliverables:** Next.js 15 app on `vm-control`; middleware reads `Host` → extracts slug → calls Tenant Registry `GET /tenants?slug=` → injects tenant context; Keycloak OIDC login; logout; `design-tokens` package consumed by portal. +- **Acceptance:** visiting `https://acme.stage.yourplatform.com` redirects to Keycloak; after login, user lands on `/acme/dashboard` (empty page) with valid session. +- **Tests:** Playwright e2e: login + logout for an existing test tenant. +- **Gate:** standard +- **Effort:** M + +### M5.2 — Portal: dashboard + backstage shells +- **Depends on:** M5.1 +- **Repos:** `platform/portal` +- **Deliverables:** customer dashboard route `/[slug]/dashboard` (renders product tiles from JWT `products` claim — empty initially), backstage routes per `PLATFORM_ARCHITECTURE.md §5a` skeleton, RBAC enforcement (§5a "Operating principles" — hide what user can't access), session refresh. +- **Acceptance:** user with `org_roles=[USER]` cannot see settings or billing links; backstage routes return 403 for non-`BREAKPILOT_ADMIN` users. +- **Tests:** Playwright spec per role × route matrix. +- **Gate:** standard +- **Effort:** M + +### M5.3 — Playwright e2e harness +- **Depends on:** M5.2 +- **Repos:** `platform/portal` +- **Deliverables:** Playwright config that runs against `stage.yourplatform.com` post-deploy; CI job `e2e-stage` triggered after stage deploy; failure pages on-call. +- **Acceptance:** breaking change to login is caught in CI within 10 min of merge. +- **Tests:** the suite itself. +- **Gate:** standard +- **Effort:** S + +**Phase 1 exit criteria:** +- A tenant created via `POST /tenants` results in a working login flow at `.stage.yourplatform.com`. +- All Phase 1 routes have a passing Playwright spec running on every stage deploy. + +--- + +## 4. 
Phase 2 — Existing product uplift (M6.x – M7.x, parallel) + +**Goal:** CERTifAI and breakpilot-compliance both honour the JWT contract and surface a real product tile in the portal. + +### M6.1 — CERTifAI: org_id scoping at DB layer +- **Depends on:** M2.2 +- **Repos:** `benjamin_boenisch/certifai` +- **Deliverables:** MongoDB middleware that requires `org_id` on every query; backfill script for existing collections; per-tenant collection-level role checks (`IT_ADMIN` → Admin, etc.). +- **Acceptance:** integration test attempting a cross-tenant read returns `403`; existing single-tenant flows still work for tenant `default`. +- **Tests:** unit + integration; regression tests for every existing controller. +- **Gate:** standard + security checklist +- **Effort:** L (4–6 weeks per prior gap analysis) + +### M6.2 — CERTifAI: JWT validation + role mapping +- **Depends on:** M6.1 +- **Repos:** `benjamin_boenisch/certifai` +- **Deliverables:** Keycloak JWKS validation middleware; role mapping per `PLATFORM_ARCHITECTURE.md §6`; tenant_status middleware (returns 402 on writes when `frozen`, 410 when `archived`, allows demo with no metering). +- **Acceptance:** all four `tenant.status` states behave per spec; tested against a stage Keycloak. +- **Tests:** integration tests per status value. +- **Gate:** standard + security checklist +- **Effort:** M + +### M6.3 — CERTifAI: manifest + integration assets +- **Depends on:** M6.2 +- **Repos:** `benjamin_boenisch/certifai` +- **Deliverables:** `product.manifest.yaml` per `PRODUCT_INTEGRATION_SPEC.md §10` published to `cdn.yourplatform.com`; OpenAPI 3.1 spec; `/v1/health`, `/v1/usage`, `/v1/tenants/:id/export`, `DELETE /v1/tenants/:id/data`, `POST /v1/tenants/demo/reset`; web component `certifai-dashboard` per §5.A. +- **Acceptance:** CERTifAI appears in the portal catalog; subscribed tenants can open it from the dashboard. 
+- **Tests:** contract test that manifest validates against schema; web component renders inside portal shadow-DOM host. +- **Gate:** standard +- **Effort:** L + +### M7.1 — Compliance: JWT validation upgrade +- **Depends on:** M2.2 +- **Repos:** `benjamin_boenisch/breakpilot-compliance` +- **Deliverables:** Next.js proxy validates JWT against Keycloak JWKS (replacing today's `X-Tenant-ID` trust); tenant_status middleware as in M6.2. +- **Acceptance:** spoofing `X-Tenant-ID` without a JWT returns 401; valid JWT for tenant A cannot read tenant B data. +- **Tests:** integration tests for both auth and status states. +- **Gate:** standard + security checklist +- **Effort:** M (3–5 weeks per prior gap analysis) + +### M7.2 — Compliance: manifest + integration assets +- **Depends on:** M7.1 +- **Repos:** `benjamin_boenisch/breakpilot-compliance` +- **Deliverables:** same endpoint set as M6.3; web component (existing React → `@r2wc/react-to-web-component` per §5.A); manifest with `supports_projects: true` (already implemented). +- **Acceptance:** compliance appears in portal catalog; opens from dashboard; project switching works inside the product. +- **Tests:** as M6.3. +- **Gate:** standard +- **Effort:** M + +**Phase 2 exit criteria:** +- A real tenant on stage can subscribe to both products and use them through the portal. +- Cross-product audit at `/[slug]/audit` shows events from both products in the Retraced schema. + +--- + +## 5. Phase 3 — Business operations (M8.x – M9.x) + +**Goal:** ERPNext and Frappe HD run, Sales Order → tenant activate works, tickets escalate to Gitea. + +### M8.1 — ERPNext deployment +- **Depends on:** M1.2, M2.1 +- **Repos:** `platform/orca-platform` +- **Deliverables:** Frappe + ERPNext on `vm-control` (separate Postgres database from tenant_registry — see `INFRASTRUCTURE.md` RISK-1); reached at `erp.yourplatform.com`; Keycloak OIDC; IP-restricted at Orca-Proxy. 
+- **Acceptance:** us login works; a Customer record can be created manually. +- **Tests:** smoke test for OIDC; backup of Frappe filestore validated. +- **Gate:** standard + manual sign-off (touches `vm-control` resources) +- **Effort:** M + +### M8.2 — ERPNext customization +- **Depends on:** M8.1 +- **Repos:** `platform/orca-platform/erpnext-app/` +- **Deliverables:** custom Frappe app with: `tenant_id` field on `Customer`; `sales_owner` field on `Lead`; server scripts for the Sales Order → Tenant Registry webhook; `Cancel` workflow that calls Tenant Registry `/cancel`. +- **Acceptance:** submitting a Sales Order in ERPNext triggers a tenant activation in stage Tenant Registry. +- **Tests:** server-script unit tests (Frappe test harness); integration test exercises the full webhook. +- **Gate:** standard +- **Effort:** M + +### M8.3 — Self-serve billing (Polar.sh) +- **Depends on:** M8.1, M5.2 +- **Repos:** `platform/portal`, `platform/tenant-registry` +- **Deliverables:** + - Polar.sh organization + products configured for Starter / Professional / per-seat tiers. + - Polar Checkout embedded in portal `/[slug]/billing/upgrade`. + - Webhook listener at `tenant-registry /polar/webhook` (HMAC-verified) handles `subscription.created`, `subscription.updated`, `subscription.canceled`, `order.paid` → flips `tenant.status`, mirrors the customer + invoice into ERPNext via REST. + - Polar acts as Merchant of Record — they handle EU VAT MOSS, no per-country tax registration needed for our side. + - Portal billing page reads invoices from ERPNext (single source of truth for accounting) but links out to Polar's customer portal for payment-method management. +- **Acceptance:** signing up self-serve creates a tenant, a Polar subscription, an ERPNext Customer + Invoice, and a usable login; VAT line item appears correctly on the EU customer's invoice. +- **Tests:** integration test against Polar sandbox; webhook replay test; tax calculation correct for at least DE, FR, NL, US. 
+- **Gate:** standard + security checklist +- **Effort:** L + +> **Why Polar.sh over Stripe / Lemon Squeezy:** OSS-aligned, Merchant of Record (handles EU VAT MOSS automatically), developer-first, 4% + Stripe fees vs. Lemon's 5%. Stripe direct would require us to register for VAT in 27 countries — not viable for a 2-person team. See [[self-hosted-oss-first]]. + +### M9.1 — Frappe Helpdesk +- **Depends on:** M8.1 +- **Repos:** `platform/orca-platform` +- **Deliverables:** Frappe HD on the same Frappe bench; customer portal embedded at `/[slug]/support/`. +- **Acceptance:** a customer user can submit a ticket; we receive it. +- **Tests:** Playwright spec for ticket submission. +- **Gate:** standard +- **Effort:** S + +### M9.2 — HD → Gitea escalation +- **Depends on:** M9.1 +- **Repos:** `platform/orca-platform/erpnext-app/` +- **Deliverables:** server script that on a `Ticket: Escalate to Engineering` action creates a Gitea issue in the matching repo via Gitea REST API; reverse webhook from Gitea on issue close marks ticket resolved. +- **Acceptance:** the round-trip works for a test ticket on stage. +- **Tests:** integration test against stage Gitea. +- **Gate:** standard +- **Effort:** S + +**Phase 3 exit criteria:** +- ERPNext is the source of truth for billing/CRM/HR. +- The full Lead → Quote → Sales Order → Tenant chain works on stage. + +--- + +## 6. Phase 4 — Customer UX & lifecycle (M10.x – M14.x) + +**Goal:** Every customer-facing flow from `PLATFORM_ARCHITECTURE.md` works end-to-end on stage. + +### M10.1 — Customer area: full surfaces +- **Depends on:** M5.2, M6.3, M7.2 +- **Repos:** `platform/portal` +- **Deliverables:** real implementations of `/[slug]/dashboard`, `/[slug]/products/*`, `/[slug]/projects`, `/[slug]/settings/{identity,users,api-keys,integrations}`, `/[slug]/billing`, `/[slug]/audit`, `/[slug]/support`. +- **Acceptance:** every route is implemented, RBAC-gated, with empty/loading/error states. 
+- **Tests:** one Playwright spec per route × primary role. +- **Gate:** standard +- **Effort:** L + +### M10.2 — Cross-product audit view +- **Depends on:** M10.1, M4.2 +- **Repos:** `platform/portal` +- **Deliverables:** audit page filters by product/actor/action/time; CSV + PDF export; events rendered from the Retraced-shape schema. +- **Acceptance:** a DPO-style query ("show me everything user X did across all products last month") returns in <2s for a tenant with 100k events. +- **Tests:** load test with synthetic events. +- **Gate:** standard + security checklist +- **Effort:** M + +### M11.1 — Catalog flow (P13) +- **Depends on:** M4.2, M10.1 +- **Repos:** `platform/portal`, `platform/tenant-registry` +- **Deliverables:** `/[slug]/catalog` UI per `PLATFORM_ARCHITECTURE.md` P13; "Request" button creates ERPNext CRM Lead. +- **Acceptance:** customer requests a non-subscribed product; sales sees a Lead in ERPNext with the right `sales_owner`. +- **Tests:** Playwright e2e covering the full P13 sequence. +- **Gate:** standard +- **Effort:** M + +### M12.1 — Self-serve trial (P15) +- **Depends on:** M8.3, M11.1 +- **Repos:** `platform/portal`, `platform/tenant-registry` +- **Deliverables:** public `/start` form; trial tenant provisioning (status=trial, trial_ends_at); banner; trial_quota enforcement (read by products from JWT). +- **Acceptance:** prospect signs up; trial tenant with 14-day timer exists; quota enforced. +- **Tests:** Playwright e2e signs up → uses → hits quota. 
+- **Gate:** standard +- **Effort:** M + +### M12.2 — Trial lifecycle cron + emails +- **Depends on:** M12.1, M3.2 (Stalwart must be deliverability-clean) +- **Repos:** `platform/tenant-registry` +- **Deliverables:** scheduler in tenant-registry that runs day-7/12/14 emails; status transitions trial → active (on payment) or trial → frozen → archived; SMTP via Stalwart at `mail.yourplatform.com:587`; sender `noreply@yourplatform.com`; HTML + plaintext templates committed under `tenant-registry/templates/email/`; List-Unsubscribe headers per RFC 8058. +- **Acceptance:** in a time-warped stage test (script that advances `trial_ends_at`), all transitions fire in order and all three emails land in Gmail inbox. +- **Tests:** integration test with time injection; deliverability spot-check at each release. +- **Gate:** standard +- **Effort:** M + +### M13.1 — Demo tenant seeding +- **Depends on:** M6.3, M7.2 +- **Repos:** `platform/seed-data` +- **Deliverables:** per-product fixture archives (`certifai/seed-v1.tar.gz`, `compliance/seed-v1.tar.gz`); publishing pipeline to `cdn.yourplatform.com`; `catalog.demo.seed_data_url` populated in product manifests. +- **Acceptance:** calling `POST /v1/tenants/demo/reset` on either product restores fixtures. +- **Tests:** integration test asserts fixture state after reset. +- **Gate:** standard +- **Effort:** M + +### M13.2 — Sales demo flow (P14) +- **Depends on:** M2.2, M13.1 +- **Repos:** `platform/portal`, `platform/tenant-registry` +- **Deliverables:** demo tenant created in stage and prod with `kind=demo, status=demo`; SALES_REP role usable; backstage routes restricted to `/backstage/leads` and `/backstage/demo`; demo tenant audit events tagged `{"demo": true}` and hidden from real-tenant audit views. +- **Acceptance:** sales rep logs in at `demo.yourplatform.com`, walks both products live, [Request Trial] modal creates a CRM Lead with `sales_owner = the rep`. +- **Tests:** Playwright e2e for the sales walk-through. 
+- **Gate:** standard + security checklist (SALES_REP guardrail enforcement is the load-bearing piece) +- **Effort:** M + +### M13.3 — Nightly demo reset +- **Depends on:** M13.2 +- **Repos:** `platform/tenant-registry` +- **Deliverables:** cron at 03:00 Europe/Berlin calls each product's reset endpoint; failures page on-call. +- **Acceptance:** after a deliberately-corrupted demo state, the next 03:00 reset restores fixtures. +- **Tests:** test runs the reset manually + verifies fixture state. +- **Gate:** standard +- **Effort:** S + +### M14.1 — Cancel + frozen state (P16 part 1) +- **Depends on:** M10.1, M6.2, M7.1 +- **Repos:** `platform/portal`, `platform/tenant-registry` +- **Deliverables:** cancel modal with reason + typed-confirm; status active → frozen transition; Polar subscription set to `cancel_at_period_end` (Polar.sh is the billing provider per M8.3); ERPNext Opportunity → Lost; reactivation path within 30 days. +- **Acceptance:** test customer cancels; portal switches to read-only; reactivate restores `active` status without data loss. +- **Tests:** Playwright e2e covering cancel + reactivate. +- **Gate:** standard + security checklist +- **Effort:** M + +### M14.2 — Offboarding cron + final export (P16 part 2) +- **Depends on:** M14.1, M6.3, M7.2 +- **Repos:** `platform/tenant-registry` +- **Deliverables:** day-30 cron builds final export ZIP per product, emails signed URL (7-day TTL), calls `DELETE /v1/tenants/:id/data` on every subscribed product, archives Keycloak org, marks `tenant.status = archived`. +- **Acceptance:** time-warped test runs the full P16 sequence end-to-end on stage; export ZIP contains data from both products; second post-archive request to either product returns 410. +- **Tests:** integration test with time injection; GDPR-compliance regression suite added. +- **Gate:** standard + security checklist + manual sign-off (irreversible operation) +- **Effort:** L + +**Phase 4 exit criteria:** +- Every flow P1–P16 from `PLATFORM_ARCHITECTURE.md` has a passing Playwright spec. 
+- Stage runs a full lifecycle: sign-up trial → convert → use → cancel → offboard, in an automated nightly job. +- We can hand a prospect a real demo using `demo.yourplatform.com`. + +--- + +## 7. Phase 5 — Headless products (M15.x – M17.x) + +**Goal:** Make the platform host products with no UI of their own. + +### M15.1 — API key infrastructure +- **Depends on:** M4.2, M10.1 +- **Repos:** `platform/tenant-registry`, `platform/portal` +- **Deliverables:** API key CRUD per `PRODUCT_INTEGRATION_SPEC.md §6.2`; portal UI at `/[slug]/settings/api-keys`; `POST /internal/api-keys/verify` for products. +- **Acceptance:** create key in portal; product call with key succeeds; revoke kills access within 60s. +- **Tests:** integration tests for verify endpoint; Playwright for portal UI. +- **Gate:** standard + security checklist (rotation + scope enforcement) +- **Effort:** M + +### M15.2 — Webhook delivery +- **Depends on:** M15.1 +- **Repos:** `platform/tenant-registry`, `platform/portal` +- **Deliverables:** webhook config + delivery service per `PLATFORM_ARCHITECTURE.md` H4; portal page `/[slug]/integrations`; signed payloads; 3-attempt retry with backoff; dead-letter visible at `/webhooks/deliveries`. +- **Acceptance:** test webhook to https://requestbin.com works; failed deliveries appear in dead letter. +- **Tests:** integration tests with a local sink. +- **Gate:** standard +- **Effort:** M + +### M16.1 — First headless product reference implementation +- **Depends on:** M15.2 +- **Repos:** TBD (proof-of-concept can live in `platform/docs/examples/headless-template/`) +- **Deliverables:** a minimal headless product (e.g., echo-bot) that implements the full §5.C contract: manifest, API, audit emit, usage emit, demo reset, GDPR endpoints. +- **Acceptance:** echo-bot is bookable from catalog, works end-to-end, passes the same lifecycle test as Phase 4. +- **Tests:** the lifecycle e2e from M14.2 extended to include echo-bot. 
+- **Gate:** standard +- **Effort:** M + +### M17.1 — MCP servers (Enterprise) +- **Depends on:** M6.3, M7.2 +- **Repos:** `benjamin_boenisch/certifai`, `benjamin_boenisch/breakpilot-compliance` +- **Deliverables:** MCP endpoints per `PRODUCT_INTEGRATION_SPEC.md §10` `mcp:` block; gated on `plan == enterprise`; routed via `mcp.yourplatform.com`. +- **Acceptance:** Claude Code can connect to `mcp.yourplatform.com/certifai` with a service token and call `list_ai_agents`. +- **Tests:** MCP contract test using `mcp-cli`. +- **Gate:** standard + security checklist +- **Effort:** L + +**Phase 5 exit criteria:** +- A third-party (or us) can add a new headless product by following `PRODUCT_INTEGRATION_SPEC.md` and a referenced template, with no portal code changes required. + +--- + +## 8. Phase 6 — Enterprise + scale (M18.x – M19.x) + +These ship only when a paying customer requires them. + +### M18.1 — Custom domains +- **Depends on:** M0.3, M10.1 +- **Repos:** `platform/orca-platform`, `platform/portal` +- **Deliverables:** ACME on-demand TLS in Orca-Proxy; portal UI for customer to add domain; CNAME verification. +- **Acceptance:** `compliance.acme.com` resolves and renders the Acme portal. +- **Tests:** integration test with a synthetic domain. +- **Gate:** standard +- **Effort:** M + +### M18.2 — Physical data isolation +- **Depends on:** M4.1, M6.1, M7.1 +- **Repos:** all data-plane products + `tenant-registry` +- **Deliverables:** option per tenant for a dedicated Postgres / Mongo schema or database; provisioning automation; migration path from logical → physical. +- **Acceptance:** an enterprise tenant runs on a dedicated schema; cross-tenant queries are physically impossible. +- **Tests:** isolation enforcement test. 
+- **Gate:** standard + security review + manual sign-off +- **Effort:** L + +### M19.1 — A/B testing infra +- **Depends on:** anywhere `featureFlags.evaluate()` is called +- **Repos:** new `platform/feature-flags` (Unleash on `vm-control` or hosted) + portal SDK shim +- **Deliverables:** swap the hard-coded `evaluate()` from §1.12 to call Unleash; eval results land in audit events for reproducibility. +- **Acceptance:** flipping a flag in Unleash changes behavior for the targeted tenant set within 30s; no behavior change for other tenants. +- **Tests:** integration test asserts flag-driven branches. +- **Gate:** standard +- **Effort:** M + +--- + +## 9. Cross-cutting work (every phase, ongoing) + +These are not milestones — they are commitments enforced by CI and process. + +- **Regression suite expansion.** Every bug fix lands with a regression test FIRST. Tracked by `tests-added` label on PRs; fix-without-test PRs are rejected by reviewer. +- **Security review per phase.** End of each phase: dependency audit (`cargo audit`, `npm audit`, `pip-audit`), SAST scan (`semgrep`), threat model update in `platform/docs/security/`. +- **Disaster-recovery drills.** Once per phase on stage: pick one scenario from `INFRASTRUCTURE.md §10`, run it, document time-to-recover in the runbook. +- **Doc currency.** PR template requires the author to tick "docs updated" or "n/a" — CI fails on a missing tick. +- **OSS swap-in readiness.** When adding metering / audit / SCIM / flag eval code, use the schema/interface noted in `PRODUCT_INTEGRATION_SPEC.md §15` so swap-in stays cheap. + +--- + +## 10. First-PR checklist for Claude Code + +When starting work, the first sequence of PRs should be: + +1. **PR-1** (M0.1): Create `platform/docs` with copied architecture docs + this plan. Land in 1 day. +2. **PR-2 to PR-7** (M0.1 continued): Bootstrap each of the other five repos with §1.2 scaffolding. Land in parallel. +3. **PR-8** (M0.2): CI templates + branch protection per repo. +4. 
**PR-9** (M1.1): `orca-platform` directory layout + first stub manifest. +5. **PR-10** (M1.2): VM provisioning (vm-edge, vm-identity, vm-secrets, vm-control first — DNS and Keycloak depend on these). +6. **PR-11** (M0.3): PowerDNS on vm-edge + zone file + registrar NS delegation + wildcard TLS via Let's Encrypt DNS-01. + +After PR-11, the dependency graph fans out and parallel work begins. + +For each PR, Claude Code MUST: +- Open the PR with the §1.4 template filled in. +- Link the milestone ID in PR body (`Linked milestone: M0.1`). +- Wait for human approval (no self-merge — branch protection enforces). +- After merge: verify the stage deploy succeeds before starting the next dependent PR. + +--- + +## 11. Dependency graph + +``` + ┌── M6.1 ── M6.2 ── M6.3 ──┐ + │ │ + ┌── M2.1 ── M2.2 ────────┤ ├── M10.1 ── M10.2 + │ │ │ │ +M0.1 ── M0.2 ── M1.1 ──┼── M1.2 ── M0.3 ── M1.3 │ │ ├── M11.1 ── M12.1 ── M12.2 + │ │ │ │ │ │ + │ └── M3.1 ── M3.2 │ ├── M13.2 ── M13.3 │ + │ │ │ │ │ + └─────────────────────── M4.1 ── M4.2 ── M4.3 ── M5.1 ── M5.2 ── M5.3 M13.1 │ + │ + M8.1 ── M8.2 ── M8.3 ── M9.1 ── M9.2 ──────────────────────┤ + │ + M15.1 ── M15.2 ── M16.1 ── M17.1 │ + │ + M14.1 ── M14.2 + +Phase-6 (M18, M19) depends on Phase-4 completion + a paying customer. +M12.2 depends on M3.2 (Stalwart deliverability must be clean before trial emails go out). +``` + +**Critical path** (longest chain to first paying customer): +`M0.1 → M0.2 → M1.1 → M1.2 → M0.3 → M1.3 → M2.1 → M2.2 → M4.1 → M4.2 → M4.3 → M5.1 → M5.2 → M6.2 → M6.3 → M10.1 → M11.1 → M12.1` + +That's 18 milestones. With one full-time agent and standard human review pacing, plan for **9–13 weeks** to first paying customer flow on stage (added 1 week for the PowerDNS / DNS-delegation cycle vs. the prior Cloudflare path); **+2–4 weeks** for prod hardening and the Phase-4 lifecycle completion. 
+ +> **Note on M3.2 critical path:** Stalwart IP warming (4–8 weeks) runs in *background parallel* — start it immediately after M3.1 so warming finishes before M12.2 needs it. It is NOT on the critical path for first paying customer (that customer can be onboarded by hand), but it IS on the critical path for self-serve trial volume. + +**Parallelism opportunities:** +- M6.x and M7.x can run fully in parallel (different repos, different stacks). +- M8.x is independent of all data-plane work once M2.2 is done. +- M15.x can begin as soon as M10.1 lands. + +--- + +## 12. Open questions to resolve before starting + +**Resolved:** +- ~~Email provider~~ → **Stalwart**, self-hosted on vm-control. Plan in M3.2; 4–8 week IP warming acknowledged. +- ~~Stripe vs Lemon Squeezy~~ → **Polar.sh**. Plan in M8.3. +- ~~Cloudflare account ownership~~ → not used; DNS is self-hosted via PowerDNS on vm-edge (M0.3). Registrar account (Benjamin's) still needs documented 2FA recovery — see new DR item below. + +**Still open:** +- **CDN host** for `cdn.yourplatform.com`: self-hosted MinIO + Caddy on vm-edge is the OSS-aligned default; alternative is BunnyCDN (cheap, EU). Decide before M6.3 (manifest bundles + hero images). +- **Cloud provider for port 25 outbound.** Stalwart needs unblocked port 25 to send mail. Hetzner blocks by default and requires a request to unblock with proof of intent + abuse contact; OVH and Scaleway unblock on request faster. Confirm with Benjamin which provider vm-control runs on. Block on M3.2 if port 25 cannot be unblocked — fallback is sending via a different provider's IP with reverse DNS. +- **Test data privacy.** The demo tenant must contain ONLY synthetic data — confirm seed pipeline strips real PII even if our test orgs accidentally seed from prod. +- **Registrar + DNS bus-factor.** Document who owns the registrar account, who has 2FA recovery codes, and the procedure to update NS records without that person available. 
Goes in `platform/docs/runbooks/dr.md` before M0.3 ships. +- **Internal CA.** `step-ca` listed in INFRASTRUCTURE.md vm-edge as "optional" — decide whether inter-service mTLS is in scope for Phase 0 or deferred until Phase 4 (Enterprise tier). + +--- + +*End of document. Open items in §12 should be triaged before M0.1 starts; the bus-factor and port-25 items are the only hard blockers.* diff --git a/INFRASTRUCTURE.md b/INFRASTRUCTURE.md new file mode 100644 index 0000000..51a0814 --- /dev/null +++ b/INFRASTRUCTURE.md @@ -0,0 +1,774 @@ +# Infrastructure Specification +**Status:** Locked Topology +**Authors:** Sharang, Benjamin +**Date:** 2026-05-11 (topology lock: 2026-05-18) +**Companion docs:** PLATFORM_ARCHITECTURE.md, IMPLEMENTATION_PLAN.md, COST_PLAN.md +**Cloud provider:** SysEleven Cloud Services (DUS2, OpenStack) + +--- + +## 1. VM Inventory + +**Four billable VMs total.** Three in production (one per plane after collapsing Identity+Infra), one in stage. Dev runs entirely on developer laptops via docker-compose. + +``` +┌──────────────┬─────────────────┬────────────────────────┬───────────┬─────────────────┐ +│ Name │ Env │ SysEleven flavor │ Public IP │ Planes owned │ +├──────────────┼─────────────────┼────────────────────────┼───────────┼─────────────────┤ +│ vm-edge │ prod │ m2.small (2v / 8 GB) │ YES (1) │ Identity + Infra│ +│ vm-control │ prod │ m2.medium (4v / 16 GB) │ No │ Control │ +│ vm-data │ prod │ m2.medium (4v / 16 GB) │ No │ Data │ +│ stage │ stage │ m2.small (2v / 8 GB) │ YES (1) │ App plane only │ +│ (dev) │ dev │ local docker-compose │ n/a │ all (in-memory) │ +└──────────────┴─────────────────┴────────────────────────┴───────────┴─────────────────┘ +``` + +**Total compute:** 48 GiB-RAM, 12 vCPU. **Monthly compute net: €192 (36M upfront) / €295 (12M) / €435 (On-Demand).** See COST_PLAN.md for the full three-mode table. 
+ +### Why this topology and not the previous 7-VM layout + +The earlier draft proposed one VM per service group (vm-gateway, vm-identity, vm-secrets, vm-ops, vm-control, vm-certifai, vm-compliance). That gave maximum failure isolation but cost 132 GiB-RAM stage+prod. At 5 customers the isolation is unused — every VM ran at <10% utilisation. The locked topology buys back failure isolation incrementally as load grows (see §13 Growth Trajectory). + +Critical isolations preserved even at 4 VMs: +- **vm-edge isolates identity from app workloads.** Keycloak JVM has its own page cache; ERPNext background jobs cannot starve token issuance. +- **vm-data isolates databases from stateless services.** All data-plane DBs share one host, but they're walled off from the portal + ERPNext + Stalwart competing on vm-control. +- **stage runs the app plane only.** It calls prod Keycloak + prod Tenant Registry under `tenant.kind = stage` rather than mirroring those services. + +--- + +## 2. Service-to-VM Mapping + +``` +vm-edge (prod, m2.small 8 GB, public IP) + ├── orca-proxy (Orca-managed; wildcard TLS terminator) + ├── powerdns-auth (Orca-managed; authoritative DNS for yourplatform.com) + ├── keycloak-26 (Orca-managed; JVM, ~1.5 GB heap) + ├── postgres-keycloak (Orca-managed; dedicated PG instance for Keycloak only) + ├── infisical (Orca-managed) + ├── postgres-infisical (Orca-managed; dedicated PG instance for Infisical only) + ├── redis-infisical (Orca-managed; ephemeral) + └── gitea (Orca-managed; SQLite backend to avoid a third PG) + +vm-control (prod, m2.medium 16 GB) + ├── customer-portal (Orca-managed; Next.js) + ├── tenant-registry (Orca-managed; Go) + ├── orca-controller (Orca core process; NOT a managed container) + ├── erpnext (Orca-managed; Frappe bench) + ├── frappe-hd (same bench as ERPNext) + ├── mariadb (Orca-managed; for ERPNext) + ├── redis-erpnext (Orca-managed) + └── stalwart-mail (Orca-managed; SMTP/IMAP/JMAP on mail.yourplatform.com) + +vm-data (prod, 
m2.medium 16 GB) + ├── certifai-dashboard (Orca-managed) + ├── mongodb (Orca-managed) + ├── litellm (Orca-managed) + ├── backend-compliance (Orca-managed) + ├── ai-compliance-sdk (Orca-managed) + ├── admin-compliance (Orca-managed) + ├── postgres-app (Orca-managed; schemas: tenant_registry, compliance) + ├── qdrant (Orca-managed) + └── minio (Orca-managed) + +stage (stage, m2.small 8 GB, public IP) + ├── orca-proxy (light; only routes to stage app) + ├── customer-portal (NEW VERSION under test) + ├── tenant-registry (NEW VERSION under test, talks to ephemeral PG below) + ├── certifai-dashboard (NEW VERSION under test) + ├── backend-compliance (NEW VERSION under test) + ├── ai-compliance-sdk (NEW VERSION under test) + ├── admin-compliance (NEW VERSION under test) + ├── litellm (light; same image as prod) + ├── postgres-app-stage (ephemeral; lives entirely on stage VM) + ├── mongodb-stage (ephemeral) + └── qdrant-stage (ephemeral, tiny corpus) + + Calls OUT to prod: + → auth.yourplatform.com (Keycloak token issuance, under stage client_id) + → mail.yourplatform.com (Stalwart SMTP, recipient filter forces +stage@ only) + → Polar SANDBOX webhook URL (NEVER prod Polar) + → no calls to prod Postgres-app, MariaDB, MongoDB +``` + +### Stage isolation rules (enforced at the platform, not in product code) + +| Risk | Enforcement mechanism | Owner | +|---|---|---| +| Stage writes to prod database | Infisical scope: stage app only gets `/stage/*` secrets. Prod DB credentials never reach stage. | Infra plane | +| Stage emails real customers | Stalwart accept-rule: drop if recipient does not match `*+stage@*`. | Control plane (Stalwart config) | +| Stage triggers real Polar charges | Stage env points `POLAR_API_URL` to sandbox. Prod Polar webhook secret never on stage. | Control plane | +| Stage Keycloak JWT used in prod | `stage_client_id` issued only by Keycloak; prod services reject JWTs with this aud. 
| Identity plane | +| Stage load DOSes prod Keycloak | Keycloak rate-limit per client_id; stage limited to 60 req/s. | Identity plane | + +--- + +## 3. Network Topology + +``` + INTERNET + │ + (yourplatform.com — authoritative on vm-edge PowerDNS; + stage.yourplatform.com — authoritative same zone) + │ + ┌─────────────┴─────────────┐ + │ │ + ┌───────▼────────┐ ┌────────▼─────────┐ + │ vm-edge │ │ stage │ + │ (public IP) │ │ (public IP) │ + │ │ │ │ + │ orca-proxy ────┤ │ orca-proxy │ + │ powerdns │ │ portal-new │ + │ keycloak │◄────────┤ tenant-registry-new + │ pg-keycloak │ stage │ certifai-new │ + │ infisical │ calls │ compliance-new │ + │ pg-infisical │ prod │ pg-stage │ + │ redis-infis │ KC + │ mongo-stage │ + │ gitea │ Stalwart│ qdrant-stage │ + └───────┬────────┘ └──────────────────┘ + │ PRIVATE NETWORK 10.0.0.0/16 + ┌────────┴─────────┐ + │ │ +┌──────▼───────┐ ┌───────▼──────┐ +│ vm-control │ │ vm-data │ +│ │ │ │ +│ portal │ │ certifai │ +│ tenant-reg │ │ mongodb │ +│ orca-ctrl │ │ litellm │ +│ erpnext │ │ backend-comp │ +│ frappe-hd │ │ ai-sdk │ +│ mariadb │ │ admin-comp │ +│ redis-erp │ │ pg-app │ +│ stalwart │ │ qdrant │ +└──────────────┘ │ minio │ + └──────────────┘ + +Orca-Proxy routing (vm-edge, by Host header): + auth.yourplatform.com → 127.0.0.1:8443 (Keycloak, local on vm-edge) + erp.yourplatform.com → vm-control:8000 (ERPNext) [allowlist: our IPs only] + git.yourplatform.com → vm-edge:3000 (Gitea, local) [allowlist: our IPs only] + mail.yourplatform.com → vm-control:587 (Stalwart submission) [allowlist: VM internal only] + ns1.yourplatform.com → 127.0.0.1:53 (PowerDNS, local) + *.yourplatform.com → vm-control:3000 (customer portal) + +Orca-Proxy routing (stage, by Host header): + *.stage.yourplatform.com → 127.0.0.1:3000 (stage portal — all subdomains route here) +``` + +--- + +## 4. Storage and Volume Requirements + +Block volumes (Ceph 3x replicated, €0.10/GiB/mo) mounted to each VM. 
+ +``` +┌──────────────┬───────────────────────────────────────────┬─────────┬─────────────────────┐ +│ VM │ Data stores │ +Block │ Growth profile │ +├──────────────┼───────────────────────────────────────────┼─────────┼─────────────────────┤ +│ vm-edge │ pg-keycloak + pg-infisical + Gitea repos │ +50 GB │ Slow │ +│ vm-control │ MariaDB (ERPNext) + Stalwart mail spool │ +250 GB │ Medium │ +│ vm-data │ MongoDB + pg-app + Qdrant + MinIO │ +500 GB │ Fast (scales w/ N) │ +│ stage │ pg-stage + mongo-stage + qdrant-stage │ +50 GB │ Resets per release │ +└──────────────┴───────────────────────────────────────────┴─────────┴─────────────────────┘ + +Each VM's root disk: 50 GB ephemeral, included in flavor price. + +Object storage (S3, €0.02/GiB/mo single-region or €0.0496/GiB/mo geo-redundant): + ┌─────────────────────────────────┬─────────┬──────────────────────────┐ + │ Bucket │ Size │ Purpose │ + ├─────────────────────────────────┼─────────┼──────────────────────────┤ + │ s3://backups (geo-redundant) │ ~500 GB │ Database dumps │ + │ s3://seed-data │ ~30 GB │ Demo tenant fixtures │ + │ s3://exports │ ~50 GB │ GDPR/offboarding ZIPs │ + │ s3://audit-archive │ ~20 GB │ Old audit log overflow │ + └─────────────────────────────────┴─────────┴──────────────────────────┘ +``` + +--- + +## 5. Backup Requirements + +All backups ship to **SysEleven Object Storage** (S3-compatible, geo-redundant DUS2 ↔ HAM1 for production-critical data). Backup jobs run as Orca one-shot containers on cron. Infisical holds the S3 credentials. 
+ +``` +┌───────────────────────┬──────────────────┬────────────┬────────────┬──────────────────────┐ +│ Data store │ Method │ Frequency │ Retention │ Owner (who restores) │ +├───────────────────────┼──────────────────┼────────────┼────────────┼──────────────────────┤ +│ pg-keycloak (vm-edge) │ pg_dump → S3-geo │ Every 6h │ 14 days │ Infra Plane │ +│ pg-infisical (vm-edge)│ pg_dump → S3-geo │ Daily │ 30 days │ Infra Plane │ +│ Gitea (vm-edge) │ gitea dump → S3 │ Daily │ 30 days │ Infra Plane │ +│ Keycloak realm export │ KC export → S3 │ Daily │ 14 days │ Identity Plane (owns)│ +│ Infisical store │ encrypted → S3 │ Daily │ 30 days │ Infra Plane │ +│ MariaDB (vm-control) │ mysqldump → S3 │ Every 6h │ 30 days │ Control Plane │ +│ Stalwart queue/store │ tar → S3 │ Daily │ 7 days │ Control Plane │ +│ pg-app (vm-data) │ pg_dump → S3-geo │ Every 6h │ 30 days │ Data Plane (owns RPO)│ +│ MongoDB (vm-data) │ mongodump → S3 │ Daily │ 30 days │ Data Plane │ +│ MinIO (vm-data) │ mc mirror → S3 │ Daily │ 90 days │ Data Plane │ +│ Qdrant (vm-data) │ API snap → S3 │ Daily │ 14 days │ Data Plane (rebuild) │ +│ stage * │ no backup │ — │ — │ — (ephemeral) │ +│ Orca config (IaC) │ Gitea (VCS) │ On commit │ Forever │ Infra Plane │ +└───────────────────────┴──────────────────┴────────────┴────────────┴──────────────────────┘ +``` + +### RPO by data criticality + +``` +CRITICAL (RPO ≤ 6h) + pg-keycloak — org memberships, IdP config + pg-app — tenant registry, compliance records + MariaDB/ERPNext — sales orders, invoices, contracts + +IMPORTANT (RPO ≤ 24h) + MongoDB — chat history, user preferences + MinIO — compliance evidence documents + pg-infisical — encrypted secrets + Stalwart store — inbound webhooks, bounce records + +RECOVERABLE (RPO ≤ 48h, rebuildable) + Qdrant — vector index (rebuildable from MinIO source documents) + Gitea — code (mirrored on dev machines) + Keycloak export — org structure (pg-keycloak is primary) + +NOT BACKED UP + stage (any data) — by design; restored from 
seed bundles on each deploy + redis-* — caches; restart cold +``` + +--- + +## 6. Constraint Framework + +### Constraint types + +``` +AVAILABILITY — required uptime percentage over rolling 30 days +RTO — Recovery Time Objective: max time to restore service after failure +RPO — Recovery Point Objective: max acceptable data loss window +IaC — service must be declared in Orca config, no manual container runs in prod +SECRET_HYGIENE — all secrets via Infisical machine identity; no env files, no hardcoded values +NETWORK — whether service is internet-exposed or internal-only +DATA_RESIDENCY — all data must remain in EU (SysEleven DUS2 + HAM1) +AUDIT_TRAIL — all mutating actions logged (who, what, when, from where) +IMMUTABILITY — config changes go through Gitea → Orca pipeline, not manual SSH +STAGE_ISOLATION— stage tenant cannot mutate any prod data; reads-only against prod KC + TR +``` + +### Plane ownership of constraints + +Even though planes now share VMs, the **ownership model is unchanged** — the plane that owns a constraint owns it regardless of which VM hosts the service. The Infra Plane (now collapsed onto vm-edge alongside the Identity plane) still mechanically enforces backup, IaC, secrets, and network constraints. + +``` +╔══════════════════════════════════════════════════════════════════════════════════════════╗ +║ IDENTITY PLANE (on vm-edge) ║ +║ ║ +║ Owns / defines: ║ +║ AVAILABILITY — must be ≥ 99.5% (root dep for everything) ║ +║ RTO — ≤ 15 min ║ +║ AUDIT_TRAIL — realm-level audit (logins, token issuance, IdP events) ║ +║ DATA_RESIDENCY— Keycloak realm data must stay EU ║ +║ STAGE_ISOLATION— rate-limits stage_client_id; rejects stage JWTs in prod audiences ║ +║ ║ +║ Co-tenant note: shares vm-edge with Infra Plane services. JVM heap pinned to 1.5 GB ║ +║ in Orca manifest so it cannot starve PowerDNS / Infisical. 
║ +╚══════════════════════════════════════════════════════════════════════════════════════════╝ + +╔══════════════════════════════════════════════════════════════════════════════════════════╗ +║ CONTROL PLANE (on vm-control) ║ +║ ║ +║ Owns / defines: ║ +║ RPO (tenant) — tenant registry & compliance schemas RPO ≤ 6h ║ +║ RPO (ERPNext) — sales orders, invoices RPO ≤ 6h ║ +║ AUDIT_TRAIL — all portal actions (invites, IdP changes, impersonations) ║ +║ AVAILABILITY — portal ≥ 99.5%; ERPNext ≥ 99% (internal) ║ +║ RTO (portal) — ≤ 10 min ║ +║ RTO (ERPNext) — ≤ 60 min ║ +║ ║ +║ Co-tenant note: ERPNext + Portal + Stalwart on one VM. Orca resource limits enforced: ║ +║ portal: 1 GB memory cap ║ +║ erpnext: 6 GB memory cap ║ +║ mariadb: 3 GB memory cap ║ +║ stalwart: 1 GB memory cap ║ +║ tenant-registry: 500 MB ║ +╚══════════════════════════════════════════════════════════════════════════════════════════╝ + +╔══════════════════════════════════════════════════════════════════════════════════════════╗ +║ DATA PLANE (on vm-data) ║ +║ ║ +║ Owns / defines: ║ +║ DATA_RESIDENCY — all customer data (MongoDB, pg-app, MinIO) must stay EU ║ +║ RPO (product) — compliance records ≤ 6h; chat history ≤ 24h ║ +║ DATA_ISOLATION — every query scoped by org_id/tenant_id ║ +║ AUDIT_TRAIL — product-level actions ║ +║ AVAILABILITY — CERTifAI ≥ 99.5%; compliance ≥ 99.5% ║ +║ ║ +║ Co-tenant note: this VM is the SCALE driver. When vm-data hits 80% RAM, bump flavor ║ +║ (m2.medium → m2.large → m2.xlarge). See §13 Growth Trajectory. 
║ +╚══════════════════════════════════════════════════════════════════════════════════════════╝ + +╔══════════════════════════════════════════════════════════════════════════════════════════╗ +║ INFRA PLANE (on vm-edge, alongside Identity) ║ +║ ║ +║ Owns / enforces ALL of: ║ +║ BACKUP — executes all backup jobs (pg_dump, mongodump, mc mirror) ║ +║ IaC — ALL services declared in Orca config; no manual prod changes ║ +║ IMMUTABILITY — config changes: Gitea commit → Gitea Actions → Orca API only ║ +║ SECRET_HYGIENE— Infisical (on vm-edge); provisions machine identities ║ +║ NETWORK — Orca-Proxy rules; VM firewall; no direct VM public exposure ║ +║ DATA_RESIDENCY— VM region = SysEleven DUS2; backups geo-redundant DUS2↔HAM1 ║ +║ AVAILABILITY — Orca restart policies, health checks ║ +║ COLD_START — enforces startup ordering (see §10 Scenario F) ║ +║ STAGE_ISOLATION— Infisical secret-path scoping for stage_app identity ║ +╚══════════════════════════════════════════════════════════════════════════════════════════╝ +``` + +--- + +## 7. 
SLA Table + +``` +┌───────────────────────┬──────────────┬─────────┬─────────┬────────────────────────────────┐ +│ Service │ Availability │ RTO │ RPO │ Host VM │ +├───────────────────────┼──────────────┼─────────┼─────────┼────────────────────────────────┤ +│ Orca-Proxy │ 99.9% │ 5 min │ N/A │ vm-edge │ +│ PowerDNS │ 99.9% │ 5 min │ N/A │ vm-edge │ +│ Keycloak │ 99.5% │ 15 min │ 6h │ vm-edge (root auth dep) │ +│ Infisical │ 99.5% │ 30 min │ 24h │ vm-edge (running svcs survive) │ +│ Gitea │ 99% │ 2h │ 24h │ vm-edge (dev machines mirror) │ +│ Customer Portal │ 99.5% │ 10 min │ N/A │ vm-control │ +│ Tenant Registry │ 99.5% │ 10 min │ 6h │ vm-control │ +│ ERPNext │ 99% │ 60 min │ 6h │ vm-control (internal only) │ +│ Frappe HD │ 99% │ 60 min │ 24h │ vm-control │ +│ MariaDB │ 99.5% │ 20 min │ 6h │ vm-control │ +│ Stalwart Mail │ 99% │ 60 min │ 24h │ vm-control │ +│ CERTifAI │ 99.5% │ 10 min │ 24h │ vm-data │ +│ MongoDB │ 99.5% │ 20 min │ 24h │ vm-data │ +│ LiteLLM │ 99% │ 5 min │ N/A │ vm-data │ +│ backend-compliance │ 99.5% │ 10 min │ 6h │ vm-data │ +│ ai-compliance-sdk │ 99.5% │ 10 min │ 6h │ vm-data │ +│ pg-app │ 99.9% │ 20 min │ 6h │ vm-data (SPOF — RISK-1) │ +│ MinIO │ 99.5% │ 30 min │ 24h │ vm-data │ +│ Qdrant │ 99% │ 2h │ 24h │ vm-data (rebuildable) │ +│ stage (any service) │ 95% │ best ef.│ N/A │ stage (ephemeral; no SLA) │ +└───────────────────────┴──────────────┴─────────┴─────────┴────────────────────────────────┘ +``` + +--- + +## 8. IaC Constraint (Orca) + +Every production service declared in Orca config. No exceptions. + +### Rules + +``` +1. ALL containers run via Orca manifests committed to Gitea + → /orca/manifests/{vm-name}/{service-name}.toml + → Changes go through: Gitea PR → Gitea Actions lint → Orca API apply + +2. NO manual docker run / docker-compose up on any production VM + → SSH to prod VMs allowed for debugging only; no state changes + +3. 
Secrets are NEVER in Orca manifests + → Manifests reference Infisical paths, not values + → Bootstrap exception: Keycloak DB URI in Orca env (Keycloak runs ON vm-edge alongside + Infisical, so chicken-and-egg is solved by Orca env file, not Infisical lookup) + +4. Restart policy: always (Orca restarts crashed containers with exponential backoff) + → Health check per service (HTTP /health or TCP probe) + +5. Resource limits MANDATORY in every manifest + → On a 3-VM prod, co-tenant noise is the single biggest risk; limits are non-negotiable + → See §6 Plane ownership "Co-tenant note" boxes for the per-service caps + +6. Orca controller state itself is recoverable + → Manifest files in Gitea = desired state + → Loss of Orca controller = re-apply manifests from Gitea, services continue running + +7. Stage app gets its own Infisical scope + → /stage/* path; no prod-DB credentials reach this scope + → Enforced at Infisical machine-identity level, not in app code +``` + +### Gitea Actions pipeline for infra changes + +``` +infra change committed to Gitea + │ + ├── lint: validate Orca manifest schema + ├── diff: show what changes will be applied (orca plan) + ├── (manual approval gate for vm-edge changes — touches auth root) + └── apply: POST to Orca Controller API → rolling update +``` + +--- + +## 9. Dependency Graph + +Arrows = "requires to function." Dashed = soft (degrades, doesn't fail). +**Intra-VM dependencies elided** for clarity (e.g. Keycloak ↔ pg-keycloak are on the same host and start together). 
+ +``` + EXTERNAL + AI APIs + (OpenAI / Anthropic) + │ + │ (soft) + ▼ +┌──────────────────────────────────────────────────────────────────┐ +│ vm-edge (Identity + Infra) │ +│ ┌────────────────────────────────────────────────────────────┐ │ +│ │ pg-keycloak ──► keycloak │ │ +│ │ pg-infisical ─► infisical ◄── (all VMs pull on startup) │ │ +│ │ redis-infis ──► infisical │ │ +│ │ (sqlite) ─────► gitea │ │ +│ │ powerdns-auth (no deps) │ │ +│ │ orca-proxy (route table only; backends are remote) │ │ +│ └────────────────────────────────────────────────────────────┘ │ +│ │ Keycloak JWKS │ Infisical /secrets │ +│ │ │ │ +└────────────────────────────┼────────────────┼────────────────────┘ + ▼ ▼ +┌──────────────────────────────────────────────────────────────────┐ +│ vm-control (Control) │ +│ ┌────────────────────────────────────────────────────────────┐ │ +│ │ mariadb + redis-erp ──► erpnext + frappe-hd │ │ +│ │ (intra) ─────────────► stalwart │ │ +│ │ ──────────────────────► customer-portal │ │ +│ │ ──────────────────────► tenant-registry ──► pg-app (vm-data)│ │ +│ └────────────────────────────────────────────────────────────┘ │ +│ │ tenant-registry API │ +└────────────────────────────┼─────────────────────────────────────┘ + ▼ +┌──────────────────────────────────────────────────────────────────┐ +│ vm-data (Data) │ +│ ┌────────────────────────────────────────────────────────────┐ │ +│ │ mongodb ───► certifai ◄── (vm-edge JWKS, vm-edge secrets) │ │ +│ │ litellm ───► certifai, ai-compliance-sdk │ │ +│ │ pg-app ────► tenant-registry-on-vm-control, backend-compl,│ │ +│ │ ai-compliance-sdk │ │ +│ │ qdrant ────► ai-compliance-sdk │ │ +│ │ minio ────► backend-compliance │ │ +│ │ backend-compliance ──► admin-compliance │ │ +│ └────────────────────────────────────────────────────────────┘ │ +└──────────────────────────────────────────────────────────────────┘ + +┌──────────────────────────────────────────────────────────────────┐ +│ stage (App plane only) │ +│ Calls vm-edge:8443 
(KC) + vm-control:587 (Stalwart submission) │ +│ Calls Polar SANDBOX (never prod Polar webhook URL) │ +│ Its own ephemeral DBs; cannot read prod data │ +└──────────────────────────────────────────────────────────────────┘ +``` + +### Simplified critical path (customer login → product use) + +``` + DNS (vm-edge PowerDNS) + │ + ▼ + orca-proxy (vm-edge) + │ + ├──► keycloak (vm-edge) ──► pg-keycloak (intra-VM) + │ + └──► customer-portal (vm-control) + ├──► tenant-registry (vm-control) ──► pg-app (vm-data) + ├──► certifai (vm-data) ──► mongodb (intra-VM) + └──► backend-compliance (vm-data) ──► pg-app (intra-VM) + ──► ai-sdk ──► qdrant + minio + ──► litellm ──► [external AI APIs] +``` + +--- + +## 10. Failure Scenarios and Deadlock Analysis + +### Scenario A — vm-edge fails (HIGHEST SEVERITY) + +``` +Impact: TOTAL outage. Nothing reachable from internet. + No DNS. No TLS. No auth. No new logins. Running JWTs expire within 15 min, + then ALL services start returning 401. + Backstage and customer portal both fully blocked. + Stage also blocked (depends on prod Keycloak). +Cascade: T+0: DNS fails → orca-proxy unreachable + T+5m: existing JWTs still valid; portal cached → partial reads work + T+15m: JWTs expire → full outage +Deadlock: None — services downstream don't deadlock, they just fail closed +Recovery: 1. Spin up vm-edge-spare (cold standby, same Orca config) — ~3 min provision + 2. Restore pg-keycloak + pg-infisical from latest backup — ~5 min + 3. Swap registrar NS records to spare IP (TTL 60s) — ~2 min propagation + 4. Restart all services on vm-edge-spare via Orca apply — ~3 min + Total RTO target: 15 min +Mitigation: COLD STANDBY vm-edge-spare. Same Orca config committed in Gitea. + Provision cost when idle: €0 (only billed when running). + Test recovery quarterly. 
+Severity: CRITICAL — single host owns 3 root dependencies (DNS, auth, secrets) +Cost of fix at Tier C: split vm-edge into vm-edge + vm-identity + vm-secrets + (back toward original 7-VM design) — €100/mo extra +``` + +### Scenario B — vm-control fails (NEW — consequence of plane consolidation) + +``` +Impact: customer-portal: DOWN → /[slug]/* all return 503 + tenant-registry: DOWN → Keycloak protocol-mapper for products claim breaks + → users can log in but see "No active products" + ERPNext + Frappe HD: DOWN → we cannot create sales orders or read tickets + Stalwart: DOWN → no outbound emails (trial nudges, exports, ticket replies) + MariaDB: DOWN → ERPNext queries fail; backups paused + Products (CERTifAI, compliance): UNAFFECTED (on vm-data, JWTs still validate) + Existing logged-in users: can use products directly via product subdomain + IF they bookmark it; portal home is 503. +Cascade: T+0: portal 503; new tenant onboarding blocked (registry down) + T+15m: existing JWTs missing refreshed products claim + T+1h: trial emails not sent → trial nudge cadence breaks +Deadlock: None +Recovery: Restart vm-control containers via Orca. If MariaDB corrupt: restore mysqldump. + RTO target: 10 min (portal) / 60 min (ERPNext) +Mitigation: Multiple services co-hosted = single failure hits many SLAs. + Resource limits in Orca prevent ERPNext OOM from killing portal. + Quarterly drill: deliberately stop portal, measure recovery. 
+Severity: HIGH — three services down at once, but products keep serving customers +Cost of fix at Tier B/C: split vm-control → vm-portal + vm-ops (ERPNext) + — €64/mo extra at m2.small +``` + +### Scenario C — vm-data fails + +``` +Impact: tenant-registry queries: FAIL (pg-app down) → portal returns 503 for tenant lookup + customer-portal: DEGRADED (login works, dashboard fails) + CERTifAI: COMPLETELY DOWN + backend-compliance + ai-sdk + admin: COMPLETELY DOWN + ERPNext + Stalwart: UNAFFECTED +Cascade: T+0: products down; portal degraded + T+15m: support tickets pile up + Note: prod is partial — users see error pages but ERPNext + auth still work +Recovery: Restart vm-data containers. If pg-app corrupt: restore from pg_dump (RPO 6h). + RTO target: 20 min +Mitigation: This is the SCALE-event VM. RISK-1 below makes this the worst SPOF: + one pg-app instance owns tenant_registry + compliance schemas. + HIGH PRIORITY fix: split pg-app into separate clusters at Tier B/C transition. +Severity: HIGH — products down, business operations (ERPNext) still work so we can + contact customers +``` + +### Scenario D — LiteLLM fails + +``` +Impact: CERTifAI: AI features fail (summarization, chat completion). + CERTifAI dashboard, sessions: UNAFFECTED. + compliance AI generation: FAILS (DSFA/TOM/VVT generation blocked). + Compliance CRUD: UNAFFECTED. +Cascade: Soft degradation only. Products show "AI features temporarily unavailable" banner. +Deadlock: None. +Recovery: Restart LiteLLM on vm-data (stateless, ~30s). +Severity: MEDIUM — graceful degradation by design +``` + +### Scenario E — Stage VM compromised or buggy + +``` +Impact: On stage itself: stage portal serves bad data; stage testers see errors. + On prod: NONE if isolation rules in §2 are intact. 
+ Worst case if isolation breaks: + - Stage code tries to call prod pg-app → fails (no creds in /stage/* Infisical) + - Stage emits real email → blocked by Stalwart recipient filter + - Stage triggers Polar charge → goes to sandbox, no real money +Cascade: None to prod by design. +Recovery: Roll back stage to previous image via Orca. RTO target: 5 min. +Mitigation: The 5 enforcement rules in §2 are the load-bearing controls. Verify quarterly + via deliberate red-team: try to write to prod pg-app from stage and confirm 401. +Severity: LOW (in prod) / HIGH (on stage, but stage SLA is 95%) +``` + +### Scenario F — Full Cold Start (Power Loss, All VMs Restart Simultaneously) + +``` +Three VMs boot at once. Services must start in dependency order or services +crash-loop until their deps are ready. + +DEADLOCK RISK: vm-control services (portal, tenant-registry) start before vm-data + services (pg-app, certifai, compliance). They'll crash-loop ~2-5min + with backoff retries. + Same for ERPNext on vm-control trying to reach Keycloak on vm-edge. + +RESOLUTION: Orca enforces cross-VM startup ordering via health-check dependencies. + Bootstrap exception: Keycloak DB URI in Orca env on vm-edge (not from + Infisical — chicken-and-egg solved). 
+ +Required cold start sequence: + + Phase 0 — Data roots on vm-data (parallel): + pg-app, mongodb, qdrant, minio + Phase 0 — Data roots on vm-control (parallel): + mariadb, redis-erpnext + Phase 0 — Data roots on vm-edge (parallel): + pg-keycloak, pg-infisical, redis-infisical + + Phase 1 — Secrets + DNS on vm-edge: + infisical (needs: pg-infisical, redis-infisical) + powerdns-auth (no deps) + + Phase 2 — Identity on vm-edge: + keycloak (needs: pg-keycloak [Phase 0], infisical [Phase 1]) + gitea (needs: sqlite; ready from Phase 0) + + Phase 3 — Control on vm-control + Data services on vm-data (parallel): + tenant-registry (needs: keycloak [Phase 2], pg-app [Phase 0, remote]) + erpnext + frappe-hd (needs: mariadb, redis-erpnext [Phase 0], keycloak [Phase 2]) + stalwart (needs: infisical [Phase 1]) + litellm (needs: infisical) + certifai (needs: keycloak, mongodb, litellm) + backend-compliance (needs: keycloak, pg-app) + ai-compliance-sdk (needs: pg-app, qdrant, litellm) + admin-compliance (needs: backend + sdk) + + Phase 4 — Customer-facing on vm-control: + customer-portal (needs: keycloak, tenant-registry) + + Phase 5 — Gateway on vm-edge (last): + orca-proxy (waits for all backends healthy before opening listener) + +Estimated cold-start time: 6-10 minutes (faster than 7-VM since less network roundtrip) +``` + +### Scenario G — Tenant Registry fails + +``` +Impact: Portal cannot resolve tenant from subdomain → /[slug]/* all 503 + Keycloak protocol mapper cannot get products claim → JWT missing field + → users can log in but see "No active products" + Products (CERTifAI, compliance) themselves: UNAFFECTED if already authenticated +Cascade: New logins degraded. + Existing sessions continue. +Deadlock: None. +Recovery: Restart tenant-registry on vm-control. pg-app on vm-data must be healthy. + RTO target: ≤ 60s +Mitigation: Portal caches slug → tenant mapping with 60s TTL. + Short outage invisible to customers. +Severity: MEDIUM +``` + +--- + +## 11. 
Cross-Dependency Summary Table + +``` + Needs → │PG-KC│PG-Inf│PG-App│Mongo│Maria│Redis│Minio│Qdrant│ KC │Infis│Lit. │T.Reg│ +─────────────────────┼─────┼──────┼──────┼─────┼─────┼─────┼─────┼──────┼─────┼─────┼─────┼─────┤ +keycloak │ ● │ │ │ │ │ │ │ │ │ ◐* │ │ │ +infisical │ │ ● │ │ │ │ ● │ │ │ │ │ │ │ +gitea │ │ │ │ │ │ │ │ │ │ ● │ │ │ +tenant-registry │ │ │ ● │ │ │ │ │ │ ● │ ● │ │ │ +customer-portal │ │ │ │ │ │ │ │ │ ● │ ● │ │ ● │ +erpnext │ │ │ │ │ ● │ ● │ │ │ ● │ ● │ │ │ +frappe-hd │ │ │ │ │ ● │ ● │ │ │ │ ● │ │ │ +stalwart │ │ │ │ │ │ │ │ │ │ ● │ │ │ +certifai │ │ │ │ ● │ │ │ │ │ ● │ ● │ ◐ │ │ +litellm │ │ │ │ │ │ │ │ │ │ ● │ │ │ +backend-compl. │ │ │ ● │ │ │ │ │ │ ● │ ● │ │ │ +ai-compl-sdk │ │ │ ● │ │ │ │ │ ● │ │ ● │ ◐ │ │ +admin-compl. │ │ │ │ │ │ │ │ │ │ │ │ │ +orca-proxy │ │ │ │ │ │ │ │ │ │ │ │ │ +stage-app │ │ │ │ │ │ │ │ │ ● │ ◑ │ │ ◑ │ + +● = hard dependency (cannot start without) +◐ = soft dependency (starts, features degrade) +◑ = stage-only read-mostly dependency (writes blocked by Infisical scope) +◐*= bootstrap exception (Keycloak DB URI in Orca env on vm-edge, not Infisical) +``` + +--- + +## 12. Open Infrastructure Risks (Priority Order) + +``` +RISK-1 pg-app (vm-data) is a single instance serving tenant_registry + compliance schemas. + One crash blocks portal AND compliance product simultaneously. + → Mitigation: split into pg-registry + pg-compliance at Tier B (200 customers). + Move pg-registry to its own DBaaS PostgreSQL cluster (€213/mo). + Priority: HIGH — fix before 100 customers; flagged also in COST_PLAN.md + +RISK-2 vm-edge is a single VM owning 3 root dependencies (DNS, auth, secrets). + Failure = total external outage. Highest blast radius in the system. 
+ → Mitigation: + Phase A: cold-standby vm-edge-spare (idle cost €0; tested quarterly) + Phase B (Tier C, 500 cust): split vm-edge into vm-edge + vm-identity + vm-secrets + Priority: HIGH + +RISK-3 vm-control hosts 5 service groups (portal, tenant-registry, ERPNext, Frappe HD, + Stalwart). Co-tenant noise risk; one OOM kills the others. + → Mitigation: + Phase A: hard Orca resource limits per service (see §6 co-tenant notes) + Phase B (Tier B): split vm-control → vm-portal + vm-ops at €64/mo extra + Priority: MEDIUM + +RISK-4 Keycloak is a single instance with no clustering. + Any Keycloak outage = total auth failure within JWT TTL. + → Mitigation: short-term: tested runbook + 15min RTO target + long-term: Keycloak active-passive cluster (Phase 2, on split vm-identity) + Priority: MEDIUM + +RISK-5 Stage isolation depends on 5 enforcement controls (see §2 table). + If any one breaks, stage code can affect prod customers. + → Mitigation: quarterly red-team verification of each control. + Especially: Infisical secret-path scoping and Stalwart recipient filter. + Priority: MEDIUM — easy to forget once it's working + +RISK-6 Infisical downtime during multi-VM restart causes delayed cold start. + → Mitigation: Orca startup ordering + bootstrap secrets for Keycloak only + Priority: LOW — documented runbook; cold start is rare + +RISK-7 ERPNext → Tenant Registry webhook has no guaranteed delivery. + Failed activation = tenant not active after contract signed. + → Mitigation: Frappe retry + idempotent /activate endpoint + manual Backstage trigger + Priority: LOW + +RISK-8 LiteLLM calls external AI APIs (OpenAI / Anthropic). + A provider outage or rate limit makes AI features unavailable. + → Mitigation: LiteLLM fallback routing; products degrade gracefully. + Priority: LOW — external dependency, by design +``` + +--- + +## 13. Growth Trajectory — when to add VMs + +The locked 4-VM topology is right for roughly 5–200 customers. 
Past that, expect to add VMs back in this order: + +``` +Tier A (5–200 cust): 4 VMs as locked €192/mo compute (36M upfront) + ↓ +Tier B (200–500): Bump vm-data m2.med → m2.large +€64/mo + Add cold-standby vm-edge-spare +€0 (idle, paid only on swap) + ↓ +Tier C (500–1000): Split vm-data: vm-data + vm-data-db +€64/mo + (postgres-app moves to its own VM, or DBaaS cluster +€213/mo) + Split vm-control: vm-control + vm-ops +€64/mo + (ERPNext + MariaDB + Stalwart move to vm-ops) + ↓ +Tier D (1000–2000): Split vm-edge: vm-edge + vm-identity + vm-secrets +€96/mo + HA Keycloak active-passive on 2× vm-identity +€32/mo + Octavia Load Balancer Double Instance +€58/mo + vm-data m2.large → m2.xlarge or 2× +€128–256/mo + ↓ + Final topology ≈ 8 prod VMs + DBaaS +``` + +Each step is justified by a measurable signal (>80% RAM, >70% CPU, sustained queue depth, or a specific outage scenario). Never split preemptively. + +--- + +## 14. Cost summary (see COST_PLAN.md for full breakdown) + +| Mode | Compute €/mo | Storage €/mo | Network €/mo | Total net | + 19% VAT | +|---|---:|---:|---:|---:|---:| +| On-Demand | 434.50 | 112 | 2.92 | 549.42 | 653.81 | +| 12-month commit | 295.20 | 112 | 2.92 | 410.12 | 488.04 | +| 36-month no upfront | 216.00 | 112 | 2.92 | 330.92 | 393.79 | +| 36-month upfront | 192.00 | 112 | 2.92 | 306.92 | 365.23 | + +Plus €6,912 net one-time payment if signing 36M-upfront for the compute portion. + +--- + +*End of document. Review quarterly or after any significant infrastructure change. 
Topology last locked 2026-05-18.* diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..bbbd4b3 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Breakpilot Platform + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/PLATFORM_ARCHITECTURE.md b/PLATFORM_ARCHITECTURE.md new file mode 100644 index 0000000..d9fc31c --- /dev/null +++ b/PLATFORM_ARCHITECTURE.md @@ -0,0 +1,1144 @@ +# Platform Architecture — B2B Customer Portal +**Status:** Design Draft +**Authors:** Sharang, Benjamin +**Date:** 2026-05-11 + +--- + +## 1. Vision + +We sell CERTifAI and breakpilot-compliance as modular B2B building blocks. Customers buy one or both and operate them inside a unified customer portal — without needing to understand that they are separate products under the hood. + +Each customer is a **tenant**: fully isolated data, their own user base, their own identity configuration. We manage all tenants from a single operator backstage. 
+ +ERPNext runs our company: CRM, sales orders, invoicing, HR. Frappe Helpdesk runs customer support. Gitea runs engineering. Everything else — Keycloak, all product services, all databases — runs on our own infrastructure managed by Orca. + +--- + +## 2. Products in Scope + +| Product | What it is | +|---|---| +| **CERTifAI** | Self-hosted GDPR-compliant AI admin dashboard. Manages LLMs, AI agents, MCP servers, usage analytics. Built with Rust/Dioxus. | +| **breakpilot-compliance** | GDPR and AI-Act compliance automation. Covers DSFA, VVT, TOM, DSR, AI Act, risk, vendor, incidents. Built with Python/FastAPI + Go AI SDK + Next.js. | + +Out of scope: breakpilot-dataroom, breakpilot-lehrer, breakpilot-pitch-deck. + +--- + +## 3. The Four Planes + +``` +╔══════════════════════════════════════════════════════════════════╗ +║ PLANE 1 — IDENTITY (logical root, all auth flows through here) ║ +╚══════════════════════════════════════════════════════════════════╝ + ↓ JWT +╔══════════════════════════════════════════════════════════════════╗ +║ PLANE 2 — CONTROL (portal + ERPNext + tenant registry) ║ +╚══════════════════════════════════════════════════════════════════╝ + ↓ tenant-scoped API calls +╔══════════════════════════════════════════════════════════════════╗ +║ PLANE 3 — DATA (CERTifAI + breakpilot-compliance) ║ +╚══════════════════════════════════════════════════════════════════╝ + ↓ everything runs on +╔══════════════════════════════════════════════════════════════════╗ +║ PLANE 4 — INFRA (Orca + VMs + Gitea + Infisical + LiteLLM) ║ +╚══════════════════════════════════════════════════════════════════╝ +``` + +--- + +## 4. Plane 1 — Identity + +**Technology:** Keycloak 26, single realm (`breakpilot-prod`) + +Keycloak is the only truth about who anyone is. Every other service validates JWTs issued here — nothing else handles auth logic. 
+ +### Structure + +``` +Realm: breakpilot-prod +│ +├── Organizations (one per B2B customer) +│ ├── Acme Corp → org_id: uuid-acme +│ ├── BayernAG → org_id: uuid-bayernag +│ └── ... +│ +├── Organization Roles (what a user can do within their company) +│ ├── IT_ADMIN — full portal access, user management, IdP config +│ ├── CXO — dashboard, billing, audit (read) +│ ├── FINANCE — billing, invoices +│ ├── LEGAL — audit log, compliance read +│ └── USER — product access only +│ +├── Realm Roles (what we, the operators, can do) +│ ├── BREAKPILOT_ADMIN — full backstage, impersonation, demo tenant edit +│ ├── SUPPORT_ENGINEER — read backstage, limited impersonation +│ └── SALES_REP — demo tenant login, CRM read, NO real-tenant access +│ +└── Identity Provider Brokering (per org, optional) + ├── OIDC (Okta, Google Workspace, any OIDC provider) + └── SAML (Azure AD, ADFS, any SAML 2.0 provider) +``` + +### JWT Structure + +Every service receives a JWT containing: + +``` +sub — user UUID +email — user email +org_id — customer tenant UUID (= Keycloak org ID) +org_name — human-readable company name +org_roles — [IT_ADMIN, USER, ...] roles within their org +realm_roles — [customer] | [BREAKPILOT_ADMIN] | [SUPPORT_ENGINEER] | [SALES_REP] +products — [certifai, compliance] entitlements (injected by protocol mapper) +plan — starter | professional | enterprise +iss — https://auth.yourplatform.com/realms/breakpilot-prod +``` + +The `products` and `plan` claims are added by a Keycloak **protocol mapper** that reads live entitlements from the Tenant Registry at token issuance. Products do not need to call back to the registry on every request. + +--- + +## 5. Plane 2 — Control + +Three distinct services. Clear separation of responsibility. + +### 5a. Customer Portal + +**Technology:** Next.js 15 (new service) +**Deployed at:** `*.yourplatform.com` via Orca-Proxy wildcard routing + +The front door for all customers and for us. Owns no business logic — it is a routing, auth, and UI layer. 
+ +**Subdomain routing:** +- DNS wildcard `*.yourplatform.com` → Orca-Proxy +- Orca-Proxy reads `Host` header → routes all traffic to the portal container +- Portal reads `Host` → extracts tenant slug → looks up Tenant Registry + +**Customer area** (requires valid JWT for their org): + +``` +/[slug]/dashboard product tiles, usage summary, activity +/[slug]/catalog browse ALL products, subscribed and not (upgrade/upsell flow) +/[slug]/products/ + /certifai CERTifAI product area (subscribed only) + /compliance breakpilot-compliance area (subscribed only) +/[slug]/projects optional sub-tenancy: dev/staging/prod separation [IT_ADMIN] +/[slug]/settings/ + /identity IdP configuration [IT_ADMIN] + /users invite, roles, deactivate [IT_ADMIN] + /api-keys API keys for integrations [IT_ADMIN] + /integrations webhooks, process hooks +/[slug]/billing/ plan, usage, invoices [FINANCE, CXO, IT_ADMIN] +/[slug]/audit/ platform + product audit, filterable by product [LEGAL, IT_ADMIN] +/[slug]/support/ Frappe HD customer portal [all roles] +``` + +**Operating principles (borrowed from AWS/Azure/GCP consoles):** + +``` +1. Role-based UI hiding + The portal NEVER shows a button, link, or section the user cannot use. + Disabled-with-tooltip is also wrong — hide it. The customer's mental model + should be "the portal shows me what I can do," not "the portal teases me." + +2. Browse before buy + /catalog shows every product available on the platform with description, + pricing tier, and a one-click "Request" CTA — even for products the + customer is not subscribed to. Drives organic upsell instead of + requiring sales touchpoints. + +3. Hierarchy: Tenant → Project (optional) → Resources + A tenant can have multiple projects (e.g., "Production", "Staging"). + Products that support project scoping isolate data per project. + Customers without sophistication operate as single-project (default). + Mirrors GCP Project / AWS Account / Azure Resource Group pattern. + +4. 
Cross-product activity log + /audit shows portal events AND every product's audit events filtered + by tenant. Filterable by product, actor, action, time range. One log + to satisfy DPO inquiries instead of hunting per-product. + +5. Cost and usage as first-class + Billing page is not just "your invoice." Shows live usage per product, + trend over time, and projected next invoice. Removes "bill shock." +``` + +**Backstage** (access by realm role): +- `BREAKPILOT_ADMIN` — everything below +- `SUPPORT_ENGINEER` — read all + impersonation, no create/delete +- `SALES_REP` — `/backstage/leads`, `/backstage/demo`, own CRM activity only; CANNOT load any other `/backstage/tenants/[id]` route + +``` +/backstage/dashboard MRR, active tenants, system health +/backstage/tenants/ + /new create customer + /[id]/overview health, logins, API volume + /[id]/products enable/configure products + /[id]/users view members, impersonate + /[id]/billing Stripe + ERPNext view + /[id]/support tickets for this customer + /[id]/audit full audit trail +/backstage/system/ + /health all service health + /incidents incident log + /releases deployment history +``` + +### 5b. ERPNext + +**Technology:** Frappe + ERPNext (self-hosted via Orca) +**Access:** `erp.yourplatform.com` — us only (IP-restricted at Orca-Proxy) +**Auth:** Keycloak OIDC — we log in with our existing accounts, no separate password + +ERPNext is our **business operations backbone**. We do not build CRM, invoicing, or HR — we configure ERPNext for these. 
+ +| ERPNext Module | Used for | +|---|---| +| CRM | Leads, opportunities, deal pipeline | +| Sales | Quotations, Sales Orders (= contracts) | +| Accounts | Sales Invoices, payment tracking, DATEV export | +| Buying | Our own SaaS costs, infrastructure invoices | +| HR | Sharang + Benjamin as employees, expense claims | +| Support (Frappe HD) | Customer tickets, SLA, escalation to Gitea | + +**Integration with the platform:** +- ERPNext Customer record has a custom field `tenant_id` linking to the Tenant Registry +- When a Sales Order is submitted in ERPNext → webhook → Tenant Registry `/tenants/{id}/activate` +- Portal billing page reads invoices from ERPNext REST API server-side — customers never log into ERPNext directly +- We (founders) create quotations, orders, and invoices inside ERPNext + +### 5c. Tenant Registry + +**Technology:** Go service (new), PostgreSQL schema `tenant_registry` + +The glue between Keycloak, ERPNext, and the products. The technical source of truth for "what is this tenant, what do they have access to, how are they configured." + +**Key data it holds:** + +``` +tenants id, slug, name, erp_customer_id, stripe_cust_id, + status, plan, trial/contract dates, sales_owner, + kind (real | demo). + status ∈ {demo, trial, active, frozen, archived}. + demo — shared demo tenant; reset nightly; no billing + trial — real customer in their N-day evaluation window + active — paid, contract or self-serve plan + frozen — read-only after cancel / non-payment (30d grace) + archived — data export window closed; only audit log retained + +tenant_projects OPTIONAL sub-tenancy. id, tenant_id, name, slug, + status. Customers without need operate as a single + implicit "default" project. Products opt in via + manifest (supports_projects: true) and accept an + optional project_id parameter on tenant-scoped APIs. + Mirrors GCP Project / AWS Account pattern. 
+ +tenant_products tenant ↔ product, enabled, config (litellm_url, + max_seats, modules_enabled), expires_at + +tenant_idp_config type (oidc/saml), metadata, verified + +audit_log every portal AND product action: who, what, when, + from where, including impersonations. Indexed for + cross-product search (filter by tenant + product + + actor + action + time). Schema is Retraced-compatible + so we can swap implementation without changing + producers (see PRODUCT_INTEGRATION_SPEC.md §8.4). + +api_keys portal-owned. tenant_id, product, scopes, name, + hash, created_by, last_used_at, revoked_at. + Headless products call /internal/api-keys/verify + to validate inbound keys. Single source of truth + across all products. +``` + +**Links:** +- `tenant.id` = `Keycloak org_id` (one-to-one) +- `tenant.erp_customer_id` = `ERPNext Customer.name` (one-to-one) +- `tenant.stripe_cust_id` = Stripe Customer ID (self-serve billing only) + +### 5d. Demo Tenant (Shared) + +**Slug:** `demo` — reachable at `demo.yourplatform.com` +**Status:** `demo` (never transitions; never billed) +**Owner:** us (`BREAKPILOT_ADMIN` curates content; `SALES_REP` reads + logs in) + +A single, shared tenant pre-seeded with realistic-but-fake data covering CERTifAI + breakpilot-compliance. Sales reps use it to walk prospects through the product live. Prospects do NOT log in directly — the sales rep drives the screen. 
+ +**How it differs from a real tenant:** + +``` +DEMO TENANT REAL TENANT +───────────────────────────────────── ─────────────────────────────────── +status = demo status = trial | active +billing disabled billing active +audit emitted but not exported audit emitted and exportable +nightly reset job restores fixtures data is permanent +seed data loaded on reset: customer-owned data + product.manifest.seed_data_url +all real-tenant flows work otherwise same flows, same code paths +``` + +**Why shared and not per-prospect:** +- Cheap (one tenant, no Orca provisioning per prospect) +- Predictable (sales reps know exactly what's in there) +- Known quantity: the shared-demo model is proven in practice and matches our own experience +- Tradeoff accepted: concurrent edits during the same day are visible across demo sessions; the nightly reset clears them within 24h + +**Nightly reset:** +- Cron job (03:00 Europe/Berlin) calls each product's `/v1/tenants/demo/reset` endpoint +- Product fetches its fixtures from `catalog.seed_data_url` and restores them +- Reset is itself an audit event; failures page the on-call + +### 5e. Frappe Helpdesk + Gitea Issues + +**Technology:** Frappe HD (installed on same Frappe bench as ERPNext), Gitea Issues + +**Support flow:** +- Customer submits ticket via `/[slug]/support/` (Frappe HD customer portal, embedded or linked) +- Agent (us) triages in Frappe HD agent UI at `erp.yourplatform.com` +- If technical: agent clicks "Escalate to Engineering" → Frappe server script creates a Gitea issue in the relevant repo via Gitea REST API → issue URL stored on ticket +- When Gitea issue is closed → Gitea webhook → Frappe HD → ticket marked "Resolved" + +--- + +## 6. Plane 3 — Data + +### CERTifAI + +Self-hosted GDPR-compliant AI dashboard. After updates, it is fully tenant-aware. 
+ +**Multi-tenancy:** All MongoDB queries scoped by `org_id` from JWT +**Auth:** Validates Keycloak JWT (JWKS endpoint), maps `org_roles` to product roles +**LiteLLM:** Shared managed instance (Starter/Professional, API key per tenant) or customer-hosted (Enterprise, URL stored in `tenant_products.config`) +**Role mapping:** + +| Portal role | CERTifAI role | +|---|---| +| IT_ADMIN | Admin | +| CXO, USER | Member | +| FINANCE, LEGAL | Viewer | + +### breakpilot-compliance + +GDPR and AI-Act compliance automation platform. After updates, tenant identity comes from validated JWT — not raw client headers. + +**Multi-tenancy:** All PostgreSQL queries scoped by `tenant_id` (= `org_id` from JWT) +**Auth:** Next.js proxy validates JWT → extracts `org_id` → sets `X-Tenant-ID` +**Role mapping:** `LEGAL` can approve DSFA; `IT_ADMIN` is compliance admin; `USER` contributes to DSR/VVT workflows + +--- + +## 7. Plane 4 — Infra + +**Orchestration:** Orca manages all containers on Hetzner VMs +**Secrets:** Infisical — every service has a machine identity, pulls its own secrets at startup +**CI/CD:** Gitea Actions → Docker build → push to private registry → Orca redeploy webhook +**Routing:** Orca-Proxy handles all TLS termination and subdomain routing + +``` +Orca-Proxy routing table: + auth.yourplatform.com → Keycloak + erp.yourplatform.com → ERPNext + Frappe HD (IP-restricted) + git.yourplatform.com → Gitea + secrets.yourplatform.com → Infisical (IP-restricted) + *.yourplatform.com → customer-portal (wildcard, Host → tenant) +``` + +**Services managed by Orca:** + +``` +Identity & Auth + └── Keycloak 26 + +Business Operations + ├── ERPNext (Frappe) + └── Frappe Helpdesk + +Developer Tooling + ├── Gitea + └── Gitea Runner + +Secrets + └── Infisical + +AI Inference + └── LiteLLM (shared, API key per tenant; or customer-hosted for Enterprise) + +Customer Portal + ├── customer-portal (new) + └── tenant-registry (new) + +Products + ├── certifai-dashboard + └── breakpilot-compliance 
stack + ├── backend-compliance (Python/FastAPI) + ├── ai-compliance-sdk (Go) + └── admin-compliance (Next.js) + +Data Stores + ├── PostgreSQL 17 [schemas: tenant_registry, compliance] + ├── MongoDB [CERTifAI] + ├── Qdrant [compliance RAG] + └── MinIO [compliance documents] +``` + +**Infisical secret namespacing:** + +``` +/prod/ + /keycloak/ DB_PASS, ADMIN_PASS, REALM_KEYS + /erpnext/ DB_PASS, SMTP_PASS, OIDC_CLIENT_SECRET + /customer-portal/ KEYCLOAK_CLIENT_SECRET, ERP_API_KEY, REGISTRY_DB_URI + /tenant-registry/ POSTGRES_URI, KEYCLOAK_ADMIN_SECRET, ERP_API_KEY, STRIPE_SECRET + /certifai/ MONGODB_URI, KEYCLOAK_CLIENT_SECRET, LITELLM_MASTER_KEY + /compliance/ POSTGRES_URI, QDRANT_API_KEY, MINIO_KEYS, ANTHROPIC_API_KEY + /litellm/ OPENAI_API_KEY, ANTHROPIC_API_KEY, MASTER_KEY + /gitea-runner/ DOCKER_REGISTRY_PASS, ORCA_WEBHOOK_TOKEN +``` + +--- + +## 8. Process Sketches + +### P1 — New Customer Onboarding (Sales-Led) + +``` + US (ERPNext) TENANT REGISTRY KEYCLOAK + │ │ │ + │ Lead → Opportunity │ │ + │ → Quotation (PDF sent) │ │ + │ → Sales Order submitted │ │ + │─────── webhook ─────────────►│ │ + │ │ create org ─────────►│ + │ │◄──── org_id ──────────│ + │ │ write tenant row │ + │ │ write tenant_products│ + │ │ send welcome email │ + │ │ │ │ + │ │ ▼ │ + │ IT ADMIN receives email │ + │ clicks setup link │ + │ │ ┌───────┤ + │ │ │set pw │ + │ │ │ 2FA │ + │ │ └───┬───┘ + │ │ │ + │ lands on /acme/dashboard │ + │ │ │ +``` + +### P2 — User Login (Customer's Own IdP) + +``` + USER ORCA-PROXY PORTAL KEYCLOAK CUSTOMER IdP + │ │ │ │ │ + │ acme.yourplatform.com │ │ │ │ + │───────────────────────►│ │ │ │ + │ │ Host=acme.* │ │ │ + │ │───────────────►│ │ │ + │ │ │ slug=acme │ │ + │ │ │ lookup tenant │ │ + │ │ │ → idp=acme-okta│ │ + │ │ │─── redirect ──►│ │ + │ │ │ kc_idp_hint │ │ + │ │ │ │─── redirect ──►│ + │ │ │ │ │ + │ │◄─────────────────────── auth ──┤ │ + │ │ │ │ issue JWT │ + │ │ │◄── JWT ────────│ │ + │◄── /acme/dashboard ────┤ │ │ │ +``` + +### P3 — User Login (Our IdP — 
email + password) + +``` + USER PORTAL KEYCLOAK + │ │ │ + │ acme.yourplatform│ │ + │──────────────────►│ │ + │ │ redirect + PKCE │ + │ │─────────────────►│ + │◄── Keycloak login page ─────────────┤ + │ enter email + password (+ TOTP) │ + │─────────────────────────────────────►│ + │ │◄── JWT ──────────│ + │◄── /acme/dashboard┤ │ +``` + +### P4 — IT Admin Configures External IdP + +``` + IT ADMIN PORTAL TENANT REGISTRY KEYCLOAK + │ │ │ │ + │ /settings/ │ │ │ + │ identity │ │ │ + │───────────────►│ │ │ + │ fill OIDC/ │ │ │ + │ SAML details │ │ │ + │───────────────►│ │ │ + │ │── PATCH idp_config►│ │ + │ │ │── create IdP ────►│ + │ │ │ for org │ + │ │ │◄── ok ────────────│ + │ │ │ verified=true │ + │◄── "Test" btn ─┤ │ │ + │ auth popup ───────────────────────────────────────────►│ + │◄── success ────────────────────────────────────────────┤ + │◄── "IdP configured" ┤ │ │ +``` + +### P5 — IT Admin Invites a Team Member + +``` + IT ADMIN PORTAL KEYCLOAK NEW USER + │ │ │ │ + │ /settings/ │ │ │ + │ users → invite│ │ │ + │ email + role │ │ │ + │───────────────►│ │ │ + │ │ create user in org│ │ + │ │──────────────────►│ │ + │ │ │ send invite │ + │ │ │ email ────────►│ + │ │ │ │ click link + │ │ │◄── set pw ─────│ + │ │ │ (+ TOTP) │ + │ │ │ issue JWT │ + │ │◄─── JWT ──────────│ │ + │ │ │ ┌──────┘ + │ │ │ lands on│ + │ │ │ /acme/dashboard + │ │ │ (role-filtered view) +``` + +### P6 — Customer Accesses a Product + +``` + USER PORTAL KEYCLOAK PRODUCT (e.g. CERTifAI) + │ │ │ │ + │ /acme/products/ │ │ │ + │ certifai │ │ │ + │─────────────────►│ │ │ + │ │ check JWT: │ │ + │ │ products claim │ │ + │ │ includes │ │ + │ │ "certifai" ? 
│ │ + │ │ │ │ + │ [YES] ────────┤ │ │ + │ │ pass JWT ──────────────────────── │ + │ │ │ validate JWKS │ + │ │ │ extract org_id │ + │ │ │ scope all data │ + │◄── product UI ───┤ │ │ + │ │ │ │ + │ [NO] ─────────┤ │ │ + │◄── "Not in your plan" + upgrade CTA │ +``` + +### P7 — Finance User Views Billing + +``` + FINANCE USER PORTAL ERPNEXT API STRIPE API + │ │ │ │ + │ /acme/billing│ │ │ + │──────────────►│ │ │ + │ │ role check: │ │ + │ │ FINANCE → ok │ │ + │ │ │ │ + │ │── fetch invoices►│ │ + │ │◄── invoice list ─│ │ + │ │ │ │ + │ │── fetch usage ────────────────────► │ + │ │◄── usage data ────────────────────── │ + │ │ │ │ + │◄── billing page renders │ │ + │ plan · usage · invoices │ │ + │ [Download PDF] ──────────────►│ │ + │◄── PDF streamed ─────────────────│ │ + │ │ │ + │ [Upgrade Plan] │ │ + │──────────────►│ │ │ + │ │ create CRM task in ERPNext │ + │ │─────────────────►│ │ + │ │ notify us (email/ERPNext task) │ + │◄── "We'll be in touch" ──┤ │ │ +``` + +### P8 — Legal User Exports Audit Report + +``` + LEGAL USER PORTAL TENANT REGISTRY COMPLIANCE PRODUCT + │ │ │ │ + │ /acme/audit │ │ │ + │──────────────►│ │ │ + │ │ role check: │ │ + │ │ LEGAL → ok │ │ + │ │ │ │ + │ │── platform audit ──────────────────► + │ │ (who logged in, role changes, │ + │ │ IdP changes, impersonations) │ + │ │◄── audit_log rows ─────────────────┤ + │ │ │ │ + │ │── compliance audit ────────────────► + │ │ (DSFA approvals, DSR processing, │ + │ │ TOM completions) │ + │ │◄── compliance audit rows ──────────┤ + │ │ │ │ + │ [Export] │ │ │ + │──────────────►│ │ │ + │◄── ZIP: │ │ │ + │ platform-audit.csv │ │ + │ compliance-audit.pdf │ │ +``` + +### P9 — Support Ticket Escalated to Engineering + +``` + CUSTOMER FRAPPE HD US (AGENT) GITEA + │ │ │ │ + │ submit ticket │ │ │ + │ via /support/ │ │ │ + │─────────────────►│ │ │ + │ │── notify agent ─►│ │ + │ │ │ triage ticket │ + │ │ │ → technical bug │ + │ │ │ │ + │ │ │ [Escalate] │ + │ │◄─────────────────│ │ + │ │ server script: │ │ + │ │ POST /issues 
────────────────────► │ + │ │ │ {title, body, │ + │ │ │ labels:[bug]} │ + │ │◄──── issue URL ───────────────────┤ + │ │ store on ticket │ │ + │◄── "Escalated to engineering, we'll update you" ─────┤ + │ │ │ │ + │ │ │ dev fixes it │ + │ │ │ closes issue ─►│ + │ │◄──────── webhook ──────────────────│ + │ │ ticket → Resolved│ │ + │◄── notification ─│ │ │ +``` + +### P10 — We Create a New Customer (Startup Flow) + +``` + US (BACKSTAGE) TENANT REGISTRY KEYCLOAK ERPNEXT IT ADMIN + │ │ │ │ │ + │ /backstage/ │ │ │ │ + │ tenants/new │ │ │ │ + │ fill: name, │ │ │ │ + │ contact, plan, │ │ │ │ + │ products │ │ │ │ + │──── [Create] ────►│ │ │ │ + │ │── create org ───►│ │ │ + │ │◄── org_id ───────│ │ │ + │ │── create Customer ──────────────►│ │ + │ │◄── erp_customer_id ──────────────│ │ + │ │ write tenant rows │ │ + │ │ send welcome email ─────────────────────────── │ + │◄── tenant created ┤ │ │ │ + │ "Awaiting setup"│ │ │ │ + │ │ │ │ click link│ + │ │ │◄── set pw ────────────────── │ + │ │ │ + 2FA │ │ + │ │ │ JWT issued │ │ + │ │◄─────────────────────────────────────── /acme/ ─┤ +``` + +### P11 — We Debug a Customer Issue (Impersonation) + +``` + US (BACKSTAGE) TENANT REGISTRY KEYCLOAK PORTAL (AS CUSTOMER) + │ │ │ │ + │ /backstage/ │ │ │ + │ tenants/acme/ │ │ │ + │ users → │ │ │ + │ Impersonate Alice│ │ │ + │──────────────────►│ │ │ + │ │ write audit_log │ │ + │ │ {action: │ │ + │ │ impersonate, │ │ + │ │ actor: sharang, │ │ + │ │ target: alice} │ │ + │ │── request token ►│ │ + │ │◄── imp. 
token ───│ │ + │◄── token ─────────┤ (30min, signed, │ │ + │ │ impersonated_by │ │ + │ │ claim) │ │ + │ │ │ + │ new tab: acme.yourplatform.com │ │ + │──────────────────────────────────────────────────────────►│ + │ │ [orange banner] │ + │ │ Impersonating │ + │ │ alice@acme.com │ + │ │ 29:47 remaining │ + │ reproduce issue, identify root cause │ │ + │──────────────────────────────────────────────────────────►│ + │ │ [Exit impersonation] +``` + +### P12 — ERPNext Sales Order Activates a Tenant + +``` + US (ERPNEXT) ERPNEXT TENANT REGISTRY KEYCLOAK IT ADMIN + │ │ │ │ │ + │ Sales Order │ │ │ │ + │ Submit ──── ►│ │ │ │ + │ │── webhook ───────►│ │ │ + │ │ {order_id, │ │ │ + │ │ tenant_id, │ │ │ + │ │ products, │ │ │ + │ │ plan, │ │ │ + │ │ contract_start │ │ │ + │ │ contract_end} │ │ │ + │ │ │ tenant.status │ │ + │ │ │ = active │ │ + │ │ │ tenant_products │ │ + │ │ │ enabled=true │ │ + │ │ │── update claims ─►│ │ + │ │ │ (protocol mapper│ │ + │ │ │ picks up new │ │ + │ │ │ entitlements) │ │ + │ │ │── send email ─────────────────►│ + │ │ │ "Subscription │ │ + │ │ │ now active" │ │ +``` + +### P13 — Customer Browses Catalog and Requests a New Product + +``` + USER (any role) PORTAL TENANT REGISTRY ERPNEXT (CRM) + │ │ │ │ + │ /acme/catalog │ │ │ + │──────────────►│ │ │ + │ │── GET /catalog ──────►│ │ + │ │◄── product manifests +│ │ + │ │ subscribed status │ │ + │◄── catalog page │ │ + │ • CERTifAI [✓ Subscribed] │ │ + │ • Compliance [✓ Subscribed] │ │ + │ • Notetaker [+ Request] │ │ + │ • Classifier [+ Request] │ │ + │ │ │ + │ click [Request] on Notetaker │ │ + │ Modal: "Why do you want this?" 
│ │ + │ + estimated seats / volume │ │ + │──────────────►│ │ │ + │ │── POST /catalog/ │ │ + │ │ request ─────────►│ │ + │ │ {tenant, product, │ │ + │ │ requested_by, note}│ │ + │ │ │── create CRM Lead ──►│ + │ │ │ linked to Customer │ + │ │ │◄── lead_id ──────────│ + │ │ │ notify sales_owner │ + │ │ │ (email + ERPNext │ + │ │ │ activity) │ + │◄── "We'll be in touch within 1 day" ──│ │ +``` + +### P14 — Sales Rep Demos to a Prospect (Shared Demo Tenant) + +``` + SALES REP KEYCLOAK PORTAL DEMO TENANT + │ │ │ │ + │ open Zoom with prospect, share screen │ + │ │ + │ demo.yourplatform.com │ + │────────────────────────────────►│ │ + │ │ │ Host: demo │ + │ │ │ → slug = demo │ + │ │ │ → tenant.kind=demo │ + │ │ │ tenant.status=demo │ + │ │ │ │ + │ │ OIDC redirect │ │ + │◄──────────────│─────────────────│ │ + │ login sales@breakpilot │ + │ realm_role=SALES_REP │ + │──────────────►│ │ │ + │ │ verify SALES_REP allowed on demo only │ + │ │ issue JWT: │ + │ │ org_id=demo, org_roles=[IT_ADMIN], │ + │ │ realm_roles=[SALES_REP], │ + │ │ products=[certifai, compliance] │ + │◄──────────────│ │ │ + │ │ │ + │ /demo/dashboard ───────────────►│ │ + │ /demo/products/certifai ─►│ load custom elt ►│ + │ /demo/products/compliance ─►│ load custom elt ►│ + │◄── show prospect every flow ────│ │ + │ │ │ + │ if prospect interested: │ + │ click [Request Trial] in /demo/catalog │ + │ modal: prospect email, company, est. 
seats │ + │ → POST /catalog/trial-request │ + │ creates CRM Lead in ERPNext, NOT a tenant │ + │ sales_owner = the logged-in SALES_REP │ + │ │ + │ 03:00 nightly: │ + │ cron → product /v1/tenants/demo/reset │ + │ fixtures from catalog.seed_data_url restored │ + │ demo is clean for next day │ +``` + +**Guardrails:** +- Keycloak policy: `SALES_REP` realm role MUST NOT be issued a token with `org_id ≠ demo` +- Backstage policy: `SALES_REP` CANNOT see real-tenant data, CAN see CRM (their leads) +- Real customer support is NEVER done from a SALES_REP login + +### P15 — Self-Serve Trial → Convert or Expire + +``` + PROSPECT PORTAL TENANT REGISTRY ERPNEXT KEYCLOAK + │ │ │ │ │ + │ yourplatform.com/start │ │ │ + │──────────────►│ │ │ │ + │ form: email, company, password │ │ │ + │──────────────►│ │ │ │ + │ │── POST /trials ──────►│ │ │ + │ │ {email, company, │ │ │ + │ │ requested_products} │ │ │ + │ │ │ │ │ + │ │ │ slugify(company) │ │ + │ │ │ create tenant │ │ + │ │ │ status=trial │ │ + │ │ │ trial_ends_at = │ │ + │ │ │ now + 14d │ │ + │ │ │ create Customer ►│ │ + │ │ │ tier=Trial │ │ + │ │ │ sales_owner= │ │ + │ │ │ unassigned │ │ + │ │ │── create org ───────────────►│ + │ │ │ + IT_ADMIN user │ + │ │ │ + verify email │ + │ │ │ │ │ + │◄── magic link │ │ │ │ + │ click link, set password │ │ │ + │ land on /acme-trial/dashboard │ │ │ + │ banner: "Trial: 14 days left — Add billing to keep your data" │ + │ │ │ │ + │ ── customer uses platform normally ── │ │ + │ │ │ │ + │ DAY 7 cron: trial_ends_at - 7d │ │ │ + │ → email IT_ADMIN + CXO │ │ │ + │ → CRM Activity: "Day-7 nudge" ►│ │ │ + │ │ │ │ + │ DAY 12: same, urgent tone │ │ │ + │ DAY 14: trial_ends_at reached │ │ │ + │ │ │ │ + │ IF customer added payment: │ │ │ + │ status: trial → active │ │ │ + │ Stripe subscription created │ │ │ + │ OR Sales Order in ERPNext signed │ │ │ + │ banner removed │ │ │ + │ │ + │ ELSE: │ + │ status: trial → frozen │ │ │ + │ 30-day grace: portal read-only, products return 402 │ │ + │ daily reminder email 
until day 44 │ │ + │ │ + │ DAY 44: frozen → archived │ │ │ + │ GDPR export ZIP emailed to IT_ADMIN │ │ │ + │ each product called: DELETE /v1/tenants/{id}/data │ │ + │ 30 days later: tenant row deleted (audit_log retained 7y) │ +``` + +**Trial scoping:** +- All paid products are available in trial mode by default unless `catalog.available_on_plans` excludes `trial` +- Usage-billed products (e.g., LiteLLM calls) get a hard cap during trial (manifest: `trial_quota`) +- Customer can upgrade plan mid-trial; trial timer just stops, no proration + +### P16 — Customer Cancels and Offboards + +``` + IT ADMIN PORTAL TENANT REGISTRY PRODUCTS ERPNEXT + │ │ │ │ │ + │ /acme/settings/billing │ │ │ + │──────────────►│ │ │ │ + │ [Cancel Subscription] │ │ │ + │ Modal: │ │ │ + │ • reason (dropdown) │ │ │ + │ • confirm typing "acme" │ │ │ + │ • shows: data retained 30d, then deleted │ │ + │──────────────►│ │ │ │ + │ │── POST /tenants/ │ │ │ + │ │ acme/cancel ─────►│ │ │ + │ │ │ status: active │ │ + │ │ │ → frozen │ │ + │ │ │ frozen_at = now │ │ + │ │ │ delete_at = │ │ + │ │ │ now + 30d │ │ + │ │ │── Stripe cancel │ │ + │ │ │ at_period_end │ │ + │ │ │── opportunity ──────────────────►│ + │ │ │ stage=Lost │ │ + │ │ │ reason=... │ │ + │ │ │── notify sales_owner │ + │ │ │ (could reach out for save) │ + │◄── confirmation page │ │ │ + │ "Frozen until <date>. Download your data anytime."
│ │ + │ │ │ │ + │ frozen state: │ │ │ + │ portal works READ-ONLY │ │ │ + │ /export available (all products) │ │ │ + │ product APIs return 402 on writes │ │ │ + │ │ │ │ + │ if customer changes mind within 30d: │ │ + │ [Reactivate] → status: frozen → active │ │ + │ no data loss │ │ + │ │ │ │ + │ DAY 30 cron: │ │ │ + │ tenant.delete_at reached │ │ │ + │ build final export ZIP per product ►│ /v1/tenants/{id}/export │ + │ email ZIP link to IT_ADMIN + CXO │ │ │ + │ (signed URL, 7-day TTL) │ │ │ + │ for each product: │ │ + │ DELETE /v1/tenants/{id}/data ────►│ │ │ + │ Keycloak: org archived, users disabled │ │ + │ ERPNext Customer: status=Inactive │ │ + │ tenant.status = archived │ │ + │ │ │ + │ audit_log retained 7y per GDPR / accounting │ │ +``` + +**Self-serve vs. enterprise:** +- Stripe-billed customers cancel in-portal; flow above +- ERPNext-billed (enterprise) customers send written notice; sales rep updates Sales Order; flow runs from `/backstage/tenants/[id]/lifecycle` with the same downstream effects + +### Headless Product Flows + +P1–P16 cover **interactive** products that ship a UI. Products declared as `frontend.type = headless` (see PRODUCT_INTEGRATION_SPEC.md §5) ship no frontend code — customers configure them through a portal-rendered UI and consume them via API/MCP from their own systems. Examples: a notetaker bot, a document classifier, a webhook router, a compliance reporter. + +The portal still hosts these products end-to-end: the customer area, billing, audit, and Backstage all work the same. Only the "use the product" surface changes from a UI to API keys + webhooks.
+ +### H1 — Customer Enables a Headless Product + +``` + IT ADMIN PORTAL TENANT REGISTRY HEADLESS PRODUCT + │ │ │ │ + │ /acme/products│ │ │ + │ /notetaker │ │ │ + │──────────────►│ │ │ + │ │ load manifest │ │ + │ │ frontend.type = │ │ + │ │ "headless" │ │ + │ │ │ │ + │ │ render portal-owned │ │ + │ │ config page from │ │ + │ │ manifest sections: │ │ + │ │ • API Keys │ │ + │ │ • Webhooks │ │ + │ │ • Usage chart │ │ + │ │ • Docs link │ │ + │ │ • Code samples │ │ + │◄── page ──────│ │ │ +``` + +### H2 — Generate API Key for a Headless Product + +``` + IT ADMIN PORTAL TENANT REGISTRY HEADLESS PRODUCT + │ │ │ │ + │ [Generate Key]│ │ │ + │ name: "prod" │ │ │ + │ scopes:[r,w] │ │ │ + │──────────────►│ │ │ + │ │── POST /api-keys ────►│ │ + │ │ {tenant, scopes, │ │ + │ │ product, name} │ │ + │ │ │ generate raw key │ + │ │ │ store HASH only │ + │ │ │ bind: tenant + │ + │ │ │ product + │ + │ │ │ scopes │ + │ │◄── raw key (once) ────│ │ + │◄── show once ─│ │ │ + │ "Copy now — │ │ │ + │ won't show │ │ │ + │ again" │ │ │ +``` + +### H3 — Customer's System Calls the Headless Product + +``` + CUSTOMER SYSTEM HEADLESS PRODUCT TENANT REGISTRY + │ │ │ + │ POST /v1/sessions │ │ + │ Auth: ApiKey k_xxx │ │ + │ X-Tenant: acme │ │ + │──────────────────────►│ │ + │ │ validate key ───────►│ + │ │ → tenant_id, │ + │ │ scopes │ + │ │◄─────────────────────│ + │ │ enforce scope │ + │ │ tenant_id in EVERY │ + │ │ DB query │ + │ │ process request │ + │ │ emit usage ─────────►│ + │ │ emit audit ─────────►│ + │◄── 200 response ──────│ │ +``` + +### H4 — Async Result Delivered via Webhook + +``` + HEADLESS PRODUCT CUSTOMER WEBHOOK URL PORTAL (delivery log) + │ │ │ + │ async job finishes │ │ + │ load webhook config │ │ + │ for this tenant + │ │ + │ this event type │ │ + │ │ │ + │ POST customer URL ─────►│ │ + │ Body: {event, result, │ │ + │ tenant, signature} │ │ + │◄── 200 ─────────────────│ │ + │ log delivery ────────────────────────────────────► + │ (success/fail, ts, │ │ + │ response code) │ │ + 
│ │ │ + │ if delivery fails: │ │ + │ retry with backoff │ │ + │ 3 attempts, then │ │ + │ dead-letter │ │ + │ visible in portal at │ │ + │ /webhooks/deliveries │ │ +``` + +### H5 — Headless Product Tile on Customer Dashboard + +``` + USER PORTAL TENANT REGISTRY HEADLESS PRODUCT + │ │ │ │ + │ /acme/dashboard │ │ │ + │─────────────────►│ │ │ + │ │ for each entitled │ │ + │ │ product in JWT: │ │ + │ │ │ │ + │ │ type=interactive → │ │ + │ │ render "Open" tile │ │ + │ │ │ │ + │ │ type=widget → │ │ + │ │ load widget bundle │ │ + │ │ render custom elt │ │ + │ │ │ │ + │ │ type=headless → │ │ + │ │ GET /v1/usage ─────────────────────────────►│ + │ │◄────────────────────── usage summary ────────│ + │ │ render stat tile: │ │ + │ │ "Notetaker │ │ + │ │ 142 sessions │ │ + │ │ last 30d" │ │ + │ │ click → goes to │ │ + │ │ /products/notetaker │ │ + │◄── dashboard ────│ │ │ +``` + +### H6 — Backstage Operates a Headless Product + +``` + US (BACKSTAGE) PORTAL TENANT REGISTRY HEADLESS PRODUCT + │ │ │ │ + │ /backstage/ │ │ │ + │ tenants/acme/ │ │ │ + │ products/ │ │ │ + │ notetaker │ │ │ + │──────────────►│ │ │ + │ │ NO "Impersonate" btn │ │ + │ │ (no UI to enter) │ │ + │ │ │ │ + │ │ shows: │ │ + │ │ • Health │ │ + │ │ • Usage 30/90d │ │ + │ │ • API call errors │ │ + │ │ • Webhook deliveries │ │ + │ │ • Failed deliveries │ │ + │ │ • Admin actions from │ │ + │ │ manifest: │ │ + │ │ [Flush queue] │ │ + │ │ [Rotate keys] │ │ + │ │ [Reset state] │ │ + │ [Flush queue] │ │ │ + │──────────────►│ │ │ + │ │── service token ─────►│ │ + │ │ POST /admin/flush ─────────────────────────► + │ │ │ audit event │ + │ │◄─────────────────────────────────── ok ───── │ + │◄── done ──────│ │ │ +``` + +--- + +## 9. 
Technology Decisions (Locked) + +| Decision | Choice | Rationale | +|---|---|---| +| Identity | Keycloak, single realm | Already in CERTifAI; Organizations + IdP brokering built-in | +| Tenant model | Keycloak Organization per customer | Native isolation, JWT claims, no custom multi-tenant auth code | +| Subdomain routing | Orca-Proxy, wildcard cert | Consistent with existing infra; tenant from `Host` header | +| Secret management | Infisical, machine identity per service | Uniform across all services; path-namespaced per service | +| Business operations | ERPNext (Frappe) | CRM + sales + invoicing + HR in one; avoids building our own | +| Customer support | Frappe Helpdesk | Same Frappe bench as ERPNext; native customer-ticket-account link | +| Engineering issues | Gitea Issues | Already running Gitea; Frappe HD → Gitea via REST API (server script) | +| Data isolation | Logical (tenant_id / org_id columns) | Sufficient for Starter/Professional; physical isolation offered for Enterprise | +| Billing — self-serve | Stripe (Starter, Professional) | Standard; portal billing page reads Stripe | +| Billing — enterprise | ERPNext Sales Invoices | Manual invoicing, DATEV export for accountant | +| Customer portal | New Next.js 15 app | Clean slate; existing admin apps have product-specific chrome | +| Tenant Registry | New Go service | Thin glue layer; owns entitlements, IdP config, audit log | +| Products scope | CERTifAI + breakpilot-compliance only | Dataroom and pitch-deck out of scope | + +--- + +## 10. Open Items / Phasing + +### Phase 0 — Foundation (pilot-ready, one real customer) +- Orca-Proxy: wildcard TLS, subdomain routing table +- Infisical: machine identities + secrets for all existing services +- Keycloak: Organizations enabled, realm roles (incl. 
`SALES_REP`), one test org +- Tenant Registry: core schema + API (`/tenants` CRUD + `/activate`), `status` enum +- Backstage minimal: create tenant form, tenant list, impersonation +- Portal login: subdomain detection → Keycloak OIDC → tenant context +- CERTifAI: MongoDB-backed sessions, `org_id` query scoping, role enforcement +- breakpilot-compliance: JWT → `X-Tenant-ID` validated at Next.js proxy +- **Demo tenant `demo` seeded**; sales rep can log in and walk a screen-shared prospect + +### Phase 1 — Customer-Facing Portal +- Full customer dashboard, product tiles, usage summary +- User management and invite flow +- IdP configuration wizard (OIDC + SAML) +- Billing page (ERPNext invoices + Stripe usage) +- Audit log page and CSV/PDF export +- Frappe HD embedded in `/[slug]/support/` + +### Phase 2 — Business Operations +- ERPNext configured: CRM, Sales Orders, Invoicing, HR +- ERPNext → Tenant Registry webhook (Sales Order submit → tenant activate) +- Frappe HD → Gitea escalation (server script) +- Backstage health dashboard (service health, incidents) +- Keycloak protocol mapper (products + plan injected into JWT) +- **Self-serve trial flow P15**: `/start` form, 14-day timer, day-7/12/14 emails, trial → frozen → archived state machine +- **Cancel + offboard flow P16**: cancel modal, 30-day frozen window, automated final-export ZIP, GDPR erasure call to every product +- **Demo nightly reset**: cron at 03:00 Europe/Berlin calls each product's `/v1/tenants/demo/reset` + +### Phase 3 — Product API Surface +- CERTifAI: OpenAPI spec, `/api/v1/health` + `/api/v1/usage` +- breakpilot-compliance: OpenAPI spec, `/api/v1/usage` +- Customer-facing API keys (IT Admin generates, scoped to their org) +- LiteLLM per-tenant API key metering → usage data in portal + +### Phase 4 — Enterprise Tier +- Physical data isolation option (dedicated PostgreSQL schema per tenant) +- Customer-hosted LiteLLM (URL stored in `tenant_products.config`) +- Custom domain support 
(`compliance.acme.com` → Orca-Proxy → portal) +- MCP servers per product (CERTifAI MCP, compliance MCP) +- SLA enforcement in Frappe HD per plan tier + +--- + +*End of document. Updated after design review 2026-05-11.* diff --git a/PRODUCT_INTEGRATION_SPEC.md b/PRODUCT_INTEGRATION_SPEC.md new file mode 100644 index 0000000..f5cb371 --- /dev/null +++ b/PRODUCT_INTEGRATION_SPEC.md @@ -0,0 +1,1245 @@ +# Product Integration Specification +**Status:** Design Draft +**Authors:** Sharang, Benjamin +**Date:** 2026-05-11 +**Companion docs:** PLATFORM_ARCHITECTURE.md, INFRASTRUCTURE.md +**Contract version:** 1.0 + +--- + +## 1. Purpose + +This document defines the contract that **every product** must implement to be sold on the platform as a B2B building block. The contract is enforced by the Tenant Registry, Customer Portal, and Orca deployment pipeline. A product that does not implement the contract cannot be activated for a tenant. + +The contract is designed to be **first-party today, third-party-ready tomorrow** — the technical surface is identical for our own products and any future external developers, with stricter verification gates for the latter. + +--- + +## 2. Core Principles + +``` +1. ONE TENANT, ONE TRUTH + Every request is scoped by org_id from the JWT. Cross-tenant data leakage is the + single largest commercial risk; the spec treats it as a contract violation. + +2. PLATFORM OWNS IDENTITY, BILLING, ROUTING + Products NEVER implement their own login, never store passwords, never invoice + customers directly. These are platform concerns. + +3. PRODUCTS OWN THEIR DOMAIN AND DATA + Products own their database, their data model, their backup, their RTO/RPO. + No cross-product database sharing. Composition is via APIs, not via DB joins. + +4. STATELESS APPLICATIONS, STATEFUL DATA STORES + Application containers are replaceable in seconds. State lives in databases + that have explicit backup contracts. + +5. 
CONTRACT EVOLVES, PRODUCTS DECLARE COMPATIBILITY + Products declare which contract version they implement. The platform supports + N and N-1; deprecation is announced before removal. +``` + +--- + +## 3. Required Surfaces + +A product is composed of five surfaces. Three are mandatory, one is tier-gated, one is mandatory documentation. + +``` +┌──────────────────────┬──────────────────────────────────────────┬───────────────┐ +│ Surface │ What │ Requirement │ +├──────────────────────┼──────────────────────────────────────────┼───────────────┤ +│ Backend API │ REST + OpenAPI 3.0 spec │ REQUIRED │ +│ Frontend │ Web component (custom element) │ REQUIRED │ +│ MCP Server │ MCP server exposing tenant-scoped tools │ REQUIRED for │ +│ │ │ Enterprise │ +│ │ │ tier; opt for │ +│ │ │ Starter/Pro │ +│ Documentation │ README, API docs, integration guide, │ REQUIRED │ +│ │ runbook, data model, GDPR retention │ │ +│ Observability │ /health, /metrics, structured logs, │ REQUIRED │ +│ │ audit event emission │ │ +└──────────────────────┴──────────────────────────────────────────┴───────────────┘ +``` + +--- + +## 4. Backend API Contract + +### 4.1 Mandatory Endpoints + +Every product backend must implement these endpoints. The Tenant Registry health-checks them on deploy; any missing endpoint blocks the registration. + +``` +GET /health + Returns 200 if healthy, 503 if unhealthy. + Body: {"status": "ok"|"degraded"|"down", "checks": {"db": "ok", "deps": "ok"}} + Authentication: NONE (Orca probe) + +GET /version + Returns product version and contract version it implements. + Body: {"product": "certifai", "version": "1.4.2", "contract": "1.0", + "build": "", "deployed_at": "2026-05-10T..."} + Authentication: NONE + +GET /v1/usage + Query: ?tenant_id=&from=&to=&project_id= + Returns billing-relevant usage metrics for a tenant (and optional project). 
+ Body: {"tenant_id": "...", "project_id": "...", "period": {...}, + "metrics": {"seats_active": 12, "api_calls": 14203, ...}} + Authentication: SERVICE TOKEN (called by billing job) + Note: products with high-cardinality usage (LLM tokens, etc.) SHOULD also + stream per-event metering to /internal/usage/events on Tenant Registry. + Event shape is Lago-compatible (transaction_id, code, external_subscription_id, + properties) so we can swap to a Lago instance later without changing producers. + +POST /v1/tenants/{id}/provision + Body: {"plan": "...", "config": {...}, "contract_version": "1.0"} + Initializes tenant-specific resources (schemas, default data, queues). + Must be idempotent: a second call with same params is a no-op. + Authentication: SERVICE TOKEN (called by Tenant Registry) + +POST /v1/tenants/{id}/suspend + Soft-suspend: data retained, all customer access blocked. + Authentication: SERVICE TOKEN + +POST /v1/tenants/{id}/reactivate + Reverse of suspend. + Authentication: SERVICE TOKEN + +POST /v1/tenants/{id}/terminate + Hard terminate: schedules data for erasure per retention policy. + Body: {"reason": "...", "scheduled_erasure_at": "..."} + Authentication: SERVICE TOKEN + +POST /v1/tenants/{id}/export + GDPR Article 20 (data portability) export for a tenant. + Returns: signed URL to a ZIP containing all tenant data in JSON + binary blobs. + Authentication: USER JWT (IT_ADMIN or LEGAL role) + +DELETE /v1/tenants/{id}/data + GDPR Article 17 (right to erasure) full deletion. + Body: {"confirm": "<tenant-slug>"} ← safety check + Authentication: SERVICE TOKEN + USER JWT (IT_ADMIN signing off) +``` + +### 4.2 Authentication Modes + +``` +USER JWT Bearer token issued by Keycloak for a user session. + Contains: sub, org_id, org_roles, products, plan. + Validated via Keycloak JWKS endpoint. + Used for: all customer-facing endpoints. + +SERVICE TOKEN Short-lived JWT issued by Keycloak via OAuth 2.0 + client_credentials flow.
+ Each service has a Keycloak client (id: certifai-svc, + compliance-svc, etc.) with declared scopes. + Used for: platform-to-product calls (provisioning), + product-to-product calls (inter-product API). + TTL: 15 minutes max. + +ORCA PROBE No authentication. Local network only. + Used for: /health, /version (Orca polls these). + Must not leak tenant data. +``` + +### 4.3 Tenant Scoping Rules + +``` +1. EVERY non-probe endpoint extracts tenant context from JWT or path. + USER JWT → tenant = jwt.org_id + SERVICE TOKEN → tenant = path parameter, validated against service scopes + +2. EVERY query to the product database includes WHERE tenant_id = $1. + No exceptions. Code review enforces this; tests verify it. + +3. EVERY response includes only data for the requested tenant. + The product asserts this invariant in middleware (defense in depth). + +4. EVERY log line and audit event includes tenant_id as a structured field. +``` + +### 4.4 OpenAPI Spec + +Every product publishes `openapi.yaml` at `/openapi.yaml`. The Tenant Registry pulls this on product registration and validates that the mandatory endpoints from §4.1 are present with correct signatures. + +``` +Product OpenAPI must: + - Be valid OpenAPI 3.0 (3.1 not yet — tooling gap) + - Include all mandatory endpoints from §4.1 + - Document all custom endpoints with examples + - Declare authentication mode for each endpoint + - Declare scopes consumed (for SERVICE TOKEN endpoints) + - Include error response schemas (4xx, 5xx) +``` + +--- + +## 5. Frontend Contract + +A product declares its frontend type in the manifest. The portal renders accordingly. Three types are supported: + +``` +interactive Full UI shipped as a web component custom element. + Customer OPERATES the product through this UI. + Examples: CERTifAI, breakpilot-compliance, classic SaaS products. + +widget Only a small dashboard tile component; no full product page. 
+ Customer SEES product output in a tile; deeper management + happens on a portal-rendered config page. + Examples: monitoring, status reporting. + +headless No frontend code at all. The portal renders a generic + management UI from a portal_config block in the manifest. + Customer CONFIGURES (API keys, webhooks) and their own + systems consume the product via API/MCP. + Examples: notetaker bot, document classifier, webhook router. +``` + +The portal branches its rendering on `manifest.frontend.type`. Backend, MCP, observability, and lifecycle contracts are identical across all three types — only the customer-facing surface changes. + +### 5.A Interactive (Web Component) + +The frontend is a **custom element** registered with a product-specific tag name. The Customer Portal loads the bundle and renders the element with attributes passed in by the portal. + +#### 5.A.1 Why web components + +Our products span Rust/Dioxus, Next.js, Go, Python. Web components are the only framework-agnostic surface that lets all of these ship a frontend without forcing a stack rewrite: + +- CERTifAI compiles Dioxus → WASM → wraps in a custom element +- breakpilot-compliance wraps React components via `@r2wc/react-to-web-component` +- Any future Vue/Svelte/Solid product also works + +#### 5.A.2 The Tag Contract + +Each product declares ONE primary tag in its manifest. The portal renders it like this: + +```html + +``` + +Attributes the portal passes (the product MUST handle these): + +``` +tenant tenant slug (acme) +tenant-id tenant UUID (uuid-acme) +jwt short-lived JWT (≤ 5 min), product validates against Keycloak JWKS +locale en / de / fr / es / pt +theme light / dark +api-base backend URL the product should call +audit-callback-url URL to POST audit events to (portal-relative) +``` + +#### 5.A.3 Events the Product Must Emit + +The component emits events upward via `CustomEvent`. 
The portal listens for these and integrates them: + +``` +breakpilot:navigate + Detail: {path: "/sub/route", title: "Page Title"} + Portal updates browser URL + breadcrumb without reloading. + +breakpilot:error + Detail: {code: "...", message: "...", recoverable: true|false} + Portal shows toast / blocking error. + +breakpilot:audit + Detail: {action: "...", target: "...", metadata: {...}} + Portal forwards to central audit log via audit-callback-url. + +breakpilot:loading + Detail: {state: "start"|"end", description: "Generating DSFA..."} + Portal shows progress indicator. + +breakpilot:request-upgrade + Detail: {feature: "...", required_plan: "enterprise"} + Portal opens upgrade-quote flow. +``` + +#### 5.A.4 Design System Compatibility + +The platform publishes `@breakpilot/design-tokens` (CSS variables, fonts, spacing). Products are encouraged but not required to consume it. The portal injects design tokens into the shadow DOM root so consuming them is a single CSS line: + +```css +:host { color: var(--bp-text); background: var(--bp-surface); } +``` + +Products that ship custom styling must respect the `theme` attribute and the prefers-color-scheme media query. + +#### 5.A.5 Bundle Loading + +``` +Product publishes a bundle at: + https://cdn.yourplatform.com/products/{name}/{version}/element.js + +Portal loads it lazily via dynamic import when the user navigates to /[tenant]/products/{name}. +Portal caches the bundle URL per product version (declared in tenant_products.config). +Bundle size budget: ≤ 500KB gzipped for first load. +``` + +### 5.B Widget + +A widget product declares ONE custom element that renders only as a dashboard tile. It receives the same attributes as interactive products but emits no `breakpilot:navigate` events — clicking the tile takes the user to a portal-rendered config page (same surface as headless products in §5.C). 
+ +```html + +``` + +Constraints: + +``` +Bundle size budget ≤ 50KB gzipped (widgets load eagerly on dashboard) +Dimensions declared in manifest (e.g., 200×120 or 400×240) +Refresh widget polls own API; portal does not push updates +Allowed events breakpilot:error, breakpilot:audit, breakpilot:request-upgrade + (NOT breakpilot:navigate — click-through is portal-controlled) +``` + +API keys, webhooks, and full management UI for widget products use the same portal-rendered config page as headless products (§5.C). + +### 5.C Headless + +The product ships NO frontend code. The portal renders a generic management UI from a `portal_config` block in the manifest. This page is served at `/[tenant]/products/{name}` and contains the same elements regardless of which product it is — populated entirely from manifest data. + +#### 5.C.1 What the Portal Renders + +``` +┌──────────────────────────────────────────────────────────┐ +│ Notetaker [Status: OK] │ +│ ────────────────────────────────────────────────────────│ +│ │ +│ USAGE (last 30 days) │ +│ ┌──────────────────────────────────────────────────┐ │ +│ │ 142 sessions processed ▁▃▆█▆▄▂▃▅▆▄▂▁▂▃▄▅▆▇█ │ │ +│ └──────────────────────────────────────────────────┘ │ +│ │ +│ API KEYS [+ Generate] │ +│ ──────────────────────────────────────────────────── │ +│ • prod-key k_xxx...4f12 scopes: r,w 2026-01-04│ +│ • staging-key k_xxx...9a83 scopes: r 2026-04-22│ +│ │ +│ WEBHOOKS [+ Add] │ +│ ──────────────────────────────────────────────────── │ +│ • https://acme.example.com/notetaker/cb │ +│ events: session.completed, session.failed │ +│ last 24h: 142 delivered, 0 failed [Test] │ +│ │ +│ CODE SAMPLES │ +│ ──────────────────────────────────────────────────── │ +│ [curl] [JS] [Python] │ +│ curl -X POST https://notetaker-api.yourplatform.com/v1 │ +│ -H "Authorization: ApiKey k_xxx" │ +│ -H "X-Tenant: acme" │ +│ -d '{...}' │ +│ │ +│ DOCS ► developers.yourplatform.com/products/notetaker │ 
+└──────────────────────────────────────────────────────────┘ +``` + +#### 5.C.2 Manifest Requirement + +A headless product must include a `portal_config` block declaring: + +- **sections**: which UI sections to render (subset of: `api_keys`, `webhooks`, `usage`, `docs`, `code_samples`, `custom_actions`) +- **webhook_events**: the catalog of events the product can emit +- **api_key_scopes**: the catalog of scopes that can be granted on a key +- **code_samples**: at least one language with a working request example +- **status_endpoint**: optional URL for the portal to poll for the status badge + +See §10.1 for the full schema. + +#### 5.C.3 API Keys + +API keys are a portal concern, not a product concern. Tenant Registry generates and stores key hashes; the product validates incoming keys against `POST /internal/api-keys/verify` on Tenant Registry. This means: + +- Key rotation is portal-controlled +- Scope enforcement is consistent across all headless products +- Revocation is instant (registry updates a single row) + +#### 5.C.4 Webhooks + +The portal owns webhook configuration UI and delivery logging. Products POST event payloads to a portal endpoint (`/internal/webhooks/dispatch`); the portal handles signing, delivery, retry, dead-letter, and the customer-visible delivery log. + +This keeps webhook UX consistent across all headless products and means a product cannot accidentally leak events from one tenant to another's webhook URL. + +#### 5.C.5 No Impersonation + +Backstage shows no "Impersonate" button for headless products — there is no UI to enter. Debugging is via API call logs, audit events, webhook delivery history, and admin actions declared in the manifest (e.g., "Flush Queue", "Rotate Keys", "Reset State"). + +--- + +## 6. MCP Server (Required for Enterprise) + +### 6.1 What it is + +An MCP (Model Context Protocol) server exposes the product's capabilities as tools that customer-side AI agents can call. 
The customer's IT Admin configures the MCP endpoint in their AI agent platform (Claude Desktop, Cursor, internal agents, etc.). + +### 6.2 Required Behavior + +``` +1. ONE MCP server per product + Endpoint: https://mcp.{product}.yourplatform.com (or unified mcp.yourplatform.com/{product}) + +2. Authentication via SCOPED API KEY + Customer IT Admin generates API key in /[tenant]/settings/api-keys. + Key carries tenant_id binding and scopes (read/write per product domain). + No user JWT for MCP — agents authenticate as the org, not as a user. + +3. Tools are tenant-scoped + Every tool call uses the API key's tenant_id binding. + Cross-tenant calls are impossible by construction. + +4. Tool catalog declared in manifest + Each tool: name, description, parameters (JSONSchema), required_scopes. + +5. Audit every tool call + Emit breakpilot:audit-equivalent server-side: actor=api_key_id, + action=tool_name, metadata=parameters. +``` + +### 6.3 Example Tool Catalogs + +``` +CERTifAI MCP tools: + list_ai_agents → returns agents configured for this tenant + get_llm_usage → returns LiteLLM usage for date range + run_news_search → SearXNG search + list_chat_sessions → user's chat history + +Compliance MCP tools: + create_dsfa → starts a DSFA workflow + check_tom_status → returns TOM compliance status + list_dsr_requests → returns open Data Subject Requests + approve_dsfa → marks DSFA as approved + list_ai_act_assessments → returns AI Act assessments +``` + +### 6.4 Activation + +Enterprise customers automatically get MCP enabled. Starter/Pro customers see "Available on Enterprise" in the API Keys page. Tenant Registry checks `tenant.plan` before issuing an MCP API key. + +--- + +## 7. Documentation Contract + +A product ships five required documents. They are published at `developers.yourplatform.com/products/{name}/`. + +``` +1. README What does it do? Value prop in 200 words. + Who is the typical user? What are the workflows? + +2. 
API Reference Auto-generated from openapi.yaml. + Hosted via Redoc or Stoplight Elements. + +3. Integration Guide For customer IT teams. How to: + - Enable the product on their tenant + - Configure SSO and roles + - Wire into their workflows + - Use the MCP server (if applicable) + - Generate and manage API keys + +4. Operational Runbook For us. How to: + - Deploy a new version + - Roll back + - Debug a stuck tenant + - Reset tenant state + - Investigate slow queries + +5. Data Model + GDPR What data is stored, in which table/collection, + personal data category (Art. 9 special category?), + retention period, GDPR lawful basis. + Used by customer DPOs for their own Verzeichnis. +``` + +--- + +## 8. Observability Contract + +### 8.1 Health Check + +``` +GET /health returns: + 200 {"status": "ok", "checks": {...}} — all good + 200 {"status": "degraded", "checks": {...}, "reason"} — degraded but serving + 503 {"status": "down", "checks": {...}, "reason"} — restart me +``` + +Orca polls every 30s. Three consecutive 503s trigger automatic restart. + +### 8.2 Metrics + +``` +GET /metrics returns Prometheus exposition format. + +Required metrics: + bp_http_requests_total{method, route, status, tenant_id} + bp_http_request_duration_seconds{method, route, tenant_id} + bp_active_tenants_gauge + bp_db_query_duration_seconds{operation} + bp_external_api_calls_total{provider, status} (LLM calls, etc.) +``` + +### 8.3 Structured Logging + +``` +All logs are JSON. All log lines include: + ts ISO-8601 timestamp + level debug|info|warn|error + service product name (certifai) + tenant_id tenant UUID (or "system" for non-tenant ops) + user_sub user UUID if applicable + request_id trace ID + msg human-readable message + ... additional structured fields + +No PII in logs (use the PII redaction middleware from breakpilot-core). +``` + +### 8.4 Audit Events + +Audit events go to the central audit log in Tenant Registry.
Products emit them via POST to the audit-callback-url passed by the portal (frontend) or directly to Tenant Registry API (backend). + +``` +Event format (Retraced-shape — transformable 1:1 if we swap to BoxyHQ Retraced later): + { + "tenant_id": "uuid-acme", # → Retraced "group.id" + "project_id": "uuid-prod" | null, # optional sub-tenancy scope + "product": "certifai", # which product emitted + "actor": { + "id": "user-uuid" | "svc:certifai" | "api_key:keyid", + "type": "user" | "service" | "api_key", + "name": "alice@acme.com" + }, + "action": "dsfa.approve", # dotted: {resource}.{verb} + "crud": "u", # c|r|u|d + "target": { + "id": "{target_id}", + "type": "dsfa" | "llm_config" | ..., + "name": "{target_name}" + }, + "source_ip": "1.2.3.4", + "description": "Alice approved DSFA #42 for Customer Data Processing", + "fields": {...}, # additional structured metadata + "created_at": "2026-05-11T14:23:01Z" + } + +Mandatory event categories per product: + config changes everything in product settings + data exports anyone exporting tenant data + data deletions erasures and bulk deletes + permission changes role grants/revocations within product + approvals business-significant approvals (DSFA, etc.) + cross-product calls service-token calls into other products (auto-emitted + by both caller and callee, with on_behalf_of in fields) +``` + +The portal /audit page renders these events filtered by tenant + product + +actor + action + time range. The schema is intentionally Retraced-compatible +so the storage layer can be swapped without changing producers. + +--- + +## 9. Plane-by-Plane Integration Requirements + +### 9.1 Identity Plane + +``` +[REGISTRATION] + - Register an OIDC client in Keycloak (id: {product}-client) + Confidential, client_credentials grant for service tokens, + authorization_code grant if product has its own UI flows. + - Declare role mappings in product manifest: + role_mappings: + IT_ADMIN: Admin + LEGAL: Auditor + FINANCE: ReadOnly + USER: Member + - Declare an entitlement key (e.g.
"certifai") that goes into JWT products claim. + +[RUNTIME] + - Validate JWT via Keycloak JWKS endpoint (cache JWKS for 5 min). + - Reject if products claim does not include this product's entitlement key. + - Reject if iss is not the platform Keycloak. + - Reject if exp expired or nbf future. + +[NEVER] + - Never validate JWT against a static secret. JWKS only. + - Never issue tokens. Never accept passwords. Never store credentials. +``` + +### 9.2 Control Plane + +``` +[REGISTRATION] + - On first deploy, product POSTs to Tenant Registry: + POST /catalog/products + body: manifest (see §10) + Tenant Registry verifies the manifest, pulls openapi.yaml, validates + mandatory endpoints, registers the product. + + - Product appears in Backstage product picker when creating sales orders. + +[LIFECYCLE] + - On tenant.activate: Tenant Registry calls product /v1/tenants/{id}/provision + - On tenant.suspend: calls /suspend + - On tenant.churn: calls /terminate + - On contract.renew: no call (idempotent: just stays active) + +[USAGE METERING] + - Tenant Registry runs a daily job hitting product /v1/usage for billing. + - Product is responsible for accurate metering and idempotent reporting. + +[BACKSTAGE ACTIONS] + - Product declares custom admin actions in manifest: + admin_actions: + - name: "Rebuild RAG Index" + endpoint: POST /v1/tenants/{id}/admin/rebuild-rag + confirm: required + plane: data + - Backstage renders these as buttons on /backstage/tenants/{id}/products/{name} + - Calls are SERVICE TOKEN authenticated and audit-logged. + +[AUDIT EVENTS] + - Product POSTs all audit events to Tenant Registry /audit endpoint. + - Tenant Registry stores them in audit_log table for cross-product unified view. +``` + +### 9.3 Data Plane + +``` +[DATA OWNERSHIP] + - Product owns its database. No other service queries it directly. + - Cross-product composition is via the inter-product service-token API (§11), + never via shared DB connections. 
+ +[ISOLATION] + - Every table/collection has a tenant_id (or org_id) column. + - Every query filters by it. + - Database user permissions cannot bypass it. + +[PROJECT SCOPING — OPTIONAL] + - Products MAY support sub-tenancy via projects (mirrors GCP Project / + AWS Account pattern). Allows customers to separate dev / staging / prod + or per-team data within a single tenant. + - Declared in manifest: + data: + supports_projects: true + - Implementation: + - All tenant-scoped tables/collections add project_id column. + - Compound unique constraints become (tenant_id, project_id, key). + - All endpoints accept optional ?project_id=; absence means + the tenant's default implicit project. + - JWT may carry an active project_id claim; products SHOULD respect it + if present. + - Reference implementation: breakpilot-compliance already uses this pattern + (sdk_states UNIQUE on (tenant_id, project_id) since March 2026). + - Products that do NOT support projects must still gracefully ignore + project_id parameters (return tenant-wide data). + +[TENANT LIFECYCLE CONTRACT] + Products MUST honor the tenant.status passed in the JWT (`tenant_status` + custom claim) and behave per the table below. See PLATFORM_ARCHITECTURE.md + P15 + P16 for the full state machine. + + ┌──────────┬───────────────────────────────────────────────────────────┐ + │ status │ Product behavior │ + ├──────────┼───────────────────────────────────────────────────────────┤ + │ demo │ Accept all calls. Apply NO billing meter. Honor │ + │ │ /v1/tenants/demo/reset (idempotent). Seed from │ + │ │ catalog.demo.seed_data_url. Audit emitted but tagged │ + │ │ {"demo": true} so portal can hide from real audit. │ + ├──────────┼───────────────────────────────────────────────────────────┤ + │ trial │ Accept all calls up to catalog.trial_quota; over quota │ + │ │ return 429 with header X-Trial-Limit-Reset. Show "Trial" │ + │ │ context in any product UI banner area provided by host. 
│ + ├──────────┼───────────────────────────────────────────────────────────┤ + │ active │ Normal operation. │ + ├──────────┼───────────────────────────────────────────────────────────┤ + │ frozen │ Per data.frozen_behavior in manifest (typically reads │ + │ │ allowed, writes return 402, background jobs paused). │ + │ │ /export MUST work; webhook deliveries MUST stop. │ + ├──────────┼───────────────────────────────────────────────────────────┤ + │ archived │ All API calls return 410 Gone. Data already deleted by │ + │ │ the offboarding step; this state is for audit only. │ + └──────────┴───────────────────────────────────────────────────────────┘ + + Products MUST implement: + - GET /v1/tenants/{id}/export + Returns one ZIP per tenant containing every format declared in + data.offboarding_export_formats. Synchronous OK if <60s; async + with signed URL otherwise. + - DELETE /v1/tenants/{id}/data + Removes all tenant data within 30 days. Audit log retained + separately (see §8.4). Idempotent. + - POST /v1/tenants/demo/reset + Restores seed data. Only callable from the portal service token. + +[BACKUP CONTRACT] + - Product declares in manifest: + backup: + data_stores: [postgres, qdrant, minio] + rpo: 6h + rto: 30min + retention_days: 30 + - Infra Plane executes backups per declaration (pg_dump, etc.). + - Product publishes restore procedure in operational runbook. + +[GDPR ENDPOINTS] + - /v1/tenants/{id}/export returns ALL data for the tenant (JSON + blobs in ZIP). + - DELETE /v1/tenants/{id}/data deletes everything within 30 days of call. + - Both endpoints emit audit events. + +[DATA RESIDENCY] + - All data stays in EU (database, object storage, cache). + - Product declares any external data flows (e.g., LLM calls to OpenAI EU endpoint) + in the data model documentation. 
+``` + +### 9.4 Infra Plane + +``` +[IaC] + - Orca manifest at: /orca/manifests/{vm}/{product}.toml + - Manifest declares: image, resource limits, health check, secret refs, + network rules, replicas, restart policy. + - Changes go through Gitea PR → Gitea Actions → Orca apply. + +[SECRETS] + - All secrets via Infisical machine identity. + - Secret path namespacing: /prod/{product}/{KEY} + - Manifest references paths, never values: + secrets: + DB_URL: /prod/certifai/MONGODB_URI + LLM_KEY: /prod/certifai/LITELLM_MASTER_KEY + - Bootstrap secrets (DB URIs for Keycloak only) are the lone exception. + +[NETWORKING] + - Product services bind only to the private network. + - Public-facing routes pass through Orca-Proxy. + - Inter-product calls use internal DNS names (e.g., certifai.internal:8080). + +[BUILD + DEPLOY] + - Dockerfile in product repo root. + - Gitea Actions pipeline: + fmt → lint → test → build → push → orca apply → e2e + - Image tagged with git SHA + semver. + +[COLD START] + - Product declares startup dependencies in manifest: + depends_on: [keycloak, postgres-app, infisical] + - Orca enforces ordering on full restart (see INFRASTRUCTURE.md §10 Scenario F). +``` + +--- + +## 10. Product Manifest + +The canonical declaration of a product, used by Tenant Registry, Orca, and Backstage. One file, committed to product repo, applied via deployment pipeline. 
+ +```yaml +# product.manifest.yaml +schema_version: "1.0" + +product: + id: certifai + name: "CERTifAI" + description: "Self-hosted GDPR-compliant AI infrastructure dashboard" + vendor: breakpilot # we; future third-parties will use their slug + contract_version: "1.0" + product_version: "1.4.2" + repo: git.yourplatform.com/sharang/certifai + +catalog: + # Renders in /[tenant]/catalog and /backstage/products + category: "AI Infrastructure" # AI Infrastructure | Compliance | Productivity | Security | Data + tagline: "GDPR-compliant LLMs without leaving the EU" + hero_image: https://cdn.yourplatform.com/products/certifai/hero.png + screenshots: + - https://cdn.yourplatform.com/products/certifai/dashboard.png + - https://cdn.yourplatform.com/products/certifai/agents.png + pricing_summary: "From €X/seat/month — included on Professional and Enterprise plans" + available_on_plans: [trial, professional, enterprise] # 'trial' opt-in for self-serve + trial_days: 14 + trial_quota: # caps applied while tenant.status == trial + llm_tokens_per_day: 100_000 + api_calls_per_day: 10_000 + works_well_with: [compliance] # cross-product affinity; surfaced in catalog + depends_on_products: [] # hard dependencies (rare; for compositions) + + demo: + supported: true # MUST be true unless explicitly waived + seed_data_url: https://cdn.yourplatform.com/products/certifai/demo/seed-v3.tar.gz + reset_endpoint: /v1/tenants/demo/reset # called nightly by portal cron + persona_hints: # for sales rep talk track + - "GDPR officer at a 200-person SaaS" + - "CTO replacing OpenAI calls with EU-hosted LLMs" + +identity: + oidc_client_id: certifai-client + entitlement_key: certifai + role_mappings: + IT_ADMIN: Admin + CXO: Member + FINANCE: Viewer + LEGAL: Viewer + USER: Member + required_scopes: + - read:agents + - write:agents + - read:usage + +frontend: + type: interactive # interactive | widget | headless + tag: certifai-dashboard + bundle_url: 
https://cdn.yourplatform.com/products/certifai/{version}/element.js + bundle_size_kb: 380 + routes: + - path: / + label: "Dashboard" + - path: /agents + label: "AI Agents" + required_role: Member + - path: /providers + label: "Providers" + required_role: Admin + +backend: + openapi_url: /openapi.yaml + base_url: https://certifai-api.internal/v1 + health_url: /health + service_token_audience: certifai-svc + +mcp: + enabled: true + required_plan: enterprise + endpoint: https://mcp.yourplatform.com/certifai + tools: + - name: list_ai_agents + description: "Returns AI agents configured for the tenant" + required_scope: read:agents + - name: get_llm_usage + description: "Returns LLM usage metrics" + required_scope: read:usage + # ... more tools + +data: + data_stores: + - type: mongodb + vm: vm-certifai + - type: external_api + provider: litellm + pii_class: low + tenant_scoping: + field: org_id + enforcement: middleware + supports_projects: false # see §9.3 PROJECT SCOPING + retention_default_days: 365 + gdpr_export: /v1/tenants/{id}/export + gdpr_erasure: /v1/tenants/{id}/data + offboarding_export_formats: [json, csv] # produced by P16 final-export step + frozen_behavior: + reads: allow # customer can still pull data / download exports + writes: deny_402 # POST/PUT/DELETE return 402 Payment Required + background_jobs: pause # scheduled work suspended, queue preserved + +backup: + rpo: 24h + rto: 30min + retention_days: 30 + +infra: + image: registry.yourplatform.com/certifai-dashboard + vm: vm-certifai + replicas: 1 + resource_limits: + cpu: "2000m" + memory: "4Gi" + health_check: + path: /health + interval: 30s + timeout: 5s + threshold: 3 + secrets: + - MONGODB_URI: /prod/certifai/MONGODB_URI + - KEYCLOAK_CLIENT_SECRET: /prod/certifai/KEYCLOAK_CLIENT_SECRET + - LITELLM_MASTER_KEY: /prod/certifai/LITELLM_MASTER_KEY + depends_on: + - keycloak + - mongodb + - infisical + +admin_actions: + - name: "Reset LiteLLM API Key" + description: "Rotates the per-tenant LiteLLM 
key" + endpoint: POST /v1/tenants/{id}/admin/rotate-litellm-key + confirm: required + audit_required: true + +observability: + metrics: /metrics + logs: + format: json + pii_redaction: true + audit_endpoint: tenant-registry.internal/audit +``` + +### 10.1 Manifest Variants by Frontend Type + +The example above shows an `interactive` product. Headless and widget products differ only in the `frontend` block. + +#### Widget variant + +```yaml +frontend: + type: widget + tag: status-monitor-widget + bundle_url: https://cdn.yourplatform.com/products/status/{version}/widget.js + bundle_size_kb: 38 + dimensions: + width: 400 + height: 240 + poll_interval_s: 60 + portal_config: + # same shape as headless (§ below) — used for click-through management page + sections: [api_keys, webhooks, usage, docs] + api_key_scopes: [...] + webhook_events: [...] +``` + +#### Headless variant (no frontend bundle) + +```yaml +frontend: + type: headless + # NO tag, NO bundle_url — the portal renders 100% of the customer UI + portal_config: + sections: + - api_keys + - webhooks + - usage + - code_samples + - docs + status_endpoint: /v1/status # optional; portal polls for status badge + api_key_scopes: + - id: read + description: "Read sessions and results" + - id: write + description: "Create new sessions" + - id: admin + description: "Manage settings (rare; consider before granting)" + webhook_events: + - name: session.completed + description: "Fires when a notetaker session is fully processed" + payload_schema_url: /schemas/session.completed.json + - name: session.failed + description: "Fires when a session cannot be processed" + payload_schema_url: /schemas/session.failed.json + code_samples: + - language: curl + title: "Create a session" + snippet: | + curl -X POST https://notetaker-api.yourplatform.com/v1/sessions \ + -H "Authorization: ApiKey k_xxx" \ + -H "X-Tenant: acme" \ + -d '{"audio_url": "...", "language": "en"}' + - language: python + title: "Create a session" + snippet: | + 
import requests + requests.post( + "https://notetaker-api.yourplatform.com/v1/sessions", + headers={"Authorization": "ApiKey k_xxx", "X-Tenant": "acme"}, + json={"audio_url": "...", "language": "en"}, + ) +``` + +The Tenant Registry validates the `frontend` block against the type: +- `interactive` requires `tag` and `bundle_url`; `portal_config` is optional +- `widget` requires `tag`, `bundle_url`, `dimensions`, AND `portal_config` +- `headless` MUST NOT declare `tag` or `bundle_url`; `portal_config` is required + +--- + +## 11. Service Token Model (Inter-Product Communication) + +Products can call each other directly. Auth is via short-lived service tokens issued by Keycloak's `client_credentials` flow. + +### 11.1 Flow + +``` +1. Compliance product needs to list AI agents for an AI Act assessment. + +2. Compliance backend requests a service token: + POST https://auth.yourplatform.com/realms/breakpilot-prod/protocol/openid-connect/token + Body: grant_type=client_credentials + client_id=compliance-svc + client_secret={from Infisical} + scope=read:certifai-agents + Response: JWT (15 min TTL) + +3. Compliance calls CERTifAI: + GET https://certifai-api.internal/v1/tenants/{tenant_id}/agents + Authorization: Bearer {service_token} + X-On-Behalf-Of-User: {user_sub} ← original user, for audit + X-Service-Reason: ai-act-assessment + +4. CERTifAI validates token: + - Issued by platform Keycloak: ok + - Audience includes "certifai-svc": ok + - Scopes include "read:certifai-agents": ok + - tenant_id in path matches caller's intent: ok (no cross-tenant) + +5. CERTifAI returns data. + +6. Both sides emit audit events: + {actor: "svc:compliance", action: "certifai.list_agents", + on_behalf_of: "user_sub", tenant_id: "...", reason: "ai-act-assessment"} +``` + +### 11.2 Scope Catalog + +Each service declares scopes it offers (other services can request these) and scopes it consumes (it needs from other services).
+ +``` +certifai offers: + read:certifai-agents + read:certifai-usage + write:certifai-settings (rare; consider before granting) + +compliance offers: + read:compliance-status + read:compliance-dsfa + write:compliance-events (for cross-product event emission) + +billing-service consumes: + read:certifai-usage + read:compliance-status + +compliance consumes: + read:certifai-agents (for AI Act assessments) +``` + +Scopes are granted in Keycloak per service client. Grants are reviewed quarterly. + +### 11.3 Third-Party Readiness + +When we open the platform to third parties: + +``` +- Same OIDC client_credentials flow +- Manifests are SIGNED by third-party developer keys (signature verified by Tenant Registry) +- Third-party scopes are read-only by default; write scopes require manual approval +- Network isolation: third-party services run in a separate Orca subnet +- Resource limits enforced (CPU, memory, network egress) +- Per-tenant install requires explicit IT Admin consent (OAuth consent screen) +``` + +The contract surface today is the same — we just add verification gates. + +--- + +## 12. Versioning and Contract Evolution + +### 12.1 Versions in play + +``` +contract_version This document. Updated when the platform changes what products + must implement. Currently 1.0. Bumped on breaking changes. + +product_version The product's own version (semver). Tracked by Tenant Registry. + Independent of contract version. + +api_version The version in URL paths (/v1/, /v2/). Within a contract version, + a product may have multiple API versions live. +``` + +### 12.2 Platform supports N and N-1 + +The platform always supports the current contract version and the previous one. Deprecation announced in this doc before any breaking change. + +### 12.3 Breaking Change Process + +``` +1. Announce in this doc (one section per breaking change with motivation). +2. Update contract_version, e.g. 1.0 → 2.0. +3. New products required to ship 2.0 from day one. +4. 
Existing products get 12 months to migrate. +5. After 12 months, 1.0 retired; tenants on 1.0 products are migrated or churned. +``` + +--- + +## 13. Onboarding Checklist for a New Product + +A product is "ready to ship to a customer" when all boxes are ticked. + +``` +☐ Backend API + ☐ openapi.yaml committed and validated + ☐ Mandatory endpoints implemented (§4.1) + ☐ JWT validation via Keycloak JWKS + ☐ Service token validation + ☐ Tenant scoping enforced in middleware + tested + ☐ /v1/tenants/{id}/provision idempotency test passes + ☐ /v1/tenants/{id}/export produces valid GDPR-compliant ZIP + ☐ DELETE /v1/tenants/{id}/data is irreversible and audited + +☐ Frontend (manifest declares one of: interactive | widget | headless) + + For frontend.type = interactive: + ☐ Custom element registered with declared tag + ☐ Bundle published to CDN (≤ 500KB gzipped) + ☐ Handles all required attributes (§5.A.2) + ☐ Emits all event types (§5.A.3) + ☐ Light + dark theme support (§5.A.4) + ☐ At least one locale beyond English + + For frontend.type = widget: + ☐ Widget custom element registered with declared tag + ☐ Bundle published to CDN (≤ 50KB gzipped) + ☐ Tile dimensions declared in manifest + ☐ Allowed events only (no breakpilot:navigate) + ☐ portal_config block complete (for click-through page) + + For frontend.type = headless: + ☐ NO tag and NO bundle_url declared + ☐ portal_config.sections declared + ☐ portal_config.api_key_scopes catalog complete + ☐ portal_config.webhook_events catalog with payload schemas + ☐ portal_config.code_samples in at least one language + ☐ Webhook payloads include HMAC signature for verification + ☐ Status endpoint returns valid format (if declared) + ☐ POST /internal/api-keys/verify integration tested with Tenant Registry + ☐ POST /internal/webhooks/dispatch integration tested with portal + +☐ MCP (if Enterprise plan or applicable) + ☐ MCP server deployed + ☐ Tool catalog declared in manifest + ☐ API key authentication implemented + ☐ All tools 
tenant-scoped and audited + +☐ Documentation + ☐ README published at developers.yourplatform.com/products/{name} + ☐ API reference auto-generated and live + ☐ Integration guide for customer IT + ☐ Operational runbook for us + ☐ Data model + GDPR retention table + +☐ Observability + ☐ /health implemented and returns valid format + ☐ /metrics in Prometheus format + ☐ JSON structured logging + ☐ Audit events emitted for all listed categories + ☐ No PII in logs (PII redaction tested) + +☐ Identity integration + ☐ Keycloak OIDC client registered + ☐ Role mappings declared and tested + ☐ Entitlement key included in tenant JWTs (verified end-to-end) + +☐ Control integration + ☐ product.manifest.yaml committed + ☐ Registered with Tenant Registry catalog + ☐ Lifecycle endpoints tested via Backstage "Create Test Tenant" + ☐ Usage endpoint returns valid format + ☐ Backstage admin actions render correctly + +☐ Data integration + ☐ All tables/collections have tenant_id + ☐ Cross-tenant query test (negative test) passes + ☐ Backup contract declared and Infra Plane is executing it + ☐ GDPR export tested with real data + ☐ Data residency confirmed (no exfiltration outside EU) + +☐ Infra integration + ☐ Orca manifest committed and applies cleanly + ☐ Dockerfile builds reproducibly + ☐ All secrets in Infisical (zero hardcoded) + ☐ Gitea Actions pipeline green + ☐ Resource limits set and tested under load + ☐ Cold start dependency order declared +``` + +--- + +## 14. Gap Analysis — Existing Products + +### CERTifAI vs. 
Contract 1.0 + +``` +✓ OIDC via Keycloak — already implemented +✓ Role data model (Admin/Member/Viewer) — exists +✗ Mandatory endpoints — NONE of §4.1 implemented yet +✗ Frontend as web component — currently a full Dioxus fullstack app +✗ MCP server — not implemented +✗ Tenant scoping in queries — only chat is user-scoped, no org_id scoping +✗ Service token validation — not implemented +✗ GDPR export/erasure — not implemented +✗ /health, /metrics, structured audit emission — not implemented +✓ Orca + Infisical compatible — already deployed this way + +Effort estimate: 4-6 weeks of focused work +``` + +### breakpilot-compliance vs. Contract 1.0 + +``` +✓ Multi-tenant via X-Tenant-ID — exists (needs JWT validation upgrade) +✓ Modular Next.js frontend — close to web-component-wrappable +✗ Mandatory endpoints — partially implemented (usage endpoint missing) +✗ JWT validation at proxy — currently raw header trust +✗ Frontend as web component — needs wrapping with @r2wc/react-to-web-component +✗ MCP server — not implemented +✓ Backup contract — declared informally, needs to be in manifest +✗ GDPR export/erasure — partial (DSR module exists, doesn't cover whole tenant) +✓ Observability — partial (structured logs, no /metrics) + +Effort estimate: 3-5 weeks of focused work +``` + +--- + +## 15. Open Items + +``` +- Design tokens package (@breakpilot/design-tokens) — needs to exist before web components ship +- CDN for product bundles — pick provider (Hetzner Object Storage + Cloudflare?) +- MCP gateway — single mcp.yourplatform.com vs. 
per-product subdomains +- Third-party manifest signing — defer until first real third-party conversation +- Inter-product event bus — explicitly deferred; service tokens cover the use cases for now +- Contract testing — automate manifest + openapi validation in Gitea Actions +- Customer-facing catalog UI — defined at /[tenant]/catalog (see PLATFORM_ARCHITECTURE.md + §5a operating principles); Backstage product picker reuses same catalog metadata. + +OSS swap-in points (designed-for, not adopted yet): +- Audit log storage: BoxyHQ Retraced — our event schema is Retraced-shape (§8.4), + swap when audit query patterns outgrow PostgreSQL or when a customer asks for + exportable SOC2-grade audit retention. +- Usage metering: Lago — our /v1/usage endpoint plus optional per-event stream + (§4.1) is Lago-compatible. Swap when LiteLLM token billing requires real-time + metering or per-customer pricing tiers we cannot model in Stripe. +- Customer IdP federation (SCIM): BoxyHQ Jackson or Keycloak's SCIM module. + Adopt when first enterprise customer asks for automated user provisioning. +- Feature flags / per-tenant feature gating: OpenFeature (vendor-neutral). + Adopt when product features need finer-than-plan-tier gating per tenant. +``` + +--- + +*End of document. Contract version 1.0. Next review: after first product (CERTifAI or compliance) achieves full compliance with §13 checklist.* diff --git a/README.md b/README.md index 5c1de97..65c2c09 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,59 @@ # docs -Platform-wide architecture, integration spec, runbooks. \ No newline at end of file +Platform-wide architecture, integration spec, runbooks. + +> Part of the **Breakpilot Platform**. 
For the big picture see [`platform/docs`](https://gitea.meghsakha.com/platform/docs): +> [Architecture](https://gitea.meghsakha.com/platform/docs/src/branch/main/PLATFORM_ARCHITECTURE.md) · +> [Infrastructure](https://gitea.meghsakha.com/platform/docs/src/branch/main/INFRASTRUCTURE.md) · +> [Product Integration Spec](https://gitea.meghsakha.com/platform/docs/src/branch/main/PRODUCT_INTEGRATION_SPEC.md) · +> [Implementation Plan](https://gitea.meghsakha.com/platform/docs/src/branch/main/IMPLEMENTATION_PLAN.md) + +## What this is + +Platform-wide architecture, integration spec, runbooks. Scaffolded under milestone M0.1. See [`platform/docs`](https://gitea.meghsakha.com/platform/docs) for the full architecture context. + +**Plane:** Control +**Owner:** @sharang +**Status:** pre-alpha +**Linked milestone:** [M0.1](https://gitea.meghsakha.com/platform/docs/src/branch/main/IMPLEMENTATION_PLAN.md) + +## Run locally + +```bash +# prerequisites: see CONTRIBUTING.md for tooling once code lands +make dev # starts dependencies + this service on http://localhost:3000 +make test # unit + integration +make e2e # only if this repo ships user-facing flows +``` + +Local secrets come from `.env.local` (gitignored). Template at `.env.example`. + +## Endpoints / surface + +{{For services: list the top-level routes or commands. +For libraries: list the public API entry points. +For IaC: list the make targets.}} + +## Deployment + +| Env | URL | How | +|---|---|---| +| dev | `http://localhost:3000` | `make dev` | +| stage | `https://docs.stage.yourplatform.com` | auto on merge to `main` | +| prod | `https://docs.yourplatform.com` | manual: tag `vX.Y.Z` + sign-off | + +Rollback: `orca rollout undo docs --env={{env}}`. 
+ +## Observability + +- Traces, logs, metrics: [SigNoz](https://signoz.meghsakha.com) — service name `docs` +- Audit events: Tenant Registry `/audit` (Retraced-shape schema) +- On-call: `oncall@yourplatform.com` · runbook at `platform/docs/runbooks/docs.md` + +## Contributing + +See [`CONTRIBUTING.md`](./CONTRIBUTING.md). TL;DR: branch from main, open a PR, 1 review + green CI, squash-merge. + +## License + +MIT — see [`LICENSE`](./LICENSE). diff --git a/cliff.toml b/cliff.toml new file mode 100644 index 0000000..134be61 --- /dev/null +++ b/cliff.toml @@ -0,0 +1,39 @@ +# git-cliff config — generates release notes from Conventional Commits. +# Preset: keepachangelog. + +[changelog] +header = """ +# Changelog + +All notable changes to this repo. Format: [Keep a Changelog](https://keepachangelog.com/). +""" +body = """ +{% if version %}\ +## [{{ version | trim_start_matches(pat="v") }}] - {{ timestamp | date(format="%Y-%m-%d") }} +{% else %}\ +## [Unreleased] +{% endif %}\ +{% for group, commits in commits | group_by(attribute="group") %} +### {{ group | upper_first }} +{% for commit in commits %} +- {{ commit.message | upper_first }}\ +{% endfor %} +{% endfor %} +""" +trim = true + +[git] +conventional_commits = true +filter_unconventional = true +commit_parsers = [ + { message = "^feat", group = "Added" }, + { message = "^fix", group = "Fixed" }, + { message = "^perf", group = "Changed" }, + { message = "^refactor", group = "Changed" }, + { message = "^docs", group = "Docs" }, + { message = "^chore", skip = true }, + { message = "^ci", skip = true }, + { message = "^test", skip = true }, +] +filter_commits = true +tag_pattern = "v[0-9]*" diff --git a/commitlint.config.cjs b/commitlint.config.cjs new file mode 100644 index 0000000..03965a1 --- /dev/null +++ b/commitlint.config.cjs @@ -0,0 +1,32 @@ +// commitlint.config.cjs — Conventional Commits enforcement for every repo. +// Used by .gitea/workflows/ci-*.yaml `wagoid/commitlint-github-action`. 
+ +module.exports = { + extends: ['@commitlint/config-conventional'], + rules: { + 'type-enum': [2, 'always', [ + 'feat', // new feature + 'fix', // bug fix + 'docs', // documentation + 'chore', // tooling, deps, no production code change + 'refactor', // refactor with no behavior change + 'test', // tests only + 'perf', // performance + 'build', // build system, Dockerfile + 'ci', // CI config + 'revert', // revert a prior commit + ]], + 'subject-case': [2, 'always', 'sentence-case'], + 'subject-max-length': [2, 'always', 72], + 'body-max-line-length': [1, 'always', 100], + 'footer-leading-blank': [2, 'always'], + 'references-empty': [1, 'never'], // warn if no Refs: M1.2 footer + }, + parserPreset: { + parserOpts: { + // Capture milestone references: "Refs: M5.2" or "Closes: M5.2" + referenceActions: ['close', 'closes', 'closed', 'fix', 'fixes', 'fixed', 'refs', 'ref'], + issuePrefixes: ['M', '#'], + }, + }, +};