feat(pipeline): v3 — scoped control applicability + source_type classification
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 36s
CI/CD / test-python-document-crawler (push) Successful in 27s
CI/CD / test-python-dsms-gateway (push) Successful in 18s
CI/CD / validate-canonical-controls (push) Successful in 11s
CI/CD / Deploy (push) Has been skipped

Phase 4: source_type (law/guideline/standard/restricted) on source_citation
- NIST/OWASP/ENISA are now correctly shown as "Standard" instead of "Gesetzliche Grundlage" (German for "legal basis")
- Dynamic frontend labels based on source_type
- Backfill endpoint POST /v1/canonical/generate/backfill-source-type

Phase v3: Scoped Control Applicability
- 3 new fields: applicable_industries, applicable_company_size, scope_conditions
- LLM prompt extended with 39 industries, 5 company sizes, 10 scope signals
- All 5 generation paths (Rule 1/2/3, batch structure, batch reform) updated
- _build_control_from_json: parsing + validation (string→list, size validation)
- _store_control: writes 3 new JSONB columns
- API: response models, create/update requests, SELECT queries extended
- Migration 063: 3 new JSONB columns with GIN indexes
- 110 generator tests + 28 route tests = 138 total, all passing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-18 16:28:05 +01:00
parent 3bb9fffab6
commit f2819b99af
9 changed files with 685 additions and 139 deletions

View File

@@ -82,6 +82,9 @@ class ControlResponse(BaseModel):
target_audience: Optional[str] = None
generation_metadata: Optional[dict] = None
generation_strategy: Optional[str] = "ungrouped"
applicable_industries: Optional[list] = None
applicable_company_size: Optional[list] = None
scope_conditions: Optional[dict] = None
created_at: str
updated_at: str
@@ -111,6 +114,9 @@ class ControlCreateRequest(BaseModel):
category: Optional[str] = None
target_audience: Optional[str] = None
generation_metadata: Optional[dict] = None
applicable_industries: Optional[list] = None
applicable_company_size: Optional[list] = None
scope_conditions: Optional[dict] = None
class ControlUpdateRequest(BaseModel):
@@ -136,6 +142,9 @@ class ControlUpdateRequest(BaseModel):
category: Optional[str] = None
target_audience: Optional[str] = None
generation_metadata: Optional[dict] = None
applicable_industries: Optional[list] = None
applicable_company_size: Optional[list] = None
scope_conditions: Optional[dict] = None
class SimilarityCheckRequest(BaseModel):
@@ -164,6 +173,7 @@ _CONTROL_COLS = """id, framework_id, control_id, title, objective, rationale,
license_rule, source_original_text, source_citation,
customer_visible, verification_method, category,
target_audience, generation_metadata, generation_strategy,
applicable_industries, applicable_company_size, scope_conditions,
created_at, updated_at"""
@@ -511,7 +521,8 @@ async def create_control(body: ControlCreateRequest):
open_anchors, release_state, tags,
license_rule, source_original_text, source_citation,
customer_visible, verification_method, category,
target_audience, generation_metadata
target_audience, generation_metadata,
applicable_industries, applicable_company_size, scope_conditions
) VALUES (
:fw_id, :cid, :title, :objective, :rationale,
CAST(:scope AS jsonb), CAST(:requirements AS jsonb),
@@ -521,7 +532,10 @@ async def create_control(body: ControlCreateRequest):
:license_rule, :source_original_text,
CAST(:source_citation AS jsonb),
:customer_visible, :verification_method, :category,
:target_audience, CAST(:generation_metadata AS jsonb)
:target_audience, CAST(:generation_metadata AS jsonb),
CAST(:applicable_industries AS jsonb),
CAST(:applicable_company_size AS jsonb),
CAST(:scope_conditions AS jsonb)
)
RETURNING {_CONTROL_COLS}
"""),
@@ -550,6 +564,9 @@ async def create_control(body: ControlCreateRequest):
"category": body.category,
"target_audience": body.target_audience,
"generation_metadata": _json.dumps(body.generation_metadata) if body.generation_metadata else None,
"applicable_industries": _json.dumps(body.applicable_industries) if body.applicable_industries else None,
"applicable_company_size": _json.dumps(body.applicable_company_size) if body.applicable_company_size else None,
"scope_conditions": _json.dumps(body.scope_conditions) if body.scope_conditions else None,
},
).fetchone()
db.commit()
@@ -778,6 +795,9 @@ def _control_row(r) -> dict:
"target_audience": r.target_audience,
"generation_metadata": r.generation_metadata,
"generation_strategy": getattr(r, "generation_strategy", "ungrouped"),
"applicable_industries": getattr(r, "applicable_industries", None),
"applicable_company_size": getattr(r, "applicable_company_size", None),
"scope_conditions": getattr(r, "scope_conditions", None),
"created_at": r.created_at.isoformat() if r.created_at else None,
"updated_at": r.updated_at.isoformat() if r.updated_at else None,
}

View File

@@ -28,6 +28,7 @@ from compliance.services.control_generator import (
ALL_COLLECTIONS,
VALID_CATEGORIES,
VALID_DOMAINS,
_classify_regulation,
_detect_category,
_detect_domain,
_llm_local,
@@ -978,3 +979,122 @@ async def get_domain_backfill_status(backfill_id: str):
if not status:
raise HTTPException(status_code=404, detail="Domain backfill job not found")
return status
# ---------------------------------------------------------------------------
# Source-Type Backfill — Classify law vs guideline vs standard vs restricted
# ---------------------------------------------------------------------------
class SourceTypeBackfillRequest(BaseModel):
    """Request body for the source-type backfill endpoint."""

    # Defaults to True so a request without an explicit flag only previews
    # the backfill and does not write to the database.
    dry_run: bool = True
# Module-level (in-memory) registry of source-type backfill jobs, keyed by the
# short backfill_id; each value is the latest status dict for that job.
_source_type_backfill_status: dict = {}
async def _run_source_type_backfill(dry_run: bool, backfill_id: str):
    """Backfill source_type into source_citation JSONB for all controls.

    Selects every canonical control whose ``source_citation`` has no (or an
    empty) ``source_type``, classifies it via ``_classify_regulation`` using
    ``generation_metadata["source_regulation"]`` and writes the result back
    into the citation.

    Args:
        dry_run: When True nothing is written; ``updated`` then counts the
            rows that *would* be updated and ``distribution`` stays empty.
        backfill_id: Key under which progress and the final result are
            published in ``_source_type_backfill_status``.

    Never raises: any exception is logged and recorded as a ``"failed"``
    status entry for ``backfill_id``.

    NOTE(review): this coroutine contains no ``await``; the database calls run
    synchronously inside it.
    """
    db = SessionLocal()
    try:
        # Find controls with source_citation that lack source_type
        rows = db.execute(text("""
            SELECT control_id, source_citation, generation_metadata
            FROM compliance.canonical_controls
            WHERE source_citation IS NOT NULL
              AND (source_citation->>'source_type' IS NULL
                   OR source_citation->>'source_type' = '')
        """)).fetchall()

        total = len(rows)
        updated = 0
        errors = []
        _source_type_backfill_status[backfill_id] = {
            "status": "running", "total": total, "updated": 0, "dry_run": dry_run,
        }

        for row in rows:
            cid = row[0]

            # The driver may hand back JSONB as a dict or as a raw string;
            # a row that cannot be decoded is reported instead of aborting
            # the whole job.
            try:
                citation = row[1] if isinstance(row[1], dict) else json.loads(row[1] or "{}")
                metadata = row[2] if isinstance(row[2], dict) else json.loads(row[2] or "{}")
            except (TypeError, ValueError) as exc:
                errors.append(f"{cid}: unparsable JSON ({exc})")
                continue
            if not isinstance(citation, dict) or not isinstance(metadata, dict):
                errors.append(f"{cid}: source_citation/generation_metadata is not a JSON object")
                continue

            # Without a regulation code there is nothing to classify.
            reg_code = metadata.get("source_regulation", "")
            if not reg_code:
                errors.append(f"{cid}: no source_regulation in metadata")
                continue

            # Classify; fall back to "restricted" when no source_type is returned.
            license_info = _classify_regulation(reg_code)
            citation["source_type"] = license_info.get("source_type", "restricted")

            if not dry_run:
                # Explicit CAST keeps the bind type unambiguous, matching the
                # CAST(... AS jsonb) style used for other JSONB writes.
                db.execute(text("""
                    UPDATE compliance.canonical_controls
                    SET source_citation = CAST(:citation AS jsonb)
                    WHERE control_id = :cid
                """), {"citation": json.dumps(citation), "cid": cid})
            updated += 1

            # Every 100 processed rows: flush the batch and publish progress
            # so that polling clients see the counter move.
            if updated % 100 == 0:
                if not dry_run:
                    db.commit()
                _source_type_backfill_status[backfill_id] = {
                    "status": "running", "total": total,
                    "updated": updated, "dry_run": dry_run,
                }

        distribution = {}
        if not dry_run:
            db.commit()
            # Count distribution
            dist_rows = db.execute(text("""
                SELECT source_citation->>'source_type' as st, COUNT(*)
                FROM compliance.canonical_controls
                WHERE source_citation IS NOT NULL
                  AND source_citation->>'source_type' IS NOT NULL
                GROUP BY st
            """)).fetchall()
            distribution = {r[0]: r[1] for r in dist_rows}

        _source_type_backfill_status[backfill_id] = {
            "status": "completed", "total": total, "updated": updated,
            "dry_run": dry_run, "distribution": distribution,
            "errors": errors[:50],  # cap the error list kept in the status entry
        }
        logger.info("Source-type backfill %s completed: %d/%d updated (dry_run=%s)",
                    backfill_id, updated, total, dry_run)
    except Exception as e:
        # Top-level boundary of a background job: record the failure for the
        # status endpoint and keep the traceback in the log.
        logger.exception("Source-type backfill %s failed: %s", backfill_id, e)
        _source_type_backfill_status[backfill_id] = {"status": "failed", "error": str(e)}
    finally:
        db.close()
# Strong references to in-flight backfill tasks. The event loop only keeps
# weak references to tasks, so a task whose result is discarded may be
# garbage-collected before it finishes.
_source_type_backfill_tasks: set = set()


@router.post("/generate/backfill-source-type")
async def start_source_type_backfill(req: SourceTypeBackfillRequest):
    """Backfill source_type (law/guideline/standard/restricted) into source_citation JSONB.

    Classifies each control's source as binding law, authority guideline,
    voluntary standard, or restricted norm based on regulation_code.
    Default is dry_run=True (preview only).

    Returns immediately with a ``backfill_id``; the work itself runs as a
    background task whose progress is available from the status endpoint.
    """
    import uuid

    backfill_id = str(uuid.uuid4())[:8]
    _source_type_backfill_status[backfill_id] = {"status": "starting"}

    task = asyncio.create_task(_run_source_type_backfill(req.dry_run, backfill_id))
    # Hold the task until it is done, then drop the reference automatically.
    _source_type_backfill_tasks.add(task)
    task.add_done_callback(_source_type_backfill_tasks.discard)

    return {
        "status": "running",
        "backfill_id": backfill_id,
        "message": f"Source-type backfill started. Poll /generate/source-type-backfill-status/{backfill_id}",
    }
@router.get("/generate/source-type-backfill-status/{backfill_id}")
async def get_source_type_backfill_status(backfill_id: str):
    """Return the recorded status of one source-type backfill job.

    Raises:
        HTTPException: 404 when no (non-empty) status entry exists for
            ``backfill_id``.
    """
    job_state = _source_type_backfill_status.get(backfill_id)
    if job_state:
        return job_state
    raise HTTPException(status_code=404, detail="Source-type backfill job not found")