feat(pipeline): v3 — scoped control applicability + source_type classification
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 36s
CI/CD / test-python-document-crawler (push) Successful in 27s
CI/CD / test-python-dsms-gateway (push) Successful in 18s
CI/CD / validate-canonical-controls (push) Successful in 11s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 36s
CI/CD / test-python-document-crawler (push) Successful in 27s
CI/CD / test-python-dsms-gateway (push) Successful in 18s
CI/CD / validate-canonical-controls (push) Successful in 11s
CI/CD / Deploy (push) Has been skipped
Phase 4: source_type (law/guideline/standard/restricted) on source_citation
- NIST/OWASP/ENISA correctly shown as "Standard" instead of "Gesetzliche Grundlage"
- Dynamic frontend labels based on source_type
- Backfill endpoint POST /v1/canonical/generate/backfill-source-type

Phase v3: Scoped Control Applicability
- 3 new fields: applicable_industries, applicable_company_size, scope_conditions
- LLM prompt extended with 39 industries, 5 company sizes, 10 scope signals
- All 5 generation paths (Rule 1/2/3, batch structure, batch reform) updated
- _build_control_from_json: parsing + validation (string→list, size validation)
- _store_control: writes 3 new JSONB columns
- API: response models, create/update requests, SELECT queries extended
- Migration 063: 3 new JSONB columns with GIN indexes
- 110 generator tests + 28 route tests = 138 total, all passing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -82,6 +82,9 @@ class ControlResponse(BaseModel):
|
||||
target_audience: Optional[str] = None
|
||||
generation_metadata: Optional[dict] = None
|
||||
generation_strategy: Optional[str] = "ungrouped"
|
||||
applicable_industries: Optional[list] = None
|
||||
applicable_company_size: Optional[list] = None
|
||||
scope_conditions: Optional[dict] = None
|
||||
created_at: str
|
||||
updated_at: str
|
||||
|
||||
@@ -111,6 +114,9 @@ class ControlCreateRequest(BaseModel):
|
||||
category: Optional[str] = None
|
||||
target_audience: Optional[str] = None
|
||||
generation_metadata: Optional[dict] = None
|
||||
applicable_industries: Optional[list] = None
|
||||
applicable_company_size: Optional[list] = None
|
||||
scope_conditions: Optional[dict] = None
|
||||
|
||||
|
||||
class ControlUpdateRequest(BaseModel):
|
||||
@@ -136,6 +142,9 @@ class ControlUpdateRequest(BaseModel):
|
||||
category: Optional[str] = None
|
||||
target_audience: Optional[str] = None
|
||||
generation_metadata: Optional[dict] = None
|
||||
applicable_industries: Optional[list] = None
|
||||
applicable_company_size: Optional[list] = None
|
||||
scope_conditions: Optional[dict] = None
|
||||
|
||||
|
||||
class SimilarityCheckRequest(BaseModel):
|
||||
@@ -164,6 +173,7 @@ _CONTROL_COLS = """id, framework_id, control_id, title, objective, rationale,
|
||||
license_rule, source_original_text, source_citation,
|
||||
customer_visible, verification_method, category,
|
||||
target_audience, generation_metadata, generation_strategy,
|
||||
applicable_industries, applicable_company_size, scope_conditions,
|
||||
created_at, updated_at"""
|
||||
|
||||
|
||||
@@ -511,7 +521,8 @@ async def create_control(body: ControlCreateRequest):
|
||||
open_anchors, release_state, tags,
|
||||
license_rule, source_original_text, source_citation,
|
||||
customer_visible, verification_method, category,
|
||||
target_audience, generation_metadata
|
||||
target_audience, generation_metadata,
|
||||
applicable_industries, applicable_company_size, scope_conditions
|
||||
) VALUES (
|
||||
:fw_id, :cid, :title, :objective, :rationale,
|
||||
CAST(:scope AS jsonb), CAST(:requirements AS jsonb),
|
||||
@@ -521,7 +532,10 @@ async def create_control(body: ControlCreateRequest):
|
||||
:license_rule, :source_original_text,
|
||||
CAST(:source_citation AS jsonb),
|
||||
:customer_visible, :verification_method, :category,
|
||||
:target_audience, CAST(:generation_metadata AS jsonb)
|
||||
:target_audience, CAST(:generation_metadata AS jsonb),
|
||||
CAST(:applicable_industries AS jsonb),
|
||||
CAST(:applicable_company_size AS jsonb),
|
||||
CAST(:scope_conditions AS jsonb)
|
||||
)
|
||||
RETURNING {_CONTROL_COLS}
|
||||
"""),
|
||||
@@ -550,6 +564,9 @@ async def create_control(body: ControlCreateRequest):
|
||||
"category": body.category,
|
||||
"target_audience": body.target_audience,
|
||||
"generation_metadata": _json.dumps(body.generation_metadata) if body.generation_metadata else None,
|
||||
"applicable_industries": _json.dumps(body.applicable_industries) if body.applicable_industries else None,
|
||||
"applicable_company_size": _json.dumps(body.applicable_company_size) if body.applicable_company_size else None,
|
||||
"scope_conditions": _json.dumps(body.scope_conditions) if body.scope_conditions else None,
|
||||
},
|
||||
).fetchone()
|
||||
db.commit()
|
||||
@@ -778,6 +795,9 @@ def _control_row(r) -> dict:
|
||||
"target_audience": r.target_audience,
|
||||
"generation_metadata": r.generation_metadata,
|
||||
"generation_strategy": getattr(r, "generation_strategy", "ungrouped"),
|
||||
"applicable_industries": getattr(r, "applicable_industries", None),
|
||||
"applicable_company_size": getattr(r, "applicable_company_size", None),
|
||||
"scope_conditions": getattr(r, "scope_conditions", None),
|
||||
"created_at": r.created_at.isoformat() if r.created_at else None,
|
||||
"updated_at": r.updated_at.isoformat() if r.updated_at else None,
|
||||
}
|
||||
|
||||
@@ -28,6 +28,7 @@ from compliance.services.control_generator import (
|
||||
ALL_COLLECTIONS,
|
||||
VALID_CATEGORIES,
|
||||
VALID_DOMAINS,
|
||||
_classify_regulation,
|
||||
_detect_category,
|
||||
_detect_domain,
|
||||
_llm_local,
|
||||
@@ -978,3 +979,122 @@ async def get_domain_backfill_status(backfill_id: str):
|
||||
if not status:
|
||||
raise HTTPException(status_code=404, detail="Domain backfill job not found")
|
||||
return status
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Source-Type Backfill — Classify law vs guideline vs standard vs restricted
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class SourceTypeBackfillRequest(BaseModel):
    """Request body for POST /generate/backfill-source-type.

    ``dry_run`` defaults to True, so a request that omits it only previews
    the classification and writes nothing to the database.
    """

    dry_run: bool = True
|
||||
|
||||
|
||||
# In-memory, per-process registry of source-type backfill jobs, keyed by
# backfill_id. Each value is the latest status dict for that job; it is
# written by the start endpoint and the backfill coroutine and read by the
# status endpoint.
_source_type_backfill_status: dict = {}
|
||||
|
||||
|
||||
def _load_json_object(raw) -> dict:
    """Return *raw* as a dict, decoding it from JSON text when necessary.

    Raises:
        ValueError: if *raw* is malformed JSON or decodes to a non-object.
        TypeError: if *raw* is neither a dict, a string/bytes value, nor falsy.
    """
    if isinstance(raw, dict):
        return raw
    parsed = json.loads(raw or "{}")
    if not isinstance(parsed, dict):
        raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
    return parsed


async def _run_source_type_backfill(dry_run: bool, backfill_id: str):
    """Backfill ``source_type`` into the ``source_citation`` JSONB of controls.

    Selects every control whose ``source_citation`` is set but has no (or an
    empty) ``source_type``, classifies it via ``_classify_regulation`` using
    ``generation_metadata["source_regulation"]``, and writes the result back
    unless ``dry_run`` is true. Progress and the final result are published
    in ``_source_type_backfill_status[backfill_id]``.

    Args:
        dry_run: When true, only count what would be updated; nothing is
            written and the distribution in the final status is empty.
        backfill_id: Key under which the job status is stored.

    Returns:
        None. The outcome is reported through the status registry: a
        "completed" entry with counts, distribution and up to 50 error
        strings, or a "failed" entry carrying the exception text.
    """
    db = SessionLocal()
    try:
        # Find controls with source_citation that lack source_type
        rows = db.execute(text("""
            SELECT control_id, source_citation, generation_metadata
            FROM compliance.canonical_controls
            WHERE source_citation IS NOT NULL
              AND (source_citation->>'source_type' IS NULL
                   OR source_citation->>'source_type' = '')
        """)).fetchall()

        total = len(rows)
        updated = 0
        errors = []

        # The same dict object is mutated below so that pollers see progress.
        progress = {
            "status": "running", "total": total, "updated": 0, "dry_run": dry_run,
        }
        _source_type_backfill_status[backfill_id] = progress

        for row in rows:
            cid = row[0]

            # A single bad row is recorded and skipped instead of aborting
            # the whole job.
            try:
                citation = _load_json_object(row[1])
                metadata = _load_json_object(row[2])
            except (TypeError, ValueError) as exc:
                errors.append(f"{cid}: invalid JSON ({exc})")
                continue

            # Get regulation_code from metadata
            reg_code = metadata.get("source_regulation", "")
            if not reg_code:
                errors.append(f"{cid}: no source_regulation in metadata")
                continue

            # Classify
            license_info = _classify_regulation(reg_code)
            source_type = license_info.get("source_type", "restricted")

            # Update citation
            citation["source_type"] = source_type

            if not dry_run:
                # Explicit CAST keeps this consistent with the other JSONB
                # writes and does not rely on driver-specific coercion of a
                # text parameter.
                # NOTE(review): rows are matched on control_id; confirm that
                # control_id is unique across frameworks, otherwise match on
                # the primary key instead.
                db.execute(text("""
                    UPDATE compliance.canonical_controls
                    SET source_citation = CAST(:citation AS jsonb)
                    WHERE control_id = :cid
                """), {"citation": json.dumps(citation), "cid": cid})

            updated += 1

            if updated % 100 == 0:
                if not dry_run:
                    db.commit()
                progress["updated"] = updated
                # Yield to the event loop so the status endpoint can answer
                # while a long backfill is running.
                await asyncio.sleep(0)

        if not dry_run:
            db.commit()

        # Count distribution (only meaningful after a real run)
        dist_query = db.execute(text("""
            SELECT source_citation->>'source_type' as st, COUNT(*)
            FROM compliance.canonical_controls
            WHERE source_citation IS NOT NULL
              AND source_citation->>'source_type' IS NOT NULL
            GROUP BY st
        """)).fetchall() if not dry_run else []

        distribution = {r[0]: r[1] for r in dist_query}

        _source_type_backfill_status[backfill_id] = {
            "status": "completed", "total": total, "updated": updated,
            "dry_run": dry_run, "distribution": distribution,
            "errors": errors[:50],
        }
        logger.info("Source-type backfill %s completed: %d/%d updated (dry_run=%s)",
                    backfill_id, updated, total, dry_run)

    except Exception as e:
        # Top-level boundary of a background job: log with traceback and
        # publish the failure instead of letting the task die silently.
        logger.exception("Source-type backfill %s failed: %s", backfill_id, e)
        _source_type_backfill_status[backfill_id] = {"status": "failed", "error": str(e)}
    finally:
        db.close()
|
||||
|
||||
|
||||
# Strong references to running backfill tasks. The event loop only keeps weak
# references, so a task whose result is discarded may be garbage-collected
# before it finishes.
_source_type_backfill_tasks: set = set()


@router.post("/generate/backfill-source-type")
async def start_source_type_backfill(req: SourceTypeBackfillRequest):
    """Backfill source_type (law/guideline/standard/restricted) into source_citation JSONB.

    Classifies each control's source as binding law, authority guideline,
    voluntary standard, or restricted norm based on regulation_code.
    Default is dry_run=True (preview only).

    The work runs as a background task; the response returns immediately
    with a ``backfill_id`` that can be polled on the status endpoint.
    """
    import uuid

    backfill_id = str(uuid.uuid4())[:8]
    _source_type_backfill_status[backfill_id] = {"status": "starting"}

    task = asyncio.create_task(_run_source_type_backfill(req.dry_run, backfill_id))
    # Keep the task alive until it completes, then drop the reference.
    _source_type_backfill_tasks.add(task)
    task.add_done_callback(_source_type_backfill_tasks.discard)

    return {
        "status": "running",
        "backfill_id": backfill_id,
        "message": f"Source-type backfill started. Poll /generate/source-type-backfill-status/{backfill_id}",
    }
|
||||
|
||||
|
||||
@router.get("/generate/source-type-backfill-status/{backfill_id}")
async def get_source_type_backfill_status(backfill_id: str):
    """Return the recorded status of a source-type backfill job.

    Raises:
        HTTPException: 404 when no status is stored for ``backfill_id``.
    """
    job_state = _source_type_backfill_status.get(backfill_id)
    if job_state:
        return job_state
    raise HTTPException(status_code=404, detail="Source-type backfill job not found")
|
||||
|
||||
Reference in New Issue
Block a user