feat(pipeline): v3 — scoped control applicability + source_type classification
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 36s
CI/CD / test-python-document-crawler (push) Successful in 27s
CI/CD / test-python-dsms-gateway (push) Successful in 18s
CI/CD / validate-canonical-controls (push) Successful in 11s
CI/CD / Deploy (push) Has been skipped

Phase 4: source_type (law/guideline/standard/restricted) on source_citation
- NIST/OWASP/ENISA correctly shown as "Standard" instead of "Gesetzliche Grundlage"
- Dynamic frontend labels based on source_type
- Backfill endpoint POST /v1/canonical/generate/backfill-source-type

Phase v3: Scoped Control Applicability
- 3 new fields: applicable_industries, applicable_company_size, scope_conditions
- LLM prompt extended with 39 industries, 5 company sizes, 10 scope signals
- All 5 generation paths (Rule 1/2/3, batch structure, batch reform) updated
- _build_control_from_json: parsing + validation (string→list, size validation)
- _store_control: writes 3 new JSONB columns
- API: response models, create/update requests, SELECT queries extended
- Migration 063: 3 new JSONB columns with GIN indexes
- 110 generator tests + 28 route tests = 138 total, all passing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-18 16:28:05 +01:00
parent 3bb9fffab6
commit f2819b99af
9 changed files with 685 additions and 139 deletions

View File

@@ -28,6 +28,7 @@ from compliance.services.control_generator import (
ALL_COLLECTIONS,
VALID_CATEGORIES,
VALID_DOMAINS,
_classify_regulation,
_detect_category,
_detect_domain,
_llm_local,
@@ -978,3 +979,122 @@ async def get_domain_backfill_status(backfill_id: str):
if not status:
raise HTTPException(status_code=404, detail="Domain backfill job not found")
return status
# ---------------------------------------------------------------------------
# Source-Type Backfill — Classify law vs guideline vs standard vs restricted
# ---------------------------------------------------------------------------
# Request body for POST /generate/backfill-source-type.
class SourceTypeBackfillRequest(BaseModel):
    # When True (the default) the backfill only classifies and counts rows;
    # no UPDATE is executed and nothing is committed.
    dry_run: bool = True
# In-memory job registry: backfill_id -> latest status dict for that job.
# Written by the start endpoint and the background backfill coroutine, read by
# the status endpoint. Being a plain module-level dict, it is process-local.
_source_type_backfill_status: dict = {}
async def _run_source_type_backfill(dry_run: bool, backfill_id: str):
    """Backfill ``source_type`` into the ``source_citation`` JSONB of controls.

    Selects every control whose ``source_citation`` has no (or an empty)
    ``source_type``, classifies it via ``_classify_regulation`` using the
    ``source_regulation`` value from ``generation_metadata``, and stores the
    result back into ``source_citation``.

    Progress and the final outcome are published in
    ``_source_type_backfill_status[backfill_id]``.

    Args:
        dry_run: When true, rows are classified and counted but no UPDATE is
            executed and nothing is committed; the reported ``distribution``
            is then empty.
        backfill_id: Key under which this job's status is published.

    The coroutine does not raise: any exception is logged and recorded as a
    ``"failed"`` status. Batches committed before the failure stay committed.
    """
    batch_size = 100
    db = SessionLocal()
    try:
        # Find controls with source_citation that lack source_type
        rows = db.execute(text("""
            SELECT control_id, source_citation, generation_metadata
            FROM compliance.canonical_controls
            WHERE source_citation IS NOT NULL
              AND (source_citation->>'source_type' IS NULL
                   OR source_citation->>'source_type' = '')
        """)).fetchall()

        total = len(rows)
        updated = 0
        errors = []

        job_status = {
            "status": "running", "total": total, "updated": 0, "dry_run": dry_run,
        }
        _source_type_backfill_status[backfill_id] = job_status

        for row in rows:
            cid = row[0]

            # The driver may hand JSONB back as a dict or as a JSON string.
            # A row that cannot be decoded is reported, not fatal for the job.
            try:
                citation = row[1] if isinstance(row[1], dict) else json.loads(row[1] or "{}")
                metadata = row[2] if isinstance(row[2], dict) else json.loads(row[2] or "{}")
            except (TypeError, ValueError) as exc:
                errors.append(f"{cid}: invalid JSON ({exc})")
                continue
            if not isinstance(citation, dict) or not isinstance(metadata, dict):
                errors.append(f"{cid}: source_citation/generation_metadata is not a JSON object")
                continue

            # Get regulation_code from metadata
            reg_code = metadata.get("source_regulation", "")
            if not reg_code:
                errors.append(f"{cid}: no source_regulation in metadata")
                continue

            # Classify; fall back to "restricted" when the classifier result
            # carries no source_type.
            license_info = _classify_regulation(reg_code)
            citation["source_type"] = license_info.get("source_type", "restricted")

            if not dry_run:
                db.execute(text("""
                    UPDATE compliance.canonical_controls
                    SET source_citation = :citation
                    WHERE control_id = :cid
                """), {"citation": json.dumps(citation), "cid": cid})
            updated += 1

            if updated % batch_size == 0:
                if not dry_run:
                    db.commit()
                # Publish progress and yield to the event loop; without an
                # await the loop is blocked for the whole run and the
                # "running" status could never be observed by a poller.
                job_status["updated"] = updated
                await asyncio.sleep(0)

        if not dry_run:
            # Commit the final, possibly partial, batch.
            db.commit()

        # Count distribution (only meaningful once data has been written)
        dist_query = db.execute(text("""
            SELECT source_citation->>'source_type' as st, COUNT(*)
            FROM compliance.canonical_controls
            WHERE source_citation IS NOT NULL
              AND source_citation->>'source_type' IS NOT NULL
            GROUP BY st
        """)).fetchall() if not dry_run else []
        distribution = {r[0]: r[1] for r in dist_query}

        _source_type_backfill_status[backfill_id] = {
            "status": "completed", "total": total, "updated": updated,
            "dry_run": dry_run, "distribution": distribution,
            "errors": errors[:50],  # cap the error list kept in the status
        }
        logger.info("Source-type backfill %s completed: %d/%d updated (dry_run=%s)",
                    backfill_id, updated, total, dry_run)
    except Exception as e:
        # Top-level boundary of a background task: log with traceback and
        # surface the failure through the status registry.
        logger.exception("Source-type backfill %s failed: %s", backfill_id, e)
        _source_type_backfill_status[backfill_id] = {"status": "failed", "error": str(e)}
    finally:
        db.close()
# Strong references to in-flight source-type backfill tasks. The event loop
# only keeps weak references to tasks, so a task nobody references may be
# garbage-collected before it finishes.
_source_type_backfill_tasks: set = set()


@router.post("/generate/backfill-source-type")
async def start_source_type_backfill(req: SourceTypeBackfillRequest):
    """Backfill source_type (law/guideline/standard/restricted) into source_citation JSONB.

    Classifies each control's source as binding law, authority guideline,
    voluntary standard, or restricted norm based on regulation_code.
    Default is dry_run=True (preview only).

    The work runs as a background task; the response returns immediately with
    a ``backfill_id`` to poll via the status endpoint.
    """
    import uuid

    backfill_id = str(uuid.uuid4())[:8]
    _source_type_backfill_status[backfill_id] = {"status": "starting"}

    task = asyncio.create_task(_run_source_type_backfill(req.dry_run, backfill_id))
    # Hold the task until it completes, then drop the reference again.
    _source_type_backfill_tasks.add(task)
    task.add_done_callback(_source_type_backfill_tasks.discard)

    return {
        "status": "running",
        "backfill_id": backfill_id,
        "message": f"Source-type backfill started. Poll /generate/source-type-backfill-status/{backfill_id}",
    }
@router.get("/generate/source-type-backfill-status/{backfill_id}")
async def get_source_type_backfill_status(backfill_id: str):
    """Return the recorded status of a source-type backfill job.

    Responds with HTTP 404 when no status is recorded under ``backfill_id``.
    """
    job_state = _source_type_backfill_status.get(backfill_id)
    if job_state:
        return job_state
    raise HTTPException(status_code=404, detail="Source-type backfill job not found")