feat(iace): add hazard-matching-engine with component library, tag system, and pattern engine

Implements Phases 1-4 of the IACE Hazard-Matching-Engine: - 120 machine components (C001-C120) in 11 categories - 20 energy sources (EN01-EN20) - ~85 tag taxonomy across 5 domains - 44 hazard patterns with AND/NOT matching logic - Pattern engine with tag resolution and confidence scoring - 8 new API endpoints (component-library, energy-sources, tags, patterns, match/apply) - Completeness gate G09 for pattern matching - 320 tests passing (36 new) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 08:50:11 +01:00
parent c7651796c9
commit 3b2006ebce
21 changed files with 4993 additions and 41 deletions
@@ -27,6 +27,7 @@ from compliance.services.control_generator import (
    GeneratorConfig,
    ALL_COLLECTIONS,
 )
+from compliance.services.citation_backfill import CitationBackfill, BackfillResult
 from compliance.services.rag_client import get_rag_client

 logger = logging.getLogger(__name__)
@@ -496,3 +497,288 @@ async def get_controls_customer_view(
        return {"controls": controls, "total": len(controls)}
    finally:
        db.close()
+
+
+# =============================================================================
+# CITATION BACKFILL
+# =============================================================================
+
+class BackfillRequest(BaseModel):
+    dry_run: bool = True  # Default to dry_run for safety
+    limit: int = 0  # 0 = all controls
+
+
+class BackfillResponse(BaseModel):
+    status: str
+    total_controls: int = 0
+    matched_hash: int = 0
+    matched_regex: int = 0
+    matched_llm: int = 0
+    unmatched: int = 0
+    updated: int = 0
+    errors: list = []
+
+
+_backfill_status: dict = {}
+
+
+async def _run_backfill_background(dry_run: bool, limit: int, backfill_id: str):
+    """Run backfill in background with own DB session."""
+    db = SessionLocal()
+    try:
+        backfill = CitationBackfill(db=db, rag_client=get_rag_client())
+        result = await backfill.run(dry_run=dry_run, limit=limit)
+        _backfill_status[backfill_id] = {
+            "status": "completed",
+            "total_controls": result.total_controls,
+            "matched_hash": result.matched_hash,
+            "matched_regex": result.matched_regex,
+            "matched_llm": result.matched_llm,
+            "unmatched": result.unmatched,
+            "updated": result.updated,
+            "errors": result.errors[:50],
+        }
+        logger.info("Backfill %s completed: %d updated", backfill_id, result.updated)
+    except Exception as e:
+        logger.error("Backfill %s failed: %s", backfill_id, e)
+        _backfill_status[backfill_id] = {"status": "failed", "errors": [str(e)]}
+    finally:
+        db.close()
+
+
+@router.post("/generate/backfill-citations", response_model=BackfillResponse)
+async def start_backfill(req: BackfillRequest):
+    """Backfill article/paragraph into existing control source_citations.
+
+    Uses 3-tier matching: hash lookup → regex parse → Ollama LLM.
+    Default is dry_run=True (preview only, no DB changes).
+    """
+    import uuid
+    backfill_id = str(uuid.uuid4())[:8]
+    _backfill_status[backfill_id] = {"status": "running"}
+
+    # Always run in background (RAG index build takes minutes)
+    asyncio.create_task(_run_backfill_background(req.dry_run, req.limit, backfill_id))
+    return BackfillResponse(
+        status=f"running (id={backfill_id})",
+    )
+
+
+@router.get("/generate/backfill-status/{backfill_id}")
+async def get_backfill_status(backfill_id: str):
+    """Get status of a backfill job."""
+    status = _backfill_status.get(backfill_id)
+    if not status:
+        raise HTTPException(status_code=404, detail="Backfill job not found")
+    return status
+
+
+# =============================================================================
+# DOMAIN + TARGET AUDIENCE BACKFILL
+# =============================================================================
+
+class DomainBackfillRequest(BaseModel):
+    dry_run: bool = True
+    job_id: Optional[str] = None  # Only backfill controls from this job
+    limit: int = 0  # 0 = all
+
+_domain_backfill_status: dict = {}
+
+
+async def _run_domain_backfill(req: DomainBackfillRequest, backfill_id: str):
+    """Backfill domain, category, and target_audience for existing controls using Anthropic."""
+    import os
+    import httpx
+
+    ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
+    ANTHROPIC_MODEL = os.getenv("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
+
+    if not ANTHROPIC_API_KEY:
+        _domain_backfill_status[backfill_id] = {
+            "status": "failed", "error": "ANTHROPIC_API_KEY not set"
+        }
+        return
+
+    db = SessionLocal()
+    try:
+        # Find controls needing backfill
+        where_clauses = ["(target_audience IS NULL OR target_audience = '[]' OR target_audience = 'null')"]
+        params: dict = {}
+        if req.job_id:
+            where_clauses.append("generation_metadata->>'job_id' = :job_id")
+            params["job_id"] = req.job_id
+
+        query = f"""
+            SELECT id, control_id, title, objective, category, source_original_text, tags
+            FROM canonical_controls
+            WHERE {' AND '.join(where_clauses)}
+            ORDER BY control_id
+        """
+        if req.limit > 0:
+            query += f" LIMIT {req.limit}"
+
+        result = db.execute(text(query), params)
+        controls = [dict(zip(result.keys(), row)) for row in result]
+
+        total = len(controls)
+        updated = 0
+        errors = []
+
+        _domain_backfill_status[backfill_id] = {
+            "status": "running", "total": total, "updated": 0, "errors": []
+        }
+
+        # Process in batches of 10
+        BATCH_SIZE = 10
+        for batch_start in range(0, total, BATCH_SIZE):
+            batch = controls[batch_start:batch_start + BATCH_SIZE]
+
+            entries = []
+            for idx, ctrl in enumerate(batch):
+                text_for_analysis = ctrl.get("objective") or ctrl.get("title") or ""
+                original = ctrl.get("source_original_text") or ""
+                if original:
+                    text_for_analysis += f"\n\nQuelltext-Auszug: {original[:500]}"
+                entries.append(
+                    f"--- CONTROL {idx + 1}: {ctrl['control_id']} ---\n"
+                    f"Titel: {ctrl.get('title', '')}\n"
+                    f"Objective: {text_for_analysis[:800]}\n"
+                    f"Tags: {json.dumps(ctrl.get('tags', []))}"
+                )
+
+            prompt = f"""Analysiere die folgenden {len(batch)} Controls und bestimme fuer jedes:
+1. domain: Das Fachgebiet (AUTH, CRYP, NET, DATA, LOG, ACC, SEC, INC, AI, COMP, GOV, LAB, FIN, TRD, ENV, HLT)
+2. category: Die Kategorie (encryption, authentication, network, data_protection, logging, incident, continuity, compliance, supply_chain, physical, personnel, application, system, risk, governance, hardware, identity, public_administration, labor_law, finance, trade_regulation, environmental, health)
+3. target_audience: Liste der Zielgruppen (moegliche Werte: "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "einkauf", "produktion", "vertrieb", "gesundheitswesen", "finanzwesen", "oeffentlicher_dienst")
+
+Antworte mit einem JSON-Array mit {len(batch)} Objekten. Jedes Objekt hat:
+- control_index: 1-basierter Index
+- domain: Fachgebiet-Kuerzel
+- category: Kategorie
+- target_audience: Liste der Zielgruppen
+
+{"".join(entries)}"""
+
+            try:
+                headers = {
+                    "x-api-key": ANTHROPIC_API_KEY,
+                    "anthropic-version": "2023-06-01",
+                    "content-type": "application/json",
+                }
+                payload = {
+                    "model": ANTHROPIC_MODEL,
+                    "max_tokens": 4096,
+                    "system": "Du bist ein Compliance-Experte. Klassifiziere Controls nach Fachgebiet und Zielgruppe. Antworte NUR mit validem JSON.",
+                    "messages": [{"role": "user", "content": prompt}],
+                }
+
+                async with httpx.AsyncClient(timeout=60.0) as client:
+                    resp = await client.post(
+                        "https://api.anthropic.com/v1/messages",
+                        headers=headers,
+                        json=payload,
+                    )
+                    if resp.status_code != 200:
+                        errors.append(f"Anthropic API {resp.status_code} at batch {batch_start}")
+                        continue
+
+                    raw = resp.json().get("content", [{}])[0].get("text", "")
+
+                # Parse response
+                import re
+                bracket_match = re.search(r"\[.*\]", raw, re.DOTALL)
+                if not bracket_match:
+                    errors.append(f"No JSON array in response at batch {batch_start}")
+                    continue
+
+                results_list = json.loads(bracket_match.group(0))
+
+                for item in results_list:
+                    idx = item.get("control_index", 0) - 1
+                    if idx < 0 or idx >= len(batch):
+                        continue
+                    ctrl = batch[idx]
+                    ctrl_id = str(ctrl["id"])
+
+                    new_domain = item.get("domain", "")
+                    new_category = item.get("category", "")
+                    new_audience = item.get("target_audience", [])
+
+                    if not isinstance(new_audience, list):
+                        new_audience = []
+
+                    # Build new control_id from domain if domain changed
+                    old_prefix = ctrl["control_id"].split("-")[0] if ctrl["control_id"] else ""
+                    new_prefix = new_domain.upper()[:4] if new_domain else old_prefix
+
+                    if not req.dry_run:
+                        update_parts = []
+                        update_params: dict = {"ctrl_id": ctrl_id}
+
+                        if new_category:
+                            update_parts.append("category = :category")
+                            update_params["category"] = new_category
+
+                        if new_audience:
+                            update_parts.append("target_audience = :target_audience")
+                            update_params["target_audience"] = json.dumps(new_audience)
+
+                        # Note: We do NOT rename control_ids here — that would
+                        # break references and cause unique constraint violations.
+
+                        if update_parts:
+                            update_parts.append("updated_at = NOW()")
+                            db.execute(
+                                text(f"UPDATE canonical_controls SET {', '.join(update_parts)} WHERE id = CAST(:ctrl_id AS uuid)"),
+                                update_params,
+                            )
+                            updated += 1
+
+                if not req.dry_run:
+                    db.commit()
+
+            except Exception as e:
+                errors.append(f"Batch {batch_start}: {str(e)}")
+                db.rollback()
+
+            _domain_backfill_status[backfill_id] = {
+                "status": "running", "total": total, "updated": updated,
+                "progress": f"{min(batch_start + BATCH_SIZE, total)}/{total}",
+                "errors": errors[-10:],
+            }
+
+        _domain_backfill_status[backfill_id] = {
+            "status": "completed", "total": total, "updated": updated,
+            "errors": errors[-50:],
+        }
+        logger.info("Domain backfill %s completed: %d/%d updated", backfill_id, updated, total)
+
+    except Exception as e:
+        logger.error("Domain backfill %s failed: %s", backfill_id, e)
+        _domain_backfill_status[backfill_id] = {"status": "failed", "error": str(e)}
+    finally:
+        db.close()
+
+
+@router.post("/generate/backfill-domain")
+async def start_domain_backfill(req: DomainBackfillRequest):
+    """Backfill domain, category, and target_audience for controls using Anthropic API.
+
+    Finds controls where target_audience is NULL and enriches them.
+    Default is dry_run=True (preview only).
+    """
+    import uuid
+    backfill_id = str(uuid.uuid4())[:8]
+    _domain_backfill_status[backfill_id] = {"status": "starting"}
+    asyncio.create_task(_run_domain_backfill(req, backfill_id))
+    return {"status": "running", "backfill_id": backfill_id,
+            "message": f"Domain backfill started. Poll /generate/backfill-status/{backfill_id}"}
+
+
+@router.get("/generate/domain-backfill-status/{backfill_id}")
+async def get_domain_backfill_status(backfill_id: str):
+    """Get status of a domain backfill job."""
+    status = _domain_backfill_status.get(backfill_id)
+    if not status:
+        raise HTTPException(status_code=404, detail="Domain backfill job not found")
+    return status