feat: add policy library with 29 German policy templates
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 34s
CI/CD / test-python-backend-compliance (push) Successful in 35s
CI/CD / test-python-document-crawler (push) Successful in 26s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Successful in 2s
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 34s
CI/CD / test-python-backend-compliance (push) Successful in 35s
CI/CD / test-python-document-crawler (push) Successful in 26s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Successful in 2s
Add 29 new document types (IT security, data, personnel, vendor, BCM policies) to VALID_DOCUMENT_TYPES and 5 category pills to the document generator UI. Include a seed script for populating the production DB.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -32,19 +32,26 @@ import {
|
||||
|
||||
/**
 * Category pills offered by the document generator UI.
 *
 * Each entry carries a stable `key`, the German `label` shown to the user and
 * the document `types` grouped under that pill. The "Alle" entry uses `null`
 * instead of a list (presumably meaning "no type filter" — confirm against the
 * filtering code, which is not part of this block).
 */
const CATEGORIES: Array<{ key: string; label: string; types: string[] | null }> = [
  { key: 'all', label: 'Alle', types: null },

  // Legal documents / contract templates
  { key: 'privacy_policy', label: 'Datenschutz', types: ['privacy_policy'] },
  { key: 'terms', label: 'AGB', types: ['terms_of_service', 'agb', 'clause'] },
  { key: 'impressum', label: 'Impressum', types: ['impressum'] },
  { key: 'dpa', label: 'AVV/DPA', types: ['dpa'] },
  { key: 'nda', label: 'NDA', types: ['nda'] },
  { key: 'sla', label: 'SLA', types: ['sla'] },
  { key: 'acceptable_use', label: 'AUP', types: ['acceptable_use'] },
  { key: 'widerruf', label: 'Widerruf', types: ['widerruf'] },
  { key: 'cookie', label: 'Cookie', types: ['cookie_policy', 'cookie_banner'] },
  { key: 'cloud', label: 'Cloud', types: ['cloud_service_agreement'] },
  {
    key: 'misc',
    label: 'Weitere',
    types: ['community_guidelines', 'copyright_policy', 'data_usage_clause'],
  },
  { key: 'dsfa', label: 'DSFA', types: ['dsfa'] },

  // Security concepts (migration 051)
  {
    key: 'security',
    label: 'Sicherheitskonzepte',
    types: [
      'it_security_concept',
      'data_protection_concept',
      'backup_recovery_concept',
      'logging_concept',
      'incident_response_plan',
      'access_control_concept',
      'risk_management_concept',
    ],
  },

  // Policy library (migration 054)
  {
    key: 'it_security_policies',
    label: 'IT-Sicherheit Policies',
    types: [
      'information_security_policy',
      'access_control_policy',
      'password_policy',
      'encryption_policy',
      'logging_policy',
      'backup_policy',
      'incident_response_policy',
      'change_management_policy',
      'patch_management_policy',
      'asset_management_policy',
      'cloud_security_policy',
      'devsecops_policy',
      'secrets_management_policy',
      'vulnerability_management_policy',
    ],
  },
  {
    key: 'data_policies',
    label: 'Daten-Policies',
    types: [
      'data_protection_policy',
      'data_classification_policy',
      'data_retention_policy',
      'data_transfer_policy',
      'privacy_incident_policy',
    ],
  },
  {
    key: 'hr_policies',
    label: 'Personal-Policies',
    // 'acceptable_use' is also reachable through its own "AUP" pill above
    types: [
      'employee_security_policy',
      'security_awareness_policy',
      'acceptable_use',
      'remote_work_policy',
      'offboarding_policy',
    ],
  },
  {
    key: 'vendor_policies',
    label: 'Lieferanten-Policies',
    types: [
      'vendor_risk_management_policy',
      'third_party_security_policy',
      'supplier_security_policy',
    ],
  },
  {
    key: 'bcm_policies',
    label: 'BCM/Notfall',
    types: [
      'business_continuity_policy',
      'disaster_recovery_policy',
      'crisis_management_policy',
    ],
  },
]
|
||||
|
||||
// =============================================================================
|
||||
|
||||
@@ -58,6 +58,40 @@ VALID_DOCUMENT_TYPES = {
|
||||
"incident_response_plan",
|
||||
"access_control_concept",
|
||||
"risk_management_concept",
|
||||
# Policy templates — IT Security (Migration 054)
|
||||
"information_security_policy",
|
||||
"access_control_policy",
|
||||
"password_policy",
|
||||
"encryption_policy",
|
||||
"logging_policy",
|
||||
"backup_policy",
|
||||
"incident_response_policy",
|
||||
"change_management_policy",
|
||||
"patch_management_policy",
|
||||
"asset_management_policy",
|
||||
"cloud_security_policy",
|
||||
"devsecops_policy",
|
||||
"secrets_management_policy",
|
||||
"vulnerability_management_policy",
|
||||
# Policy templates — Data (Migration 054)
|
||||
"data_protection_policy",
|
||||
"data_classification_policy",
|
||||
"data_retention_policy",
|
||||
"data_transfer_policy",
|
||||
"privacy_incident_policy",
|
||||
# Policy templates — Personnel (Migration 054)
|
||||
"employee_security_policy",
|
||||
"security_awareness_policy",
|
||||
"remote_work_policy",
|
||||
"offboarding_policy",
|
||||
# Policy templates — Vendor/Supply Chain (Migration 054)
|
||||
"vendor_risk_management_policy",
|
||||
"third_party_security_policy",
|
||||
"supplier_security_policy",
|
||||
# Policy templates — BCM (Migration 054)
|
||||
"business_continuity_policy",
|
||||
"disaster_recovery_policy",
|
||||
"crisis_management_policy",
|
||||
}
|
||||
VALID_STATUSES = {"published", "draft", "archived"}
|
||||
|
||||
|
||||
@@ -47,7 +47,7 @@ ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
|
||||
ANTHROPIC_MODEL = os.getenv("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
|
||||
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
|
||||
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
|
||||
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "120"))
|
||||
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))
|
||||
|
||||
HARMONIZATION_THRESHOLD = 0.85 # Cosine similarity above this = duplicate
|
||||
|
||||
@@ -466,7 +466,7 @@ async def _llm_anthropic(prompt: str, system_prompt: Optional[str] = None) -> st
|
||||
}
|
||||
payload = {
|
||||
"model": ANTHROPIC_MODEL,
|
||||
"max_tokens": 4096,
|
||||
"max_tokens": 8192,
|
||||
"messages": [{"role": "user", "content": prompt}],
|
||||
}
|
||||
if system_prompt:
|
||||
@@ -488,7 +488,7 @@ async def _llm_anthropic(prompt: str, system_prompt: Optional[str] = None) -> st
|
||||
return content[0].get("text", "")
|
||||
return ""
|
||||
except Exception as e:
|
||||
logger.error("Anthropic request failed: %s", e)
|
||||
logger.error("Anthropic request failed: %s (type: %s)", e, type(e).__name__)
|
||||
return ""
|
||||
|
||||
|
||||
@@ -598,6 +598,57 @@ def _parse_llm_json(raw: str) -> dict:
|
||||
return {}
|
||||
|
||||
|
||||
def _parse_llm_json_array(raw: str) -> list[dict]:
|
||||
"""Extract a JSON array from LLM response — returns list of dicts."""
|
||||
match = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", raw, re.DOTALL)
|
||||
text = match.group(1) if match else raw
|
||||
|
||||
# Try parsing as array directly
|
||||
try:
|
||||
parsed = json.loads(text)
|
||||
if isinstance(parsed, list):
|
||||
return parsed
|
||||
if isinstance(parsed, dict):
|
||||
# Check if it wraps an array (e.g. {"controls": [...]})
|
||||
for key in ("controls", "results", "items", "data"):
|
||||
if key in parsed and isinstance(parsed[key], list):
|
||||
return parsed[key]
|
||||
return [parsed]
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Try finding [ ... ] block
|
||||
bracket_match = re.search(r"\[.*\]", text, re.DOTALL)
|
||||
if bracket_match:
|
||||
try:
|
||||
parsed = json.loads(bracket_match.group(0))
|
||||
if isinstance(parsed, list):
|
||||
return parsed
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Try finding multiple { ... } blocks (LLM sometimes returns separate objects)
|
||||
objects = []
|
||||
for obj_match in re.finditer(r"\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}", text, re.DOTALL):
|
||||
try:
|
||||
obj = json.loads(obj_match.group(0))
|
||||
if isinstance(obj, dict) and obj.get("title"):
|
||||
objects.append(obj)
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
if objects:
|
||||
logger.info("Parsed %d individual JSON objects from batch response", len(objects))
|
||||
return objects
|
||||
|
||||
# Fallback: try single object
|
||||
single = _parse_llm_json(raw)
|
||||
if single:
|
||||
logger.info("Batch parse fallback: extracted single object")
|
||||
else:
|
||||
logger.warning("Batch parse failed — logging first 500 chars: %s", raw[:500])
|
||||
return [single] if single else []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Pipeline
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -606,11 +657,11 @@ REFORM_SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Deine Aufgabe
|
||||
Security Controls zu formulieren. Du formulierst IMMER in eigenen Worten.
|
||||
KOPIERE KEINE Sätze aus dem Quelltext. Verwende eigene Begriffe und Struktur.
|
||||
NENNE NICHT die Quelle. Keine proprietären Bezeichner.
|
||||
Antworte NUR mit validem JSON."""
|
||||
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""
|
||||
|
||||
STRUCTURE_SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
|
||||
als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
|
||||
Antworte NUR mit validem JSON."""
|
||||
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""
|
||||
|
||||
|
||||
class ControlGeneratorPipeline:
|
||||
@@ -881,6 +932,241 @@ Gib JSON zurück mit diesen Feldern:
|
||||
}
|
||||
return control
|
||||
|
||||
# ── Stage 3 BATCH: Multiple chunks in one API call ─────────────────
|
||||
|
||||
async def _structure_batch(
    self,
    chunks: list[RAGSearchResult],
    license_infos: list[dict],
) -> list[Optional[GeneratedControl]]:
    """Structure several chunks as controls with a single ``_llm_chat`` call.

    All chunks (text truncated to 2000 characters each) are placed into one
    prompt together with their source and license. The response is parsed with
    ``_parse_llm_json_array`` and every returned object is mapped back to its
    chunk through the 1-based ``chunk_index`` field, falling back to the
    object's position in the response.

    A result is discarded (with a warning) when it cannot be mapped to a free
    chunk slot, so that a control is never combined with the source text,
    citation or license of a different chunk.

    Args:
        chunks: Chunks to structure.
        license_infos: License info per chunk, aligned with ``chunks``. Each
            entry must contain ``rule``; ``name``, ``license`` and
            ``attribution`` are read when present.

    Returns:
        A list aligned with ``chunks``; ``None`` where no control was produced.
    """
    if not chunks:
        return []

    chunk_entries = []
    for idx, (chunk, lic) in enumerate(zip(chunks, license_infos)):
        source_name = lic.get("name", chunk.regulation_name)
        chunk_entries.append(
            f"--- CHUNK {idx + 1} ---\n"
            f"Text: {chunk.text[:2000]}\n"
            f"Quelle: {chunk.regulation_name} ({chunk.regulation_code}), {chunk.article}\n"
            f"Lizenz: {source_name} ({lic.get('license', '')})"
        )
    joined = "\n\n".join(chunk_entries)
    prompt = f"""Strukturiere die folgenden {len(chunks)} Gesetzestexte jeweils als eigenstaendiges Security/Compliance Control.
Du DARFST den Originaltext verwenden (Quellen sind jeweils angegeben).

WICHTIG:
- Erstelle fuer JEDEN Chunk ein separates Control mit verstaendlicher, praxisorientierter Formulierung.
- Jedes Control muss eigenstaendig und vollstaendig sein — nicht auf andere Controls verweisen.
- Qualitaet ist wichtiger als Geschwindigkeit. Jedes Control muss die gleiche Qualitaet haben wie ein einzeln erstelltes.

Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat diese Felder:
- chunk_index: 1-basierter Index des Chunks (1, 2, 3, ...)
- title: Kurzer praegnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Saetze)
- rationale: Warum ist das wichtig? (1-2 Saetze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Pruefschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags

{joined}"""

    raw = await _llm_chat(prompt, STRUCTURE_SYSTEM_PROMPT)
    results = _parse_llm_json_array(raw)
    logger.info("Batch structure: parsed %d results from API response", len(results))

    # Map results back to chunks by chunk_index (or by position if no usable index)
    controls: list[Optional[GeneratedControl]] = [None] * len(chunks)
    for pos, data in enumerate(results):
        if not isinstance(data, dict):
            logger.warning("Batch: result %d is not a JSON object, skipping", pos)
            continue

        idx = pos
        raw_index = data.get("chunk_index")
        if raw_index is not None:
            try:
                candidate = int(raw_index) - 1  # Convert to 0-based
            except (TypeError, ValueError, OverflowError):
                candidate = -1
            if 0 <= candidate < len(chunks):
                idx = candidate
            else:
                logger.warning(
                    "Batch: chunk_index %r is not usable (expected 1-%d), using position %d",
                    raw_index, len(chunks), pos,
                )

        # Never overwrite an assigned slot and never squeeze surplus results
        # into the last chunk: that would attach the wrong source/license.
        if idx >= len(chunks) or controls[idx] is not None:
            logger.warning("Batch: result %d cannot be assigned to a free chunk, discarding", pos)
            continue

        chunk = chunks[idx]
        lic = license_infos[idx]
        domain = _detect_domain(chunk.text)
        control = self._build_control_from_json(data, domain)
        control.license_rule = lic["rule"]
        if lic["rule"] in (1, 2):
            control.source_original_text = chunk.text
            control.source_citation = {
                "source": f"{chunk.regulation_name} {chunk.article or ''}".strip(),
                "license": lic.get("license", ""),
                "license_notice": lic.get("attribution", ""),
                "url": chunk.source_url or "",
            }
        control.customer_visible = True
        control.verification_method = _detect_verification_method(chunk.text)
        control.category = _detect_category(chunk.text)
        control.generation_metadata = {
            "processing_path": "structured_batch",
            "license_rule": lic["rule"],
            "source_regulation": chunk.regulation_code,
            "source_article": chunk.article,
            "batch_size": len(chunks),
        }
        controls[idx] = control

    return controls
|
||||
|
||||
async def _reformulate_batch(
    self,
    chunks: list[RAGSearchResult],
    config: GeneratorConfig,
) -> list[Optional[GeneratedControl]]:
    """Reformulate several restricted chunks with a single ``_llm_chat`` call.

    Every chunk (text truncated to 1500 characters) is passed to the model for
    analysis only; the prompt forbids copying or naming the source. The
    response is parsed with ``_parse_llm_json_array`` and each object is mapped
    back to its chunk through the 1-based ``chunk_index`` field, falling back
    to the object's position in the response.

    A result is discarded (with a warning) when it cannot be mapped to a free
    chunk slot, so that the derived fields (domain, category, verification
    method) always come from the chunk the control was written for.

    Args:
        chunks: Chunks to reformulate.
        config: Generator configuration; ``config.domain`` overrides the
            detected domain when set.

    Returns:
        A list aligned with ``chunks``; ``None`` where no control was produced.
        Produced controls carry license rule 3, no source text or citation, and
        are not customer visible.
    """
    if not chunks:
        return []

    chunk_entries = []
    for idx, chunk in enumerate(chunks):
        domain = config.domain or _detect_domain(chunk.text)
        chunk_entries.append(
            f"--- ASPEKT {idx + 1} ---\n"
            f"Domain: {domain}\n"
            f"Text (nur zur Analyse, NICHT kopieren, NICHT referenzieren):\n{chunk.text[:1500]}"
        )
    joined = "\n\n".join(chunk_entries)
    prompt = f"""Analysiere die folgenden {len(chunks)} Pruefaspekte und formuliere fuer JEDEN ein EIGENSTAENDIGES Security Control.
KOPIERE KEINE Saetze. Verwende eigene Begriffe und Struktur.
NENNE NICHT die Quellen. Keine proprietaeren Bezeichner (kein O.Auth_*, TR-03161, BSI-TR etc.).

WICHTIG:
- Jedes Control muss eigenstaendig und vollstaendig sein — nicht auf andere Controls verweisen.
- Qualitaet ist wichtiger als Geschwindigkeit. Jedes Control muss die gleiche Qualitaet haben wie ein einzeln erstelltes.

Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat diese Felder:
- chunk_index: 1-basierter Index des Aspekts (1, 2, 3, ...)
- title: Kurzer eigenstaendiger Titel (max 100 Zeichen)
- objective: Eigenstaendige Formulierung des Ziels (1-3 Saetze)
- rationale: Eigenstaendige Begruendung (1-2 Saetze)
- requirements: Liste von konkreten Anforderungen (Strings, eigene Worte)
- test_procedure: Liste von Pruefschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags (eigene Begriffe)

{joined}"""

    raw = await _llm_chat(prompt, REFORM_SYSTEM_PROMPT)
    results = _parse_llm_json_array(raw)
    logger.info("Batch reform: parsed %d results from API response", len(results))

    # Map results back to chunks by chunk_index (or by position if no usable index)
    controls: list[Optional[GeneratedControl]] = [None] * len(chunks)
    for pos, data in enumerate(results):
        if not isinstance(data, dict):
            logger.warning("Batch reform: result %d is not a JSON object, skipping", pos)
            continue

        idx = pos
        raw_index = data.get("chunk_index")
        if raw_index is not None:
            try:
                candidate = int(raw_index) - 1  # Convert to 0-based
            except (TypeError, ValueError, OverflowError):
                candidate = -1
            if 0 <= candidate < len(chunks):
                idx = candidate
            else:
                logger.warning(
                    "Batch reform: chunk_index %r is not usable (expected 1-%d), using position %d",
                    raw_index, len(chunks), pos,
                )

        # Never overwrite an assigned slot and never squeeze surplus results
        # into the last chunk: the caller compares each control against the
        # text of the chunk at the same position.
        if idx >= len(chunks) or controls[idx] is not None:
            logger.warning("Batch reform: result %d cannot be assigned to a free chunk, discarding", pos)
            continue

        chunk = chunks[idx]
        domain = config.domain or _detect_domain(chunk.text)
        control = self._build_control_from_json(data, domain)
        control.license_rule = 3
        control.source_original_text = None
        control.source_citation = None
        control.customer_visible = False
        control.verification_method = _detect_verification_method(chunk.text)
        control.category = _detect_category(chunk.text)
        control.generation_metadata = {
            "processing_path": "llm_reform_batch",
            "license_rule": 3,
            "batch_size": len(chunks),
        }
        controls[idx] = control

    return controls
|
||||
|
||||
async def _process_batch(
    self,
    batch_items: list[tuple[RAGSearchResult, dict]],
    config: GeneratorConfig,
    job_id: str,
) -> list[Optional[GeneratedControl]]:
    """Process a batch of (chunk, license_info) pairs through stages 3-5.

    Items with license rule 1 or 2 go to ``_structure_batch``, items with rule
    3 go to ``_reformulate_batch`` and are then checked with
    ``check_similarity`` against their source text. Every surviving control is
    run through harmonization and anchor search and receives a release state;
    controls that reach the end also receive a control ID and the job ID.

    Args:
        batch_items: The (chunk, license_info) pairs; ``license_info["rule"]``
            selects the processing path.
        config: Generator configuration (``domain``, ``skip_web_search``).
        job_id: Written to ``generation_metadata["job_id"]`` of finished controls.

    Returns:
        A list aligned with ``batch_items``; ``None`` where no usable control
        was produced (including items with an unknown license rule).
    """
    # Split by license rule: Rule 1+2 → structure, Rule 3 → reform.
    # The original position travels with each item so results can be put back
    # without searching batch_items for the chunk object again.
    structure_items = [
        (i, chunk, lic) for i, (chunk, lic) in enumerate(batch_items) if lic["rule"] in (1, 2)
    ]
    reform_items = [
        (i, chunk) for i, (chunk, lic) in enumerate(batch_items) if lic["rule"] == 3
    ]

    unrouted = len(batch_items) - len(structure_items) - len(reform_items)
    if unrouted:
        logger.warning("Batch: %d chunks have an unknown license rule and yield no control", unrouted)

    all_controls: dict[int, Optional[GeneratedControl]] = {}

    if structure_items:
        s_chunks = [chunk for _, chunk, _ in structure_items]
        s_lics = [lic for _, _, lic in structure_items]
        s_controls = await self._structure_batch(s_chunks, s_lics)
        for (orig_idx, _, _), ctrl in zip(structure_items, s_controls):
            all_controls[orig_idx] = ctrl

    if reform_items:
        r_chunks = [chunk for _, chunk in reform_items]
        r_controls = await self._reformulate_batch(r_chunks, config)
        for (orig_idx, chunk), ctrl in zip(reform_items, r_controls):
            if ctrl:
                # Too-Close-Check for Rule 3
                similarity = await check_similarity(chunk.text, f"{ctrl.objective} {ctrl.rationale}")
                if similarity.status == "FAIL":
                    ctrl.release_state = "too_close"
                    ctrl.generation_metadata["similarity_status"] = "FAIL"
                    ctrl.generation_metadata["similarity_scores"] = {
                        "token_overlap": similarity.token_overlap,
                        "ngram_jaccard": similarity.ngram_jaccard,
                        "lcs_ratio": similarity.lcs_ratio,
                    }
            all_controls[orig_idx] = ctrl

    # Post-process all controls: harmonization + anchor search
    final: list[Optional[GeneratedControl]] = []
    for i in range(len(batch_items)):
        control = all_controls.get(i)
        if not control or (not control.title and not control.objective):
            final.append(None)
            continue

        if control.release_state == "too_close":
            final.append(control)
            continue

        # Harmonization
        duplicates = await self._check_harmonization(control)
        if duplicates:
            control.release_state = "duplicate"
            control.generation_metadata["similar_controls"] = duplicates
            final.append(control)
            continue

        # Anchor search (best effort: a failure only costs the anchors)
        try:
            from .anchor_finder import AnchorFinder
            finder = AnchorFinder(self.rag)
            anchors = await finder.find_anchors(control, skip_web=config.skip_web_search)
            control.open_anchors = [asdict(a) if hasattr(a, '__dataclass_fields__') else a for a in anchors]
        except Exception as e:
            logger.warning("Anchor search failed: %s", e)

        # Release state
        if control.license_rule in (1, 2):
            control.release_state = "draft"
        elif control.open_anchors:
            control.release_state = "draft"
        else:
            control.release_state = "needs_review"

        # Control ID
        domain = config.domain or _detect_domain(control.objective)
        control.control_id = self._generate_control_id(domain, self.db)
        control.generation_metadata["job_id"] = job_id

        final.append(control)

    return final
|
||||
|
||||
# ── Stage 4: Harmonization ─────────────────────────────────────────
|
||||
|
||||
async def _check_harmonization(self, new_control: GeneratedControl) -> Optional[list]:
|
||||
@@ -1168,6 +1454,7 @@ Gib JSON zurück mit diesen Feldern:
|
||||
self.db.commit()
|
||||
except Exception as e:
|
||||
logger.warning("Failed to mark chunk processed: %s", e)
|
||||
self.db.rollback()
|
||||
|
||||
# ── Main Pipeline ──────────────────────────────────────────────────
|
||||
|
||||
@@ -1192,9 +1479,71 @@ Gib JSON zurück mit diesen Feldern:
|
||||
self._update_job(job_id, result)
|
||||
return result
|
||||
|
||||
# Process chunks
|
||||
# Process chunks — batch mode (N chunks per Anthropic API call)
|
||||
BATCH_SIZE = config.batch_size or 5
|
||||
controls_count = 0
|
||||
chunks_skipped_prefilter = 0
|
||||
pending_batch: list[tuple[RAGSearchResult, dict]] = [] # (chunk, license_info)
|
||||
|
||||
async def _flush_batch():
|
||||
"""Send pending batch to Anthropic and process results."""
|
||||
nonlocal controls_count
|
||||
if not pending_batch:
|
||||
return
|
||||
batch = pending_batch.copy()
|
||||
pending_batch.clear()
|
||||
|
||||
logger.info("Processing batch of %d chunks via single API call...", len(batch))
|
||||
try:
|
||||
batch_controls = await self._process_batch(batch, config, job_id)
|
||||
except Exception as e:
|
||||
logger.error("Batch processing failed: %s — falling back to single-chunk mode", e)
|
||||
# Fallback: process each chunk individually
|
||||
batch_controls = []
|
||||
for chunk, _lic in batch:
|
||||
try:
|
||||
ctrl = await self._process_single_chunk(chunk, config, job_id)
|
||||
batch_controls.append(ctrl)
|
||||
except Exception as e2:
|
||||
logger.error("Single-chunk fallback also failed: %s", e2)
|
||||
batch_controls.append(None)
|
||||
|
||||
for (chunk, lic_info), control in zip(batch, batch_controls):
|
||||
if control is None:
|
||||
if not config.dry_run:
|
||||
self._mark_chunk_processed(chunk, lic_info, "no_control", [], job_id)
|
||||
continue
|
||||
|
||||
# Count by state
|
||||
if control.release_state == "too_close":
|
||||
result.controls_too_close += 1
|
||||
elif control.release_state == "duplicate":
|
||||
result.controls_duplicates_found += 1
|
||||
elif control.release_state == "needs_review":
|
||||
result.controls_needs_review += 1
|
||||
else:
|
||||
result.controls_verified += 1
|
||||
|
||||
# Store
|
||||
if not config.dry_run:
|
||||
ctrl_uuid = self._store_control(control, job_id)
|
||||
if ctrl_uuid:
|
||||
path = control.generation_metadata.get("processing_path", "structured_batch")
|
||||
self._mark_chunk_processed(chunk, lic_info, path, [ctrl_uuid], job_id)
|
||||
else:
|
||||
self._mark_chunk_processed(chunk, lic_info, "store_failed", [], job_id)
|
||||
|
||||
result.controls_generated += 1
|
||||
result.controls.append(asdict(control))
|
||||
controls_count += 1
|
||||
|
||||
if self._existing_controls is not None:
|
||||
self._existing_controls.append({
|
||||
"control_id": control.control_id,
|
||||
"title": control.title,
|
||||
"objective": control.objective,
|
||||
})
|
||||
|
||||
for i, chunk in enumerate(chunks):
|
||||
try:
|
||||
# Progress logging every 50 chunks
|
||||
@@ -1210,65 +1559,24 @@ Gib JSON zurück mit diesen Feldern:
|
||||
is_relevant, prefilter_reason = await _prefilter_chunk(chunk.text)
|
||||
if not is_relevant:
|
||||
chunks_skipped_prefilter += 1
|
||||
# Mark as processed so we don't re-check next time
|
||||
license_info = self._classify_license(chunk)
|
||||
self._mark_chunk_processed(
|
||||
chunk, license_info, "prefilter_skip", [], job_id
|
||||
)
|
||||
continue
|
||||
|
||||
control = await self._process_single_chunk(chunk, config, job_id)
|
||||
if control is None:
|
||||
# No control generated — still mark as processed
|
||||
if not config.dry_run:
|
||||
# Classify license and add to batch
|
||||
license_info = self._classify_license(chunk)
|
||||
self._mark_chunk_processed(
|
||||
chunk, license_info, "no_control", [], job_id
|
||||
)
|
||||
continue
|
||||
pending_batch.append((chunk, license_info))
|
||||
|
||||
# Count by state
|
||||
if control.release_state == "too_close":
|
||||
result.controls_too_close += 1
|
||||
elif control.release_state == "duplicate":
|
||||
result.controls_duplicates_found += 1
|
||||
elif control.release_state == "needs_review":
|
||||
result.controls_needs_review += 1
|
||||
else:
|
||||
result.controls_verified += 1
|
||||
|
||||
# Store (unless dry run)
|
||||
if not config.dry_run:
|
||||
ctrl_uuid = self._store_control(control, job_id)
|
||||
if ctrl_uuid:
|
||||
# Stage 7: Mark chunk processed
|
||||
license_info = self._classify_license(chunk)
|
||||
path = "llm_reform" if license_info["rule"] == 3 else "structured"
|
||||
self._mark_chunk_processed(chunk, license_info, path, [ctrl_uuid], job_id)
|
||||
else:
|
||||
# Store failed — still mark as processed
|
||||
license_info = self._classify_license(chunk)
|
||||
self._mark_chunk_processed(
|
||||
chunk, license_info, "store_failed", [], job_id
|
||||
)
|
||||
|
||||
result.controls_generated += 1
|
||||
result.controls.append(asdict(control))
|
||||
controls_count += 1
|
||||
|
||||
# Add to existing controls for harmonization of next chunks
|
||||
if self._existing_controls is not None:
|
||||
self._existing_controls.append({
|
||||
"control_id": control.control_id,
|
||||
"title": control.title,
|
||||
"objective": control.objective,
|
||||
})
|
||||
# Flush when batch is full
|
||||
if len(pending_batch) >= BATCH_SIZE:
|
||||
await _flush_batch()
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Error processing chunk {chunk.regulation_code}/{chunk.article}: {e}"
|
||||
logger.error(error_msg)
|
||||
result.errors.append(error_msg)
|
||||
# Mark failed chunks as processed too (so we don't retry endlessly)
|
||||
try:
|
||||
if not config.dry_run:
|
||||
license_info = self._classify_license(chunk)
|
||||
@@ -1278,6 +1586,9 @@ Gib JSON zurück mit diesen Feldern:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Flush remaining chunks
|
||||
await _flush_batch()
|
||||
|
||||
result.chunks_skipped_prefilter = chunks_skipped_prefilter
|
||||
logger.info(
|
||||
"Pipeline complete: %d controls generated, %d chunks skipped by prefilter, %d total chunks",
|
||||
|
||||
1436
backend-compliance/scripts/seed_policy_templates.py
Normal file
1436
backend-compliance/scripts/seed_policy_templates.py
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user