feat(rag): optimize RAG pipeline — JSON-Mode, CoT, Hybrid Search, Re-Ranking, Cross-Reg Dedup, chunk 1024
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 42s
CI/CD / test-python-backend-compliance (push) Successful in 1m38s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 42s
CI/CD / test-python-backend-compliance (push) Successful in 1m38s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
Phase 1 (LLM Quality):
- Add format=json to all Ollama payloads (obligation_extractor, control_generator, citation_backfill)
- Add Chain-of-Thought analysis steps to Pass 0a/0b system prompts

Phase 2 (Retrieval Quality):
- Hybrid search via Qdrant Query API with RRF fusion + automatic text index (legal_rag.go)
- Fallback to dense-only search if Query API unavailable
- Cross-encoder re-ranking with BGE Reranker v2 (RERANK_ENABLED=false by default)
- CPU-only PyTorch dependency to keep Docker image small

Phase 3 (Data Layer):
- Cross-regulation dedup pass (threshold 0.95) links controls across regulations
- DedupResult.link_type field distinguishes dedup_merge vs cross_regulation
- Chunk size defaults updated 512/50 → 1024/128 for new ingestions only
- Existing collections and controls are NOT affected

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -46,20 +46,62 @@ ANTHROPIC_API_URL = "https://api.anthropic.com/v1"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Normative signal detection — 3-tier classification
# ---------------------------------------------------------------------------
# Tier 1: Pflicht (mandatory) — strong normative signals
# Tier 2: Empfehlung (recommendation) — weaker normative signals
# Tier 3: Kann (optional/permissive) — permissive signals
# Nothing is rejected — everything is classified.
#
# Each list holds regex alternatives; they are OR-joined and compiled
# case-insensitively below.

_PFLICHT_SIGNALS = [
    # German modal obligation phrasings
    r"\bmüssen\b", r"\bmuss\b", r"\bhat\s+sicherzustellen\b",
    r"\bhaben\s+sicherzustellen\b", r"\bsind\s+verpflichtet\b",
    r"\bist\s+verpflichtet\b",
    # Prohibitions: a negated permission is binding, not optional. Without
    # these, "darf nicht" / "may not" would only hit the permissive
    # \bdarf\b / \bmay\b patterns of the Kann tier.
    r"\bdarf\s+(?:nicht|kein\w*)\b", r"\bdürfen\s+(?:nicht|kein\w*)\b",
    r"\bmay\s+not\b",
    # "ist zu prüfen", "sind zu dokumentieren" (direct "zu" + infinitive)
    r"\bist\s+zu\s+\w+en\b", r"\bsind\s+zu\s+\w+en\b",
    r"\bhat\s+zu\s+\w+en\b", r"\bhaben\s+zu\s+\w+en\b",
    # "ist festzustellen", "sind vorzunehmen" (compound verbs, embedded "zu")
    r"\bist\s+\w+zu\w+en\b", r"\bsind\s+\w+zu\w+en\b",
    # "ist zusätzlich zu prüfen", "sind regelmäßig zu überwachen"
    # (one adverb between auxiliary and "zu")
    r"\bist\s+\w+\s+zu\s+\w+en\b", r"\bsind\s+\w+\s+zu\s+\w+en\b",
    r"\bhat\s+\w+\s+zu\s+\w+en\b", r"\bhaben\s+\w+\s+zu\s+\w+en\b",
    # English obligation signals
    r"\bshall\b", r"\bmust\b", r"\brequired\b",
    # Compound infinitives (gerundive): mitzuteilen, anzuwenden, bereitzustellen
    r"\b\w+zuteilen\b", r"\b\w+zuwenden\b", r"\b\w+zustellen\b", r"\b\w+zulegen\b",
    r"\b\w+zunehmen\b", r"\b\w+zuführen\b", r"\b\w+zuhalten\b", r"\b\w+zusetzen\b",
    r"\b\w+zuweisen\b", r"\b\w+zuordnen\b", r"\b\w+zufügen\b", r"\b\w+zugeben\b",
    # Broad pattern: "ist ... [up to 80 characters] ... zu + infinitive"
    r"\bist\b.{1,80}\bzu\s+\w+en\b", r"\bsind\b.{1,80}\bzu\s+\w+en\b",
]
_PFLICHT_RE = re.compile("|".join(_PFLICHT_SIGNALS), re.IGNORECASE)

_EMPFEHLUNG_SIGNALS = [
    # Modal verbs (weaker than "muss")
    r"\bsoll\b", r"\bsollen\b", r"\bsollte\b", r"\bsollten\b",
    r"\bgewährleisten\b", r"\bsicherstellen\b",
    # English recommendation signals
    r"\bshould\b", r"\bensure\b", r"\brecommend\w*\b",
    # Common normative infinitives (no auxiliary verb, treated as recommendation)
    r"\bnachweisen\b", r"\beinhalten\b", r"\bunterlassen\b", r"\bwahren\b",
    r"\bdokumentieren\b", r"\bimplementieren\b", r"\büberprüfen\b", r"\büberwachen\b",
    # Check instructions phrased as a normative statement
    r"\bprüfen,\s+ob\b", r"\bkontrollieren,\s+ob\b",
]
_EMPFEHLUNG_RE = re.compile("|".join(_EMPFEHLUNG_SIGNALS), re.IGNORECASE)

_KANN_SIGNALS = [
    r"\bkann\b", r"\bkönnen\b", r"\bdarf\b", r"\bdürfen\b",
    r"\bmay\b", r"\boptional\b",
]
_KANN_RE = re.compile("|".join(_KANN_SIGNALS), re.IGNORECASE)

# Union of all normative signals (for backward-compatible has_normative_signal flag)
_NORMATIVE_RE = re.compile(
    "|".join(_PFLICHT_SIGNALS + _EMPFEHLUNG_SIGNALS + _KANN_SIGNALS),
    re.IGNORECASE,
)
|
||||
|
||||
_RATIONALE_SIGNALS = [
|
||||
r"\bda\s+", r"\bweil\b", r"\bgrund\b", r"\berwägung",
|
||||
@@ -100,6 +142,7 @@ class ObligationCandidate:
|
||||
object_: str = ""
|
||||
condition: Optional[str] = None
|
||||
normative_strength: str = "must"
|
||||
obligation_type: str = "pflicht" # pflicht | empfehlung | kann
|
||||
is_test_obligation: bool = False
|
||||
is_reporting_obligation: bool = False
|
||||
extraction_confidence: float = 0.0
|
||||
@@ -115,6 +158,7 @@ class ObligationCandidate:
|
||||
"object": self.object_,
|
||||
"condition": self.condition,
|
||||
"normative_strength": self.normative_strength,
|
||||
"obligation_type": self.obligation_type,
|
||||
"is_test_obligation": self.is_test_obligation,
|
||||
"is_reporting_obligation": self.is_reporting_obligation,
|
||||
"extraction_confidence": self.extraction_confidence,
|
||||
@@ -162,11 +206,30 @@ class AtomicControlCandidate:
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def classify_obligation_type(txt: str) -> str:
    """Map an obligation text to one of 'pflicht', 'empfehlung' or 'kann'.

    The tiers are tried in descending strength (pflicht, then empfehlung,
    then kann) and the first tier whose pattern matches anywhere in the
    text wins. Nothing is rejected: a text that matches no tier at all is
    returned as 'empfehlung' (recommendation).
    """
    # Ordered strongest-first; the order is what implements the priority.
    tiers = (
        (_PFLICHT_RE, "pflicht"),
        (_EMPFEHLUNG_RE, "empfehlung"),
        (_KANN_RE, "kann"),
    )
    for pattern, label in tiers:
        if pattern.search(txt):
            return label
    # No signal at all: the LLM still considered it an obligation, so keep
    # it as a recommendation rather than dropping it.
    return "empfehlung"
|
||||
|
||||
|
||||
def quality_gate(candidate: ObligationCandidate) -> dict:
|
||||
"""Validate an obligation candidate. Returns quality flags dict.
|
||||
|
||||
Checks:
|
||||
has_normative_signal: text contains normative language
|
||||
has_normative_signal: text contains normative language (informational)
|
||||
obligation_type: pflicht | empfehlung | kann (classified, never rejected)
|
||||
single_action: only one main action (heuristic)
|
||||
not_rationale: not just a justification/reasoning
|
||||
not_evidence_only: not just an evidence requirement
|
||||
@@ -176,9 +239,12 @@ def quality_gate(candidate: ObligationCandidate) -> dict:
|
||||
txt = candidate.obligation_text
|
||||
flags = {}
|
||||
|
||||
# 1. Normative signal
|
||||
# 1. Normative signal (informational — no longer used for rejection)
|
||||
flags["has_normative_signal"] = bool(_NORMATIVE_RE.search(txt))
|
||||
|
||||
# 1b. Obligation type classification
|
||||
flags["obligation_type"] = classify_obligation_type(txt)
|
||||
|
||||
# 2. Single action heuristic — count "und" / "and" / "sowie" splits
|
||||
# that connect different verbs (imperfect but useful)
|
||||
multi_verb_re = re.compile(
|
||||
@@ -210,8 +276,12 @@ def quality_gate(candidate: ObligationCandidate) -> dict:
|
||||
|
||||
|
||||
def passes_quality_gate(flags: dict) -> bool:
    """Check whether all critical quality flags pass.

    Note: has_normative_signal is NO LONGER critical — obligations without
    normative signal are classified as 'empfehlung' instead of being rejected.

    Args:
        flags: Quality flags dict keyed by flag name. A critical flag that
            is missing from the dict counts as failed.

    Returns:
        True only if every critical flag is present and truthy.
    """
    critical = ("not_evidence_only", "min_length", "has_parent_link")
    return all(flags.get(k, False) for k in critical)
|
||||
|
||||
|
||||
@@ -224,6 +294,13 @@ _PASS0A_SYSTEM_PROMPT = """\
|
||||
Du bist ein Rechts-Compliance-Experte. Du zerlegst Compliance-Controls \
|
||||
in einzelne atomare Pflichten.
|
||||
|
||||
ANALYSE-SCHRITTE (intern durchfuehren, NICHT im Output!):
|
||||
1. Identifiziere den Adressaten (Wer muss handeln?)
|
||||
2. Identifiziere die Handlung (Was muss getan werden?)
|
||||
3. Bestimme die normative Staerke (muss/soll/kann)
|
||||
4. Pruefe ob Test- oder Meldepflicht vorliegt (separat erfassen!)
|
||||
5. Formuliere jede Pflicht als eigenstaendiges JSON-Objekt
|
||||
|
||||
REGELN (STRIKT EINHALTEN):
|
||||
1. Nur normative Aussagen extrahieren — erkennbar an: müssen, haben \
|
||||
sicherzustellen, sind verpflichtet, ist zu dokumentieren, ist zu melden, \
|
||||
@@ -272,6 +349,12 @@ _PASS0B_SYSTEM_PROMPT = """\
|
||||
Du bist ein Security-Compliance-Experte. Du erstellst aus einer einzelnen \
|
||||
normativen Pflicht ein praxisorientiertes, atomares Security Control.
|
||||
|
||||
ANALYSE-SCHRITTE (intern durchfuehren, NICHT im Output!):
|
||||
1. Identifiziere die konkrete Anforderung aus der Pflicht
|
||||
2. Leite eine umsetzbare technische/organisatorische Massnahme ab
|
||||
3. Definiere ein Pruefverfahren (wie wird Umsetzung verifiziert?)
|
||||
4. Bestimme den Nachweis (welches Dokument/Artefakt belegt Compliance?)
|
||||
|
||||
Das Control muss UMSETZBAR sein — keine Gesetzesparaphrase.
|
||||
Antworte NUR als JSON. Keine Erklärungen."""
|
||||
|
||||
@@ -603,8 +686,15 @@ class DecompositionPass:
|
||||
stats_0b = await decomp.run_pass0b(limit=100)
|
||||
"""
|
||||
|
||||
def __init__(self, db: Session, dedup_enabled: bool = False):
    """Store the DB session and optionally set up the dedup checker.

    Args:
        db: Session kept on the instance as ``self.db``.
        dedup_enabled: When true, a ``ControlDedupChecker`` is created,
            but only if the ``DEDUP_ENABLED`` flag of the dedup module is
            also truthy. Defaults to False, which leaves dedup off.
    """
    self.db = db
    # None means "dedup off"; the rest of the class checks this attribute.
    self._dedup = None
    if dedup_enabled:
        # Imported only on request, so the dedup module is not loaded
        # when the caller leaves dedup disabled.
        from compliance.services.control_dedup import (
            ControlDedupChecker, DEDUP_ENABLED,
        )
        if DEDUP_ENABLED:
            self._dedup = ControlDedupChecker(db)
|
||||
|
||||
# -------------------------------------------------------------------
|
||||
# Pass 0a: Obligation Extraction
|
||||
@@ -810,10 +900,11 @@ class DecompositionPass:
|
||||
if not cand.is_reporting_obligation and _REPORTING_RE.search(cand.obligation_text):
|
||||
cand.is_reporting_obligation = True
|
||||
|
||||
# Quality gate
|
||||
# Quality gate + obligation type classification
|
||||
flags = quality_gate(cand)
|
||||
cand.quality_flags = flags
|
||||
cand.extraction_confidence = _compute_extraction_confidence(flags)
|
||||
cand.obligation_type = flags.get("obligation_type", "empfehlung")
|
||||
|
||||
if passes_quality_gate(flags):
|
||||
cand.release_state = "validated"
|
||||
@@ -877,6 +968,9 @@ class DecompositionPass:
|
||||
"errors": 0,
|
||||
"provider": "anthropic" if use_anthropic else "ollama",
|
||||
"batch_size": batch_size,
|
||||
"dedup_enabled": self._dedup is not None,
|
||||
"dedup_linked": 0,
|
||||
"dedup_review": 0,
|
||||
}
|
||||
|
||||
# Prepare obligation data
|
||||
@@ -915,7 +1009,7 @@ class DecompositionPass:
|
||||
results_by_id = _parse_json_object(llm_response)
|
||||
for obl in batch:
|
||||
parsed = results_by_id.get(obl["candidate_id"], {})
|
||||
self._process_pass0b_control(obl, parsed, stats)
|
||||
await self._process_pass0b_control(obl, parsed, stats)
|
||||
elif use_anthropic:
|
||||
obl = batch[0]
|
||||
prompt = _build_pass0b_prompt(
|
||||
@@ -931,7 +1025,7 @@ class DecompositionPass:
|
||||
)
|
||||
stats["llm_calls"] += 1
|
||||
parsed = _parse_json_object(llm_response)
|
||||
self._process_pass0b_control(obl, parsed, stats)
|
||||
await self._process_pass0b_control(obl, parsed, stats)
|
||||
else:
|
||||
from compliance.services.obligation_extractor import _llm_ollama
|
||||
obl = batch[0]
|
||||
@@ -948,7 +1042,7 @@ class DecompositionPass:
|
||||
)
|
||||
stats["llm_calls"] += 1
|
||||
parsed = _parse_json_object(llm_response)
|
||||
self._process_pass0b_control(obl, parsed, stats)
|
||||
await self._process_pass0b_control(obl, parsed, stats)
|
||||
|
||||
except Exception as e:
|
||||
ids = ", ".join(o["candidate_id"] for o in batch)
|
||||
@@ -959,10 +1053,16 @@ class DecompositionPass:
|
||||
logger.info("Pass 0b: %s", stats)
|
||||
return stats
|
||||
|
||||
def _process_pass0b_control(
|
||||
async def _process_pass0b_control(
|
||||
self, obl: dict, parsed: dict, stats: dict,
|
||||
) -> None:
|
||||
"""Create atomic control from parsed LLM output or template fallback."""
|
||||
"""Create atomic control from parsed LLM output or template fallback.
|
||||
|
||||
If dedup is enabled, checks for duplicates before insertion:
|
||||
- LINK: adds parent link to existing control instead of creating new
|
||||
- REVIEW: queues for human review, does not create control
|
||||
- NEW: creates new control and indexes in Qdrant
|
||||
"""
|
||||
if not parsed or not parsed.get("title"):
|
||||
atomic = _template_fallback(
|
||||
obligation_text=obl["obligation_text"],
|
||||
@@ -990,6 +1090,56 @@ class DecompositionPass:
|
||||
atomic.parent_control_uuid = obl["parent_uuid"]
|
||||
atomic.obligation_candidate_id = obl["candidate_id"]
|
||||
|
||||
# ── Dedup check (if enabled) ────────────────────────────
|
||||
if self._dedup:
|
||||
pattern_id = None
|
||||
# Try to get pattern_id from parent control
|
||||
pid_row = self.db.execute(text(
|
||||
"SELECT pattern_id FROM canonical_controls WHERE id = CAST(:uid AS uuid)"
|
||||
), {"uid": obl["parent_uuid"]}).fetchone()
|
||||
if pid_row:
|
||||
pattern_id = pid_row[0]
|
||||
|
||||
result = await self._dedup.check_duplicate(
|
||||
action=obl.get("action", ""),
|
||||
obj=obl.get("object", ""),
|
||||
title=atomic.title,
|
||||
pattern_id=pattern_id,
|
||||
)
|
||||
|
||||
if result.verdict == "link":
|
||||
self._dedup.add_parent_link(
|
||||
control_uuid=result.matched_control_uuid,
|
||||
parent_control_uuid=obl["parent_uuid"],
|
||||
link_type="dedup_merge",
|
||||
confidence=result.similarity_score,
|
||||
)
|
||||
stats.setdefault("dedup_linked", 0)
|
||||
stats["dedup_linked"] += 1
|
||||
stats["candidates_processed"] += 1
|
||||
logger.info("Dedup LINK: %s → %s (%.3f, %s)",
|
||||
atomic.title[:60], result.matched_control_id,
|
||||
result.similarity_score, result.stage)
|
||||
return
|
||||
|
||||
if result.verdict == "review":
|
||||
self._dedup.write_review(
|
||||
candidate_control_id=atomic.candidate_id or "",
|
||||
candidate_title=atomic.title,
|
||||
candidate_objective=atomic.objective,
|
||||
result=result,
|
||||
parent_control_uuid=obl["parent_uuid"],
|
||||
obligation_candidate_id=obl.get("oc_id"),
|
||||
)
|
||||
stats.setdefault("dedup_review", 0)
|
||||
stats["dedup_review"] += 1
|
||||
stats["candidates_processed"] += 1
|
||||
logger.info("Dedup REVIEW: %s ↔ %s (%.3f, %s)",
|
||||
atomic.title[:60], result.matched_control_id,
|
||||
result.similarity_score, result.stage)
|
||||
return
|
||||
|
||||
# ── Create new atomic control ───────────────────────────
|
||||
seq = self._next_atomic_seq(obl["parent_control_id"])
|
||||
atomic.candidate_id = f"{obl['parent_control_id']}-A{seq:02d}"
|
||||
|
||||
@@ -1006,6 +1156,29 @@ class DecompositionPass:
|
||||
{"oc_id": obl["oc_id"]},
|
||||
)
|
||||
|
||||
# Index in Qdrant for future dedup checks
|
||||
if self._dedup:
|
||||
pattern_id_val = None
|
||||
pid_row2 = self.db.execute(text(
|
||||
"SELECT pattern_id FROM canonical_controls WHERE id = CAST(:uid AS uuid)"
|
||||
), {"uid": obl["parent_uuid"]}).fetchone()
|
||||
if pid_row2:
|
||||
pattern_id_val = pid_row2[0]
|
||||
|
||||
# Get the UUID of the newly inserted control
|
||||
new_row = self.db.execute(text(
|
||||
"SELECT id::text FROM canonical_controls WHERE control_id = :cid ORDER BY created_at DESC LIMIT 1"
|
||||
), {"cid": atomic.candidate_id}).fetchone()
|
||||
if new_row and pattern_id_val:
|
||||
await self._dedup.index_control(
|
||||
control_uuid=new_row[0],
|
||||
control_id=atomic.candidate_id,
|
||||
title=atomic.title,
|
||||
action=obl.get("action", ""),
|
||||
obj=obl.get("object", ""),
|
||||
pattern_id=pattern_id_val,
|
||||
)
|
||||
|
||||
stats["controls_created"] += 1
|
||||
stats["candidates_processed"] += 1
|
||||
|
||||
@@ -1415,7 +1588,7 @@ class DecompositionPass:
|
||||
if pass_type == "0a":
|
||||
self._handle_batch_result_0a(custom_id, text_content, stats)
|
||||
else:
|
||||
self._handle_batch_result_0b(custom_id, text_content, stats)
|
||||
await self._handle_batch_result_0b(custom_id, text_content, stats)
|
||||
except Exception as e:
|
||||
logger.error("Processing batch result %s: %s", custom_id, e)
|
||||
stats["errors"] += 1
|
||||
@@ -1466,7 +1639,7 @@ class DecompositionPass:
|
||||
self._process_pass0a_obligations(raw_obls, control_id, control_uuid, stats)
|
||||
stats["controls_processed"] += 1
|
||||
|
||||
def _handle_batch_result_0b(
|
||||
async def _handle_batch_result_0b(
|
||||
self, custom_id: str, text_content: str, stats: dict,
|
||||
) -> None:
|
||||
"""Process a single Pass 0b batch result."""
|
||||
@@ -1477,14 +1650,14 @@ class DecompositionPass:
|
||||
parsed = _parse_json_object(text_content)
|
||||
obl = self._load_obligation_for_0b(candidate_ids[0])
|
||||
if obl:
|
||||
self._process_pass0b_control(obl, parsed, stats)
|
||||
await self._process_pass0b_control(obl, parsed, stats)
|
||||
else:
|
||||
results_by_id = _parse_json_object(text_content)
|
||||
for cand_id in candidate_ids:
|
||||
parsed = results_by_id.get(cand_id, {})
|
||||
obl = self._load_obligation_for_0b(cand_id)
|
||||
if obl:
|
||||
self._process_pass0b_control(obl, parsed, stats)
|
||||
await self._process_pass0b_control(obl, parsed, stats)
|
||||
|
||||
def _load_obligation_for_0b(self, candidate_id: str) -> Optional[dict]:
|
||||
"""Load obligation data needed for Pass 0b processing."""
|
||||
|
||||
Reference in New Issue
Block a user