feat(compliance-check): split shared URLs into sections per doc_type
Build + Deploy / build-admin-compliance (push) Successful in 2m4s
Build + Deploy / build-backend-compliance (push) Successful in 3m39s
Build + Deploy / build-ai-sdk (push) Successful in 50s
Build + Deploy / build-developer-portal (push) Successful in 1m12s
Build + Deploy / build-tts (push) Successful in 2m16s
Build + Deploy / build-document-crawler (push) Successful in 1m9s
Build + Deploy / build-dsms-gateway (push) Successful in 35s
Build + Deploy / build-dsms-node (push) Successful in 32s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / loc-budget (push) Failing after 16s
CI / secret-scan (push) Has been skipped
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m37s
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / test-go (push) Successful in 43s
CI / test-python-backend (push) Successful in 39s
CI / test-python-document-crawler (push) Successful in 27s
CI / test-python-dsms-gateway (push) Successful in 22s
CI / validate-canonical-controls (push) Successful in 15s
Build + Deploy / trigger-orca (push) Successful in 3m16s
Build + Deploy / build-admin-compliance (push) Successful in 2m4s
Build + Deploy / build-backend-compliance (push) Successful in 3m39s
Build + Deploy / build-ai-sdk (push) Successful in 50s
Build + Deploy / build-developer-portal (push) Successful in 1m12s
Build + Deploy / build-tts (push) Successful in 2m16s
Build + Deploy / build-document-crawler (push) Successful in 1m9s
Build + Deploy / build-dsms-gateway (push) Successful in 35s
Build + Deploy / build-dsms-node (push) Successful in 32s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / loc-budget (push) Failing after 16s
CI / secret-scan (push) Has been skipped
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m37s
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / test-go (push) Successful in 43s
CI / test-python-backend (push) Successful in 39s
CI / test-python-document-crawler (push) Successful in 27s
CI / test-python-dsms-gateway (push) Successful in 22s
CI / validate-canonical-controls (push) Successful in 15s
Build + Deploy / trigger-orca (push) Successful in 3m16s
When the same URL is used for multiple document types (e.g. /datenschutz for DSI + Cookie + DSB), the section splitter now:
- Detects duplicate URLs and fetches the text only once
- Splits the text at classified headings (Cookie, Google Analytics, etc.)
- Assigns the matching sections to each doc_type
- Always keeps the full text for DSI

Extracted to section_splitter.py (170 LOC) to keep routes under 500 LOC.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -151,11 +151,20 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
||||
doc_texts: dict[str, str] = {}
|
||||
doc_entries: list[dict] = []
|
||||
|
||||
# Cache fetched URLs to detect duplicates
|
||||
url_text_cache: dict[str, str] = {}
|
||||
|
||||
for i, doc in enumerate(req.documents):
|
||||
_update(check_id, f"Dokument {i+1}/{len(req.documents)}: {doc.doc_type}...")
|
||||
text = doc.text
|
||||
if not text and doc.url:
|
||||
text = await _fetch_text(doc.url)
|
||||
url_key = doc.url.strip().rstrip("/").lower()
|
||||
if url_key in url_text_cache:
|
||||
text = url_text_cache[url_key]
|
||||
else:
|
||||
text = await _fetch_text(doc.url)
|
||||
if text:
|
||||
url_text_cache[url_key] = text
|
||||
if text:
|
||||
doc_texts[doc.doc_type] = text
|
||||
doc_entries.append({
|
||||
@@ -165,6 +174,14 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
||||
"word_count": len(text.split()) if text else 0,
|
||||
})
|
||||
|
||||
# Step 1b: If same URL used for multiple doc_types, try section splitting
|
||||
from compliance.services.section_splitter import split_shared_texts
|
||||
split_shared_texts(doc_entries, url_text_cache)
|
||||
# Refresh doc_texts after splitting
|
||||
for entry in doc_entries:
|
||||
if entry.get("text"):
|
||||
doc_texts[entry["doc_type"]] = entry["text"]
|
||||
|
||||
# Step 2: Detect business profile
|
||||
_update(check_id, "Geschaeftsmodell wird erkannt...")
|
||||
profile = await detect_business_profile(doc_texts)
|
||||
@@ -431,19 +448,13 @@ def _doc_type_label(doc_type: str) -> str:
|
||||
|
||||
def _result_to_dict(r) -> dict:
|
||||
"""Convert DocCheckResult to JSON-serializable dict."""
|
||||
fields = ("id", "label", "passed", "severity", "matched_text",
|
||||
"level", "parent", "skipped", "hint")
|
||||
return {
|
||||
"label": r.label, "url": r.url, "doc_type": r.doc_type,
|
||||
"word_count": r.word_count, "completeness_pct": r.completeness_pct,
|
||||
"correctness_pct": r.correctness_pct,
|
||||
"checks": [
|
||||
{
|
||||
"id": c.id, "label": c.label, "passed": c.passed,
|
||||
"severity": c.severity, "matched_text": c.matched_text,
|
||||
"level": c.level, "parent": c.parent,
|
||||
"skipped": c.skipped, "hint": c.hint,
|
||||
}
|
||||
for c in r.checks
|
||||
],
|
||||
"checks": [{f: getattr(c, f) for f in fields} for c in r.checks],
|
||||
"findings_count": r.findings_count, "error": r.error,
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user