feat(compliance-check): split shared URLs into sections per doc_type

When the same URL is used for multiple document types (e.g. /datenschutz
for DSI + Cookie + DSB), the section splitter now:

- Detects duplicate URLs and fetches the text only once
- Splits the text at classified headings (Cookie, Google Analytics, etc.)
- Assigns the matching sections to each doc_type
- Keeps the full text for DSI in every case

Extracted to section_splitter.py (170 LOC) to keep routes under 500 LOC.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-12 12:49:57 +02:00
parent 128967fa3d
commit 74f00bbb0f
2 changed files with 191 additions and 10 deletions
@@ -151,11 +151,20 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
        doc_texts: dict[str, str] = {}
        doc_entries: list[dict] = []

+        # Cache fetched URLs to detect duplicates
+        url_text_cache: dict[str, str] = {}
+
        for i, doc in enumerate(req.documents):
            _update(check_id, f"Dokument {i+1}/{len(req.documents)}: {doc.doc_type}...")
            text = doc.text
            if not text and doc.url:
-                text = await _fetch_text(doc.url)
+                url_key = doc.url.strip().rstrip("/").lower()
+                if url_key in url_text_cache:
+                    text = url_text_cache[url_key]
+                else:
+                    text = await _fetch_text(doc.url)
+                    if text:
+                        url_text_cache[url_key] = text
            if text:
                doc_texts[doc.doc_type] = text
            doc_entries.append({
@@ -165,6 +174,14 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
                "word_count": len(text.split()) if text else 0,
            })

+        # Step 1b: If same URL used for multiple doc_types, try section splitting
+        from compliance.services.section_splitter import split_shared_texts
+        split_shared_texts(doc_entries, url_text_cache)
+        # Refresh doc_texts after splitting
+        for entry in doc_entries:
+            if entry.get("text"):
+                doc_texts[entry["doc_type"]] = entry["text"]
+
        # Step 2: Detect business profile
        _update(check_id, "Geschaeftsmodell wird erkannt...")
        profile = await detect_business_profile(doc_texts)
@@ -431,19 +448,13 @@ def _doc_type_label(doc_type: str) -> str:

 def _result_to_dict(r) -> dict:
    """Convert DocCheckResult to JSON-serializable dict."""
+    fields = ("id", "label", "passed", "severity", "matched_text",
+              "level", "parent", "skipped", "hint")
    return {
        "label": r.label, "url": r.url, "doc_type": r.doc_type,
        "word_count": r.word_count, "completeness_pct": r.completeness_pct,
        "correctness_pct": r.correctness_pct,
-        "checks": [
-            {
-                "id": c.id, "label": c.label, "passed": c.passed,
-                "severity": c.severity, "matched_text": c.matched_text,
-                "level": c.level, "parent": c.parent,
-                "skipped": c.skipped, "hint": c.hint,
-            }
-            for c in r.checks
-        ],
+        "checks": [{f: getattr(c, f) for f in fields} for c in r.checks],
        "findings_count": r.findings_count, "error": r.error,
    }