From b2b4d7787783a3757cafa3eaa558b9308caa8b88 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.local>
Date: Sun, 17 May 2026 01:28:51 +0200
Subject: [PATCH] fix(auto-discovery): compute missing against canonical 8
 types, not submitted

Frontend filters out empty doc rows -> req.documents only contains the
N submitted entries (3 in BMW case). The old auto-discovery loop
computed 'missing' as 'entries in doc_entries with no text and no URL',
which was always the empty set for those N submitted entries ->
discovery never fired.

Fix:
- missing = _ALL_DOC_TYPES - {canonical doc_types in doc_entries}
- For each missing type, APPEND a new entry to doc_entries with
  discovery_attempted=True. If a discovered doc matched, fill text/url
  and set auto_discovered=True.
- Check loop: skip entries with no URL and no text (let padding label
  them). Entries with URL but no text keep the 'Kein Text' error so the
  user sees fetch failures explicitly.
---
 .../api/agent_compliance_check_routes.py      | 87 ++++++++++++-------
 1 file changed, 54 insertions(+), 33 deletions(-)

diff --git a/backend-compliance/compliance/api/agent_compliance_check_routes.py b/backend-compliance/compliance/api/agent_compliance_check_routes.py
index 926cb240..0104e7d4 100644
--- a/backend-compliance/compliance/api/agent_compliance_check_routes.py
+++ b/backend-compliance/compliance/api/agent_compliance_check_routes.py
@@ -250,10 +250,16 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
             _update(check_id, f"Pruefen {i+1}/{n_entries}: {label}...", pct)
 
             if not text or len(text) < 50:
-                results.append(DocCheckResult(
-                    label=label, url=url, doc_type=doc_type,
-                    error="Kein Text vorhanden oder zu kurz",
-                ))
+                # Empty entry — either from auto-discovery padding (no URL
+                # to fetch) or from a fetch that returned nothing. If there
+                # was a URL we keep the error so the user knows the fetch
+                # failed; otherwise let the padding step label it
+                # 'Nicht eingereicht' / 'Auf der Website nicht gefunden'.
+                if (entry.get("url") or "").strip():
+                    results.append(DocCheckResult(
+                        label=label, url=url, doc_type=doc_type,
+                        error="Kein Text vorhanden oder zu kurz",
+                    ))
                 continue
 
             result = await _check_single(
@@ -503,11 +509,17 @@ async def _autodiscover_missing(
     """
     from urllib.parse import urlparse
 
-    # Which canonical types are still empty (no text, no submitted URL)?
-    missing = {
+    # Submitted doc_types (those the user actually entered a URL or text for).
+    submitted_types = {
         e["doc_type"] for e in doc_entries
-        if not e.get("text") and not (e.get("url") or "").strip()
+        if e.get("text") or (e.get("url") or "").strip()
     }
+    # Map alias types to canonical
+    submitted_canon = {
+        "dse" if t in ("datenschutz", "privacy") else t for t in submitted_types
+    }
+    # Missing = canonical types the user did NOT submit
+    missing = set(_ALL_DOC_TYPES) - submitted_canon
     if not missing:
         return
 
@@ -520,10 +532,14 @@ async def _autodiscover_missing(
             base = f"{p.scheme}://{p.netloc}"
             bases[base] = bases.get(base, 0) + 1
     if not bases:
-        # No submitted URL at all — nothing to crawl from.
-        for e in doc_entries:
-            if not e.get("text") and not (e.get("url") or "").strip():
-                e["discovery_attempted"] = False
+        # No submitted URL at all — nothing to crawl from. Add empty
+        # placeholders (with discovery_attempted=False) so the padding
+        # step renders them as 'Nicht eingereicht' (not 'Nicht gefunden').
+        for dt in missing:
+            doc_entries.append({
+                "doc_type": dt, "url": "", "text": "", "word_count": 0,
+                "auto_discovered": False, "discovery_attempted": False,
+            })
         return
 
     base = max(bases, key=bases.get) + "/"
@@ -561,30 +577,35 @@ async def _autodiscover_missing(
         if canon and canon in missing and canon not in by_type:
             by_type[canon] = d
 
-    # Fill matching entries
+    # Append a new entry for every missing canonical type. Auto-discovered
+    # ones get the text/URL filled; unmatched ones stay empty so the
+    # padding step renders them as 'Auf der Website nicht gefunden'.
     filled = 0
-    for entry in doc_entries:
-        dt = entry["doc_type"]
-        entry["discovery_attempted"] = dt in missing
-        if dt not in missing or dt not in by_type:
-            continue
-        d = by_type[dt]
-        full = d.get("full_text") or d.get("text_preview") or ""
-        if len(full.split()) < 100:
-            continue
-        entry["text"] = full
-        entry["url"] = d.get("url", "")
-        entry["word_count"] = len(full.split())
-        entry["auto_discovered"] = True
-        doc_texts[dt] = full
-        filled += 1
-        logger.info(
-            "auto-discovered %s on %s: %s (%d words)",
-            dt, base, d.get("url", "")[:80], entry["word_count"],
-        )
+    for dt in missing:
+        new_entry: dict = {
+            "doc_type": dt, "url": "", "text": "", "word_count": 0,
+            "auto_discovered": False, "discovery_attempted": True,
+        }
+        d = by_type.get(dt)
+        if d:
+            full = d.get("full_text") or d.get("text_preview") or ""
+            if len(full.split()) >= 100:
+                new_entry["text"] = full
+                new_entry["url"] = d.get("url", "")
+                new_entry["word_count"] = len(full.split())
+                new_entry["auto_discovered"] = True
+                doc_texts[dt] = full
+                filled += 1
+                logger.info(
+                    "auto-discovered %s on %s: %s (%d words)",
+                    dt, base, d.get("url", "")[:80], new_entry["word_count"],
+                )
+        doc_entries.append(new_entry)
 
-    if filled:
-        logger.info("auto-discovery: filled %d/%d missing types", filled, len(missing))
+    logger.info(
+        "auto-discovery: filled %d/%d missing types from %s",
+        filled, len(missing), base,
+    )
 
 
 # Title/URL keywords → canonical doc_type. Order matters: most-specific first.