fix(auto-discovery): compute missing against the 8 canonical types, not the submitted entries

The frontend filters out empty doc rows, so req.documents only contains the N submitted entries (3 in the BMW case). The old auto-discovery loop computed 'missing' as 'entries in doc_entries with empty text', which was always an empty set for those N entries, so discovery never fired.

Fix:
- missing = set(_ALL_DOC_TYPES) minus {canonical doc_types in doc_entries}.
- For each missing type, APPEND a new entry to doc_entries with discovery_attempted=True. If a discovered doc matched, fill text/url and set auto_discovered=True.
- Check loop: skip entries with no URL and no text (let padding label them). Entries with a URL but no text keep the 'Kein Text' error so the user sees fetch failures explicitly.
2026-05-17 01:28:51 +02:00
parent f19a75d83d
commit b2b4d77877
1 changed files with 54 additions and 33 deletions
@@ -250,10 +250,16 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
            _update(check_id, f"Pruefen {i+1}/{n_entries}: {label}...", pct)
            if not text or len(text) < 50:
-                results.append(DocCheckResult(
+                # Empty entry — either from auto-discovery padding (no URL
-                    label=label, url=url, doc_type=doc_type,
+                # to fetch) or from a fetch that returned nothing. If there
-                    error="Kein Text vorhanden oder zu kurz",
+                # was a URL we keep the error so the user knows the fetch
-                ))
+                # failed; otherwise let the padding step label it
                # 'Nicht eingereicht' / 'Auf der Website nicht gefunden'.
                if (entry.get("url") or "").strip():
                    results.append(DocCheckResult(
                        label=label, url=url, doc_type=doc_type,
                        error="Kein Text vorhanden oder zu kurz",
                    ))
                continue
            result = await _check_single(
@@ -503,11 +509,17 @@ async def _autodiscover_missing(
    """
    from urllib.parse import urlparse
-    # Which canonical types are still empty (no text, no submitted URL)?
+    # Submitted doc_types (those the user actually entered URL or text for).
-    missing = {
+    submitted_types = {
        e["doc_type"] for e in doc_entries
-        if not e.get("text") and not (e.get("url") or "").strip()
+        if e.get("text") or (e.get("url") or "").strip()
    }
    # Map alias types to canonical
    submitted_canon = {
        "dse" if t in ("datenschutz", "privacy") else t for t in submitted_types
    }
    # Missing = canonical types the user did NOT submit
    missing = set(_ALL_DOC_TYPES) - submitted_canon
    if not missing:
        return
@@ -520,10 +532,14 @@ async def _autodiscover_missing(
            base = f"{p.scheme}://{p.netloc}"
            bases[base] = bases.get(base, 0) + 1
    if not bases:
-        # No submitted URL at all — nothing to crawl from.
+        # No submitted URL at all — nothing to crawl from. Add empty
-        for e in doc_entries:
+        # placeholders (with discovery_attempted=False) so the padding
-            if not e.get("text") and not (e.get("url") or "").strip():
+        # step renders them as 'Nicht eingereicht' (not 'Nicht gefunden').
-                e["discovery_attempted"] = False
+        for dt in missing:
            doc_entries.append({
                "doc_type": dt, "url": "", "text": "", "word_count": 0,
                "auto_discovered": False, "discovery_attempted": False,
            })
        return
    base = max(bases, key=bases.get) + "/"
@@ -561,30 +577,35 @@ async def _autodiscover_missing(
        if canon and canon in missing and canon not in by_type:
            by_type[canon] = d
-    # Fill matching entries
+    # Append a new entry for every missing canonical type. Auto-discovered
    # ones get the text/URL filled; unmatched ones stay empty so the
    # padding step renders them as 'Auf der Website nicht gefunden'.
    filled = 0
-    for entry in doc_entries:
+    for dt in missing:
-        dt = entry["doc_type"]
+        new_entry: dict = {
-        entry["discovery_attempted"] = dt in missing
+            "doc_type": dt, "url": "", "text": "", "word_count": 0,
-        if dt not in missing or dt not in by_type:
+            "auto_discovered": False, "discovery_attempted": True,
-            continue
+        }
-        d = by_type[dt]
+        d = by_type.get(dt)
-        full = d.get("full_text") or d.get("text_preview") or ""
+        if d:
-        if len(full.split()) < 100:
+            full = d.get("full_text") or d.get("text_preview") or ""
-            continue
+            if len(full.split()) >= 100:
-        entry["text"] = full
+                new_entry["text"] = full
-        entry["url"] = d.get("url", "")
+                new_entry["url"] = d.get("url", "")
-        entry["word_count"] = len(full.split())
+                new_entry["word_count"] = len(full.split())
-        entry["auto_discovered"] = True
+                new_entry["auto_discovered"] = True
-        doc_texts[dt] = full
+                doc_texts[dt] = full
-        filled += 1
+                filled += 1
-        logger.info(
+                logger.info(
-            "auto-discovered %s on %s: %s (%d words)",
+                    "auto-discovered %s on %s: %s (%d words)",
-            dt, base, d.get("url", "")[:80], entry["word_count"],
+                    dt, base, d.get("url", "")[:80], new_entry["word_count"],
-        )
+                )
        doc_entries.append(new_entry)
-    if filled:
+    logger.info(
-        logger.info("auto-discovery: filled %d/%d missing types", filled, len(missing))
+        "auto-discovery: filled %d/%d missing types from %s",
        filled, len(missing), base,
    )
 # Title/URL keywords → canonical doc_type. Order matters: most-specific first.