From b2b4d7787783a3757cafa3eaa558b9308caa8b88 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sun, 17 May 2026 01:28:51 +0200 Subject: [PATCH] fix(auto-discovery): compute missing against canonical 8 types, not submitted Frontend filters out empty doc rows -> req.documents only contains the N submitted entries (3 in BMW case). The old auto-discovery loop computed 'missing' as 'entries in doc_entries with empty text', which was always empty for those N entries -> discovery never fired. Fix: - missing = _ALL_DOC_TYPES - {canonical doc_types in doc_entries} - For each missing type, APPEND a new entry to doc_entries with discovery_attempted=True. If a discovered doc matched, fill text/url and set auto_discovered=True. - Check loop: skip entries with no URL and no text (let padding label them). Entries with URL but no text keep the 'Kein Text' error so the user sees fetch failures explicitly. --- .../api/agent_compliance_check_routes.py | 87 ++++++++++++------- 1 file changed, 54 insertions(+), 33 deletions(-) diff --git a/backend-compliance/compliance/api/agent_compliance_check_routes.py b/backend-compliance/compliance/api/agent_compliance_check_routes.py index 926cb240..0104e7d4 100644 --- a/backend-compliance/compliance/api/agent_compliance_check_routes.py +++ b/backend-compliance/compliance/api/agent_compliance_check_routes.py @@ -250,10 +250,16 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest): _update(check_id, f"Pruefen {i+1}/{n_entries}: {label}...", pct) if not text or len(text) < 50: - results.append(DocCheckResult( - label=label, url=url, doc_type=doc_type, - error="Kein Text vorhanden oder zu kurz", - )) + # Empty entry — either from auto-discovery padding (no URL + # to fetch) or from a fetch that returned nothing. If there + # was a URL we keep the error so the user knows the fetch + # failed; otherwise let the padding step label it + # 'Nicht eingereicht' / 'Auf der Website nicht gefunden'. 
+ if (entry.get("url") or "").strip(): + results.append(DocCheckResult( + label=label, url=url, doc_type=doc_type, + error="Kein Text vorhanden oder zu kurz", + )) continue result = await _check_single( @@ -503,11 +509,17 @@ async def _autodiscover_missing( """ from urllib.parse import urlparse - # Which canonical types are still empty (no text, no submitted URL)? - missing = { + # Submitted doc_types (those the user actually entered URL or text for). + submitted_types = { e["doc_type"] for e in doc_entries - if not e.get("text") and not (e.get("url") or "").strip() + if e.get("text") or (e.get("url") or "").strip() } + # Map alias types to canonical + submitted_canon = { + "dse" if t in ("datenschutz", "privacy") else t for t in submitted_types + } + # Missing = canonical types the user did NOT submit + missing = set(_ALL_DOC_TYPES) - submitted_canon if not missing: return @@ -520,10 +532,14 @@ async def _autodiscover_missing( base = f"{p.scheme}://{p.netloc}" bases[base] = bases.get(base, 0) + 1 if not bases: - # No submitted URL at all — nothing to crawl from. - for e in doc_entries: - if not e.get("text") and not (e.get("url") or "").strip(): - e["discovery_attempted"] = False + # No submitted URL at all — nothing to crawl from. Add empty + # placeholders (with discovery_attempted=False) so the padding + # step renders them as 'Nicht eingereicht' (not 'Nicht gefunden'). + for dt in missing: + doc_entries.append({ + "doc_type": dt, "url": "", "text": "", "word_count": 0, + "auto_discovered": False, "discovery_attempted": False, + }) return base = max(bases, key=bases.get) + "/" @@ -561,30 +577,35 @@ async def _autodiscover_missing( if canon and canon in missing and canon not in by_type: by_type[canon] = d - # Fill matching entries + # Append a new entry for every missing canonical type. Auto-discovered + # ones get the text/URL filled; unmatched ones stay empty so the + # padding step renders them as 'Auf der Website nicht gefunden'. 
filled = 0 - for entry in doc_entries: - dt = entry["doc_type"] - entry["discovery_attempted"] = dt in missing - if dt not in missing or dt not in by_type: - continue - d = by_type[dt] - full = d.get("full_text") or d.get("text_preview") or "" - if len(full.split()) < 100: - continue - entry["text"] = full - entry["url"] = d.get("url", "") - entry["word_count"] = len(full.split()) - entry["auto_discovered"] = True - doc_texts[dt] = full - filled += 1 - logger.info( - "auto-discovered %s on %s: %s (%d words)", - dt, base, d.get("url", "")[:80], entry["word_count"], - ) + for dt in missing: + new_entry: dict = { + "doc_type": dt, "url": "", "text": "", "word_count": 0, + "auto_discovered": False, "discovery_attempted": True, + } + d = by_type.get(dt) + if d: + full = d.get("full_text") or d.get("text_preview") or "" + if len(full.split()) >= 100: + new_entry["text"] = full + new_entry["url"] = d.get("url", "") + new_entry["word_count"] = len(full.split()) + new_entry["auto_discovered"] = True + doc_texts[dt] = full + filled += 1 + logger.info( + "auto-discovered %s on %s: %s (%d words)", + dt, base, d.get("url", "")[:80], new_entry["word_count"], + ) + doc_entries.append(new_entry) - if filled: - logger.info("auto-discovery: filled %d/%d missing types", filled, len(missing)) + logger.info( + "auto-discovery: filled %d/%d missing types from %s", + filled, len(missing), base, + ) # Title/URL keywords → canonical doc_type. Order matters: most-specific first.