fix(auto-discovery): compute missing against canonical 8 types, not submitted

Frontend filters out empty doc rows, so req.documents only contains the
N submitted entries (3 in the BMW case). The old auto-discovery loop
computed 'missing' as 'entries in doc_entries with neither text nor URL',
which was always empty because every one of those N submitted entries
has text or a URL, so discovery never fired.

Fix:
- missing = _ALL_DOC_TYPES - {canonical doc_types of the entries in
  doc_entries that have text or a URL}; the aliases 'datenschutz' and
  'privacy' count as 'dse'
- For each missing type, APPEND a new entry to doc_entries with
  discovery_attempted=True. If a discovered doc matched, fill text/url
  and set auto_discovered=True.
- Check loop: skip entries with no URL and no text (let padding label
  them). Entries with URL but no text keep the 'Kein Text' error so the
  user sees fetch failures explicitly.
This commit is contained in:
Benjamin Admin
2026-05-17 01:28:51 +02:00
parent f19a75d83d
commit b2b4d77877
@@ -250,10 +250,16 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
_update(check_id, f"Pruefen {i+1}/{n_entries}: {label}...", pct) _update(check_id, f"Pruefen {i+1}/{n_entries}: {label}...", pct)
if not text or len(text) < 50: if not text or len(text) < 50:
results.append(DocCheckResult( # Empty entry — either from auto-discovery padding (no URL
label=label, url=url, doc_type=doc_type, # to fetch) or from a fetch that returned nothing. If there
error="Kein Text vorhanden oder zu kurz", # was a URL we keep the error so the user knows the fetch
)) # failed; otherwise let the padding step label it
# 'Nicht eingereicht' / 'Auf der Website nicht gefunden'.
if (entry.get("url") or "").strip():
results.append(DocCheckResult(
label=label, url=url, doc_type=doc_type,
error="Kein Text vorhanden oder zu kurz",
))
continue continue
result = await _check_single( result = await _check_single(
@@ -503,11 +509,17 @@ async def _autodiscover_missing(
""" """
from urllib.parse import urlparse from urllib.parse import urlparse
# Which canonical types are still empty (no text, no submitted URL)? # Submitted doc_types (those the user actually entered URL or text for).
missing = { submitted_types = {
e["doc_type"] for e in doc_entries e["doc_type"] for e in doc_entries
if not e.get("text") and not (e.get("url") or "").strip() if e.get("text") or (e.get("url") or "").strip()
} }
# Map alias types to canonical
submitted_canon = {
"dse" if t in ("datenschutz", "privacy") else t for t in submitted_types
}
# Missing = canonical types the user did NOT submit
missing = set(_ALL_DOC_TYPES) - submitted_canon
if not missing: if not missing:
return return
@@ -520,10 +532,14 @@ async def _autodiscover_missing(
base = f"{p.scheme}://{p.netloc}" base = f"{p.scheme}://{p.netloc}"
bases[base] = bases.get(base, 0) + 1 bases[base] = bases.get(base, 0) + 1
if not bases: if not bases:
# No submitted URL at all — nothing to crawl from. # No submitted URL at all — nothing to crawl from. Add empty
for e in doc_entries: # placeholders (with discovery_attempted=False) so the padding
if not e.get("text") and not (e.get("url") or "").strip(): # step renders them as 'Nicht eingereicht' (not 'Nicht gefunden').
e["discovery_attempted"] = False for dt in missing:
doc_entries.append({
"doc_type": dt, "url": "", "text": "", "word_count": 0,
"auto_discovered": False, "discovery_attempted": False,
})
return return
base = max(bases, key=bases.get) + "/" base = max(bases, key=bases.get) + "/"
@@ -561,30 +577,35 @@ async def _autodiscover_missing(
if canon and canon in missing and canon not in by_type: if canon and canon in missing and canon not in by_type:
by_type[canon] = d by_type[canon] = d
# Fill matching entries # Append a new entry for every missing canonical type. Auto-discovered
# ones get the text/URL filled; unmatched ones stay empty so the
# padding step renders them as 'Auf der Website nicht gefunden'.
filled = 0 filled = 0
for entry in doc_entries: for dt in missing:
dt = entry["doc_type"] new_entry: dict = {
entry["discovery_attempted"] = dt in missing "doc_type": dt, "url": "", "text": "", "word_count": 0,
if dt not in missing or dt not in by_type: "auto_discovered": False, "discovery_attempted": True,
continue }
d = by_type[dt] d = by_type.get(dt)
full = d.get("full_text") or d.get("text_preview") or "" if d:
if len(full.split()) < 100: full = d.get("full_text") or d.get("text_preview") or ""
continue if len(full.split()) >= 100:
entry["text"] = full new_entry["text"] = full
entry["url"] = d.get("url", "") new_entry["url"] = d.get("url", "")
entry["word_count"] = len(full.split()) new_entry["word_count"] = len(full.split())
entry["auto_discovered"] = True new_entry["auto_discovered"] = True
doc_texts[dt] = full doc_texts[dt] = full
filled += 1 filled += 1
logger.info( logger.info(
"auto-discovered %s on %s: %s (%d words)", "auto-discovered %s on %s: %s (%d words)",
dt, base, d.get("url", "")[:80], entry["word_count"], dt, base, d.get("url", "")[:80], new_entry["word_count"],
) )
doc_entries.append(new_entry)
if filled: logger.info(
logger.info("auto-discovery: filled %d/%d missing types", filled, len(missing)) "auto-discovery: filled %d/%d missing types from %s",
filled, len(missing), base,
)
# Title/URL keywords → canonical doc_type. Order matters: most-specific first. # Title/URL keywords → canonical doc_type. Order matters: most-specific first.