fix(auto-discovery): compute missing against canonical 8 types, not submitted

Frontend filters out empty doc rows -> req.documents only contains the
N submitted entries (3 in BMW case). The old auto-discovery loop
computed 'missing' as 'entries in doc_entries with empty text', which
was always empty for those N entries -> discovery never fired.

Fix:
- missing = _ALL_DOC_TYPES - {canonical doc_types of entries with text or URL}
- For each missing type, APPEND a new entry to doc_entries with
  discovery_attempted=True. If a discovered doc matched, fill text/url
  and set auto_discovered=True.
- Check loop: skip entries with no URL and no text (let padding label
  them). Entries with URL but no text keep the 'Kein Text' error so the
  user sees fetch failures explicitly.
This commit is contained in:
Benjamin Admin
2026-05-17 01:28:51 +02:00
parent f19a75d83d
commit b2b4d77877
@@ -250,10 +250,16 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
_update(check_id, f"Pruefen {i+1}/{n_entries}: {label}...", pct)
if not text or len(text) < 50:
results.append(DocCheckResult(
label=label, url=url, doc_type=doc_type,
error="Kein Text vorhanden oder zu kurz",
))
# Empty entry — either from auto-discovery padding (no URL
# to fetch) or from a fetch that returned nothing. If there
# was a URL we keep the error so the user knows the fetch
# failed; otherwise let the padding step label it
# 'Nicht eingereicht' / 'Auf der Website nicht gefunden'.
if (entry.get("url") or "").strip():
results.append(DocCheckResult(
label=label, url=url, doc_type=doc_type,
error="Kein Text vorhanden oder zu kurz",
))
continue
result = await _check_single(
@@ -503,11 +509,17 @@ async def _autodiscover_missing(
"""
from urllib.parse import urlparse
# Which canonical types are still empty (no text, no submitted URL)?
missing = {
# Submitted doc_types (those the user actually entered URL or text for).
submitted_types = {
e["doc_type"] for e in doc_entries
if not e.get("text") and not (e.get("url") or "").strip()
if e.get("text") or (e.get("url") or "").strip()
}
# Map alias types to canonical
submitted_canon = {
"dse" if t in ("datenschutz", "privacy") else t for t in submitted_types
}
# Missing = canonical types the user did NOT submit
missing = set(_ALL_DOC_TYPES) - submitted_canon
if not missing:
return
@@ -520,10 +532,14 @@ async def _autodiscover_missing(
base = f"{p.scheme}://{p.netloc}"
bases[base] = bases.get(base, 0) + 1
if not bases:
# No submitted URL at all — nothing to crawl from.
for e in doc_entries:
if not e.get("text") and not (e.get("url") or "").strip():
e["discovery_attempted"] = False
# No submitted URL at all — nothing to crawl from. Add empty
# placeholders (with discovery_attempted=False) so the padding
# step renders them as 'Nicht eingereicht' (not 'Nicht gefunden').
for dt in missing:
doc_entries.append({
"doc_type": dt, "url": "", "text": "", "word_count": 0,
"auto_discovered": False, "discovery_attempted": False,
})
return
base = max(bases, key=bases.get) + "/"
@@ -561,30 +577,35 @@ async def _autodiscover_missing(
if canon and canon in missing and canon not in by_type:
by_type[canon] = d
# Fill matching entries
# Append a new entry for every missing canonical type. Auto-discovered
# ones get the text/URL filled; unmatched ones stay empty so the
# padding step renders them as 'Auf der Website nicht gefunden'.
filled = 0
for entry in doc_entries:
dt = entry["doc_type"]
entry["discovery_attempted"] = dt in missing
if dt not in missing or dt not in by_type:
continue
d = by_type[dt]
full = d.get("full_text") or d.get("text_preview") or ""
if len(full.split()) < 100:
continue
entry["text"] = full
entry["url"] = d.get("url", "")
entry["word_count"] = len(full.split())
entry["auto_discovered"] = True
doc_texts[dt] = full
filled += 1
logger.info(
"auto-discovered %s on %s: %s (%d words)",
dt, base, d.get("url", "")[:80], entry["word_count"],
)
for dt in missing:
new_entry: dict = {
"doc_type": dt, "url": "", "text": "", "word_count": 0,
"auto_discovered": False, "discovery_attempted": True,
}
d = by_type.get(dt)
if d:
full = d.get("full_text") or d.get("text_preview") or ""
if len(full.split()) >= 100:
new_entry["text"] = full
new_entry["url"] = d.get("url", "")
new_entry["word_count"] = len(full.split())
new_entry["auto_discovered"] = True
doc_texts[dt] = full
filled += 1
logger.info(
"auto-discovered %s on %s: %s (%d words)",
dt, base, d.get("url", "")[:80], new_entry["word_count"],
)
doc_entries.append(new_entry)
if filled:
logger.info("auto-discovery: filled %d/%d missing types", filled, len(missing))
logger.info(
"auto-discovery: filled %d/%d missing types from %s",
filled, len(missing), base,
)
# Title/URL keywords → canonical doc_type. Order matters: most-specific first.