fix(auto-discovery): compute missing against canonical 8 types, not submitted
Frontend filters out empty doc rows -> req.documents only contains the
N submitted entries (3 in the BMW case). The old auto-discovery loop
computed 'missing' as 'entries in doc_entries with no text and no URL'.
Because the frontend had already dropped those rows, that set was always
empty -> discovery never fired.
Fix:
- missing = _ALL_DOC_TYPES - {canonical doc_types in doc_entries}
- For each missing type, APPEND a new entry to doc_entries with
discovery_attempted=True. If a discovered doc matched, fill text/url
and set auto_discovered=True.
- Check loop: skip entries with no URL and no text (let padding label
them). Entries with URL but no text keep the 'Kein Text' error so the
user sees fetch failures explicitly.
This commit is contained in:
@@ -250,10 +250,16 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
||||
_update(check_id, f"Pruefen {i+1}/{n_entries}: {label}...", pct)
|
||||
|
||||
if not text or len(text) < 50:
|
||||
results.append(DocCheckResult(
|
||||
label=label, url=url, doc_type=doc_type,
|
||||
error="Kein Text vorhanden oder zu kurz",
|
||||
))
|
||||
# Empty entry — either from auto-discovery padding (no URL
|
||||
# to fetch) or from a fetch that returned nothing. If there
|
||||
# was a URL we keep the error so the user knows the fetch
|
||||
# failed; otherwise let the padding step label it
|
||||
# 'Nicht eingereicht' / 'Auf der Website nicht gefunden'.
|
||||
if (entry.get("url") or "").strip():
|
||||
results.append(DocCheckResult(
|
||||
label=label, url=url, doc_type=doc_type,
|
||||
error="Kein Text vorhanden oder zu kurz",
|
||||
))
|
||||
continue
|
||||
|
||||
result = await _check_single(
|
||||
@@ -503,11 +509,17 @@ async def _autodiscover_missing(
|
||||
"""
|
||||
from urllib.parse import urlparse
|
||||
|
||||
# Which canonical types are still empty (no text, no submitted URL)?
|
||||
missing = {
|
||||
# Submitted doc_types (those the user actually entered URL or text for).
|
||||
submitted_types = {
|
||||
e["doc_type"] for e in doc_entries
|
||||
if not e.get("text") and not (e.get("url") or "").strip()
|
||||
if e.get("text") or (e.get("url") or "").strip()
|
||||
}
|
||||
# Map alias types to canonical
|
||||
submitted_canon = {
|
||||
"dse" if t in ("datenschutz", "privacy") else t for t in submitted_types
|
||||
}
|
||||
# Missing = canonical types the user did NOT submit
|
||||
missing = set(_ALL_DOC_TYPES) - submitted_canon
|
||||
if not missing:
|
||||
return
|
||||
|
||||
@@ -520,10 +532,14 @@ async def _autodiscover_missing(
|
||||
base = f"{p.scheme}://{p.netloc}"
|
||||
bases[base] = bases.get(base, 0) + 1
|
||||
if not bases:
|
||||
# No submitted URL at all — nothing to crawl from.
|
||||
for e in doc_entries:
|
||||
if not e.get("text") and not (e.get("url") or "").strip():
|
||||
e["discovery_attempted"] = False
|
||||
# No submitted URL at all — nothing to crawl from. Add empty
|
||||
# placeholders (with discovery_attempted=False) so the padding
|
||||
# step renders them as 'Nicht eingereicht' (not 'Nicht gefunden').
|
||||
for dt in missing:
|
||||
doc_entries.append({
|
||||
"doc_type": dt, "url": "", "text": "", "word_count": 0,
|
||||
"auto_discovered": False, "discovery_attempted": False,
|
||||
})
|
||||
return
|
||||
|
||||
base = max(bases, key=bases.get) + "/"
|
||||
@@ -561,30 +577,35 @@ async def _autodiscover_missing(
|
||||
if canon and canon in missing and canon not in by_type:
|
||||
by_type[canon] = d
|
||||
|
||||
# Fill matching entries
|
||||
# Append a new entry for every missing canonical type. Auto-discovered
|
||||
# ones get the text/URL filled; unmatched ones stay empty so the
|
||||
# padding step renders them as 'Auf der Website nicht gefunden'.
|
||||
filled = 0
|
||||
for entry in doc_entries:
|
||||
dt = entry["doc_type"]
|
||||
entry["discovery_attempted"] = dt in missing
|
||||
if dt not in missing or dt not in by_type:
|
||||
continue
|
||||
d = by_type[dt]
|
||||
full = d.get("full_text") or d.get("text_preview") or ""
|
||||
if len(full.split()) < 100:
|
||||
continue
|
||||
entry["text"] = full
|
||||
entry["url"] = d.get("url", "")
|
||||
entry["word_count"] = len(full.split())
|
||||
entry["auto_discovered"] = True
|
||||
doc_texts[dt] = full
|
||||
filled += 1
|
||||
logger.info(
|
||||
"auto-discovered %s on %s: %s (%d words)",
|
||||
dt, base, d.get("url", "")[:80], entry["word_count"],
|
||||
)
|
||||
for dt in missing:
|
||||
new_entry: dict = {
|
||||
"doc_type": dt, "url": "", "text": "", "word_count": 0,
|
||||
"auto_discovered": False, "discovery_attempted": True,
|
||||
}
|
||||
d = by_type.get(dt)
|
||||
if d:
|
||||
full = d.get("full_text") or d.get("text_preview") or ""
|
||||
if len(full.split()) >= 100:
|
||||
new_entry["text"] = full
|
||||
new_entry["url"] = d.get("url", "")
|
||||
new_entry["word_count"] = len(full.split())
|
||||
new_entry["auto_discovered"] = True
|
||||
doc_texts[dt] = full
|
||||
filled += 1
|
||||
logger.info(
|
||||
"auto-discovered %s on %s: %s (%d words)",
|
||||
dt, base, d.get("url", "")[:80], new_entry["word_count"],
|
||||
)
|
||||
doc_entries.append(new_entry)
|
||||
|
||||
if filled:
|
||||
logger.info("auto-discovery: filled %d/%d missing types", filled, len(missing))
|
||||
logger.info(
|
||||
"auto-discovery: filled %d/%d missing types from %s",
|
||||
filled, len(missing), base,
|
||||
)
|
||||
|
||||
|
||||
# Title/URL keywords → canonical doc_type. Order matters: most-specific first.
|
||||
|
||||
Reference in New Issue
Block a user