# breakpilot-compliance/consent-tester/services/dsi_discovery.py

"""
DSI Discovery — Generic privacy document finder and parser.

Finds all privacy/data protection documents on any website regardless of:
- Technology (static HTML, SPA, WordPress, Typo3, etc.)
- Structure (accordion, sidebar, footer, inline links, separate pages)
- Format (HTML sections, PDF downloads, cross-domain links)
- Language (all 26 EU/EEA official languages)

Flow:
1. Load page with Playwright (full JS rendering)
2. Find all links matching DSI keywords (26 languages)
3. Expand accordions, click tabs, open dropdowns
4. Follow cross-domain links (e.g. instagram.com → help.instagram.com)
5. Extract document text from each link target
6. Return structured list of discovered documents
"""

import logging
import re
from dataclasses import dataclass, field
from urllib.parse import urlparse, urljoin

from playwright.async_api import Page

from services.dsi_helpers import goto_resilient, try_dismiss_consent_banner, is_pdf_redirect
from services.cmp_extractor import CMPCapture

logger = logging.getLogger(__name__)

# Keyword table for legal documents, keyed by ISO 639-1 language code
# (26 languages). Each list mixes terms for privacy notices, general terms
# and conditions / terms of use, withdrawal or cancellation rights, cookie
# policies and (for German) the legal notice / imprint.
#
# All entries are lower-case because the matchers in this module lower-case
# the candidate text and then do a plain substring test. Dict order matters
# for language detection: _matches_dsi_keyword returns the language of the
# first list that contains a hit.
DSI_KEYWORDS: dict[str, list[str]] = {
    "de": [
        # Privacy / data protection
        "datenschutz", "datenschutzerklaerung", "datenschutzinformation",
        "datenschutzhinweis", "datenschutzrichtlinie", "dsgvo", "privatsphäre",
        "datenschutzbestimmung", "verarbeitung personenbezogener daten",
        # General terms and conditions / terms of use
        "allgemeine geschäftsbedingungen", "agb", "nutzungsbedingungen",
        "nutzungsordnung", "geschäftsbedingungen",
        # Withdrawal / cancellation
        "widerrufsbelehrung", "widerrufsrecht", "widerrufsformular",
        "widerruf", "rücktrittsrecht",
        # Cookie policy
        "cookie-richtlinie", "cookie-policy", "cookie-hinweis",
        # Legal notice (German "Impressum")
        "impressum", "anbieterkennzeichnung",
        # English wording for the imprint, kept in the German list
        "imprint", "legal notice", "site notice",
    ],
    "en": [
        "privacy policy", "privacy notice", "data protection", "data policy",
        "privacy statement", "gdpr", "personal data", "cookie policy",
        "terms of service", "terms and conditions", "terms of use",
        "cancellation policy", "right of withdrawal", "refund policy",
        "cookie notice",
    ],
    "fr": [
        "politique de confidentialité", "protection des données",
        "données personnelles", "vie privée", "rgpd",
        "conditions générales", "conditions d'utilisation",
        "droit de rétractation", "politique de cookies",
    ],
    "es": [
        "política de privacidad", "protección de datos",
        "datos personales", "aviso de privacidad",
        "términos y condiciones", "condiciones de uso",
        "derecho de desistimiento", "política de cookies",
    ],
    "it": [
        "informativa sulla privacy", "protezione dei dati",
        "dati personali", "privacy policy",
        "termini e condizioni", "condizioni d'uso",
        "diritto di recesso", "politica dei cookie",
    ],
    "nl": [
        "privacybeleid", "gegevensbescherming", "privacyverklaring",
        "persoonsgegevens", "avg",
        "algemene voorwaarden", "gebruiksvoorwaarden",
        "herroepingsrecht", "cookiebeleid",
    ],
    "pl": [
        "polityka prywatności", "ochrona danych osobowych",
        "dane osobowe", "rodo",
        "regulamin", "warunki korzystania",
        "prawo odstąpienia", "polityka cookies",
    ],
    "pt": [
        "política de privacidade", "proteção de dados",
        "dados pessoais", "lgpd",
        "termos e condições", "condições de utilização",
        "direito de resolução", "política de cookies",
    ],
    "sv": [
        "integritetspolicy", "dataskydd", "personuppgifter",
        "sekretesspolicy",
        "allmänna villkor", "användarvillkor",
        "ångerrätt", "cookiepolicy",
    ],
    "da": [
        "privatlivspolitik", "databeskyttelse", "personoplysninger",
        "persondatapolitik",
        "handelsbetingelser", "brugsbetingelser",
        "fortrydelsesret", "cookiepolitik",
    ],
    "fi": [
        "tietosuojaseloste", "tietosuoja", "henkilötiedot",
        "rekisteriseloste",
        "yleiset ehdot", "käyttöehdot",
        "peruutusoikeus", "evästekäytäntö",
    ],
    "cs": ["zásady ochrany osobních údajů", "ochrana osobních údajů",
           "zpracování osobních údajů", "obchodní podmínky", "zásady cookies"],
    "el": ["πολιτική απορρήτου", "προστασία δεδομένων",
           "προσωπικά δεδομένα", "όροι χρήσης", "πολιτική cookies"],
    "hu": ["adatvédelmi szabályzat", "adatvédelem", "személyes adatok",
           "általános szerződési feltételek", "cookie szabályzat"],
    "ro": ["politica de confidențialitate", "protecția datelor",
           "date cu caracter personal", "termeni și condiții", "politica cookies"],
    "bg": ["политика за поверителност", "защита на данните",
           "лични данни", "общи условия", "политика за бисквитки"],
    "hr": ["politika privatnosti", "zaštita podataka", "osobni podaci",
           "opći uvjeti", "politika kolačića"],
    "sk": ["zásady ochrany osobných údajov", "ochrana osobných údajov",
           "obchodné podmienky", "zásady cookies"],
    "sl": ["politika zasebnosti", "varstvo podatkov", "osebni podatki",
           "splošni pogoji", "politika piškotkov"],
    "et": ["privaatsuspoliitika", "andmekaitse", "isikuandmed",
           "kasutustingimused", "küpsiste poliitika"],
    "lt": ["privatumo politika", "duomenų apsauga", "asmens duomenys",
           "naudojimosi sąlygos", "slapukų politika"],
    "lv": ["privātuma politika", "datu aizsardzība", "personas dati",
           "lietošanas noteikumi", "sīkdatņu politika"],
    "mt": ["politika tal-privatezza", "protezzjoni tad-data",
           "termini u kundizzjonijiet"],
    "ga": ["polasaí príobháideachais", "cosaint sonraí",
           "téarmaí agus coinníollacha"],
    "is": ["persónuverndarstefna", "persónuvernd",
           "skilmálar og skilyrði"],
    "no": ["personvernerklæring", "personvern", "personopplysninger",
           "brukervilkår", "angrerett", "informasjonskapsler"],
}

# Every keyword of every language in one flat list (dict order, duplicates
# kept) for matchers that do not need to know the language of a hit.
ALL_DSI_KEYWORDS: list[str] = [
    keyword
    for language_keywords in DSI_KEYWORDS.values()
    for keyword in language_keywords
]

@dataclass
class DiscoveredDSI:
    """One privacy/legal document found during discovery.

    Instances are created by ``discover_dsi_documents`` for PDF links,
    inline page sections, the start page itself and every followed link.
    """
    # Human-readable title: link text, section heading, page title or,
    # for PDF redirects, the last path segment of the URL.
    title: str
    # Location of the document (for inline sections: start URL plus "#<id>").
    url: str
    # Page where the link was found.
    source_url: str
    # Language code of the DSI_KEYWORDS list that matched; "" if none did.
    language: str = ""
    # Kind of document. Values assigned in this module: "html_section",
    # "html_page", "html_full_page", "pdf", "cross_domain".
    # NOTE(review): "accordion" was listed here originally but is not
    # assigned anywhere in the visible code — confirm whether it is still used.
    doc_type: str = ""
    # Extracted full text (a fixed placeholder string for PDFs).
    text: str = ""
    # Parsed sections. Not populated by the visible discovery code —
    # presumably filled by a later processing step; verify against callers.
    sections: list[dict] = field(default_factory=list)
    # Whitespace-separated word count of the extracted text.
    word_count: int = 0
    # HTML tables taken from the DOM: one list per table, each entry one
    # table row with its cells joined by tab characters. Lets the backend
    # parse cookie tables deterministically without an LLM.
    tables: list[list[str]] = field(default_factory=list)

@dataclass
class DSIDiscoveryResult:
    """Aggregated outcome of one DSI discovery scan.

    Returned by ``discover_dsi_documents``; problems encountered during the
    scan are collected in ``errors`` rather than raised.
    """
    # URL the scan was started with.
    base_url: str
    # Documents that survived de-duplication.
    documents: list[DiscoveredDSI] = field(default_factory=list)
    # Number of entries in ``documents`` at the end of the scan.
    total_found: int = 0
    # Distinct non-empty language codes of ``documents`` (order unspecified).
    languages_detected: list[str] = field(default_factory=list)
    # Human-readable error messages (truncated exception texts).
    errors: list[str] = field(default_factory=list)
    # Raw CMP payloads captured during navigation (one per matched JSON).
    # Documented schema: [{"kind": str, "url": str, "data": dict}, ...]
    # NOTE(review): discover_dsi_documents in this module only writes the
    # "kind" and "data" keys — confirm whether "url" is still expected.
    # Backend uses these to build vendor records + run per-vendor checks.
    cmp_payloads: list[dict] = field(default_factory=list)
    # Reconstructed cookie-policy text from all captured CMP payloads
    # (CMP-library reconstruct + heuristic generic). Backend uses this as
    # the authoritative cookie-text so MC checks run on the real policy,
    # not the homepage navigation that DOM extraction returns.
    cmp_cookie_text: str = ""

def _matches_dsi_keyword(text: str) -> tuple[bool, str]:
    """Report whether ``text`` contains a DSI keyword and in which language.

    The text is lower-cased and stripped, then every keyword is tried as a
    plain substring. Languages are checked in ``DSI_KEYWORDS`` order and the
    first language with a hit wins.

    Returns:
        ``(True, language_code)`` on a hit, otherwise ``(False, "")``.
    """
    haystack = text.lower().strip()
    matched_language = next(
        (
            language
            for language, keywords in DSI_KEYWORDS.items()
            if any(keyword in haystack for keyword in keywords)
        ),
        None,
    )
    if matched_language is None:
        return False, ""
    return True, matched_language

def _is_allowed_domain(href: str, base_domain: str) -> bool:
    """Allow same domain + known related domains (e.g. help.instagram.com)."""
    try:
        link_domain = urlparse(href).netloc.replace("www.", "")
        base_clean = base_domain.replace("www.", "")
        # Same domain
        if link_domain == base_clean:
            return True
        # Subdomain (help.instagram.com for instagram.com)
        if link_domain.endswith(f".{base_clean}"):
            return True
        # Parent domain (instagram.com links from about.instagram.com)
        if base_clean.endswith(f".{link_domain}"):
            return True
        # Known related patterns
        parts_base = base_clean.split(".")
        parts_link = link_domain.split(".")
        if len(parts_base) >= 2 and len(parts_link) >= 2:
            if parts_base[-2] == parts_link[-2] and parts_base[-1] == parts_link[-1]:
                return True  # Same registrable domain
    except Exception:
        pass
    return False

async def discover_dsi_documents(
    page: Page,
    url: str,
    max_documents: int = 100,
    timeout_seconds: int = 300,
) -> DSIDiscoveryResult:
    """Discover all privacy/data protection documents on a website.

    Works generically regardless of website technology, structure, or language.
    Every matching link is followed, and each visited page is scanned for
    further links, until the queue is empty, ``timeout_seconds`` has elapsed
    or ``max_documents`` documents have been collected.

    Args:
        page: Playwright page used for all navigation. A temporary
            ``response`` listener is registered on it and removed again
            before this coroutine returns.
        url: Start URL — the homepage or the policy page itself.
        max_documents: Link following stops once this many documents exist.
        timeout_seconds: Time budget, checked before each link is followed.

    Returns:
        A ``DSIDiscoveryResult``. Navigation and extraction failures are
        recorded in ``result.errors`` instead of being raised.
    """
    import time
    from collections import deque

    # Monotonic clock: the budget must not be affected by wall-clock jumps.
    deadline = time.monotonic() + timeout_seconds
    pdf_placeholder = "[PDF — Textextraktion erforderlich]"

    result = DSIDiscoveryResult(base_url=url)
    base_domain = urlparse(url).netloc
    seen_urls: set[str] = set()
    seen_titles: set[str] = set()

    # CMP capture must be wired BEFORE navigation so we catch the JSON requests
    # that fire as soon as the consent widget initializes (e.g. BMW ePaaS).
    # NOTE(review): the capture is never detached here; whether CMPCapture
    # offers a detach hook is not visible from this module.
    cmp_capture = CMPCapture()
    cmp_capture.attach(page)

    # Also collect a generic JSON response log for the LLM fallback (Phase C+D)
    # if everything else fails. Keep it small (header info only, not bodies).
    network_log: list[dict] = []

    async def _on_response_log(response):
        try:
            ct = (response.headers.get("content-type") or "").lower()
            if "json" not in ct:
                return
            network_log.append({
                "url": response.url,
                "status": response.status,
                "content_type": ct,
                "size": int(response.headers.get("content-length") or 0),
            })
        except Exception as log_exc:
            # Best effort only: a malformed header must not break the scan.
            logger.debug("Skipping response in network log: %s", log_exc)

    def _is_pdf_href(href: str) -> bool:
        """True when the link points at a PDF, with or without query/fragment."""
        if href.lower().endswith(".pdf"):
            return True
        try:
            return urlparse(href).path.lower().endswith(".pdf")
        except ValueError:
            return False

    def _finalize() -> DSIDiscoveryResult:
        """Fill the summary fields of ``result`` from the collected state."""
        result.total_found = len(result.documents)
        result.languages_detected = list({
            d.language for d in result.documents if d.language
        })
        result.cmp_payloads = [
            {"kind": kind, "data": data} for kind, data in cmp_capture.payloads
        ]
        if cmp_capture.payloads:
            try:
                result.cmp_cookie_text = cmp_capture.reconstruct_cookie_policy()
            except Exception as e:
                logger.warning("CMP reconstruct on discovery failed: %s", e)
        return result

    page.on("response", _on_response_log)

    try:
        # Step 1: Load the page (with networkidle → domcontentloaded fallback)
        await goto_resilient(page, url, timeout=60000)
        await page.wait_for_timeout(2000)

        # Step 1a: Detect PDF redirects (e.g. dm.de redirects to GCS PDF)
        final_url = page.url
        if is_pdf_redirect(url, final_url):
            is_dsi_url, dsi_lang = _matches_dsi_keyword(urlparse(url).path.lower())
            if is_dsi_url:
                result.documents.append(DiscoveredDSI(
                    title=urlparse(url).path.split("/")[-1] or "Datenschutzerklaerung",
                    url=final_url,
                    source_url=url,
                    language=dsi_lang or "de",
                    doc_type="pdf",
                    text=pdf_placeholder,
                ))
                seen_urls.add(url)
                seen_urls.add(final_url)
                logger.info("PDF redirect detected: %s -> %s", url, final_url)
            # Return early — a PDF redirect means no HTML content to scan.
            # The same summary fields as on the regular path are filled in.
            return _finalize()

        # Step 1b: Try dismissing cookie consent banners before extraction.
        # Many German sites (dm.de, Zalando, etc.) block page content behind
        # a consent wall. Dismissing it reveals the actual DSI text.
        banner_dismissed = await try_dismiss_consent_banner(page)
        if banner_dismissed:
            # After consent, page may reload or reveal hidden content
            await page.wait_for_timeout(2000)
            # Re-navigate if the page redirected after consent
            try:
                if page.url != url:
                    await goto_resilient(page, url, timeout=30000)
                    await page.wait_for_timeout(2000)
            except Exception:
                pass

        # Step 1c: Self-extraction — if the URL itself is a DSI page,
        # extract its full text as the first document. This handles the
        # case where the user provides the DSE URL directly (e.g.
        # example.com/datenschutz) instead of the homepage.
        current_url_path = urlparse(url).path.lower()
        is_self_dsi, self_lang = _matches_dsi_keyword(current_url_path)
        if not is_self_dsi:
            # Also check the page title
            page_title = await page.title() or ""
            is_self_dsi, self_lang = _matches_dsi_keyword(page_title)
        if is_self_dsi:
            try:
                # Wait for substantive content to appear (SPAs need time to render).
                # Polls body.innerText length up to 10s. Many sites (BMW, Daimler)
                # render via React/Vue after domcontentloaded fires.
                try:
                    await page.wait_for_function(
                        "() => (document.body && document.body.innerText || '').length > 500",
                        timeout=10000,
                    )
                except Exception:
                    pass  # Continue anyway, extractor below has fallbacks

                # Scroll to bottom to trigger lazy-loading of full content
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                await page.wait_for_timeout(1500)
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                await page.wait_for_timeout(1000)

                self_text = await _extract_text_robust(page)
                self_wc = len(self_text.split()) if self_text else 0

                # If still too short, try same-origin iframes (some sites
                # embed cookie policies via OneTrust/Sourcepoint iframes).
                if self_wc < 100:
                    iframe_text = await _extract_text_from_iframes(page)
                    if iframe_text and len(iframe_text.split()) > self_wc:
                        self_text = iframe_text
                        self_wc = len(self_text.split())
                        logger.info("Self-extraction via iframe for %s: %d words", url, self_wc)

                # If a CMP JSON was captured (BMW ePaaS, OneTrust, etc.) it is
                # the authoritative source for the cookie policy — far more
                # reliable than the rendered DOM, which usually only contains
                # site chrome (navigation/footer) when the policy widget hasn't
                # finished rendering yet.
                if cmp_capture.payloads:
                    cmp_text = cmp_capture.reconstruct_cookie_policy()
                    cmp_wc = len(cmp_text.split()) if cmp_text else 0
                    # Replace DOM with CMP only when CMP is *strictly larger*
                    # AND meets at least one of: DOM was very thin, CMP is
                    # substantial, or CMP is significantly longer than DOM.
                    # The strict-larger guard prevents a tiny heuristic match
                    # (e.g. an unrelated /api/data JSON) from clobbering a
                    # bigger DOM extraction.
                    if cmp_wc > self_wc and (
                        self_wc < 300
                        or cmp_wc >= 1000
                        or cmp_wc > self_wc * 1.5
                    ):
                        logger.info(
                            "Self-extraction via CMP capture for %s: %d words "
                            "(replacing %d-word DOM extraction, %d CMP payloads)",
                            url, cmp_wc, self_wc, len(cmp_capture.payloads),
                        )
                        self_text = cmp_text
                        self_wc = cmp_wc

                # Phase C/D: LLM cascade fallback. Triggers only when both
                # named CMPs (Phase B) and the generic heuristic (Phase A)
                # produced nothing AND the DOM is too thin to be a real policy.
                if self_wc < 300 and not cmp_capture.payloads:
                    llm_text, llm_wc = await _try_llm_cascade(
                        page, url, network_log,
                    )
                    if llm_wc > self_wc:
                        logger.info(
                            "Self-extraction via LLM cascade for %s: %d words "
                            "(replacing %d-word DOM)",
                            url, llm_wc, self_wc,
                        )
                        self_text = llm_text
                        self_wc = llm_wc

                if self_wc >= 100:
                    page_title = await page.title() or url
                    result.documents.append(DiscoveredDSI(
                        title=page_title.strip(),
                        url=url,
                        source_url=url,
                        language=self_lang or "de",
                        doc_type="html_full_page",
                        text=self_text.strip(),
                        word_count=self_wc,
                    ))
                    seen_urls.add(url)
                    logger.info("Self-extracted %d words from %s", self_wc, url)
                else:
                    logger.info("Self-extraction too short (%d words) for %s", self_wc, url)
            except Exception as e:
                logger.warning("Self-extraction failed for %s: %s", url, e)

        # Step 2: Find DSI links in current page
        links = await _find_dsi_links(page, base_domain)
        logger.info("Found %d DSI links on %s", len(links), url)

        # Step 3: Expand accordions, tabs, dropdowns to find hidden content
        await _expand_all_interactive(page)
        await page.wait_for_timeout(1000)

        # Step 3b: Re-scan after expanding (may reveal new links). A set of
        # already queued hrefs keeps the membership test O(1).
        queued_hrefs = {link["href"] for link in links}
        for link in await _find_dsi_links(page, base_domain):
            if link["href"] not in queued_hrefs:
                queued_hrefs.add(link["href"])
                links.append(link)

        # Step 4: Check for inline DSI sections (accordion content already visible)
        inline_sections = await _find_inline_dsi_sections(page)
        for section in inline_sections:
            title_norm = section["title"].strip().lower()
            if title_norm not in seen_titles:
                seen_titles.add(title_norm)
                _, lang = _matches_dsi_keyword(section["title"])
                doc = DiscoveredDSI(
                    title=section["title"],
                    url=f"{url}#{section.get('id', '')}",
                    source_url=url,
                    language=lang,
                    doc_type="html_section",
                    text=section["text"],
                    word_count=len(section["text"].split()),
                )
                result.documents.append(doc)

        # Step 5: Follow each DSI link and extract content.
        # Exhaustive: processes ALL found links. On each visited page,
        # searches for MORE links (recursive discovery). Stops only when
        # all links visited, the timeout is reached or max_documents is hit.
        pending_links = deque(links)

        while pending_links and time.monotonic() < deadline and len(result.documents) < max_documents:
            link_info = pending_links.popleft()
            href = link_info["href"]
            if href in seen_urls:
                continue
            seen_urls.add(href)

            # NOTE(review): links without text all normalise to the same
            # empty title, so only the first of them is followed — confirm
            # that this is intended.
            title = link_info["text"]
            title_norm = title.strip().lower()
            if title_norm in seen_titles:
                continue
            seen_titles.add(title_norm)

            _, lang = _matches_dsi_keyword(title)

            if _is_pdf_href(href):
                result.documents.append(DiscoveredDSI(
                    title=title, url=href, source_url=url,
                    language=lang, doc_type="pdf",
                    text=pdf_placeholder,
                ))
                continue

            try:
                # Skip anchor links on same page — they are sections of the parent doc
                is_anchor = "#" in href and href.split("#")[0] in (url.split("#")[0], page.url.split("#")[0])
                if is_anchor:
                    continue

                # Navigate to page — with networkidle/domcontentloaded fallback
                await goto_resilient(page, href, timeout=45000)
                resp_url = page.url

                # Check for PDF redirect on followed links
                if is_pdf_redirect(href, resp_url):
                    result.documents.append(DiscoveredDSI(
                        title=title, url=resp_url, source_url=url,
                        language=lang, doc_type="pdf",
                        text=pdf_placeholder,
                    ))
                    await goto_resilient(page, url, timeout=45000)
                    continue

                await try_dismiss_consent_banner(page)
                await _expand_all_interactive(page)
                await page.wait_for_timeout(500)

                # Extract text — try specific content areas, fall back to full body
                text = await page.evaluate("""
                    () => {
                        // Try progressively broader content selectors
                        const selectors = [
                            '.article-content', '.page-content', '.entry-content',
                            '[class*="content-area"]', '[class*="main-content"]',
                            'main article', 'main', 'article',
                            '[role="main"]', '.content', '#content',
                        ];
                        for (const sel of selectors) {
                            const el = document.querySelector(sel);
                            if (el) {
                                // P98: innerText statt textContent — innerText
                                // respektiert Whitespace zwischen Block-Elementen.
                                // textContent verkettet HTML-Tabellen-Zellen ohne
                                // Spaces (VW-Cookie-Tabelle: ~100 Cookie-Namen
                                // wurden zu einem Klumpen "smartSignals2UiDsmartSignals2sUiD...").
                                const txt = (el.innerText || el.textContent || '').trim();
                                if (txt.length > 200) return txt;
                            }
                        }
                        // Fallback: full body minus nav/header/footer
                        const body = document.body.cloneNode(true);
                        body.querySelectorAll('nav, header, footer, script, style, [class*="nav"], [class*="sidebar"]').forEach(e => e.remove());
                        // P98: innerText respektiert Whitespace (s.o.)
                        return (body.innerText || body.textContent || '').trim();
                    }
                """)
                # Extract HTML tables separately: one array of rows per
                # table, each row a tab-separated string. This lets the
                # backend parse columns deterministically
                # (cookies_table_parser) without LLM hallucinations.
                tables = await page.evaluate("""
                    () => {
                        const out = [];
                        document.querySelectorAll('table').forEach(t => {
                            const rows = [];
                            t.querySelectorAll('tr').forEach(tr => {
                                const cells = [];
                                tr.querySelectorAll('th, td').forEach(c => {
                                    cells.push((c.innerText || c.textContent || '').trim().replace(/\\s+/g, ' '));
                                });
                                if (cells.length >= 2) rows.push(cells.join('\\t'));
                            });
                            if (rows.length >= 3) out.push(rows);
                        });
                        return out;
                    }
                """)
                if text and len(text) > 50:
                    result.documents.append(DiscoveredDSI(
                        title=title, url=href, source_url=url,
                        language=lang,
                        doc_type="cross_domain" if not _is_allowed_domain(href, base_domain) else "html_page",
                        text=text[:200000], word_count=len(text.split()),
                        tables=(tables or [])[:10],
                    ))

                # Recursive: search THIS page for more DSI links. An href is
                # queued at most once; queued_hrefs holds everything that was
                # ever put on the queue.
                for new_link in await _find_dsi_links(page, base_domain):
                    new_href = new_link["href"]
                    if new_href not in seen_urls and new_href not in queued_hrefs:
                        queued_hrefs.add(new_href)
                        pending_links.append(new_link)

                # Navigate back for next link
                await goto_resilient(page, url, timeout=45000)
                await page.wait_for_timeout(500)
                await _expand_all_interactive(page)

            except Exception as e:
                result.errors.append(f"Failed to load {href}: {str(e)[:80]}")
                try:
                    await goto_resilient(page, url, timeout=45000)
                except Exception:
                    pass

    except Exception as e:
        result.errors.append(f"Discovery failed: {str(e)[:100]}")
        logger.error("DSI discovery failed: %s", e)
    finally:
        # Detach the per-call response logger so that repeated scans on a
        # reused page do not accumulate handlers.
        try:
            page.remove_listener("response", _on_response_log)
        except Exception as detach_exc:
            logger.debug("Could not detach response logger: %s", detach_exc)

    # Deduplicate: remove noise titles + merge docs with identical word_count
    result.documents = _deduplicate_documents(result.documents)

    _finalize()
    logger.info(
        "DSI discovery complete: %d documents found in %s, %d CMP payloads, "
        "cmp_cookie_text=%d words",
        result.total_found, result.languages_detected, len(result.cmp_payloads),
        len(result.cmp_cookie_text.split()) if result.cmp_cookie_text else 0,
    )
    return result

# Link/heading texts that are page navigation rather than documents
# (German/English pairs). Compared against the stripped, lower-cased title.
# NOTE: "datenschutz" was removed — it's a legitimate document title
NOISE_TITLES = {
    "drucken", "print",
    "nach oben", "back to top",
    "teilen", "share",
    "kontakt", "contact",
    "suche", "search",
    "menü", "menu",
    "home",
}

def _deduplicate_documents(docs: list[DiscoveredDSI]) -> list[DiscoveredDSI]:
    """Drop navigation noise and collapse documents that share a word count.

    Pass 1 removes entries whose title is a known navigation label or looks
    like a URL, and non-PDF entries with fewer than 50 words. Pass 2 treats
    documents with more than 200 words and an identical ``word_count`` as the
    same page text under different titles and keeps one of them, preferring
    a title that starts with "datenschutzinformation".
    """
    preferred_prefix = "datenschutzinformation"

    def _is_noise(doc: DiscoveredDSI) -> bool:
        label = doc.title.strip().lower()
        if label in NOISE_TITLES:
            return True
        if label.startswith(("http", "www.")):
            return True
        # Very short non-PDF entries are most likely navigation snippets.
        return doc.word_count < 50 and doc.doc_type != "pdf"

    def _has_preferred_title(doc: DiscoveredDSI) -> bool:
        return doc.title.lower().startswith(preferred_prefix)

    kept: list[DiscoveredDSI] = []
    by_word_count: dict[int, DiscoveredDSI] = {}
    for doc in (candidate for candidate in docs if not _is_noise(candidate)):
        if doc.word_count <= 200:
            # Small documents are never merged.
            kept.append(doc)
            continue

        earlier = by_word_count.get(doc.word_count)
        if earlier is None:
            by_word_count[doc.word_count] = doc
            kept.append(doc)
            continue

        # Same length as an earlier document: swap only if the newcomer has
        # the preferred title and the earlier one does not.
        if _has_preferred_title(doc) and not _has_preferred_title(earlier):
            kept = [item for item in kept if item is not earlier]
            kept.append(doc)
            by_word_count[doc.word_count] = doc

    return kept

async def _find_dsi_links(page: Page, base_domain: str) -> list[dict]:
    """Find all links whose text or href matches DSI keywords.

    Link text, ``aria-label`` and ``title`` attribute as well as the href
    are searched (case-insensitively, as substrings) for any keyword in
    ``ALL_DSI_KEYWORDS``. A matching link is returned when it is on an
    allowed domain or points at a PDF.

    Args:
        page: Playwright page whose current DOM is scanned.
        base_domain: Network location of the site under test.

    Returns:
        A list of ``{"href", "text", "visible"}`` dicts in DOM order; an
        empty list when the scan fails.
    """
    try:
        all_links = await page.evaluate("""
            () => [...document.querySelectorAll('a[href]')].map(a => ({
                href: a.href,
                text: (a.textContent || '').trim().substring(0, 200),
                ariaLabel: a.getAttribute('aria-label') || '',
                title: a.getAttribute('title') || '',
                visible: a.getBoundingClientRect().width > 0,
            }))
        """)
        dsi_links = []
        for link in (all_links or []):
            href = link.get("href")
            # SVG <a> elements expose href as an object, not a string. Skip
            # such entries instead of letting one of them abort the scan.
            if not isinstance(href, str) or not href:
                continue
            href_lower = href.lower()
            text = link.get("text") or ""
            search_text = (
                f"{text} {link.get('ariaLabel') or ''} {link.get('title') or ''}".lower()
            )

            # Match by link text or href
            is_match = any(kw in search_text or kw in href_lower for kw in ALL_DSI_KEYWORDS)
            if not is_match:
                continue

            # PDF check is case-insensitive and ignores query/fragment.
            try:
                is_pdf = (
                    href_lower.endswith(".pdf")
                    or urlparse(href).path.lower().endswith(".pdf")
                )
            except ValueError:
                is_pdf = False

            # Allow same domain + related domains + PDFs
            if _is_allowed_domain(href, base_domain) or is_pdf:
                dsi_links.append({
                    "href": href,
                    "text": text,
                    "visible": link.get("visible", False),
                })

        return dsi_links
    except Exception as e:
        logger.warning("DSI link scan failed: %s", e)
        return []

async def _expand_all_interactive(page: Page) -> None:
    """Expand all accordions, tabs, details, dropdowns on the page.

    IMPORTANT: Only expand CLOSED elements. Never click elements that
    are already expanded (aria-expanded="true") — that would close them.
    BMW, for example, has accordions open by default.

    "Show more"-style anchors are only clicked when they have no real
    target (no href, a fragment, or a ``javascript:`` href); clicking a
    normal link would navigate away from the page being scanned.

    Best effort: any failure is logged at debug level and otherwise ignored.

    Args:
        page: Loaded Playwright page whose DOM is modified in place.
    """
    try:
        await page.evaluate("""() => {
            // 1. Open all <details> that are closed
            document.querySelectorAll('details:not([open])').forEach(d => d.open = true);

            // 2. Click buttons that are explicitly CLOSED (aria-expanded="false")
            document.querySelectorAll('button[aria-expanded="false"]').forEach(b => {
                try { b.click(); } catch {}
            });

            // 3. Bootstrap/jQuery collapse triggers (only closed ones)
            document.querySelectorAll('[data-toggle="collapse"].collapsed').forEach(e => {
                try { e.click(); } catch {}
            });
            document.querySelectorAll('[data-bs-toggle="collapse"].collapsed').forEach(e => {
                try { e.click(); } catch {}
            });

            // 4. "Show more" / "Mehr anzeigen" buttons
            document.querySelectorAll('button,a').forEach(b => {
                const t = (b.textContent || '').trim();
                if (!/^(mehr|more|weiterlesen|read more|show more|anzeigen|alle anzeigen)/i.test(t))
                    return;
                // Skip anchors that would navigate away from this page.
                if (b.tagName.toLowerCase() === 'a') {
                    const href = (b.getAttribute('href') || '').trim();
                    if (href && !href.startsWith('#') && !/^javascript:/i.test(href))
                        return;
                }
                try { b.click(); } catch {}
            });

            // 5. Tabs — do not click them; just reveal hidden tab panels
            document.querySelectorAll('[role="tabpanel"][hidden]').forEach(p => {
                p.removeAttribute('hidden');
                p.style.display = '';
            });
        }""")
    except Exception as e:
        # Expansion is optional; keep going with whatever is already visible.
        logger.debug("Expanding interactive elements failed: %s", e)

async def _find_inline_dsi_sections(page: Page) -> list[dict]:
    """Find DSI content already visible on the page (e.g. expanded accordions).

    Only counts top-level documents (H1/H2 with DSI keywords).
    Sub-sections (H3/H4 like 'Cookies', 'Betroffenenrechte') are NOT counted
    as separate documents — their text is part of the parent document.

    For each matching heading, the text of its following sibling elements is
    collected until the next H1/H2 (at most 200 siblings). Sections with
    100 characters of content or fewer are dropped.

    Args:
        page: Loaded Playwright page to inspect.

    Returns:
        A list of ``{"title", "text", "id"}`` dicts (title capped at 200
        characters, text at 50000); an empty list if nothing matches or the
        page evaluation fails.
    """
    try:
        sections = await page.evaluate("""
            () => {
                const results = [];
                // Only H1 and H2 count as document-level headings
                const headings = document.querySelectorAll('h1, h2');
                const dsiKeywords = [
                    'datenschutz', 'privacy', 'données', 'privacidad', 'protezione',
                    'gegevensbescherming', 'ochrona danych', 'tietosuoja', 'integritet',
                    'databeskyttelse', 'ochrana', 'adatvédel', 'confidential',
                ];
                for (const h of headings) {
                    const text = (h.textContent || '').trim();
                    const textLower = text.toLowerCase();
                    if (!dsiKeywords.some(kw => textLower.includes(kw))) continue;

                    // Get ALL content until the next H1/H2 (include sub-sections H3-H5)
                    let content = '';
                    let el = h.nextElementSibling;
                    let count = 0;
                    while (el && count < 200) {
                        // Stop at next H1 or H2 (next top-level document)
                        if (el.tagName === 'H1' || el.tagName === 'H2') break;
                        content += (el.textContent || '').trim() + '\\n';
                        el = el.nextElementSibling;
                        count++;
                    }

                    if (content.length > 100) {
                        results.push({
                            title: text.substring(0, 200),
                            text: content.substring(0, 50000),
                            id: h.id || '',
                        });
                    }
                }
                return results;
            }
        """)
        return sections or []
    except Exception as e:
        # Report the failure like the other scanners instead of hiding it.
        logger.warning("Inline DSI section scan failed: %s", e)
        return []


async def _extract_text_robust(page: Page) -> str:
    """Extract the main text of a page using three in-browser strategies.

    The script run in the page tries, in order:

    1. A list of typical content-container selectors; the first one holding
       more than 200 characters of text wins.
    2. A clone of ``<body>`` with navigation, header, footer, script/style
       and cookie/banner elements removed, if more than 200 characters remain.
    3. The joined text of all paragraph, list, cell and heading tags longer
       than 20 characters.

    Whitespace runs are collapsed to single spaces in every strategy.

    Returns:
        The extracted text, or ``""`` if nothing was found or the evaluation
        raised.
    """
    extractor_js = """
            () => {
                // 1) Specific content containers
                const selectors = [
                    '.article-content', '.page-content', '.entry-content',
                    '[class*="content-area"]', '[class*="main-content"]',
                    '[class*="legal-text"]', '[class*="policy-content"]',
                    'main article', 'main', 'article',
                    '[role="main"]', '.content', '#content', '.bodytext',
                ];
                for (const sel of selectors) {
                    const el = document.querySelector(sel);
                    if (el && el.textContent.trim().length > 200) {
                        return el.textContent.trim().replace(/\\s+/g, ' ');
                    }
                }
                // 2) Body minus nav/header/footer/scripts
                const body = document.body.cloneNode(true);
                body.querySelectorAll(
                    'nav, header, footer, script, style, noscript,' +
                    ' [class*="nav"], [class*="sidebar"], [class*="cookie"],' +
                    ' [class*="banner"], [id*="cookie"], [id*="banner"]'
                ).forEach(e => e.remove());
                const bodyText = (body.textContent || '').trim().replace(/\\s+/g, ' ');
                if (bodyText.length > 200) return bodyText;
                // 3) Final fallback: collect all text-bearing tags
                const blocks = document.querySelectorAll('p, li, dd, td, h1, h2, h3, h4');
                const parts = [];
                for (const b of blocks) {
                    const t = (b.textContent || '').trim();
                    if (t.length > 20) parts.push(t);
                }
                return parts.join(' ').replace(/\\s+/g, ' ');
            }
        """
    try:
        extracted = await page.evaluate(extractor_js)
    except Exception as exc:
        logger.warning("Robust text extraction failed: %s", exc)
        return ""
    # A falsy result (None, empty string) is normalised to "".
    return extracted if extracted else ""


async def _extract_text_from_iframes(page: Page) -> str:
    """Gather body text from child frames that belong to the page or a CMP.

    A child frame is read when its host equals the page's host, when it has
    no host at all, or when its host contains one of the known CMP vendor
    markers. Frames yielding 50 words or fewer are ignored, as are frames
    whose evaluation raises.

    Returns:
        The accepted frame texts joined by blank lines, or ``""`` when none
        qualify or the frame walk itself fails.
    """
    try:
        from urllib.parse import urlparse

        known_cmp_markers = ("onetrust", "cookiebot", "consensu", "sourcepoint",
                             "usercentrics", "didomi", "klaro")
        top_host = urlparse(page.url).netloc
        collected: list[str] = []
        for child in page.frames:
            if child == page.main_frame:
                continue
            try:
                host = urlparse(child.url).netloc
                is_foreign = bool(host) and host != top_host
                # Foreign frames are only read when they look like a CMP embed.
                if is_foreign and not any(marker in host for marker in known_cmp_markers):
                    continue
                body_text = await child.evaluate(
                    "() => (document.body && document.body.innerText || '').trim()"
                )
                if body_text and len(body_text.split()) > 50:
                    collected.append(body_text)
            except Exception:
                # One broken frame must not stop the remaining ones.
                continue
        return "\n\n".join(collected)
    except Exception as exc:
        logger.debug("Iframe extraction failed: %s", exc)
        return ""


async def _try_llm_cascade(
    page: Page, target_url: str, network_log: list[dict],
) -> tuple[str, int]:
    """Ask the LLM cascade for a hint on where the cookie policy lives.

    A hint cached for the URL's netloc is tried first. If it yields fewer
    than 300 words, the cascade is queried with the first 5000 characters of
    the page's body text and the network log. A fresh hint that yields at
    least 300 words is cached and reported to the CMP discovery log.

    Args:
        page: Loaded Playwright page the hint is applied to.
        target_url: URL under test; its lower-cased netloc is the cache key.
        network_log: Captured network entries passed to the cascade.

    Returns:
        ``(text, word_count)``. ``("", 0)`` when the URL has no netloc or the
        cascade returns no hint; text below the threshold is still returned.
    """
    from urllib.parse import urlparse
    from services.cmp_llm_fallback import (
        LLMCascade, cache_get, cache_set,
    )

    min_words = 300

    def _word_count(candidate) -> int:
        return len(candidate.split()) if candidate else 0

    domain = urlparse(target_url).netloc.lower()
    if not domain:
        return "", 0

    # A previously stored hint is tried first.
    remembered = await cache_get(domain)
    if remembered:
        cached_text = await _apply_llm_hint(page, remembered)
        cached_wc = _word_count(cached_text)
        if cached_wc >= min_words:
            logger.info("LLM cache hit for %s: %d words", domain, cached_wc)
            return cached_text, cached_wc
        # Otherwise the stored hint no longer works; ask the LLM again.

    # Visible page text gives the LLM context for its suggestion.
    try:
        snapshot = await page.evaluate(
            "() => (document.body && document.body.innerText || '').slice(0, 5000)"
        )
    except Exception:
        snapshot = ""
    else:
        snapshot = snapshot or ""

    suggestion = await LLMCascade.from_env().analyze(target_url, snapshot, network_log)
    if not suggestion:
        return "", 0

    text = await _apply_llm_hint(page, suggestion)
    wc = _word_count(text)
    if wc < min_words:
        return text, wc

    await cache_set(domain, suggestion)
    logger.info("LLM cached for %s (%s): %d words", domain, suggestion.get("_tier"), wc)
    # Record the discovery; a logging failure must not lose the result.
    try:
        from services.cmp_discovery_log import record_discovery
        record_discovery(
            domain=domain,
            llm_used=suggestion.get("_tier", "unknown"),
            strategy=suggestion.get("strategy", ""),
            value=suggestion.get("value", ""),
            extracted_text=text,
        )
    except Exception as exc:
        logger.debug("CMP discovery log failed: %s", exc)
    return text, wc


async def _apply_llm_hint(page: Page, hint: dict) -> str:
    """Execute the LLM's suggested strategy and return extracted text.

    Supported ``hint["strategy"]`` values:

    - ``"text"``: ``hint["value"]`` already is the text.
    - ``"selector"``: ``hint["value"]`` is a CSS selector; the text of the
      first matching element on the page is returned.
    - ``"url"``: ``hint["value"]`` is fetched through the page's browser
      context. JSON responses are flattened with ``reconstruct_generic``;
      HTML responses are reduced to their visible text; any other content
      type is returned unchanged.

    Args:
        page: Loaded Playwright page used for selector lookups and requests.
        hint: Mapping with ``"strategy"`` and ``"value"`` keys.

    Returns:
        The extracted text, or ``""`` for an unknown strategy, an empty
        value, a non-200 response or any error during extraction.
    """
    strategy = hint.get("strategy")
    value = hint.get("value", "")

    if strategy == "text":
        return value or ""

    if strategy == "selector" and value:
        try:
            return await page.evaluate(
                "(sel) => { const e = document.querySelector(sel); "
                "return e ? (e.innerText || e.textContent || '').trim() : ''; }",
                value,
            ) or ""
        except Exception as e:
            logger.debug("LLM selector failed (%s): %s", value, e)
            return ""

    if strategy == "url" and value:
        try:
            resp = await page.context.request.get(value, timeout=30000)
            if resp.status != 200:
                return ""
            ct = (resp.headers.get("content-type") or "").lower()
            if "json" in ct:
                from services.cmp_heuristic import (
                    looks_like_cookie_policy, reconstruct_generic,
                )
                data = await resp.json()
                # The generic walker runs even when the heuristic rejects the
                # payload; the rejection is only recorded for diagnosis.
                if not looks_like_cookie_policy(data):
                    logger.debug(
                        "LLM url JSON does not look like a cookie policy (%s)",
                        value[:80],
                    )
                return reconstruct_generic(data)
            text = await resp.text()
            # Strip HTML if HTML response
            if "html" in ct:
                import html

                # Drop script/style/noscript blocks and comments first:
                # their content is not visible text and would otherwise
                # inflate the word count the caller relies on.
                text = re.sub(
                    r"(?is)<(script|style|noscript)\b[^>]*>.*?</\1\s*>", " ", text,
                )
                text = re.sub(r"(?s)<!--.*?-->", " ", text)
                text = re.sub(r"<[^>]+>", " ", text)
                # Decode entities only after tag removal so that an escaped
                # "&lt;b&gt;" is kept as literal text.
                text = html.unescape(text)
                text = re.sub(r"\s+", " ", text).strip()
            return text
        except Exception as e:
            logger.debug("LLM url fetch failed (%s): %s", value[:80], e)
            return ""

    return ""