diff --git a/consent-tester/services/dsi_discovery.py b/consent-tester/services/dsi_discovery.py
index 304c2425..8b971e72 100644
--- a/consent-tester/services/dsi_discovery.py
+++ b/consent-tester/services/dsi_discovery.py
@@ -308,21 +308,31 @@ async def discover_dsi_documents(
                         self_wc = len(self_text.split())
                         logger.info("Self-extraction via iframe for %s: %d words", url, self_wc)
 
-                # If the rendered DOM is still short, the page is likely a
-                # JS-injected CMP widget (BMW ePaaS, OneTrust Cookie List).
-                # Use the JSON we captured from network responses instead —
-                # that's the structured source the widget would have rendered.
-                # We also prefer CMP data over thin DOM extraction (< 300 words)
-                # because thin DOM = mostly site navigation, not policy.
-                if self_wc < 300 and cmp_capture.payloads:
+                # If a CMP JSON was captured (BMW ePaaS, OneTrust, etc.) it is
+                # the authoritative source for the cookie policy — far more
+                # reliable than the rendered DOM, which usually only contains
+                # site chrome (navigation/footer) when the policy widget hasn't
+                # finished rendering yet.
+                #
+                # Prefer the CMP-reconstructed text when it is non-empty and ANY of:
+                #   - DOM extraction was very short (< 300 words), even if CMP is shorter
+                #   - CMP word count is more than 1.5x the DOM word count
+                #   - CMP text is at least 1000 words (always authoritative at scale)
+                if cmp_capture.payloads:
                     cmp_text = cmp_capture.reconstruct_cookie_policy()
                     cmp_wc = len(cmp_text.split()) if cmp_text else 0
-                    if cmp_wc > self_wc:
+                    if cmp_wc > 0 and (
+                        self_wc < 300
+                        or cmp_wc >= 1000
+                        or cmp_wc > self_wc * 1.5
+                    ):
+                        logger.info(
+                            "Self-extraction via CMP capture for %s: %d words "
+                            "(replacing %d-word DOM extraction, %d CMP payloads)",
+                            url, cmp_wc, self_wc, len(cmp_capture.payloads),
+                        )
                         self_text = cmp_text
                         self_wc = cmp_wc
-                        logger.info("Self-extraction via CMP capture for %s: %d words "
-                                    "(%d CMP payloads)", url, self_wc,
-                                    len(cmp_capture.payloads))
 
                 if self_wc >= 100:
                     page_title = await page.title() or url