Fix regulation_filter bypass for chunks without regulation_code

Chunks without a regulation_code silently passed through the regulation filter in _scan_rag(), so unrelated documents (e.g. Data Act, legal templates) were included in filtered generation jobs. Chunks without reg_code are now skipped whenever regulation_filter is active.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-17 13:38:25 +01:00
parent d22c47c9eb
commit 36ef34169a
2 changed files with 61 additions and 5 deletions
@@ -806,7 +806,9 @@ class ControlGeneratorPipeline:
                                or payload.get("source_code", ""))

                    # Filter by regulation_code if configured
-                    if config.regulation_filter and reg_code:
+                    if config.regulation_filter:
+                        if not reg_code:
+                            continue  # Skip chunks without regulation code
                        code_lower = reg_code.lower()
                        if not any(code_lower.startswith(f.lower()) for f in config.regulation_filter):
                            continue
@@ -852,10 +854,16 @@ class ControlGeneratorPipeline:
                collection, collection_total, collection_new,
            )

-        logger.info(
-            "RAG scroll complete: %d total unique seen, %d new unprocessed to process",
-            len(seen_hashes), len(all_results),
-        )
+        if config.regulation_filter:
+            logger.info(
+                "RAG scroll complete: %d total unique seen, %d passed regulation_filter %s",
+                len(seen_hashes), len(all_results), config.regulation_filter,
+            )
+        else:
+            logger.info(
+                "RAG scroll complete: %d total unique seen, %d new unprocessed to process",
+                len(seen_hashes), len(all_results),
+            )
        return all_results

    def _get_processed_hashes(self, hashes: list[str]) -> set[str]: