Fix regulation_filter bypass for chunks without regulation_code

Chunks without a regulation_code were silently passing through the filter
in _scan_rag() because the filter was only applied when a chunk had a code.
This caused unrelated documents (e.g. Data Act, legal templates) to be
included in filtered generation jobs. Chunks without a regulation_code are
now skipped whenever regulation_filter is active.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-17 13:38:25 +01:00
parent d22c47c9eb
commit 36ef34169a
2 changed files with 61 additions and 5 deletions

View File

@@ -806,7 +806,9 @@ class ControlGeneratorPipeline:
or payload.get("source_code", ""))
# Filter by regulation_code if configured
if config.regulation_filter and reg_code:
if config.regulation_filter:
if not reg_code:
continue # Skip chunks without regulation code
code_lower = reg_code.lower()
if not any(code_lower.startswith(f.lower()) for f in config.regulation_filter):
continue
@@ -852,10 +854,16 @@ class ControlGeneratorPipeline:
collection, collection_total, collection_new,
)
logger.info(
"RAG scroll complete: %d total unique seen, %d new unprocessed to process",
len(seen_hashes), len(all_results),
)
if config.regulation_filter:
logger.info(
"RAG scroll complete: %d total unique seen, %d passed regulation_filter %s",
len(seen_hashes), len(all_results), config.regulation_filter,
)
else:
logger.info(
"RAG scroll complete: %d total unique seen, %d new unprocessed to process",
len(seen_hashes), len(all_results),
)
return all_results
def _get_processed_hashes(self, hashes: list[str]) -> set[str]: