Fix regulation_filter bypass for chunks without regulation_code
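Chunks without a regulation_code silently bypassed the regulation filter in _scan_rag(): the old guard `if config.regulation_filter and reg_code:` skipped the whole check when reg_code was empty, so unrelated documents (e.g. Data Act, legal templates) were included in filtered generation jobs. Chunks without a regulation_code are now skipped whenever regulation_filter is active, and the "RAG scroll complete" log line reports the active regulation_filter when one is set. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>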
This commit is contained in:
@@ -806,7 +806,9 @@ class ControlGeneratorPipeline:
|
||||
or payload.get("source_code", ""))
|
||||
|
||||
# Filter by regulation_code if configured
|
||||
if config.regulation_filter and reg_code:
|
||||
if config.regulation_filter:
|
||||
if not reg_code:
|
||||
continue # Skip chunks without regulation code
|
||||
code_lower = reg_code.lower()
|
||||
if not any(code_lower.startswith(f.lower()) for f in config.regulation_filter):
|
||||
continue
|
||||
@@ -852,10 +854,16 @@ class ControlGeneratorPipeline:
|
||||
collection, collection_total, collection_new,
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"RAG scroll complete: %d total unique seen, %d new unprocessed to process",
|
||||
len(seen_hashes), len(all_results),
|
||||
)
|
||||
if config.regulation_filter:
|
||||
logger.info(
|
||||
"RAG scroll complete: %d total unique seen, %d passed regulation_filter %s",
|
||||
len(seen_hashes), len(all_results), config.regulation_filter,
|
||||
)
|
||||
else:
|
||||
logger.info(
|
||||
"RAG scroll complete: %d total unique seen, %d new unprocessed to process",
|
||||
len(seen_hashes), len(all_results),
|
||||
)
|
||||
return all_results
|
||||
|
||||
def _get_processed_hashes(self, hashes: list[str]) -> set[str]:
|
||||
|
||||
Reference in New Issue
Block a user