fix(quality): Ruff/CVE/TS-Fixes, 104 neue Tests, Complexity-Refactoring
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-ai-compliance (push) Failing after 30s
CI / test-python-backend-compliance (push) Successful in 30s
CI / test-python-document-crawler (push) Successful in 21s
CI / test-python-dsms-gateway (push) Successful in 17s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-ai-compliance (push) Failing after 30s
CI / test-python-backend-compliance (push) Successful in 30s
CI / test-python-document-crawler (push) Successful in 21s
CI / test-python-dsms-gateway (push) Successful in 17s
- Ruff: 144 auto-fixes (unused imports, == None → is None), F821/F811/F841 manuell
- CVEs: python-multipart>=0.0.22, weasyprint>=68.0, pillow>=12.1.1, npm audit fix (0 vulns)
- TS: 5 tote Drafting-Engine-Dateien entfernt, allowed-facts/sanitizer/StepHeader/context fixes
- Tests: +104 (ISMS 58, Evidence 18, VVT 14, Generation 14) → 1449 passed
- Refactoring: collect_ci_evidence (F→A), row_to_response (E→A), extract_requirements (E→A)
- Dead Code: pca-platform, 7 Go-Handler, dsr_api.py, duplicate Schemas entfernt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -20,13 +20,13 @@ import asyncio
|
||||
from typing import Optional, List, Dict
|
||||
from datetime import datetime
|
||||
|
||||
from fastapi import APIRouter, Depends, Query
|
||||
from fastapi import APIRouter, Depends
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from classroom_engine.database import get_db
|
||||
from ..db import RegulationRepository, RequirementRepository
|
||||
from ..db.models import RegulationDB, RequirementDB, RegulationTypeEnum
|
||||
from ..db.models import RegulationDB, RegulationTypeEnum
|
||||
from ..services.rag_client import get_rag_client, RAGSearchResult
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -185,6 +185,169 @@ def _build_existing_articles(
|
||||
return {r.article for r in existing}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Extraction helpers — independently testable
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _parse_rag_results(
    all_results: List[RAGSearchResult],
    regulation_codes: Optional[List[str]] = None,
) -> dict:
    """
    Filter, deduplicate, and group RAG search results by regulation code.

    Results whose article cannot be normalized are collected separately
    instead of being grouped. Within each regulation group only the
    highest-scoring result per article is kept.

    Returns a dict with:
    - deduped_by_reg: Dict[str, List[tuple[str, RAGSearchResult]]]
    - skipped_no_article: List[RAGSearchResult]
    - unique_count: int
    """
    # Optional whitelist filter on regulation codes.
    if regulation_codes:
        wanted = set(regulation_codes)
        all_results = [r for r in all_results if r.regulation_code in wanted]

    # Count distinct (regulation_code, article) pairs, best score first.
    counted: set[tuple[str, str]] = set()
    for result in sorted(all_results, key=lambda x: x.score, reverse=True):
        normalized = _normalize_article(result)
        if normalized:
            counted.add((result.regulation_code, normalized))
    unique_count = len(counted)

    # Bucket results by regulation code; collect the unusable ones.
    grouped: Dict[str, List[tuple[str, RAGSearchResult]]] = {}
    skipped_no_article: List[RAGSearchResult] = []
    for result in all_results:
        normalized = _normalize_article(result)
        if not normalized:
            skipped_no_article.append(result)
            continue
        bucket = result.regulation_code or "UNKNOWN"
        grouped.setdefault(bucket, []).append((normalized, result))

    # Within each bucket keep only the best-scoring entry per article.
    deduped_by_reg: Dict[str, List[tuple[str, RAGSearchResult]]] = {}
    for reg_code, entries in grouped.items():
        seen_articles: set[str] = set()
        kept: List[tuple[str, RAGSearchResult]] = []
        for art, result in sorted(entries, key=lambda e: e[1].score, reverse=True):
            if art in seen_articles:
                continue
            seen_articles.add(art)
            kept.append((art, result))
        deduped_by_reg[reg_code] = kept

    return {
        "deduped_by_reg": deduped_by_reg,
        "skipped_no_article": skipped_no_article,
        "unique_count": unique_count,
    }
|
||||
|
||||
|
||||
def _store_requirements(
    db: Session,
    deduped_by_reg: Dict[str, List[tuple[str, "RAGSearchResult"]]],
    dry_run: bool,
) -> dict:
    """
    Persist extracted requirements to the database (or simulate in dry_run mode).

    For each regulation group the parent regulation is looked up or created
    (skipped entirely in dry_run), already-known articles are reported as
    duplicates, and the remaining entries are created one by one — a failure
    on a single entry is logged and counted but does not abort the batch.

    Returns a dict with:
    - created_count: int
    - skipped_dup_count: int
    - failed_count: int
    - result_items: List[ExtractedRequirement]
    """
    repo = RequirementRepository(db)
    created = 0
    duplicates = 0
    failures = 0
    items_out: List[ExtractedRequirement] = []

    def _as_item(reg_code: str, article: str, title: str, r, action: str) -> "ExtractedRequirement":
        # Uniform builder for response items; payload text is truncated.
        return ExtractedRequirement(
            regulation_code=reg_code,
            article=article,
            title=title,
            requirement_text=r.text[:1000],
            source_url=r.source_url,
            score=r.score,
            action=action,
        )

    for reg_code, entries in deduped_by_reg.items():
        if not entries:
            continue

        # Resolve (or simulate) the parent regulation for this group.
        try:
            head = entries[0][1]
            reg_name = head.regulation_name or head.regulation_short or reg_code
            if dry_run:
                # Dry run never touches the DB: fabricate an id and assume
                # no pre-existing articles.
                regulation_id = f"dry-run-{reg_code}"
                known_articles: set[str] = set()
            else:
                regulation = _get_or_create_regulation(db, reg_code, reg_name)
                regulation_id = regulation.id
                known_articles = _build_existing_articles(db, regulation_id)
        except Exception as e:
            logger.error("Failed to get/create regulation %s: %s", reg_code, e)
            failures += len(entries)
            continue

        for article, r in entries:
            title = _derive_title(r.text, article)

            if article in known_articles:
                duplicates += 1
                items_out.append(_as_item(reg_code, article, title, r, "skipped_duplicate"))
                continue

            if dry_run:
                created += 1  # dry_run: count as would-create
            else:
                try:
                    repo.create(
                        regulation_id=regulation_id,
                        article=article,
                        title=title,
                        description=f"Extrahiert aus RAG-Korpus (Collection: {r.category or r.regulation_code}). Score: {r.score:.2f}",
                        requirement_text=r.text[:2000],
                        breakpilot_interpretation=None,
                        is_applicable=True,
                        priority=2,
                    )
                    known_articles.add(article)  # prevent intra-batch duplication
                    created += 1
                except Exception as e:
                    logger.error("Failed to create requirement %s/%s: %s", reg_code, article, e)
                    failures += 1
                    continue

            items_out.append(
                _as_item(reg_code, article, title, r, "would_create" if dry_run else "created")
            )

    return {
        "created_count": created,
        "skipped_dup_count": duplicates,
        "failed_count": failures,
        "result_items": items_out,
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Endpoint
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -225,126 +388,19 @@ async def extract_requirements_from_rag(
|
||||
|
||||
logger.info("RAG extraction: %d raw results from %d collections", len(all_results), len(collections))
|
||||
|
||||
# --- 2. Filter by regulation_codes if requested ---
|
||||
if body.regulation_codes:
|
||||
all_results = [
|
||||
r for r in all_results
|
||||
if r.regulation_code in body.regulation_codes
|
||||
]
|
||||
# --- 2. Parse, filter, deduplicate, and group ---
|
||||
parsed = _parse_rag_results(all_results, body.regulation_codes)
|
||||
deduped_by_reg = parsed["deduped_by_reg"]
|
||||
skipped_no_article = parsed["skipped_no_article"]
|
||||
|
||||
# --- 3. Deduplicate at result level (regulation_code + article) ---
|
||||
seen: set[tuple[str, str]] = set()
|
||||
unique_results: List[RAGSearchResult] = []
|
||||
for r in sorted(all_results, key=lambda x: x.score, reverse=True):
|
||||
article = _normalize_article(r)
|
||||
if not article:
|
||||
continue
|
||||
key = (r.regulation_code, article)
|
||||
if key not in seen:
|
||||
seen.add(key)
|
||||
unique_results.append(r)
|
||||
logger.info("RAG extraction: %d unique (regulation, article) pairs", parsed["unique_count"])
|
||||
|
||||
logger.info("RAG extraction: %d unique (regulation, article) pairs", len(unique_results))
|
||||
|
||||
# --- 4. Group by regulation_code and process ---
|
||||
by_reg: Dict[str, List[tuple[str, RAGSearchResult]]] = {}
|
||||
skipped_no_article: List[RAGSearchResult] = []
|
||||
|
||||
for r in all_results:
|
||||
article = _normalize_article(r)
|
||||
if not article:
|
||||
skipped_no_article.append(r)
|
||||
continue
|
||||
key_r = r.regulation_code or "UNKNOWN"
|
||||
if key_r not in by_reg:
|
||||
by_reg[key_r] = []
|
||||
by_reg[key_r].append((article, r))
|
||||
|
||||
# Deduplicate within groups
|
||||
deduped_by_reg: Dict[str, List[tuple[str, RAGSearchResult]]] = {}
|
||||
for reg_code, items in by_reg.items():
|
||||
seen_articles: set[str] = set()
|
||||
deduped: List[tuple[str, RAGSearchResult]] = []
|
||||
for art, r in sorted(items, key=lambda x: x[1].score, reverse=True):
|
||||
if art not in seen_articles:
|
||||
seen_articles.add(art)
|
||||
deduped.append((art, r))
|
||||
deduped_by_reg[reg_code] = deduped
|
||||
|
||||
# --- 5. Create requirements ---
|
||||
req_repo = RequirementRepository(db)
|
||||
created_count = 0
|
||||
skipped_dup_count = 0
|
||||
failed_count = 0
|
||||
result_items: List[ExtractedRequirement] = []
|
||||
|
||||
for reg_code, items in deduped_by_reg.items():
|
||||
if not items:
|
||||
continue
|
||||
|
||||
# Find or create regulation
|
||||
try:
|
||||
first_result = items[0][1]
|
||||
regulation_name = first_result.regulation_name or first_result.regulation_short or reg_code
|
||||
if body.dry_run:
|
||||
# For dry_run, fake a regulation id
|
||||
regulation_id = f"dry-run-{reg_code}"
|
||||
existing_articles: set[str] = set()
|
||||
else:
|
||||
reg = _get_or_create_regulation(db, reg_code, regulation_name)
|
||||
regulation_id = reg.id
|
||||
existing_articles = _build_existing_articles(db, regulation_id)
|
||||
except Exception as e:
|
||||
logger.error("Failed to get/create regulation %s: %s", reg_code, e)
|
||||
failed_count += len(items)
|
||||
continue
|
||||
|
||||
for article, r in items:
|
||||
title = _derive_title(r.text, article)
|
||||
|
||||
if article in existing_articles:
|
||||
skipped_dup_count += 1
|
||||
result_items.append(ExtractedRequirement(
|
||||
regulation_code=reg_code,
|
||||
article=article,
|
||||
title=title,
|
||||
requirement_text=r.text[:1000],
|
||||
source_url=r.source_url,
|
||||
score=r.score,
|
||||
action="skipped_duplicate",
|
||||
))
|
||||
continue
|
||||
|
||||
if not body.dry_run:
|
||||
try:
|
||||
req_repo.create(
|
||||
regulation_id=regulation_id,
|
||||
article=article,
|
||||
title=title,
|
||||
description=f"Extrahiert aus RAG-Korpus (Collection: {r.category or r.regulation_code}). Score: {r.score:.2f}",
|
||||
requirement_text=r.text[:2000],
|
||||
breakpilot_interpretation=None,
|
||||
is_applicable=True,
|
||||
priority=2,
|
||||
)
|
||||
existing_articles.add(article) # prevent intra-batch duplication
|
||||
created_count += 1
|
||||
except Exception as e:
|
||||
logger.error("Failed to create requirement %s/%s: %s", reg_code, article, e)
|
||||
failed_count += 1
|
||||
continue
|
||||
else:
|
||||
created_count += 1 # dry_run: count as would-create
|
||||
|
||||
result_items.append(ExtractedRequirement(
|
||||
regulation_code=reg_code,
|
||||
article=article,
|
||||
title=title,
|
||||
requirement_text=r.text[:1000],
|
||||
source_url=r.source_url,
|
||||
score=r.score,
|
||||
action="created" if not body.dry_run else "would_create",
|
||||
))
|
||||
# --- 3. Create requirements ---
|
||||
store_result = _store_requirements(db, deduped_by_reg, body.dry_run)
|
||||
created_count = store_result["created_count"]
|
||||
skipped_dup_count = store_result["skipped_dup_count"]
|
||||
failed_count = store_result["failed_count"]
|
||||
result_items = store_result["result_items"]
|
||||
|
||||
message = (
|
||||
f"{'[DRY RUN] ' if body.dry_run else ''}"
|
||||
|
||||
Reference in New Issue
Block a user