fix(quality): Ruff/CVE/TS-Fixes, 104 neue Tests, Complexity-Refactoring
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-ai-compliance (push) Failing after 30s
CI / test-python-backend-compliance (push) Successful in 30s
CI / test-python-document-crawler (push) Successful in 21s
CI / test-python-dsms-gateway (push) Successful in 17s

- Ruff: 144 auto-fixes (unused imports, == None → is None), F821/F811/F841 manuell
- CVEs: python-multipart>=0.0.22, weasyprint>=68.0, pillow>=12.1.1, npm audit fix (0 vulns)
- TS: 5 tote Drafting-Engine-Dateien entfernt, allowed-facts/sanitizer/StepHeader/context fixes
- Tests: +104 (ISMS 58, Evidence 18, VVT 14, Generation 14) → 1449 passed
- Refactoring: collect_ci_evidence (F→A), row_to_response (E→A), extract_requirements (E→A)
- Dead Code: pca-platform, 7 Go-Handler, dsr_api.py, duplicate Schemas entfernt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-07 19:00:33 +01:00
parent 6509e64dd9
commit 95fcba34cd
124 changed files with 2533 additions and 15709 deletions

View File

@@ -20,13 +20,13 @@ import asyncio
from typing import Optional, List, Dict
from datetime import datetime
from fastapi import APIRouter, Depends, Query
from fastapi import APIRouter, Depends
from pydantic import BaseModel
from sqlalchemy.orm import Session
from classroom_engine.database import get_db
from ..db import RegulationRepository, RequirementRepository
from ..db.models import RegulationDB, RequirementDB, RegulationTypeEnum
from ..db.models import RegulationDB, RegulationTypeEnum
from ..services.rag_client import get_rag_client, RAGSearchResult
logger = logging.getLogger(__name__)
@@ -185,6 +185,169 @@ def _build_existing_articles(
return {r.article for r in existing}
# ---------------------------------------------------------------------------
# Extraction helpers — independently testable
# ---------------------------------------------------------------------------
def _parse_rag_results(
    all_results: List[RAGSearchResult],
    regulation_codes: Optional[List[str]] = None,
) -> dict:
    """
    Filter, deduplicate, and group RAG search results by regulation code.

    Results without a normalizable article reference (per ``_normalize_article``)
    are collected separately; within each regulation group only the
    highest-scoring result per article is kept.

    Args:
        all_results: Raw search hits from the RAG backend.
        regulation_codes: Optional whitelist; when given, hits whose
            ``regulation_code`` is not in the list are dropped.

    Returns:
        dict with keys:
        - "deduped_by_reg": Dict[str, List[tuple[str, RAGSearchResult]]]
          mapping regulation code (``"UNKNOWN"`` when missing) to
          (article, result) pairs, best score first, one entry per article.
        - "skipped_no_article": List[RAGSearchResult] — hits lacking an article.
        - "unique_count": int — total unique (regulation, article) pairs,
          consistent with the contents of "deduped_by_reg".
    """
    # Optional filter by requested regulation codes.
    if regulation_codes:
        all_results = [
            r for r in all_results
            if r.regulation_code in regulation_codes
        ]

    # Single grouping pass: bucket by normalized regulation code, collecting
    # results that lack an article reference separately.
    by_reg: Dict[str, List[tuple[str, RAGSearchResult]]] = {}
    skipped_no_article: List[RAGSearchResult] = []
    for r in all_results:
        article = _normalize_article(r)
        if not article:
            skipped_no_article.append(r)
            continue
        by_reg.setdefault(r.regulation_code or "UNKNOWN", []).append((article, r))

    # Within each group keep only the best-scoring hit per article.
    deduped_by_reg: Dict[str, List[tuple[str, RAGSearchResult]]] = {}
    for reg_code, items in by_reg.items():
        seen_articles: set[str] = set()
        deduped: List[tuple[str, RAGSearchResult]] = []
        for art, r in sorted(items, key=lambda x: x[1].score, reverse=True):
            if art not in seen_articles:
                seen_articles.add(art)
                deduped.append((art, r))
        deduped_by_reg[reg_code] = deduped

    # Derive the count from the deduped groups so it always matches them.
    # (Previously a separate pass keyed on the raw regulation_code — possibly
    # None — and could disagree with the "UNKNOWN"-normalized grouping.)
    unique_count = sum(len(v) for v in deduped_by_reg.values())

    return {
        "deduped_by_reg": deduped_by_reg,
        "skipped_no_article": skipped_no_article,
        "unique_count": unique_count,
    }
def _store_requirements(
    db: Session,
    deduped_by_reg: Dict[str, List[tuple[str, "RAGSearchResult"]]],
    dry_run: bool,
) -> dict:
    """
    Persist extracted requirements to the database (or simulate in dry_run mode).

    For each regulation group the target regulation row is resolved (or, in
    dry_run mode, faked), then each (article, result) pair is either skipped
    as a duplicate, created, or — in dry_run mode — counted as would-create.

    Returns a dict with:
    - created_count: int
    - skipped_dup_count: int
    - failed_count: int
    - result_items: List[ExtractedRequirement]
    """
    repo = RequirementRepository(db)
    created = 0
    duplicates = 0
    failures = 0
    items_out: List[ExtractedRequirement] = []

    for reg_code, group in deduped_by_reg.items():
        if not group:
            continue

        # Resolve (or fabricate, for dry runs) the target regulation.
        try:
            head = group[0][1]
            reg_name = head.regulation_name or head.regulation_short or reg_code
            if dry_run:
                # Dry run never touches the DB — fake an id, assume no existing articles.
                regulation_id = f"dry-run-{reg_code}"
                existing_articles: set[str] = set()
            else:
                regulation = _get_or_create_regulation(db, reg_code, reg_name)
                regulation_id = regulation.id
                existing_articles = _build_existing_articles(db, regulation_id)
        except Exception as exc:
            logger.error("Failed to get/create regulation %s: %s", reg_code, exc)
            failures += len(group)
            continue

        for article, hit in group:
            title = _derive_title(hit.text, article)

            def _item(action: str) -> ExtractedRequirement:
                # Response item for the current (article, hit) pair.
                return ExtractedRequirement(
                    regulation_code=reg_code,
                    article=article,
                    title=title,
                    requirement_text=hit.text[:1000],
                    source_url=hit.source_url,
                    score=hit.score,
                    action=action,
                )

            if article in existing_articles:
                duplicates += 1
                items_out.append(_item("skipped_duplicate"))
                continue

            if dry_run:
                created += 1  # dry_run: count as would-create
                items_out.append(_item("would_create"))
                continue

            try:
                repo.create(
                    regulation_id=regulation_id,
                    article=article,
                    title=title,
                    description=f"Extrahiert aus RAG-Korpus (Collection: {hit.category or hit.regulation_code}). Score: {hit.score:.2f}",
                    requirement_text=hit.text[:2000],
                    breakpilot_interpretation=None,
                    is_applicable=True,
                    priority=2,
                )
                existing_articles.add(article)  # prevent intra-batch duplication
                created += 1
            except Exception as exc:
                logger.error("Failed to create requirement %s/%s: %s", reg_code, article, exc)
                failures += 1
                continue
            items_out.append(_item("created"))

    return {
        "created_count": created,
        "skipped_dup_count": duplicates,
        "failed_count": failures,
        "result_items": items_out,
    }
# ---------------------------------------------------------------------------
# Endpoint
# ---------------------------------------------------------------------------
@@ -225,126 +388,19 @@ async def extract_requirements_from_rag(
logger.info("RAG extraction: %d raw results from %d collections", len(all_results), len(collections))
# --- 2. Filter by regulation_codes if requested ---
if body.regulation_codes:
all_results = [
r for r in all_results
if r.regulation_code in body.regulation_codes
]
# --- 2. Parse, filter, deduplicate, and group ---
parsed = _parse_rag_results(all_results, body.regulation_codes)
deduped_by_reg = parsed["deduped_by_reg"]
skipped_no_article = parsed["skipped_no_article"]
# --- 3. Deduplicate at result level (regulation_code + article) ---
seen: set[tuple[str, str]] = set()
unique_results: List[RAGSearchResult] = []
for r in sorted(all_results, key=lambda x: x.score, reverse=True):
article = _normalize_article(r)
if not article:
continue
key = (r.regulation_code, article)
if key not in seen:
seen.add(key)
unique_results.append(r)
logger.info("RAG extraction: %d unique (regulation, article) pairs", parsed["unique_count"])
logger.info("RAG extraction: %d unique (regulation, article) pairs", len(unique_results))
# --- 4. Group by regulation_code and process ---
by_reg: Dict[str, List[tuple[str, RAGSearchResult]]] = {}
skipped_no_article: List[RAGSearchResult] = []
for r in all_results:
article = _normalize_article(r)
if not article:
skipped_no_article.append(r)
continue
key_r = r.regulation_code or "UNKNOWN"
if key_r not in by_reg:
by_reg[key_r] = []
by_reg[key_r].append((article, r))
# Deduplicate within groups
deduped_by_reg: Dict[str, List[tuple[str, RAGSearchResult]]] = {}
for reg_code, items in by_reg.items():
seen_articles: set[str] = set()
deduped: List[tuple[str, RAGSearchResult]] = []
for art, r in sorted(items, key=lambda x: x[1].score, reverse=True):
if art not in seen_articles:
seen_articles.add(art)
deduped.append((art, r))
deduped_by_reg[reg_code] = deduped
# --- 5. Create requirements ---
req_repo = RequirementRepository(db)
created_count = 0
skipped_dup_count = 0
failed_count = 0
result_items: List[ExtractedRequirement] = []
for reg_code, items in deduped_by_reg.items():
if not items:
continue
# Find or create regulation
try:
first_result = items[0][1]
regulation_name = first_result.regulation_name or first_result.regulation_short or reg_code
if body.dry_run:
# For dry_run, fake a regulation id
regulation_id = f"dry-run-{reg_code}"
existing_articles: set[str] = set()
else:
reg = _get_or_create_regulation(db, reg_code, regulation_name)
regulation_id = reg.id
existing_articles = _build_existing_articles(db, regulation_id)
except Exception as e:
logger.error("Failed to get/create regulation %s: %s", reg_code, e)
failed_count += len(items)
continue
for article, r in items:
title = _derive_title(r.text, article)
if article in existing_articles:
skipped_dup_count += 1
result_items.append(ExtractedRequirement(
regulation_code=reg_code,
article=article,
title=title,
requirement_text=r.text[:1000],
source_url=r.source_url,
score=r.score,
action="skipped_duplicate",
))
continue
if not body.dry_run:
try:
req_repo.create(
regulation_id=regulation_id,
article=article,
title=title,
description=f"Extrahiert aus RAG-Korpus (Collection: {r.category or r.regulation_code}). Score: {r.score:.2f}",
requirement_text=r.text[:2000],
breakpilot_interpretation=None,
is_applicable=True,
priority=2,
)
existing_articles.add(article) # prevent intra-batch duplication
created_count += 1
except Exception as e:
logger.error("Failed to create requirement %s/%s: %s", reg_code, article, e)
failed_count += 1
continue
else:
created_count += 1 # dry_run: count as would-create
result_items.append(ExtractedRequirement(
regulation_code=reg_code,
article=article,
title=title,
requirement_text=r.text[:1000],
source_url=r.source_url,
score=r.score,
action="created" if not body.dry_run else "would_create",
))
# --- 3. Create requirements ---
store_result = _store_requirements(db, deduped_by_reg, body.dry_run)
created_count = store_result["created_count"]
skipped_dup_count = store_result["skipped_dup_count"]
failed_count = store_result["failed_count"]
result_items = store_result["result_items"]
message = (
f"{'[DRY RUN] ' if body.dry_run else ''}"