This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
breakpilot-pwa/voice-service/bqas/backlog_generator.py
BreakPilot Dev 19855efacc
Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
feat: BreakPilot PWA - Full codebase (clean push without large binaries)
All services: admin-v2, studio-v2, website, ai-compliance-sdk,
consent-service, klausur-service, voice-service, and infrastructure.
Large PDFs and compiled binaries excluded via .gitignore.
2026-02-11 13:25:58 +01:00

325 lines
9.7 KiB
Python

"""
Backlog Generator
Automatically creates GitHub issues for test failures and regressions
"""
import json
import subprocess
from collections import Counter
from datetime import datetime
from typing import List, Optional

import structlog

from bqas.config import BQASConfig
from bqas.metrics import BQASMetrics, TestResult
from bqas.regression_tracker import TestRun
logger = structlog.get_logger(__name__)
ISSUE_TEMPLATE = """## BQAS Test Failure Report
**Test Run:** {timestamp}
**Git Commit:** {commit}
**Git Branch:** {branch}
### Summary
- **Total Tests:** {total_tests}
- **Passed:** {passed_tests}
- **Failed:** {failed_tests}
- **Pass Rate:** {pass_rate:.1f}%
- **Average Score:** {avg_score:.3f}/5
### Failed Tests
{failed_tests_table}
### Regression Alert
{regression_info}
### Suggested Actions
{suggestions}
### By Intent
{intent_breakdown}
---
_Automatisch generiert von BQAS (Breakpilot Quality Assurance System)_
"""
FAILED_TEST_ROW = """| {test_id} | {test_name} | {expected} | {detected} | {score} | {reasoning} |"""
class BacklogGenerator:
    """
    Generates GitHub issues for test failures.

    Uses the ``gh`` CLI for GitHub integration, so authentication and
    token handling are delegated entirely to ``gh auth`` — this class
    never touches credentials itself.
    """

    # Maximum number of failed tests rendered in the issue table; anything
    # beyond this is summarized in a single trailing "... und N weitere" row.
    MAX_TABLE_ROWS = 20

    def __init__(self, config: Optional[BQASConfig] = None):
        """
        Args:
            config: BQAS configuration. Defaults to values read from the
                environment via ``BQASConfig.from_env()``.
        """
        self.config = config or BQASConfig.from_env()

    def _check_gh_available(self) -> bool:
        """Check if the gh CLI is installed and authenticated.

        Returns:
            True if ``gh auth status`` exits 0 (authenticated), False if the
            check fails or the ``gh`` binary is not on PATH.
        """
        try:
            # `gh auth status` exits non-zero when not logged in.
            result = subprocess.run(
                ["gh", "auth", "status"],
                capture_output=True,
                text=True,
            )
        except FileNotFoundError:
            # gh binary is not installed / not on PATH.
            return False
        return result.returncode == 0

    def _format_failed_tests(self, results: List[TestResult]) -> str:
        """Format failed tests as a markdown table.

        Args:
            results: Failed test results to render.

        Returns:
            A markdown table capped at ``MAX_TABLE_ROWS`` rows, or a German
            placeholder line if there are no failures.
        """
        if not results:
            return "_Keine fehlgeschlagenen Tests_"
        lines = [
            "| Test ID | Name | Expected | Detected | Score | Reason |",
            "|---------|------|----------|----------|-------|--------|",
        ]
        for r in results[:self.MAX_TABLE_ROWS]:
            # Truncate free-text fields so one verbose test cannot blow up
            # the table layout.
            reasoning = r.reasoning[:50] + "..." if len(r.reasoning) > 50 else r.reasoning
            lines.append(FAILED_TEST_ROW.format(
                test_id=r.test_id,
                test_name=r.test_name[:30],
                expected=r.expected_intent,
                detected=r.detected_intent,
                score=f"{r.composite_score:.2f}",
                reasoning=reasoning,
            ))
        if len(results) > self.MAX_TABLE_ROWS:
            lines.append(f"| ... | _und {len(results) - self.MAX_TABLE_ROWS} weitere_ | | | | |")
        return "\n".join(lines)

    def _generate_suggestions(self, results: List[TestResult]) -> str:
        """Generate improvement suggestions based on failure patterns.

        Args:
            results: Failed test results to analyze.

        Returns:
            A markdown checklist (never empty — falls back to a generic
            "analyze in detail" item).
        """
        suggestions = []
        # Count failures per expected intent to find the worst offender.
        # Counter.most_common is stable, so tie-breaking matches insertion
        # order just like the previous sorted(..., reverse=True) approach.
        intent_failures = Counter(r.expected_intent for r in results)
        if intent_failures:
            worst_intent, worst_count = intent_failures.most_common(1)[0]
            suggestions.append(f"- [ ] **Intent '{worst_intent}'** hat {worst_count} Fehler - Muster ueberpruefen")
        # Tests where the intent router was mostly wrong (<50% accuracy).
        low_accuracy = [r for r in results if r.intent_accuracy < 50]
        if low_accuracy:
            suggestions.append(f"- [ ] {len(low_accuracy)} Tests mit niedriger Intent-Genauigkeit (<50%) - Patterns erweitern")
        # Safety failures are flagged in bold — they may indicate PII leaks.
        safety_fails = [r for r in results if r.safety == "fail"]
        if safety_fails:
            suggestions.append(f"- [ ] **{len(safety_fails)} Safety-Failures** - PII-Filter pruefen")
        # Low-coherence responses point at response generation, not routing.
        low_coherence = [r for r in results if r.coherence < 3]
        if low_coherence:
            suggestions.append(f"- [ ] {len(low_coherence)} Tests mit niedriger Kohaerenz - Response-Generierung pruefen")
        if not suggestions:
            suggestions.append("- [ ] Detaillierte Analyse der Fehler durchfuehren")
        return "\n".join(suggestions)

    def _format_intent_breakdown(self, metrics: BQASMetrics) -> str:
        """Format per-intent scores as a markdown table, worst first.

        Traffic-light emoji thresholds: red < 3.0, yellow < 4.0, else green.
        """
        if not metrics.scores_by_intent:
            return "_Keine Intent-Aufschluesselung verfuegbar_"
        lines = ["| Intent | Score |", "|--------|-------|"]
        # Ascending by score so the most problematic intents appear on top.
        for intent, score in sorted(metrics.scores_by_intent.items(), key=lambda x: x[1]):
            emoji = "🔴" if score < 3.0 else "🟡" if score < 4.0 else "🟢"
            lines.append(f"| {emoji} {intent} | {score:.3f} |")
        return "\n".join(lines)

    async def create_issue(
        self,
        run: TestRun,
        metrics: BQASMetrics,
        failed_results: List[TestResult],
        regression_delta: float = 0.0,
    ) -> Optional[str]:
        """
        Create a GitHub issue for test failures.

        Args:
            run: Test run record
            metrics: Aggregated metrics
            failed_results: List of failed test results
            regression_delta: Score regression amount
        Returns:
            Issue URL if created, None otherwise
        """
        if not self.config.github_repo:
            logger.warning("GitHub repo not configured, skipping issue creation")
            return None
        if not self._check_gh_available():
            logger.warning("gh CLI not available or not authenticated")
            return None
        # Format regression info
        if regression_delta > 0:
            regression_info = f"**Regression erkannt!** Score um **{regression_delta:.3f}** gefallen."
        else:
            regression_info = "Keine signifikante Regression."
        # Build issue body (guard against division by zero on empty runs).
        body = ISSUE_TEMPLATE.format(
            timestamp=run.timestamp.isoformat(),
            commit=run.git_commit,
            branch=run.git_branch,
            total_tests=metrics.total_tests,
            passed_tests=metrics.passed_tests,
            failed_tests=metrics.failed_tests,
            pass_rate=(metrics.passed_tests / metrics.total_tests * 100) if metrics.total_tests > 0 else 0,
            avg_score=metrics.avg_composite_score,
            failed_tests_table=self._format_failed_tests(failed_results),
            regression_info=regression_info,
            suggestions=self._generate_suggestions(failed_results),
            intent_breakdown=self._format_intent_breakdown(metrics),
        )
        title = f"BQAS: {metrics.failed_tests} Test-Failures ({run.git_commit})"
        try:
            # NOTE(review): subprocess.run blocks the event loop inside this
            # async method — acceptable for a CI-side tool, but consider
            # asyncio.to_thread if this ever runs inside a request handler.
            result = subprocess.run(
                [
                    "gh", "issue", "create",
                    "--repo", self.config.github_repo,
                    "--title", title,
                    "--body", body,
                    "--label", "bqas,automated,quality",
                ],
                capture_output=True,
                text=True,
            )
            if result.returncode == 0:
                # gh prints the new issue URL on stdout.
                issue_url = result.stdout.strip()
                logger.info("GitHub issue created", url=issue_url)
                return issue_url
            else:
                logger.error("Failed to create issue", error=result.stderr)
                return None
        except Exception as e:
            # Best-effort: issue creation must never crash a test run.
            logger.error("Issue creation failed", error=str(e))
            return None

    async def create_regression_alert(
        self,
        current_score: float,
        previous_avg: float,
        delta: float,
        run: TestRun,
    ) -> Optional[str]:
        """
        Create a specific regression alert issue.

        Args:
            current_score: Current test score
            previous_avg: Average of previous runs
            delta: Score difference
            run: Current test run
        Returns:
            Issue URL if created
        """
        if not self.config.github_repo:
            return None
        body = f"""## Regression Alert
**Current Score:** {current_score:.3f}
**Previous Average:** {previous_avg:.3f}
**Delta:** -{delta:.3f}
### Context
- **Commit:** {run.git_commit}
- **Branch:** {run.git_branch}
- **Timestamp:** {run.timestamp.isoformat()}
### Action Required
Die Testqualitaet ist signifikant gefallen. Bitte pruefen:
1. Letzte Commits auf moegliche Regressionen
2. Intent-Router Patterns
3. LLM Responses
4. Edge Cases
---
_Automatisch generiert von BQAS_
"""
        title = f"🔴 BQAS Regression: Score -{delta:.3f}"
        try:
            result = subprocess.run(
                [
                    "gh", "issue", "create",
                    "--repo", self.config.github_repo,
                    "--title", title,
                    "--body", body,
                    "--label", "bqas,regression,urgent",
                ],
                capture_output=True,
                text=True,
            )
            if result.returncode == 0:
                return result.stdout.strip()
            # Previously this failure path was silent; log it for parity
            # with create_issue.
            logger.error("Failed to create regression alert", error=result.stderr)
        except Exception as e:
            logger.error("Regression alert creation failed", error=str(e))
        return None

    def list_bqas_issues(self) -> List[dict]:
        """List existing BQAS-labeled issues in the configured repo.

        Returns:
            Parsed ``gh issue list`` JSON (number, title, state, createdAt),
            or an empty list if the repo is unconfigured or the call fails.
        """
        if not self.config.github_repo:
            return []
        try:
            result = subprocess.run(
                [
                    "gh", "issue", "list",
                    "--repo", self.config.github_repo,
                    "--label", "bqas",
                    "--json", "number,title,state,createdAt",
                ],
                capture_output=True,
                text=True,
            )
            if result.returncode == 0:
                return json.loads(result.stdout)
        except Exception as e:
            # Covers both subprocess errors and malformed JSON output.
            logger.error("Failed to list issues", error=str(e))
        return []