325 lines
9.7 KiB
Python
325 lines
9.7 KiB
Python
"""
|
|
Backlog Generator
|
|
Automatically creates GitHub issues for test failures and regressions
|
|
"""
|
|
from __future__ import annotations

import json
import subprocess
from collections import Counter
from datetime import datetime
from typing import List, Optional

import structlog

from bqas.config import BQASConfig
from bqas.metrics import BQASMetrics, TestResult
from bqas.regression_tracker import TestRun
|
|
|
|
# Module-level structured logger for this module.
logger = structlog.get_logger(__name__)


# Markdown body template for a failure-report issue; the placeholders are
# filled via str.format() in BacklogGenerator.create_issue().
ISSUE_TEMPLATE = """## BQAS Test Failure Report

**Test Run:** {timestamp}
**Git Commit:** {commit}
**Git Branch:** {branch}

### Summary

- **Total Tests:** {total_tests}
- **Passed:** {passed_tests}
- **Failed:** {failed_tests}
- **Pass Rate:** {pass_rate:.1f}%
- **Average Score:** {avg_score:.3f}/5

### Failed Tests

{failed_tests_table}

### Regression Alert

{regression_info}

### Suggested Actions

{suggestions}

### By Intent

{intent_breakdown}

---
_Automatisch generiert von BQAS (Breakpilot Quality Assurance System)_
"""

# One markdown table row per failed test; the column order must match the
# header row built in BacklogGenerator._format_failed_tests().
FAILED_TEST_ROW = """| {test_id} | {test_name} | {expected} | {detected} | {score} | {reasoning} |"""
|
|
|
|
|
|
class BacklogGenerator:
    """Generates GitHub issues for test failures and regressions.

    All GitHub interaction goes through the ``gh`` CLI, so issue creation is
    best-effort: when no repo is configured, or the CLI is missing,
    unauthenticated, or times out, the methods log a warning/error and return
    ``None`` (or ``[]``) instead of raising.
    """

    def __init__(self, config: Optional[BQASConfig] = None):
        """Initialize with an explicit config, or fall back to the environment."""
        self.config = config or BQASConfig.from_env()

    def _check_gh_available(self) -> bool:
        """Return True if the gh CLI is installed and authenticated."""
        try:
            result = subprocess.run(
                ["gh", "auth", "status"],
                capture_output=True,
                text=True,
                timeout=30,  # don't hang forever if gh stalls on the network
            )
        except (FileNotFoundError, subprocess.TimeoutExpired):
            return False
        return result.returncode == 0

    @staticmethod
    def _escape_cell(text: str) -> str:
        """Escape pipe characters so free text cannot break a markdown table."""
        return text.replace("|", "\\|")

    def _format_failed_tests(self, results: List[TestResult]) -> str:
        """Format failed tests as a markdown table (at most 20 rows)."""
        if not results:
            return "_Keine fehlgeschlagenen Tests_"

        lines = [
            "| Test ID | Name | Expected | Detected | Score | Reason |",
            "|---------|------|----------|----------|-------|--------|",
        ]

        for r in results[:20]:  # cap rows so the issue body stays readable
            reasoning = r.reasoning[:50] + "..." if len(r.reasoning) > 50 else r.reasoning
            lines.append(FAILED_TEST_ROW.format(
                test_id=r.test_id,
                # Escape '|' in free-text cells so they cannot break the table.
                test_name=self._escape_cell(r.test_name[:30]),
                expected=r.expected_intent,
                detected=r.detected_intent,
                score=f"{r.composite_score:.2f}",
                reasoning=self._escape_cell(reasoning),
            ))

        if len(results) > 20:
            lines.append(f"| ... | _und {len(results) - 20} weitere_ | | | | |")

        return "\n".join(lines)

    def _generate_suggestions(self, results: List[TestResult]) -> str:
        """Generate a markdown checklist of improvement suggestions.

        Heuristics: the most frequently failing expected intent, tests with
        low intent-detection accuracy, safety failures, and low coherence.
        """
        suggestions = []

        # Count failures per expected intent to surface problematic patterns.
        intent_failures = Counter(r.expected_intent for r in results)
        if intent_failures:
            worst_intent, worst_count = intent_failures.most_common(1)[0]
            suggestions.append(
                f"- [ ] **Intent '{worst_intent}'** hat {worst_count} Fehler - Muster ueberpruefen"
            )

        # Tests where intent detection accuracy was below 50%.
        low_accuracy = [r for r in results if r.intent_accuracy < 50]
        if low_accuracy:
            suggestions.append(
                f"- [ ] {len(low_accuracy)} Tests mit niedriger Intent-Genauigkeit (<50%) - Patterns erweitern"
            )

        # Safety failures are highlighted because they may indicate PII leaks.
        safety_fails = [r for r in results if r.safety == "fail"]
        if safety_fails:
            suggestions.append(
                f"- [ ] **{len(safety_fails)} Safety-Failures** - PII-Filter pruefen"
            )

        # Responses judged incoherent (score below 3 of 5).
        low_coherence = [r for r in results if r.coherence < 3]
        if low_coherence:
            suggestions.append(
                f"- [ ] {len(low_coherence)} Tests mit niedriger Kohaerenz - Response-Generierung pruefen"
            )

        if not suggestions:
            suggestions.append("- [ ] Detaillierte Analyse der Fehler durchfuehren")

        return "\n".join(suggestions)

    def _format_intent_breakdown(self, metrics: BQASMetrics) -> str:
        """Format per-intent average scores as a markdown table, worst first."""
        if not metrics.scores_by_intent:
            return "_Keine Intent-Aufschluesselung verfuegbar_"

        lines = ["| Intent | Score |", "|--------|-------|"]

        # Ascending by score so the most problematic intents appear on top.
        for intent, score in sorted(metrics.scores_by_intent.items(), key=lambda x: x[1]):
            # Traffic-light marker: red < 3.0 <= yellow < 4.0 <= green.
            emoji = "🔴" if score < 3.0 else "🟡" if score < 4.0 else "🟢"
            lines.append(f"| {emoji} {intent} | {score:.3f} |")

        return "\n".join(lines)

    def _gh_create_issue(self, title: str, body: str, labels: str) -> Optional[str]:
        """Create a GitHub issue via the gh CLI.

        Args:
            title: Issue title.
            body: Markdown issue body.
            labels: Comma-separated label list passed to ``--label``.

        Returns:
            The created issue URL, or None on any failure (logged, not raised).
        """
        try:
            result = subprocess.run(
                [
                    "gh", "issue", "create",
                    "--repo", self.config.github_repo,
                    "--title", title,
                    "--body", body,
                    "--label", labels,
                ],
                capture_output=True,
                text=True,
                timeout=60,  # bound the call; a hung gh must not block the run
            )
        except Exception as e:
            logger.error("Issue creation failed", error=str(e))
            return None

        if result.returncode == 0:
            issue_url = result.stdout.strip()
            logger.info("GitHub issue created", url=issue_url)
            return issue_url

        logger.error("Failed to create issue", error=result.stderr)
        return None

    async def create_issue(
        self,
        run: TestRun,
        metrics: BQASMetrics,
        failed_results: List[TestResult],
        regression_delta: float = 0.0,
    ) -> Optional[str]:
        """
        Create a GitHub issue for test failures.

        Args:
            run: Test run record
            metrics: Aggregated metrics
            failed_results: List of failed test results
            regression_delta: Score regression amount (> 0 means regression)

        Returns:
            Issue URL if created, None otherwise
        """
        if not self.config.github_repo:
            logger.warning("GitHub repo not configured, skipping issue creation")
            return None

        if not self._check_gh_available():
            logger.warning("gh CLI not available or not authenticated")
            return None

        # Format regression info
        if regression_delta > 0:
            regression_info = f"**Regression erkannt!** Score um **{regression_delta:.3f}** gefallen."
        else:
            regression_info = "Keine signifikante Regression."

        # Guard against division by zero when the run executed no tests.
        pass_rate = (
            metrics.passed_tests / metrics.total_tests * 100
            if metrics.total_tests > 0
            else 0
        )

        # Build issue body
        body = ISSUE_TEMPLATE.format(
            timestamp=run.timestamp.isoformat(),
            commit=run.git_commit,
            branch=run.git_branch,
            total_tests=metrics.total_tests,
            passed_tests=metrics.passed_tests,
            failed_tests=metrics.failed_tests,
            pass_rate=pass_rate,
            avg_score=metrics.avg_composite_score,
            failed_tests_table=self._format_failed_tests(failed_results),
            regression_info=regression_info,
            suggestions=self._generate_suggestions(failed_results),
            intent_breakdown=self._format_intent_breakdown(metrics),
        )

        title = f"BQAS: {metrics.failed_tests} Test-Failures ({run.git_commit})"

        # NOTE(review): subprocess.run blocks the event loop; acceptable for a
        # short gh call, but switch to asyncio.create_subprocess_exec if this
        # ever runs inside a latency-sensitive loop.
        return self._gh_create_issue(title, body, "bqas,automated,quality")

    async def create_regression_alert(
        self,
        current_score: float,
        previous_avg: float,
        delta: float,
        run: TestRun,
    ) -> Optional[str]:
        """
        Create a specific regression alert issue.

        Args:
            current_score: Current test score
            previous_avg: Average of previous runs
            delta: Score difference (positive magnitude of the drop)
            run: Current test run

        Returns:
            Issue URL if created
        """
        if not self.config.github_repo:
            return None

        # Consistent with create_issue(): bail out early when gh is unusable
        # instead of failing inside the subprocess call.
        if not self._check_gh_available():
            logger.warning("gh CLI not available or not authenticated")
            return None

        body = f"""## Regression Alert

**Current Score:** {current_score:.3f}
**Previous Average:** {previous_avg:.3f}
**Delta:** -{delta:.3f}

### Context

- **Commit:** {run.git_commit}
- **Branch:** {run.git_branch}
- **Timestamp:** {run.timestamp.isoformat()}

### Action Required

Die Testqualitaet ist signifikant gefallen. Bitte pruefen:

1. Letzte Commits auf moegliche Regressionen
2. Intent-Router Patterns
3. LLM Responses
4. Edge Cases

---
_Automatisch generiert von BQAS_
"""

        title = f"🔴 BQAS Regression: Score -{delta:.3f}"

        return self._gh_create_issue(title, body, "bqas,regression,urgent")

    def list_bqas_issues(self) -> List[dict]:
        """List existing BQAS-labelled issues.

        Returns:
            Parsed issue dicts (number, title, state, createdAt); empty list
            on any failure (logged, not raised).
        """
        if not self.config.github_repo:
            return []

        try:
            result = subprocess.run(
                [
                    "gh", "issue", "list",
                    "--repo", self.config.github_repo,
                    "--label", "bqas",
                    "--json", "number,title,state,createdAt",
                ],
                capture_output=True,
                text=True,
                timeout=60,  # bound the call like the other gh invocations
            )
            if result.returncode == 0:
                return json.loads(result.stdout)
            # Log the CLI error instead of silently returning an empty list.
            logger.error("Failed to list issues", error=result.stderr)
        except Exception as e:
            logger.error("Failed to list issues", error=str(e))

        return []
|