"""
Backlog Generator

Automatically creates GitHub issues for test failures and regressions
"""
import asyncio
import json
import subprocess
from collections import Counter
from datetime import datetime
from typing import List, Optional

import structlog

from bqas.config import BQASConfig
from bqas.regression_tracker import TestRun
from bqas.metrics import TestResult, BQASMetrics

logger = structlog.get_logger(__name__)

# Limits applied when rendering the failed-tests table.
MAX_FAILED_ROWS = 20
MAX_TEST_NAME_LENGTH = 30
MAX_REASONING_LENGTH = 50


ISSUE_TEMPLATE = """## BQAS Test Failure Report

**Test Run:** {timestamp}
**Git Commit:** {commit}
**Git Branch:** {branch}

### Summary

- **Total Tests:** {total_tests}
- **Passed:** {passed_tests}
- **Failed:** {failed_tests}
- **Pass Rate:** {pass_rate:.1f}%
- **Average Score:** {avg_score:.3f}/5

### Failed Tests

{failed_tests_table}

### Regression Alert

{regression_info}

### Suggested Actions

{suggestions}

### By Intent

{intent_breakdown}

---
_Automatisch generiert von BQAS (Breakpilot Quality Assurance System)_
"""

FAILED_TEST_ROW = """| {test_id} | {test_name} | {expected} | {detected} | {score} | {reasoning} |"""


def _md_cell(value: object) -> str:
    """Render ``value`` so it is safe inside a single Markdown table cell.

    Line breaks are collapsed to spaces and pipes are backslash-escaped;
    either one would otherwise terminate the cell or the row.
    """
    text = " ".join(str(value).split())
    return text.replace("|", "\\|")


class BacklogGenerator:
    """
    Generates GitHub issues for test failures.

    Uses gh CLI for GitHub integration.
    """

    def __init__(self, config: Optional[BQASConfig] = None):
        """Store ``config``, falling back to ``BQASConfig.from_env()``."""
        self.config = config or BQASConfig.from_env()

    @staticmethod
    def _run_gh(args: List[str]) -> subprocess.CompletedProcess:
        """Run ``gh`` with ``args`` synchronously, capturing text output."""
        return subprocess.run(
            ["gh", *args],
            capture_output=True,
            text=True,
        )

    async def _run_gh_async(self, args: List[str]) -> subprocess.CompletedProcess:
        """Run ``gh`` in the default executor so the event loop is not blocked."""
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self._run_gh, args)

    def _issue_create_args(self, title: str, body: str, labels: str) -> List[str]:
        """Build the ``gh issue create`` argument list for the configured repo."""
        return [
            "issue", "create",
            "--repo", self.config.github_repo,
            "--title", title,
            "--body", body,
            "--label", labels,
        ]

    def _check_gh_available(self) -> bool:
        """Check if gh CLI is available and authenticated."""
        try:
            result = self._run_gh(["auth", "status"])
        except OSError:
            # Covers a missing binary (FileNotFoundError) as well as e.g. a
            # non-executable one; an availability probe should not raise.
            return False
        return result.returncode == 0

    def _format_failed_tests(self, results: List[TestResult]) -> str:
        """Format failed tests as markdown table.

        At most ``MAX_FAILED_ROWS`` rows are rendered; a trailing row reports
        how many were omitted. Names and reasoning are truncated, and every
        cell is escaped so free text cannot break the table layout.
        """
        if not results:
            return "_Keine fehlgeschlagenen Tests_"

        lines = [
            "| Test ID | Name | Expected | Detected | Score | Reason |",
            "|---------|------|----------|----------|-------|--------|",
        ]

        for r in results[:MAX_FAILED_ROWS]:
            reasoning = r.reasoning or ""
            if len(reasoning) > MAX_REASONING_LENGTH:
                reasoning = reasoning[:MAX_REASONING_LENGTH] + "..."
            lines.append(FAILED_TEST_ROW.format(
                test_id=_md_cell(r.test_id),
                test_name=_md_cell(r.test_name[:MAX_TEST_NAME_LENGTH]),
                expected=_md_cell(r.expected_intent),
                detected=_md_cell(r.detected_intent),
                score=f"{r.composite_score:.2f}",
                reasoning=_md_cell(reasoning),
            ))

        if len(results) > MAX_FAILED_ROWS:
            lines.append(
                f"| ... | _und {len(results) - MAX_FAILED_ROWS} weitere_ | | | | |"
            )

        return "\n".join(lines)

    def _generate_suggestions(self, results: List[TestResult]) -> str:
        """Generate improvement suggestions based on failures.

        Returns a Markdown task list with one entry per detected pattern, or
        a single generic entry when no pattern matches.
        """
        suggestions = []

        # Most problematic intent: highest failure count, first seen wins ties.
        intent_failures = Counter(r.expected_intent for r in results)
        if intent_failures:
            worst_intent, worst_count = max(
                intent_failures.items(), key=lambda item: item[1]
            )
            suggestions.append(
                f"- [ ] **Intent '{worst_intent}'** hat {worst_count} Fehler - Muster ueberpruefen"
            )

        # Low accuracy
        low_accuracy = [r for r in results if r.intent_accuracy < 50]
        if low_accuracy:
            suggestions.append(
                f"- [ ] {len(low_accuracy)} Tests mit niedriger Intent-Genauigkeit (<50%) - Patterns erweitern"
            )

        # Safety failures
        safety_fails = [r for r in results if r.safety == "fail"]
        if safety_fails:
            suggestions.append(
                f"- [ ] **{len(safety_fails)} Safety-Failures** - PII-Filter pruefen"
            )

        # Low coherence
        low_coherence = [r for r in results if r.coherence < 3]
        if low_coherence:
            suggestions.append(
                f"- [ ] {len(low_coherence)} Tests mit niedriger Kohaerenz - Response-Generierung pruefen"
            )

        if not suggestions:
            suggestions.append("- [ ] Detaillierte Analyse der Fehler durchfuehren")

        return "\n".join(suggestions)

    def _format_intent_breakdown(self, metrics: BQASMetrics) -> str:
        """Format scores by intent as a Markdown table, lowest score first."""
        if not metrics.scores_by_intent:
            return "_Keine Intent-Aufschluesselung verfuegbar_"

        lines = ["| Intent | Score |", "|--------|-------|"]

        for intent, score in sorted(metrics.scores_by_intent.items(), key=lambda x: x[1]):
            if score < 3.0:
                emoji = "🔴"
            elif score < 4.0:
                emoji = "🟡"
            else:
                emoji = "🟢"
            lines.append(f"| {emoji} {_md_cell(intent)} | {score:.3f} |")

        return "\n".join(lines)

    async def create_issue(
        self,
        run: TestRun,
        metrics: BQASMetrics,
        failed_results: List[TestResult],
        regression_delta: float = 0.0,
    ) -> Optional[str]:
        """
        Create a GitHub issue for test failures.

        The gh CLI calls run in the default executor so the event loop stays
        responsive while gh talks to GitHub.

        Args:
            run: Test run record
            metrics: Aggregated metrics
            failed_results: List of failed test results
            regression_delta: Score regression amount

        Returns:
            Issue URL if created, None otherwise
        """
        if not self.config.github_repo:
            logger.warning("GitHub repo not configured, skipping issue creation")
            return None

        loop = asyncio.get_running_loop()
        if not await loop.run_in_executor(None, self._check_gh_available):
            logger.warning("gh CLI not available or not authenticated")
            return None

        # Format regression info
        if regression_delta > 0:
            regression_info = f"**Regression erkannt!** Score um **{regression_delta:.3f}** gefallen."
        else:
            regression_info = "Keine signifikante Regression."

        # Guard against division by zero for an empty run.
        if metrics.total_tests > 0:
            pass_rate = metrics.passed_tests / metrics.total_tests * 100
        else:
            pass_rate = 0

        # Build issue body
        body = ISSUE_TEMPLATE.format(
            timestamp=run.timestamp.isoformat(),
            commit=run.git_commit,
            branch=run.git_branch,
            total_tests=metrics.total_tests,
            passed_tests=metrics.passed_tests,
            failed_tests=metrics.failed_tests,
            pass_rate=pass_rate,
            avg_score=metrics.avg_composite_score,
            failed_tests_table=self._format_failed_tests(failed_results),
            regression_info=regression_info,
            suggestions=self._generate_suggestions(failed_results),
            intent_breakdown=self._format_intent_breakdown(metrics),
        )

        # Create title
        title = f"BQAS: {metrics.failed_tests} Test-Failures ({run.git_commit})"

        try:
            # Use gh CLI to create issue
            result = await self._run_gh_async(
                self._issue_create_args(title, body, "bqas,automated,quality")
            )
        except Exception as e:
            # Best-effort boundary: issue creation must never break the caller.
            logger.error("Issue creation failed", error=str(e))
            return None

        if result.returncode != 0:
            logger.error("Failed to create issue", error=result.stderr)
            return None

        issue_url = result.stdout.strip()
        logger.info("GitHub issue created", url=issue_url)
        return issue_url

    async def create_regression_alert(
        self,
        current_score: float,
        previous_avg: float,
        delta: float,
        run: TestRun,
    ) -> Optional[str]:
        """
        Create a specific regression alert issue.

        Args:
            current_score: Current test score
            previous_avg: Average of previous runs
            delta: Score difference
            run: Current test run

        Returns:
            Issue URL if created, None otherwise
        """
        if not self.config.github_repo:
            return None

        # Format the magnitude so the rendered value carries exactly one minus
        # sign even if a caller passes the delta already negated.
        drop = abs(delta)

        body = f"""## Regression Alert

**Current Score:** {current_score:.3f}
**Previous Average:** {previous_avg:.3f}
**Delta:** -{drop:.3f}

### Context

- **Commit:** {run.git_commit}
- **Branch:** {run.git_branch}
- **Timestamp:** {run.timestamp.isoformat()}

### Action Required

Die Testqualitaet ist signifikant gefallen. Bitte pruefen:

1. Letzte Commits auf moegliche Regressionen
2. Intent-Router Patterns
3. LLM Responses
4. Edge Cases

---
_Automatisch generiert von BQAS_
"""

        title = f"🔴 BQAS Regression: Score -{drop:.3f}"

        try:
            result = await self._run_gh_async(
                self._issue_create_args(title, body, "bqas,regression,urgent")
            )
        except Exception as e:
            # Best-effort boundary: alerting must never break the caller.
            logger.error("Regression alert creation failed", error=str(e))
            return None

        if result.returncode != 0:
            logger.error("Regression alert creation failed", error=result.stderr)
            return None

        return result.stdout.strip()

    def list_bqas_issues(self) -> List[dict]:
        """List existing BQAS issues.

        Returns:
            The issues labelled ``bqas`` as decoded from gh's JSON output
            (number, title, state, createdAt), or an empty list when the repo
            is not configured or the gh call fails.
        """
        if not self.config.github_repo:
            return []

        try:
            result = self._run_gh([
                "issue", "list",
                "--repo", self.config.github_repo,
                "--label", "bqas",
                "--json", "number,title,state,createdAt",
            ])
            if result.returncode == 0:
                return json.loads(result.stdout)
            logger.error("Failed to list issues", error=result.stderr)
        except Exception as e:
            # Covers a missing gh binary and malformed JSON output alike.
            logger.error("Failed to list issues", error=str(e))

        return []