Initial commit: breakpilot-lehrer - Lehrer KI Platform

Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website,
Klausur-Service, School-Service, Voice-Service, Geo-Service,
BreakPilot Drive, Agent-Core

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Boenisch
2026-02-11 23:47:26 +01:00
commit 5a31f52310
1224 changed files with 425430 additions and 0 deletions

View File

@@ -0,0 +1,324 @@
"""
Backlog Generator
Automatically creates GitHub issues for test failures and regressions
"""
import subprocess
import json
import structlog
from typing import Optional, List
from datetime import datetime
from bqas.config import BQASConfig
from bqas.regression_tracker import TestRun
from bqas.metrics import TestResult, BQASMetrics
logger = structlog.get_logger(__name__)
# Markdown body template for the GitHub issue created by create_issue().
# Placeholders are filled via str.format(); the table / regression / intent
# sections are pre-rendered markdown strings, not raw values.
ISSUE_TEMPLATE = """## BQAS Test Failure Report
**Test Run:** {timestamp}
**Git Commit:** {commit}
**Git Branch:** {branch}
### Summary
- **Total Tests:** {total_tests}
- **Passed:** {passed_tests}
- **Failed:** {failed_tests}
- **Pass Rate:** {pass_rate:.1f}%
- **Average Score:** {avg_score:.3f}/5
### Failed Tests
{failed_tests_table}
### Regression Alert
{regression_info}
### Suggested Actions
{suggestions}
### By Intent
{intent_breakdown}
---
_Automatisch generiert von BQAS (Breakpilot Quality Assurance System)_
"""
# One markdown table row per failed test; the column order must match the
# header row emitted in BacklogGenerator._format_failed_tests().
FAILED_TEST_ROW = """| {test_id} | {test_name} | {expected} | {detected} | {score} | {reasoning} |"""
class BacklogGenerator:
    """
    Generates GitHub issues for test failures.

    All GitHub interaction goes through the ``gh`` CLI, so the host must
    have ``gh`` installed and authenticated (``gh auth status`` must pass).
    Every public method degrades gracefully: on any failure it logs and
    returns ``None`` / ``[]`` instead of raising.
    """

    def __init__(self, config: Optional[BQASConfig] = None):
        """
        Args:
            config: BQAS configuration. Falls back to environment variables
                via ``BQASConfig.from_env()`` when not supplied.
        """
        self.config = config or BQASConfig.from_env()

    def _check_gh_available(self) -> bool:
        """Return True if the gh CLI is installed and authenticated."""
        try:
            result = subprocess.run(
                ["gh", "auth", "status"],
                capture_output=True,
                text=True,
            )
            return result.returncode == 0
        except FileNotFoundError:
            # gh binary is not installed at all.
            return False

    @staticmethod
    def _escape_cell(text: str) -> str:
        """Escape pipe characters so free text cannot break a markdown table."""
        return text.replace("|", "\\|")

    def _format_failed_tests(self, results: List[TestResult]) -> str:
        """
        Format failed tests as a markdown table.

        Args:
            results: Failed test results; only the first 20 are rendered in
                full, the remainder is summarized in a trailing row.

        Returns:
            Markdown table string, or a placeholder when there are no failures.
        """
        if not results:
            return "_Keine fehlgeschlagenen Tests_"
        lines = [
            "| Test ID | Name | Expected | Detected | Score | Reason |",
            "|---------|------|----------|----------|-------|--------|",
        ]
        for r in results[:20]:  # cap at 20 rows to keep the issue readable
            reasoning = r.reasoning[:50] + "..." if len(r.reasoning) > 50 else r.reasoning
            lines.append(FAILED_TEST_ROW.format(
                test_id=r.test_id,
                # Escape pipes in free-text cells; an unescaped "|" would
                # silently corrupt the table layout.
                test_name=self._escape_cell(r.test_name[:30]),
                expected=r.expected_intent,
                detected=r.detected_intent,
                score=f"{r.composite_score:.2f}",
                reasoning=self._escape_cell(reasoning),
            ))
        if len(results) > 20:
            lines.append(f"| ... | _und {len(results) - 20} weitere_ | | | | |")
        return "\n".join(lines)

    def _generate_suggestions(self, results: List[TestResult]) -> str:
        """
        Generate improvement suggestions (markdown checklist) from failure
        patterns: worst intent, low intent accuracy, safety failures, and
        low coherence scores.
        """
        suggestions = []
        # Count failures per expected intent to spot systematic misrouting.
        intent_failures: dict = {}
        for r in results:
            intent_failures[r.expected_intent] = intent_failures.get(r.expected_intent, 0) + 1
        # Most problematic intent first.
        sorted_intents = sorted(intent_failures.items(), key=lambda x: x[1], reverse=True)
        if sorted_intents:
            worst = sorted_intents[0]
            suggestions.append(f"- [ ] **Intent '{worst[0]}'** hat {worst[1]} Fehler - Muster ueberpruefen")
        # Tests where intent detection itself was mostly wrong (<50%).
        low_accuracy = [r for r in results if r.intent_accuracy < 50]
        if low_accuracy:
            suggestions.append(f"- [ ] {len(low_accuracy)} Tests mit niedriger Intent-Genauigkeit (<50%) - Patterns erweitern")
        # Safety failures are always called out explicitly.
        safety_fails = [r for r in results if r.safety == "fail"]
        if safety_fails:
            suggestions.append(f"- [ ] **{len(safety_fails)} Safety-Failures** - PII-Filter pruefen")
        # Coherence below 3 (of 5) points at response generation problems.
        low_coherence = [r for r in results if r.coherence < 3]
        if low_coherence:
            suggestions.append(f"- [ ] {len(low_coherence)} Tests mit niedriger Kohaerenz - Response-Generierung pruefen")
        if not suggestions:
            # No pattern matched: fall back to a generic action item.
            suggestions.append("- [ ] Detaillierte Analyse der Fehler durchfuehren")
        return "\n".join(suggestions)

    def _format_intent_breakdown(self, metrics: BQASMetrics) -> str:
        """Format per-intent scores as a markdown table, worst intent first."""
        if not metrics.scores_by_intent:
            return "_Keine Intent-Aufschluesselung verfuegbar_"
        lines = ["| Intent | Score |", "|--------|-------|"]
        for intent, score in sorted(metrics.scores_by_intent.items(), key=lambda x: x[1]):
            # Traffic-light marker: red < 3.0 <= yellow < 4.0 <= green.
            emoji = "🔴" if score < 3.0 else "🟡" if score < 4.0 else "🟢"
            lines.append(f"| {emoji} {intent} | {score:.3f} |")
        return "\n".join(lines)

    async def create_issue(
        self,
        run: TestRun,
        metrics: BQASMetrics,
        failed_results: List[TestResult],
        regression_delta: float = 0.0,
    ) -> Optional[str]:
        """
        Create a GitHub issue for test failures.

        NOTE(review): declared async but uses blocking subprocess.run — this
        blocks the event loop while gh runs; consider asyncio.to_thread.

        Args:
            run: Test run record
            metrics: Aggregated metrics
            failed_results: List of failed test results
            regression_delta: Score regression amount (>0 means regression)
        Returns:
            Issue URL if created, None otherwise
        """
        if not self.config.github_repo:
            logger.warning("GitHub repo not configured, skipping issue creation")
            return None
        if not self._check_gh_available():
            logger.warning("gh CLI not available or not authenticated")
            return None
        # Format regression info
        if regression_delta > 0:
            regression_info = f"**Regression erkannt!** Score um **{regression_delta:.3f}** gefallen."
        else:
            regression_info = "Keine signifikante Regression."
        # Build issue body from the module-level template.
        body = ISSUE_TEMPLATE.format(
            timestamp=run.timestamp.isoformat(),
            commit=run.git_commit,
            branch=run.git_branch,
            total_tests=metrics.total_tests,
            passed_tests=metrics.passed_tests,
            failed_tests=metrics.failed_tests,
            # Guard against division by zero on an empty run.
            pass_rate=(metrics.passed_tests / metrics.total_tests * 100) if metrics.total_tests > 0 else 0,
            avg_score=metrics.avg_composite_score,
            failed_tests_table=self._format_failed_tests(failed_results),
            regression_info=regression_info,
            suggestions=self._generate_suggestions(failed_results),
            intent_breakdown=self._format_intent_breakdown(metrics),
        )
        title = f"BQAS: {metrics.failed_tests} Test-Failures ({run.git_commit})"
        try:
            # gh accepts comma-separated labels in a single --label flag.
            result = subprocess.run(
                [
                    "gh", "issue", "create",
                    "--repo", self.config.github_repo,
                    "--title", title,
                    "--body", body,
                    "--label", "bqas,automated,quality",
                ],
                capture_output=True,
                text=True,
            )
            if result.returncode == 0:
                # gh prints the new issue URL on stdout.
                issue_url = result.stdout.strip()
                logger.info("GitHub issue created", url=issue_url)
                return issue_url
            else:
                logger.error("Failed to create issue", error=result.stderr)
                return None
        except Exception as e:
            logger.error("Issue creation failed", error=str(e))
            return None

    async def create_regression_alert(
        self,
        current_score: float,
        previous_avg: float,
        delta: float,
        run: TestRun,
    ) -> Optional[str]:
        """
        Create a specific regression alert issue.

        Args:
            current_score: Current test score
            previous_avg: Average of previous runs
            delta: Score difference (positive magnitude of the drop)
            run: Current test run
        Returns:
            Issue URL if created, None otherwise
        """
        if not self.config.github_repo:
            return None
        # Consistent with create_issue(): bail out early if gh is unusable
        # instead of failing inside subprocess.
        if not self._check_gh_available():
            logger.warning("gh CLI not available or not authenticated")
            return None
        body = f"""## Regression Alert
**Current Score:** {current_score:.3f}
**Previous Average:** {previous_avg:.3f}
**Delta:** -{delta:.3f}
### Context
- **Commit:** {run.git_commit}
- **Branch:** {run.git_branch}
- **Timestamp:** {run.timestamp.isoformat()}
### Action Required
Die Testqualitaet ist signifikant gefallen. Bitte pruefen:
1. Letzte Commits auf moegliche Regressionen
2. Intent-Router Patterns
3. LLM Responses
4. Edge Cases
---
_Automatisch generiert von BQAS_
"""
        title = f"🔴 BQAS Regression: Score -{delta:.3f}"
        try:
            result = subprocess.run(
                [
                    "gh", "issue", "create",
                    "--repo", self.config.github_repo,
                    "--title", title,
                    "--body", body,
                    "--label", "bqas,regression,urgent",
                ],
                capture_output=True,
                text=True,
            )
            if result.returncode == 0:
                return result.stdout.strip()
            # Previously the stderr of a failed gh call was silently dropped.
            logger.error("Failed to create regression alert", error=result.stderr)
        except Exception as e:
            logger.error("Regression alert creation failed", error=str(e))
        return None

    def list_bqas_issues(self) -> List[dict]:
        """
        List existing BQAS-labelled issues in the configured repo.

        Returns:
            List of dicts with keys number/title/state/createdAt (as emitted
            by ``gh issue list --json``); empty list on any failure.
        """
        if not self.config.github_repo:
            return []
        try:
            result = subprocess.run(
                [
                    "gh", "issue", "list",
                    "--repo", self.config.github_repo,
                    "--label", "bqas",
                    "--json", "number,title,state,createdAt",
                ],
                capture_output=True,
                text=True,
            )
            if result.returncode == 0:
                return json.loads(result.stdout)
            # Surface gh's own error message instead of dropping it.
            logger.error("Failed to list issues", error=result.stderr)
        except Exception as e:
            logger.error("Failed to list issues", error=str(e))
        return []