Initial commit: breakpilot-lehrer - Lehrer KI Platform

Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website,
Klausur-Service, School-Service, Voice-Service, Geo-Service,
BreakPilot Drive, Agent-Core

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Boenisch
2026-02-11 23:47:26 +01:00
commit 5a31f52310
1224 changed files with 425430 additions and 0 deletions

View File

@@ -0,0 +1,324 @@
"""
Backlog Generator
Automatically creates GitHub issues for test failures and regressions
"""
import subprocess
import json
import structlog
from typing import Optional, List
from datetime import datetime
from bqas.config import BQASConfig
from bqas.regression_tracker import TestRun
from bqas.metrics import TestResult, BQASMetrics
logger = structlog.get_logger(__name__)
# Markdown body template for the GitHub issue created by create_issue().
# Placeholders are filled via str.format(); the table / regression / intent
# sections are pre-rendered markdown strings, not raw values.
ISSUE_TEMPLATE = """## BQAS Test Failure Report
**Test Run:** {timestamp}
**Git Commit:** {commit}
**Git Branch:** {branch}
### Summary
- **Total Tests:** {total_tests}
- **Passed:** {passed_tests}
- **Failed:** {failed_tests}
- **Pass Rate:** {pass_rate:.1f}%
- **Average Score:** {avg_score:.3f}/5
### Failed Tests
{failed_tests_table}
### Regression Alert
{regression_info}
### Suggested Actions
{suggestions}
### By Intent
{intent_breakdown}
---
_Automatisch generiert von BQAS (Breakpilot Quality Assurance System)_
"""
# One markdown table row per failed test; the column order must match the
# header row emitted in BacklogGenerator._format_failed_tests().
FAILED_TEST_ROW = """| {test_id} | {test_name} | {expected} | {detected} | {score} | {reasoning} |"""
class BacklogGenerator:
    """
    Generates GitHub issues for test failures.

    All GitHub interaction goes through the ``gh`` CLI, so the host must
    have ``gh`` installed and authenticated (``gh auth status`` must pass).
    Every public method degrades gracefully: on any failure it logs and
    returns ``None`` / ``[]`` instead of raising.
    """

    def __init__(self, config: Optional[BQASConfig] = None):
        """
        Args:
            config: BQAS configuration. Falls back to environment variables
                via ``BQASConfig.from_env()`` when not supplied.
        """
        self.config = config or BQASConfig.from_env()

    def _check_gh_available(self) -> bool:
        """Return True if the gh CLI is installed and authenticated."""
        try:
            result = subprocess.run(
                ["gh", "auth", "status"],
                capture_output=True,
                text=True,
            )
            return result.returncode == 0
        except FileNotFoundError:
            # gh binary is not installed at all.
            return False

    @staticmethod
    def _escape_cell(text: str) -> str:
        """Escape pipe characters so free text cannot break a markdown table."""
        return text.replace("|", "\\|")

    def _format_failed_tests(self, results: List[TestResult]) -> str:
        """
        Format failed tests as a markdown table.

        Args:
            results: Failed test results; only the first 20 are rendered in
                full, the remainder is summarized in a trailing row.

        Returns:
            Markdown table string, or a placeholder when there are no failures.
        """
        if not results:
            return "_Keine fehlgeschlagenen Tests_"
        lines = [
            "| Test ID | Name | Expected | Detected | Score | Reason |",
            "|---------|------|----------|----------|-------|--------|",
        ]
        for r in results[:20]:  # cap at 20 rows to keep the issue readable
            reasoning = r.reasoning[:50] + "..." if len(r.reasoning) > 50 else r.reasoning
            lines.append(FAILED_TEST_ROW.format(
                test_id=r.test_id,
                # Escape pipes in free-text cells; an unescaped "|" would
                # silently corrupt the table layout.
                test_name=self._escape_cell(r.test_name[:30]),
                expected=r.expected_intent,
                detected=r.detected_intent,
                score=f"{r.composite_score:.2f}",
                reasoning=self._escape_cell(reasoning),
            ))
        if len(results) > 20:
            lines.append(f"| ... | _und {len(results) - 20} weitere_ | | | | |")
        return "\n".join(lines)

    def _generate_suggestions(self, results: List[TestResult]) -> str:
        """
        Generate improvement suggestions (markdown checklist) from failure
        patterns: worst intent, low intent accuracy, safety failures, and
        low coherence scores.
        """
        suggestions = []
        # Count failures per expected intent to spot systematic misrouting.
        intent_failures: dict = {}
        for r in results:
            intent_failures[r.expected_intent] = intent_failures.get(r.expected_intent, 0) + 1
        # Most problematic intent first.
        sorted_intents = sorted(intent_failures.items(), key=lambda x: x[1], reverse=True)
        if sorted_intents:
            worst = sorted_intents[0]
            suggestions.append(f"- [ ] **Intent '{worst[0]}'** hat {worst[1]} Fehler - Muster ueberpruefen")
        # Tests where intent detection itself was mostly wrong (<50%).
        low_accuracy = [r for r in results if r.intent_accuracy < 50]
        if low_accuracy:
            suggestions.append(f"- [ ] {len(low_accuracy)} Tests mit niedriger Intent-Genauigkeit (<50%) - Patterns erweitern")
        # Safety failures are always called out explicitly.
        safety_fails = [r for r in results if r.safety == "fail"]
        if safety_fails:
            suggestions.append(f"- [ ] **{len(safety_fails)} Safety-Failures** - PII-Filter pruefen")
        # Coherence below 3 (of 5) points at response generation problems.
        low_coherence = [r for r in results if r.coherence < 3]
        if low_coherence:
            suggestions.append(f"- [ ] {len(low_coherence)} Tests mit niedriger Kohaerenz - Response-Generierung pruefen")
        if not suggestions:
            # No pattern matched: fall back to a generic action item.
            suggestions.append("- [ ] Detaillierte Analyse der Fehler durchfuehren")
        return "\n".join(suggestions)

    def _format_intent_breakdown(self, metrics: BQASMetrics) -> str:
        """Format per-intent scores as a markdown table, worst intent first."""
        if not metrics.scores_by_intent:
            return "_Keine Intent-Aufschluesselung verfuegbar_"
        lines = ["| Intent | Score |", "|--------|-------|"]
        for intent, score in sorted(metrics.scores_by_intent.items(), key=lambda x: x[1]):
            # Traffic-light marker: red < 3.0 <= yellow < 4.0 <= green.
            emoji = "🔴" if score < 3.0 else "🟡" if score < 4.0 else "🟢"
            lines.append(f"| {emoji} {intent} | {score:.3f} |")
        return "\n".join(lines)

    async def create_issue(
        self,
        run: TestRun,
        metrics: BQASMetrics,
        failed_results: List[TestResult],
        regression_delta: float = 0.0,
    ) -> Optional[str]:
        """
        Create a GitHub issue for test failures.

        NOTE(review): declared async but uses blocking subprocess.run — this
        blocks the event loop while gh runs; consider asyncio.to_thread.

        Args:
            run: Test run record
            metrics: Aggregated metrics
            failed_results: List of failed test results
            regression_delta: Score regression amount (>0 means regression)
        Returns:
            Issue URL if created, None otherwise
        """
        if not self.config.github_repo:
            logger.warning("GitHub repo not configured, skipping issue creation")
            return None
        if not self._check_gh_available():
            logger.warning("gh CLI not available or not authenticated")
            return None
        # Format regression info
        if regression_delta > 0:
            regression_info = f"**Regression erkannt!** Score um **{regression_delta:.3f}** gefallen."
        else:
            regression_info = "Keine signifikante Regression."
        # Build issue body from the module-level template.
        body = ISSUE_TEMPLATE.format(
            timestamp=run.timestamp.isoformat(),
            commit=run.git_commit,
            branch=run.git_branch,
            total_tests=metrics.total_tests,
            passed_tests=metrics.passed_tests,
            failed_tests=metrics.failed_tests,
            # Guard against division by zero on an empty run.
            pass_rate=(metrics.passed_tests / metrics.total_tests * 100) if metrics.total_tests > 0 else 0,
            avg_score=metrics.avg_composite_score,
            failed_tests_table=self._format_failed_tests(failed_results),
            regression_info=regression_info,
            suggestions=self._generate_suggestions(failed_results),
            intent_breakdown=self._format_intent_breakdown(metrics),
        )
        title = f"BQAS: {metrics.failed_tests} Test-Failures ({run.git_commit})"
        try:
            # gh accepts comma-separated labels in a single --label flag.
            result = subprocess.run(
                [
                    "gh", "issue", "create",
                    "--repo", self.config.github_repo,
                    "--title", title,
                    "--body", body,
                    "--label", "bqas,automated,quality",
                ],
                capture_output=True,
                text=True,
            )
            if result.returncode == 0:
                # gh prints the new issue URL on stdout.
                issue_url = result.stdout.strip()
                logger.info("GitHub issue created", url=issue_url)
                return issue_url
            else:
                logger.error("Failed to create issue", error=result.stderr)
                return None
        except Exception as e:
            logger.error("Issue creation failed", error=str(e))
            return None

    async def create_regression_alert(
        self,
        current_score: float,
        previous_avg: float,
        delta: float,
        run: TestRun,
    ) -> Optional[str]:
        """
        Create a specific regression alert issue.

        Args:
            current_score: Current test score
            previous_avg: Average of previous runs
            delta: Score difference (positive magnitude of the drop)
            run: Current test run
        Returns:
            Issue URL if created, None otherwise
        """
        if not self.config.github_repo:
            return None
        # Consistent with create_issue(): bail out early if gh is unusable
        # instead of failing inside subprocess.
        if not self._check_gh_available():
            logger.warning("gh CLI not available or not authenticated")
            return None
        body = f"""## Regression Alert
**Current Score:** {current_score:.3f}
**Previous Average:** {previous_avg:.3f}
**Delta:** -{delta:.3f}
### Context
- **Commit:** {run.git_commit}
- **Branch:** {run.git_branch}
- **Timestamp:** {run.timestamp.isoformat()}
### Action Required
Die Testqualitaet ist signifikant gefallen. Bitte pruefen:
1. Letzte Commits auf moegliche Regressionen
2. Intent-Router Patterns
3. LLM Responses
4. Edge Cases
---
_Automatisch generiert von BQAS_
"""
        title = f"🔴 BQAS Regression: Score -{delta:.3f}"
        try:
            result = subprocess.run(
                [
                    "gh", "issue", "create",
                    "--repo", self.config.github_repo,
                    "--title", title,
                    "--body", body,
                    "--label", "bqas,regression,urgent",
                ],
                capture_output=True,
                text=True,
            )
            if result.returncode == 0:
                return result.stdout.strip()
            # Previously the stderr of a failed gh call was silently dropped.
            logger.error("Failed to create regression alert", error=result.stderr)
        except Exception as e:
            logger.error("Regression alert creation failed", error=str(e))
        return None

    def list_bqas_issues(self) -> List[dict]:
        """
        List existing BQAS-labelled issues in the configured repo.

        Returns:
            List of dicts with keys number/title/state/createdAt (as emitted
            by ``gh issue list --json``); empty list on any failure.
        """
        if not self.config.github_repo:
            return []
        try:
            result = subprocess.run(
                [
                    "gh", "issue", "list",
                    "--repo", self.config.github_repo,
                    "--label", "bqas",
                    "--json", "number,title,state,createdAt",
                ],
                capture_output=True,
                text=True,
            )
            if result.returncode == 0:
                return json.loads(result.stdout)
            # Surface gh's own error message instead of dropping it.
            logger.error("Failed to list issues", error=result.stderr)
        except Exception as e:
            logger.error("Failed to list issues", error=str(e))
        return []