This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
breakpilot-pwa/voice-service/bqas/backlog_generator.py
BreakPilot Dev 19855efacc
Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
feat: BreakPilot PWA - Full codebase (clean push without large binaries)
All services: admin-v2, studio-v2, website, ai-compliance-sdk,
consent-service, klausur-service, voice-service, and infrastructure.
Large PDFs and compiled binaries excluded via .gitignore.
2026-02-11 13:25:58 +01:00

325 lines
9.7 KiB
Python

"""
Backlog Generator
Automatically creates GitHub issues for test failures and regressions
"""
import json
import subprocess
from collections import Counter
from datetime import datetime
from typing import List, Optional

import structlog

from bqas.config import BQASConfig
from bqas.metrics import BQASMetrics, TestResult
from bqas.regression_tracker import TestRun
logger = structlog.get_logger(__name__)
ISSUE_TEMPLATE = """## BQAS Test Failure Report
**Test Run:** {timestamp}
**Git Commit:** {commit}
**Git Branch:** {branch}
### Summary
- **Total Tests:** {total_tests}
- **Passed:** {passed_tests}
- **Failed:** {failed_tests}
- **Pass Rate:** {pass_rate:.1f}%
- **Average Score:** {avg_score:.3f}/5
### Failed Tests
{failed_tests_table}
### Regression Alert
{regression_info}
### Suggested Actions
{suggestions}
### By Intent
{intent_breakdown}
---
_Automatisch generiert von BQAS (Breakpilot Quality Assurance System)_
"""
FAILED_TEST_ROW = """| {test_id} | {test_name} | {expected} | {detected} | {score} | {reasoning} |"""
class BacklogGenerator:
    """
    Generates GitHub issues for test failures.

    Uses the ``gh`` CLI for GitHub integration, so authentication and
    token handling are delegated entirely to ``gh auth`` — this class
    never touches credentials itself.
    """

    # Maximum number of failed tests rendered in the issue table; anything
    # beyond this is summarized in a single trailing "... und N weitere" row.
    MAX_TABLE_ROWS = 20

    def __init__(self, config: Optional[BQASConfig] = None):
        """
        Args:
            config: BQAS configuration. Defaults to values read from the
                environment via ``BQASConfig.from_env()``.
        """
        self.config = config or BQASConfig.from_env()

    def _check_gh_available(self) -> bool:
        """Check if the gh CLI is installed and authenticated.

        Returns:
            True if ``gh auth status`` exits 0 (authenticated), False if the
            check fails or the ``gh`` binary is not on PATH.
        """
        try:
            # `gh auth status` exits non-zero when not logged in.
            result = subprocess.run(
                ["gh", "auth", "status"],
                capture_output=True,
                text=True,
            )
        except FileNotFoundError:
            # gh binary is not installed / not on PATH.
            return False
        return result.returncode == 0

    def _format_failed_tests(self, results: List[TestResult]) -> str:
        """Format failed tests as a markdown table.

        Args:
            results: Failed test results to render.

        Returns:
            A markdown table capped at ``MAX_TABLE_ROWS`` rows, or a German
            placeholder line if there are no failures.
        """
        if not results:
            return "_Keine fehlgeschlagenen Tests_"
        lines = [
            "| Test ID | Name | Expected | Detected | Score | Reason |",
            "|---------|------|----------|----------|-------|--------|",
        ]
        for r in results[:self.MAX_TABLE_ROWS]:
            # Truncate free-text fields so one verbose test cannot blow up
            # the table layout.
            reasoning = r.reasoning[:50] + "..." if len(r.reasoning) > 50 else r.reasoning
            lines.append(FAILED_TEST_ROW.format(
                test_id=r.test_id,
                test_name=r.test_name[:30],
                expected=r.expected_intent,
                detected=r.detected_intent,
                score=f"{r.composite_score:.2f}",
                reasoning=reasoning,
            ))
        if len(results) > self.MAX_TABLE_ROWS:
            lines.append(f"| ... | _und {len(results) - self.MAX_TABLE_ROWS} weitere_ | | | | |")
        return "\n".join(lines)

    def _generate_suggestions(self, results: List[TestResult]) -> str:
        """Generate improvement suggestions based on failure patterns.

        Args:
            results: Failed test results to analyze.

        Returns:
            A markdown checklist (never empty — falls back to a generic
            "analyze in detail" item).
        """
        suggestions = []
        # Count failures per expected intent to find the worst offender.
        # Counter.most_common is stable, so tie-breaking matches insertion
        # order just like the previous sorted(..., reverse=True) approach.
        intent_failures = Counter(r.expected_intent for r in results)
        if intent_failures:
            worst_intent, worst_count = intent_failures.most_common(1)[0]
            suggestions.append(f"- [ ] **Intent '{worst_intent}'** hat {worst_count} Fehler - Muster ueberpruefen")
        # Tests where the intent router was mostly wrong (<50% accuracy).
        low_accuracy = [r for r in results if r.intent_accuracy < 50]
        if low_accuracy:
            suggestions.append(f"- [ ] {len(low_accuracy)} Tests mit niedriger Intent-Genauigkeit (<50%) - Patterns erweitern")
        # Safety failures are flagged in bold — they may indicate PII leaks.
        safety_fails = [r for r in results if r.safety == "fail"]
        if safety_fails:
            suggestions.append(f"- [ ] **{len(safety_fails)} Safety-Failures** - PII-Filter pruefen")
        # Low-coherence responses point at response generation, not routing.
        low_coherence = [r for r in results if r.coherence < 3]
        if low_coherence:
            suggestions.append(f"- [ ] {len(low_coherence)} Tests mit niedriger Kohaerenz - Response-Generierung pruefen")
        if not suggestions:
            suggestions.append("- [ ] Detaillierte Analyse der Fehler durchfuehren")
        return "\n".join(suggestions)

    def _format_intent_breakdown(self, metrics: BQASMetrics) -> str:
        """Format per-intent scores as a markdown table, worst first.

        Traffic-light emoji thresholds: red < 3.0, yellow < 4.0, else green.
        """
        if not metrics.scores_by_intent:
            return "_Keine Intent-Aufschluesselung verfuegbar_"
        lines = ["| Intent | Score |", "|--------|-------|"]
        # Ascending by score so the most problematic intents appear on top.
        for intent, score in sorted(metrics.scores_by_intent.items(), key=lambda x: x[1]):
            emoji = "🔴" if score < 3.0 else "🟡" if score < 4.0 else "🟢"
            lines.append(f"| {emoji} {intent} | {score:.3f} |")
        return "\n".join(lines)

    async def create_issue(
        self,
        run: TestRun,
        metrics: BQASMetrics,
        failed_results: List[TestResult],
        regression_delta: float = 0.0,
    ) -> Optional[str]:
        """
        Create a GitHub issue for test failures.

        Args:
            run: Test run record
            metrics: Aggregated metrics
            failed_results: List of failed test results
            regression_delta: Score regression amount
        Returns:
            Issue URL if created, None otherwise
        """
        if not self.config.github_repo:
            logger.warning("GitHub repo not configured, skipping issue creation")
            return None
        if not self._check_gh_available():
            logger.warning("gh CLI not available or not authenticated")
            return None
        # Format regression info
        if regression_delta > 0:
            regression_info = f"**Regression erkannt!** Score um **{regression_delta:.3f}** gefallen."
        else:
            regression_info = "Keine signifikante Regression."
        # Build issue body (guard against division by zero on empty runs).
        body = ISSUE_TEMPLATE.format(
            timestamp=run.timestamp.isoformat(),
            commit=run.git_commit,
            branch=run.git_branch,
            total_tests=metrics.total_tests,
            passed_tests=metrics.passed_tests,
            failed_tests=metrics.failed_tests,
            pass_rate=(metrics.passed_tests / metrics.total_tests * 100) if metrics.total_tests > 0 else 0,
            avg_score=metrics.avg_composite_score,
            failed_tests_table=self._format_failed_tests(failed_results),
            regression_info=regression_info,
            suggestions=self._generate_suggestions(failed_results),
            intent_breakdown=self._format_intent_breakdown(metrics),
        )
        title = f"BQAS: {metrics.failed_tests} Test-Failures ({run.git_commit})"
        try:
            # NOTE(review): subprocess.run blocks the event loop inside this
            # async method — acceptable for a CI-side tool, but consider
            # asyncio.to_thread if this ever runs inside a request handler.
            result = subprocess.run(
                [
                    "gh", "issue", "create",
                    "--repo", self.config.github_repo,
                    "--title", title,
                    "--body", body,
                    "--label", "bqas,automated,quality",
                ],
                capture_output=True,
                text=True,
            )
            if result.returncode == 0:
                # gh prints the new issue URL on stdout.
                issue_url = result.stdout.strip()
                logger.info("GitHub issue created", url=issue_url)
                return issue_url
            else:
                logger.error("Failed to create issue", error=result.stderr)
                return None
        except Exception as e:
            # Best-effort: issue creation must never crash a test run.
            logger.error("Issue creation failed", error=str(e))
            return None

    async def create_regression_alert(
        self,
        current_score: float,
        previous_avg: float,
        delta: float,
        run: TestRun,
    ) -> Optional[str]:
        """
        Create a specific regression alert issue.

        Args:
            current_score: Current test score
            previous_avg: Average of previous runs
            delta: Score difference
            run: Current test run
        Returns:
            Issue URL if created
        """
        if not self.config.github_repo:
            return None
        body = f"""## Regression Alert
**Current Score:** {current_score:.3f}
**Previous Average:** {previous_avg:.3f}
**Delta:** -{delta:.3f}
### Context
- **Commit:** {run.git_commit}
- **Branch:** {run.git_branch}
- **Timestamp:** {run.timestamp.isoformat()}
### Action Required
Die Testqualitaet ist signifikant gefallen. Bitte pruefen:
1. Letzte Commits auf moegliche Regressionen
2. Intent-Router Patterns
3. LLM Responses
4. Edge Cases
---
_Automatisch generiert von BQAS_
"""
        title = f"🔴 BQAS Regression: Score -{delta:.3f}"
        try:
            result = subprocess.run(
                [
                    "gh", "issue", "create",
                    "--repo", self.config.github_repo,
                    "--title", title,
                    "--body", body,
                    "--label", "bqas,regression,urgent",
                ],
                capture_output=True,
                text=True,
            )
            if result.returncode == 0:
                return result.stdout.strip()
            # Previously this failure path was silent; log it for parity
            # with create_issue.
            logger.error("Failed to create regression alert", error=result.stderr)
        except Exception as e:
            logger.error("Regression alert creation failed", error=str(e))
        return None

    def list_bqas_issues(self) -> List[dict]:
        """List existing BQAS-labeled issues in the configured repo.

        Returns:
            Parsed ``gh issue list`` JSON (number, title, state, createdAt),
            or an empty list if the repo is unconfigured or the call fails.
        """
        if not self.config.github_repo:
            return []
        try:
            result = subprocess.run(
                [
                    "gh", "issue", "list",
                    "--repo", self.config.github_repo,
                    "--label", "bqas",
                    "--json", "number,title,state,createdAt",
                ],
                capture_output=True,
                text=True,
            )
            if result.returncode == 0:
                return json.loads(result.stdout)
        except Exception as e:
            # Covers both subprocess errors and malformed JSON output.
            logger.error("Failed to list issues", error=str(e))
        return []