fix: Restore all files lost during destructive rebase

A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-09 09:51:32 +01:00
parent f7487ee240
commit bfdaf63ba9
2009 changed files with 749983 additions and 1731 deletions

View File

@@ -0,0 +1,19 @@
"""
AI Processor - Vision Module
Scan analysis and HTML generation.
"""
from .scan_analyzer import (
analyze_scan_structure_with_ai,
describe_scan_with_ai,
remove_handwriting_from_scan,
)
from .html_builder import build_clean_html_from_analysis
__all__ = [
"analyze_scan_structure_with_ai",
"describe_scan_with_ai",
"remove_handwriting_from_scan",
"build_clean_html_from_analysis",
]

View File

@@ -0,0 +1,218 @@
"""
AI Processor - HTML Builder
Build clean HTML worksheets from analysis data.
"""
from pathlib import Path
import json
import logging
from ..config import BEREINIGT_DIR
logger = logging.getLogger(__name__)
def build_clean_html_from_analysis(analysis_path: Path) -> Path:
    """Build a clean, printable HTML worksheet from an analysis JSON file.

    Only printed content is rendered: ``printed_blocks`` when present,
    otherwise paragraphs split out of ``canonical_text``. Handwritten
    entries and struck-through words are deliberately omitted. The page
    uses an open-source font stack (Inter / Noto Sans).

    Args:
        analysis_path: Path to a ``*_analyse.json`` file.

    Returns:
        Path to the generated ``*_clean.html`` file in BEREINIGT_DIR.

    Raises:
        FileNotFoundError: If ``analysis_path`` does not exist.
        RuntimeError: If the file does not contain valid JSON.
    """
    # Stdlib only; imported locally so this fix is self-contained.
    import html as _html

    def esc(text: str) -> str:
        # Bug fix: every value below comes from model output (untrusted).
        # Without escaping, a stray '<', '>' or '&' breaks the generated
        # markup and allows HTML injection into the worksheet.
        return _html.escape(text)

    if not analysis_path.exists():
        raise FileNotFoundError(f"Analysedatei nicht gefunden: {analysis_path}")
    try:
        data = json.loads(analysis_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        raise RuntimeError(f"Analyse-Datei enthaelt kein gueltiges JSON: {analysis_path}\n{e}") from e

    title = data.get("title") or "Arbeitsblatt"
    subject = data.get("subject") or ""
    grade_level = data.get("grade_level") or ""
    instructions = data.get("instructions") or ""
    tasks = data.get("tasks", []) or []
    canonical_text = data.get("canonical_text") or ""
    printed_blocks = data.get("printed_blocks") or []
    struck = data.get("struck_through_words") or []

    html_parts = []
    html_parts.append("<!DOCTYPE html>")
    html_parts.append("<html lang='de'>")
    html_parts.append("<head>")
    html_parts.append("<meta charset='UTF-8'>")
    html_parts.append(f"<title>{esc(title)}</title>")
    html_parts.append(_get_html_styles())
    html_parts.append("</head>")
    html_parts.append("<body>")
    html_parts.append("<div class='page'>")

    # Header section
    html_parts.append(f"<h1>{esc(title)}</h1>")
    meta_bits = []
    if subject:
        meta_bits.append(f"Fach: {esc(subject)}")
    if grade_level:
        meta_bits.append(f"Klassenstufe: {esc(grade_level)}")
    if meta_bits:
        html_parts.append(f"<div class='meta'>{' | '.join(meta_bits)}</div>")
    if instructions:
        html_parts.append(
            f"<div class='instructions'><strong>Arbeitsanweisung:</strong> {esc(instructions)}</div>"
        )

    # Main text / printed blocks
    html_parts.append("<section class='text-blocks'>")
    if printed_blocks:
        for block in printed_blocks:
            role = (block.get("role") or "body").lower()
            text = (block.get("text") or "").strip()
            if not text:
                continue
            html_parts.append("<div class='text-block'>")
            if role == "title":
                html_parts.append(f"<div class='text-block-title'>{esc(text)}</div>")
            else:
                html_parts.append(f"<div>{esc(text)}</div>")
            html_parts.append("</div>")
    elif canonical_text:
        # Fallback: split canonical_text into paragraphs
        paragraphs = [
            p.strip()
            for p in canonical_text.replace("\r\n", "\n").split("\n\n")
            if p.strip()
        ]
        for p in paragraphs:
            html_parts.append(f"<div class='text-block'>{esc(p)}</div>")
    html_parts.append("</section>")

    # Tasks section
    if tasks:
        html_parts.append("<h2>Aufgaben</h2>")
        html_parts.append("<div class='task-list'>")
        for idx, task in enumerate(tasks, start=1):
            t_type = task.get("type") or "other"
            desc = task.get("description") or ""
            text_with_gaps = task.get("text_with_gaps")
            html_parts.append("<div class='task'>")
            html_parts.append(
                f"<div class='task-title'>Aufgabe {idx} ({esc(t_type)}): {esc(desc)}</div>"
            )
            if text_with_gaps:
                # Escape first (escaping never touches '_'), then turn the
                # '___' gap markers into visible fill-in lines.
                rendered = esc(text_with_gaps).replace("___", "<span class='gap-line'>&nbsp;</span>")
                html_parts.append(f"<div>{rendered}</div>")
            html_parts.append("</div>")
        html_parts.append("</div>")

    # Footer note
    if struck:
        html_parts.append(
            "<div class='footnote'>Hinweis: Einige im Original durchgestrichene Woerter wurden "
            "von der KI erkannt und NICHT in dieses saubere Arbeitsblatt uebernommen.</div>"
        )
    else:
        html_parts.append(
            "<div class='footnote'>Dieses Arbeitsblatt wurde automatisch aus einem Scan rekonstruiert "
            "und von handschriftlichen Eintragungen bereinigt.</div>"
        )

    html_parts.append("</div>")  # .page
    html_parts.append("</body></html>")

    html_content = "\n".join(html_parts)
    out_name = analysis_path.stem.replace("_analyse", "") + "_clean.html"
    out_path = BEREINIGT_DIR / out_name
    out_path.write_text(html_content, encoding="utf-8")
    return out_path
def _get_html_styles() -> str:
"""Get CSS styles for clean HTML output."""
return """
<style>
:root {
--font-main: "Inter", "Noto Sans", system-ui, -apple-system, BlinkMacSystemFont, sans-serif;
}
* { box-sizing: border-box; }
body {
font-family: var(--font-main);
margin: 32px;
line-height: 1.5;
font-size: 14px;
color: #111827;
}
.page {
max-width: 800px;
margin: 0 auto;
}
h1 {
font-size: 24px;
margin-bottom: 4px;
}
h2 {
font-size: 18px;
margin-top: 24px;
}
.meta {
font-size: 12px;
color: #6b7280;
margin-bottom: 16px;
}
.instructions {
margin-bottom: 20px;
padding: 8px 10px;
border-radius: 8px;
background: #eff6ff;
border: 1px solid #bfdbfe;
font-size: 13px;
}
.text-blocks {
margin-bottom: 24px;
}
.text-block {
margin-bottom: 8px;
}
.text-block-title {
font-weight: 600;
margin-bottom: 4px;
}
.task-list {
margin-top: 8px;
}
.task {
margin-bottom: 14px;
padding-bottom: 8px;
border-bottom: 1px dashed #e5e7eb;
}
.task-title {
font-weight: 600;
margin-bottom: 4px;
}
.gap-line {
display: inline-block;
border-bottom: 1px solid #000;
min-width: 80px;
margin: 0 4px;
}
.footnote {
margin-top: 24px;
font-size: 11px;
color: #9ca3af;
}
</style>
"""

View File

@@ -0,0 +1,307 @@
"""
AI Processor - Scan Analyzer
Vision-based analysis of worksheets using OpenAI and Claude APIs.
"""
from pathlib import Path
import json
import logging
import shutil
import requests
from ..config import (
VISION_API,
BEREINIGT_DIR,
get_openai_api_key,
)
from ..utils import encode_image_to_data_url
logger = logging.getLogger(__name__)
def describe_scan_with_ai(input_path: Path) -> Path:
    """Ask the vision model for a short description of the worksheet.

    Args:
        input_path: Path to the input image.

    Returns:
        Path to the generated ``*_beschreibung.txt`` file in BEREINIGT_DIR.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
        RuntimeError: If the API response has an unexpected structure.
        requests.HTTPError: If the API returns a non-2xx status.
    """
    if not input_path.exists():
        raise FileNotFoundError(f"Eingabedatei nicht gefunden: {input_path}")
    api_key = get_openai_api_key()
    image_data_url = encode_image_to_data_url(input_path)
    url = "https://api.openai.com/v1/chat/completions"
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    payload = {
        "model": "gpt-4o-mini",
        "messages": [
            {
                "role": "system",
                "content": "Du bist ein hilfreicher Assistent, der Schul-Arbeitsblaetter knapp beschreibt.",
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": (
                            "Beschreibe dieses Arbeitsblatt knapp: Thema, Art der Aufgaben "
                            "(z.B. Lueckentext, Multiple Choice, Rechenaufgaben) und groben Inhalt."
                        ),
                    },
                    {"type": "image_url", "image_url": {"url": image_data_url}},
                ],
            },
        ],
        "max_tokens": 400,
    }
    # Bug fix: requests has no default timeout — without one a stalled
    # connection would block this call forever.
    response = requests.post(url, headers=headers, json=payload, timeout=120)
    response.raise_for_status()
    data = response.json()
    try:
        description = data["choices"][0]["message"]["content"]
    except Exception as e:
        raise RuntimeError(f"Unerwartete Antwortstruktur von der KI: {e}\nAntwort: {data}") from e
    out_name = input_path.stem + "_beschreibung.txt"
    out_path = BEREINIGT_DIR / out_name
    out_path.write_text(description, encoding="utf-8")
    return out_path
def _analyze_with_openai(input_path: Path) -> Path:
    """Run a structured JSON analysis of the worksheet via OpenAI.

    The model is asked for (among other fields):
    - canonical_text: complete corrected text without handwriting
    - printed_blocks: structured blocks of printed text
    - handwritten_annotations: student handwritten notes
    - struck_through_words: crossed-out words

    Args:
        input_path: Path to the input image.

    Returns:
        Path to the ``*_analyse.json`` file written to BEREINIGT_DIR.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
        RuntimeError: If the response structure is unexpected or the model
            returned invalid JSON.
        requests.HTTPError: If the API returns a non-2xx status.
    """
    if not input_path.exists():
        raise FileNotFoundError(f"Eingabedatei nicht gefunden: {input_path}")
    api_key = get_openai_api_key()
    image_data_url = encode_image_to_data_url(input_path)
    url = "https://api.openai.com/v1/chat/completions"
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    system_prompt = (
        "Du bist ein Experte fuer die Analyse von Schul-Arbeitsblaettern.\n\n"
        "HAUPTAUFGABEN:\n"
        "1. Erkenne ALLE gedruckten Elemente: Text, Ueberschriften, Tabellen, Linien, Kaestchen, Diagramme, Illustrationen\n"
        "2. Identifiziere ALLE handschriftlichen Ergaenzungen: Antworten, Zahlen, Buchstaben, Notizen, Zeichnungen\n"
        "3. Bestimme praezise Positionen (Bounding Boxes in Pixeln) fuer JEDES Element\n\n"
        "KRITISCH - DIAGRAMME & ILLUSTRATIONEN:\n"
        "- Suche aktiv nach: anatomischen Zeichnungen, beschrifteten Diagrammen, Grafiken, Tabellen, Skizzen\n"
        "- Wenn du irgendeine bildliche Darstellung siehst (z.B. Auge, Pflanze, Karte, Schaubild), setze 'has_diagram: true'\n"
        "- Fuer JEDES visuelle Element: Erstelle einen Eintrag in 'diagram_elements' mit genauer Position\n"
        "- Beschrifte-Linien (von Beschriftung zu Bildteil) gehoeren zum Diagramm!\n\n"
        "HANDSCHRIFT ERKENNUNG:\n"
        "- Unterscheide gedruckt vs. handgeschrieben anhand der Schriftart\n"
        "- Klassifiziere Farbe: blau/schwarz/rot/pencil (Bleistift)\n"
        "- Durchgestrichene Woerter separat auflisten\n\n"
        "AUSGABE: Gib deine Antwort AUSSCHLIESSLICH als gueltiges JSON zurueck (kein Markdown, keine Code-Bloecke)."
    )
    user_text = _get_analysis_user_prompt()
    payload = {
        "model": "gpt-4o-mini",
        # Force a JSON object response; low temperature for deterministic layout output.
        "response_format": {"type": "json_object"},
        "messages": [
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": user_text},
                    {"type": "image_url", "image_url": {"url": image_data_url}},
                ],
            },
        ],
        "max_tokens": 2500,
        "temperature": 0.15,
    }
    # Bug fix: requests has no default timeout — without one a stalled
    # connection would block this call forever.
    response = requests.post(url, headers=headers, json=payload, timeout=180)
    response.raise_for_status()
    data = response.json()
    try:
        content = data["choices"][0]["message"]["content"]
    except Exception as e:
        raise RuntimeError(f"Unerwartete Antwortstruktur von der KI: {e}\nAntwort: {data}") from e
    try:
        obj = json.loads(content)
    except json.JSONDecodeError as e:
        raise RuntimeError(f"Modell hat ungueltiges JSON geliefert: {e}\nInhalt: {content}") from e
    out_name = input_path.stem + "_analyse.json"
    out_path = BEREINIGT_DIR / out_name
    out_path.write_text(json.dumps(obj, ensure_ascii=False, indent=2), encoding="utf-8")
    return out_path
def _analyze_with_claude(input_path: Path) -> Path:
    """Run the structured worksheet analysis through Claude Vision.

    Delegates to ``claude_vision.analyze_worksheet_with_claude`` (Claude
    3.5 Sonnet) and stores the resulting JSON in BEREINIGT_DIR.

    Returns:
        Path to the ``*_analyse.json`` file.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
        Exception: Re-raised after logging if analysis or saving fails.
    """
    from claude_vision import analyze_worksheet_with_claude

    if not input_path.exists():
        raise FileNotFoundError(f"Eingabedatei nicht gefunden: {input_path}")

    logger.info(f"Analyzing with Claude Vision: {input_path.name}")
    try:
        result = analyze_worksheet_with_claude(input_path, max_tokens=2500)
        target = BEREINIGT_DIR / (input_path.stem + "_analyse.json")
        target.write_text(
            json.dumps(result, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        logger.info(f"Claude analysis saved: {target.name}")
        return target
    except Exception as e:
        # Log for diagnosis, then let the caller decide on a fallback.
        logger.error(f"Claude analysis failed: {e}")
        raise
def analyze_scan_structure_with_ai(input_path: Path) -> Path:
    """Run a structured JSON analysis of the worksheet (hybrid mode).

    Uses the API configured in VISION_API:
    - "claude" (default): Claude 3.5 Sonnet — better OCR / layout detection
    - "openai": OpenAI GPT-4o-mini — cheaper, faster

    Switch via environment variable:
        export VISION_API="claude"  # or "openai"

    Returns:
        Path to the analysis JSON file.
    """
    logger.info(f"Using Vision API: {VISION_API}")
    if VISION_API == "openai":
        return _analyze_with_openai(input_path)
    if VISION_API != "claude":
        logger.warning(f"Unknown VISION_API '{VISION_API}', using Claude as default")
    # Consistency fix: previously only the explicit "claude" value got the
    # OpenAI fallback; an unknown value went straight to Claude and could
    # fail hard. Both Claude paths now share the same fallback chain.
    try:
        return _analyze_with_claude(input_path)
    except Exception as e:
        logger.warning(f"Claude failed, falling back to OpenAI: {e}")
        return _analyze_with_openai(input_path)
def remove_handwriting_from_scan(input_path: Path) -> Path:
    """Remove handwriting from a worksheet scan via AI-guided cleaning.

    Steps:
      1. Load the Stage-1 analysis JSON (running the analysis on demand
         if it does not exist yet).
      2. Apply WorksheetCleaner's multi-strategy cleaning, preserving
         diagrams and printed content.
      3. On cleaning failure, fall back to a plain copy of the original.

    Returns:
        Path to the cleaned image (``*_clean`` + original suffix).

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
    """
    if not input_path.exists():
        raise FileNotFoundError(f"Eingabedatei nicht gefunden: {input_path}")

    from image_cleaner import WorksheetCleaner

    # Stage-1 artifact; create it on demand when missing.
    analysis_file = BEREINIGT_DIR / (input_path.stem + "_analyse.json")
    if not analysis_file.exists():
        logger.info(f"Analysis not found for {input_path.name}, running analysis first")
        analysis_file = analyze_scan_structure_with_ai(input_path)

    # Corrupt analysis JSON degrades gracefully to an empty layout so the
    # cleaner can still run.
    try:
        analysis = json.loads(analysis_file.read_text(encoding='utf-8'))
    except json.JSONDecodeError as e:
        logger.error(f"Invalid analysis JSON: {analysis_file}\n{e}")
        analysis = {
            "layout": {"text_regions": [], "diagram_elements": []},
            "handwriting_regions": []
        }

    target = BEREINIGT_DIR / (input_path.stem + "_clean" + input_path.suffix)
    worksheet_cleaner = WorksheetCleaner(debug_mode=False)
    try:
        cleaned = worksheet_cleaner.clean_worksheet(input_path, analysis, target)
        logger.info(f"Successfully cleaned {input_path.name}")
        return cleaned
    except Exception as e:
        # Best-effort fallback: ship the original scan unchanged.
        logger.error(f"Cleaning failed for {input_path.name}, using original: {e}")
        shutil.copy2(input_path, target)
        return target
def _get_analysis_user_prompt() -> str:
"""Get the user prompt for worksheet analysis."""
return (
"Analysiere dieses Arbeitsblatt und gib ein JSON mit folgendem Aufbau zurueck:\n\n"
"{\n"
' "title": string | null,\n'
' "subject": string | null,\n'
' "grade_level": string | null,\n'
' "instructions": string | null,\n'
' "canonical_text": string | null,\n'
' "printed_blocks": [\n'
" {\n"
' "id": string,\n'
' "role": "title" | "instructions" | "body" | "other",\n'
' "text": string\n'
" }\n"
" ],\n"
' "layout": {\n'
' "page_structure": {\n'
' "has_diagram": boolean,\n'
' "orientation": "portrait" | "landscape"\n'
" },\n"
' "text_regions": [...],\n'
' "diagram_elements": [...]\n'
" },\n"
' "handwriting_regions": [...],\n'
' "handwritten_annotations": [...],\n'
' "struck_through_words": [...],\n'
' "tasks": [...]\n'
"}\n\n"
"WICHTIG - BITTE GENAU BEACHTEN:\n"
"1. CANONICAL TEXT: Nur gedruckter Text, OHNE Handschrift\n"
"2. DIAGRAMME: Bei JEDER Zeichnung/Grafik has_diagram: true setzen\n"
"3. HANDSCHRIFT: Mit Farb-Klassifizierung und Bounding Boxes\n"
"4. Bei Unsicherheit: null oder leeres Array"
)