fix: Restore all files lost during destructive rebase

A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-09 09:51:32 +01:00
parent f7487ee240
commit bfdaf63ba9
2009 changed files with 749983 additions and 1731 deletions

View File

@@ -0,0 +1,19 @@
"""
AI Processor - Vision Module
Scan analysis and HTML generation.
"""
from .scan_analyzer import (
analyze_scan_structure_with_ai,
describe_scan_with_ai,
remove_handwriting_from_scan,
)
from .html_builder import build_clean_html_from_analysis
__all__ = [
"analyze_scan_structure_with_ai",
"describe_scan_with_ai",
"remove_handwriting_from_scan",
"build_clean_html_from_analysis",
]

View File

@@ -0,0 +1,218 @@
"""
AI Processor - HTML Builder
Build clean HTML worksheets from analysis data.
"""
from pathlib import Path
import json
import logging
from ..config import BEREINIGT_DIR
logger = logging.getLogger(__name__)
def build_clean_html_from_analysis(analysis_path: Path) -> Path:
    """Build a clean, printable HTML worksheet from an analysis JSON file.

    Only printed content is rendered: ``printed_blocks`` when present,
    otherwise paragraphs split out of ``canonical_text``. Handwritten
    entries and struck-through words are deliberately omitted. The page
    uses an open-source font stack (Inter / Noto Sans).

    Args:
        analysis_path: Path to a ``*_analyse.json`` file.

    Returns:
        Path to the generated ``*_clean.html`` file in BEREINIGT_DIR.

    Raises:
        FileNotFoundError: If ``analysis_path`` does not exist.
        RuntimeError: If the file does not contain valid JSON.
    """
    # Stdlib only; imported locally so this fix is self-contained.
    import html as _html

    def esc(text: str) -> str:
        # Bug fix: every value below comes from model output (untrusted).
        # Without escaping, a stray '<', '>' or '&' breaks the generated
        # markup and allows HTML injection into the worksheet.
        return _html.escape(text)

    if not analysis_path.exists():
        raise FileNotFoundError(f"Analysedatei nicht gefunden: {analysis_path}")
    try:
        data = json.loads(analysis_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        raise RuntimeError(f"Analyse-Datei enthaelt kein gueltiges JSON: {analysis_path}\n{e}") from e

    title = data.get("title") or "Arbeitsblatt"
    subject = data.get("subject") or ""
    grade_level = data.get("grade_level") or ""
    instructions = data.get("instructions") or ""
    tasks = data.get("tasks", []) or []
    canonical_text = data.get("canonical_text") or ""
    printed_blocks = data.get("printed_blocks") or []
    struck = data.get("struck_through_words") or []

    html_parts = []
    html_parts.append("<!DOCTYPE html>")
    html_parts.append("<html lang='de'>")
    html_parts.append("<head>")
    html_parts.append("<meta charset='UTF-8'>")
    html_parts.append(f"<title>{esc(title)}</title>")
    html_parts.append(_get_html_styles())
    html_parts.append("</head>")
    html_parts.append("<body>")
    html_parts.append("<div class='page'>")

    # Header section
    html_parts.append(f"<h1>{esc(title)}</h1>")
    meta_bits = []
    if subject:
        meta_bits.append(f"Fach: {esc(subject)}")
    if grade_level:
        meta_bits.append(f"Klassenstufe: {esc(grade_level)}")
    if meta_bits:
        html_parts.append(f"<div class='meta'>{' | '.join(meta_bits)}</div>")
    if instructions:
        html_parts.append(
            f"<div class='instructions'><strong>Arbeitsanweisung:</strong> {esc(instructions)}</div>"
        )

    # Main text / printed blocks
    html_parts.append("<section class='text-blocks'>")
    if printed_blocks:
        for block in printed_blocks:
            role = (block.get("role") or "body").lower()
            text = (block.get("text") or "").strip()
            if not text:
                continue
            html_parts.append("<div class='text-block'>")
            if role == "title":
                html_parts.append(f"<div class='text-block-title'>{esc(text)}</div>")
            else:
                html_parts.append(f"<div>{esc(text)}</div>")
            html_parts.append("</div>")
    elif canonical_text:
        # Fallback: split canonical_text into paragraphs
        paragraphs = [
            p.strip()
            for p in canonical_text.replace("\r\n", "\n").split("\n\n")
            if p.strip()
        ]
        for p in paragraphs:
            html_parts.append(f"<div class='text-block'>{esc(p)}</div>")
    html_parts.append("</section>")

    # Tasks section
    if tasks:
        html_parts.append("<h2>Aufgaben</h2>")
        html_parts.append("<div class='task-list'>")
        for idx, task in enumerate(tasks, start=1):
            t_type = task.get("type") or "other"
            desc = task.get("description") or ""
            text_with_gaps = task.get("text_with_gaps")
            html_parts.append("<div class='task'>")
            html_parts.append(
                f"<div class='task-title'>Aufgabe {idx} ({esc(t_type)}): {esc(desc)}</div>"
            )
            if text_with_gaps:
                # Escape first (escaping never touches '_'), then turn the
                # '___' gap markers into visible fill-in lines.
                rendered = esc(text_with_gaps).replace("___", "<span class='gap-line'>&nbsp;</span>")
                html_parts.append(f"<div>{rendered}</div>")
            html_parts.append("</div>")
        html_parts.append("</div>")

    # Footer note
    if struck:
        html_parts.append(
            "<div class='footnote'>Hinweis: Einige im Original durchgestrichene Woerter wurden "
            "von der KI erkannt und NICHT in dieses saubere Arbeitsblatt uebernommen.</div>"
        )
    else:
        html_parts.append(
            "<div class='footnote'>Dieses Arbeitsblatt wurde automatisch aus einem Scan rekonstruiert "
            "und von handschriftlichen Eintragungen bereinigt.</div>"
        )

    html_parts.append("</div>")  # .page
    html_parts.append("</body></html>")

    html_content = "\n".join(html_parts)
    out_name = analysis_path.stem.replace("_analyse", "") + "_clean.html"
    out_path = BEREINIGT_DIR / out_name
    out_path.write_text(html_content, encoding="utf-8")
    return out_path
def _get_html_styles() -> str:
"""Get CSS styles for clean HTML output."""
return """
<style>
:root {
--font-main: "Inter", "Noto Sans", system-ui, -apple-system, BlinkMacSystemFont, sans-serif;
}
* { box-sizing: border-box; }
body {
font-family: var(--font-main);
margin: 32px;
line-height: 1.5;
font-size: 14px;
color: #111827;
}
.page {
max-width: 800px;
margin: 0 auto;
}
h1 {
font-size: 24px;
margin-bottom: 4px;
}
h2 {
font-size: 18px;
margin-top: 24px;
}
.meta {
font-size: 12px;
color: #6b7280;
margin-bottom: 16px;
}
.instructions {
margin-bottom: 20px;
padding: 8px 10px;
border-radius: 8px;
background: #eff6ff;
border: 1px solid #bfdbfe;
font-size: 13px;
}
.text-blocks {
margin-bottom: 24px;
}
.text-block {
margin-bottom: 8px;
}
.text-block-title {
font-weight: 600;
margin-bottom: 4px;
}
.task-list {
margin-top: 8px;
}
.task {
margin-bottom: 14px;
padding-bottom: 8px;
border-bottom: 1px dashed #e5e7eb;
}
.task-title {
font-weight: 600;
margin-bottom: 4px;
}
.gap-line {
display: inline-block;
border-bottom: 1px solid #000;
min-width: 80px;
margin: 0 4px;
}
.footnote {
margin-top: 24px;
font-size: 11px;
color: #9ca3af;
}
</style>
"""

View File

@@ -0,0 +1,307 @@
"""
AI Processor - Scan Analyzer
Vision-based analysis of worksheets using OpenAI and Claude APIs.
"""
from pathlib import Path
import json
import logging
import shutil
import requests
from ..config import (
VISION_API,
BEREINIGT_DIR,
get_openai_api_key,
)
from ..utils import encode_image_to_data_url
logger = logging.getLogger(__name__)
def describe_scan_with_ai(input_path: Path) -> Path:
    """Ask the vision model for a short description of the worksheet.

    Args:
        input_path: Path to the input image.

    Returns:
        Path to the generated ``*_beschreibung.txt`` file in BEREINIGT_DIR.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
        RuntimeError: If the API response has an unexpected structure.
        requests.HTTPError: If the API returns a non-2xx status.
    """
    if not input_path.exists():
        raise FileNotFoundError(f"Eingabedatei nicht gefunden: {input_path}")
    api_key = get_openai_api_key()
    image_data_url = encode_image_to_data_url(input_path)
    url = "https://api.openai.com/v1/chat/completions"
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    payload = {
        "model": "gpt-4o-mini",
        "messages": [
            {
                "role": "system",
                "content": "Du bist ein hilfreicher Assistent, der Schul-Arbeitsblaetter knapp beschreibt.",
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": (
                            "Beschreibe dieses Arbeitsblatt knapp: Thema, Art der Aufgaben "
                            "(z.B. Lueckentext, Multiple Choice, Rechenaufgaben) und groben Inhalt."
                        ),
                    },
                    {"type": "image_url", "image_url": {"url": image_data_url}},
                ],
            },
        ],
        "max_tokens": 400,
    }
    # Bug fix: requests has no default timeout — without one a stalled
    # connection would block this call forever.
    response = requests.post(url, headers=headers, json=payload, timeout=120)
    response.raise_for_status()
    data = response.json()
    try:
        description = data["choices"][0]["message"]["content"]
    except Exception as e:
        raise RuntimeError(f"Unerwartete Antwortstruktur von der KI: {e}\nAntwort: {data}") from e
    out_name = input_path.stem + "_beschreibung.txt"
    out_path = BEREINIGT_DIR / out_name
    out_path.write_text(description, encoding="utf-8")
    return out_path
def _analyze_with_openai(input_path: Path) -> Path:
    """Run a structured JSON analysis of the worksheet via OpenAI.

    The model is asked for (among other fields):
    - canonical_text: complete corrected text without handwriting
    - printed_blocks: structured blocks of printed text
    - handwritten_annotations: student handwritten notes
    - struck_through_words: crossed-out words

    Args:
        input_path: Path to the input image.

    Returns:
        Path to the ``*_analyse.json`` file written to BEREINIGT_DIR.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
        RuntimeError: If the response structure is unexpected or the model
            returned invalid JSON.
        requests.HTTPError: If the API returns a non-2xx status.
    """
    if not input_path.exists():
        raise FileNotFoundError(f"Eingabedatei nicht gefunden: {input_path}")
    api_key = get_openai_api_key()
    image_data_url = encode_image_to_data_url(input_path)
    url = "https://api.openai.com/v1/chat/completions"
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    system_prompt = (
        "Du bist ein Experte fuer die Analyse von Schul-Arbeitsblaettern.\n\n"
        "HAUPTAUFGABEN:\n"
        "1. Erkenne ALLE gedruckten Elemente: Text, Ueberschriften, Tabellen, Linien, Kaestchen, Diagramme, Illustrationen\n"
        "2. Identifiziere ALLE handschriftlichen Ergaenzungen: Antworten, Zahlen, Buchstaben, Notizen, Zeichnungen\n"
        "3. Bestimme praezise Positionen (Bounding Boxes in Pixeln) fuer JEDES Element\n\n"
        "KRITISCH - DIAGRAMME & ILLUSTRATIONEN:\n"
        "- Suche aktiv nach: anatomischen Zeichnungen, beschrifteten Diagrammen, Grafiken, Tabellen, Skizzen\n"
        "- Wenn du irgendeine bildliche Darstellung siehst (z.B. Auge, Pflanze, Karte, Schaubild), setze 'has_diagram: true'\n"
        "- Fuer JEDES visuelle Element: Erstelle einen Eintrag in 'diagram_elements' mit genauer Position\n"
        "- Beschrifte-Linien (von Beschriftung zu Bildteil) gehoeren zum Diagramm!\n\n"
        "HANDSCHRIFT ERKENNUNG:\n"
        "- Unterscheide gedruckt vs. handgeschrieben anhand der Schriftart\n"
        "- Klassifiziere Farbe: blau/schwarz/rot/pencil (Bleistift)\n"
        "- Durchgestrichene Woerter separat auflisten\n\n"
        "AUSGABE: Gib deine Antwort AUSSCHLIESSLICH als gueltiges JSON zurueck (kein Markdown, keine Code-Bloecke)."
    )
    user_text = _get_analysis_user_prompt()
    payload = {
        "model": "gpt-4o-mini",
        # Force a JSON object response; low temperature for deterministic layout output.
        "response_format": {"type": "json_object"},
        "messages": [
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": user_text},
                    {"type": "image_url", "image_url": {"url": image_data_url}},
                ],
            },
        ],
        "max_tokens": 2500,
        "temperature": 0.15,
    }
    # Bug fix: requests has no default timeout — without one a stalled
    # connection would block this call forever.
    response = requests.post(url, headers=headers, json=payload, timeout=180)
    response.raise_for_status()
    data = response.json()
    try:
        content = data["choices"][0]["message"]["content"]
    except Exception as e:
        raise RuntimeError(f"Unerwartete Antwortstruktur von der KI: {e}\nAntwort: {data}") from e
    try:
        obj = json.loads(content)
    except json.JSONDecodeError as e:
        raise RuntimeError(f"Modell hat ungueltiges JSON geliefert: {e}\nInhalt: {content}") from e
    out_name = input_path.stem + "_analyse.json"
    out_path = BEREINIGT_DIR / out_name
    out_path.write_text(json.dumps(obj, ensure_ascii=False, indent=2), encoding="utf-8")
    return out_path
def _analyze_with_claude(input_path: Path) -> Path:
    """Run the structured worksheet analysis through Claude Vision.

    Delegates to ``claude_vision.analyze_worksheet_with_claude`` (Claude
    3.5 Sonnet) and stores the resulting JSON in BEREINIGT_DIR.

    Returns:
        Path to the ``*_analyse.json`` file.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
        Exception: Re-raised after logging if analysis or saving fails.
    """
    from claude_vision import analyze_worksheet_with_claude

    if not input_path.exists():
        raise FileNotFoundError(f"Eingabedatei nicht gefunden: {input_path}")

    logger.info(f"Analyzing with Claude Vision: {input_path.name}")
    try:
        result = analyze_worksheet_with_claude(input_path, max_tokens=2500)
        target = BEREINIGT_DIR / (input_path.stem + "_analyse.json")
        target.write_text(
            json.dumps(result, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        logger.info(f"Claude analysis saved: {target.name}")
        return target
    except Exception as e:
        # Log for diagnosis, then let the caller decide on a fallback.
        logger.error(f"Claude analysis failed: {e}")
        raise
def analyze_scan_structure_with_ai(input_path: Path) -> Path:
    """Run a structured JSON analysis of the worksheet (hybrid mode).

    Uses the API configured in VISION_API:
    - "claude" (default): Claude 3.5 Sonnet — better OCR / layout detection
    - "openai": OpenAI GPT-4o-mini — cheaper, faster

    Switch via environment variable:
        export VISION_API="claude"  # or "openai"

    Returns:
        Path to the analysis JSON file.
    """
    logger.info(f"Using Vision API: {VISION_API}")
    if VISION_API == "openai":
        return _analyze_with_openai(input_path)
    if VISION_API != "claude":
        logger.warning(f"Unknown VISION_API '{VISION_API}', using Claude as default")
    # Consistency fix: previously only the explicit "claude" value got the
    # OpenAI fallback; an unknown value went straight to Claude and could
    # fail hard. Both Claude paths now share the same fallback chain.
    try:
        return _analyze_with_claude(input_path)
    except Exception as e:
        logger.warning(f"Claude failed, falling back to OpenAI: {e}")
        return _analyze_with_openai(input_path)
def remove_handwriting_from_scan(input_path: Path) -> Path:
    """Remove handwriting from a worksheet scan via AI-guided cleaning.

    Steps:
      1. Load the Stage-1 analysis JSON (running the analysis on demand
         if it does not exist yet).
      2. Apply WorksheetCleaner's multi-strategy cleaning, preserving
         diagrams and printed content.
      3. On cleaning failure, fall back to a plain copy of the original.

    Returns:
        Path to the cleaned image (``*_clean`` + original suffix).

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
    """
    if not input_path.exists():
        raise FileNotFoundError(f"Eingabedatei nicht gefunden: {input_path}")

    from image_cleaner import WorksheetCleaner

    # Stage-1 artifact; create it on demand when missing.
    analysis_file = BEREINIGT_DIR / (input_path.stem + "_analyse.json")
    if not analysis_file.exists():
        logger.info(f"Analysis not found for {input_path.name}, running analysis first")
        analysis_file = analyze_scan_structure_with_ai(input_path)

    # Corrupt analysis JSON degrades gracefully to an empty layout so the
    # cleaner can still run.
    try:
        analysis = json.loads(analysis_file.read_text(encoding='utf-8'))
    except json.JSONDecodeError as e:
        logger.error(f"Invalid analysis JSON: {analysis_file}\n{e}")
        analysis = {
            "layout": {"text_regions": [], "diagram_elements": []},
            "handwriting_regions": []
        }

    target = BEREINIGT_DIR / (input_path.stem + "_clean" + input_path.suffix)
    worksheet_cleaner = WorksheetCleaner(debug_mode=False)
    try:
        cleaned = worksheet_cleaner.clean_worksheet(input_path, analysis, target)
        logger.info(f"Successfully cleaned {input_path.name}")
        return cleaned
    except Exception as e:
        # Best-effort fallback: ship the original scan unchanged.
        logger.error(f"Cleaning failed for {input_path.name}, using original: {e}")
        shutil.copy2(input_path, target)
        return target
def _get_analysis_user_prompt() -> str:
"""Get the user prompt for worksheet analysis."""
return (
"Analysiere dieses Arbeitsblatt und gib ein JSON mit folgendem Aufbau zurueck:\n\n"
"{\n"
' "title": string | null,\n'
' "subject": string | null,\n'
' "grade_level": string | null,\n'
' "instructions": string | null,\n'
' "canonical_text": string | null,\n'
' "printed_blocks": [\n'
" {\n"
' "id": string,\n'
' "role": "title" | "instructions" | "body" | "other",\n'
' "text": string\n'
" }\n"
" ],\n"
' "layout": {\n'
' "page_structure": {\n'
' "has_diagram": boolean,\n'
' "orientation": "portrait" | "landscape"\n'
" },\n"
' "text_regions": [...],\n'
' "diagram_elements": [...]\n'
" },\n"
' "handwriting_regions": [...],\n'
' "handwritten_annotations": [...],\n'
' "struck_through_words": [...],\n'
' "tasks": [...]\n'
"}\n\n"
"WICHTIG - BITTE GENAU BEACHTEN:\n"
"1. CANONICAL TEXT: Nur gedruckter Text, OHNE Handschrift\n"
"2. DIAGRAMME: Bei JEDER Zeichnung/Grafik has_diagram: true setzen\n"
"3. HANDSCHRIFT: Mit Farb-Klassifizierung und Bounding Boxes\n"
"4. Bei Unsicherheit: null oder leeres Array"
)