# breakpilot-compliance/backend-compliance/compliance/services/founding_wizard/markdown_to_docx.py

"""
Convert rendered Markdown into a .docx file using python-docx.

Supported Markdown elements:
- Headings: # / ## / ### / #### / ##### (levels above 4 are rendered as level 4)
- Inline **bold** and *italic* / _italic_
- Tables (pipe syntax)
- Lists with -, * or + bullets, and numbered lists ("1." or "1)")
- Horizontal rules (---)
- Inline code (`code`)

Deliberately minimal: legal documents do not need images or embeds.
"""

from __future__ import annotations

import io
import re
from typing import Any, Optional

from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH

# --- Block-level patterns ---------------------------------------------------
# Each pattern is applied to a single, right-stripped line of Markdown.

# Heading: one to five '#' characters, whitespace, then the heading text.
HEADING_RE = re.compile(r"^(#{1,5})\s+(.+)$")
# Horizontal rule: a line consisting only of three or more '-', '_' or '*'.
HR_RE = re.compile(r"^[-_*]{3,}\s*$")
# Bullet item: optional indent, a '-', '*' or '+' marker, then the item text.
LIST_BULLET_RE = re.compile(r"^(\s*)([-*+])\s+(.+)$")
# Numbered item: optional indent, digits followed by '.' or ')', then the text.
LIST_NUMBER_RE = re.compile(r"^(\s*)(\d+)[\.\)]\s+(.+)$")
# Table row: starts and ends with a pipe; group 1 is everything in between.
TABLE_ROW_RE = re.compile(r"^\|(.+)\|\s*$")
# Table delimiter row such as "|---|:--:|". Every cell must contain at least
# one hyphen (optionally framed by alignment colons), so a data row whose
# cells are all empty ("|   |   |") is NOT mistaken for a delimiter row.
TABLE_SEP_RE = re.compile(r"^\|(?:\s*:?-+:?\s*\|)+\s*$")

# --- Inline patterns ----------------------------------------------------------

# **bold** (the content may not contain '*').
INLINE_BOLD = re.compile(r"\*\*([^*]+)\*\*")
# *italic* (group 1) or _italic_ (group 2). The '*' form must not touch
# another '*', so it never overlaps with bold. The '_' form must not be
# adjacent to a word character, so identifiers like "foo_bar_baz" keep their
# underscores instead of being turned into emphasis.
INLINE_ITALIC = re.compile(
    r"(?<!\*)\*(?!\*)([^*]+)\*(?!\*)|(?<!\w)_([^_]+)_(?!\w)"
)
# `inline code`.
INLINE_CODE = re.compile(r"`([^`]+)`")


def _add_runs(paragraph: Any, text: str) -> None:
    """Append ``text`` to ``paragraph`` as runs, applying inline formatting.

    Spans matched by ``INLINE_BOLD``, ``INLINE_CODE`` and ``INLINE_ITALIC``
    become bold, monospace ("Courier New", 10 pt) and italic runs; the text
    between them is added as unformatted runs. An empty ``text`` adds nothing.

    Args:
        paragraph: Object providing ``add_run(text)`` (a python-docx
            paragraph or heading).
        text: Markdown source of a single line or table cell.
    """
    # Scan once with an alternation of the three module-level patterns. The
    # leftmost span wins and the scan resumes right after it.
    # Group layout: 1 = bold, 2 = code, 3 = '*' italic, 4 = '_' italic.
    scanner = re.compile(
        "|".join(
            (INLINE_BOLD.pattern, INLINE_CODE.pattern, INLINE_ITALIC.pattern)
        )
    )

    pieces: list[tuple[str, str]] = []
    cursor = 0
    for hit in scanner.finditer(text):
        if hit.start() > cursor:
            # Unformatted text between the previous span and this one.
            pieces.append(("plain", text[cursor:hit.start()]))
        bold_text, code_text, star_text, underscore_text = hit.groups()
        if bold_text is not None:
            pieces.append(("bold", bold_text))
        elif code_text is not None:
            pieces.append(("code", code_text))
        else:
            pieces.append(("italic", star_text or underscore_text))
        cursor = hit.end()
    if cursor < len(text):
        # Trailing unformatted remainder.
        pieces.append(("plain", text[cursor:]))

    for style, fragment in pieces:
        run = paragraph.add_run(fragment)
        if style == "code":
            run.font.name = "Courier New"
            run.font.size = Pt(10)
        elif style == "italic":
            run.italic = True
        elif style == "bold":
            run.bold = True


def _parse_table(lines: list[str], start: int) -> tuple[list[list[str]], int]:
    """Collect the consecutive Markdown table lines beginning at ``start``.

    Lines are consumed while they match ``TABLE_ROW_RE`` or ``TABLE_SEP_RE``.
    Delimiter rows (``TABLE_SEP_RE``) are skipped; every other row is split
    on ``|`` into stripped cell texts.

    Args:
        lines: All lines of the Markdown document.
        start: Index of the first table line.

    Returns:
        A tuple ``(rows, next_index)``: ``rows`` holds one list of cell
        strings per data row, and ``next_index`` is the index of the first
        line that no longer belongs to the table.
    """
    rows: list[list[str]] = []
    i = start
    while i < len(lines):
        line = lines[i].rstrip()
        is_separator = TABLE_SEP_RE.match(line) is not None
        row_match = TABLE_ROW_RE.match(line)
        if not is_separator and row_match is None:
            break
        if not is_separator:
            # Group 1 is the text between the outermost pipes. Splitting it
            # (rather than stripping every edge pipe from the line) keeps
            # empty first/last cells such as "|| a |", so the remaining cells
            # stay in their columns.
            rows.append([cell.strip() for cell in row_match.group(1).split("|")])
        i += 1
    return rows, i


def _add_table(doc: Any, rows: list[list[str]]) -> None:
    """Append ``rows`` to ``doc`` as a "Light Grid" table with a bold header.

    The table gets one column per cell of the widest row; shorter rows leave
    their remaining cells untouched. Cell texts are rendered through
    ``_add_runs`` so inline formatting is applied, and every run in the first
    row is set to bold. Nothing is added when ``rows`` is empty.

    Args:
        doc: Object providing ``add_table(rows=..., cols=...)`` (a
            python-docx document).
        rows: Cell texts, one inner list per table row.
    """
    if not rows:
        return

    width = max(map(len, rows))
    table = doc.add_table(rows=len(rows), cols=width)
    table.style = "Light Grid"

    for row_number, texts in enumerate(rows):
        # Fetch the row's cells once instead of once per cell.
        row_cells = table.rows[row_number].cells
        is_header = row_number == 0
        for cell, cell_text in zip(row_cells, texts):
            cell.text = ""
            paragraph = cell.paragraphs[0]
            _add_runs(paragraph, cell_text)
            if is_header:
                for run in paragraph.runs:
                    run.bold = True


def markdown_to_docx_bytes(markdown_text: str, title: Optional[str] = None) -> bytes:
    """Convert Markdown text into a DOCX document and return its bytes.

    The text is processed line by line. Blank lines are skipped; every other
    line is rendered as the first of: heading, horizontal rule, table, bullet
    item, numbered item, plain paragraph.

    Args:
        markdown_text: The Markdown source.
        title: Optional document title, added as a left-aligned level-0
            heading before the content.

    Returns:
        The serialized .docx file.
    """
    document = Document()

    # Base style for body text.
    normal_font = document.styles["Normal"].font
    normal_font.name = "Calibri"
    normal_font.size = Pt(11)

    if title:
        title_heading = document.add_heading(title, level=0)
        title_heading.alignment = WD_ALIGN_PARAGRAPH.LEFT

    source_lines = markdown_text.split("\n")
    total = len(source_lines)
    cursor = 0
    while cursor < total:
        current = source_lines[cursor].rstrip()

        # A right-stripped line is empty exactly when it was blank.
        if not current:
            cursor += 1
            continue

        heading_match = HEADING_RE.match(current)
        if heading_match is not None:
            hashes, heading_text = heading_match.groups()
            # Heading levels above 4 are rendered as level 4.
            heading = document.add_heading(level=min(len(hashes), 4))
            _add_runs(heading, heading_text)
        elif HR_RE.match(current):
            # Horizontal rule, drawn as a line of box-drawing characters.
            document.add_paragraph("─" * 60)
        elif TABLE_ROW_RE.match(current):
            # The table parser consumes several lines and returns the index
            # of the first line after the table, so no increment here.
            table_rows, cursor = _parse_table(source_lines, cursor)
            _add_table(document, table_rows)
            document.add_paragraph()
            continue
        else:
            bullet_match = LIST_BULLET_RE.match(current)
            number_match = LIST_NUMBER_RE.match(current)
            if bullet_match:
                target = document.add_paragraph(style="List Bullet")
                body = bullet_match.group(3)
            elif number_match:
                target = document.add_paragraph(style="List Number")
                body = number_match.group(3)
            else:
                # Anything else is an ordinary paragraph.
                target = document.add_paragraph()
                body = current
            _add_runs(target, body)

        cursor += 1

    with io.BytesIO() as stream:
        document.save(stream)
        return stream.getvalue()