Files
breakpilot-compliance/backend-compliance/compliance/services/founding_wizard/markdown_to_docx.py
T
Benjamin Admin 4478b7f479
CI / detect-changes (push) Successful in 10s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 14s
CI / loc-budget (push) Successful in 17s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 42s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
fix(founding-wizard): mypy/ruff cleanup for CI
- markdown_to_docx.py: type annotations + unused import
- founding_wizard_routes.py: drop unused get_db import
2026-05-20 09:58:38 +02:00

177 lines
5.1 KiB
Python

"""
Convert rendered Markdown into a .docx file using python-docx.
Supported Markdown elements:
- # / ## / ### / #### / ##### headings (levels above 4 are rendered as level 4)
- **bold** and *italic* / _italic_ inline
- Tables (pipe syntax)
- Lists with -, * or +, and numbered lists ("1." or "1)")
- Horizontal rules ---
- Inline code `code`
Deliberately minimal: legal documents do not need images or embeds.
"""
from __future__ import annotations
import io
import re
from typing import Any, Optional
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
# Block-level patterns; each is matched against a single right-stripped line.
# One to five '#' characters followed by the heading text.
HEADING_RE = re.compile(r"^(#{1,5})\s+(.+)$")
# Three or more of '-', '_' or '*' and nothing else on the line.
HR_RE = re.compile(r"^[-_*]{3,}\s*$")
# Groups: leading indent, list marker, item text.
LIST_BULLET_RE = re.compile(r"^(\s*)([-*+])\s+(.+)$")
LIST_NUMBER_RE = re.compile(r"^(\s*)(\d+)[\.\)]\s+(.+)$")
# A table row starts and ends with '|'; group 1 is everything in between.
TABLE_ROW_RE = re.compile(r"^\|(.+)\|\s*$")
# A separator row consists of '-', ':', '|' and whitespace and must contain at
# least one '-'. Without that requirement a data row made only of empty cells
# ("|   |   |") would be classified as a separator and dropped.
TABLE_SEP_RE = re.compile(r"^\|(?=[^-]*-)[\s\-:|]+\|\s*$")

# Inline patterns; group 1 is the formatted content (group 2 for _italic_).
INLINE_BOLD = re.compile(r"\*\*([^*]+)\*\*")
# Emphasis delimiters must hug their content (no whitespace directly inside),
# and '_' must not sit inside a word. This keeps "snake_case_name",
# blank lines such as "Name: _______" and "2 * 3 * 4" from turning into
# italics with their delimiters swallowed.
INLINE_ITALIC = re.compile(
    r"(?<!\*)\*(?![*\s])([^*]+)(?<!\s)\*(?!\*)"
    r"|(?<!\w)_(?![_\s])([^_]+)(?<!\s)_(?!\w)"
)
INLINE_CODE = re.compile(r"`([^`]+)`")
def _add_runs(paragraph: Any, text: str) -> None:
    """Append *text* to *paragraph* as a sequence of inline-formatted runs.

    The text is scanned left to right. At every step the earliest match of
    INLINE_BOLD, INLINE_CODE or INLINE_ITALIC at or after the current
    position is taken: any text in front of it becomes a plain run and the
    captured content becomes a bold, code or italic run. Text after the last
    match is added as a plain run.

    Bold and italic runs get the corresponding run flag; code runs are set
    to "Courier New" at 10 pt. An empty *text* adds no runs.
    """
    segments: list[tuple[str, str]] = []
    cursor = 0
    while cursor < len(text):
        # Listed as bold, code, italic: min() keeps the first entry on a tie.
        hits = [
            (label, found)
            for label, found in (
                ("bold", INLINE_BOLD.search(text, cursor)),
                ("code", INLINE_CODE.search(text, cursor)),
                ("italic", INLINE_ITALIC.search(text, cursor)),
            )
            if found
        ]
        if not hits:
            segments.append(("plain", text[cursor:]))
            break

        label, found = min(hits, key=lambda hit: hit[1].start())
        if cursor < found.start():
            segments.append(("plain", text[cursor:found.start()]))

        if label == "italic":
            # The italic pattern has two alternatives, each with its own group.
            captured = found.group(1) or found.group(2)
        else:
            captured = found.group(1)
        segments.append((label, captured))
        cursor = found.end()

    for label, captured in segments:
        run = paragraph.add_run(captured)
        if label == "bold":
            run.bold = True
        elif label == "italic":
            run.italic = True
        elif label == "code":
            run.font.name = "Courier New"
            run.font.size = Pt(10)
def _parse_table(lines: list[str], start: int) -> tuple[list[list[str]], int]:
    """Collect the consecutive Markdown table lines beginning at *start*.

    Lines are right-stripped and consumed while they match TABLE_ROW_RE or
    TABLE_SEP_RE. Separator lines are skipped; every other table line is
    split on '|' into whitespace-stripped cell strings.

    Args:
        lines: All lines of the document.
        start: Index of the first line to examine.

    Returns:
        A tuple ``(rows, next_index)`` where ``rows`` holds one list of cell
        strings per data row and ``next_index`` is the index of the first
        line that is not part of the table (``len(lines)`` if the table runs
        to the end).
    """
    rows: list[list[str]] = []
    i = start
    while i < len(lines):
        line = lines[i].rstrip()
        if TABLE_SEP_RE.match(line):
            i += 1
            continue
        row_match = TABLE_ROW_RE.match(line)
        if row_match is None:
            break
        # Split only the text between the outer pipes. str.strip("|") would
        # remove *every* leading/trailing pipe, so an empty first or last
        # cell ("|| a |") would disappear and shift the remaining columns.
        rows.append([cell.strip() for cell in row_match.group(1).split("|")])
        i += 1
    return rows, i
def _add_table(doc: Any, rows: list[list[str]]) -> None:
    """Append *rows* to *doc* as a table using the "Light Grid" style.

    Nothing is added when *rows* is empty. The table has one row per entry
    in *rows* and as many columns as the longest row; shorter rows leave
    their remaining cells as created. Cell text is written through
    _add_runs, and every run in the first row is made bold.
    """
    if not rows:
        return

    ncols = max(map(len, rows))
    table = doc.add_table(rows=len(rows), cols=ncols)
    table.style = "Light Grid"

    for r_idx, row in enumerate(rows):
        is_header = r_idx == 0
        # ncols is the maximum row length, so every c_idx is a valid column.
        for c_idx, cell_text in enumerate(row):
            cell = table.rows[r_idx].cells[c_idx]
            cell.text = ""
            paragraph = cell.paragraphs[0]
            _add_runs(paragraph, cell_text)
            if is_header:
                for run in paragraph.runs:
                    run.bold = True
def markdown_to_docx_bytes(markdown_text: str, title: Optional[str] = None) -> bytes:
    """Convert Markdown text to a DOCX document and return its bytes.

    The text is processed line by line. Blank lines are skipped; every other
    line is classified in this order: heading, horizontal rule, table,
    bullet item, numbered item, plain paragraph. Inline formatting is
    applied through _add_runs.

    Args:
        markdown_text: The Markdown source.
        title: Optional document title, added as a left-aligned level-0
            heading before the content.

    Returns:
        The serialized .docx file.
    """
    doc = Document()

    # Base style for body text.
    style = doc.styles["Normal"]
    style.font.name = "Calibri"
    style.font.size = Pt(11)

    if title:
        h = doc.add_heading(title, level=0)
        h.alignment = WD_ALIGN_PARAGRAPH.LEFT

    lines = markdown_text.split("\n")
    i = 0
    while i < len(lines):
        # rstrip() also removes a trailing "\r" from CRLF input.
        line = lines[i].rstrip()
        if not line.strip():
            i += 1
            continue

        # Heading: levels deeper than 4 are rendered as level 4.
        h_match = HEADING_RE.match(line)
        if h_match:
            level = len(h_match.group(1))
            text = h_match.group(2)
            heading = doc.add_heading(level=min(level, 4))
            _add_runs(heading, text)
            i += 1
            continue

        # Horizontal rule: emitted as a visible line of underscores.
        # (Repeating an empty string yields "", i.e. a blank paragraph.)
        if HR_RE.match(line):
            doc.add_paragraph("_" * 60)
            i += 1
            continue

        # Table: _parse_table consumes all table lines and returns the index
        # of the first line after the table.
        if TABLE_ROW_RE.match(line):
            rows, i = _parse_table(lines, i)
            _add_table(doc, rows)
            doc.add_paragraph()  # empty paragraph after the table
            continue

        # Bullet list item
        b_match = LIST_BULLET_RE.match(line)
        if b_match:
            p = doc.add_paragraph(style="List Bullet")
            _add_runs(p, b_match.group(3))
            i += 1
            continue

        # Numbered list item
        n_match = LIST_NUMBER_RE.match(line)
        if n_match:
            p = doc.add_paragraph(style="List Number")
            _add_runs(p, n_match.group(3))
            i += 1
            continue

        # Anything else: a normal paragraph per line.
        p = doc.add_paragraph()
        _add_runs(p, line)
        i += 1

    buf = io.BytesIO()
    doc.save(buf)
    return buf.getvalue()