4478b7f479
CI / detect-changes (push) Successful in 10s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 14s
CI / loc-budget (push) Successful in 17s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 42s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
- markdown_to_docx.py: type annotations + unused import - founding_wizard_routes.py: drop unused get_db import
177 lines
5.1 KiB
Python
177 lines
5.1 KiB
Python
"""
|
|
Convert rendered Markdown into a .docx file using python-docx.

Supported Markdown elements:
- # / ## / ### / #### / ##### headings
- **bold** and _italic_ inline
- Tables (pipe syntax)
- Lists introduced by -, * or a number ("1." or "1)")
- Horizontal rules ---
- Inline code `code`

Deliberately minimal — legal documents do not need images or embeds.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import io
|
|
import re
|
|
from typing import Any, Optional
|
|
|
|
from docx import Document
|
|
from docx.shared import Pt
|
|
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
|
|
|
# Block-level patterns; each is matched against a single line of Markdown.
HEADING_RE = re.compile(r"^(#{1,5})\s+(.+)$")
HR_RE = re.compile(r"^[-_*]{3,}\s*$")
LIST_BULLET_RE = re.compile(r"^(\s*)([-*+])\s+(.+)$")
LIST_NUMBER_RE = re.compile(r"^(\s*)(\d+)[\.\)]\s+(.+)$")
TABLE_ROW_RE = re.compile(r"^\|(.+)\|\s*$")
# A separator row must contain at least one dash. Without the lookahead a
# data row consisting only of empty cells ("|   |   |") would be classified
# as a separator and dropped from the table.
TABLE_SEP_RE = re.compile(r"^\|(?=[^-]*-)[\s\-:|]+\|\s*$")

# Inline patterns; group 1 (or group 2 for underscore italics) is the content.
INLINE_BOLD = re.compile(r"\*\*([^*]+)\*\*")
# Underscore emphasis must not be flanked by word characters, so identifiers
# such as "user_id" or "snake_case_name" are not turned into italics.
INLINE_ITALIC = re.compile(
    r"(?<!\*)\*(?!\*)([^*]+)\*(?!\*)|(?<!\w)_([^_]+)_(?!\w)"
)
INLINE_CODE = re.compile(r"`([^`]+)`")
|
|
|
|
|
|
def _add_runs(paragraph: Any, text: str) -> None:
    """Split *text* into inline-formatted segments and append them as runs.

    The text is scanned left to right. At every step the earliest match of
    the bold, code and italic patterns is taken; anything before it becomes a
    plain segment. Each segment is then added to *paragraph* as its own run:
    bold and italic segments set the matching run flag, code segments use
    "Courier New" at 10 pt.

    Args:
        paragraph: Object providing ``add_run(text)``; the callers in this
            module pass python-docx paragraphs.
        text: Raw inline Markdown for one paragraph, heading or table cell.
    """
    segments: list[tuple[str, str]] = []
    cursor = 0
    end = len(text)

    while cursor < end:
        # Listing order (bold, code, italic) is also the tie-break order of
        # min() below, should two patterns ever start at the same offset.
        found = [
            (kind, match)
            for kind, match in (
                ("bold", INLINE_BOLD.search(text, cursor)),
                ("code", INLINE_CODE.search(text, cursor)),
                ("italic", INLINE_ITALIC.search(text, cursor)),
            )
            if match
        ]
        if not found:
            # No markup left: the remainder is one plain segment.
            segments.append(("plain", text[cursor:]))
            break

        kind, match = min(found, key=lambda pair: pair[1].start())
        if match.start() > cursor:
            segments.append(("plain", text[cursor:match.start()]))

        if kind == "italic":
            # The italic pattern has two alternatives (*...* and _..._);
            # exactly one of its groups is populated.
            inner = match.group(1) or match.group(2)
        else:
            inner = match.group(1)
        segments.append((kind, inner))
        cursor = match.end()

    for kind, inner in segments:
        run = paragraph.add_run(inner)
        if kind == "bold":
            run.bold = True
        elif kind == "italic":
            run.italic = True
        elif kind == "code":
            code_font = run.font
            code_font.name = "Courier New"
            code_font.size = Pt(10)
|
|
|
|
|
|
def _parse_table(lines: list[str], start: int) -> tuple[list[list[str]], int]:
    """Collect the consecutive Markdown table lines beginning at ``lines[start]``.

    Args:
        lines: All lines of the document.
        start: Index of the first line to examine.

    Returns:
        ``(rows, next_index)``. ``rows`` holds the whitespace-stripped cell
        texts of every data row; separator rows are skipped. ``next_index``
        is the index of the first line that does not belong to the table
        (``len(lines)`` if the table runs to the end).
    """
    rows: list[list[str]] = []
    i = start
    while i < len(lines):
        line = lines[i].rstrip()
        if TABLE_SEP_RE.match(line):
            # Separator rows carry no cell content.
            i += 1
            continue
        row_match = TABLE_ROW_RE.match(line)
        if row_match is None:
            break
        # Split the text between the outer pipes as captured by the row
        # pattern. Using str.strip("|") here would remove *every* leading and
        # trailing pipe, silently dropping empty edge cells ("|| a |") and
        # shifting the remaining cells into the wrong columns.
        rows.append([cell.strip() for cell in row_match.group(1).split("|")])
        i += 1
    return rows, i
|
|
|
|
|
|
def _add_table(doc: Any, rows: list[list[str]]) -> None:
    """Append *rows* to *doc* as a "Light Grid" table with a bold first row.

    The table is as wide as the longest row; cells missing from shorter rows
    are left as created. Cell texts are rendered through ``_add_runs`` so
    inline formatting is honoured. Nothing is added when *rows* is empty.

    Args:
        doc: Object providing ``add_table(rows=..., cols=...)``; the caller in
            this module passes a python-docx document.
        rows: Cell texts, one inner list per table row.
    """
    if not rows:
        return

    width = max(map(len, rows))
    table = doc.add_table(rows=len(rows), cols=width)
    table.style = "Light Grid"

    for row_no, texts in enumerate(rows):
        is_header = row_no == 0
        row_cells = table.rows[row_no].cells
        # Every column index is below ``width`` because ``width`` is the
        # maximum row length, so no bounds check is needed here.
        for col_no, raw in enumerate(texts):
            target = row_cells[col_no]
            target.text = ""
            para = target.paragraphs[0]
            _add_runs(para, raw)
            if is_header:
                # The first row is the header: force all of its runs bold.
                for run in para.runs:
                    run.bold = True
|
|
|
|
|
|
def markdown_to_docx_bytes(markdown_text: str, title: Optional[str] = None) -> bytes:
    """Convert Markdown to a DOCX document and return its bytes.

    The input is processed line by line. Each non-blank line is classified,
    in this order, as heading, horizontal rule, table, bullet item, numbered
    item or plain paragraph. Tables consume all of their consecutive lines
    and are followed by an empty paragraph.

    Args:
        markdown_text: Markdown source; lines are separated by ``"\\n"``.
        title: Optional document title, added as a left-aligned level-0
            heading before the content.

    Returns:
        The serialized .docx file.
    """
    document = Document()

    # Base style
    base_font = document.styles["Normal"].font
    base_font.name = "Calibri"
    base_font.size = Pt(11)

    if title:
        title_par = document.add_heading(title, level=0)
        title_par.alignment = WD_ALIGN_PARAGRAPH.LEFT

    source_lines = markdown_text.split("\n")
    total = len(source_lines)
    idx = 0
    while idx < total:
        current = source_lines[idx].rstrip()

        # After rstrip() a whitespace-only line is the empty string.
        if not current:
            idx += 1
            continue

        heading_m = HEADING_RE.match(current)
        if heading_m:
            hashes, heading_text = heading_m.groups()
            # Heading levels above 4 are rendered as level 4.
            par = document.add_heading(level=min(len(hashes), 4))
            _add_runs(par, heading_text)
        elif HR_RE.match(current):
            # Horizontal rule, drawn as a line of box-drawing characters.
            document.add_paragraph("─" * 60)
        elif TABLE_ROW_RE.match(current):
            # The table parser returns the index of the first line after the
            # table, so the shared increment below must be skipped.
            table_rows, idx = _parse_table(source_lines, idx)
            _add_table(document, table_rows)
            document.add_paragraph()
            continue
        else:
            # Bullet items take precedence over numbered items.
            list_style = "List Bullet"
            item_m = LIST_BULLET_RE.match(current)
            if not item_m:
                list_style = "List Number"
                item_m = LIST_NUMBER_RE.match(current)

            if item_m:
                par = document.add_paragraph(style=list_style)
                _add_runs(par, item_m.group(3))
            else:
                # Anything else: ordinary paragraph.
                par = document.add_paragraph()
                _add_runs(par, current)

        idx += 1

    out = io.BytesIO()
    document.save(out)
    return out.getvalue()
|