breakpilot-compliance/backend-compliance/compliance/services/dse_parser.py

"""
DSE Parser — parses privacy policy HTML into structured sections.

Extracts headings, section numbers, content blocks and builds a
hierarchical structure that enables precise text references.
"""

import logging
import re
from dataclasses import dataclass, field
from html.parser import HTMLParser

logger = logging.getLogger(__name__)


@dataclass
class DSESection:
    """A section in a privacy policy."""
    heading: str
    heading_level: int  # 1-4
    section_number: str  # "2.5" or "" if no number
    content: str  # Plain text content
    html: str  # Original HTML content
    parent_heading: str = ""
    url: str = ""
    element_id: str = ""
    paragraph_count: int = 0


class _HeadingExtractor(HTMLParser):
    """Extract headings and their content from HTML."""

    def __init__(self):
        super().__init__()
        self.sections: list[dict] = []
        self._current_tag = ""
        self._in_heading = False
        self._heading_level = 0
        self._heading_text = ""
        self._heading_id = ""
        self._content_parts: list[str] = []
        self._html_parts: list[str] = []
        self._skip_tags = {"script", "style", "nav", "footer", "header"}
        self._skip_depth = 0
        self._p_count = 0

    def handle_starttag(self, tag, attrs):
        attrs_dict = dict(attrs)
        if tag in self._skip_tags:
            self._skip_depth += 1
            return
        if self._skip_depth > 0:
            return

        if tag in ("h1", "h2", "h3", "h4"):
            # Save previous section
            if self._heading_text:
                self._save_section()
            self._in_heading = True
            self._heading_level = int(tag[1])
            self._heading_text = ""
            self._heading_id = attrs_dict.get("id", "")
            self._content_parts = []
            self._html_parts = []
            self._p_count = 0

        if tag == "p":
            self._p_count += 1

        # Reconstruct HTML
        attr_str = " ".join(f'{k}="{v}"' for k, v in attrs)
        self._html_parts.append(f"<{tag}{' ' + attr_str if attr_str else ''}>")

    def handle_endtag(self, tag):
        if tag in self._skip_tags and self._skip_depth > 0:
            self._skip_depth -= 1
            return
        if self._skip_depth > 0:
            return

        if tag in ("h1", "h2", "h3", "h4"):
            self._in_heading = False

        self._html_parts.append(f"</{tag}>")

    def handle_data(self, data):
        if self._skip_depth > 0:
            return
        if self._in_heading:
            self._heading_text += data.strip()
        else:
            self._content_parts.append(data)
        self._html_parts.append(data)

    def _save_section(self):
        if not self._heading_text:
            return
        content = " ".join(self._content_parts)
        content = re.sub(r"\s+", " ", content).strip()
        self.sections.append({
            "heading": self._heading_text.strip(),
            "heading_level": self._heading_level,
            "element_id": self._heading_id,
            "content": content,
            "html": "".join(self._html_parts),
            "paragraph_count": self._p_count,
        })

    def finalize(self):
        """Call after feeding all data to save the last section."""
        if self._heading_text:
            self._save_section()


def parse_dse(html: str, url: str = "") -> list[DSESection]:
    """Parse privacy policy HTML into structured sections."""
    extractor = _HeadingExtractor()
    try:
        extractor.feed(html)
        extractor.finalize()
    except Exception as e:
        logger.warning("HTML parsing failed, falling back to regex: %s", e)
        return _regex_fallback(html, url)

    if not extractor.sections:
        return _regex_fallback(html, url)

    # Build parent hierarchy
    sections: list[DSESection] = []
    parent_stack: list[str] = [""]  # Stack of parent headings by level

    for raw in extractor.sections:
        heading = raw["heading"]
        level = raw["heading_level"]

        # Extract section number (e.g., "2.5" from "2.5 Webanalyse")
        num_match = re.match(r"^(\d+(?:\.\d+)*)\s*[.:]?\s*", heading)
        section_number = num_match.group(1) if num_match else ""

        # Track parent headings
        while len(parent_stack) > level:
            parent_stack.pop()
        parent = parent_stack[-1] if parent_stack else ""
        parent_stack.append(heading)

        sections.append(DSESection(
            heading=heading,
            heading_level=level,
            section_number=section_number,
            content=raw["content"][:2000],  # Cap content length
            html=raw["html"][:3000],
            parent_heading=parent,
            url=url,
            element_id=raw["element_id"],
            paragraph_count=raw["paragraph_count"],
        ))

    logger.info("Parsed DSE: %d sections from %s", len(sections), url)
    return sections


def _regex_fallback(html: str, url: str) -> list[DSESection]:
    """Fallback parser using regex when HTML parsing fails."""
    # Strip scripts and styles
    clean = re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE)

    sections = []
    # Find all headings
    for match in re.finditer(r"<h([1-4])[^>]*(?:id=[\"']([^\"']*)[\"'])?[^>]*>(.*?)</h\1>", clean, re.DOTALL | re.IGNORECASE):
        level = int(match.group(1))
        elem_id = match.group(2) or ""
        heading = re.sub(r"<[^>]+>", "", match.group(3)).strip()

        # Get content until next heading
        start = match.end()
        next_heading = re.search(r"<h[1-4]", clean[start:], re.IGNORECASE)
        end = start + next_heading.start() if next_heading else start + 2000
        content = clean[start:end]
        content = re.sub(r"<[^>]+>", " ", content)
        content = re.sub(r"\s+", " ", content).strip()

        num_match = re.match(r"^(\d+(?:\.\d+)*)", heading)

        sections.append(DSESection(
            heading=heading,
            heading_level=level,
            section_number=num_match.group(1) if num_match else "",
            content=content[:2000],
            html="",
            url=url,
            element_id=elem_id,
        ))

    return sections


def find_section_by_content(sections: list[DSESection], search_text: str) -> DSESection | None:
    """Find the section that contains specific text."""
    search_lower = search_text.lower()
    for section in sections:
        if search_lower in section.content.lower():
            return section
    return None


def find_section_by_category(sections: list[DSESection], category: str) -> DSESection | None:
    """Find the section most likely to contain a service category."""
    category_keywords = {
        "tracking": ["cookie", "tracking", "webanalyse", "analytics", "statistik"],
        "marketing": ["marketing", "werbung", "newsletter", "remarketing"],
        "payment": ["zahlung", "payment", "bezahlung", "zahlungsabwicklung"],
        "chatbot": ["chat", "kommunikation", "kundenservice", "kontakt"],
        "cdn": ["hosting", "bereitstellung", "technisch", "infrastruktur", "cdn"],
        "other": ["sonstig", "weitere", "dritte", "extern"],
    }
    keywords = category_keywords.get(category, category_keywords["other"])

    for section in sections:
        heading_lower = section.heading.lower()
        content_lower = section.content.lower()[:500]
        for kw in keywords:
            if kw in heading_lower or kw in content_lower:
                return section
    return None