""" DSE Parser — parses privacy policy HTML into structured sections. Extracts headings, section numbers, content blocks and builds a hierarchical structure that enables precise text references. """ import logging import re from dataclasses import dataclass, field from html.parser import HTMLParser logger = logging.getLogger(__name__) @dataclass class DSESection: """A section in a privacy policy.""" heading: str heading_level: int # 1-4 section_number: str # "2.5" or "" if no number content: str # Plain text content html: str # Original HTML content parent_heading: str = "" url: str = "" element_id: str = "" paragraph_count: int = 0 class _HeadingExtractor(HTMLParser): """Extract headings and their content from HTML.""" def __init__(self): super().__init__() self.sections: list[dict] = [] self._current_tag = "" self._in_heading = False self._heading_level = 0 self._heading_text = "" self._heading_id = "" self._content_parts: list[str] = [] self._html_parts: list[str] = [] self._skip_tags = {"script", "style", "nav", "footer", "header"} self._skip_depth = 0 self._p_count = 0 def handle_starttag(self, tag, attrs): attrs_dict = dict(attrs) if tag in self._skip_tags: self._skip_depth += 1 return if self._skip_depth > 0: return if tag in ("h1", "h2", "h3", "h4"): # Save previous section if self._heading_text: self._save_section() self._in_heading = True self._heading_level = int(tag[1]) self._heading_text = "" self._heading_id = attrs_dict.get("id", "") self._content_parts = [] self._html_parts = [] self._p_count = 0 if tag == "p": self._p_count += 1 # Reconstruct HTML attr_str = " ".join(f'{k}="{v}"' for k, v in attrs) self._html_parts.append(f"<{tag}{' ' + attr_str if attr_str else ''}>") def handle_endtag(self, tag): if tag in self._skip_tags and self._skip_depth > 0: self._skip_depth -= 1 return if self._skip_depth > 0: return if tag in ("h1", "h2", "h3", "h4"): self._in_heading = False self._html_parts.append(f"") def handle_data(self, data): if self._skip_depth > 0: return if self._in_heading: self._heading_text += data.strip() else: self._content_parts.append(data) self._html_parts.append(data) def _save_section(self): if not self._heading_text: return content = " ".join(self._content_parts) content = re.sub(r"\s+", " ", content).strip() self.sections.append({ "heading": self._heading_text.strip(), "heading_level": self._heading_level, "element_id": self._heading_id, "content": content, "html": "".join(self._html_parts), "paragraph_count": self._p_count, }) def finalize(self): """Call after feeding all data to save the last section.""" if self._heading_text: self._save_section() def parse_dse(html: str, url: str = "") -> list[DSESection]: """Parse privacy policy HTML into structured sections.""" extractor = _HeadingExtractor() try: extractor.feed(html) extractor.finalize() except Exception as e: logger.warning("HTML parsing failed, falling back to regex: %s", e) return _regex_fallback(html, url) if not extractor.sections: return _regex_fallback(html, url) # Build parent hierarchy sections: list[DSESection] = [] parent_stack: list[str] = [""] # Stack of parent headings by level for raw in extractor.sections: heading = raw["heading"] level = raw["heading_level"] # Extract section number (e.g., "2.5" from "2.5 Webanalyse") num_match = re.match(r"^(\d+(?:\.\d+)*)\s*[.:]?\s*", heading) section_number = num_match.group(1) if num_match else "" # Track parent headings while len(parent_stack) > level: parent_stack.pop() parent = parent_stack[-1] if parent_stack else "" 
def parse_dse(html: str, url: str = "") -> list[DSESection]:
    """Parse privacy policy HTML into structured sections."""
    extractor = _HeadingExtractor()
    try:
        extractor.feed(html)
        extractor.finalize()
    except Exception as e:
        logger.warning("HTML parsing failed, falling back to regex: %s", e)
        return _regex_fallback(html, url)

    if not extractor.sections:
        return _regex_fallback(html, url)

    # Build parent hierarchy
    sections: list[DSESection] = []
    parent_stack: list[str] = [""]  # One ancestor heading per level; index 0 is a top-level sentinel

    for raw in extractor.sections:
        heading = raw["heading"]
        level = raw["heading_level"]

        # Extract section number (e.g., "2.5" from "2.5 Webanalyse")
        num_match = re.match(r"^(\d+(?:\.\d+)*)\s*[.:]?\s*", heading)
        section_number = num_match.group(1) if num_match else ""

        # Pop back to this heading's level, then record its parent.
        while len(parent_stack) > level:
            parent_stack.pop()
        parent = parent_stack[-1] if parent_stack else ""
        parent_stack.append(heading)

        sections.append(DSESection(
            heading=heading,
            heading_level=level,
            section_number=section_number,
            content=raw["content"][:2000],  # Cap content length
            html=raw["html"][:3000],
            parent_heading=parent,
            url=url,
            element_id=raw["element_id"],
            paragraph_count=raw["paragraph_count"],
        ))

    logger.info("Parsed DSE: %d sections from %s", len(sections), url)
    return sections


def _regex_fallback(html: str, url: str) -> list[DSESection]:
    """Fallback parser using regex when HTML parsing fails."""
    # Strip scripts and styles
    clean = re.sub(
        r"<(script|style)[^>]*>.*?</\1>", "", html,
        flags=re.DOTALL | re.IGNORECASE,
    )

    sections = []
    # Find all headings (lazy [^>]*? so an id attribute, if present, is captured)
    for match in re.finditer(
        r"<h([1-4])[^>]*?(?:id=[\"']([^\"']*)[\"'])?[^>]*>(.*?)</h\1>",
        clean,
        re.DOTALL | re.IGNORECASE,
    ):
        level = int(match.group(1))
        elem_id = match.group(2) or ""
        heading = re.sub(r"<[^>]+>", "", match.group(3)).strip()

        # Get content until the next heading
        start = match.end()
        next_heading = re.search(r"<h[1-4][^>]*>", clean[start:], re.IGNORECASE)
        end = start + next_heading.start() if next_heading else len(clean)
        content = re.sub(r"<[^>]+>", " ", clean[start:end])
        content = re.sub(r"\s+", " ", content).strip()

        num_match = re.match(r"^(\d+(?:\.\d+)*)", heading)
        sections.append(DSESection(
            heading=heading,
            heading_level=level,
            section_number=num_match.group(1) if num_match else "",
            content=content[:2000],
            html="",
            url=url,
            element_id=elem_id,
        ))

    return sections


def find_section_by_content(sections: list[DSESection], search_text: str) -> DSESection | None:
    """Find the section that contains specific text."""
    search_lower = search_text.lower()
    for section in sections:
        if search_lower in section.content.lower():
            return section
    return None


def find_section_by_category(sections: list[DSESection], category: str) -> DSESection | None:
    """Find the section most likely to cover a service category."""
    category_keywords = {
        "tracking": ["cookie", "tracking", "webanalyse", "analytics", "statistik"],
        "marketing": ["marketing", "werbung", "newsletter", "remarketing"],
        "payment": ["zahlung", "payment", "bezahlung", "zahlungsabwicklung"],
        "chatbot": ["chat", "kommunikation", "kundenservice", "kontakt"],
        "cdn": ["hosting", "bereitstellung", "technisch", "infrastruktur", "cdn"],
        "other": ["sonstig", "weitere", "dritte", "extern"],
    }
    keywords = category_keywords.get(category, category_keywords["other"])
    for section in sections:
        heading_lower = section.heading.lower()
        content_lower = section.content.lower()[:500]
        for kw in keywords:
            if kw in heading_lower or kw in content_lower:
                return section
    return None
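

# A minimal, self-contained usage sketch; the HTML and URL below are made up
# for illustration only.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    sample = (
        "<h1>Datenschutzerklärung</h1>"
        "<h2 id='webanalyse'>2.5 Webanalyse</h2>"
        "<p>Wir verwenden Cookies und Google Analytics.</p>"
    )
    parsed = parse_dse(sample, url="https://example.com/datenschutz")
    for s in parsed:
        print(f"{s.section_number or '-'} | {s.heading} (parent: {s.parent_heading or '-'})")
    hit = find_section_by_category(parsed, "tracking")
    print("tracking section:", hit.heading if hit else None)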