Services: Admin-Compliance, Backend-Compliance, AI-Compliance-SDK, Consent-SDK, Developer-Portal, PCA-Platform, DSMS Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
603 lines
22 KiB
Python
603 lines
22 KiB
Python
"""
|
||
PDF Extractor for BSI-TR-03161 and EU Regulation Documents.
|
||
|
||
This module extracts Pruefaspekte (test aspects) from BSI Technical Guidelines
|
||
and Articles from EU regulations in PDF format.
|
||
"""
|
||
|
||
import re
|
||
import logging
|
||
from dataclasses import dataclass, field
|
||
from typing import List, Optional, Dict, Any
|
||
from pathlib import Path
|
||
from enum import Enum
|
||
|
||
# PyMuPDF is an optional dependency: the module can still be imported without
# it (fitz is set to None), but the extractor classes check for None and raise
# ImportError before attempting any PDF work.
try:
    import fitz  # PyMuPDF
except ImportError:
    fitz = None
    logging.warning("PyMuPDF not installed. PDF extraction will not work.")
class RequirementLevel(str, Enum):
    """Normative strength of a BSI requirement (German: Anforderungsstufen).

    These mirror RFC-2119-style keywords in their German form; members are
    str-valued so they serialize directly.
    """

    MUSS = "MUSS"              # mandatory (MUST)
    SOLL = "SOLL"              # recommended (SHOULD)
    KANN = "KANN"              # optional (MAY)
    DARF_NICHT = "DARF NICHT"  # prohibited (MUST NOT)
class AspectCategory(str, Enum):
    """Topical categories assigned to BSI-TR Pruefaspekte.

    Values are lowercase snake_case strings suitable for storage and lookup.
    GENERAL is the fallback when no category can be determined; TEST_ASPECT
    covers generic "T."-prefixed aspect IDs.
    """

    AUTHENTICATION = "authentication"
    SESSION_MANAGEMENT = "session_management"
    CRYPTOGRAPHY = "cryptography"
    INPUT_VALIDATION = "input_validation"
    SQL_INJECTION = "sql_injection"
    XSS_PREVENTION = "xss_prevention"
    CSRF_PROTECTION = "csrf_protection"
    LOGGING_AUDIT = "logging_audit"
    ERROR_HANDLING = "error_handling"
    NETWORK_SECURITY = "network_security"
    SECURE_STORAGE = "secure_storage"
    PRIVACY = "privacy"
    ACCESS_CONTROL = "access_control"
    DATA_PROTECTION = "data_protection"
    KEY_MANAGEMENT = "key_management"
    SECURE_COMMUNICATION = "secure_communication"
    UPDATE_MECHANISM = "update_mechanism"
    GENERAL = "general"
    TEST_ASPECT = "test_aspect"
@dataclass
class BSIAspect:
    """A single extracted BSI-TR Pruefaspekt (test aspect).

    Instances are produced by BSIPDFExtractor; required fields come straight
    from the PDF match, optional fields are filled during enrichment.
    """

    aspect_id: str  # e.g., "O.Auth_1", "T.Sess_2" (uppercased during extraction)
    title: str  # Short title (text following the aspect ID)
    full_text: str  # Complete requirement text (whitespace-normalized)
    category: AspectCategory  # Categorization (from ID prefix or keyword scan)
    page_number: int  # PDF page where found (1-based)
    section: str  # Chapter/section number, e.g. "4.2.1" ("" if not found)
    requirement_level: RequirementLevel  # MUSS/SOLL/KANN/DARF NICHT
    source_document: str  # e.g., "BSI-TR-03161-2"
    context_before: str = ""  # Up to 100 chars of text preceding the aspect
    context_after: str = ""  # Up to 200 chars of text following the aspect
    related_aspects: List[str] = field(default_factory=list)  # IDs of other aspects mentioned in full_text
    keywords: List[str] = field(default_factory=list)  # Category keywords found in full_text (max 10)
@dataclass
class EUArticle:
    """A single extracted EU regulation article.

    Produced by EURegulationExtractor, one instance per article heading found.
    """

    article_number: str  # e.g., "Art. 32", "Artikel 5"
    title: str  # Article title (first line after the heading)
    full_text: str  # Complete article text as found on the page
    paragraphs: List[str]  # Individual numbered paragraphs, e.g. "(1) ..."
    page_number: int  # PDF page (1-based)
    regulation_name: str  # e.g., "DSGVO", "AI Act"
    recitals: List[str] = field(default_factory=list)  # Related recitals (not filled by the extractor yet)
    keywords: List[str] = field(default_factory=list)  # Extracted keywords (not filled by the extractor yet)
class BSIPDFExtractor:
    """
    Extracts Pruefaspekte from BSI-TR-03161 PDF documents.

    The BSI-TR-03161 series contains security requirements for mobile applications:
    - Part 1: General security requirements
    - Part 2: Web application security (OAuth, Sessions, Input validation, etc.)
    - Part 3: Backend/server security

    Each document contains hundreds of Pruefaspekte (test aspects) that need to
    be extracted, categorized, and stored for compliance tracking.
    """

    # Regex patterns for BSI-TR aspect identification
    PATTERNS = {
        # Primary aspect ID patterns (e.g., O.Auth_1, T.Network_2)
        'aspect_id': r'(O\.[A-Za-z]+_\d+|T\.[A-Za-z]+_\d+)',

        # Alternative section-based pattern (e.g., "Pruefaspekt 4.2.1")
        'section_aspect': r'(?:Prüfaspekt|Pruefaspekt|Anforderung)\s+(\d+\.\d+(?:\.\d+)?)',

        # Section number pattern
        'section': r'(\d+\.\d+(?:\.\d+)?)',

        # Requirement level pattern
        'requirement': r'\b(MUSS|SOLL|KANN|DARF\s+NICHT|muss|soll|kann|darf\s+nicht)\b',

        # Table header pattern for Pruefaspekte tables
        'table_header': r'(?:Prüfaspekt|Bezeichnung|ID|Anforderung)',
    }

    # Category mapping based on aspect ID prefix. Matched with startswith()
    # in insertion order; the generic 'T.' entry is last on purpose.
    CATEGORY_MAP = {
        'O.Auth': AspectCategory.AUTHENTICATION,
        'O.Sess': AspectCategory.SESSION_MANAGEMENT,
        'O.Cryp': AspectCategory.CRYPTOGRAPHY,
        'O.Crypto': AspectCategory.CRYPTOGRAPHY,
        'O.Input': AspectCategory.INPUT_VALIDATION,
        'O.SQL': AspectCategory.SQL_INJECTION,
        'O.XSS': AspectCategory.XSS_PREVENTION,
        'O.CSRF': AspectCategory.CSRF_PROTECTION,
        'O.Log': AspectCategory.LOGGING_AUDIT,
        'O.Audit': AspectCategory.LOGGING_AUDIT,
        'O.Err': AspectCategory.ERROR_HANDLING,
        'O.Error': AspectCategory.ERROR_HANDLING,
        'O.Net': AspectCategory.NETWORK_SECURITY,
        'O.Network': AspectCategory.NETWORK_SECURITY,
        'O.Store': AspectCategory.SECURE_STORAGE,
        'O.Storage': AspectCategory.SECURE_STORAGE,
        'O.Priv': AspectCategory.PRIVACY,
        'O.Privacy': AspectCategory.PRIVACY,
        'O.Data': AspectCategory.DATA_PROTECTION,
        'O.Access': AspectCategory.ACCESS_CONTROL,
        'O.Key': AspectCategory.KEY_MANAGEMENT,
        'O.Comm': AspectCategory.SECURE_COMMUNICATION,
        'O.TLS': AspectCategory.SECURE_COMMUNICATION,
        'O.Update': AspectCategory.UPDATE_MECHANISM,
        'T.': AspectCategory.TEST_ASPECT,
    }

    # Keywords for category detection when aspect ID is ambiguous.
    # Mixed German/English terms, all lowercase (matched against lowercased text).
    CATEGORY_KEYWORDS = {
        AspectCategory.AUTHENTICATION: [
            'authentifizierung', 'authentication', 'login', 'anmeldung',
            'passwort', 'password', 'credential', 'oauth', 'oidc', 'token',
            'bearer', 'jwt', 'session', 'multi-faktor', 'mfa', '2fa'
        ],
        AspectCategory.SESSION_MANAGEMENT: [
            'session', 'sitzung', 'cookie', 'timeout', 'ablauf',
            'session-id', 'sessionid', 'logout', 'abmeldung'
        ],
        AspectCategory.CRYPTOGRAPHY: [
            'verschlüsselung', 'encryption', 'kryptograph', 'cryptograph',
            'aes', 'rsa', 'hash', 'signatur', 'signature', 'zertifikat',
            'certificate', 'tls', 'ssl', 'hmac', 'pbkdf', 'argon'
        ],
        AspectCategory.INPUT_VALIDATION: [
            'eingabevalidierung', 'input validation', 'validierung',
            'eingabeprüfung', 'sanitiz', 'whitelist', 'blacklist',
            'filter', 'escape', 'encoding'
        ],
        AspectCategory.SQL_INJECTION: [
            'sql injection', 'sql-injection', 'prepared statement',
            'parameterisiert', 'parameterized', 'orm', 'database'
        ],
        AspectCategory.XSS_PREVENTION: [
            'xss', 'cross-site scripting', 'script injection',
            'html encoding', 'output encoding', 'csp', 'content-security'
        ],
        AspectCategory.CSRF_PROTECTION: [
            'csrf', 'cross-site request', 'token', 'anti-csrf',
            'state parameter', 'same-site', 'samesite'
        ],
        AspectCategory.LOGGING_AUDIT: [
            'logging', 'protokollierung', 'audit', 'nachvollziehbar',
            'traceability', 'log', 'event', 'monitoring'
        ],
        AspectCategory.ERROR_HANDLING: [
            'fehlerbehandlung', 'error handling', 'exception',
            'fehlermeldung', 'error message', 'stack trace'
        ],
    }

    def __init__(self, logger: Optional[logging.Logger] = None):
        """Initialize the PDF extractor.

        Args:
            logger: Optional logger; defaults to this module's logger.

        Raises:
            ImportError: If PyMuPDF (fitz) is not installed.
        """
        self.logger = logger or logging.getLogger(__name__)

        if fitz is None:
            raise ImportError(
                "PyMuPDF is required for PDF extraction. "
                "Install it with: pip install PyMuPDF"
            )

    def extract_from_file(self, pdf_path: str, source_name: Optional[str] = None) -> List[BSIAspect]:
        """
        Extract all Pruefaspekte from a BSI-TR PDF file.

        Args:
            pdf_path: Path to the PDF file
            source_name: Optional source document name (auto-detected if not provided)

        Returns:
            List of extracted BSIAspect objects

        Raises:
            FileNotFoundError: If pdf_path does not exist.
        """
        path = Path(pdf_path)
        if not path.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        source = source_name or path.stem
        self.logger.info(f"Extracting aspects from: {source}")

        aspects: List[BSIAspect] = []

        # BUG FIX: previously the document was opened and closed manually, so
        # an exception during page processing leaked the file handle. The
        # context manager guarantees the document is closed on every path.
        with fitz.open(pdf_path) as doc:
            for page_num in range(len(doc)):
                page = doc[page_num]
                text = page.get_text()

                # Extract aspects from this page
                page_aspects = self._extract_aspects_from_text(
                    text=text,
                    page_num=page_num + 1,  # report human-readable 1-based pages
                    source_document=source
                )
                aspects.extend(page_aspects)

        # Post-process: deduplicate and enrich
        aspects = self._deduplicate_aspects(aspects)
        aspects = self._enrich_aspects(aspects)

        self.logger.info(f"Extracted {len(aspects)} unique aspects from {source}")
        return aspects

    def extract_all_documents(self, docs_dir: str) -> Dict[str, List[BSIAspect]]:
        """
        Extract aspects from all BSI-TR PDFs in a directory.

        Args:
            docs_dir: Directory containing BSI-TR PDF files

        Returns:
            Dictionary mapping document names (file stems) to their extracted
            aspects. Files that fail to parse are logged and skipped.
        """
        docs_path = Path(docs_dir)
        results: Dict[str, List[BSIAspect]] = {}

        # Look for BSI-TR PDFs (both upper- and lower-case file names)
        patterns = ["BSI-TR-03161*.pdf", "bsi-tr-03161*.pdf"]

        for pattern in patterns:
            for pdf_file in docs_path.glob(pattern):
                try:
                    aspects = self.extract_from_file(str(pdf_file))
                    results[pdf_file.stem] = aspects
                except Exception as e:
                    # Best-effort batch mode: one broken PDF must not abort the run.
                    self.logger.error(f"Failed to extract from {pdf_file}: {e}")

        return results

    def _extract_aspects_from_text(
        self,
        text: str,
        page_num: int,
        source_document: str
    ) -> List[BSIAspect]:
        """Extract all Pruefaspekte from a page's text.

        Two passes: first explicit aspect IDs (O.*/T.*), then section-based
        headings ("Prüfaspekt 4.2.1") that were not already covered.
        """
        aspects: List[BSIAspect] = []

        # Pass 1: find all explicit aspect IDs on this page
        for match in re.finditer(self.PATTERNS['aspect_id'], text, re.IGNORECASE):
            aspect_id = match.group(1).upper()

            # Extract a context window around the match (IDs usually precede
            # the requirement text, hence more context after than before)
            start = max(0, match.start() - 200)
            end = min(len(text), match.end() + 1000)
            context = text[start:end]

            # Determine category from aspect ID (fallback: keyword scan)
            category = self._determine_category(aspect_id, context)

            # Extract requirement level (MUSS/SOLL/KANN/DARF NICHT)
            req_level = self._extract_requirement_level(context)

            # Extract title (text immediately after aspect ID)
            title = self._extract_title(context, aspect_id)

            # Extract section number
            section = self._extract_section(context)

            # Extract full requirement text
            full_text = self._extract_full_text(context, aspect_id)

            aspects.append(BSIAspect(
                aspect_id=aspect_id,
                title=title,
                full_text=full_text,
                category=category,
                page_number=page_num,
                section=section,
                requirement_level=req_level,
                source_document=source_document,
                context_before=text[start:match.start()].strip()[-100:],
                context_after=text[match.end():end].strip()[:200],
            ))

        # Pass 2: section-based aspects without an explicit O.*/T.* ID
        for match in re.finditer(self.PATTERNS['section_aspect'], text, re.IGNORECASE):
            section_id = match.group(1)
            aspect_id = f"SEC_{section_id.replace('.', '_')}"

            # Skip if we already captured this section via an O.*/T.* aspect
            if any(a.section == section_id for a in aspects):
                continue

            start = max(0, match.start() - 100)
            end = min(len(text), match.end() + 800)
            context = text[start:end]

            category = self._determine_category_from_keywords(context)
            req_level = self._extract_requirement_level(context)

            aspects.append(BSIAspect(
                aspect_id=aspect_id,
                title=f"Prüfaspekt {section_id}",
                full_text=context.strip(),
                category=category,
                page_number=page_num,
                section=section_id,
                requirement_level=req_level,
                source_document=source_document,
            ))

        return aspects

    def _determine_category(self, aspect_id: str, context: str) -> AspectCategory:
        """Determine the category of an aspect based on its ID and context.

        Tries the ID-prefix map first; falls back to keyword scoring.
        """
        # First try to match by aspect ID prefix (case-insensitive)
        for prefix, category in self.CATEGORY_MAP.items():
            if aspect_id.upper().startswith(prefix.upper()):
                return category

        # Fall back to keyword-based detection
        return self._determine_category_from_keywords(context)

    def _determine_category_from_keywords(self, text: str) -> AspectCategory:
        """Determine category by counting keyword hits per category.

        Returns the category with the most hits, or GENERAL if none match.
        """
        text_lower = text.lower()

        category_scores: Dict[AspectCategory, int] = {}
        for category, keywords in self.CATEGORY_KEYWORDS.items():
            score = sum(1 for kw in keywords if kw in text_lower)
            if score > 0:
                category_scores[category] = score

        if category_scores:
            return max(category_scores, key=category_scores.get)

        return AspectCategory.GENERAL

    def _extract_requirement_level(self, text: str) -> RequirementLevel:
        """Extract the requirement level from text.

        Uses the first keyword occurrence in the context window; defaults to
        SOLL when no keyword is found.
        """
        match = re.search(self.PATTERNS['requirement'], text, re.IGNORECASE)
        if match:
            level = match.group(1).upper()
            # "DARF NICHT" may contain arbitrary whitespace between the words
            if 'DARF' in level and 'NICHT' in level:
                return RequirementLevel.DARF_NICHT
            elif level == 'MUSS':
                return RequirementLevel.MUSS
            elif level == 'SOLL':
                return RequirementLevel.SOLL
            elif level == 'KANN':
                return RequirementLevel.KANN

        return RequirementLevel.SOLL  # Default

    def _extract_title(self, context: str, aspect_id: str) -> str:
        """Extract the title/short description of an aspect.

        Returns the aspect ID itself when no title line can be found.
        """
        # Look for text immediately after the aspect ID (optionally separated
        # by a colon or dash)
        pattern = rf'{re.escape(aspect_id)}\s*[:\-–]?\s*([^\n]+)'
        match = re.search(pattern, context, re.IGNORECASE)

        if match:
            title = match.group(1).strip()
            # Collapse internal whitespace
            title = re.sub(r'\s+', ' ', title)
            # Truncate if too long
            if len(title) > 200:
                title = title[:197] + "..."
            return title

        return aspect_id

    def _extract_section(self, context: str) -> str:
        """Extract the first section number (e.g. "4.2.1") from context."""
        match = re.search(self.PATTERNS['section'], context)
        return match.group(1) if match else ""

    def _extract_full_text(self, context: str, aspect_id: str) -> str:
        """Extract the complete requirement text for an aspect.

        Captures from the aspect ID line until the next aspect ID or section
        heading; falls back to the whole context window if that fails.
        """
        pattern = rf'{re.escape(aspect_id)}[^\n]*\n(.*?)(?=\n\s*(?:O\.[A-Z]|T\.[A-Z]|\d+\.\d+\s|\Z))'
        match = re.search(pattern, context, re.IGNORECASE | re.DOTALL)

        if match:
            # group(0) deliberately includes the ID/title line itself
            full_text = match.group(0).strip()
        else:
            # Fall back to context
            full_text = context.strip()

        # Normalize all whitespace runs to single spaces
        full_text = re.sub(r'\s+', ' ', full_text)
        return full_text

    def _deduplicate_aspects(self, aspects: List[BSIAspect]) -> List[BSIAspect]:
        """Remove duplicate aspects, keeping the variant with the longest text."""
        seen: Dict[str, BSIAspect] = {}

        for aspect in aspects:
            key = aspect.aspect_id
            if key not in seen:
                seen[key] = aspect
            else:
                # Keep the one with longer full_text (more complete extraction)
                if len(aspect.full_text) > len(seen[key].full_text):
                    seen[key] = aspect

        return list(seen.values())

    def _enrich_aspects(self, aspects: List[BSIAspect]) -> List[BSIAspect]:
        """Enrich aspects in place with cross-references and keywords."""
        aspect_ids = {a.aspect_id for a in aspects}

        for aspect in aspects:
            # Find related aspects mentioned in the full text
            for other_id in aspect_ids:
                if other_id != aspect.aspect_id and other_id in aspect.full_text:
                    aspect.related_aspects.append(other_id)

            # Extract keywords based on category
            aspect.keywords = self._extract_keywords(aspect)

        return aspects

    def _extract_keywords(self, aspect: BSIAspect) -> List[str]:
        """Return up to 10 category keywords actually present in the aspect text."""
        keywords: List[str] = []
        text_lower = aspect.full_text.lower()

        # Only keywords belonging to the aspect's own category are considered
        if aspect.category in self.CATEGORY_KEYWORDS:
            for kw in self.CATEGORY_KEYWORDS[aspect.category]:
                if kw in text_lower:
                    keywords.append(kw)

        return list(set(keywords))[:10]  # Limit to 10 keywords

    def get_statistics(self, aspects: List[BSIAspect]) -> Dict[str, Any]:
        """Get summary statistics about extracted aspects.

        Returns a dict with total count and per-category / per-level /
        per-source counts. "unique_sections" is returned as an int count.
        """
        stats: Dict[str, Any] = {
            "total_aspects": len(aspects),
            "by_category": {},
            "by_requirement_level": {},
            "by_source": {},
            "unique_sections": set(),  # converted to a count below
        }

        for aspect in aspects:
            # By category
            cat = aspect.category.value
            stats["by_category"][cat] = stats["by_category"].get(cat, 0) + 1

            # By requirement level
            level = aspect.requirement_level.value
            stats["by_requirement_level"][level] = stats["by_requirement_level"].get(level, 0) + 1

            # By source
            src = aspect.source_document
            stats["by_source"][src] = stats["by_source"].get(src, 0) + 1

            # Unique sections
            if aspect.section:
                stats["unique_sections"].add(aspect.section)

        stats["unique_sections"] = len(stats["unique_sections"])
        return stats
class EURegulationExtractor:
    """
    Extracts Articles from EU Regulation PDF documents.

    Handles documents like GDPR, AI Act, CRA, etc. in their official formats.
    """

    PATTERNS = {
        'article_de': r'Artikel\s+(\d+)',
        'article_en': r'Article\s+(\d+)',
        'paragraph': r'\((\d+)\)',
        'recital': r'Erwägungsgrund\s+(\d+)|Recital\s+(\d+)',
    }

    def __init__(self, logger: Optional[logging.Logger] = None):
        """Initialize the extractor with an optional logger."""
        self.logger = logger or logging.getLogger(__name__)

    def extract_from_file(
        self,
        pdf_path: str,
        regulation_name: str,
        language: str = "de"
    ) -> List["EUArticle"]:
        """Extract all articles from an EU regulation PDF.

        Args:
            pdf_path: Path to the regulation PDF.
            regulation_name: Human-readable name, e.g. "DSGVO" or "AI Act".
            language: "de" matches "Artikel N" headings; any other value
                matches the English "Article N" form.

        Returns:
            Deduplicated list of EUArticle objects.

        Raises:
            ImportError: If PyMuPDF is not installed.
            FileNotFoundError: If pdf_path does not exist.

        Note:
            Articles are detected per page, so the text of an article that
            spans a page break is cut off at the page boundary.
        """
        if fitz is None:
            raise ImportError("PyMuPDF is required for PDF extraction.")

        path = Path(pdf_path)
        if not path.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        article_pattern = (
            self.PATTERNS['article_de'] if language == "de"
            else self.PATTERNS['article_en']
        )

        articles: List["EUArticle"] = []

        # BUG FIX: the document was previously closed manually, leaking the
        # handle if parsing raised; the context manager closes it on all paths.
        with fitz.open(pdf_path) as doc:
            for page_num in range(len(doc)):
                page = doc[page_num]
                text = page.get_text()

                # Find article headings on this page
                for match in re.finditer(article_pattern, text):
                    article_num = match.group(1)

                    # Article text runs until the next heading on this page,
                    # or to the end of the page
                    start = match.start()
                    next_match = re.search(article_pattern, text[match.end():])
                    end = match.end() + next_match.start() if next_match else len(text)

                    article_text = text[start:end].strip()

                    # Extract numbered paragraphs and the title line
                    paragraphs = self._extract_paragraphs(article_text)
                    title = self._extract_article_title(article_text, article_num)

                    articles.append(EUArticle(
                        article_number=f"Art. {article_num}",
                        title=title,
                        full_text=article_text,
                        paragraphs=paragraphs,
                        page_number=page_num + 1,
                        regulation_name=regulation_name,
                    ))

        return self._deduplicate_articles(articles)

    def _extract_paragraphs(self, text: str) -> List[str]:
        """Split article text into its numbered "(n)" paragraphs.

        Each returned paragraph starts at its "(n)" marker and runs to the
        next marker (or end of text).
        """
        paragraphs: List[str] = []
        matches = list(re.finditer(self.PATTERNS['paragraph'], text))

        for i, match in enumerate(matches):
            start = match.start()
            end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
            para_text = text[start:end].strip()
            if para_text:
                paragraphs.append(para_text)

        return paragraphs

    def _extract_article_title(self, text: str, article_num: str) -> str:
        """Extract the title line that follows the article heading.

        BUG FIX: the heading regex now accepts both the German "Artikel" and
        the English "Article" keyword. Previously only "Artikel" matched, so
        articles extracted from English documents (language != "de") always
        fell back to the placeholder title.
        """
        pattern = rf'(?:Artikel|Article)\s+{article_num}\s*\n\s*([^\n]+)'
        match = re.search(pattern, text)

        if match:
            return match.group(1).strip()

        # Fallback keeps the historical placeholder format
        return f"Artikel {article_num}"

    def _deduplicate_articles(self, articles: List["EUArticle"]) -> List["EUArticle"]:
        """Remove duplicate articles, keeping the variant with the longest text."""
        seen: Dict[str, "EUArticle"] = {}

        for article in articles:
            key = article.article_number
            if key not in seen:
                seen[key] = article
            else:
                if len(article.full_text) > len(seen[key].full_text):
                    seen[key] = article

        return list(seen.values())
|