fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
602
backend/compliance/services/pdf_extractor.py
Normal file
602
backend/compliance/services/pdf_extractor.py
Normal file
@@ -0,0 +1,602 @@
|
||||
"""
|
||||
PDF Extractor for BSI-TR-03161 and EU Regulation Documents.
|
||||
|
||||
This module extracts Pruefaspekte (test aspects) from BSI Technical Guidelines
|
||||
and Articles from EU regulations in PDF format.
|
||||
"""
|
||||
|
||||
import re
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional, Dict, Any
|
||||
from pathlib import Path
|
||||
from enum import Enum
|
||||
|
||||
try:
    import fitz  # PyMuPDF
except ImportError:
    # Soft dependency: set the module name to None instead of failing at
    # import time, so this module stays importable (e.g. for its dataclasses
    # and enums) on systems without PyMuPDF. Extractor constructors raise
    # ImportError later if PDF extraction is actually attempted.
    fitz = None
    logging.warning("PyMuPDF not installed. PDF extraction will not work.")
|
||||
|
||||
|
||||
class RequirementLevel(str, Enum):
    """BSI requirement levels (German: Anforderungsstufen).

    These mirror the normative keywords used in BSI Technical Guidelines,
    analogous to RFC-2119 MUST/SHOULD/MAY/MUST NOT. Inherits from ``str``
    so members compare equal to, and serialize as, their plain string value.
    """

    MUSS = "MUSS"  # MUST - mandatory
    SOLL = "SOLL"  # SHOULD - recommended
    KANN = "KANN"  # MAY - optional
    DARF_NICHT = "DARF NICHT"  # MUST NOT - prohibited
|
||||
|
||||
|
||||
class AspectCategory(str, Enum):
    """Categories for BSI-TR Pruefaspekte.

    Used to bucket extracted test aspects for compliance reporting.
    Inherits from ``str`` so members serialize as their plain string value.
    ``GENERAL`` is the fallback when neither the aspect-ID prefix nor
    keyword scoring yields a match; ``TEST_ASPECT`` covers ``T.*`` IDs.
    """

    AUTHENTICATION = "authentication"
    SESSION_MANAGEMENT = "session_management"
    CRYPTOGRAPHY = "cryptography"
    INPUT_VALIDATION = "input_validation"
    SQL_INJECTION = "sql_injection"
    XSS_PREVENTION = "xss_prevention"
    CSRF_PROTECTION = "csrf_protection"
    LOGGING_AUDIT = "logging_audit"
    ERROR_HANDLING = "error_handling"
    NETWORK_SECURITY = "network_security"
    SECURE_STORAGE = "secure_storage"
    PRIVACY = "privacy"
    ACCESS_CONTROL = "access_control"
    DATA_PROTECTION = "data_protection"
    KEY_MANAGEMENT = "key_management"
    SECURE_COMMUNICATION = "secure_communication"
    UPDATE_MECHANISM = "update_mechanism"
    GENERAL = "general"
    TEST_ASPECT = "test_aspect"
|
||||
|
||||
|
||||
@dataclass
class BSIAspect:
    """A single extracted BSI-TR Pruefaspekt (test aspect).

    Produced by :class:`BSIPDFExtractor`; carries the requirement text
    plus provenance (page, section, source document) and enrichment
    metadata (related aspect IDs, keywords) added in post-processing.
    """

    aspect_id: str  # e.g., "O.Auth_1", "T.Sess_2"
    title: str  # Short title
    full_text: str  # Complete requirement text
    category: AspectCategory  # Categorization
    page_number: int  # PDF page where found (1-based)
    section: str  # Chapter/section number
    requirement_level: RequirementLevel  # MUSS/SOLL/KANN
    source_document: str  # e.g., "BSI-TR-03161-2"
    context_before: str = ""  # Text before the aspect
    context_after: str = ""  # Text after the aspect
    related_aspects: List[str] = field(default_factory=list)  # Related aspect IDs
    keywords: List[str] = field(default_factory=list)  # Extracted keywords
|
||||
|
||||
|
||||
@dataclass
class EUArticle:
    """A single extracted EU regulation article.

    Produced by :class:`EURegulationExtractor` for documents such as
    GDPR/DSGVO, the AI Act, or the CRA.
    """

    article_number: str  # e.g., "Art. 32", "Artikel 5"
    title: str  # Article title
    full_text: str  # Complete article text
    paragraphs: List[str]  # Individual paragraphs
    page_number: int  # PDF page (1-based)
    regulation_name: str  # e.g., "DSGVO", "AI Act"
    recitals: List[str] = field(default_factory=list)  # Related recitals
    keywords: List[str] = field(default_factory=list)  # Extracted keywords
|
||||
|
||||
|
||||
class BSIPDFExtractor:
    """
    Extracts Pruefaspekte from BSI-TR-03161 PDF documents.

    The BSI-TR-03161 series contains security requirements for mobile applications:
    - Part 1: General security requirements
    - Part 2: Web application security (OAuth, Sessions, Input validation, etc.)
    - Part 3: Backend/server security

    Each document contains hundreds of Pruefaspekte (test aspects) that need to
    be extracted, categorized, and stored for compliance tracking.
    """

    # Regex patterns for BSI-TR aspect identification
    PATTERNS = {
        # Primary aspect ID patterns (e.g., O.Auth_1, T.Network_2)
        'aspect_id': r'(O\.[A-Za-z]+_\d+|T\.[A-Za-z]+_\d+)',

        # Alternative section-based pattern (e.g., "Pruefaspekt 4.2.1")
        'section_aspect': r'(?:Prüfaspekt|Pruefaspekt|Anforderung)\s+(\d+\.\d+(?:\.\d+)?)',

        # Section number pattern
        'section': r'(\d+\.\d+(?:\.\d+)?)',

        # Requirement level pattern
        'requirement': r'\b(MUSS|SOLL|KANN|DARF\s+NICHT|muss|soll|kann|darf\s+nicht)\b',

        # Table header pattern for Pruefaspekte tables
        'table_header': r'(?:Prüfaspekt|Bezeichnung|ID|Anforderung)',
    }

    # Category mapping based on aspect ID prefix (checked in insertion order;
    # first matching prefix wins, so 'T.' acts as a catch-all for test aspects)
    CATEGORY_MAP = {
        'O.Auth': AspectCategory.AUTHENTICATION,
        'O.Sess': AspectCategory.SESSION_MANAGEMENT,
        'O.Cryp': AspectCategory.CRYPTOGRAPHY,
        'O.Crypto': AspectCategory.CRYPTOGRAPHY,
        'O.Input': AspectCategory.INPUT_VALIDATION,
        'O.SQL': AspectCategory.SQL_INJECTION,
        'O.XSS': AspectCategory.XSS_PREVENTION,
        'O.CSRF': AspectCategory.CSRF_PROTECTION,
        'O.Log': AspectCategory.LOGGING_AUDIT,
        'O.Audit': AspectCategory.LOGGING_AUDIT,
        'O.Err': AspectCategory.ERROR_HANDLING,
        'O.Error': AspectCategory.ERROR_HANDLING,
        'O.Net': AspectCategory.NETWORK_SECURITY,
        'O.Network': AspectCategory.NETWORK_SECURITY,
        'O.Store': AspectCategory.SECURE_STORAGE,
        'O.Storage': AspectCategory.SECURE_STORAGE,
        'O.Priv': AspectCategory.PRIVACY,
        'O.Privacy': AspectCategory.PRIVACY,
        'O.Data': AspectCategory.DATA_PROTECTION,
        'O.Access': AspectCategory.ACCESS_CONTROL,
        'O.Key': AspectCategory.KEY_MANAGEMENT,
        'O.Comm': AspectCategory.SECURE_COMMUNICATION,
        'O.TLS': AspectCategory.SECURE_COMMUNICATION,
        'O.Update': AspectCategory.UPDATE_MECHANISM,
        'T.': AspectCategory.TEST_ASPECT,
    }

    # Keywords for category detection when aspect ID is ambiguous
    CATEGORY_KEYWORDS = {
        AspectCategory.AUTHENTICATION: [
            'authentifizierung', 'authentication', 'login', 'anmeldung',
            'passwort', 'password', 'credential', 'oauth', 'oidc', 'token',
            'bearer', 'jwt', 'session', 'multi-faktor', 'mfa', '2fa'
        ],
        AspectCategory.SESSION_MANAGEMENT: [
            'session', 'sitzung', 'cookie', 'timeout', 'ablauf',
            'session-id', 'sessionid', 'logout', 'abmeldung'
        ],
        AspectCategory.CRYPTOGRAPHY: [
            'verschlüsselung', 'encryption', 'kryptograph', 'cryptograph',
            'aes', 'rsa', 'hash', 'signatur', 'signature', 'zertifikat',
            'certificate', 'tls', 'ssl', 'hmac', 'pbkdf', 'argon'
        ],
        AspectCategory.INPUT_VALIDATION: [
            'eingabevalidierung', 'input validation', 'validierung',
            'eingabeprüfung', 'sanitiz', 'whitelist', 'blacklist',
            'filter', 'escape', 'encoding'
        ],
        AspectCategory.SQL_INJECTION: [
            'sql injection', 'sql-injection', 'prepared statement',
            'parameterisiert', 'parameterized', 'orm', 'database'
        ],
        AspectCategory.XSS_PREVENTION: [
            'xss', 'cross-site scripting', 'script injection',
            'html encoding', 'output encoding', 'csp', 'content-security'
        ],
        AspectCategory.CSRF_PROTECTION: [
            'csrf', 'cross-site request', 'token', 'anti-csrf',
            'state parameter', 'same-site', 'samesite'
        ],
        AspectCategory.LOGGING_AUDIT: [
            'logging', 'protokollierung', 'audit', 'nachvollziehbar',
            'traceability', 'log', 'event', 'monitoring'
        ],
        AspectCategory.ERROR_HANDLING: [
            'fehlerbehandlung', 'error handling', 'exception',
            'fehlermeldung', 'error message', 'stack trace'
        ],
    }

    def __init__(self, logger: Optional[logging.Logger] = None):
        """Initialize the PDF extractor.

        Args:
            logger: Optional logger; defaults to this module's logger.

        Raises:
            ImportError: If PyMuPDF (``fitz``) is not installed.
        """
        self.logger = logger or logging.getLogger(__name__)

        if fitz is None:
            raise ImportError(
                "PyMuPDF is required for PDF extraction. "
                "Install it with: pip install PyMuPDF"
            )

    def extract_from_file(self, pdf_path: str, source_name: Optional[str] = None) -> List[BSIAspect]:
        """
        Extract all Pruefaspekte from a BSI-TR PDF file.

        Args:
            pdf_path: Path to the PDF file
            source_name: Optional source document name (auto-detected if not provided)

        Returns:
            List of extracted BSIAspect objects (deduplicated and enriched)

        Raises:
            FileNotFoundError: If the PDF file does not exist.
        """
        path = Path(pdf_path)
        if not path.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        source = source_name or path.stem
        self.logger.info(f"Extracting aspects from: {source}")

        doc = fitz.open(pdf_path)
        aspects = []

        try:
            for page_num in range(len(doc)):
                page = doc[page_num]
                text = page.get_text()

                # Extract aspects from this page (pages are 1-based in output)
                page_aspects = self._extract_aspects_from_text(
                    text=text,
                    page_num=page_num + 1,
                    source_document=source
                )
                aspects.extend(page_aspects)
        finally:
            # BUGFIX: previously the document was only closed on the success
            # path, leaking the file handle if extraction raised mid-way.
            doc.close()

        # Post-process: deduplicate and enrich
        aspects = self._deduplicate_aspects(aspects)
        aspects = self._enrich_aspects(aspects)

        self.logger.info(f"Extracted {len(aspects)} unique aspects from {source}")
        return aspects

    def extract_all_documents(self, docs_dir: str) -> Dict[str, List[BSIAspect]]:
        """
        Extract aspects from all BSI-TR PDFs in a directory.

        Args:
            docs_dir: Directory containing BSI-TR PDF files

        Returns:
            Dictionary mapping document names (file stems) to their extracted
            aspects. Documents that fail to parse are logged and skipped.
        """
        docs_path = Path(docs_dir)
        results = {}

        # Look for BSI-TR PDFs (both filename casings)
        patterns = ["BSI-TR-03161*.pdf", "bsi-tr-03161*.pdf"]

        for pattern in patterns:
            for pdf_file in docs_path.glob(pattern):
                try:
                    aspects = self.extract_from_file(str(pdf_file))
                    results[pdf_file.stem] = aspects
                except Exception as e:
                    # Best-effort batch processing: one bad PDF must not
                    # abort the whole directory scan.
                    self.logger.error(f"Failed to extract from {pdf_file}: {e}")

        return results

    def _extract_aspects_from_text(
        self,
        text: str,
        page_num: int,
        source_document: str
    ) -> List[BSIAspect]:
        """Extract all Pruefaspekte from a page's text.

        Two passes: first explicit aspect IDs (O.*/T.*), then section-based
        headings ("Prüfaspekt 4.2.1") that were not already covered.
        """
        aspects = []

        # Find all aspect IDs on this page
        for match in re.finditer(self.PATTERNS['aspect_id'], text, re.IGNORECASE):
            aspect_id = match.group(1).upper()

            # Extract context around the match (200 chars before, 1000 after)
            start = max(0, match.start() - 200)
            end = min(len(text), match.end() + 1000)
            context = text[start:end]

            # Determine category from aspect ID
            category = self._determine_category(aspect_id, context)

            # Extract requirement level
            req_level = self._extract_requirement_level(context)

            # Extract title (text immediately after aspect ID)
            title = self._extract_title(context, aspect_id)

            # Extract section number
            section = self._extract_section(context)

            # Extract full requirement text
            full_text = self._extract_full_text(context, aspect_id)

            aspects.append(BSIAspect(
                aspect_id=aspect_id,
                title=title,
                full_text=full_text,
                category=category,
                page_number=page_num,
                section=section,
                requirement_level=req_level,
                source_document=source_document,
                context_before=text[start:match.start()].strip()[-100:],
                context_after=text[match.end():end].strip()[:200],
            ))

        # Also look for section-based aspects
        for match in re.finditer(self.PATTERNS['section_aspect'], text, re.IGNORECASE):
            section_id = match.group(1)
            aspect_id = f"SEC_{section_id.replace('.', '_')}"

            # Check if we already have this as an O.* aspect
            if any(a.section == section_id for a in aspects):
                continue

            start = max(0, match.start() - 100)
            end = min(len(text), match.end() + 800)
            context = text[start:end]

            category = self._determine_category_from_keywords(context)
            req_level = self._extract_requirement_level(context)

            aspects.append(BSIAspect(
                aspect_id=aspect_id,
                title=f"Prüfaspekt {section_id}",
                full_text=context.strip(),
                category=category,
                page_number=page_num,
                section=section_id,
                requirement_level=req_level,
                source_document=source_document,
            ))

        return aspects

    def _determine_category(self, aspect_id: str, context: str) -> AspectCategory:
        """Determine the category of an aspect based on its ID and context."""
        # First try to match by aspect ID prefix
        for prefix, category in self.CATEGORY_MAP.items():
            if aspect_id.upper().startswith(prefix.upper()):
                return category

        # Fall back to keyword-based detection
        return self._determine_category_from_keywords(context)

    def _determine_category_from_keywords(self, text: str) -> AspectCategory:
        """Determine category based on keywords in the text.

        Scores each category by how many of its keywords occur in the text
        and returns the highest-scoring one, or GENERAL if none match.
        """
        text_lower = text.lower()

        category_scores = {}
        for category, keywords in self.CATEGORY_KEYWORDS.items():
            score = sum(1 for kw in keywords if kw in text_lower)
            if score > 0:
                category_scores[category] = score

        if category_scores:
            return max(category_scores, key=category_scores.get)

        return AspectCategory.GENERAL

    def _extract_requirement_level(self, text: str) -> RequirementLevel:
        """Extract the requirement level from text.

        Returns the first MUSS/SOLL/KANN/DARF NICHT keyword found,
        defaulting to SOLL when none is present.
        """
        match = re.search(self.PATTERNS['requirement'], text, re.IGNORECASE)
        if match:
            level = match.group(1).upper()
            # 'DARF\s+NICHT' may match with arbitrary whitespace between
            # the words, so test for both tokens rather than exact equality.
            if 'DARF' in level and 'NICHT' in level:
                return RequirementLevel.DARF_NICHT
            elif level == 'MUSS':
                return RequirementLevel.MUSS
            elif level == 'SOLL':
                return RequirementLevel.SOLL
            elif level == 'KANN':
                return RequirementLevel.KANN

        return RequirementLevel.SOLL  # Default

    def _extract_title(self, context: str, aspect_id: str) -> str:
        """Extract the title/short description of an aspect.

        Falls back to the aspect ID itself when no title line is found.
        """
        # Look for text immediately after the aspect ID
        pattern = rf'{re.escape(aspect_id)}\s*[:\-–]?\s*([^\n]+)'
        match = re.search(pattern, context, re.IGNORECASE)

        if match:
            title = match.group(1).strip()
            # Clean up the title (collapse internal whitespace)
            title = re.sub(r'\s+', ' ', title)
            # Truncate if too long
            if len(title) > 200:
                title = title[:197] + "..."
            return title

        return aspect_id

    def _extract_section(self, context: str) -> str:
        """Extract the first section-like number (e.g. "4.2.1") from context."""
        match = re.search(self.PATTERNS['section'], context)
        return match.group(1) if match else ""

    def _extract_full_text(self, context: str, aspect_id: str) -> str:
        """Extract the complete requirement text.

        Takes everything from the aspect ID up to the next aspect ID or
        section heading; falls back to the whole context window.
        """
        # Find the aspect ID and get text until the next aspect or section
        pattern = rf'{re.escape(aspect_id)}[^\n]*\n(.*?)(?=\n\s*(?:O\.[A-Z]|T\.[A-Z]|\d+\.\d+\s|\Z))'
        match = re.search(pattern, context, re.IGNORECASE | re.DOTALL)

        if match:
            full_text = match.group(0).strip()
        else:
            # Fall back to context
            full_text = context.strip()

        # Clean up (collapse all whitespace runs, including newlines)
        full_text = re.sub(r'\s+', ' ', full_text)
        return full_text

    def _deduplicate_aspects(self, aspects: List[BSIAspect]) -> List[BSIAspect]:
        """Remove duplicate aspects, keeping the one with more context."""
        seen = {}

        for aspect in aspects:
            key = aspect.aspect_id
            if key not in seen:
                seen[key] = aspect
            else:
                # Keep the one with longer full_text
                if len(aspect.full_text) > len(seen[key].full_text):
                    seen[key] = aspect

        return list(seen.values())

    def _enrich_aspects(self, aspects: List[BSIAspect]) -> List[BSIAspect]:
        """Enrich aspects in place with cross-references and keywords."""
        aspect_ids = {a.aspect_id for a in aspects}

        for aspect in aspects:
            # Find related aspects mentioned in the full text
            for other_id in aspect_ids:
                if other_id != aspect.aspect_id and other_id in aspect.full_text:
                    aspect.related_aspects.append(other_id)

            # Extract keywords based on category
            aspect.keywords = self._extract_keywords(aspect)

        return aspects

    def _extract_keywords(self, aspect: BSIAspect) -> List[str]:
        """Extract relevant keywords from an aspect (at most 10, unordered)."""
        keywords = []
        text_lower = aspect.full_text.lower()

        # Add keywords based on category
        if aspect.category in self.CATEGORY_KEYWORDS:
            for kw in self.CATEGORY_KEYWORDS[aspect.category]:
                if kw in text_lower:
                    keywords.append(kw)

        return list(set(keywords))[:10]  # Limit to 10 keywords

    def get_statistics(self, aspects: List[BSIAspect]) -> Dict[str, Any]:
        """Get statistics about extracted aspects.

        Returns counts by category, requirement level, and source document,
        plus the number of unique section identifiers.
        """
        stats = {
            "total_aspects": len(aspects),
            "by_category": {},
            "by_requirement_level": {},
            "by_source": {},
            "unique_sections": set(),
        }

        for aspect in aspects:
            # By category
            cat = aspect.category.value
            stats["by_category"][cat] = stats["by_category"].get(cat, 0) + 1

            # By requirement level
            level = aspect.requirement_level.value
            stats["by_requirement_level"][level] = stats["by_requirement_level"].get(level, 0) + 1

            # By source
            src = aspect.source_document
            stats["by_source"][src] = stats["by_source"].get(src, 0) + 1

            # Unique sections
            if aspect.section:
                stats["unique_sections"].add(aspect.section)

        # Collapse the working set into a plain count for serializability
        stats["unique_sections"] = len(stats["unique_sections"])
        return stats
|
||||
|
||||
|
||||
class EURegulationExtractor:
    """
    Extracts Articles from EU Regulation PDF documents.

    Handles documents like GDPR, AI Act, CRA, etc. in their official formats.
    """

    PATTERNS = {
        'article_de': r'Artikel\s+(\d+)',
        'article_en': r'Article\s+(\d+)',
        'paragraph': r'\((\d+)\)',
        'recital': r'Erwägungsgrund\s+(\d+)|Recital\s+(\d+)',
    }

    def __init__(self, logger: Optional[logging.Logger] = None):
        """Initialize the extractor with an optional logger."""
        self.logger = logger or logging.getLogger(__name__)

    def extract_from_file(
        self,
        pdf_path: str,
        regulation_name: str,
        language: str = "de"
    ) -> List[EUArticle]:
        """Extract all articles from an EU regulation PDF.

        Args:
            pdf_path: Path to the PDF file.
            regulation_name: Name stored on each article (e.g., "DSGVO").
            language: "de" matches "Artikel N" headings; any other value
                matches the English "Article N" form.

        Returns:
            Deduplicated list of EUArticle objects.

        Raises:
            ImportError: If PyMuPDF is not installed.
            FileNotFoundError: If the PDF file does not exist.
        """
        if fitz is None:
            raise ImportError("PyMuPDF is required for PDF extraction.")

        path = Path(pdf_path)
        if not path.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        doc = fitz.open(pdf_path)
        articles = []

        article_pattern = (
            self.PATTERNS['article_de'] if language == "de"
            else self.PATTERNS['article_en']
        )

        try:
            for page_num in range(len(doc)):
                page = doc[page_num]
                text = page.get_text()

                # Find article starts. NOTE(review): articles spanning a page
                # break are truncated at the page boundary; dedup below keeps
                # the longest fragment per article number.
                for match in re.finditer(article_pattern, text):
                    article_num = match.group(1)

                    # Article content runs from this heading to the next
                    # heading on the page, or to the end of the page.
                    start = match.start()
                    next_match = re.search(article_pattern, text[match.end():])
                    end = match.end() + next_match.start() if next_match else len(text)

                    article_text = text[start:end].strip()

                    # Extract paragraphs
                    paragraphs = self._extract_paragraphs(article_text)

                    # Extract title
                    title = self._extract_article_title(article_text, article_num)

                    articles.append(EUArticle(
                        article_number=f"Art. {article_num}",
                        title=title,
                        full_text=article_text,
                        paragraphs=paragraphs,
                        page_number=page_num + 1,
                        regulation_name=regulation_name,
                    ))
        finally:
            # BUGFIX: previously the document was only closed on the success
            # path, leaking the file handle if extraction raised mid-way.
            doc.close()

        return self._deduplicate_articles(articles)

    def _extract_paragraphs(self, text: str) -> List[str]:
        """Extract numbered paragraphs ("(1)", "(2)", ...) from article text."""
        paragraphs = []
        matches = list(re.finditer(self.PATTERNS['paragraph'], text))

        for i, match in enumerate(matches):
            # Each paragraph runs from its marker to the next marker (or EOF)
            start = match.start()
            end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
            para_text = text[start:end].strip()
            if para_text:
                paragraphs.append(para_text)

        return paragraphs

    def _extract_article_title(self, text: str, article_num: str) -> str:
        """Extract the title of an article.

        Looks for the heading line following "Artikel N" or "Article N".
        Falls back to a generic German label when no title line is found.
        """
        # BUGFIX: previously only the German "Artikel" heading was matched,
        # so English-language documents (extracted with language != "de")
        # always fell back to the generic title. Try both heading forms.
        for keyword in ('Artikel', 'Article'):
            pattern = rf'{keyword}\s+{article_num}\s*\n\s*([^\n]+)'
            match = re.search(pattern, text)
            if match:
                return match.group(1).strip()

        return f"Artikel {article_num}"

    def _deduplicate_articles(self, articles: List[EUArticle]) -> List[EUArticle]:
        """Remove duplicate articles, keeping the one with the longest text."""
        seen = {}

        for article in articles:
            key = article.article_number
            if key not in seen:
                seen[key] = article
            else:
                if len(article.full_text) > len(seen[key].full_text):
                    seen[key] = article

        return list(seen.values())
|
||||
Reference in New Issue
Block a user