feat(embedding): NIST PDF text normalization + safe re-ingest script
Fix broken multi-column PDF extraction for NIST/BSI/ENISA documents:

- _normalize_pdf_text(): fixes broken section numbers (1 . 1 → 1.1), control IDs (AC - 1 → AC-1), ligatures, soft hyphens
- pdfplumber tolerances increased (x=3, y=4) for better column handling
- 3 new regex patterns: NIST CSF 2.0, NIST enhancements, OWASP Top 10
- reingest_nist.py: safe upload-before-delete for 4 lost NIST PDFs
- reingest_d5.py: safety fix — upload first, verify, then delete old

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -12,6 +12,7 @@ This service handles all ML-heavy operations, keeping the main klausur-service l
|
||||
|
||||
import logging
|
||||
import re
|
||||
import unicodedata
|
||||
from typing import List, Optional
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
@@ -299,6 +300,9 @@ _LEGAL_SECTION_RE = re.compile(
|
||||
# NIST/ENISA/standard numbering
|
||||
r'|\d+\.\d+(?:\.\d+)*\s+[A-ZÄÖÜ]' # 1.1 Title, 2.3.1 Subtitle
|
||||
r'|[A-Z]{2,4}[-\.]\d+(?:\.\d+)*\b' # AC-1, AU-2, PO.1, PW.1.1
|
||||
r'|[A-Z]{2}\.[A-Z]{2}-\d{2}\b' # GV.OC-01 (NIST CSF 2.0)
|
||||
r'|[A-Z]{2,4}-\d+\(\d+\)' # AC-1(1) (NIST enhancements)
|
||||
r'|A\d{2}(?::\d{4})?\b' # A01:2021 (OWASP Top 10)
|
||||
r'|Table\s+\d+' # Table 1, Table A-1
|
||||
r'|Figure\s+\d+' # Figure 1
|
||||
r'|Appendix\s+[A-Z\d]' # Appendix A, Appendix 1
|
||||
@@ -827,6 +831,34 @@ def extract_pdf_unstructured(pdf_content: bytes) -> ExtractPDFResponse:
|
||||
pass
|
||||
|
||||
|
||||
def _normalize_pdf_text(text: str) -> str:
|
||||
"""Fix broken spacing from multi-column PDF extraction.
|
||||
|
||||
pdfplumber/pypdf often break section numbers in multi-column NIST/BSI/ENISA
|
||||
PDFs: "1 . 1" instead of "1.1", "AC - 1" instead of "AC-1".
|
||||
"""
|
||||
# Unicode NFKC: decompose ligatures (fi → fi) before other fixes
|
||||
text = unicodedata.normalize('NFKC', text)
|
||||
# Remove soft hyphens and zero-width spaces
|
||||
text = text.replace('\u00ad', '').replace('\u200b', '')
|
||||
# "1 . 1" → "1.1" (broken section numbers, apply repeatedly for nested)
|
||||
prev = None
|
||||
while prev != text:
|
||||
prev = text
|
||||
text = re.sub(r'(\d+)\s+\.\s+(\d+)', r'\1.\2', text)
|
||||
# "AC - 1" → "AC-1" (broken NIST control IDs, 2-4 uppercase letters)
|
||||
text = re.sub(r'\b([A-Z]{2,4})\s+-\s+(\d+)\b', r'\1-\2', text)
|
||||
# "GV . OC - 01" → "GV.OC-01" (NIST CSF 2.0 compound IDs)
|
||||
text = re.sub(
|
||||
r'\b([A-Z]{2})\s*\.\s*([A-Z]{2})\s*-\s*(\d{2})\b', r'\1.\2-\3', text
|
||||
)
|
||||
# "AC - 1 ( 1 )" → "AC-1(1)" (NIST enhancements with spaced parens)
|
||||
text = re.sub(r'\(\s+(\d+)\s+\)', r'(\1)', text)
|
||||
# Collapse multiple horizontal spaces (keep newlines)
|
||||
text = re.sub(r'[^\S\n]{2,}', ' ', text)
|
||||
return text
|
||||
|
||||
|
||||
def extract_pdf_pdfplumber(pdf_content: bytes) -> ExtractPDFResponse:
|
||||
"""Extract PDF using pdfplumber (best for multi-column EU regulation PDFs)."""
|
||||
import io
|
||||
@@ -839,12 +871,12 @@ def extract_pdf_pdfplumber(pdf_content: bytes) -> ExtractPDFResponse:
|
||||
with pdfplumber.open(pdf_file) as pdf:
|
||||
page_count = len(pdf.pages)
|
||||
for page in pdf.pages:
|
||||
text = page.extract_text(x_tolerance=2, y_tolerance=3)
|
||||
text = page.extract_text(x_tolerance=3, y_tolerance=4)
|
||||
if text:
|
||||
text_parts.append(text)
|
||||
|
||||
return ExtractPDFResponse(
|
||||
text="\n\n".join(text_parts),
|
||||
text=_normalize_pdf_text("\n\n".join(text_parts)),
|
||||
backend_used="pdfplumber",
|
||||
pages=page_count,
|
||||
table_count=0,
|
||||
@@ -866,7 +898,7 @@ def extract_pdf_pypdf(pdf_content: bytes) -> ExtractPDFResponse:
|
||||
text_parts.append(text)
|
||||
|
||||
return ExtractPDFResponse(
|
||||
text="\n\n".join(text_parts),
|
||||
text=_normalize_pdf_text("\n\n".join(text_parts)),
|
||||
backend_used="pypdf",
|
||||
pages=len(reader.pages),
|
||||
table_count=0
|
||||
|
||||
Reference in New Issue
Block a user