feat(embedding): NIST PDF text normalization + safe re-ingest script
Fix broken multi-column PDF extraction for NIST/BSI/ENISA documents:

- _normalize_pdf_text(): fixes broken section numbers (1 . 1 → 1.1), control IDs (AC - 1 → AC-1), ligatures, soft hyphens
- pdfplumber tolerances increased (x=3, y=4) for better column handling
- 3 new regex patterns: NIST CSF 2.0, NIST enhancements, OWASP Top 10
- reingest_nist.py: safe upload-before-delete for 4 lost NIST PDFs
- reingest_d5.py: safety fix — upload first, verify, then delete old

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
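For reviewers, a minimal sketch of the upload-before-delete ordering this commit standardizes on (simplified and illustrative only; the names follow the helpers added in the diff below, and error handling in the real scripts records progress instead of raising):

    # 1. Upload first: create NEW chunks alongside the old ones.
    result = upload_document(rag_url, file_bytes, doc["filename"],
                             doc["collection"], doc["extra_metadata"])
    # 2. Verify before destroying anything; on failure the old chunks stay.
    if result.get("chunks_count", 0) == 0:
        raise RuntimeError("upload produced 0 chunks - old data kept")
    # 3. Only now delete old chunks, excluding the freshly created document_id.
    _delete_old_chunks_safe(qdrant_url, doc["collection"],
                            doc["object_name"], result["document_id"])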
@@ -180,6 +180,29 @@ def delete_old_chunks(qdrant_url: str, collection: str, object_name: str) -> int
     return 0  # Qdrant delete doesn't return count


+def _delete_old_chunks_safe(
+    qdrant_url: str, collection: str, object_name: str, keep_doc_id: str,
+) -> None:
+    """Delete old chunks for a document, keeping chunks with keep_doc_id."""
+    with httpx.Client(timeout=30.0) as c:
+        resp = c.post(
+            f"{qdrant_url}/collections/{collection}/points/delete",
+            json={
+                "filter": {
+                    "must": [{
+                        "key": "object_name",
+                        "match": {"value": object_name},
+                    }],
+                    "must_not": [{
+                        "key": "document_id",
+                        "match": {"value": keep_doc_id},
+                    }],
+                }
+            },
+        )
+        resp.raise_for_status()
+
+
 def reupload_document(
     rag_url: str,
     file_bytes: bytes,
@@ -220,7 +243,11 @@ def process_document(
     progress: dict,
     max_retries: int = 2,
 ) -> bool:
-    """Process a single document: download → delete → re-upload. Returns success."""
+    """Process a single document: download → upload → verify → delete old.
+
+    Safe order: new chunks are created FIRST, old chunks deleted only after
+    successful verification (upload-before-delete pattern).
+    """
     key = doc_key(doc["object_name"], doc["collection"])

     # Skip if already done
@@ -237,20 +264,32 @@ def process_document(
             "status": "skipped", "reason": "empty_file"}
         return False

-    # 2. Delete old chunks
-    delete_old_chunks(qdrant_url, doc["collection"], doc["object_name"])
-
-    # 3. Re-upload
+    # 2. Upload FIRST (creates new chunks alongside old ones)
     result = reupload_document(
         rag_url, file_bytes, doc["filename"],
         doc["collection"], doc["form"], doc["extra_metadata"],
     )

+    new_chunks = result.get("chunks_count", 0)
+    new_doc_id = result.get("document_id", "")
+    if new_chunks == 0:
+        logger.error(" Upload produced 0 chunks — keeping old data: %s",
+                     doc["object_name"])
+        progress.setdefault("documents", {})[key] = {
+            "status": "error", "error": "0 new chunks"}
+        return False
+
+    # 3. Delete OLD chunks only (exclude the new document_id)
+    _delete_old_chunks_safe(
+        qdrant_url, doc["collection"],
+        doc["object_name"], new_doc_id,
+    )
+
     # 4. Record success
     progress.setdefault("documents", {})[key] = {
         "status": "done",
         "old_chunks": doc["old_chunk_count"],
-        "new_chunks": result.get("chunks_count", 0),
+        "new_chunks": new_chunks,
         "new_document_id": result.get("document_id", ""),
         "completed_at": datetime.now(timezone.utc).isoformat(),
     }

@@ -0,0 +1,485 @@
#!/usr/bin/env python3
"""Safe re-ingestion of NIST/BSI/ENISA PDFs from MinIO.

Uses upload-before-delete pattern: new chunks are created FIRST,
old chunks are only deleted after successful verification.

Usage:
    python3 control-pipeline/scripts/reingest_nist.py [--dry-run]
    python3 control-pipeline/scripts/reingest_nist.py --only-missing
"""

import argparse
import json
import logging
import sys
import time

import httpx

sys.path.insert(0, "control-pipeline/scripts")
from reingest_d5_config import (  # noqa: E402
    CHUNK_OVERLAP,
    CHUNK_SIZE,
    CHUNK_STRATEGY,
    DEFAULT_QDRANT_URL,
    DEFAULT_RAG_URL,
    content_type_from_filename,
)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
logger = logging.getLogger("reingest-nist")

UPLOAD_TIMEOUT = 1800.0  # 30 min for large PDFs

# -------------------------------------------------------------------
# Documents to re-ingest
# -------------------------------------------------------------------

# 4 documents with 0 chunks (deleted by D5, upload failed)
MISSING_DOCS = [
    {
        "object_name": "compliance/bund/compliance/2026/NIST_SP_800_53r5.pdf",
        "collection": "bp_compliance_datenschutz",
        "filename": "NIST_SP_800_53r5.pdf",
        "extra_metadata": {
            "regulation_id": "nist_sp800_53r5",
            "source_id": "nist",
            "doc_type": "controls_catalog",
            "guideline_name": "NIST SP 800-53 Rev. 5 Security and Privacy Controls",
            "license": "public_domain_us_gov",
            "attribution": "NIST",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/nist_sp_800_82r3.pdf",
        "collection": "bp_compliance_ce",
        "filename": "nist_sp_800_82r3.pdf",
        "extra_metadata": {
            "regulation_id": "nist_sp_800_82r3",
            "regulation_name_de": "NIST SP 800-82 Rev. 3 — Guide to OT Security",
            "regulation_name_en": "NIST SP 800-82 Rev. 3 — Guide to OT Security",
            "regulation_short": "NIST SP 800-82",
            "category": "ot_security",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/nist_sp_800_160v1r1.pdf",
        "collection": "bp_compliance_ce",
        "filename": "nist_sp_800_160v1r1.pdf",
        "extra_metadata": {
            "regulation_id": "nist_sp_800_160v1r1",
            "regulation_name_de": "NIST SP 800-160 Vol. 1 Rev. 1",
            "regulation_name_en": "NIST SP 800-160 Vol. 1 Rev. 1",
            "regulation_short": "NIST SP 800-160",
            "category": "security_engineering",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/NIST_SP_800_207.pdf",
        "collection": "bp_compliance_datenschutz",
        "filename": "NIST_SP_800_207.pdf",
        "extra_metadata": {
            "regulation_id": "nist_sp800_207",
            "source_id": "nist",
            "doc_type": "architecture",
            "guideline_name": "NIST SP 800-207 Zero Trust Architecture",
            "license": "public_domain_us_gov",
            "attribution": "NIST",
            "source": "nist.gov",
        },
    },
]

# Additional NIST/BSI/ENISA docs with <10% section rate (re-ingest for quality)
LOW_QUALITY_DOCS = [
    {
        "object_name": "compliance/bund/compliance/2026/nist_csf_2_0.pdf",
        "collection": "bp_compliance_datenschutz",
        "filename": "nist_csf_2_0.pdf",
        "extra_metadata": {
            "regulation_id": "nist_csf_2_0",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/nistir_8259a.pdf",
        "collection": "bp_compliance_datenschutz",
        "filename": "nistir_8259a.pdf",
        "extra_metadata": {
            "regulation_id": "nistir_8259a",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/nist_ai_rmf.pdf",
        "collection": "bp_compliance_datenschutz",
        "filename": "nist_ai_rmf.pdf",
        "extra_metadata": {
            "regulation_id": "nist_ai_rmf",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/nist_sp_800_30r1.pdf",
        "collection": "bp_compliance_ce",
        "filename": "nist_sp_800_30r1.pdf",
        "extra_metadata": {
            "regulation_id": "nist_sp_800_30r1",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/enisa_supply_chain_good_practices.pdf",
        "collection": "bp_compliance_ce",
        "filename": "enisa_supply_chain_good_practices.pdf",
        "extra_metadata": {
            "regulation_id": "enisa_supply_chain_good_practices",
            "license": "reuse_with_attribution",
            "source": "enisa.europa.eu",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/enisa_ics_scada.pdf",
        "collection": "bp_compliance_ce",
        "filename": "enisa_ics_scada.pdf",
        "extra_metadata": {
            "regulation_id": "enisa_ics_scada_dependencies",
            "license": "reuse_with_attribution",
            "source": "enisa.europa.eu",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/enisa_supply_chain_security.pdf",
        "collection": "bp_compliance_ce",
        "filename": "enisa_supply_chain_security.pdf",
        "extra_metadata": {
            "regulation_id": "enisa_threat_landscape_supply_chain",
            "license": "reuse_with_attribution",
            "source": "enisa.europa.eu",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/cisa_secure_by_design.pdf",
        "collection": "bp_compliance_ce",
        "filename": "cisa_secure_by_design.pdf",
        "extra_metadata": {
            "regulation_id": "cisa_secure_by_design",
            "license": "public_domain_us",
            "source": "cisa.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/cvss_v4_0.pdf",
        "collection": "bp_compliance_ce",
        "filename": "cvss_v4_0.pdf",
        "extra_metadata": {
            "regulation_id": "cvss_v4_0",
            "license": "public_domain_us",
            "source": "first.org",
        },
    },
]


# -------------------------------------------------------------------
# Qdrant helpers
# -------------------------------------------------------------------
def count_chunks(qdrant_url: str, collection: str, object_name: str) -> int:
    """Count existing chunks for a document in Qdrant."""
    with httpx.Client(timeout=30.0) as c:
        resp = c.post(
            f"{qdrant_url}/collections/{collection}/points/count",
            json={
                "filter": {
                    "must": [{
                        "key": "object_name",
                        "match": {"value": object_name},
                    }]
                },
                "exact": True,
            },
        )
        resp.raise_for_status()
        return resp.json()["result"]["count"]


def get_old_document_ids(
    qdrant_url: str, collection: str, object_name: str,
) -> set:
    """Get all document_ids for existing chunks of this document."""
    doc_ids = set()
    offset = None
    with httpx.Client(timeout=60.0) as c:
        while True:
            body = {
                "filter": {
                    "must": [{
                        "key": "object_name",
                        "match": {"value": object_name},
                    }]
                },
                "limit": 100,
                "with_payload": ["document_id"],
            }
            if offset is not None:
                body["offset"] = offset
            resp = c.post(
                f"{qdrant_url}/collections/{collection}/points/scroll",
                json=body,
            )
            resp.raise_for_status()
            data = resp.json()["result"]
            for pt in data["points"]:
                did = pt.get("payload", {}).get("document_id")
                if did:
                    doc_ids.add(did)
            offset = data.get("next_page_offset")
            if offset is None:
                break
    return doc_ids


def delete_by_document_ids(
    qdrant_url: str, collection: str, doc_ids: set,
) -> None:
    """Delete chunks matching specific document_ids."""
    for did in doc_ids:
        with httpx.Client(timeout=30.0) as c:
            c.post(
                f"{qdrant_url}/collections/{collection}/points/delete",
                json={
                    "filter": {
                        "must": [{
                            "key": "document_id",
                            "match": {"value": did},
                        }]
                    }
                },
            ).raise_for_status()


def check_section_rate(
    qdrant_url: str, collection: str, object_name: str,
) -> tuple:
    """Check section rate for a document's chunks. Returns (total, with_section)."""
    total = 0
    with_section = 0
    offset = None
    with httpx.Client(timeout=60.0) as c:
        while True:
            body = {
                "filter": {
                    "must": [{
                        "key": "object_name",
                        "match": {"value": object_name},
                    }]
                },
                "limit": 100,
                "with_payload": ["section"],
            }
            if offset is not None:
                body["offset"] = offset
            resp = c.post(
                f"{qdrant_url}/collections/{collection}/points/scroll",
                json=body,
            )
            resp.raise_for_status()
            data = resp.json()["result"]
            for pt in data["points"]:
                total += 1
                sec = pt.get("payload", {}).get("section", "")
                if sec and sec.strip():
                    with_section += 1
            offset = data.get("next_page_offset")
            if offset is None:
                break
    return total, with_section


# -------------------------------------------------------------------
# Upload
# -------------------------------------------------------------------
def download_from_minio(rag_url: str, object_name: str) -> bytes:
    """Download file from MinIO via RAG service presigned URL."""
    with httpx.Client(timeout=60.0, verify=False) as c:
        resp = c.get(f"{rag_url}/api/v1/documents/download/{object_name}")
        resp.raise_for_status()
        presigned_url = resp.json()["url"]

    with httpx.Client(timeout=300.0, verify=False) as c:
        resp = c.get(presigned_url)
        resp.raise_for_status()
        return resp.content


def upload_document(
    rag_url: str,
    file_bytes: bytes,
    filename: str,
    collection: str,
    extra_metadata: dict,
) -> dict:
    """Upload document to RAG service."""
    ct = content_type_from_filename(filename)
    form_data = {
        "collection": collection,
        "data_type": "compliance",
        "bundesland": "bund",
        "use_case": "compliance",
        "year": "2026",
        "chunk_strategy": CHUNK_STRATEGY,
        "chunk_size": str(CHUNK_SIZE),
        "chunk_overlap": str(CHUNK_OVERLAP),
        "metadata_json": json.dumps(extra_metadata, ensure_ascii=False),
    }
    with httpx.Client(timeout=UPLOAD_TIMEOUT, verify=False) as c:
        resp = c.post(
            f"{rag_url}/api/v1/documents/upload",
            files={"file": (filename, file_bytes, ct)},
            data=form_data,
        )
        resp.raise_for_status()
        return resp.json()


# -------------------------------------------------------------------
# Main processing
# -------------------------------------------------------------------
def process_document(
    doc: dict,
    rag_url: str,
    qdrant_url: str,
    dry_run: bool = False,
) -> dict:
    """Safe re-ingest: upload first, then delete old. Returns result dict."""
    obj = doc["object_name"]
    coll = doc["collection"]
    fname = doc["filename"]

    # 1. Check existing state
    old_count = count_chunks(qdrant_url, coll, obj)
    old_doc_ids = get_old_document_ids(qdrant_url, coll, obj) if old_count > 0 else set()
    logger.info(" [%s] existing: %d chunks, %d document_ids",
                fname, old_count, len(old_doc_ids))

    if dry_run:
        logger.info(" [%s] DRY RUN — would download + upload + delete old", fname)
        return {"status": "dry_run", "old_chunks": old_count}

    # 2. Download from MinIO
    logger.info(" [%s] downloading from MinIO...", fname)
    file_bytes = download_from_minio(rag_url, obj)
    size_mb = len(file_bytes) / (1024 * 1024)
    logger.info(" [%s] downloaded %.1f MB", fname, size_mb)

    # 3. Upload FIRST (creates new chunks)
    logger.info(" [%s] uploading to RAG service...", fname)
    result = upload_document(rag_url, file_bytes, fname, coll, doc["extra_metadata"])
    new_chunks = result.get("chunks_count", 0)
    new_doc_id = result.get("document_id", "")
    logger.info(" [%s] uploaded: %d new chunks (doc_id=%s)", fname, new_chunks, new_doc_id)

    # 4. Verify new chunks exist
    if new_chunks == 0:
        logger.error(" [%s] UPLOAD PRODUCED 0 CHUNKS — keeping old data!", fname)
        return {"status": "error", "error": "0 new chunks", "old_chunks": old_count}

    # 5. Delete old chunks (only if there were any)
    if old_doc_ids:
        logger.info(" [%s] deleting %d old document_ids...", fname, len(old_doc_ids))
        delete_by_document_ids(qdrant_url, coll, old_doc_ids)
        logger.info(" [%s] old chunks deleted", fname)

    # 6. Check section rate
    total, with_sec = check_section_rate(qdrant_url, coll, obj)
    pct = (with_sec / total * 100) if total > 0 else 0
    logger.info(" [%s] section rate: %d/%d (%.0f%%)", fname, with_sec, total, pct)

    return {
        "status": "ok",
        "old_chunks": old_count,
        "new_chunks": new_chunks,
        "new_document_id": new_doc_id,
        "section_rate": round(pct, 1),
    }


def main():
    parser = argparse.ArgumentParser(description="Safe NIST/BSI/ENISA re-ingestion")
    parser.add_argument("--dry-run", action="store_true", help="Show what would happen")
    parser.add_argument("--only-missing", action="store_true",
                        help="Only re-ingest the 4 missing docs (skip low-quality)")
    parser.add_argument("--rag-url", default=DEFAULT_RAG_URL)
    parser.add_argument("--qdrant-url", default=DEFAULT_QDRANT_URL)
    args = parser.parse_args()

    docs = list(MISSING_DOCS)
    if not args.only_missing:
        docs.extend(LOW_QUALITY_DOCS)

    logger.info("=" * 60)
    logger.info("NIST/BSI/ENISA Safe Re-Ingestion")
    logger.info(" Documents: %d (%d missing + %d low-quality)",
                len(docs), len(MISSING_DOCS),
                0 if args.only_missing else len(LOW_QUALITY_DOCS))
    logger.info(" RAG: %s", args.rag_url)
    logger.info(" Qdrant: %s", args.qdrant_url)
    logger.info(" Dry run: %s", args.dry_run)
    logger.info("=" * 60)

    results = {}
    ok = 0
    errors = 0

    for i, doc in enumerate(docs, 1):
        logger.info("[%d/%d] %s → %s", i, len(docs), doc["filename"], doc["collection"])
        try:
            r = process_document(doc, args.rag_url, args.qdrant_url, args.dry_run)
            results[doc["filename"]] = r
            if r["status"] == "ok":
                ok += 1
            elif r["status"] == "error":
                errors += 1
        except Exception as e:
            logger.error(" FAILED: %s", e)
            results[doc["filename"]] = {"status": "error", "error": str(e)}
            errors += 1

        if i < len(docs):
            time.sleep(2)

    # Summary
    logger.info("")
    logger.info("=" * 60)
    logger.info("RESULTS")
    logger.info("=" * 60)
    for fname, r in results.items():
        status = r["status"].upper()
        old = r.get("old_chunks", "?")
        new = r.get("new_chunks", "?")
        sec = r.get("section_rate", "?")
        logger.info(" %-40s %s old=%s new=%s sect=%.0f%%",
                    fname, status, old, new, sec if isinstance(sec, float) else 0)

    logger.info("")
    logger.info("OK: %d, Errors: %d, Total: %d", ok, errors, len(docs))

    if errors > 0:
        sys.exit(1)


if __name__ == "__main__":
    main()

@@ -12,6 +12,7 @@ This service handles all ML-heavy operations, keeping the main klausur-service l

 import logging
 import re
+import unicodedata
 from typing import List, Optional
 from contextlib import asynccontextmanager

@@ -299,6 +300,9 @@ _LEGAL_SECTION_RE = re.compile(
     # NIST/ENISA/standard numbering
     r'|\d+\.\d+(?:\.\d+)*\s+[A-ZÄÖÜ]'   # 1.1 Title, 2.3.1 Subtitle
     r'|[A-Z]{2,4}[-\.]\d+(?:\.\d+)*\b'  # AC-1, AU-2, PO.1, PW.1.1
+    r'|[A-Z]{2}\.[A-Z]{2}-\d{2}\b'      # GV.OC-01 (NIST CSF 2.0)
+    r'|[A-Z]{2,4}-\d+\(\d+\)'           # AC-1(1) (NIST enhancements)
+    r'|A\d{2}(?::\d{4})?\b'             # A01:2021 (OWASP Top 10)
     r'|Table\s+\d+'                     # Table 1, Table A-1
     r'|Figure\s+\d+'                    # Figure 1
     r'|Appendix\s+[A-Z\d]'              # Appendix A, Appendix 1
@@ -827,6 +831,34 @@ def extract_pdf_unstructured(pdf_content: bytes) -> ExtractPDFResponse:
         pass


+def _normalize_pdf_text(text: str) -> str:
+    """Fix broken spacing from multi-column PDF extraction.
+
+    pdfplumber/pypdf often break section numbers in multi-column NIST/BSI/ENISA
+    PDFs: "1 . 1" instead of "1.1", "AC - 1" instead of "AC-1".
+    """
+    # Unicode NFKC: decompose ligatures (ﬁ → fi) before other fixes
+    text = unicodedata.normalize('NFKC', text)
+    # Remove soft hyphens and zero-width spaces
+    text = text.replace('\u00ad', '').replace('\u200b', '')
+    # "1 . 1" → "1.1" (broken section numbers, apply repeatedly for nested)
+    prev = None
+    while prev != text:
+        prev = text
+        text = re.sub(r'(\d+)\s+\.\s+(\d+)', r'\1.\2', text)
+    # "AC - 1" → "AC-1" (broken NIST control IDs, 2-4 uppercase letters)
+    text = re.sub(r'\b([A-Z]{2,4})\s+-\s+(\d+)\b', r'\1-\2', text)
+    # "GV . OC - 01" → "GV.OC-01" (NIST CSF 2.0 compound IDs)
+    text = re.sub(
+        r'\b([A-Z]{2})\s*\.\s*([A-Z]{2})\s*-\s*(\d{2})\b', r'\1.\2-\3', text
+    )
+    # "AC - 1 ( 1 )" → "AC-1(1)" (NIST enhancements with spaced parens)
+    text = re.sub(r'\(\s+(\d+)\s+\)', r'(\1)', text)
+    # Collapse multiple horizontal spaces (keep newlines)
+    text = re.sub(r'[^\S\n]{2,}', ' ', text)
+    return text
+
+
 def extract_pdf_pdfplumber(pdf_content: bytes) -> ExtractPDFResponse:
     """Extract PDF using pdfplumber (best for multi-column EU regulation PDFs)."""
     import io
@@ -839,12 +871,12 @@ def extract_pdf_pdfplumber(pdf_content: bytes) -> ExtractPDFResponse:
     with pdfplumber.open(pdf_file) as pdf:
         page_count = len(pdf.pages)
         for page in pdf.pages:
-            text = page.extract_text(x_tolerance=2, y_tolerance=3)
+            text = page.extract_text(x_tolerance=3, y_tolerance=4)
             if text:
                 text_parts.append(text)

     return ExtractPDFResponse(
-        text="\n\n".join(text_parts),
+        text=_normalize_pdf_text("\n\n".join(text_parts)),
         backend_used="pdfplumber",
         pages=page_count,
         table_count=0,
@@ -866,7 +898,7 @@ def extract_pdf_pypdf(pdf_content: bytes) -> ExtractPDFResponse:
             text_parts.append(text)

     return ExtractPDFResponse(
-        text="\n\n".join(text_parts),
+        text=_normalize_pdf_text("\n\n".join(text_parts)),
         backend_used="pypdf",
         pages=len(reader.pages),
         table_count=0

@@ -0,0 +1,173 @@
"""
Tests for NIST/BSI/ENISA PDF text normalization and section detection.

Covers:
- _normalize_pdf_text() fixing broken multi-column PDF artifacts
- Section detection after normalization
- NIST CSF 2.0 compound IDs (GV.OC-01)
- NIST SP 800-53 control IDs (AC-1, AC-1(1))
- OWASP Top 10 IDs (A01:2021)
- Unicode normalization (ligatures, soft hyphens)
"""

from main import (
    _normalize_pdf_text,
    _extract_section_header,
    chunk_text_legal,
)


# =========================================================================
# _normalize_pdf_text — broken spacing fixes
# =========================================================================

class TestNormalizePdfText:

    def test_broken_section_number(self):
        assert _normalize_pdf_text("1 . 1 Risk Framing") == "1.1 Risk Framing"

    def test_nested_section_number(self):
        assert _normalize_pdf_text("2 . 3 . 1 Subtitle") == "2.3.1 Subtitle"

    def test_broken_nist_control_id(self):
        assert _normalize_pdf_text("AC - 1 Account Management") == "AC-1 Account Management"

    def test_broken_nist_control_au(self):
        assert _normalize_pdf_text("AU - 2 Audit Events") == "AU-2 Audit Events"

    def test_broken_csf_compound_id(self):
        assert _normalize_pdf_text("GV . OC - 01 Context") == "GV.OC-01 Context"

    def test_broken_enhancement_parens(self):
        assert _normalize_pdf_text("AC-1( 1 ) Enhancement") == "AC-1(1) Enhancement"

    def test_soft_hyphen_removed(self):
        assert _normalize_pdf_text("infor\u00admation") == "information"

    def test_zero_width_space_removed(self):
        assert _normalize_pdf_text("data\u200bprotection") == "dataprotection"

    def test_ligature_fi_normalized(self):
        # U+FB01 = fi ligature
        assert _normalize_pdf_text("con\ufb01dential") == "confidential"

    def test_ligature_fl_normalized(self):
        # U+FB02 = fl ligature
        assert _normalize_pdf_text("over\ufb02ow") == "overflow"

    def test_multiple_spaces_collapsed(self):
        assert _normalize_pdf_text("too   many    spaces") == "too many spaces"

    def test_newlines_preserved(self):
        result = _normalize_pdf_text("line one\nline two\n\nline three")
        assert "\n" in result
        assert "line one" in result
        assert "line three" in result

    def test_normal_text_unchanged(self):
        text = "AC-1 Account Management requires proper controls."
        assert _normalize_pdf_text(text) == text

    def test_combined_artifacts(self):
        """Multiple broken artifacts in one text block."""
        broken = "1 . 1 Overview\nAC - 1 Account Management\nGV . OC - 01 Context"
        fixed = _normalize_pdf_text(broken)
        assert "1.1 Overview" in fixed
        assert "AC-1 Account Management" in fixed
        assert "GV.OC-01 Context" in fixed


# =========================================================================
# Section detection after normalization
# =========================================================================

class TestNistSectionDetection:

    def test_nist_control_ac1(self):
        assert _extract_section_header("AC-1 Account Management") is not None

    def test_nist_control_au2(self):
        assert _extract_section_header("AU-2 Audit Events") is not None

    def test_nist_csf_compound(self):
        assert _extract_section_header("GV.OC-01 Organizational Context") is not None

    def test_nist_enhancement(self):
        assert _extract_section_header("AC-1(1) Policy and Procedures") is not None

    def test_owasp_top10(self):
        assert _extract_section_header("A01:2021 Broken Access Control") is not None

    def test_owasp_without_year(self):
        assert _extract_section_header("A03 Injection") is not None

    def test_numbered_section(self):
        assert _extract_section_header("2.1 Risk Framing") is not None

    def test_deep_numbered_section(self):
        assert _extract_section_header("3.2.1 Assessment Methodology") is not None

    def test_broken_then_normalized_detects(self):
        """After normalization, broken NIST IDs should be detected as sections."""
        broken = "AC - 1 Account Management"
        normalized = _normalize_pdf_text(broken)
        assert _extract_section_header(normalized) is not None

    def test_broken_csf_then_normalized_detects(self):
        broken = "GV . OC - 01 Organizational Context"
        normalized = _normalize_pdf_text(broken)
        assert _extract_section_header(normalized) is not None

    def test_broken_section_num_then_normalized(self):
        broken = "2 . 1 Risk Framing"
        normalized = _normalize_pdf_text(broken)
        assert _extract_section_header(normalized) is not None


# =========================================================================
# Chunking with NIST-style text
# =========================================================================

class TestNistChunking:

    NIST_SAMPLE = (
        "AC-1 Account Management\n"
        "The organization develops, documents, and disseminates an access "
        "control policy that addresses purpose, scope, roles, responsibilities, "
        "management commitment, coordination among organizational entities, "
        "and compliance.\n\n"
        "AC-2 Access Enforcement\n"
        "The information system enforces approved authorizations for logical "
        "access to information and system resources in accordance with "
        "applicable access control policies.\n\n"
        "AC-3 Information Flow Enforcement\n"
        "The system enforces approved authorizations for controlling the flow "
        "of information within the system and between interconnected systems.\n"
    )

    def test_chunks_have_section_prefix(self):
        chunks = chunk_text_legal(self.NIST_SAMPLE, chunk_size=300, overlap=50)
        assert any("[AC-1" in c for c in chunks)
        assert any("[AC-2" in c for c in chunks)

    def test_sections_detected(self):
        chunks = chunk_text_legal(self.NIST_SAMPLE, chunk_size=500, overlap=50)
        assert len(chunks) >= 2

    def test_normalized_broken_text_chunks_correctly(self):
        """Broken PDF text should chunk correctly after normalization."""
        broken = (
            "AC - 1 Account Management\n"
            "The organization develops, documents, and disseminates an access "
            "control policy that addresses purpose, scope, roles, responsibilities, "
            "management commitment, coordination among organizational entities, "
            "and compliance with applicable regulations and standards.\n\n"
            "AC - 2 Access Enforcement\n"
            "The information system enforces approved authorizations for logical "
            "access to information and system resources in accordance with "
            "applicable access control policies and procedures.\n"
        )
        normalized = _normalize_pdf_text(broken)
        chunks = chunk_text_legal(normalized, chunk_size=300, overlap=50)
        assert any("[AC-1" in c for c in chunks)
        assert any("[AC-2" in c for c in chunks)