feat(embedding): NIST PDF text normalization + safe re-ingest script

Fix broken multi-column PDF extraction for NIST/BSI/ENISA documents: - _normalize_pdf_text(): fixes broken section numbers (1 . 1 → 1.1), control IDs (AC - 1 → AC-1), ligatures, soft hyphens - pdfplumber tolerances increased (x=3,y=4) for better column handling - 3 new regex patterns: NIST CSF 2.0, NIST enhancements, OWASP Top 10 - reingest_nist.py: safe upload-before-delete for 4 lost NIST PDFs - reingest_d5.py: safety fix — upload first, verify, then delete old Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-03 06:42:46 +02:00
parent 97a7f6f264
commit 0b0eed27b0
4 changed files with 738 additions and 9 deletions
@@ -180,6 +180,29 @@ def delete_old_chunks(qdrant_url: str, collection: str, object_name: str) -> int
    return 0  # Qdrant delete doesn't return count
 def _delete_old_chunks_safe(
    qdrant_url: str, collection: str, object_name: str, keep_doc_id: str,
 ) -> None:
    """Delete old chunks for a document, keeping chunks with keep_doc_id."""
    with httpx.Client(timeout=30.0) as c:
        resp = c.post(
            f"{qdrant_url}/collections/{collection}/points/delete",
            json={
                "filter": {
                    "must": [{
                        "key": "object_name",
                        "match": {"value": object_name},
                    }],
                    "must_not": [{
                        "key": "document_id",
                        "match": {"value": keep_doc_id},
                    }],
                }
            },
        )
        resp.raise_for_status()
 def reupload_document(
    rag_url: str,
    file_bytes: bytes,
@@ -220,7 +243,11 @@ def process_document(
    progress: dict,
    max_retries: int = 2,
 ) -> bool:
-    """Process a single document: download → delete → re-upload. Returns success."""
+    """Process a single document: download → upload → verify → delete old.
    Safe order: new chunks are created FIRST, old chunks deleted only after
    successful verification (upload-before-delete pattern).
    """
    key = doc_key(doc["object_name"], doc["collection"])
    # Skip if already done
@@ -237,20 +264,32 @@ def process_document(
                    "status": "skipped", "reason": "empty_file"}
                return False
-            # 2. Delete old chunks
+            # 2. Upload FIRST (creates new chunks alongside old ones)
            delete_old_chunks(qdrant_url, doc["collection"], doc["object_name"])
            # 3. Re-upload
            result = reupload_document(
                rag_url, file_bytes, doc["filename"],
                doc["collection"], doc["form"], doc["extra_metadata"],
            )
            new_chunks = result.get("chunks_count", 0)
            new_doc_id = result.get("document_id", "")
            if new_chunks == 0:
                logger.error("  Upload produced 0 chunks — keeping old data: %s",
                             doc["object_name"])
                progress.setdefault("documents", {})[key] = {
                    "status": "error", "error": "0 new chunks"}
                return False
            # 3. Delete OLD chunks only (exclude the new document_id)
            _delete_old_chunks_safe(
                qdrant_url, doc["collection"],
                doc["object_name"], new_doc_id,
            )
            # 4. Record success
            progress.setdefault("documents", {})[key] = {
                "status": "done",
                "old_chunks": doc["old_chunk_count"],
-                "new_chunks": result.get("chunks_count", 0),
+                "new_chunks": new_chunks,
                "new_document_id": result.get("document_id", ""),
                "completed_at": datetime.now(timezone.utc).isoformat(),
            }
@@ -0,0 +1,485 @@
 #!/usr/bin/env python3
 """Safe re-ingestion of NIST/BSI/ENISA PDFs from MinIO.
 Uses upload-before-delete pattern: new chunks are created FIRST,
 old chunks are only deleted after successful verification.
 Usage:
    python3 control-pipeline/scripts/reingest_nist.py [--dry-run]
    python3 control-pipeline/scripts/reingest_nist.py --only-missing
 """
 import argparse
 import json
 import logging
 import sys
 import time
 import httpx
 sys.path.insert(0, "control-pipeline/scripts")
 from reingest_d5_config import (  # noqa: E402
    CHUNK_OVERLAP,
    CHUNK_SIZE,
    CHUNK_STRATEGY,
    DEFAULT_QDRANT_URL,
    DEFAULT_RAG_URL,
    content_type_from_filename,
 )
 # Configure root logging once for this script: INFO level, timestamped lines.
 logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
 )
 # Script-wide logger used by every function below.
 logger = logging.getLogger("reingest-nist")
 # Timeout in seconds handed to the httpx client that performs the upload.
 UPLOAD_TIMEOUT = 1800.0  # 30 min for large PDFs
 # -------------------------------------------------------------------
 # Documents to re-ingest
 # -------------------------------------------------------------------
 # 4 documents with 0 chunks (deleted by D5, upload failed)
 #
 # Each entry is a dict with the keys read by process_document():
 #   object_name    - payload value used to find the document's chunks in
 #                    Qdrant and the path used to download the file
 #   collection     - Qdrant collection the document is uploaded into
 #   filename       - file name sent with the upload (also used in log lines)
 #   extra_metadata - dict serialized to JSON and sent as "metadata_json"
 #
 # NOTE(review): the license tag differs between entries
 # ("public_domain_us_gov" vs "public_domain_us") and the two collections use
 # different metadata keys — confirm this is intentional.
 MISSING_DOCS = [
    {
        "object_name": "compliance/bund/compliance/2026/NIST_SP_800_53r5.pdf",
        "collection": "bp_compliance_datenschutz",
        "filename": "NIST_SP_800_53r5.pdf",
        "extra_metadata": {
            "regulation_id": "nist_sp800_53r5",
            "source_id": "nist",
            "doc_type": "controls_catalog",
            "guideline_name": "NIST SP 800-53 Rev. 5 Security and Privacy Controls",
            "license": "public_domain_us_gov",
            "attribution": "NIST",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/nist_sp_800_82r3.pdf",
        "collection": "bp_compliance_ce",
        "filename": "nist_sp_800_82r3.pdf",
        "extra_metadata": {
            "regulation_id": "nist_sp_800_82r3",
            "regulation_name_de": "NIST SP 800-82 Rev. 3 — Guide to OT Security",
            "regulation_name_en": "NIST SP 800-82 Rev. 3 — Guide to OT Security",
            "regulation_short": "NIST SP 800-82",
            "category": "ot_security",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/nist_sp_800_160v1r1.pdf",
        "collection": "bp_compliance_ce",
        "filename": "nist_sp_800_160v1r1.pdf",
        "extra_metadata": {
            "regulation_id": "nist_sp_800_160v1r1",
            "regulation_name_de": "NIST SP 800-160 Vol. 1 Rev. 1",
            "regulation_name_en": "NIST SP 800-160 Vol. 1 Rev. 1",
            "regulation_short": "NIST SP 800-160",
            "category": "security_engineering",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/NIST_SP_800_207.pdf",
        "collection": "bp_compliance_datenschutz",
        "filename": "NIST_SP_800_207.pdf",
        "extra_metadata": {
            "regulation_id": "nist_sp800_207",
            "source_id": "nist",
            "doc_type": "architecture",
            "guideline_name": "NIST SP 800-207 Zero Trust Architecture",
            "license": "public_domain_us_gov",
            "attribution": "NIST",
            "source": "nist.gov",
        },
    },
 ]
 # Additional documents with <10% section rate (re-ingest for quality).
 # The entries below are sourced from nist.gov, enisa.europa.eu, cisa.gov and
 # first.org. Same entry layout as MISSING_DOCS: object_name, collection,
 # filename, extra_metadata. main() skips this list when --only-missing is set.
 #
 # NOTE(review): cvss_v4_0 is sourced from first.org but tagged
 # "public_domain_us" — confirm the license value.
 LOW_QUALITY_DOCS = [
    {
        "object_name": "compliance/bund/compliance/2026/nist_csf_2_0.pdf",
        "collection": "bp_compliance_datenschutz",
        "filename": "nist_csf_2_0.pdf",
        "extra_metadata": {
            "regulation_id": "nist_csf_2_0",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/nistir_8259a.pdf",
        "collection": "bp_compliance_datenschutz",
        "filename": "nistir_8259a.pdf",
        "extra_metadata": {
            "regulation_id": "nistir_8259a",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/nist_ai_rmf.pdf",
        "collection": "bp_compliance_datenschutz",
        "filename": "nist_ai_rmf.pdf",
        "extra_metadata": {
            "regulation_id": "nist_ai_rmf",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/nist_sp_800_30r1.pdf",
        "collection": "bp_compliance_ce",
        "filename": "nist_sp_800_30r1.pdf",
        "extra_metadata": {
            "regulation_id": "nist_sp_800_30r1",
            "license": "public_domain_us",
            "source": "nist.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/enisa_supply_chain_good_practices.pdf",
        "collection": "bp_compliance_ce",
        "filename": "enisa_supply_chain_good_practices.pdf",
        "extra_metadata": {
            "regulation_id": "enisa_supply_chain_good_practices",
            "license": "reuse_with_attribution",
            "source": "enisa.europa.eu",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/enisa_ics_scada.pdf",
        "collection": "bp_compliance_ce",
        "filename": "enisa_ics_scada.pdf",
        "extra_metadata": {
            "regulation_id": "enisa_ics_scada_dependencies",
            "license": "reuse_with_attribution",
            "source": "enisa.europa.eu",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/enisa_supply_chain_security.pdf",
        "collection": "bp_compliance_ce",
        "filename": "enisa_supply_chain_security.pdf",
        "extra_metadata": {
            "regulation_id": "enisa_threat_landscape_supply_chain",
            "license": "reuse_with_attribution",
            "source": "enisa.europa.eu",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/cisa_secure_by_design.pdf",
        "collection": "bp_compliance_ce",
        "filename": "cisa_secure_by_design.pdf",
        "extra_metadata": {
            "regulation_id": "cisa_secure_by_design",
            "license": "public_domain_us",
            "source": "cisa.gov",
        },
    },
    {
        "object_name": "compliance/bund/compliance/2026/cvss_v4_0.pdf",
        "collection": "bp_compliance_ce",
        "filename": "cvss_v4_0.pdf",
        "extra_metadata": {
            "regulation_id": "cvss_v4_0",
            "license": "public_domain_us",
            "source": "first.org",
        },
    },
 ]
 # -------------------------------------------------------------------
 # Qdrant helpers
 # -------------------------------------------------------------------
 def count_chunks(qdrant_url: str, collection: str, object_name: str) -> int:
     """Return the exact number of Qdrant points stored for one document.

     Points are matched on their ``object_name`` payload field. Raises
     ``httpx.HTTPStatusError`` when Qdrant answers with an error status.
     """
     endpoint = f"{qdrant_url}/collections/{collection}/points/count"
     query = {
         "filter": {
             "must": [{"key": "object_name", "match": {"value": object_name}}],
         },
         "exact": True,
     }
     with httpx.Client(timeout=30.0) as client:
         reply = client.post(endpoint, json=query)
         reply.raise_for_status()
         return reply.json()["result"]["count"]
 def get_old_document_ids(
     qdrant_url: str, collection: str, object_name: str,
 ) -> set:
     """Collect the distinct ``document_id`` values of a document's chunks.

     Scrolls through every point whose ``object_name`` payload matches, 100
     points per page, and gathers the non-empty ``document_id`` payload values.
     Raises ``httpx.HTTPStatusError`` when a page request fails.
     """
     scroll_url = f"{qdrant_url}/collections/{collection}/points/scroll"
     base_query = {
         "filter": {
             "must": [{"key": "object_name", "match": {"value": object_name}}],
         },
         "limit": 100,
         "with_payload": ["document_id"],
     }

     found = set()
     cursor = None
     with httpx.Client(timeout=60.0) as client:
         while True:
             query = dict(base_query)
             if cursor is not None:
                 # Resume where the previous page stopped.
                 query["offset"] = cursor

             reply = client.post(scroll_url, json=query)
             reply.raise_for_status()
             page = reply.json()["result"]

             ids_on_page = (
                 point.get("payload", {}).get("document_id")
                 for point in page["points"]
             )
             found.update(doc_id for doc_id in ids_on_page if doc_id)

             cursor = page.get("next_page_offset")
             if cursor is None:
                 break
     return found
 def delete_by_document_ids(
     qdrant_url: str, collection: str, doc_ids: set,
 ) -> None:
     """Delete all chunks whose ``document_id`` payload is in *doc_ids*.

     Issues one delete-by-filter request per document id, all over a single
     shared HTTP client.

     Args:
         qdrant_url: Base URL of the Qdrant REST API.
         collection: Name of the collection to delete from.
         doc_ids: The ``document_id`` values to remove. An empty set is a
             no-op and sends no request.

     Raises:
         httpx.HTTPStatusError: If Qdrant rejects one of the requests; ids
             processed before the failure stay deleted.
     """
     if not doc_ids:
         return

     delete_url = f"{qdrant_url}/collections/{collection}/points/delete"
     # One client for the whole batch instead of a new connection pool per id.
     with httpx.Client(timeout=30.0) as c:
         for did in doc_ids:
             resp = c.post(
                 delete_url,
                 json={
                     "filter": {
                         "must": [{
                             "key": "document_id",
                             "match": {"value": did},
                         }]
                     }
                 },
             )
             resp.raise_for_status()
 def check_section_rate(
     qdrant_url: str, collection: str, object_name: str,
 ) -> tuple:
     """Count a document's chunks and how many of them carry a section.

     Scrolls through every point whose ``object_name`` payload matches, 100
     points per page. A chunk counts as "with section" when its ``section``
     payload is non-empty after stripping whitespace.

     Returns:
         ``(total, with_section)``.
     """
     scroll_url = f"{qdrant_url}/collections/{collection}/points/scroll"
     base_query = {
         "filter": {
             "must": [{"key": "object_name", "match": {"value": object_name}}],
         },
         "limit": 100,
         "with_payload": ["section"],
     }

     seen = 0
     tagged = 0
     cursor = None
     with httpx.Client(timeout=60.0) as client:
         while True:
             query = dict(base_query)
             if cursor is not None:
                 # Resume where the previous page stopped.
                 query["offset"] = cursor

             reply = client.post(scroll_url, json=query)
             reply.raise_for_status()
             page = reply.json()["result"]

             for point in page["points"]:
                 seen += 1
                 section = point.get("payload", {}).get("section", "")
                 if section and section.strip():
                     tagged += 1

             cursor = page.get("next_page_offset")
             if cursor is None:
                 break
     return seen, tagged
 # -------------------------------------------------------------------
 # Upload
 # -------------------------------------------------------------------
 def download_from_minio(rag_url: str, object_name: str) -> bytes:
     """Fetch a stored file's bytes via a presigned URL from the RAG service.

     First asks the RAG service's download endpoint for the object's URL
     (60 s timeout), then downloads the content from that URL (300 s timeout).
     Raises ``httpx.HTTPStatusError`` if either request fails.
     """
     def _fetch(url, timeout):
         # NOTE(review): verify=False disables TLS certificate verification —
         # confirm this is only used against trusted internal endpoints.
         with httpx.Client(timeout=timeout, verify=False) as client:
             response = client.get(url)
             response.raise_for_status()
             return response

     lookup = _fetch(f"{rag_url}/api/v1/documents/download/{object_name}", 60.0)
     presigned_url = lookup.json()["url"]
     return _fetch(presigned_url, 300.0).content
 def upload_document(
     rag_url: str,
     file_bytes: bytes,
     filename: str,
     collection: str,
     extra_metadata: dict,
 ) -> dict:
     """Send a file to the RAG service upload endpoint and return its JSON reply.

     The file goes out as a multipart upload together with fixed form fields
     (data type, bundesland, use case, year), the chunking settings imported
     from the config module, and *extra_metadata* serialized as JSON.
     Raises ``httpx.HTTPStatusError`` on an error response.
     """
     fields = dict(
         collection=collection,
         data_type="compliance",
         bundesland="bund",
         use_case="compliance",
         year="2026",
         chunk_strategy=CHUNK_STRATEGY,
         chunk_size=str(CHUNK_SIZE),
         chunk_overlap=str(CHUNK_OVERLAP),
         metadata_json=json.dumps(extra_metadata, ensure_ascii=False),
     )
     mime_type = content_type_from_filename(filename)
     upload_url = f"{rag_url}/api/v1/documents/upload"

     with httpx.Client(timeout=UPLOAD_TIMEOUT, verify=False) as client:
         reply = client.post(
             upload_url,
             files={"file": (filename, file_bytes, mime_type)},
             data=fields,
         )
         reply.raise_for_status()
         return reply.json()
 # -------------------------------------------------------------------
 # Main processing
 # -------------------------------------------------------------------
 def process_document(
     doc: dict,
     rag_url: str,
     qdrant_url: str,
     dry_run: bool = False,
 ) -> dict:
     """Safely re-ingest one document: upload first, then delete old chunks.

     Steps: record the existing chunk count and document_ids, download the
     file, upload it (creating new chunks), and only if the upload reports at
     least one chunk delete the previously recorded document_ids. Finally the
     section rate of the document's chunks is measured.

     Args:
         doc: Entry with ``object_name``, ``collection``, ``filename`` and
             ``extra_metadata`` keys.
         rag_url: Base URL of the RAG service.
         qdrant_url: Base URL of the Qdrant REST API.
         dry_run: When true, only inspect the existing state and return.

     Returns:
         A result dict whose ``status`` is ``"dry_run"``, ``"error"`` or
         ``"ok"``; ``"ok"`` results also carry ``new_chunks``,
         ``new_document_id`` and ``section_rate``.

     Raises:
         httpx.HTTPError: Propagated from any failed HTTP call.
     """
     obj = doc["object_name"]
     coll = doc["collection"]
     fname = doc["filename"]

     # 1. Check existing state
     old_count = count_chunks(qdrant_url, coll, obj)
     old_doc_ids = get_old_document_ids(qdrant_url, coll, obj) if old_count > 0 else set()
     logger.info("  [%s] existing: %d chunks, %d document_ids",
                 fname, old_count, len(old_doc_ids))

     if dry_run:
         logger.info("  [%s] DRY RUN — would download + upload + delete old", fname)
         return {"status": "dry_run", "old_chunks": old_count}

     # 2. Download from MinIO
     logger.info("  [%s] downloading from MinIO...", fname)
     file_bytes = download_from_minio(rag_url, obj)
     size_mb = len(file_bytes) / (1024 * 1024)
     logger.info("  [%s] downloaded %.1f MB", fname, size_mb)

     # 3. Upload FIRST (creates new chunks)
     logger.info("  [%s] uploading to RAG service...", fname)
     result = upload_document(rag_url, file_bytes, fname, coll, doc["extra_metadata"])
     # "or" also maps an explicit null in the response to the safe default.
     new_chunks = result.get("chunks_count") or 0
     new_doc_id = result.get("document_id") or ""
     logger.info("  [%s] uploaded: %d new chunks (doc_id=%s)", fname, new_chunks, new_doc_id)

     # 4. Verify new chunks exist
     if new_chunks == 0:
         logger.error("  [%s] UPLOAD PRODUCED 0 CHUNKS — keeping old data!", fname)
         return {"status": "error", "error": "0 new chunks", "old_chunks": old_count}

     # 5. Delete old chunks (only if there were any). Never delete the id the
     #    upload just returned: if the service reused an existing document_id,
     #    deleting it would wipe the new chunks as well.
     stale_ids = set(old_doc_ids)
     if new_doc_id in stale_ids:
         logger.warning("  [%s] upload reused existing document_id %s — not deleting it",
                        fname, new_doc_id)
         stale_ids.discard(new_doc_id)
     if stale_ids:
         logger.info("  [%s] deleting %d old document_ids...", fname, len(stale_ids))
         delete_by_document_ids(qdrant_url, coll, stale_ids)
         logger.info("  [%s] old chunks deleted", fname)

     # 6. Check section rate
     # NOTE(review): if Qdrant applies the deletes asynchronously, this scan may
     # still see old chunks — confirm whether the delete calls need wait=true.
     total, with_sec = check_section_rate(qdrant_url, coll, obj)
     pct = (with_sec / total * 100) if total > 0 else 0
     logger.info("  [%s] section rate: %d/%d (%.0f%%)", fname, with_sec, total, pct)

     return {
         "status": "ok",
         "old_chunks": old_count,
         "new_chunks": new_chunks,
         "new_document_id": new_doc_id,
         "section_rate": round(pct, 1),
     }
 def main():
     """Command-line entry point: re-ingest the configured documents.

     Processes ``MISSING_DOCS`` and, unless ``--only-missing`` is given,
     ``LOW_QUALITY_DOCS`` one after another with a 2 second pause in between,
     then logs a per-document summary. Exits with status 1 if any document
     failed.
     """
     parser = argparse.ArgumentParser(description="Safe NIST/BSI/ENISA re-ingestion")
     parser.add_argument("--dry-run", action="store_true", help="Show what would happen")
     parser.add_argument("--only-missing", action="store_true",
                         help="Only re-ingest the 4 missing docs (skip low-quality)")
     parser.add_argument("--rag-url", default=DEFAULT_RAG_URL)
     parser.add_argument("--qdrant-url", default=DEFAULT_QDRANT_URL)
     args = parser.parse_args()

     docs = list(MISSING_DOCS)
     if not args.only_missing:
         docs.extend(LOW_QUALITY_DOCS)

     logger.info("=" * 60)
     logger.info("NIST/BSI/ENISA Safe Re-Ingestion")
     logger.info("  Documents: %d (%d missing + %d low-quality)",
                 len(docs), len(MISSING_DOCS),
                 0 if args.only_missing else len(LOW_QUALITY_DOCS))
     logger.info("  RAG: %s", args.rag_url)
     logger.info("  Qdrant: %s", args.qdrant_url)
     logger.info("  Dry run: %s", args.dry_run)
     logger.info("=" * 60)

     results = {}
     ok = 0
     errors = 0
     for i, doc in enumerate(docs, 1):
         logger.info("[%d/%d] %s → %s", i, len(docs), doc["filename"], doc["collection"])
         try:
             r = process_document(doc, args.rag_url, args.qdrant_url, args.dry_run)
         except Exception as e:
             # Top-level boundary: keep going with the next document, but keep
             # the traceback in the log so the failure can be diagnosed.
             logger.exception("  FAILED: %s", e)
             results[doc["filename"]] = {"status": "error", "error": str(e)}
             errors += 1
         else:
             results[doc["filename"]] = r
             if r["status"] == "ok":
                 ok += 1
             elif r["status"] == "error":
                 errors += 1
         if i < len(docs):
             # Short pause between documents.
             time.sleep(2)

     # Summary
     logger.info("")
     logger.info("=" * 60)
     logger.info("RESULTS")
     logger.info("=" * 60)
     for fname, r in results.items():
         status = r["status"].upper()
         old = r.get("old_chunks", "?")
         new = r.get("new_chunks", "?")
         rate = r.get("section_rate")
         # Dry-run and error results have no section rate; show that instead
         # of a made-up 0%.
         sect = f"{rate:.0f}%" if isinstance(rate, (int, float)) else "n/a"
         logger.info("  %-40s %s  old=%s new=%s sect=%s",
                     fname, status, old, new, sect)
     logger.info("")
     logger.info("OK: %d, Errors: %d, Total: %d", ok, errors, len(docs))

     if errors > 0:
         sys.exit(1)
 # Run the CLI only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()
@@ -12,6 +12,7 @@ This service handles all ML-heavy operations, keeping the main klausur-service l
 import logging
 import re
 import unicodedata
 from typing import List, Optional
 from contextlib import asynccontextmanager
@@ -299,6 +300,9 @@ _LEGAL_SECTION_RE = re.compile(
    # NIST/ENISA/standard numbering
    r'|\d+\.\d+(?:\.\d+)*\s+[A-ZÄÖÜ]'  # 1.1 Title, 2.3.1 Subtitle
    r'|[A-Z]{2,4}[-\.]\d+(?:\.\d+)*\b'  # AC-1, AU-2, PO.1, PW.1.1
    r'|[A-Z]{2}\.[A-Z]{2}-\d{2}\b'      # GV.OC-01 (NIST CSF 2.0)
    r'|[A-Z]{2,4}-\d+\(\d+\)'           # AC-1(1) (NIST enhancements)
    r'|A\d{2}(?::\d{4})?\b'             # A01:2021 (OWASP Top 10)
    r'|Table\s+\d+'                      # Table 1, Table A-1
    r'|Figure\s+\d+'                     # Figure 1
    r'|Appendix\s+[A-Z\d]'              # Appendix A, Appendix 1
@@ -827,6 +831,34 @@ def extract_pdf_unstructured(pdf_content: bytes) -> ExtractPDFResponse:
            pass
 def _normalize_pdf_text(text: str) -> str:
    """Fix broken spacing from multi-column PDF extraction.
    pdfplumber/pypdf often break section numbers in multi-column NIST/BSI/ENISA
    PDFs: "1 . 1" instead of "1.1", "AC - 1" instead of "AC-1".
    """
    # Unicode NFKC: decompose ligatures (fi → fi) before other fixes
    text = unicodedata.normalize('NFKC', text)
    # Remove soft hyphens and zero-width spaces
    text = text.replace('\u00ad', '').replace('\u200b', '')
    # "1 . 1" → "1.1" (broken section numbers, apply repeatedly for nested)
    prev = None
    while prev != text:
        prev = text
        text = re.sub(r'(\d+)\s+\.\s+(\d+)', r'\1.\2', text)
    # "AC - 1" → "AC-1" (broken NIST control IDs, 2-4 uppercase letters)
    text = re.sub(r'\b([A-Z]{2,4})\s+-\s+(\d+)\b', r'\1-\2', text)
    # "GV . OC - 01" → "GV.OC-01" (NIST CSF 2.0 compound IDs)
    text = re.sub(
        r'\b([A-Z]{2})\s*\.\s*([A-Z]{2})\s*-\s*(\d{2})\b', r'\1.\2-\3', text
    )
    # "AC - 1 ( 1 )" → "AC-1(1)" (NIST enhancements with spaced parens)
    text = re.sub(r'\(\s+(\d+)\s+\)', r'(\1)', text)
    # Collapse multiple horizontal spaces (keep newlines)
    text = re.sub(r'[^\S\n]{2,}', ' ', text)
    return text
 def extract_pdf_pdfplumber(pdf_content: bytes) -> ExtractPDFResponse:
    """Extract PDF using pdfplumber (best for multi-column EU regulation PDFs)."""
    import io
@@ -839,12 +871,12 @@ def extract_pdf_pdfplumber(pdf_content: bytes) -> ExtractPDFResponse:
    with pdfplumber.open(pdf_file) as pdf:
        page_count = len(pdf.pages)
        for page in pdf.pages:
-            text = page.extract_text(x_tolerance=2, y_tolerance=3)
+            text = page.extract_text(x_tolerance=3, y_tolerance=4)
            if text:
                text_parts.append(text)
    return ExtractPDFResponse(
-        text="\n\n".join(text_parts),
+        text=_normalize_pdf_text("\n\n".join(text_parts)),
        backend_used="pdfplumber",
        pages=page_count,
        table_count=0,
@@ -866,7 +898,7 @@ def extract_pdf_pypdf(pdf_content: bytes) -> ExtractPDFResponse:
            text_parts.append(text)
    return ExtractPDFResponse(
-        text="\n\n".join(text_parts),
+        text=_normalize_pdf_text("\n\n".join(text_parts)),
        backend_used="pypdf",
        pages=len(reader.pages),
        table_count=0
@@ -0,0 +1,173 @@
 """
 Tests for NIST/BSI/ENISA PDF text normalization and section detection.
 Covers:
 - _normalize_pdf_text() fixing broken multi-column PDF artifacts
 - Section detection after normalization
 - NIST CSF 2.0 compound IDs (GV.OC-01)
 - NIST SP 800-53 control IDs (AC-1, AC-1(1))
 - OWASP Top 10 IDs (A01:2021)
 - Unicode normalization (ligatures, soft hyphens)
 """
 from main import (
    _normalize_pdf_text,
    _extract_section_header,
    chunk_text_legal,
 )
 # =========================================================================
 # _normalize_pdf_text — broken spacing fixes
 # =========================================================================
 class TestNormalizePdfText:
     """Unit tests for the spacing and Unicode repairs of _normalize_pdf_text()."""

     def test_broken_section_number(self):
         assert _normalize_pdf_text("1 . 1 Risk Framing") == "1.1 Risk Framing"

     def test_nested_section_number(self):
         assert _normalize_pdf_text("2 . 3 . 1 Subtitle") == "2.3.1 Subtitle"

     def test_broken_nist_control_id(self):
         assert _normalize_pdf_text("AC - 1 Account Management") == "AC-1 Account Management"

     def test_broken_nist_control_au(self):
         assert _normalize_pdf_text("AU - 2 Audit Events") == "AU-2 Audit Events"

     def test_broken_csf_compound_id(self):
         assert _normalize_pdf_text("GV . OC - 01 Context") == "GV.OC-01 Context"

     def test_broken_enhancement_parens(self):
         assert _normalize_pdf_text("AC-1( 1 ) Enhancement") == "AC-1(1) Enhancement"

     def test_soft_hyphen_removed(self):
         assert _normalize_pdf_text("infor\u00admation") == "information"

     def test_zero_width_space_removed(self):
         assert _normalize_pdf_text("data\u200bprotection") == "dataprotection"

     def test_ligature_fi_normalized(self):
         # U+FB01 = fi ligature
         assert _normalize_pdf_text("con\ufb01dential") == "confidential"

     def test_ligature_fl_normalized(self):
         # U+FB02 = fl ligature
         assert _normalize_pdf_text("over\ufb02ow") == "overflow"

     def test_multiple_spaces_collapsed(self):
         assert _normalize_pdf_text("too   many    spaces") == "too many spaces"

     def test_newlines_preserved(self):
         # Exact comparison: both the single line break and the blank-line
         # paragraph break must survive, not just "some" newline.
         text = "line one\nline two\n\nline three"
         assert _normalize_pdf_text(text) == text

     def test_normal_text_unchanged(self):
         text = "AC-1 Account Management requires proper controls."
         assert _normalize_pdf_text(text) == text

     def test_combined_artifacts(self):
         """Multiple broken artifacts in one text block."""
         broken = "1 . 1 Overview\nAC - 1 Account Management\nGV . OC - 01 Context"
         fixed = _normalize_pdf_text(broken)
         assert "1.1 Overview" in fixed
         assert "AC-1 Account Management" in fixed
         assert "GV.OC-01 Context" in fixed
 # =========================================================================
 # Section detection after normalization
 # =========================================================================
 class TestNistSectionDetection:
     """Section headers in NIST/OWASP notation must be recognized."""

     def test_nist_control_ac1(self):
         header = _extract_section_header("AC-1 Account Management")
         assert header is not None

     def test_nist_control_au2(self):
         header = _extract_section_header("AU-2 Audit Events")
         assert header is not None

     def test_nist_csf_compound(self):
         header = _extract_section_header("GV.OC-01 Organizational Context")
         assert header is not None

     def test_nist_enhancement(self):
         header = _extract_section_header("AC-1(1) Policy and Procedures")
         assert header is not None

     def test_owasp_top10(self):
         header = _extract_section_header("A01:2021 Broken Access Control")
         assert header is not None

     def test_owasp_without_year(self):
         header = _extract_section_header("A03 Injection")
         assert header is not None

     def test_numbered_section(self):
         header = _extract_section_header("2.1 Risk Framing")
         assert header is not None

     def test_deep_numbered_section(self):
         header = _extract_section_header("3.2.1 Assessment Methodology")
         assert header is not None

     def test_broken_then_normalized_detects(self):
         """A broken NIST control ID is detected once it has been normalized."""
         repaired = _normalize_pdf_text("AC - 1 Account Management")
         assert _extract_section_header(repaired) is not None

     def test_broken_csf_then_normalized_detects(self):
         repaired = _normalize_pdf_text("GV . OC - 01 Organizational Context")
         assert _extract_section_header(repaired) is not None

     def test_broken_section_num_then_normalized(self):
         repaired = _normalize_pdf_text("2 . 1 Risk Framing")
         assert _extract_section_header(repaired) is not None
 # =========================================================================
 # Chunking with NIST-style text
 # =========================================================================
 class TestNistChunking:
     """chunk_text_legal() must tag chunks of NIST-style text with their control ID."""

     # Three consecutive control sections in clean (already normalized) form.
     NIST_SAMPLE = (
         "AC-1 Account Management\n"
         "The organization develops, documents, and disseminates an access "
         "control policy that addresses purpose, scope, roles, responsibilities, "
         "management commitment, coordination among organizational entities, "
         "and compliance.\n\n"
         "AC-2 Access Enforcement\n"
         "The information system enforces approved authorizations for logical "
         "access to information and system resources in accordance with "
         "applicable access control policies.\n\n"
         "AC-3 Information Flow Enforcement\n"
         "The system enforces approved authorizations for controlling the flow "
         "of information within the system and between interconnected systems.\n"
     )

     def test_chunks_have_section_prefix(self):
         pieces = chunk_text_legal(self.NIST_SAMPLE, chunk_size=300, overlap=50)
         has_ac1 = any("[AC-1" in piece for piece in pieces)
         assert has_ac1
         has_ac2 = any("[AC-2" in piece for piece in pieces)
         assert has_ac2

     def test_sections_detected(self):
         pieces = chunk_text_legal(self.NIST_SAMPLE, chunk_size=500, overlap=50)
         assert len(pieces) >= 2

     def test_normalized_broken_text_chunks_correctly(self):
         """Text with broken control IDs chunks correctly once normalized."""
         raw = (
             "AC - 1 Account Management\n"
             "The organization develops, documents, and disseminates an access "
             "control policy that addresses purpose, scope, roles, responsibilities, "
             "management commitment, coordination among organizational entities, "
             "and compliance with applicable regulations and standards.\n\n"
             "AC - 2 Access Enforcement\n"
             "The information system enforces approved authorizations for logical "
             "access to information and system resources in accordance with "
             "applicable access control policies and procedures.\n"
         )
         pieces = chunk_text_legal(_normalize_pdf_text(raw), chunk_size=300, overlap=50)
         has_ac1 = any("[AC-1" in piece for piece in pieces)
         assert has_ac1
         has_ac2 = any("[AC-2" in piece for piece in pieces)
         assert has_ac2