diff --git a/control-pipeline/scripts/reingest_d5.py b/control-pipeline/scripts/reingest_d5.py index 31d880a..db11eb1 100644 --- a/control-pipeline/scripts/reingest_d5.py +++ b/control-pipeline/scripts/reingest_d5.py @@ -180,6 +180,29 @@ def delete_old_chunks(qdrant_url: str, collection: str, object_name: str) -> int return 0 # Qdrant delete doesn't return count +def _delete_old_chunks_safe( + qdrant_url: str, collection: str, object_name: str, keep_doc_id: str, +) -> None: + """Delete old chunks for a document, keeping chunks with keep_doc_id.""" + with httpx.Client(timeout=30.0) as c: + resp = c.post( + f"{qdrant_url}/collections/{collection}/points/delete", + json={ + "filter": { + "must": [{ + "key": "object_name", + "match": {"value": object_name}, + }], + "must_not": [{ + "key": "document_id", + "match": {"value": keep_doc_id}, + }], + } + }, + ) + resp.raise_for_status() + + def reupload_document( rag_url: str, file_bytes: bytes, @@ -220,7 +243,11 @@ def process_document( progress: dict, max_retries: int = 2, ) -> bool: - """Process a single document: download → delete → re-upload. Returns success.""" + """Process a single document: download → upload → verify → delete old. + + Safe order: new chunks are created FIRST, old chunks deleted only after + successful verification (upload-before-delete pattern). + """ key = doc_key(doc["object_name"], doc["collection"]) # Skip if already done @@ -237,20 +264,32 @@ def process_document( "status": "skipped", "reason": "empty_file"} return False - # 2. Delete old chunks - delete_old_chunks(qdrant_url, doc["collection"], doc["object_name"]) - - # 3. Re-upload + # 2. Upload FIRST (creates new chunks alongside old ones) result = reupload_document( rag_url, file_bytes, doc["filename"], doc["collection"], doc["form"], doc["extra_metadata"], ) + new_chunks = result.get("chunks_count", 0) + new_doc_id = result.get("document_id", "") + if new_chunks == 0: + logger.error(" Upload produced 0 chunks — keeping old data: %s", + doc["object_name"]) + progress.setdefault("documents", {})[key] = { + "status": "error", "error": "0 new chunks"} + return False + + # 3. Delete OLD chunks only (exclude the new document_id) + _delete_old_chunks_safe( + qdrant_url, doc["collection"], + doc["object_name"], new_doc_id, + ) + # 4. Record success progress.setdefault("documents", {})[key] = { "status": "done", "old_chunks": doc["old_chunk_count"], - "new_chunks": result.get("chunks_count", 0), + "new_chunks": new_chunks, "new_document_id": result.get("document_id", ""), "completed_at": datetime.now(timezone.utc).isoformat(), } diff --git a/control-pipeline/scripts/reingest_nist.py b/control-pipeline/scripts/reingest_nist.py new file mode 100644 index 0000000..88a2ffa --- /dev/null +++ b/control-pipeline/scripts/reingest_nist.py @@ -0,0 +1,485 @@ +#!/usr/bin/env python3 +"""Safe re-ingestion of NIST/BSI/ENISA PDFs from MinIO. + +Uses upload-before-delete pattern: new chunks are created FIRST, +old chunks are only deleted after successful verification. 
+ +Usage: + python3 control-pipeline/scripts/reingest_nist.py [--dry-run] + python3 control-pipeline/scripts/reingest_nist.py --only-missing +""" + +import argparse +import json +import logging +import sys +import time + +import httpx + +sys.path.insert(0, "control-pipeline/scripts") +from reingest_d5_config import ( # noqa: E402 + CHUNK_OVERLAP, + CHUNK_SIZE, + CHUNK_STRATEGY, + DEFAULT_QDRANT_URL, + DEFAULT_RAG_URL, + content_type_from_filename, +) + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", +) +logger = logging.getLogger("reingest-nist") + +UPLOAD_TIMEOUT = 1800.0 # 30 min for large PDFs + +# ------------------------------------------------------------------- +# Documents to re-ingest +# ------------------------------------------------------------------- + +# 4 documents with 0 chunks (deleted by D5, upload failed) +MISSING_DOCS = [ + { + "object_name": "compliance/bund/compliance/2026/NIST_SP_800_53r5.pdf", + "collection": "bp_compliance_datenschutz", + "filename": "NIST_SP_800_53r5.pdf", + "extra_metadata": { + "regulation_id": "nist_sp800_53r5", + "source_id": "nist", + "doc_type": "controls_catalog", + "guideline_name": "NIST SP 800-53 Rev. 5 Security and Privacy Controls", + "license": "public_domain_us_gov", + "attribution": "NIST", + "source": "nist.gov", + }, + }, + { + "object_name": "compliance/bund/compliance/2026/nist_sp_800_82r3.pdf", + "collection": "bp_compliance_ce", + "filename": "nist_sp_800_82r3.pdf", + "extra_metadata": { + "regulation_id": "nist_sp_800_82r3", + "regulation_name_de": "NIST SP 800-82 Rev. 3 — Guide to OT Security", + "regulation_name_en": "NIST SP 800-82 Rev. 3 — Guide to OT Security", + "regulation_short": "NIST SP 800-82", + "category": "ot_security", + "license": "public_domain_us", + "source": "nist.gov", + }, + }, + { + "object_name": "compliance/bund/compliance/2026/nist_sp_800_160v1r1.pdf", + "collection": "bp_compliance_ce", + "filename": "nist_sp_800_160v1r1.pdf", + "extra_metadata": { + "regulation_id": "nist_sp_800_160v1r1", + "regulation_name_de": "NIST SP 800-160 Vol. 1 Rev. 1", + "regulation_name_en": "NIST SP 800-160 Vol. 1 Rev. 
1", + "regulation_short": "NIST SP 800-160", + "category": "security_engineering", + "license": "public_domain_us", + "source": "nist.gov", + }, + }, + { + "object_name": "compliance/bund/compliance/2026/NIST_SP_800_207.pdf", + "collection": "bp_compliance_datenschutz", + "filename": "NIST_SP_800_207.pdf", + "extra_metadata": { + "regulation_id": "nist_sp800_207", + "source_id": "nist", + "doc_type": "architecture", + "guideline_name": "NIST SP 800-207 Zero Trust Architecture", + "license": "public_domain_us_gov", + "attribution": "NIST", + "source": "nist.gov", + }, + }, +] + +# Additional NIST/BSI/ENISA docs with <10% section rate (re-ingest for quality) +LOW_QUALITY_DOCS = [ + { + "object_name": "compliance/bund/compliance/2026/nist_csf_2_0.pdf", + "collection": "bp_compliance_datenschutz", + "filename": "nist_csf_2_0.pdf", + "extra_metadata": { + "regulation_id": "nist_csf_2_0", + "license": "public_domain_us", + "source": "nist.gov", + }, + }, + { + "object_name": "compliance/bund/compliance/2026/nistir_8259a.pdf", + "collection": "bp_compliance_datenschutz", + "filename": "nistir_8259a.pdf", + "extra_metadata": { + "regulation_id": "nistir_8259a", + "license": "public_domain_us", + "source": "nist.gov", + }, + }, + { + "object_name": "compliance/bund/compliance/2026/nist_ai_rmf.pdf", + "collection": "bp_compliance_datenschutz", + "filename": "nist_ai_rmf.pdf", + "extra_metadata": { + "regulation_id": "nist_ai_rmf", + "license": "public_domain_us", + "source": "nist.gov", + }, + }, + { + "object_name": "compliance/bund/compliance/2026/nist_sp_800_30r1.pdf", + "collection": "bp_compliance_ce", + "filename": "nist_sp_800_30r1.pdf", + "extra_metadata": { + "regulation_id": "nist_sp_800_30r1", + "license": "public_domain_us", + "source": "nist.gov", + }, + }, + { + "object_name": "compliance/bund/compliance/2026/enisa_supply_chain_good_practices.pdf", + "collection": "bp_compliance_ce", + "filename": "enisa_supply_chain_good_practices.pdf", + "extra_metadata": { + "regulation_id": "enisa_supply_chain_good_practices", + "license": "reuse_with_attribution", + "source": "enisa.europa.eu", + }, + }, + { + "object_name": "compliance/bund/compliance/2026/enisa_ics_scada.pdf", + "collection": "bp_compliance_ce", + "filename": "enisa_ics_scada.pdf", + "extra_metadata": { + "regulation_id": "enisa_ics_scada_dependencies", + "license": "reuse_with_attribution", + "source": "enisa.europa.eu", + }, + }, + { + "object_name": "compliance/bund/compliance/2026/enisa_supply_chain_security.pdf", + "collection": "bp_compliance_ce", + "filename": "enisa_supply_chain_security.pdf", + "extra_metadata": { + "regulation_id": "enisa_threat_landscape_supply_chain", + "license": "reuse_with_attribution", + "source": "enisa.europa.eu", + }, + }, + { + "object_name": "compliance/bund/compliance/2026/cisa_secure_by_design.pdf", + "collection": "bp_compliance_ce", + "filename": "cisa_secure_by_design.pdf", + "extra_metadata": { + "regulation_id": "cisa_secure_by_design", + "license": "public_domain_us", + "source": "cisa.gov", + }, + }, + { + "object_name": "compliance/bund/compliance/2026/cvss_v4_0.pdf", + "collection": "bp_compliance_ce", + "filename": "cvss_v4_0.pdf", + "extra_metadata": { + "regulation_id": "cvss_v4_0", + "license": "public_domain_us", + "source": "first.org", + }, + }, +] + + +# ------------------------------------------------------------------- +# Qdrant helpers +# ------------------------------------------------------------------- +def count_chunks(qdrant_url: str, collection: str, object_name: 
str) -> int: + """Count existing chunks for a document in Qdrant.""" + with httpx.Client(timeout=30.0) as c: + resp = c.post( + f"{qdrant_url}/collections/{collection}/points/count", + json={ + "filter": { + "must": [{ + "key": "object_name", + "match": {"value": object_name}, + }] + }, + "exact": True, + }, + ) + resp.raise_for_status() + return resp.json()["result"]["count"] + + +def get_old_document_ids( + qdrant_url: str, collection: str, object_name: str, +) -> set: + """Get all document_ids for existing chunks of this document.""" + doc_ids = set() + offset = None + with httpx.Client(timeout=60.0) as c: + while True: + body = { + "filter": { + "must": [{ + "key": "object_name", + "match": {"value": object_name}, + }] + }, + "limit": 100, + "with_payload": ["document_id"], + } + if offset is not None: + body["offset"] = offset + resp = c.post( + f"{qdrant_url}/collections/{collection}/points/scroll", + json=body, + ) + resp.raise_for_status() + data = resp.json()["result"] + for pt in data["points"]: + did = pt.get("payload", {}).get("document_id") + if did: + doc_ids.add(did) + offset = data.get("next_page_offset") + if offset is None: + break + return doc_ids + + +def delete_by_document_ids( + qdrant_url: str, collection: str, doc_ids: set, +) -> None: + """Delete chunks matching specific document_ids.""" + for did in doc_ids: + with httpx.Client(timeout=30.0) as c: + c.post( + f"{qdrant_url}/collections/{collection}/points/delete", + json={ + "filter": { + "must": [{ + "key": "document_id", + "match": {"value": did}, + }] + } + }, + ).raise_for_status() + + +def check_section_rate( + qdrant_url: str, collection: str, object_name: str, +) -> tuple: + """Check section rate for a document's chunks. Returns (total, with_section).""" + total = 0 + with_section = 0 + offset = None + with httpx.Client(timeout=60.0) as c: + while True: + body = { + "filter": { + "must": [{ + "key": "object_name", + "match": {"value": object_name}, + }] + }, + "limit": 100, + "with_payload": ["section"], + } + if offset is not None: + body["offset"] = offset + resp = c.post( + f"{qdrant_url}/collections/{collection}/points/scroll", + json=body, + ) + resp.raise_for_status() + data = resp.json()["result"] + for pt in data["points"]: + total += 1 + sec = pt.get("payload", {}).get("section", "") + if sec and sec.strip(): + with_section += 1 + offset = data.get("next_page_offset") + if offset is None: + break + return total, with_section + + +# ------------------------------------------------------------------- +# Upload +# ------------------------------------------------------------------- +def download_from_minio(rag_url: str, object_name: str) -> bytes: + """Download file from MinIO via RAG service presigned URL.""" + with httpx.Client(timeout=60.0, verify=False) as c: + resp = c.get(f"{rag_url}/api/v1/documents/download/{object_name}") + resp.raise_for_status() + presigned_url = resp.json()["url"] + + with httpx.Client(timeout=300.0, verify=False) as c: + resp = c.get(presigned_url) + resp.raise_for_status() + return resp.content + + +def upload_document( + rag_url: str, + file_bytes: bytes, + filename: str, + collection: str, + extra_metadata: dict, +) -> dict: + """Upload document to RAG service.""" + ct = content_type_from_filename(filename) + form_data = { + "collection": collection, + "data_type": "compliance", + "bundesland": "bund", + "use_case": "compliance", + "year": "2026", + "chunk_strategy": CHUNK_STRATEGY, + "chunk_size": str(CHUNK_SIZE), + "chunk_overlap": str(CHUNK_OVERLAP), + 
"metadata_json": json.dumps(extra_metadata, ensure_ascii=False), + } + with httpx.Client(timeout=UPLOAD_TIMEOUT, verify=False) as c: + resp = c.post( + f"{rag_url}/api/v1/documents/upload", + files={"file": (filename, file_bytes, ct)}, + data=form_data, + ) + resp.raise_for_status() + return resp.json() + + +# ------------------------------------------------------------------- +# Main processing +# ------------------------------------------------------------------- +def process_document( + doc: dict, + rag_url: str, + qdrant_url: str, + dry_run: bool = False, +) -> dict: + """Safe re-ingest: upload first, then delete old. Returns result dict.""" + obj = doc["object_name"] + coll = doc["collection"] + fname = doc["filename"] + + # 1. Check existing state + old_count = count_chunks(qdrant_url, coll, obj) + old_doc_ids = get_old_document_ids(qdrant_url, coll, obj) if old_count > 0 else set() + logger.info(" [%s] existing: %d chunks, %d document_ids", + fname, old_count, len(old_doc_ids)) + + if dry_run: + logger.info(" [%s] DRY RUN — would download + upload + delete old", fname) + return {"status": "dry_run", "old_chunks": old_count} + + # 2. Download from MinIO + logger.info(" [%s] downloading from MinIO...", fname) + file_bytes = download_from_minio(rag_url, obj) + size_mb = len(file_bytes) / (1024 * 1024) + logger.info(" [%s] downloaded %.1f MB", fname, size_mb) + + # 3. Upload FIRST (creates new chunks) + logger.info(" [%s] uploading to RAG service...", fname) + result = upload_document(rag_url, file_bytes, fname, coll, doc["extra_metadata"]) + new_chunks = result.get("chunks_count", 0) + new_doc_id = result.get("document_id", "") + logger.info(" [%s] uploaded: %d new chunks (doc_id=%s)", fname, new_chunks, new_doc_id) + + # 4. Verify new chunks exist + if new_chunks == 0: + logger.error(" [%s] UPLOAD PRODUCED 0 CHUNKS — keeping old data!", fname) + return {"status": "error", "error": "0 new chunks", "old_chunks": old_count} + + # 5. Delete old chunks (only if there were any) + if old_doc_ids: + logger.info(" [%s] deleting %d old document_ids...", fname, len(old_doc_ids)) + delete_by_document_ids(qdrant_url, coll, old_doc_ids) + logger.info(" [%s] old chunks deleted", fname) + + # 6. 
Check section rate + total, with_sec = check_section_rate(qdrant_url, coll, obj) + pct = (with_sec / total * 100) if total > 0 else 0 + logger.info(" [%s] section rate: %d/%d (%.0f%%)", fname, with_sec, total, pct) + + return { + "status": "ok", + "old_chunks": old_count, + "new_chunks": new_chunks, + "new_document_id": new_doc_id, + "section_rate": round(pct, 1), + } + + +def main(): + parser = argparse.ArgumentParser(description="Safe NIST/BSI/ENISA re-ingestion") + parser.add_argument("--dry-run", action="store_true", help="Show what would happen") + parser.add_argument("--only-missing", action="store_true", + help="Only re-ingest the 4 missing docs (skip low-quality)") + parser.add_argument("--rag-url", default=DEFAULT_RAG_URL) + parser.add_argument("--qdrant-url", default=DEFAULT_QDRANT_URL) + args = parser.parse_args() + + docs = list(MISSING_DOCS) + if not args.only_missing: + docs.extend(LOW_QUALITY_DOCS) + + logger.info("=" * 60) + logger.info("NIST/BSI/ENISA Safe Re-Ingestion") + logger.info(" Documents: %d (%d missing + %d low-quality)", + len(docs), len(MISSING_DOCS), + 0 if args.only_missing else len(LOW_QUALITY_DOCS)) + logger.info(" RAG: %s", args.rag_url) + logger.info(" Qdrant: %s", args.qdrant_url) + logger.info(" Dry run: %s", args.dry_run) + logger.info("=" * 60) + + results = {} + ok = 0 + errors = 0 + + for i, doc in enumerate(docs, 1): + logger.info("[%d/%d] %s → %s", i, len(docs), doc["filename"], doc["collection"]) + try: + r = process_document(doc, args.rag_url, args.qdrant_url, args.dry_run) + results[doc["filename"]] = r + if r["status"] == "ok": + ok += 1 + elif r["status"] == "error": + errors += 1 + except Exception as e: + logger.error(" FAILED: %s", e) + results[doc["filename"]] = {"status": "error", "error": str(e)} + errors += 1 + + if i < len(docs): + time.sleep(2) + + # Summary + logger.info("") + logger.info("=" * 60) + logger.info("RESULTS") + logger.info("=" * 60) + for fname, r in results.items(): + status = r["status"].upper() + old = r.get("old_chunks", "?") + new = r.get("new_chunks", "?") + sec = r.get("section_rate", "?") + logger.info(" %-40s %s old=%s new=%s sect=%.0f%%", + fname, status, old, new, sec if isinstance(sec, float) else 0) + + logger.info("") + logger.info("OK: %d, Errors: %d, Total: %d", ok, errors, len(docs)) + + if errors > 0: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/embedding-service/main.py b/embedding-service/main.py index 03f3ca6..cdcceca 100644 --- a/embedding-service/main.py +++ b/embedding-service/main.py @@ -12,6 +12,7 @@ This service handles all ML-heavy operations, keeping the main klausur-service l import logging import re +import unicodedata from typing import List, Optional from contextlib import asynccontextmanager @@ -299,6 +300,9 @@ _LEGAL_SECTION_RE = re.compile( # NIST/ENISA/standard numbering r'|\d+\.\d+(?:\.\d+)*\s+[A-ZÄÖÜ]' # 1.1 Title, 2.3.1 Subtitle r'|[A-Z]{2,4}[-\.]\d+(?:\.\d+)*\b' # AC-1, AU-2, PO.1, PW.1.1 + r'|[A-Z]{2}\.[A-Z]{2}-\d{2}\b' # GV.OC-01 (NIST CSF 2.0) + r'|[A-Z]{2,4}-\d+\(\d+\)' # AC-1(1) (NIST enhancements) + r'|A\d{2}(?::\d{4})?\b' # A01:2021 (OWASP Top 10) r'|Table\s+\d+' # Table 1, Table A-1 r'|Figure\s+\d+' # Figure 1 r'|Appendix\s+[A-Z\d]' # Appendix A, Appendix 1 @@ -827,6 +831,34 @@ def extract_pdf_unstructured(pdf_content: bytes) -> ExtractPDFResponse: pass +def _normalize_pdf_text(text: str) -> str: + """Fix broken spacing from multi-column PDF extraction. + + pdfplumber/pypdf often break section numbers in multi-column NIST/BSI/ENISA + PDFs: "1 . 
1" instead of "1.1", "AC - 1" instead of "AC-1". + """ + # Unicode NFKC: decompose ligatures (fi → fi) before other fixes + text = unicodedata.normalize('NFKC', text) + # Remove soft hyphens and zero-width spaces + text = text.replace('\u00ad', '').replace('\u200b', '') + # "1 . 1" → "1.1" (broken section numbers, apply repeatedly for nested) + prev = None + while prev != text: + prev = text + text = re.sub(r'(\d+)\s+\.\s+(\d+)', r'\1.\2', text) + # "AC - 1" → "AC-1" (broken NIST control IDs, 2-4 uppercase letters) + text = re.sub(r'\b([A-Z]{2,4})\s+-\s+(\d+)\b', r'\1-\2', text) + # "GV . OC - 01" → "GV.OC-01" (NIST CSF 2.0 compound IDs) + text = re.sub( + r'\b([A-Z]{2})\s*\.\s*([A-Z]{2})\s*-\s*(\d{2})\b', r'\1.\2-\3', text + ) + # "AC - 1 ( 1 )" → "AC-1(1)" (NIST enhancements with spaced parens) + text = re.sub(r'\(\s+(\d+)\s+\)', r'(\1)', text) + # Collapse multiple horizontal spaces (keep newlines) + text = re.sub(r'[^\S\n]{2,}', ' ', text) + return text + + def extract_pdf_pdfplumber(pdf_content: bytes) -> ExtractPDFResponse: """Extract PDF using pdfplumber (best for multi-column EU regulation PDFs).""" import io @@ -839,12 +871,12 @@ def extract_pdf_pdfplumber(pdf_content: bytes) -> ExtractPDFResponse: with pdfplumber.open(pdf_file) as pdf: page_count = len(pdf.pages) for page in pdf.pages: - text = page.extract_text(x_tolerance=2, y_tolerance=3) + text = page.extract_text(x_tolerance=3, y_tolerance=4) if text: text_parts.append(text) return ExtractPDFResponse( - text="\n\n".join(text_parts), + text=_normalize_pdf_text("\n\n".join(text_parts)), backend_used="pdfplumber", pages=page_count, table_count=0, @@ -866,7 +898,7 @@ def extract_pdf_pypdf(pdf_content: bytes) -> ExtractPDFResponse: text_parts.append(text) return ExtractPDFResponse( - text="\n\n".join(text_parts), + text=_normalize_pdf_text("\n\n".join(text_parts)), backend_used="pypdf", pages=len(reader.pages), table_count=0 diff --git a/embedding-service/test_nist_normalization.py b/embedding-service/test_nist_normalization.py new file mode 100644 index 0000000..6439529 --- /dev/null +++ b/embedding-service/test_nist_normalization.py @@ -0,0 +1,173 @@ +""" +Tests for NIST/BSI/ENISA PDF text normalization and section detection. + +Covers: +- _normalize_pdf_text() fixing broken multi-column PDF artifacts +- Section detection after normalization +- NIST CSF 2.0 compound IDs (GV.OC-01) +- NIST SP 800-53 control IDs (AC-1, AC-1(1)) +- OWASP Top 10 IDs (A01:2021) +- Unicode normalization (ligatures, soft hyphens) +""" + +from main import ( + _normalize_pdf_text, + _extract_section_header, + chunk_text_legal, +) + + +# ========================================================================= +# _normalize_pdf_text — broken spacing fixes +# ========================================================================= + +class TestNormalizePdfText: + + def test_broken_section_number(self): + assert _normalize_pdf_text("1 . 1 Risk Framing") == "1.1 Risk Framing" + + def test_nested_section_number(self): + assert _normalize_pdf_text("2 . 3 . 1 Subtitle") == "2.3.1 Subtitle" + + def test_broken_nist_control_id(self): + assert _normalize_pdf_text("AC - 1 Account Management") == "AC-1 Account Management" + + def test_broken_nist_control_au(self): + assert _normalize_pdf_text("AU - 2 Audit Events") == "AU-2 Audit Events" + + def test_broken_csf_compound_id(self): + assert _normalize_pdf_text("GV . 
OC - 01 Context") == "GV.OC-01 Context" + + def test_broken_enhancement_parens(self): + assert _normalize_pdf_text("AC-1( 1 ) Enhancement") == "AC-1(1) Enhancement" + + def test_soft_hyphen_removed(self): + assert _normalize_pdf_text("infor\u00admation") == "information" + + def test_zero_width_space_removed(self): + assert _normalize_pdf_text("data\u200bprotection") == "dataprotection" + + def test_ligature_fi_normalized(self): + # U+FB01 = fi ligature + assert _normalize_pdf_text("con\ufb01dential") == "confidential" + + def test_ligature_fl_normalized(self): + # U+FB02 = fl ligature + assert _normalize_pdf_text("over\ufb02ow") == "overflow" + + def test_multiple_spaces_collapsed(self): + assert _normalize_pdf_text("too many spaces") == "too many spaces" + + def test_newlines_preserved(self): + result = _normalize_pdf_text("line one\nline two\n\nline three") + assert "\n" in result + assert "line one" in result + assert "line three" in result + + def test_normal_text_unchanged(self): + text = "AC-1 Account Management requires proper controls." + assert _normalize_pdf_text(text) == text + + def test_combined_artifacts(self): + """Multiple broken artifacts in one text block.""" + broken = "1 . 1 Overview\nAC - 1 Account Management\nGV . OC - 01 Context" + fixed = _normalize_pdf_text(broken) + assert "1.1 Overview" in fixed + assert "AC-1 Account Management" in fixed + assert "GV.OC-01 Context" in fixed + + +# ========================================================================= +# Section detection after normalization +# ========================================================================= + +class TestNistSectionDetection: + + def test_nist_control_ac1(self): + assert _extract_section_header("AC-1 Account Management") is not None + + def test_nist_control_au2(self): + assert _extract_section_header("AU-2 Audit Events") is not None + + def test_nist_csf_compound(self): + assert _extract_section_header("GV.OC-01 Organizational Context") is not None + + def test_nist_enhancement(self): + assert _extract_section_header("AC-1(1) Policy and Procedures") is not None + + def test_owasp_top10(self): + assert _extract_section_header("A01:2021 Broken Access Control") is not None + + def test_owasp_without_year(self): + assert _extract_section_header("A03 Injection") is not None + + def test_numbered_section(self): + assert _extract_section_header("2.1 Risk Framing") is not None + + def test_deep_numbered_section(self): + assert _extract_section_header("3.2.1 Assessment Methodology") is not None + + def test_broken_then_normalized_detects(self): + """After normalization, broken NIST IDs should be detected as sections.""" + broken = "AC - 1 Account Management" + normalized = _normalize_pdf_text(broken) + assert _extract_section_header(normalized) is not None + + def test_broken_csf_then_normalized_detects(self): + broken = "GV . OC - 01 Organizational Context" + normalized = _normalize_pdf_text(broken) + assert _extract_section_header(normalized) is not None + + def test_broken_section_num_then_normalized(self): + broken = "2 . 
1 Risk Framing" + normalized = _normalize_pdf_text(broken) + assert _extract_section_header(normalized) is not None + + +# ========================================================================= +# Chunking with NIST-style text +# ========================================================================= + +class TestNistChunking: + + NIST_SAMPLE = ( + "AC-1 Account Management\n" + "The organization develops, documents, and disseminates an access " + "control policy that addresses purpose, scope, roles, responsibilities, " + "management commitment, coordination among organizational entities, " + "and compliance.\n\n" + "AC-2 Access Enforcement\n" + "The information system enforces approved authorizations for logical " + "access to information and system resources in accordance with " + "applicable access control policies.\n\n" + "AC-3 Information Flow Enforcement\n" + "The system enforces approved authorizations for controlling the flow " + "of information within the system and between interconnected systems.\n" + ) + + def test_chunks_have_section_prefix(self): + chunks = chunk_text_legal(self.NIST_SAMPLE, chunk_size=300, overlap=50) + assert any("[AC-1" in c for c in chunks) + assert any("[AC-2" in c for c in chunks) + + def test_sections_detected(self): + chunks = chunk_text_legal(self.NIST_SAMPLE, chunk_size=500, overlap=50) + assert len(chunks) >= 2 + + def test_normalized_broken_text_chunks_correctly(self): + """Broken PDF text should chunk correctly after normalization.""" + broken = ( + "AC - 1 Account Management\n" + "The organization develops, documents, and disseminates an access " + "control policy that addresses purpose, scope, roles, responsibilities, " + "management commitment, coordination among organizational entities, " + "and compliance with applicable regulations and standards.\n\n" + "AC - 2 Access Enforcement\n" + "The information system enforces approved authorizations for logical " + "access to information and system resources in accordance with " + "applicable access control policies and procedures.\n" + ) + normalized = _normalize_pdf_text(broken) + chunks = chunk_text_legal(normalized, chunk_size=300, overlap=50) + assert any("[AC-1" in c for c in chunks) + assert any("[AC-2" in c for c in chunks)