chore(qa): PDF QA v3 — 6,259/7,943 controls matched (79%)
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 43s
CI/CD / test-python-backend-compliance (push) Successful in 33s
CI/CD / test-python-document-crawler (push) Successful in 21s
CI/CD / test-python-dsms-gateway (push) Successful in 22s
CI/CD / validate-canonical-controls (push) Successful in 11s
CI/CD / Deploy (push) Has been skipped

- Added NIST 800-53, OWASP Top 10/ASVS/SAMM/API/MASVS, ENISA ICS PDFs
- Improved normalize() for ligatures, smart quotes, dashes
- Added OWASP-specific index builder (A01:2021, V1.1, MASVS-*)
- 6,259 article assignments in DB (1,817 article, 1,355 preamble,
  1,173 control, 790 annex, 666 section)
- Remaining 1,651 unmatched: Blue Guide (EN text vs DE PDF),
  OWASP multilingual translations (PT/AR/ID/ES)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-20 07:57:52 +01:00
parent 24f02b52ed
commit 0e16640c28
4 changed files with 15335 additions and 37 deletions

View File

@@ -39,22 +39,22 @@ SOURCE_FILE_MAP = {
"IFRS-Übernahmeverordnung": "ifrs_regulation_2023_1803_de.pdf",
# NIST (PDFs)
"NIST SP 800-53 Rev. 5": None, # TODO: Need to find/download
"NIST SP 800-207 (Zero Trust)": None,
"NIST SP 800-63-3": None,
"NIST AI Risk Management Framework": None,
"NIST SP 800-53 Rev. 5": "nist_sp_800_53_r5.pdf",
"NIST SP 800-207 (Zero Trust)": "nist_sp_800_207.pdf",
"NIST SP 800-63-3": "nist_sp_800_63_3.pdf",
"NIST AI Risk Management Framework": "nist_ai_rmf.pdf",
"NIST SP 800-218 (SSDF)": "nist_sp_800_218_ssdf.pdf",
"NIST Cybersecurity Framework 2.0": "nist_csf_2_0.pdf",
# OWASP (no PDFs — these are web-based)
"OWASP Top 10 (2021)": None,
"OWASP ASVS 4.0": None,
"OWASP SAMM 2.0": None,
"OWASP API Security Top 10 (2023)": None,
"OWASP MASVS 2.0": None,
# OWASP (PDFs)
"OWASP Top 10 (2021)": "owasp_top10_2021.pdf",
"OWASP ASVS 4.0": "owasp_asvs_4_0.pdf",
"OWASP SAMM 2.0": "owasp_samm_2_0.pdf",
"OWASP API Security Top 10 (2023)": "owasp_api_top10_2023.pdf",
"OWASP MASVS 2.0": "owasp_masvs_2_0.pdf",
# ENISA (PDFs)
"ENISA ICS/SCADA Dependencies": None,
"ENISA ICS/SCADA Dependencies": "enisa_ics_scada.pdf",
"ENISA Supply Chain Good Practices": "enisa_supply_chain_security.pdf",
"ENISA Threat Landscape Supply Chain": "enisa_supply_chain_security.pdf",
"ENISA Cybersecurity State 2024": None,
@@ -71,14 +71,14 @@ SOURCE_FILE_MAP = {
# EDPB Guidelines (PDFs)
"EDPB Leitlinien 01/2022 (BCR)": "edpb_bcr_01_2022.pdf",
"EDPB Leitlinien 05/2020 - Einwilligung": None, # txt
"EDPB Leitlinien 05/2020 - Einwilligung": "edpb_consent_05_2020.pdf",
"EDPB Leitlinien 08/2020 (Social Media)": "edpb_social_media_08_2020.pdf",
"EDPB Leitlinien 01/2019 (Zertifizierung)": "edpb_certification_01_2019.pdf",
"EDPB Leitlinien 07/2020 (Datentransfers)": "edpb_transfers_07_2020.pdf",
"EDPB Leitlinien 09/2022 (Data Breach)": "edpb_breach_09_2022.pdf",
"EDPB Leitlinien - Berechtigtes Interesse (Art. 6(1)(f))": "edpb_legitimate_interest.pdf",
"EDPB Leitlinien 01/2024 (Berechtigtes Interesse)": "edpb_legitimate_interest.pdf",
"EDPB Leitlinien 04/2019 (Data Protection by Design)": None, # txt
"EDPB Leitlinien 04/2019 (Data Protection by Design)": "edpb_dpbd_04_2019.pdf",
"EDPB Leitlinien 01/2020 (Vernetzte Fahrzeuge)": "edpb_connected_vehicles_01_2020.pdf",
"EDPB Leitlinien 01/2020 (Datentransfers)": "edpb_transfers_07_2020.pdf",
@@ -135,10 +135,18 @@ def classify_doc(source_name):
def normalize(s):
    """Normalize extracted PDF text for reliable substring matching.

    Cleans the usual PDF-extraction artifacts: soft hyphens and
    zero-width characters, typographic ligatures, smart quotes,
    en/em dashes, bullets, and stray control characters. Finally
    applies Unicode NFC normalization and collapses all whitespace
    runs to single spaces.

    Args:
        s: Raw text extracted from a PDF.

    Returns:
        The cleaned, trimmed string.
    """
    # '\xad' and '\u00ad' are the same code point (soft hyphen) — one replace suffices.
    s = s.replace('\u00ad', '')
    s = s.replace('\u200b', '').replace('\u00a0', ' ')  # zero-width space, nbsp
    # Ligatures emitted by many PDF fonts
    s = s.replace('\ufb01', 'fi').replace('\ufb02', 'fl')
    s = s.replace('\ufb00', 'ff').replace('\ufb03', 'ffi').replace('\ufb04', 'ffl')
    s = s.replace('\u2019', "'").replace('\u2018', "'")  # smart single quotes
    s = s.replace('\u201c', '"').replace('\u201d', '"')  # smart double quotes
    s = s.replace('\u2013', '-').replace('\u2014', '-')  # en/em dash
    s = s.replace('\u2022', '-')  # bullet
    s = s.replace('\u00b7', '-')  # middle dot
    # Strip control chars but keep \t, \n, \r so the \s+ collapse below still sees them.
    s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)
    s = unicodedata.normalize('NFC', s)
    s = re.sub(r'\s+', ' ', s)
    return s.strip()
@@ -248,6 +256,47 @@ def build_nist_index(text):
return unique
def build_owasp_index(text, source_name):
"""Build index for OWASP documents."""
items = []
if "Top 10" in source_name and "API" not in source_name:
# OWASP Top 10: A01:2021, A02:2021, etc.
for m in re.finditer(r'(A\d{2}:\d{4})', text):
items.append((m.start(), m.group(1), "category"))
elif "API" in source_name:
# OWASP API Top 10: API1:2023, API2:2023, etc.
for m in re.finditer(r'(API\d+:\d{4})', text):
items.append((m.start(), m.group(1), "category"))
elif "ASVS" in source_name:
# OWASP ASVS: V1.1, V2.1.1, etc.
for m in re.finditer(r'(?:^|\n)\s*(V\d+\.\d+(?:\.\d+)?)\b', text, re.MULTILINE):
items.append((m.start(), m.group(1), "requirement"))
elif "SAMM" in source_name:
# OWASP SAMM: practice names like "Strategy & Metrics", "Education & Guidance"
# Use section numbers
for m in re.finditer(r'(?:^|\n)\s*(\d+\.\d+(?:\.\d+)?)\s+[A-Z]', text, re.MULTILINE):
items.append((m.start(), f"Section {m.group(1)}", "section"))
elif "MASVS" in source_name:
# OWASP MASVS: MASVS-STORAGE-1, MASVS-CRYPTO-1, etc.
for m in re.finditer(r'(MASVS-[A-Z]+-\d+)', text):
items.append((m.start(), m.group(1), "requirement"))
# Fallback: also find generic section numbers
if not items:
for m in re.finditer(r'(?:^|\n)\s*(\d+\.\d+(?:\.\d+)?)\s+[A-Z]', text, re.MULTILINE):
items.append((m.start(), f"Section {m.group(1)}", "section"))
items.sort(key=lambda x: x[0])
seen = set()
unique = []
for pos, label, typ in items:
if label not in seen:
seen.add(label)
unique.append((pos, label, typ))
return unique
def build_generic_index(text):
"""Build a generic section index using numbered headings."""
items = []
@@ -288,11 +337,11 @@ def find_text_in_doc(orig_text, full_norm, index, index_norm_positions):
return None
# Try progressively shorter substrings from different positions
for start_frac in [0.25, 0.1, 0.5, 0.0]:
for length in [80, 60, 40, 30]:
for start_frac in [0.25, 0.1, 0.5, 0.0, 0.75]:
for length in [80, 60, 40, 30, 20]:
start = max(0, int(len(orig_norm) * start_frac))
snippet = orig_norm[start:start+length]
if not snippet or len(snippet) < 25:
if not snippet or len(snippet) < 15:
continue
pos = full_norm.find(snippet)
if pos >= 0:
@@ -380,6 +429,8 @@ def main():
index = build_de_law_index(text)
elif doc_type == "nist":
index = build_nist_index(text)
elif doc_type == "owasp":
index = build_owasp_index(text, source_name)
else:
index = build_generic_index(text)