Files
breakpilot-compliance/scripts/qa/gap_analysis.py
Benjamin Admin 643b26618f
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
feat: Control Library UI, dedup migration, QA tooling, docs
- Control Library: parent control display, ObligationTypeBadge,
  GenerationStrategyBadge variants, evidence string fallback
- API: expose parent_control_uuid/id/title in canonical controls
- Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility
- Migration 074: control_parent_links + control_dedup_reviews tables
- QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup,
  phase5 normalize, phase74 gap fill, sync_db, run_job
- Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-21 11:56:08 +01:00

189 lines
6.6 KiB
Python

"""
Phase 7.3: Gap Analysis — Identify articles/sections WITHOUT controls.
For each regulation PDF:
1. Extract all articles/sections from the PDF
2. Compare with controls in the DB that reference this article
3. Report gaps (articles with no controls)
Usage:
python3 gap_analysis.py # show all gaps
python3 gap_analysis.py --source "DSGVO" # filter by source
"""
import os
import sys
import json
import re
import psycopg2
import urllib.parse
from pathlib import Path
from collections import defaultdict
# Import from pdf_qa_all
sys.path.insert(0, os.path.dirname(__file__))
from pdf_qa_all import (
SOURCE_FILE_MAP, read_file, classify_doc, normalize,
build_eu_article_index, build_de_law_index, build_nist_index,
build_owasp_index, build_generic_index, MAX_ARTICLES
)
# Only analyze sources with significant control counts (skip sources with <5 controls)
MIN_CONTROLS = 5

# Article types considered substantive for coverage purposes; preamble/annex
# entries are reported separately and never counted as gaps.
SUBSTANTIVE_TYPES = {"article", "section", "control", "requirement", "category"}

# Cap on how many individual gap labels are printed per source.
MAX_GAPS_SHOWN = 30


def _parse_source_filter(argv):
    """Return the value following ``--source`` in *argv*, or None if absent."""
    if "--source" in argv:
        idx = argv.index("--source")
        if idx + 1 < len(argv):
            return argv[idx + 1]
    return None


def _connect():
    """Open a psycopg2 connection from DATABASE_URL.

    The search_path is set to ``compliance,public`` so unqualified table
    names resolve to the compliance schema.  Exits with a readable error
    instead of a bare KeyError when DATABASE_URL is unset.
    """
    db_url = os.environ.get("DATABASE_URL")
    if not db_url:
        sys.exit("error: DATABASE_URL environment variable is not set")
    parsed = urllib.parse.urlparse(db_url)
    return psycopg2.connect(
        host=parsed.hostname,
        port=parsed.port or 5432,
        user=parsed.username,
        password=parsed.password,
        dbname=parsed.path.lstrip('/'),
        options="-c search_path=compliance,public",
    )


def _load_controls(cur):
    """Return ``{source: {article: (article_type, control_count)}}`` from the DB.

    Controls in release_state 'duplicate' or 'too_close' are excluded.
    NOTE(review): if one article appears with several article_types, the last
    row wins — same behavior as the original; confirm that is intended.
    """
    cur.execute("""
        SELECT source_citation->>'source' as source,
               source_citation->>'article' as article,
               source_citation->>'article_type' as article_type,
               count(*) as cnt
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' IS NOT NULL
          AND release_state NOT IN ('duplicate', 'too_close')
        GROUP BY 1, 2, 3
        ORDER BY 1, 2
    """)
    controls_by_source = defaultdict(dict)
    for source, article, art_type, cnt in cur.fetchall():
        if article:
            controls_by_source[source][article] = (art_type or "unknown", cnt)
    return controls_by_source


def _build_index(text, source_name):
    """Build the article index for *text* with the doc-type-specific parser."""
    doc_type = classify_doc(source_name)
    if doc_type == "eu_regulation":
        return build_eu_article_index(text, max_article=MAX_ARTICLES.get(source_name))
    if doc_type == "de_law":
        return build_de_law_index(text)
    if doc_type == "nist":
        return build_nist_index(text)
    if doc_type == "owasp":
        return build_owasp_index(text, source_name)
    return build_generic_index(text)


def main():
    """Compare regulation PDFs against DB controls and report coverage gaps.

    For every source in SOURCE_FILE_MAP (optionally narrowed by --source),
    extracts the article index from the PDF text, checks which substantive
    articles have at least one control in the DB, prints a per-source
    coverage report, and writes the full gap report as JSON to
    /tmp/gap_analysis_results.json.
    """
    source_filter = _parse_source_filter(sys.argv)

    conn = _connect()
    try:  # ensure the connection is closed even if analysis raises
        cur = conn.cursor()
        controls_by_source = _load_controls(cur)

        total_gaps = 0
        total_articles_checked = 0
        total_covered = 0
        gap_report = []

        sources_to_check = sorted(SOURCE_FILE_MAP.keys())
        if source_filter:
            needle = source_filter.lower()
            sources_to_check = [s for s in sources_to_check if needle in s.lower()]

        for source_name in sources_to_check:
            filename = SOURCE_FILE_MAP.get(source_name)
            if filename is None:
                continue
            controls = controls_by_source.get(source_name, {})
            # Skip thinly-covered sources unless the user explicitly filtered.
            if len(controls) < MIN_CONTROLS and not source_filter:
                continue

            # Read PDF text and build the article index.
            text = read_file(filename)
            if text is None:
                continue
            index = _build_index(text, source_name)
            if not index:
                continue

            substantive = [(pos, label, typ) for pos, label, typ in index
                           if typ in SUBSTANTIVE_TYPES]
            preamble = [entry for entry in index if entry[2] == "preamble"]
            annex = [entry for entry in index if entry[2] == "annex"]

            # Partition substantive articles into covered vs. gaps.
            covered = [label for _pos, label, _typ in substantive if label in controls]
            gaps = [(label, typ) for _pos, label, typ in substantive
                    if label not in controls]

            total_articles_checked += len(substantive)
            total_covered += len(covered)
            total_gaps += len(gaps)

            # BUG FIX: sum per-article control counts rather than counting
            # distinct articles, so these figures are consistent with the
            # "total" control count printed beside them.
            preamble_controls = sum(cnt for typ, cnt in controls.values()
                                    if typ == "preamble")
            annex_controls = sum(cnt for typ, cnt in controls.values()
                                 if typ == "annex")

            coverage_pct = len(covered) / len(substantive) * 100 if substantive else 0

            print(f"\n{'='*70}")
            print(f"{source_name}")
            print(f" PDF articles: {len(substantive)} substantive, "
                  f"{len(preamble)} preamble, {len(annex)} annex")
            print(f" DB controls: {sum(v[1] for v in controls.values())} total "
                  f"({preamble_controls} preamble, {annex_controls} annex)")
            print(f" Coverage: {len(covered)}/{len(substantive)} "
                  f"({coverage_pct:.0f}%)")
            if gaps:
                print(f" GAPS ({len(gaps)}):")
                for label, typ in gaps[:MAX_GAPS_SHOWN]:  # limit output
                    print(f" - {label} [{typ}]")
                if len(gaps) > MAX_GAPS_SHOWN:
                    print(f" ... and {len(gaps)-MAX_GAPS_SHOWN} more")

            gap_report.append({
                "source": source_name,
                "total_articles": len(substantive),
                "covered": len(covered),
                "gaps": len(gaps),
                "coverage_pct": round(coverage_pct, 1),
                "gap_articles": [{"label": l, "type": t} for l, t in gaps],
            })

        # Summary
        print(f"\n{'='*70}")
        print("GAP ANALYSIS SUMMARY")
        print(f"{'='*70}")
        # BUG FIX: the original added len(gap_report) to the count of every
        # source with a mapped file, double-counting each analyzed source
        # (and counting skipped ones once).  One report entry == one source.
        print(f" Sources analyzed: {len(gap_report)}")
        print(f" Total articles in PDFs: {total_articles_checked}")
        print(f" Articles with controls: {total_covered}")
        print(f" Articles WITHOUT controls: {total_gaps}")
        if total_articles_checked:
            print(f" Overall coverage: {total_covered/total_articles_checked*100:.1f}%")
        print(f"\n Sources with gaps:")
        for r in sorted(gap_report, key=lambda x: -x["gaps"]):
            print(f" {r['source']:45s} {r['gaps']:4d} gaps "
                  f"({r['covered']}/{r['total_articles']} = {r['coverage_pct']}%)")

        # Save the machine-readable report.
        out_path = "/tmp/gap_analysis_results.json"
        with open(out_path, 'w') as f:
            json.dump(gap_report, f, indent=2, ensure_ascii=False)
        print(f"\n Full report saved to {out_path}")
    finally:
        conn.close()


if __name__ == "__main__":
    main()