# breakpilot-core/legal-sources/osha/crawl_osha_otm.py

#!/usr/bin/env python3
"""Crawl OSHA Technical Manual — all chapters as HTML."""

import json
import logging
import time
from pathlib import Path
from urllib.parse import urldefrag, urljoin, urlparse

from playwright.sync_api import sync_playwright

# Configure logging once for the script: INFO and above, each line prefixed
# with a timestamp and the level name.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger("osha-crawl")

# Directory, located next to this script, that receives the chapter HTML files.
OUTPUT_DIR = Path(__file__).parent / "otm_chapters"
# Scheme and host of the OSHA website that the crawl targets.
BASE = "https://www.osha.gov"


def _resolve_chapter_url(base_url, href):
    """Return *href* as an absolute URL with any ``#fragment`` removed.

    Relative hrefs are resolved against *base_url*. Dropping the fragment
    makes in-page anchors of the same chapter collapse to a single URL.
    """
    return urldefrag(urljoin(base_url, href)).url


def _chapter_slug(url):
    """Build the output file stem for a chapter from the path of *url*.

    Only the URL path is used (no scheme, host, query or fragment), so the
    result never contains ``:``, ``?`` or ``#``. For a site-relative link
    such as ``/otm/section-2/chapter-1`` the stem is ``section-2_chapter-1``.
    """
    return urlparse(url).path.replace("/otm/", "").replace("/", "_")


def main():
    """Download every OSHA Technical Manual chapter and write a registry.

    Opens the OTM index page in Chromium, collects the links whose URL
    contains ``/otm/`` and ``chapter``, and saves the rendered HTML of each
    one into ``OUTPUT_DIR``. Chapters whose output file already exists are
    not downloaded again. A failure on a single chapter is logged and
    recorded with ``local_path`` set to ``None``; the crawl then continues.

    Finally ``otm_registry.json`` is written next to this script with one
    entry per chapter (``url``, ``title``, ``local_path``).
    """
    OUTPUT_DIR.mkdir(exist_ok=True)
    registry = []

    with sync_playwright() as p:
        # NOTE(review): the browser is launched with a visible window;
        # presumably deliberate for this site -- confirm before switching
        # to headless.
        browser = p.chromium.launch(headless=False)
        try:
            page = browser.new_page()

            # Step 1: Get all chapter URLs
            page.goto(f"{BASE}/otm", timeout=30000)
            time.sleep(5)
            # Remember the index location now: page.url changes with every
            # later navigation, and relative links must resolve against it.
            index_url = page.url

            chapters = []  # (registry entry, absolute download URL) pairs
            seen = set()
            for link in page.query_selector_all('a[href*="/otm/"]'):
                href = link.get_attribute("href") or ""
                title = (link.inner_text() or "").strip()
                if not href or not title:
                    continue
                target = _resolve_chapter_url(index_url, href)
                # De-duplicate on the normalized URL so that links differing
                # only by "#anchor" are fetched once.
                if "chapter" not in target or target in seen:
                    continue
                seen.add(target)
                chapters.append(({"url": href, "title": title}, target))

            total = len(chapters)
            logger.info("Found %d chapters", total)

            # Step 2: Download each chapter
            for i, (ch, target) in enumerate(chapters, start=1):
                outfile = OUTPUT_DIR / f"{_chapter_slug(target)}.html"

                logger.info("[%d/%d] %s", i, total, ch["title"][:60])

                if outfile.exists():
                    logger.info("  Already exists, skipping")
                    ch["local_path"] = str(outfile)
                    registry.append(ch)
                    continue

                try:
                    page.goto(target, timeout=30000)
                    time.sleep(3)
                    content = page.content()
                    # Write to a side file and rename afterwards, so that an
                    # interrupted write never leaves a truncated .html file
                    # that later runs would skip as "already exists".
                    partial = outfile.with_name(outfile.name + ".part")
                    partial.write_text(content, encoding="utf-8")
                    partial.replace(outfile)
                    ch["local_path"] = str(outfile)
                    logger.info("  Saved: %s (%.1f KB)", outfile.name, len(content) / 1024)
                except Exception as e:
                    # Best-effort crawl: record the failure and keep going.
                    logger.error("  Failed: %s", e)
                    ch["local_path"] = None

                registry.append(ch)
                time.sleep(1)  # pause between requests
        finally:
            # Close the browser even when the index page or a step above fails.
            browser.close()

    reg_file = Path(__file__).parent / "otm_registry.json"
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # has to be pinned rather than left to the platform default.
    reg_file.write_text(
        json.dumps(registry, indent=2, ensure_ascii=False), encoding="utf-8"
    )
    ok = sum(1 for r in registry if r.get("local_path"))
    logger.info("Done: %d/%d chapters saved", ok, len(registry))


# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()