#!/usr/bin/env python3
"""Crawl OSHA Technical Manual — all chapters as HTML."""

import json
import logging
import time
from pathlib import Path
from urllib.parse import urljoin

from playwright.sync_api import sync_playwright

logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger("osha-crawl")

# Downloaded chapters are stored next to this script.
OUTPUT_DIR = Path(__file__).parent / "otm_chapters"
BASE = "https://www.osha.gov"
# Index page whose links are scanned for chapter URLs.
INDEX_URL = f"{BASE}/otm"


def _site_relative(href):
    """Return *href* as a path relative to BASE when it points at BASE.

    Absolute, protocol-relative and root-relative links to the same page all
    collapse to the same ``/otm/...`` string, so they de-duplicate and map to
    the same output file. Links to any other host are returned unchanged.
    """
    absolute = urljoin(INDEX_URL, href)
    if absolute.startswith(BASE + "/"):
        return absolute[len(BASE):]
    return href


def _chapter_slug(href):
    """Build the output file stem for a chapter link.

    The ``/otm/`` prefix is dropped and remaining slashes become underscores,
    e.g. ``/otm/section-1/chapter-1`` -> ``section-1_chapter-1``.
    """
    return _site_relative(href).replace("/otm/", "").replace("/", "_")


def _collect_chapters(page):
    """Scan the currently loaded page and return the chapter links found.

    Returns a list of ``{"url": href, "title": text}`` dicts in document
    order. A link qualifies when its href contains ``/otm/`` (CSS selector)
    and the word ``chapter``, and its visible text is non-empty. Links that
    resolve to the same site-relative location are reported only once.
    """
    chapters = []
    seen = set()
    for link in page.query_selector_all('a[href*="/otm/"]'):
        href = link.get_attribute("href") or ""
        title = (link.inner_text() or "").strip()
        if "chapter" not in href or not title:
            continue
        key = _site_relative(href)
        if key in seen:
            continue
        seen.add(key)
        chapters.append({"url": href, "title": title})
    return chapters


def _download_chapter(page, chapter, outfile):
    """Load one chapter in *page* and save its HTML to *outfile*.

    Sets ``chapter["local_path"]`` to the saved path, or to ``None`` when the
    download fails. Failures are logged and swallowed on purpose so that one
    bad chapter does not abort the whole crawl.
    """
    # urljoin leaves absolute URLs alone and resolves every relative form
    # (root-relative, protocol-relative) the way a browser would.
    url = urljoin(INDEX_URL, chapter["url"])
    try:
        page.goto(url, timeout=30000)
        time.sleep(3)  # fixed pause after navigation before reading the DOM
        content = page.content()
        # Explicit UTF-8: the platform default encoding may not be able to
        # represent every character in the page.
        outfile.write_text(content, encoding="utf-8")
        chapter["local_path"] = str(outfile)
        logger.info("  Saved: %s (%.1f KB)", outfile.name, len(content) / 1024)
    except Exception as e:
        logger.error("  Failed: %s", e)
        chapter["local_path"] = None


def main():
    """Download every OTM chapter and write a JSON registry of the results.

    Chapters already present in OUTPUT_DIR are not downloaded again. The
    registry (``otm_registry.json`` next to this script) lists each chapter's
    ``url``, ``title`` and ``local_path`` (``None`` if the download failed).
    """
    OUTPUT_DIR.mkdir(exist_ok=True)
    registry = []

    with sync_playwright() as p:
        # NOTE(review): headless=False opens a visible browser window;
        # presumably deliberate - confirm before changing.
        browser = p.chromium.launch(headless=False)
        try:
            page = browser.new_page()

            # Step 1: Get all chapter URLs
            page.goto(INDEX_URL, timeout=30000)
            time.sleep(5)
            chapters = _collect_chapters(page)
            logger.info("Found %d chapters", len(chapters))

            # Step 2: Download each chapter
            total = len(chapters)
            for number, chapter in enumerate(chapters, start=1):
                outfile = OUTPUT_DIR / f"{_chapter_slug(chapter['url'])}.html"
                logger.info("[%d/%d] %s", number, total, chapter["title"][:60])

                if outfile.exists():
                    logger.info("  Already exists, skipping")
                    chapter["local_path"] = str(outfile)
                    registry.append(chapter)
                    continue

                _download_chapter(page, chapter, outfile)
                registry.append(chapter)
                time.sleep(1)  # pause between requests
        finally:
            # Close the browser even when navigation or scraping raises.
            browser.close()

    reg_file = Path(__file__).parent / "otm_registry.json"
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # written with an encoding that can hold them.
    reg_file.write_text(
        json.dumps(registry, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    ok = sum(1 for r in registry if r.get("local_path"))
    logger.info("Done: %d/%d chapters saved", ok, len(registry))


if __name__ == "__main__":
    main()