#!/usr/bin/env bash
#
# fetch-and-analyze.sh — Fetch a URL and extract clean text for compliance analysis.
#
# Usage: bash fetch-and-analyze.sh <url> [max_chars]
#
# Outputs clean text to stdout, truncated to max_chars (default: 4000).
# Exits 1 with a message on stderr when the page cannot be fetched (including
# HTTP 4xx/5xx responses) or when max_chars is not a non-negative integer.
#
# NOTE(review): the URL is passed to curl as-is, so every scheme curl supports
# (including file://) is reachable. If callers pass untrusted URLs, consider
# restricting with: --proto '=http,https'

set -euo pipefail

readonly USER_AGENT="Mozilla/5.0 (compatible; BreakPilot-Compliance-Agent/1.0)"
readonly FETCH_TIMEOUT_SECONDS=30
readonly DEFAULT_MAX_CHARS=4000

#######################################
# Convert HTML to plain text.
# Removes <style> and <script> blocks together with their content, replaces
# every remaining tag with a space, decodes a few common entities, and
# collapses all whitespace runs into single spaces.
# Arguments: none
# Inputs:    HTML on stdin
# Outputs:   one line of text on stdout, without a trailing newline
#######################################
strip_html() {
  # LC_ALL=C keeps awk byte-oriented, so tolower() cannot change string
  # lengths and substr() offsets stay valid for both copies of the text.
  LC_ALL=C awk '
    # Remove every <tag ...> ... </tag> block, matching the tag name
    # case-insensitively. Works on the whole document, so blocks may span
    # lines and may contain "<" characters.
    function drop_blocks(text, tag,    lower, out, opener, closer, start, stop, gt, next_ch, skip) {
      opener = "<" tag
      closer = "</" tag
      out = ""
      lower = tolower(text)
      while ((start = index(lower, opener)) > 0) {
        next_ch = substr(lower, start + length(opener), 1)
        if (next_ch != "" && index(" \t\r>/", next_ch) == 0) {
          # A longer tag name such as <styled-box>: keep it and move on.
          skip = start + length(opener)
          out = out substr(text, 1, skip - 1)
          text = substr(text, skip)
          lower = substr(lower, skip)
          continue
        }
        out = out substr(text, 1, start - 1)
        text = substr(text, start)
        lower = substr(lower, start)
        stop = index(lower, closer)
        if (stop == 0) {
          # Unterminated block: everything left is block content.
          text = ""
          break
        }
        gt = index(substr(lower, stop), ">")
        if (gt == 0) {
          text = ""
          break
        }
        # stop + gt is the offset just past the ">" of the closing tag.
        text = substr(text, stop + gt)
        lower = substr(lower, stop + gt)
        out = out " "
      }
      return out text
    }

    # Slurp the document; line breaks become spaces.
    { page = page $0 " " }

    END {
      page = drop_blocks(page, "style")
      page = drop_blocks(page, "script")

      # Replace tags with a space so text from adjacent elements stays separated.
      gsub(/<[^>]*>/, " ", page)

      # Decode entities only after tag removal so a decoded "<" is never
      # mistaken for markup. "&amp;" goes last so that "&amp;lt;" becomes
      # the literal text "&lt;" and not "<".
      gsub(/&nbsp;/, " ", page)
      gsub(/&lt;/, "<", page)
      gsub(/&gt;/, ">", page)
      gsub(/&quot;/, "\"", page)
      gsub(/&amp;/, "\\&", page)

      gsub(/[ \t\r\n\f\v]+/, " ", page)
      sub(/^ /, "", page)
      sub(/ $/, "", page)
      printf "%s", page
    }
  '
}

#######################################
# Fetch the page, strip it to text, and print the truncated result.
# Arguments: $1 - URL to fetch
#            $2 - maximum number of characters to print (optional)
# Outputs:   clean text on stdout; errors on stderr
# Returns:   exits 1 on invalid max_chars or failed fetch
#######################################
main() {
  local url="${1:?Usage: fetch-and-analyze.sh <url> [max_chars]}"
  local max_chars="${2:-$DEFAULT_MAX_CHARS}"

  if [[ ! "$max_chars" =~ ^[0-9]+$ ]]; then
    printf 'ERROR: max_chars must be a non-negative integer, got: %s\n' "$max_chars" >&2
    exit 1
  fi
  # Force base 10 so a value such as "0800" is not parsed as octal.
  max_chars=$((10#$max_chars))

  # --fail turns HTTP error responses into a failed fetch so error pages are
  # not analyzed as content; -S keeps the reason for a failure on stderr;
  # "--" stops a URL that begins with "-" from being read as an option.
  local html
  html=$(curl -sSL --fail --max-time "$FETCH_TIMEOUT_SECONDS" \
    -H "User-Agent: $USER_AGENT" \
    -- "$url") || html=""

  if [[ -z "$html" ]]; then
    printf 'ERROR: Could not fetch %s\n' "$url" >&2
    exit 1
  fi

  local clean
  clean=$(printf '%s\n' "$html" | strip_html)

  # Truncate with substring expansion instead of piping into head: it counts
  # characters in the current locale (never splitting a multi-byte character)
  # and cannot fail with SIGPIPE under pipefail.
  printf '%s' "${clean:0:max_chars}"
  # Terminate the output with a newline only when it fits inside the limit.
  if (( ${#clean} < max_chars )); then
    printf '\n'
  fi
}

main "$@"