A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
724 lines
28 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Full Compliance Pipeline for Legal Corpus.
|
||
|
||
This script runs the complete pipeline:
|
||
1. Re-ingest all legal documents with improved chunking
|
||
2. Extract requirements/checkpoints from chunks
|
||
3. Generate controls using AI
|
||
4. Define remediation measures
|
||
5. Update statistics
|
||
|
||
Run on Mac Mini:
|
||
nohup python full_compliance_pipeline.py > /tmp/compliance_pipeline.log 2>&1 &
|
||
|
||
Checkpoints are saved to /tmp/pipeline_checkpoints.json and can be viewed in admin-v2.
|
||
"""
|
||
|
||
import asyncio
|
||
import json
|
||
import logging
|
||
import os
|
||
import sys
|
||
import time
|
||
from datetime import datetime
|
||
from typing import Dict, List, Any, Optional
|
||
from dataclasses import dataclass, asdict
|
||
import re
|
||
import hashlib
|
||
|
||
# Logging goes both to stdout (container logs) and to a file that can
# be tailed while the pipeline runs in the background via nohup.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(
    level=logging.INFO,
    format=_LOG_FORMAT,
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('/tmp/compliance_pipeline.log'),
    ],
)
logger = logging.getLogger(__name__)
|
||
|
||
# Import checkpoint manager.
# The checkpoint manager is optional: if the module is missing the
# pipeline still runs, it just skips progress/validation checkpoints
# (every use site guards on `self.checkpoint_mgr` being truthy).
try:
    from pipeline_checkpoints import CheckpointManager, EXPECTED_VALUES, ValidationStatus
except ImportError:
    logger.warning("Checkpoint manager not available, running without checkpoints")
    CheckpointManager = None
    EXPECTED_VALUES = {}
    ValidationStatus = None
|
||
|
||
# Default endpoints for the Docker network. Both QDRANT_URL and
# QDRANT_HOST are honoured; the docker-compose hostname "qdrant" is
# only used when neither is configured by the caller.
if not (os.getenv("QDRANT_URL") or os.getenv("QDRANT_HOST")):
    os.environ["QDRANT_HOST"] = "qdrant"
os.environ.setdefault("EMBEDDING_SERVICE_URL", "http://embedding-service:8087")
|
||
|
||
# Try to import from klausur-service.
# These are hard dependencies: without the ingestion helpers and the
# Qdrant client the pipeline cannot do anything useful, so exit with a
# non-zero status immediately instead of failing later mid-run.
try:
    from legal_corpus_ingestion import LegalCorpusIngestion, REGULATIONS, LEGAL_CORPUS_COLLECTION
    from qdrant_client import QdrantClient
    from qdrant_client.models import Filter, FieldCondition, MatchValue
except ImportError:
    logger.error("Could not import required modules. Make sure you're in the klausur-service container.")
    sys.exit(1)
|
||
|
||
|
||
@dataclass
class Checkpoint:
    """A requirement/checkpoint extracted from legal text."""

    id: str  # requirement id built from regulation code / article / chunk hash
    regulation_code: str  # short code from the chunk payload ("regulation_code")
    regulation_name: str  # human-readable name from the payload ("regulation_name")
    article: Optional[str]  # article label when the title looks like "Art. N", else None
    title: str  # short label (requirement id, article label, or truncated text)
    description: str  # extracted requirement text, capped at 500 chars
    original_text: str  # full extracted match, untruncated
    chunk_id: str  # 8-char md5 fingerprint of the source chunk's first 100 chars
    source_url: str  # origin URL from the chunk payload ("source_url")
|
||
|
||
|
||
@dataclass
class Control:
    """A control derived from checkpoints."""

    id: str  # e.g. "CRYPTO-001": upper-cased domain plus zero-padded counter
    domain: str  # keyword-derived domain: crypto/iam/priv/sdlc/aud/ai/ops/cra/gov
    title: str  # taken from the first covered checkpoint, capped at 100 chars
    description: str
    checkpoints: List[str]  # List of checkpoint IDs
    pass_criteria: str
    implementation_guidance: str
    is_automated: bool  # True when the checkpoint text hints at tooling/automation
    automation_tool: Optional[str]  # set only when is_automated
    priority: str  # "high" or "medium" (see generate_control_for_checkpoints)
|
||
|
||
|
||
@dataclass
class Measure:
    """A remediation measure for a control."""

    id: str  # "M-" + control id
    control_id: str  # id of the Control this measure remediates
    title: str
    description: str
    responsible: str  # owning team, derived from the control's domain
    deadline_days: int  # derived from the control's priority (30/60/90/180)
    status: str  # initially "pending"
|
||
|
||
|
||
class CompliancePipeline:
    """Handles the full compliance pipeline.

    Orchestrates four phases -- ingestion, checkpoint extraction, control
    generation and measure generation -- and persists the results as JSON
    via save_results().
    """

    def __init__(self):
        # Resolve the Qdrant endpoint. QDRANT_URL takes precedence;
        # otherwise fall back to QDRANT_HOST / QDRANT_PORT.
        # Fix: the original comment promised HOST/PORT support but the
        # else-branch hard-coded port 6333 and ignored QDRANT_PORT.
        qdrant_url = os.getenv("QDRANT_URL", "")
        if qdrant_url:
            from urllib.parse import urlparse
            parsed = urlparse(qdrant_url)
            qdrant_host = parsed.hostname or "qdrant"
            qdrant_port = parsed.port or 6333
        else:
            qdrant_host = os.getenv("QDRANT_HOST", "qdrant")
            try:
                qdrant_port = int(os.getenv("QDRANT_PORT", "6333"))
            except ValueError:
                # Malformed override -> keep the conventional default.
                qdrant_port = 6333
        self.qdrant = QdrantClient(host=qdrant_host, port=qdrant_port)

        # Artefacts accumulated across the phases.
        self.checkpoints: List[Checkpoint] = []
        self.controls: List[Control] = []
        self.measures: List[Measure] = []

        # Running statistics; written out by save_results().
        self.stats = {
            "chunks_processed": 0,
            "checkpoints_extracted": 0,
            "controls_created": 0,
            "measures_defined": 0,
            "by_regulation": {},
            "by_domain": {},
        }

        # Initialize checkpoint manager (optional -- may be unavailable,
        # see the import fallback at module level).
        self.checkpoint_mgr = CheckpointManager() if CheckpointManager else None
|
||
|
||
    def extract_checkpoints_from_chunk(self, chunk_text: str, payload: Dict) -> List[Checkpoint]:
        """
        Extract checkpoints/requirements from a chunk of text.

        Uses pattern matching to find requirement-like statements.

        Args:
            chunk_text: Raw text of one legal-corpus chunk.
            payload: Qdrant payload of the chunk; reads "regulation_code",
                "regulation_name" and "source_url" with safe defaults.

        Returns:
            List of Checkpoint objects; matches whose description is
            shorter than 20 characters are discarded as noise.
        """
        checkpoints = []
        regulation_code = payload.get("regulation_code", "UNKNOWN")
        regulation_name = payload.get("regulation_name", "Unknown")
        source_url = payload.get("source_url", "")
        # Deterministic 8-char fingerprint of the chunk; md5 is used only
        # as a cheap hash here, not for anything security-relevant.
        chunk_id = hashlib.md5(chunk_text[:100].encode()).hexdigest()[:8]

        # Patterns for different requirement types.
        # Each entry is (regex, tag); the tag selects the branch below
        # that knows how to interpret that pattern's capture groups.
        patterns = [
            # BSI-TR patterns
            (r'([OT]\.[A-Za-z_]+\d*)[:\s]+(.+?)(?=\n[OT]\.|$)', 'bsi_requirement'),
            # Article patterns (GDPR, AI Act, etc.)
            (r'(?:Artikel|Art\.?)\s+(\d+)(?:\s+Abs(?:atz)?\.?\s*(\d+))?\s*[-–:]\s*(.+?)(?=\n|$)', 'article'),
            # Numbered requirements
            (r'\((\d+)\)\s+(.+?)(?=\n\(\d+\)|$)', 'numbered'),
            # "Der Verantwortliche muss" patterns
            (r'(?:Der Verantwortliche|Die Aufsichtsbehörde|Der Auftragsverarbeiter)\s+(muss|hat|soll)\s+(.+?)(?=\.\s|$)', 'obligation'),
            # "Es ist erforderlich" patterns
            (r'(?:Es ist erforderlich|Es muss gewährleistet|Es sind geeignete)\s+(.+?)(?=\.\s|$)', 'requirement'),
        ]

        for pattern, pattern_type in patterns:
            matches = re.finditer(pattern, chunk_text, re.MULTILINE | re.DOTALL)
            for match in matches:
                if pattern_type == 'bsi_requirement':
                    # Group 1 is the BSI requirement id (e.g. "O.xyz"),
                    # which doubles as the title.
                    req_id = match.group(1)
                    description = match.group(2).strip()
                    title = req_id
                elif pattern_type == 'article':
                    article_num = match.group(1)
                    paragraph = match.group(2) or ""
                    title_text = match.group(3).strip()
                    req_id = f"{regulation_code}-Art{article_num}"
                    if paragraph:
                        req_id += f"-{paragraph}"
                    title = f"Art. {article_num}" + (f" Abs. {paragraph}" if paragraph else "")
                    description = title_text
                elif pattern_type == 'numbered':
                    num = match.group(1)
                    description = match.group(2).strip()
                    req_id = f"{regulation_code}-{num}"
                    title = f"Anforderung {num}"
                else:
                    # Generic requirement ('obligation' / 'requirement'):
                    # use the whole match as the description and make the
                    # id unique via the chunk fingerprint plus a counter.
                    description = match.group(0).strip()
                    req_id = f"{regulation_code}-{chunk_id}-{len(checkpoints)}"
                    # Conditional binds to the whole expression: the
                    # truncated form is used only when len > 50.
                    title = description[:50] + "..." if len(description) > 50 else description

                # Skip very short matches
                if len(description) < 20:
                    continue

                checkpoint = Checkpoint(
                    id=req_id,
                    regulation_code=regulation_code,
                    regulation_name=regulation_name,
                    # Only article-style titles carry "Art"; everything
                    # else gets article=None.
                    article=title if 'Art' in title else None,
                    title=title,
                    description=description[:500],
                    original_text=description,
                    chunk_id=chunk_id,
                    source_url=source_url
                )
                checkpoints.append(checkpoint)

        return checkpoints
|
||
|
||
def generate_control_for_checkpoints(self, checkpoints: List[Checkpoint]) -> Optional[Control]:
|
||
"""
|
||
Generate a control that covers the given checkpoints.
|
||
|
||
This is a simplified version - in production this would use the AI assistant.
|
||
"""
|
||
if not checkpoints:
|
||
return None
|
||
|
||
# Group by regulation
|
||
regulation = checkpoints[0].regulation_code
|
||
|
||
# Determine domain based on content
|
||
all_text = " ".join([cp.description for cp in checkpoints]).lower()
|
||
|
||
domain = "gov" # Default
|
||
if any(kw in all_text for kw in ["verschlüssel", "krypto", "encrypt", "hash"]):
|
||
domain = "crypto"
|
||
elif any(kw in all_text for kw in ["zugang", "access", "authentif", "login", "benutzer"]):
|
||
domain = "iam"
|
||
elif any(kw in all_text for kw in ["datenschutz", "personenbezogen", "privacy", "einwilligung"]):
|
||
domain = "priv"
|
||
elif any(kw in all_text for kw in ["entwicklung", "test", "code", "software"]):
|
||
domain = "sdlc"
|
||
elif any(kw in all_text for kw in ["überwach", "monitor", "log", "audit"]):
|
||
domain = "aud"
|
||
elif any(kw in all_text for kw in ["ki", "künstlich", "ai", "machine learning", "model"]):
|
||
domain = "ai"
|
||
elif any(kw in all_text for kw in ["betrieb", "operation", "verfügbar", "backup"]):
|
||
domain = "ops"
|
||
elif any(kw in all_text for kw in ["cyber", "resilience", "sbom", "vulnerab"]):
|
||
domain = "cra"
|
||
|
||
# Generate control ID
|
||
domain_counts = self.stats.get("by_domain", {})
|
||
domain_count = domain_counts.get(domain, 0) + 1
|
||
control_id = f"{domain.upper()}-{domain_count:03d}"
|
||
|
||
# Create title from first checkpoint
|
||
title = checkpoints[0].title
|
||
if len(title) > 100:
|
||
title = title[:97] + "..."
|
||
|
||
# Create description
|
||
description = f"Control für {regulation}: " + checkpoints[0].description[:200]
|
||
|
||
# Pass criteria
|
||
pass_criteria = f"Alle {len(checkpoints)} zugehörigen Anforderungen sind erfüllt und dokumentiert."
|
||
|
||
# Implementation guidance
|
||
guidance = f"Implementiere Maßnahmen zur Erfüllung der Anforderungen aus {regulation}. "
|
||
guidance += f"Dokumentiere die Umsetzung und führe regelmäßige Reviews durch."
|
||
|
||
# Determine if automated
|
||
is_automated = any(kw in all_text for kw in ["automat", "tool", "scan", "test"])
|
||
|
||
control = Control(
|
||
id=control_id,
|
||
domain=domain,
|
||
title=title,
|
||
description=description,
|
||
checkpoints=[cp.id for cp in checkpoints],
|
||
pass_criteria=pass_criteria,
|
||
implementation_guidance=guidance,
|
||
is_automated=is_automated,
|
||
automation_tool="CI/CD Pipeline" if is_automated else None,
|
||
priority="high" if "muss" in all_text or "erforderlich" in all_text else "medium"
|
||
)
|
||
|
||
return control
|
||
|
||
def generate_measure_for_control(self, control: Control) -> Measure:
|
||
"""Generate a remediation measure for a control."""
|
||
measure_id = f"M-{control.id}"
|
||
|
||
# Determine deadline based on priority
|
||
deadline_days = {
|
||
"critical": 30,
|
||
"high": 60,
|
||
"medium": 90,
|
||
"low": 180
|
||
}.get(control.priority, 90)
|
||
|
||
# Determine responsible team
|
||
responsible = {
|
||
"priv": "Datenschutzbeauftragter",
|
||
"iam": "IT-Security Team",
|
||
"sdlc": "Entwicklungsteam",
|
||
"crypto": "IT-Security Team",
|
||
"ops": "Operations Team",
|
||
"aud": "Compliance Team",
|
||
"ai": "AI/ML Team",
|
||
"cra": "IT-Security Team",
|
||
"gov": "Management"
|
||
}.get(control.domain, "Compliance Team")
|
||
|
||
measure = Measure(
|
||
id=measure_id,
|
||
control_id=control.id,
|
||
title=f"Umsetzung: {control.title[:50]}",
|
||
description=f"Implementierung und Dokumentation von {control.id}: {control.description[:100]}",
|
||
responsible=responsible,
|
||
deadline_days=deadline_days,
|
||
status="pending"
|
||
)
|
||
|
||
return measure
|
||
|
||
    async def run_ingestion_phase(self, force_reindex: bool = False) -> int:
        """Phase 1: Ingest documents (incremental - only missing ones).

        Counts existing chunks per regulation in Qdrant and ingests only
        regulations that have none (or all of them when *force_reindex*).

        Args:
            force_reindex: Re-ingest every regulation even if chunks exist.

        Returns:
            Total number of chunks in the collection after this phase.

        Raises:
            Exception: re-raised after the phase checkpoint is marked failed.
        """
        logger.info("\n" + "=" * 60)
        logger.info("PHASE 1: DOCUMENT INGESTION (INCREMENTAL)")
        logger.info("=" * 60)

        if self.checkpoint_mgr:
            self.checkpoint_mgr.start_checkpoint("ingestion", "Document Ingestion")

        ingestion = LegalCorpusIngestion()

        try:
            # Check existing chunks per regulation
            existing_chunks = {}
            try:
                for regulation in REGULATIONS:
                    count_result = self.qdrant.count(
                        collection_name=LEGAL_CORPUS_COLLECTION,
                        count_filter=Filter(
                            must=[FieldCondition(key="regulation_code", match=MatchValue(value=regulation.code))]
                        )
                    )
                    existing_chunks[regulation.code] = count_result.count
                    logger.info(f" {regulation.code}: {count_result.count} existing chunks")
            except Exception as e:
                logger.warning(f"Could not check existing chunks: {e}")
                # Collection might not exist, that's OK

            # Determine which regulations need ingestion
            regulations_to_ingest = []
            for regulation in REGULATIONS:
                existing = existing_chunks.get(regulation.code, 0)
                if force_reindex or existing == 0:
                    regulations_to_ingest.append(regulation)
                    logger.info(f" -> Will ingest: {regulation.code} (existing: {existing}, force: {force_reindex})")
                else:
                    logger.info(f" -> Skipping: {regulation.code} (already has {existing} chunks)")
                    self.stats["by_regulation"][regulation.code] = existing

            if not regulations_to_ingest:
                # Everything already indexed: record totals and return early.
                logger.info("All regulations already indexed. Skipping ingestion phase.")
                total_chunks = sum(existing_chunks.values())
                self.stats["chunks_processed"] = total_chunks
                if self.checkpoint_mgr:
                    self.checkpoint_mgr.add_metric("total_chunks", total_chunks)
                    self.checkpoint_mgr.add_metric("skipped", True)
                    self.checkpoint_mgr.complete_checkpoint(success=True)
                return total_chunks

            # Ingest only missing regulations
            total_chunks = sum(existing_chunks.values())
            for i, regulation in enumerate(regulations_to_ingest, 1):
                logger.info(f"[{i}/{len(regulations_to_ingest)}] Ingesting {regulation.code}...")
                try:
                    count = await ingestion.ingest_regulation(regulation)
                    total_chunks += count
                    self.stats["by_regulation"][regulation.code] = count
                    logger.info(f" -> {count} chunks")

                    # Add metric for this regulation
                    if self.checkpoint_mgr:
                        self.checkpoint_mgr.add_metric(f"chunks_{regulation.code}", count)

                except Exception as e:
                    # A single failing regulation does not abort the phase;
                    # it is recorded with 0 chunks and the loop continues.
                    logger.error(f" -> FAILED: {e}")
                    self.stats["by_regulation"][regulation.code] = 0

            self.stats["chunks_processed"] = total_chunks
            logger.info(f"\nTotal chunks in collection: {total_chunks}")

            # Validate ingestion results
            if self.checkpoint_mgr:
                self.checkpoint_mgr.add_metric("total_chunks", total_chunks)
                self.checkpoint_mgr.add_metric("regulations_count", len(REGULATIONS))

                # Validate total chunks
                expected = EXPECTED_VALUES.get("ingestion", {})
                self.checkpoint_mgr.validate(
                    "total_chunks",
                    expected=expected.get("total_chunks", 8000),
                    actual=total_chunks,
                    min_value=expected.get("min_chunks", 7000)
                )

                # Validate key regulations
                reg_expected = expected.get("regulations", {})
                for reg_code, reg_exp in reg_expected.items():
                    actual = self.stats["by_regulation"].get(reg_code, 0)
                    self.checkpoint_mgr.validate(
                        f"chunks_{reg_code}",
                        expected=reg_exp.get("expected", 0),
                        actual=actual,
                        min_value=reg_exp.get("min", 0)
                    )

                self.checkpoint_mgr.complete_checkpoint(success=True)

            return total_chunks

        except Exception as e:
            if self.checkpoint_mgr:
                self.checkpoint_mgr.fail_checkpoint(str(e))
            raise

        finally:
            # Always release the ingestion client, on success and failure.
            await ingestion.close()
|
||
|
||
    async def run_extraction_phase(self) -> int:
        """Phase 2: Extract checkpoints from chunks.

        Pages through the whole legal-corpus collection via Qdrant's
        scroll API (100 points per page, payload only, no vectors) and
        runs extract_checkpoints_from_chunk on each chunk's text.

        Returns:
            Total number of checkpoints extracted (also stored in
            self.checkpoints / self.stats).

        Raises:
            Exception: re-raised after the phase checkpoint is marked failed.
        """
        logger.info("\n" + "=" * 60)
        logger.info("PHASE 2: CHECKPOINT EXTRACTION")
        logger.info("=" * 60)

        if self.checkpoint_mgr:
            self.checkpoint_mgr.start_checkpoint("extraction", "Checkpoint Extraction")

        try:
            # Scroll through all chunks
            offset = None
            total_checkpoints = 0

            while True:
                result = self.qdrant.scroll(
                    collection_name=LEGAL_CORPUS_COLLECTION,
                    limit=100,
                    offset=offset,
                    with_payload=True,
                    with_vectors=False
                )

                # scroll() returns (points, next_page_offset).
                points, next_offset = result

                if not points:
                    break

                for point in points:
                    payload = point.payload
                    text = payload.get("text", "")

                    checkpoints = self.extract_checkpoints_from_chunk(text, payload)
                    self.checkpoints.extend(checkpoints)
                    total_checkpoints += len(checkpoints)

                logger.info(f"Processed {len(points)} chunks, extracted {total_checkpoints} checkpoints so far...")

                # A None next_offset marks the last page.
                if next_offset is None:
                    break
                offset = next_offset

            self.stats["checkpoints_extracted"] = len(self.checkpoints)
            logger.info(f"\nTotal checkpoints extracted: {len(self.checkpoints)}")

            # Log per regulation
            by_reg = {}
            for cp in self.checkpoints:
                by_reg[cp.regulation_code] = by_reg.get(cp.regulation_code, 0) + 1
            for reg, count in sorted(by_reg.items()):
                logger.info(f" {reg}: {count} checkpoints")

            # Validate extraction results
            if self.checkpoint_mgr:
                self.checkpoint_mgr.add_metric("total_checkpoints", len(self.checkpoints))
                self.checkpoint_mgr.add_metric("checkpoints_by_regulation", by_reg)

                expected = EXPECTED_VALUES.get("extraction", {})
                self.checkpoint_mgr.validate(
                    "total_checkpoints",
                    expected=expected.get("total_checkpoints", 3500),
                    actual=len(self.checkpoints),
                    min_value=expected.get("min_checkpoints", 3000)
                )

                self.checkpoint_mgr.complete_checkpoint(success=True)

            return len(self.checkpoints)

        except Exception as e:
            if self.checkpoint_mgr:
                self.checkpoint_mgr.fail_checkpoint(str(e))
            raise
|
||
|
||
async def run_control_generation_phase(self) -> int:
|
||
"""Phase 3: Generate controls from checkpoints."""
|
||
logger.info("\n" + "=" * 60)
|
||
logger.info("PHASE 3: CONTROL GENERATION")
|
||
logger.info("=" * 60)
|
||
|
||
if self.checkpoint_mgr:
|
||
self.checkpoint_mgr.start_checkpoint("controls", "Control Generation")
|
||
|
||
try:
|
||
# Group checkpoints by regulation
|
||
by_regulation: Dict[str, List[Checkpoint]] = {}
|
||
for cp in self.checkpoints:
|
||
reg = cp.regulation_code
|
||
if reg not in by_regulation:
|
||
by_regulation[reg] = []
|
||
by_regulation[reg].append(cp)
|
||
|
||
# Generate controls per regulation (group every 3-5 checkpoints)
|
||
for regulation, checkpoints in by_regulation.items():
|
||
logger.info(f"Generating controls for {regulation} ({len(checkpoints)} checkpoints)...")
|
||
|
||
# Group checkpoints into batches of 3-5
|
||
batch_size = 4
|
||
for i in range(0, len(checkpoints), batch_size):
|
||
batch = checkpoints[i:i + batch_size]
|
||
control = self.generate_control_for_checkpoints(batch)
|
||
|
||
if control:
|
||
self.controls.append(control)
|
||
self.stats["by_domain"][control.domain] = self.stats["by_domain"].get(control.domain, 0) + 1
|
||
|
||
self.stats["controls_created"] = len(self.controls)
|
||
logger.info(f"\nTotal controls created: {len(self.controls)}")
|
||
|
||
# Log per domain
|
||
for domain, count in sorted(self.stats["by_domain"].items()):
|
||
logger.info(f" {domain}: {count} controls")
|
||
|
||
# Validate control generation
|
||
if self.checkpoint_mgr:
|
||
self.checkpoint_mgr.add_metric("total_controls", len(self.controls))
|
||
self.checkpoint_mgr.add_metric("controls_by_domain", dict(self.stats["by_domain"]))
|
||
|
||
expected = EXPECTED_VALUES.get("controls", {})
|
||
self.checkpoint_mgr.validate(
|
||
"total_controls",
|
||
expected=expected.get("total_controls", 900),
|
||
actual=len(self.controls),
|
||
min_value=expected.get("min_controls", 800)
|
||
)
|
||
|
||
self.checkpoint_mgr.complete_checkpoint(success=True)
|
||
|
||
return len(self.controls)
|
||
|
||
except Exception as e:
|
||
if self.checkpoint_mgr:
|
||
self.checkpoint_mgr.fail_checkpoint(str(e))
|
||
raise
|
||
|
||
async def run_measure_generation_phase(self) -> int:
|
||
"""Phase 4: Generate measures for controls."""
|
||
logger.info("\n" + "=" * 60)
|
||
logger.info("PHASE 4: MEASURE GENERATION")
|
||
logger.info("=" * 60)
|
||
|
||
if self.checkpoint_mgr:
|
||
self.checkpoint_mgr.start_checkpoint("measures", "Measure Generation")
|
||
|
||
try:
|
||
for control in self.controls:
|
||
measure = self.generate_measure_for_control(control)
|
||
self.measures.append(measure)
|
||
|
||
self.stats["measures_defined"] = len(self.measures)
|
||
logger.info(f"\nTotal measures defined: {len(self.measures)}")
|
||
|
||
# Validate measure generation
|
||
if self.checkpoint_mgr:
|
||
self.checkpoint_mgr.add_metric("total_measures", len(self.measures))
|
||
|
||
expected = EXPECTED_VALUES.get("measures", {})
|
||
self.checkpoint_mgr.validate(
|
||
"total_measures",
|
||
expected=expected.get("total_measures", 900),
|
||
actual=len(self.measures),
|
||
min_value=expected.get("min_measures", 800)
|
||
)
|
||
|
||
self.checkpoint_mgr.complete_checkpoint(success=True)
|
||
|
||
return len(self.measures)
|
||
|
||
except Exception as e:
|
||
if self.checkpoint_mgr:
|
||
self.checkpoint_mgr.fail_checkpoint(str(e))
|
||
raise
|
||
|
||
def save_results(self, output_dir: str = "/tmp/compliance_output"):
|
||
"""Save results to JSON files."""
|
||
logger.info("\n" + "=" * 60)
|
||
logger.info("SAVING RESULTS")
|
||
logger.info("=" * 60)
|
||
|
||
os.makedirs(output_dir, exist_ok=True)
|
||
|
||
# Save checkpoints
|
||
checkpoints_file = os.path.join(output_dir, "checkpoints.json")
|
||
with open(checkpoints_file, "w") as f:
|
||
json.dump([asdict(cp) for cp in self.checkpoints], f, indent=2, ensure_ascii=False)
|
||
logger.info(f"Saved {len(self.checkpoints)} checkpoints to {checkpoints_file}")
|
||
|
||
# Save controls
|
||
controls_file = os.path.join(output_dir, "controls.json")
|
||
with open(controls_file, "w") as f:
|
||
json.dump([asdict(c) for c in self.controls], f, indent=2, ensure_ascii=False)
|
||
logger.info(f"Saved {len(self.controls)} controls to {controls_file}")
|
||
|
||
# Save measures
|
||
measures_file = os.path.join(output_dir, "measures.json")
|
||
with open(measures_file, "w") as f:
|
||
json.dump([asdict(m) for m in self.measures], f, indent=2, ensure_ascii=False)
|
||
logger.info(f"Saved {len(self.measures)} measures to {measures_file}")
|
||
|
||
# Save statistics
|
||
stats_file = os.path.join(output_dir, "statistics.json")
|
||
self.stats["generated_at"] = datetime.now().isoformat()
|
||
with open(stats_file, "w") as f:
|
||
json.dump(self.stats, f, indent=2, ensure_ascii=False)
|
||
logger.info(f"Saved statistics to {stats_file}")
|
||
|
||
    async def run_full_pipeline(self, force_reindex: bool = False, skip_ingestion: bool = False):
        """Run the complete pipeline.

        Executes the four phases in order (ingestion, extraction, control
        generation, measure generation), saves results to disk and records
        a final pipeline checkpoint. On any failure the checkpoint state
        is marked "failed" and the exception is re-raised.

        Args:
            force_reindex: If True, re-ingest all documents even if they exist
            skip_ingestion: If True, skip ingestion phase entirely (use existing chunks)
        """
        start_time = time.time()

        logger.info("=" * 60)
        logger.info("FULL COMPLIANCE PIPELINE (INCREMENTAL)")
        logger.info(f"Started at: {datetime.now().isoformat()}")
        logger.info(f"Force reindex: {force_reindex}")
        logger.info(f"Skip ingestion: {skip_ingestion}")
        if self.checkpoint_mgr:
            logger.info(f"Pipeline ID: {self.checkpoint_mgr.pipeline_id}")
        logger.info("=" * 60)

        try:
            # Phase 1: Ingestion (skip if requested or run incrementally)
            if skip_ingestion:
                logger.info("Skipping ingestion phase as requested...")
                # Still get the chunk count so the statistics are accurate.
                try:
                    collection_info = self.qdrant.get_collection(LEGAL_CORPUS_COLLECTION)
                    self.stats["chunks_processed"] = collection_info.points_count
                except Exception:
                    # Collection unavailable -> report 0 rather than fail.
                    self.stats["chunks_processed"] = 0
            else:
                await self.run_ingestion_phase(force_reindex=force_reindex)

            # Phase 2: Extraction
            await self.run_extraction_phase()

            # Phase 3: Control Generation
            await self.run_control_generation_phase()

            # Phase 4: Measure Generation
            await self.run_measure_generation_phase()

            # Save results
            self.save_results()

            # Final summary
            elapsed = time.time() - start_time
            logger.info("\n" + "=" * 60)
            logger.info("PIPELINE COMPLETE")
            logger.info("=" * 60)
            logger.info(f"Duration: {elapsed:.1f} seconds")
            logger.info(f"Chunks processed: {self.stats['chunks_processed']}")
            logger.info(f"Checkpoints extracted: {self.stats['checkpoints_extracted']}")
            logger.info(f"Controls created: {self.stats['controls_created']}")
            logger.info(f"Measures defined: {self.stats['measures_defined']}")
            logger.info(f"\nResults saved to: /tmp/compliance_output/")
            logger.info("Checkpoint status: /tmp/pipeline_checkpoints.json")
            logger.info("=" * 60)

            # Complete pipeline checkpoint
            if self.checkpoint_mgr:
                self.checkpoint_mgr.complete_pipeline({
                    "duration_seconds": elapsed,
                    "chunks_processed": self.stats['chunks_processed'],
                    "checkpoints_extracted": self.stats['checkpoints_extracted'],
                    "controls_created": self.stats['controls_created'],
                    "measures_defined": self.stats['measures_defined'],
                    "by_regulation": self.stats['by_regulation'],
                    "by_domain": self.stats['by_domain'],
                })

        except Exception as e:
            logger.error(f"Pipeline failed: {e}")
            if self.checkpoint_mgr:
                # Persist the failed state so admin-v2 can display it.
                # NOTE(review): reaches into checkpoint manager internals
                # (state/_save) -- presumably no public fail-pipeline API;
                # confirm against pipeline_checkpoints.
                self.checkpoint_mgr.state.status = "failed"
                self.checkpoint_mgr._save()
            raise
|
||
|
||
|
||
async def main():
    """CLI entry point: parse flags and run the full pipeline."""
    import argparse

    parser = argparse.ArgumentParser(description="Run the compliance pipeline")
    parser.add_argument("--force-reindex", action="store_true",
                        help="Force re-ingestion of all documents")
    parser.add_argument("--skip-ingestion", action="store_true",
                        help="Skip ingestion phase, use existing chunks")
    options = parser.parse_args()

    await CompliancePipeline().run_full_pipeline(
        force_reindex=options.force_reindex,
        skip_ingestion=options.skip_ingestion,
    )


if __name__ == "__main__":
    asyncio.run(main())
|