Files
breakpilot-lehrer/klausur-service/backend/legal_corpus_ingestion.py
Benjamin Admin b2a0126f14 [split-required] Split remaining Python monoliths (Phase 1 continued)
klausur-service (7 monoliths):
- grid_editor_helpers.py (1,737 → 5 files: columns, filters, headers, zones)
- cv_cell_grid.py (1,675 → 7 files: build, legacy, streaming, merge, vocab)
- worksheet_editor_api.py (1,305 → 4 files: models, AI, reconstruct, routes)
- legal_corpus_ingestion.py (1,280 → 3 files: registry, chunking, ingestion)
- cv_review.py (1,248 → 4 files: pipeline, spell, LLM, barrel)
- cv_preprocessing.py (1,166 → 3 files: deskew, dewarp, barrel)
- rbac.py, admin_api.py, routes/eh.py remain (next batch)

backend-lehrer (1 monolith):
- classroom_engine/repository.py (1,705 → 7 files by domain)

All re-export barrels preserve backward compatibility.
Zero import errors verified.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 22:47:59 +02:00

501 lines
19 KiB
Python

"""
Legal Corpus Ingestion for UCCA RAG Integration.
Indexes all regulations from the Compliance Hub into Qdrant for
semantic search during UCCA assessments and explanations.
Includes EU regulations, DACH national laws, and EDPB guidelines.
Collections:
- bp_legal_corpus: All regulation texts (GDPR, AI Act, CRA, BSI, etc.)
Split modules:
- legal_corpus_registry: Regulation dataclass + REGULATIONS list (pure data)
- legal_corpus_chunking: Sentence/paragraph splitting, semantic chunking, HTML-to-text
Usage:
python legal_corpus_ingestion.py --ingest-all
python legal_corpus_ingestion.py --ingest GDPR AIACT
python legal_corpus_ingestion.py --status
"""
import asyncio
import hashlib
import json
import logging
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urlparse

import httpx
from qdrant_client import QdrantClient
from qdrant_client.models import (
    Distance,
    FieldCondition,
    Filter,
    MatchValue,
    PointStruct,
    VectorParams,
)

# Re-export for backward compatibility
from legal_corpus_registry import Regulation, REGULATIONS  # noqa: F401
from legal_corpus_chunking import (  # noqa: F401
    chunk_text_semantic,
    extract_article_info,
    html_to_text,
    split_into_sentences,
    split_into_paragraphs,
    GERMAN_ABBREVIATIONS,
)
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration - Support both QDRANT_URL and QDRANT_HOST/PORT
# A full QDRANT_URL takes precedence; otherwise fall back to host/port vars.
_qdrant_url = os.getenv("QDRANT_URL", "")
if _qdrant_url:
    _parsed = urlparse(_qdrant_url)
    QDRANT_HOST = _parsed.hostname or "localhost"
    QDRANT_PORT = _parsed.port or 6333
else:
    QDRANT_HOST = os.getenv("QDRANT_HOST", "localhost")
    QDRANT_PORT = int(os.getenv("QDRANT_PORT", "6333"))

EMBEDDING_SERVICE_URL = os.getenv("EMBEDDING_SERVICE_URL", "http://localhost:8087")
LEGAL_CORPUS_COLLECTION = "bp_legal_corpus"
VECTOR_SIZE = 1024  # BGE-M3 dimension

# Chunking configuration - matched to NIBIS settings for semantic chunking
CHUNK_SIZE = int(os.getenv("LEGAL_CHUNK_SIZE", "1000"))
CHUNK_OVERLAP = int(os.getenv("LEGAL_CHUNK_OVERLAP", "200"))

# Base path for local PDF/HTML files
_default_docs_path = Path(__file__).parent.parent / "docs" / "legal_corpus"
LEGAL_DOCS_PATH = Path(os.getenv("LEGAL_DOCS_PATH", str(_default_docs_path)))
# Container deployments mount the corpus at /app; prefer that path when present,
# overriding both the env var and the repo-relative default.
if Path("/app/docs/legal_corpus").exists():
    LEGAL_DOCS_PATH = Path("/app/docs/legal_corpus")
class LegalCorpusIngestion:
    """Handles ingestion of legal documents into Qdrant.

    Fetches regulation texts (local file, EUR-Lex CELEX, or source URL),
    chunks them semantically, embeds them via the embedding service, and
    upserts them into the ``bp_legal_corpus`` collection.
    """

    # Re-exported on the class for backward compatibility with callers that
    # used the pre-split monolith's class attribute.
    GERMAN_ABBREVIATIONS = GERMAN_ABBREVIATIONS

    def __init__(self):
        # Synchronous Qdrant client is acceptable: this is a batch CLI tool.
        self.qdrant = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)
        self.http_client = httpx.AsyncClient(timeout=60.0)
        self._ensure_collection()

    def _ensure_collection(self):
        """Create the legal corpus collection if it doesn't exist."""
        collections = self.qdrant.get_collections().collections
        collection_names = [c.name for c in collections]
        if LEGAL_CORPUS_COLLECTION not in collection_names:
            logger.info(f"Creating collection: {LEGAL_CORPUS_COLLECTION}")
            self.qdrant.create_collection(
                collection_name=LEGAL_CORPUS_COLLECTION,
                vectors_config=VectorParams(
                    size=VECTOR_SIZE,
                    distance=Distance.COSINE,
                ),
            )
            logger.info(f"Collection {LEGAL_CORPUS_COLLECTION} created")
        else:
            logger.info(f"Collection {LEGAL_CORPUS_COLLECTION} already exists")

    async def _generate_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings via the embedding service.

        Raises:
            Exception: any HTTP/transport error is logged and re-raised so
                the caller's retry loop can handle it.
        """
        try:
            response = await self.http_client.post(
                f"{EMBEDDING_SERVICE_URL}/embed",
                json={"texts": texts},
                timeout=120.0,
            )
            response.raise_for_status()
            data = response.json()
            return data["embeddings"]
        except Exception as e:
            logger.error(f"Embedding generation failed: {e}")
            raise

    # Delegate chunking/text methods to legal_corpus_chunking module.
    # Kept as instance methods for backward compatibility with the
    # pre-split API.
    def _split_into_sentences(self, text: str) -> List[str]:
        return split_into_sentences(text)

    def _split_into_paragraphs(self, text: str) -> List[str]:
        return split_into_paragraphs(text)

    def _chunk_text_semantic(self, text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP):
        return chunk_text_semantic(text, chunk_size, overlap)

    def _extract_article_info(self, text: str):
        return extract_article_info(text)

    def _html_to_text(self, html_content: str) -> str:
        return html_to_text(html_content)

    async def _fetch_document_text(self, regulation: Regulation) -> Optional[str]:
        """
        Fetch document text from local file or URL.

        Priority:
        1. Local file in docs/legal_corpus/ (.txt or .pdf)
        2. EUR-Lex via CELEX URL (for EU regulations/directives)
        3. Fallback to original source URL

        Returns None when every source fails.
        """
        # Check for local file first
        local_file = LEGAL_DOCS_PATH / f"{regulation.code}.txt"
        if local_file.exists():
            logger.info(f"Loading {regulation.code} from local file: {local_file}")
            return local_file.read_text(encoding="utf-8")

        local_pdf = LEGAL_DOCS_PATH / f"{regulation.code}.pdf"
        if local_pdf.exists():
            logger.info(f"Extracting text from PDF: {local_pdf}")
            try:
                # Context manager fixes the original's leaked file handle:
                # open(...) was passed to httpx and never closed, including
                # on request failure.
                with local_pdf.open("rb") as pdf_file:
                    response = await self.http_client.post(
                        f"{EMBEDDING_SERVICE_URL}/extract-pdf",
                        files={"file": pdf_file},
                        timeout=120.0,
                    )
                response.raise_for_status()
                data = response.json()
                return data.get("text", "")
            except Exception as e:
                # Fall through to the remote sources below.
                logger.error(f"PDF extraction failed for {regulation.code}: {e}")

        # Try EUR-Lex CELEX URL if available
        if regulation.celex:
            celex_url = f"https://eur-lex.europa.eu/legal-content/DE/TXT/HTML/?uri=CELEX:{regulation.celex}"
            logger.info(f"Fetching {regulation.code} from EUR-Lex CELEX: {celex_url}")
            try:
                response = await self.http_client.get(
                    celex_url,
                    follow_redirects=True,
                    headers={
                        "Accept": "text/html,application/xhtml+xml",
                        "Accept-Language": "de-DE,de;q=0.9",
                        "User-Agent": "Mozilla/5.0 (compatible; LegalCorpusIndexer/1.0)",
                    },
                    timeout=120.0,
                )
                response.raise_for_status()
                html_content = response.text
                # Heuristics: EUR-Lex serves a CAPTCHA page to bots, and very
                # short responses indicate an error page rather than the act.
                if "verify that you're not a robot" not in html_content and len(html_content) > 10000:
                    text = self._html_to_text(html_content)
                    if text and len(text) > 1000:
                        logger.info(f"Successfully fetched {regulation.code} via CELEX ({len(text)} chars)")
                        return text
                    else:
                        logger.warning(f"CELEX response too short for {regulation.code}, trying fallback")
                else:
                    logger.warning(f"CELEX returned CAPTCHA for {regulation.code}, trying fallback")
            except Exception as e:
                logger.warning(f"CELEX fetch failed for {regulation.code}: {e}, trying fallback")

        # Fallback to original source URL
        logger.info(f"Fetching {regulation.code} from: {regulation.source_url}")
        try:
            parsed_url = urlparse(regulation.source_url)
            is_pdf_url = parsed_url.path.lower().endswith('.pdf')
            if is_pdf_url:
                logger.info(f"Downloading PDF from URL for {regulation.code}")
                response = await self.http_client.get(
                    regulation.source_url,
                    follow_redirects=True,
                    headers={
                        "Accept": "application/pdf",
                        "User-Agent": "Mozilla/5.0 (compatible; LegalCorpusIndexer/1.0)",
                    },
                    timeout=180.0,
                )
                response.raise_for_status()
                pdf_content = response.content
                # Text extraction is delegated to the embedding service's
                # /extract-pdf endpoint rather than done in-process.
                extract_response = await self.http_client.post(
                    f"{EMBEDDING_SERVICE_URL}/extract-pdf",
                    files={"file": ("document.pdf", pdf_content, "application/pdf")},
                    timeout=180.0,
                )
                extract_response.raise_for_status()
                data = extract_response.json()
                text = data.get("text", "")
                if text:
                    logger.info(f"Successfully extracted PDF text for {regulation.code} ({len(text)} chars)")
                    return text
                else:
                    logger.warning(f"PDF extraction returned empty text for {regulation.code}")
                    return None
            else:
                response = await self.http_client.get(
                    regulation.source_url,
                    follow_redirects=True,
                    headers={
                        "Accept": "text/html,application/xhtml+xml",
                        "Accept-Language": "de-DE,de;q=0.9",
                        "User-Agent": "Mozilla/5.0 (compatible; LegalCorpusIndexer/1.0)",
                    },
                    timeout=120.0,
                )
                response.raise_for_status()
                text = self._html_to_text(response.text)
                return text
        except Exception as e:
            logger.error(f"Failed to fetch {regulation.code}: {e}")
            return None

    async def ingest_regulation(self, regulation: Regulation) -> int:
        """Ingest a single regulation into Qdrant. Returns number of chunks indexed."""
        logger.info(f"Ingesting {regulation.code}: {regulation.name}")
        text = await self._fetch_document_text(regulation)
        if not text or len(text) < 100:
            logger.warning(f"No text found for {regulation.code}, skipping")
            return 0

        chunks = self._chunk_text_semantic(text)
        logger.info(f"Created {len(chunks)} chunks for {regulation.code}")
        if not chunks:
            return 0

        # Small batches + inter-batch sleep keep load on the embedding
        # service modest; failed batches are skipped after max_retries.
        batch_size = 4
        all_points = []
        max_retries = 3
        for i in range(0, len(chunks), batch_size):
            batch_chunks = chunks[i:i + batch_size]
            chunk_texts = [c[0] for c in batch_chunks]
            embeddings = None
            for retry in range(max_retries):
                try:
                    embeddings = await self._generate_embeddings(chunk_texts)
                    break
                except Exception as e:
                    logger.warning(f"Embedding attempt {retry+1}/{max_retries} failed for batch {i//batch_size}: {e}")
                    if retry < max_retries - 1:
                        # Linear backoff: 3s, 6s, ...
                        await asyncio.sleep(3 * (retry + 1))
                    else:
                        logger.error(f"Embedding failed permanently for batch {i//batch_size}")
            if embeddings is None:
                continue
            # Throttle between successful batches.
            await asyncio.sleep(1.5)
            for j, ((chunk_text, position), embedding) in enumerate(zip(batch_chunks, embeddings)):
                chunk_idx = i + j
                # Deterministic ID so re-ingestion upserts (replaces) rather
                # than duplicating points.
                point_id = hashlib.md5(f"{regulation.code}-{chunk_idx}".encode()).hexdigest()
                article_info = self._extract_article_info(chunk_text)
                point = PointStruct(
                    id=point_id,
                    vector=embedding,
                    payload={
                        "text": chunk_text,
                        "regulation_code": regulation.code,
                        "regulation_name": regulation.name,
                        "regulation_full_name": regulation.full_name,
                        "regulation_type": regulation.regulation_type,
                        "source_url": regulation.source_url,
                        "chunk_index": chunk_idx,
                        "chunk_position": position,
                        "article": article_info.get("article") if article_info else None,
                        "paragraph": article_info.get("paragraph") if article_info else None,
                        "language": regulation.language,
                        # Timezone-aware UTC timestamp; datetime.utcnow() is
                        # deprecated (3.12+) and produced a naive value.
                        "indexed_at": datetime.now(timezone.utc).isoformat(),
                        "training_allowed": False,
                    },
                )
                all_points.append(point)

        if all_points:
            self.qdrant.upsert(
                collection_name=LEGAL_CORPUS_COLLECTION,
                points=all_points,
            )
            logger.info(f"Indexed {len(all_points)} chunks for {regulation.code}")
        return len(all_points)

    async def ingest_all(self) -> Dict[str, int]:
        """Ingest all regulations. Returns per-code chunk counts."""
        results = {}
        total = 0
        for regulation in REGULATIONS:
            try:
                count = await self.ingest_regulation(regulation)
                results[regulation.code] = count
                total += count
            except Exception as e:
                # One failed regulation must not abort the whole run.
                logger.error(f"Failed to ingest {regulation.code}: {e}")
                results[regulation.code] = 0
        logger.info(f"Ingestion complete: {total} total chunks indexed")
        return results

    async def ingest_selected(self, codes: List[str]) -> Dict[str, int]:
        """Ingest selected regulations by code. Unknown codes map to 0."""
        results = {}
        for code in codes:
            regulation = next((r for r in REGULATIONS if r.code == code), None)
            if not regulation:
                logger.warning(f"Unknown regulation code: {code}")
                results[code] = 0
                continue
            try:
                count = await self.ingest_regulation(regulation)
                results[code] = count
            except Exception as e:
                logger.error(f"Failed to ingest {code}: {e}")
                results[code] = 0
        return results

    def get_status(self) -> Dict:
        """Get collection status and per-regulation indexed chunk counts."""
        try:
            collection_info = self.qdrant.get_collection(LEGAL_CORPUS_COLLECTION)
            regulation_counts = {}
            for reg in REGULATIONS:
                result = self.qdrant.count(
                    collection_name=LEGAL_CORPUS_COLLECTION,
                    count_filter=Filter(
                        must=[
                            FieldCondition(
                                key="regulation_code",
                                match=MatchValue(value=reg.code),
                            )
                        ]
                    ),
                )
                regulation_counts[reg.code] = result.count
            return {
                "collection": LEGAL_CORPUS_COLLECTION,
                "total_points": collection_info.points_count,
                "vector_size": VECTOR_SIZE,
                "regulations": regulation_counts,
                "status": "ready" if collection_info.points_count > 0 else "empty",
            }
        except Exception as e:
            # Surface the error in the status dict instead of raising, so the
            # CLI --status call always produces JSON.
            return {
                "collection": LEGAL_CORPUS_COLLECTION,
                "error": str(e),
                "status": "error",
            }

    async def search(
        self,
        query: str,
        regulation_codes: Optional[List[str]] = None,
        top_k: int = 5,
    ) -> List[Dict]:
        """Search the legal corpus for relevant passages.

        Args:
            query: natural-language query to embed and match.
            regulation_codes: optional allow-list of regulation codes
                (OR-combined filter).
            top_k: maximum number of hits to return.
        """
        embeddings = await self._generate_embeddings([query])
        query_vector = embeddings[0]
        search_filter = None
        if regulation_codes:
            search_filter = Filter(
                should=[
                    FieldCondition(
                        key="regulation_code",
                        match=MatchValue(value=code),
                    )
                    for code in regulation_codes
                ]
            )
        results = self.qdrant.search(
            collection_name=LEGAL_CORPUS_COLLECTION,
            query_vector=query_vector,
            query_filter=search_filter,
            limit=top_k,
        )
        return [
            {
                "text": hit.payload.get("text"),
                "regulation_code": hit.payload.get("regulation_code"),
                "regulation_name": hit.payload.get("regulation_name"),
                "article": hit.payload.get("article"),
                "paragraph": hit.payload.get("paragraph"),
                "source_url": hit.payload.get("source_url"),
                "score": hit.score,
            }
            for hit in results
        ]

    async def close(self):
        """Close HTTP client."""
        await self.http_client.aclose()
async def main():
    """CLI entry point: status, full/selective ingestion, or a test search."""
    import argparse

    parser = argparse.ArgumentParser(description="Legal Corpus Ingestion for UCCA")
    parser.add_argument("--ingest-all", action="store_true", help="Ingest all regulations")
    parser.add_argument("--ingest", nargs="+", metavar="CODE", help="Ingest specific regulations by code")
    parser.add_argument("--status", action="store_true", help="Show collection status")
    parser.add_argument("--search", type=str, help="Test search query")
    opts = parser.parse_args()

    def show_counts(counts):
        # Shared per-regulation listing for both ingest modes.
        print("\nResults:")
        for reg_code, n_chunks in counts.items():
            print(f" {reg_code}: {n_chunks} chunks")

    service = LegalCorpusIngestion()
    try:
        if opts.status:
            print(json.dumps(service.get_status(), indent=2))
        elif opts.ingest_all:
            print(f"Ingesting all {len(REGULATIONS)} regulations...")
            counts = await service.ingest_all()
            show_counts(counts)
            print(f"\nTotal: {sum(counts.values())} chunks")
        elif opts.ingest:
            print(f"Ingesting: {', '.join(opts.ingest)}")
            show_counts(await service.ingest_selected(opts.ingest))
        elif opts.search:
            print(f"Searching: {opts.search}")
            hits = await service.search(opts.search)
            print(f"\nFound {len(hits)} results:")
            for rank, hit in enumerate(hits, 1):
                print(f"\n{rank}. [{hit['regulation_code']}] Score: {hit['score']:.3f}")
                if hit.get('article'):
                    abs_part = f" Abs. {hit['paragraph']}" if hit.get('paragraph') else ""
                    print(f" Art. {hit['article']}" + abs_part)
                print(f" {hit['text'][:200]}...")
        else:
            parser.print_help()
    finally:
        # Always release the async HTTP client, even on error.
        await service.close()
# Script entry point: drive the async CLI via the event loop.
if __name__ == "__main__":
    asyncio.run(main())