# breakpilot-compliance/document-crawler/extractors/dispatcher.py

"""Routes files to the appropriate extractor by extension."""

from .pdf_extractor import extract_pdf
from .docx_extractor import extract_docx
from .xlsx_extractor import extract_xlsx
from .pptx_extractor import extract_pptx

# Dispatch table mapping a file extension to the function that extracts its
# text. Keys are lowercase and include the leading dot; values are the
# extractor callables imported above, each invoked with a file path.
EXTRACTORS = {
    ".pdf": extract_pdf,
    ".docx": extract_docx,
    ".xlsx": extract_xlsx,
    ".pptx": extract_pptx,
}


def extract_text(file_path: str, extension: str) -> str:
    """Extract text from a file based on its extension.

    The extension is normalized before lookup: surrounding whitespace is
    stripped, it is lower-cased, and a leading dot is added when missing, so
    ``".PDF"``, ``"pdf"`` and ``" .pdf "`` all select the same extractor.

    Args:
        file_path: Path of the file, passed unchanged to the extractor.
        extension: File extension, with or without the leading dot.

    Returns:
        Whatever the selected extractor returns for ``file_path``
        (annotated as ``str``).

    Raises:
        ValueError: If no extractor is registered for the extension.
    """
    ext = extension.strip().lower()
    # EXTRACTORS keys are dot-prefixed; accept bare forms such as "pdf" too.
    # An empty extension is left empty so it is still reported as unsupported.
    if ext and not ext.startswith("."):
        ext = f".{ext}"

    extractor = EXTRACTORS.get(ext)
    if extractor is None:
        raise ValueError(f"Unsupported file extension: {ext}")
    return extractor(file_path)