"""
DSFA Chunking — Text chunking strategies for document ingestion.

Contains:
- chunk_text_recursive: Recursive chunking with overlap
- chunk_by_sections: Section-marker-based chunking
- chunk_by_list_items: List-item-based chunking
- chunk_document: Strategy router
"""

import re
from typing import List, Dict

from dsfa_sources_registry import DSFA_CHUNK_CONFIG


def chunk_text_recursive(text: str, max_size: int = 1000, overlap: int = 200) -> List[Dict]:
    """Split text into overlapping chunks of at most ``max_size`` characters.

    Each window is cut at the last natural separator (paragraph break, line
    break, sentence end, comma, space — tried in that order) that lies in the
    second half of the window; if none qualifies the window is cut hard at
    ``max_size``.

    Args:
        text: The text to split.
        max_size: Maximum number of characters per window. Must be positive.
        overlap: Number of characters the next window re-reads from the end
            of the previous one.

    Returns:
        A list of dicts with keys ``content`` (the window with surrounding
        whitespace stripped), ``start_char`` and ``end_char`` (offsets of the
        *unstripped* window in ``text``). Whitespace-only windows are skipped.

    Raises:
        ValueError: If ``max_size`` is not positive.
    """
    if max_size <= 0:
        raise ValueError("max_size must be a positive integer")

    separators = ("\n\n", "\n", ". ", ", ", " ")
    text_len = len(text)
    min_break = max_size // 2
    chunks: List[Dict] = []
    start = 0

    while start < text_len:
        end = min(start + max_size, text_len)

        # Prefer a natural break point, but only in the second half of the
        # window so chunks do not become needlessly small.
        if end < text_len:
            for sep in separators:
                # Search in place instead of slicing a copy per separator.
                pos = text.rfind(sep, start, end)
                if pos != -1 and pos - start > min_break:
                    end = pos + len(sep)
                    break

        content = text[start:end].strip()
        if content:
            chunks.append({
                "content": content,
                "start_char": start,
                "end_char": end
            })

        if end >= text_len:
            break

        # Step back by `overlap`, but never to or before the current start:
        # an overlap as large as the chunk just emitted would otherwise stall
        # (or move backwards) and loop forever. In that case drop the overlap
        # for this step.
        next_start = end - overlap
        start = next_start if next_start > start else end

    return chunks


def chunk_by_sections(text: str, markers: List[str], max_size: int = 1500, overlap: int = 200) -> List[Dict]:
    """Chunk text at section markers, sub-chunking oversized sections.

    Every marker match (case-insensitive, multiline) starts a section that
    runs up to the next match or the end of the text. Non-blank text before
    the first marker is kept as a leading chunk with an empty
    ``section_title`` rather than being dropped.

    Args:
        text: The text to split.
        markers: Regular-expression fragments; each is wrapped in a group and
            the groups are OR-ed together.
        max_size: Sections whose stripped text is longer than this are split
            further with ``chunk_text_recursive``.
        overlap: Overlap passed to ``chunk_text_recursive``.

    Returns:
        A list of dicts with keys ``content``, ``section_title``,
        ``start_char`` and ``end_char`` (offsets into ``text``). Follow-up
        pieces of a split section get the title suffix `` (cont.)``. If
        ``markers`` is empty or nothing matches, the result of
        ``chunk_text_recursive`` is returned as-is (no ``section_title`` key).
    """
    # An empty marker list would build an empty pattern that matches at every
    # position (one "section" per character); treat it as "no markers found".
    if not markers:
        return chunk_text_recursive(text, max_size, overlap)

    pattern = "|".join(f"({m})" for m in markers)
    matches = list(re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE))
    if not matches:
        return chunk_text_recursive(text, max_size, overlap)

    chunks: List[Dict] = []

    def emit(span_start: int, span_end: int, title: str) -> None:
        """Append text[span_start:span_end] as one chunk or several sub-chunks."""
        raw = text[span_start:span_end]
        body = raw.strip()

        if len(body) <= max_size:
            chunks.append({
                "content": body,
                "section_title": title,
                "start_char": span_start,
                "end_char": span_end
            })
            return

        # Sub-chunk offsets are relative to the stripped body, so shift them
        # by the leading whitespace that strip() removed.
        base = span_start + (len(raw) - len(raw.lstrip()))
        cont_title = f"{title} (cont.)" if title else title
        for j, sub in enumerate(chunk_text_recursive(body, max_size, overlap)):
            chunks.append({
                "content": sub["content"],
                "section_title": title if j == 0 else cont_title,
                "start_char": base + sub["start_char"],
                "end_char": base + sub["end_char"]
            })

    # Keep any preamble that precedes the first marker.
    first_start = matches[0].start()
    if text[:first_start].strip():
        emit(0, first_start, "")

    for i, match in enumerate(matches):
        section_end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        emit(match.start(), section_end, match.group(0).strip())

    return chunks


def chunk_by_list_items(text: str, markers: List[str], max_size: int = 800) -> List[Dict]:
    """Chunk text so that each list item becomes its own chunk.

    A line whose stripped form matches one of ``markers`` at its start opens
    a new item; following non-marker lines are appended to that item. Lines
    before the first marker form a leading chunk.

    Args:
        text: The text to split.
        markers: Regular-expression fragments; each is wrapped in a group and
            the groups are OR-ed together.
        max_size: Items whose stripped text is longer than this are split
            further with ``chunk_text_recursive`` (no overlap). A
            non-positive value disables splitting.

    Returns:
        A list of dicts with keys ``content`` (stripped item text),
        ``start_char`` and ``end_char`` (offsets into ``text``; ``end_char``
        excludes the newline that separates the item from the next one).
        Whitespace-only items are skipped.
    """
    # Compile once instead of re-building the pattern for every line.
    item_re = re.compile("|".join(f"({m})" for m in markers))
    chunks: List[Dict] = []

    def flush(item_lines: List[str], item_start: int) -> None:
        """Append the buffered item as one chunk, or several if oversized."""
        raw = "\n".join(item_lines)
        body = raw.strip()
        if not body:
            return

        if max_size <= 0 or len(body) <= max_size:
            chunks.append({
                "content": body,
                "start_char": item_start,
                "end_char": item_start + len(raw)
            })
            return

        # Sub-chunk offsets are relative to the stripped body, so shift them
        # by the leading whitespace that strip() removed.
        base = item_start + (len(raw) - len(raw.lstrip()))
        for sub in chunk_text_recursive(body, max_size, 0):
            chunks.append({
                "content": sub["content"],
                "start_char": base + sub["start_char"],
                "end_char": base + sub["end_char"]
            })

    buffered: List[str] = []
    buffered_start = 0
    offset = 0  # running character offset of the current line within `text`

    for line in text.split("\n"):
        if item_re.match(line.strip()):
            flush(buffered, buffered_start)
            buffered = [line]
            buffered_start = offset
        else:
            buffered.append(line)
        offset += len(line) + 1  # +1 for the "\n" removed by split()

    flush(buffered, buffered_start)
    return chunks


def chunk_document(text: str, source_code: str) -> List[Dict]:
    """Route a document to the chunking strategy configured for its source.

    The configuration is looked up in ``DSFA_CHUNK_CONFIG`` by
    ``source_code``, falling back to the ``"DEFAULT"`` entry. Source codes
    ending in ``_MUSS_PUBLIC`` or ``_MUSS_PRIVATE`` always use the
    ``"MUSS_LISTEN"`` entry, regardless of any entry under their own code.

    Args:
        text: The document text to chunk.
        source_code: Key identifying the document's source.

    Returns:
        The chunk list produced by the selected strategy: section-based,
        list-item, or (for any other strategy value) recursive chunking.
    """
    fallback = DSFA_CHUNK_CONFIG["DEFAULT"]
    settings = DSFA_CHUNK_CONFIG.get(source_code, fallback)

    # The MUSS-list suffix overrides whatever was found for the exact code.
    if source_code.endswith(("_MUSS_PUBLIC", "_MUSS_PRIVATE")):
        settings = DSFA_CHUNK_CONFIG["MUSS_LISTEN"]

    strategy = settings["strategy"]

    if strategy == "section_based":
        return chunk_by_sections(
            text,
            settings["section_markers"],
            settings["max_chunk_size"],
            settings["overlap"],
        )

    if strategy == "list_item":
        return chunk_by_list_items(
            text,
            settings["list_markers"],
            settings["max_chunk_size"],
        )

    return chunk_text_recursive(
        text,
        settings["max_chunk_size"],
        settings["overlap"],
    )