Source code for purrfectmeow.konja.base

from typing import List, Optional, Literal

from purrfectmeow.konja.splitter import Splitter


[docs]
class Kornja:
    """
    A flexible interface for text segmentation based on tokenization or custom separators.

    This class provides a unified API for splitting large text inputs into smaller, manageable chunks using either
    token-based or separator-based strategies. It supports configuration of chunk size, overlap, and model-specific
    parameters for token-aware splitting, as well as simple string-based segmentation for more structured inputs.        
    """

[docs]
    @staticmethod
    def chunking(
        text: str,
        splitter: Optional[Literal["token", "separator"]] = "token",
        **kwargs
    ) -> List[str]:
        """
        Handles text chunking with token or separator-based splitting.

        Parameters
        ----------
        text : str
            The input text to be chunked.
        splitter : str, optional
            The type of splitter to use for chunking. Must be either 'token' or 'separator'.
        **kwargs : dict
            Additional parameters for the splitter:
            
            For 'token' splitter:
                model_name : str, optional
                    Name of the model for token-based splitting.
                chunk_size : int, optional
                    Maximum size of each chunk in tokens.
                chunk_overlap : int, optional
                    Number of overlapping tokens between chunks.
            
            For 'separator' splitter:
                separator : str, optional
                    String used to split the text.

        Returns
        -------
        List[str]
            A list of text chunks generated by the specified splitter.

        Raises
        ------
        ValueError
            If `splitter` is not 'token' or 'separator', or if required parameters
            (`model_name` for 'token', `separator` for 'separator') are invalid or empty.

        Examples
        --------
        >>> text = "This is a sample text.\\n\\nAnother paragraph."
        >>> Kornja.chunking(text, splitter="separator")
        ['This is a sample text.', 'Another paragraph.']
        >>> Kornja.chunking(text, splitter="token", model_name="text-embedding-ada-002", chunk_size=10)
        ['This is a', 'sample text.', 'Another', 'paragraph.']
        """
        match splitter:
            case "token":
                model_name = kwargs.get("model_name", "text-embedding-ada-002")
                chunk_size = kwargs.get("chunk_size", 500)
                chunk_overlap = kwargs.get("chunk_overlap", 0)

                if not model_name or not isinstance(model_name, str) or not model_name.strip():
                    raise ValueError("model_name must be a non-empty string for token splitter")
                sptr = Splitter.create_token_splitter(model_name, chunk_size, chunk_overlap)

            case "separator":
                separator = kwargs.get("separator", "\n\n")
                
                if not separator or not isinstance(separator, str) or not separator.strip():
                    raise ValueError("separator must be a non-empty string for separator splitter")
                sptr = Splitter.create_separator_splitter(separator)
                
            case _:
                raise ValueError(f"Invalid splitter type: {splitter}. Must be 'token' or 'separator'.")

        chunks = sptr.split_text(text)
        return chunks