Source code for purrfectmeow.konja.base

from typing import List, Optional, Literal

from purrfectmeow.konja.splitter import Splitter

[docs] class Kornja: """ A flexible interface for text segmentation based on tokenization or custom separators. This class provides a unified API for splitting large text inputs into smaller, manageable chunks using either token-based or separator-based strategies. It supports configuration of chunk size, overlap, and model-specific parameters for token-aware splitting, as well as simple string-based segmentation for more structured inputs. """
[docs] @staticmethod def chunking( text: str, splitter: Optional[Literal["token", "separator"]] = "token", **kwargs ) -> List[str]: """ Handles text chunking with token or separator-based splitting. Parameters ---------- text : str The input text to be chunked. splitter : str, optional The type of splitter to use for chunking. Must be either 'token' or 'separator'. **kwargs : dict Additional parameters for the splitter: For 'token' splitter: model_name : str, optional Name of the model for token-based splitting. chunk_size : int, optional Maximum size of each chunk in tokens. chunk_overlap : int, optional Number of overlapping tokens between chunks. For 'separator' splitter: separator : str, optional String used to split the text. Returns ------- List[str] A list of text chunks generated by the specified splitter. Raises ------ ValueError If `splitter` is not 'token' or 'separator', or if required parameters (`model_name` for 'token', `separator` for 'separator') are invalid or empty. Examples -------- >>> text = "This is a sample text.\\n\\nAnother paragraph." >>> Kornja.chunking(text, splitter="separator") ['This is a sample text.', 'Another paragraph.'] >>> Kornja.chunking(text, splitter="token", model_name="text-embedding-ada-002", chunk_size=10) ['This is a', 'sample text.', 'Another', 'paragraph.'] """ match splitter: case "token": model_name = kwargs.get("model_name", "text-embedding-ada-002") chunk_size = kwargs.get("chunk_size", 500) chunk_overlap = kwargs.get("chunk_overlap", 0) if not model_name or not isinstance(model_name, str) or not model_name.strip(): raise ValueError("model_name must be a non-empty string for token splitter") sptr = Splitter.create_token_splitter(model_name, chunk_size, chunk_overlap) case "separator": separator = kwargs.get("separator", "\n\n") if not separator or not isinstance(separator, str) or not separator.strip(): raise ValueError("separator must be a non-empty string for separator splitter") sptr = Splitter.create_separator_splitter(separator) case _: raise ValueError(f"Invalid splitter type: {splitter}. Must be 'token' or 'separator'.") chunks = sptr.split_text(text) return chunks