[docs]classKornja:""" A flexible interface for text segmentation based on tokenization or custom separators. This class provides a unified API for splitting large text inputs into smaller, manageable chunks using either token-based or separator-based strategies. It supports configuration of chunk size, overlap, and model-specific parameters for token-aware splitting, as well as simple string-based segmentation for more structured inputs. """
[docs]@staticmethoddefchunking(text:str,splitter:Optional[Literal["token","separator"]]="token",**kwargs)->List[str]:""" Handles text chunking with token or separator-based splitting. Parameters ---------- text : str The input text to be chunked. splitter : str, optional The type of splitter to use for chunking. Must be either 'token' or 'separator'. **kwargs : dict Additional parameters for the splitter: For 'token' splitter: model_name : str, optional Name of the model for token-based splitting. chunk_size : int, optional Maximum size of each chunk in tokens. chunk_overlap : int, optional Number of overlapping tokens between chunks. For 'separator' splitter: separator : str, optional String used to split the text. Returns ------- List[str] A list of text chunks generated by the specified splitter. Raises ------ ValueError If `splitter` is not 'token' or 'separator', or if required parameters (`model_name` for 'token', `separator` for 'separator') are invalid or empty. Examples -------- >>> text = "This is a sample text.\\n\\nAnother paragraph." >>> Kornja.chunking(text, splitter="separator") ['This is a sample text.', 'Another paragraph.'] >>> Kornja.chunking(text, splitter="token", model_name="text-embedding-ada-002", chunk_size=10) ['This is a', 'sample text.', 'Another', 'paragraph.'] """matchsplitter:case"token":model_name=kwargs.get("model_name","text-embedding-ada-002")chunk_size=kwargs.get("chunk_size",500)chunk_overlap=kwargs.get("chunk_overlap",0)ifnotmodel_nameornotisinstance(model_name,str)ornotmodel_name.strip():raiseValueError("model_name must be a non-empty string for token splitter")sptr=Splitter.create_token_splitter(model_name,chunk_size,chunk_overlap)case"separator":separator=kwargs.get("separator","\n\n")ifnotseparatorornotisinstance(separator,str)ornotseparator.strip():raiseValueError("separator must be a non-empty string for separator splitter")sptr=Splitter.create_separator_splitter(separator)case_:raiseValueError(f"Invalid splitter type: {splitter}. Must be 'token' or 'separator'.")chunks=sptr.split_text(text)returnchunks