Source code for purrfectmeow.tc02_mlt.base

from typing import Any, Literal

from langchain_text_splitters import TokenTextSplitter

from .separate import SeparateSplit
from .token import TokenSplit


[docs] class Malet: DEFAULT_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" DEFAULT_CHUNK_SIZE = 500 DEFAULT_CHUNK_OVERLAP = 0 DEFAULT_CHUNK_SEPARATOR = "\n\n" @staticmethod def _get_kwarg(kwargs: dict[str, Any], keys: list[str], default: Any = None) -> Any: for key in keys: if key in kwargs: return kwargs[key] return default
[docs] @classmethod def chunking( cls, text: str, chunk_method: Literal["token", "separate"] | None = "token", **kwargs: Any ) -> TokenTextSplitter | SeparateSplit.CharacterSeparator: match chunk_method: case "token": model_name = cls._get_kwarg(kwargs, ["model_name", "ModelName", "modelName"], cls.DEFAULT_MODEL_NAME) chunk_size = cls._get_kwarg(kwargs, ["chunk_size", "ChunkSize", "chunkSize"], cls.DEFAULT_CHUNK_SIZE) chunk_overlap = cls._get_kwarg( kwargs, ["chunk_overlap", "ChunkOverlap", "chunkOverlap"], cls.DEFAULT_CHUNK_OVERLAP ) method = TokenSplit.splitter(model_name, chunk_size, chunk_overlap) case "separate": chunk_separator = cls._get_kwarg( kwargs, ["chunk_separator", "ChunkSeparator", "chunkSeparator"], cls.DEFAULT_CHUNK_SEPARATOR ) method = SeparateSplit.splitter(chunk_separator) return method.split_text(text)