# Source code for purrfectmeow.tc02_mlt.token

import time

from langchain_text_splitters import TokenTextSplitter

from purrfectmeow.meow.kitty import kitty_logger


class TokenSplit:
    """Factory for token-count-based text splitters.

    Builds a :class:`TokenTextSplitter` backed either by tiktoken (for
    known OpenAI embedding models) or by a HuggingFace tokenizer (for
    everything else).
    """

    _logger = kitty_logger(__name__)
    # OpenAI embedding models tokenized natively via tiktoken.
    _OPENAI_EMBED_MODEL = {"text-embedding-ada-002", "text-embedding-3-small", "text-embedding-3-large"}
    # HuggingFace-hosted ports of OpenAI tokenizers; these need GPT2TokenizerFast
    # rather than AutoTokenizer.
    _OPENAI_HF_MODEL = {"Xenova/text-embedding-ada-002"}
    # Local cache directory for downloaded HuggingFace tokenizer files.
    _HF_MODEL_DIR = ".cache/huggingface/hub/"

    @classmethod
    def splitter(cls, model_name: str, chunk_size: int, chunk_overlap: int) -> TokenTextSplitter:
        """Build a token splitter for *model_name*.

        Args:
            model_name: Embedding/tokenizer model identifier. Members of
                ``_OPENAI_EMBED_MODEL`` use the tiktoken encoder; anything
                else is loaded as a HuggingFace tokenizer (with
                ``_OPENAI_HF_MODEL`` entries loaded via GPT2TokenizerFast).
            chunk_size: Maximum chunk length in tokens.
            chunk_overlap: Number of overlapping tokens between chunks.

        Returns:
            A configured ``TokenTextSplitter``.

        Raises:
            Exception: Any tokenizer-loading or splitter-construction error
                is logged with traceback and re-raised.
        """
        cls._logger.debug("Initializing token splitter")
        start = time.time()
        try:
            if model_name in cls._OPENAI_EMBED_MODEL:
                # Fix: this log line previously ran before the branch check,
                # so HuggingFace models were also (mis)reported as OpenAI.
                cls._logger.debug(f"Using OpenAI model tokenizer: {model_name}")
                splitter = TokenTextSplitter.from_tiktoken_encoder(
                    model_name=model_name,
                    chunk_size=chunk_size,
                    chunk_overlap=chunk_overlap,
                )
            else:
                cls._logger.debug(f"Using HuggingFace tokenizer: {model_name}")
                # Imported lazily so transformers is only required on this path.
                from transformers import AutoTokenizer, GPT2TokenizerFast

                if model_name in cls._OPENAI_HF_MODEL:
                    tokenizer = GPT2TokenizerFast.from_pretrained(model_name, cache_dir=cls._HF_MODEL_DIR)
                else:
                    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cls._HF_MODEL_DIR)
                splitter = TokenTextSplitter.from_huggingface_tokenizer(
                    tokenizer=tokenizer,
                    chunk_size=chunk_size,
                    chunk_overlap=chunk_overlap,
                )
            cls._logger.debug("Token splitter successfully initialized.")
            return splitter
        except Exception as e:
            cls._logger.exception(f"Failed to initialize token splitter: {e}")
            raise
        finally:
            # Always report elapsed time, even on failure.
            elapsed = time.time() - start
            cls._logger.debug(f"Token splitting completed in {elapsed:.2f} seconds.")