Source code for purrfectmeow.plort.base

import numpy
from typing import Optional, Literal, List
from langchain_core.documents import Document

from purrfectmeow.plort.embedder import SimpleHFEmbedder
from purrfectmeow.plort.tokenization import SimpleTokenization

class KhaoManee:
    """
    A class that wraps the underlying complexity of the embedding and tokenization processes.

    This class consolidates methods from `SimpleHFEmbedder` and `SimpleTokenization`
    to encode documents and query strings into dense vectors using pre-trained
    transformer models, as well as to tokenize text through various supported engines.
    """
    @staticmethod
    def get_embeddings(
        documents: Document,
        model_name: Optional[str] = "intfloat/multilingual-e5-large-instruct"
    ) -> numpy.ndarray:
        """
        Generates embeddings for the given document(s) using a specified model.

        Parameters
        ----------
        documents : Document
            The document(s) to generate embeddings for.
        model_name : str, optional
            The name of the model to use for embedding generation.
            Defaults to 'intfloat/multilingual-e5-large-instruct'.

        Returns
        -------
        numpy.ndarray
            An array of embeddings for the input documents.

        Examples
        --------
        >>> from langchain_core.documents import Document
        >>> doc = Document(page_content="This is a test document.")
        >>> KhaoManee.get_embeddings(doc)
        array([[0.1, 0.2, ...], ...])
        """
        return SimpleHFEmbedder.embed_documents(documents, model_name)
    @staticmethod
    def get_query_embeddings(
        query: Optional[str] = "meow~",
        model_name: Optional[str] = "intfloat/multilingual-e5-large-instruct"
    ) -> numpy.ndarray:
        """
        Generates embeddings for a query string using a specified model.

        Parameters
        ----------
        query : str, optional
            The query string to generate embeddings for.
            Defaults to 'meow~'.
        model_name : str, optional
            The name of the model to use for embedding generation.
            Defaults to 'intfloat/multilingual-e5-large-instruct'.

        Returns
        -------
        numpy.ndarray
            An array of embeddings for the input query.

        Examples
        --------
        >>> KhaoManee.get_query_embeddings(query="What is this?")
        array([0.1, 0.2, ...])
        """
        return SimpleHFEmbedder.embed_query(query, model_name)
    @staticmethod
    def get_tokens(
        text: str,
        engine: Optional[Literal["spacy", "pythainlp", "huggingface"]] = "pythainlp"
    ) -> List[str]:
        """
        Tokenizes input text using a specified tokenization engine.

        Parameters
        ----------
        text : str
            The input text to tokenize.
        engine : str, optional
            The tokenization engine to use. Must be one of 'spacy',
            'pythainlp', or 'huggingface'. Defaults to 'pythainlp'.

        Returns
        -------
        List[str]
            A list of tokens extracted from the input text.

        Raises
        ------
        ValueError
            If the specified engine is not one of 'spacy', 'pythainlp', or 'huggingface'.

        Examples
        --------
        >>> KhaoManee.get_tokens("Hello world", engine="pythainlp")
        ['Hello', 'world']
        """
        return SimpleTokenization.tokenize(text, engine)
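
The following is a minimal usage sketch, not part of the module source. It assumes the default model shown above can be loaded from Hugging Face and that the document and query embeddings come back as 2-D and 1-D numpy arrays, as the docstrings describe; the `cosine_similarity` helper is introduced here purely for illustration and is not provided by the package.

# Usage sketch (illustrative): embed a document and a query with
# KhaoManee, compare them with cosine similarity, then tokenize text.
import numpy
from langchain_core.documents import Document
from purrfectmeow.plort.base import KhaoManee

doc = Document(page_content="Khao Manee cats have white coats.")

# Document embeddings: expected shape (n_documents, embedding_dim).
doc_vectors = KhaoManee.get_embeddings(doc)

# Query embedding: expected shape (embedding_dim,).
query_vector = KhaoManee.get_query_embeddings(query="What colour is a Khao Manee?")

# Hypothetical helper, not part of purrfectmeow: plain cosine similarity.
def cosine_similarity(a: numpy.ndarray, b: numpy.ndarray) -> float:
    return float(numpy.dot(a, b) / (numpy.linalg.norm(a) * numpy.linalg.norm(b)))

print(cosine_similarity(doc_vectors[0], query_vector))

# Tokenization via the default 'pythainlp' engine.
print(KhaoManee.get_tokens("Hello world", engine="pythainlp"))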