class KhaoManee:
    """
    Facade over the embedding and tokenization helpers.

    Every method is a static, one-call wrapper: document and query embedding
    are forwarded to `SimpleHFEmbedder`, and tokenization is forwarded to
    `SimpleTokenization`. No state is kept on the class, so the methods are
    called directly on it, e.g. ``KhaoManee.get_tokens(...)``.
    """

    @staticmethod
    def get_embeddings(
        documents: Document,
        model_name: Optional[str] = "intfloat/multilingual-e5-large-instruct",
    ) -> numpy.ndarray:
        """
        Embed document(s) by delegating to `SimpleHFEmbedder.embed_documents`.

        Parameters
        ----------
        documents : Document
            The document(s) to embed. Passed through unchanged.
        model_name : str, optional
            Name of the embedding model to use.
            Defaults to 'intfloat/multilingual-e5-large-instruct'.

        Returns
        -------
        numpy.ndarray
            Whatever `SimpleHFEmbedder.embed_documents` returns for the input.
            NOTE(review): presumably one embedding row per document; confirm
            the exact shape against `SimpleHFEmbedder`.

        Examples
        --------
        >>> from langchain_core.documents import Document
        >>> doc = Document(page_content="This is a test document.")
        >>> KhaoManee.get_embeddings(doc)  # doctest: +SKIP
        array([[0.1, 0.2, ...], ...])
        """
        # Arguments stay positional: the helper's parameter names are not
        # visible from this class, so keywords could break the call.
        document_vectors = SimpleHFEmbedder.embed_documents(documents, model_name)
        return document_vectors

    @staticmethod
    def get_query_embeddings(
        query: Optional[str] = "meow~",
        model_name: Optional[str] = "intfloat/multilingual-e5-large-instruct",
    ) -> numpy.ndarray:
        """
        Embed a query string by delegating to `SimpleHFEmbedder.embed_query`.

        Parameters
        ----------
        query : str, optional
            The query text to embed. Defaults to 'meow~'.
        model_name : str, optional
            Name of the embedding model to use.
            Defaults to 'intfloat/multilingual-e5-large-instruct'.

        Returns
        -------
        numpy.ndarray
            Whatever `SimpleHFEmbedder.embed_query` returns for the query.
            NOTE(review): presumably a single embedding vector; confirm the
            exact shape against `SimpleHFEmbedder`.

        Examples
        --------
        >>> KhaoManee.get_query_embeddings(query="What is this?")  # doctest: +SKIP
        array([0.1, 0.2, ...])
        """
        query_vector = SimpleHFEmbedder.embed_query(query, model_name)
        return query_vector

    @staticmethod
    def get_tokens(
        text: str,
        engine: Optional[Literal["spacy", "pythainlp", "huggingface"]] = "pythainlp",
    ) -> List[str]:
        """
        Tokenize text by delegating to `SimpleTokenization.tokenize`.

        Parameters
        ----------
        text : str
            The input text to tokenize.
        engine : str, optional
            Tokenization engine to use: one of 'spacy', 'pythainlp', or
            'huggingface'. Defaults to 'pythainlp'.

        Returns
        -------
        List[str]
            The tokens produced by the selected engine.

        Raises
        ------
        ValueError
            NOTE(review): expected from `SimpleTokenization.tokenize` when
            `engine` is not a supported value; this wrapper does no
            validation of its own. Confirm against the helper.

        Examples
        --------
        Illustrative output only; the exact tokens depend on the engine.

        >>> KhaoManee.get_tokens("Hello world", engine="pythainlp")  # doctest: +SKIP
        ['Hello', 'world']
        """
        token_list = SimpleTokenization.tokenize(text, engine)
        return token_list