Source code for purrfectmeow.plort.base

import numpy
from typing import Optional, Literal, List
from langchain_core.documents import Document

from purrfectmeow.plort.embedder import SimpleHFEmbedder
from purrfectmeow.plort.tokenization import SimpleTokenization

class KhaoManee:
    """
    A class that wraps the underlying complexity of the embedding and tokenization processes.

    This class consolidates methods from `SimpleHFEmbedder` and `SimpleTokenization`
    to encode documents and query strings into dense vectors using pre-trained
    transformer models, as well as to tokenize text through various supported engines.
    """
    @staticmethod
    def get_embeddings(
        documents: Document,
        model_name: Optional[str] = "intfloat/multilingual-e5-large-instruct"
    ) -> numpy.ndarray:
        """
        Generates embeddings for the given document(s) using a specified model.

        Parameters
        ----------
        documents : Document
            The document(s) to generate embeddings for.
        model_name : str, optional
            The name of the model to use for embedding generation.
            Defaults to 'intfloat/multilingual-e5-large-instruct'.

        Returns
        -------
        numpy.ndarray
            An array of embeddings for the input documents.

        Examples
        --------
        >>> from langchain_core.documents import Document
        >>> doc = Document(page_content="This is a test document.")
        >>> KhaoManee.get_embeddings(doc)
        array([[0.1, 0.2, ...], ...])
        """
        return SimpleHFEmbedder.embed_documents(documents, model_name)
    @staticmethod
    def get_query_embeddings(
        query: Optional[str] = "meow~",
        model_name: Optional[str] = "intfloat/multilingual-e5-large-instruct"
    ) -> numpy.ndarray:
        """
        Generates embeddings for a query string using a specified model.

        Parameters
        ----------
        query : str, optional
            The query string to generate embeddings for.
            Defaults to 'meow~'.
        model_name : str, optional
            The name of the model to use for embedding generation.
            Defaults to 'intfloat/multilingual-e5-large-instruct'.

        Returns
        -------
        numpy.ndarray
            An array of embeddings for the input query.

        Examples
        --------
        >>> KhaoManee.get_query_embeddings(query="What is this?")
        array([0.1, 0.2, ...])
        """
        return SimpleHFEmbedder.embed_query(query, model_name)
    @staticmethod
    def get_tokens(
        text: str,
        engine: Optional[Literal["spacy", "pythainlp", "huggingface"]] = "pythainlp"
    ) -> List[str]:
        """
        Tokenizes input text using a specified tokenization engine.

        Parameters
        ----------
        text : str
            The input text to tokenize.
        engine : str, optional
            The tokenization engine to use. Must be one of 'spacy',
            'pythainlp', or 'huggingface'. Defaults to 'pythainlp'.

        Returns
        -------
        List[str]
            A list of tokens extracted from the input text.

        Raises
        ------
        ValueError
            If the specified engine is not one of 'spacy', 'pythainlp', or 'huggingface'.

        Examples
        --------
        >>> KhaoManee.get_tokens("Hello world", engine="pythainlp")
        ['Hello', 'world']
        """
        return SimpleTokenization.tokenize(text, engine)
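
The following is a minimal usage sketch, not part of the module source. It assumes the default model shown above can be loaded from Hugging Face and that the document and query embeddings come back as 2-D and 1-D numpy arrays, as the docstrings describe; the `cosine_similarity` helper is introduced here purely for illustration and is not provided by the package.

# Usage sketch (illustrative): embed a document and a query with
# KhaoManee, compare them with cosine similarity, then tokenize text.
import numpy
from langchain_core.documents import Document
from purrfectmeow.plort.base import KhaoManee

doc = Document(page_content="Khao Manee cats have white coats.")

# Document embeddings: expected shape (n_documents, embedding_dim).
doc_vectors = KhaoManee.get_embeddings(doc)

# Query embedding: expected shape (embedding_dim,).
query_vector = KhaoManee.get_query_embeddings(query="What colour is a Khao Manee?")

# Hypothetical helper, not part of purrfectmeow: plain cosine similarity.
def cosine_similarity(a: numpy.ndarray, b: numpy.ndarray) -> float:
    return float(numpy.dot(a, b) / (numpy.linalg.norm(a) * numpy.linalg.norm(b)))

print(cosine_similarity(doc_vectors[0], query_vector))

# Tokenization via the default 'pythainlp' engine.
print(KhaoManee.get_tokens("Hello world", engine="pythainlp"))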