# Source code for purrfectmeow.taeng.base

from typing import Any, Dict, List, Optional

from langchain_core.documents import Document
from sentence_transformers import SentenceTransformer
from transformers import PreTrainedModel, PreTrainedTokenizerBase

from purrfectmeow.taeng.file_metadata import MetadataFile
from purrfectmeow.taeng.model_loader import LoadingModel
from purrfectmeow.taeng.template_doc import DocTemplate

[docs] class Suphalaks: """ A class for handling files, loading models, and creating document templates. This class consolidates methods from `LoadingModel`, `DocTemplate` and `MetadataFile` to perform common operations such as saving/removing files, retrieving models and tokenizers, extracting file metadata, and creating structured LangChain document templates. """
[docs] @staticmethod def get_tokenizer(model_name: str = None) -> PreTrainedTokenizerBase: """ Retrieve a Hugging Face tokenizer by model name. Parameters ---------- model_name : str, optional The name of the model. If None, a default is used. Returns ------- PreTrainedTokenizerBase The tokenizer corresponding to the specified model. Examples -------- >>> tokenizer = Suphalaks.get_tokenizer('bert-base-uncased') """ model_name = model_name or "intfloat/multilingual-e5-large-instruct" return LoadingModel.get_hf_tokenizer(model_name)
[docs] @staticmethod def get_model_hf(model_name: str = None) -> PreTrainedModel: """ Retrieve a Hugging Face model by model name. Parameters ---------- model_name : str, optional The name of the model. If None, a default is used. Returns ------- PreTrainedModel The loaded Hugging Face model. Examples -------- >>> model = Suphalaks.get_model_hf('bert-base-uncased') """ model_name = model_name or "intfloat/multilingual-e5-large-instruct" return LoadingModel.get_hf_model(model_name)
[docs] @staticmethod def get_model_st(model_name: str = None) -> SentenceTransformer: """ Retrieve a SentenceTransformer model by model name. Parameters ---------- model_name : str, optional The name of the model. If None, a default is used. Returns ------- SentenceTransformer The loaded SentenceTransformer model. Examples -------- >>> st_model = Suphalaks.get_model_st('all-MiniLM-L6-v2') """ model_name = model_name or "intfloat/multilingual-e5-large-instruct" return LoadingModel.get_st_model(model_name)
[docs] @staticmethod def get_file_metadata(file_path: str) -> Dict: """ Extract metadata from a file including size, timestamps, and type. Parameters ---------- file_path : str The path to the file. Returns ------- Dict A dictionary containing metadata such as size, creation date, modification date, and file type. Examples -------- >>> metadata = Suphalaks.get_file_metadata('tmp_dir/example.txt') """ return MetadataFile(file_path).get_metadata()
[docs] @staticmethod def document_template(chunks: List[str], metadata: Dict[str, Any]) -> Document: """ Create a structured LangChain Document object from chunks and metadata. Parameters ---------- chunks : List[str] A list of text chunks. metadata : Dict[str, Any] A dictionary containing metadata associated with the document. Returns ------- Document A structured LangChain `Document` object. Examples -------- >>> chunks = ["This is the first chunk.", "This is the second chunk."] >>> metadata = {"source": "example.txt", "author": "John Doe"} >>> document = Suphalaks.document_template(chunks, metadata) >>> print(document.page_content, document.metadata) """ chunks = chunks or [] metadata = metadata or {} return DocTemplate.create_template(chunks, metadata)