Source code for purrfectmeow.tc01_spl.ocr

import time
from collections.abc import Callable
from typing import Any

from purrfectmeow.meow.kitty import kitty_logger


[docs] class Ocr: _logger = kitty_logger(__name__) _image_type = [ ".apng", ".png", ".avif", ".gif", ".jpg", ".jpeg", ".jfif", ".pjpeg", ".pjp", ".png", ".svg", ".webp", ".bmp", ".ico", ".cur", ".tif", ".tiff", ] @classmethod def _convert(cls, file_path: str, converter: Callable[[str], Any]) -> str: cls._logger.debug(f"Starting conversion for '{file_path}'") start = time.time() try: content = [] match file_path.lower(): case path if path.endswith(".pdf"): from pdf2image import convert_from_path images = convert_from_path(file_path, fmt="png") for idx, image in enumerate(images): try: text = converter(image) cls._logger.debug(f"Text: {text}") content.append(text) cls._logger.debug(f"Page {idx + 1} processed") except Exception as e: cls._logger.exception(f"Page {idx + 1} failed: {e}") raise case path if path.endswith(tuple(cls._image_type)): from PIL import Image image = Image.open(file_path) try: text = converter(image) cls._logger.debug(f"Text: {text}") content.append(text) cls._logger.debug("Page 1 processed") except Exception as e: cls._logger.debug(f"Page 1 failed: {e}") raise cls._logger.debug(f"Successfully converted '{file_path}'") return "\n".join(content) finally: elasped = time.time() - start cls._logger.debug(f"Conversion time spent '{elasped:.2f}' seconds.")
[docs] @classmethod def pytesseract_convert(cls, file_path: str) -> str: cls._logger.debug("Using PyTesseract for Conversion") def converter(image: str) -> Any: import pytesseract return pytesseract.image_to_string(image, lang="tha+eng") return cls._convert(file_path, converter)
[docs] @classmethod def easyocr_convert(cls, file_path: str) -> str: cls._logger.debug("Using EasyOCR for Conversion") def converter(image: str) -> Any: import easyocr import numpy reader = easyocr.Reader(["th", "en"], gpu=False) res = reader.readtext(numpy.array(image)) return "\n".join(text for _, text, _ in res) return cls._convert(file_path, converter)
[docs] @classmethod def suryaocr_convert(cls, file_path: str) -> str: cls._logger.debug("Using SuryaOCR for Conversion") def converter(image: str) -> Any: from surya.detection import DetectionPredictor from surya.recognition import RecognitionPredictor rec_pred = RecognitionPredictor() det_pred = DetectionPredictor() prediction = rec_pred( [image], det_predictor=det_pred, detection_batch_size=1, recognition_batch_size=1, ) return "\n".join(line.text for line in prediction[0].text_lines) return cls._convert(file_path, converter)
[docs] @classmethod def doctr_convert(cls, file_path: str) -> str: cls._logger.debug("Using docTR for Conversion") def converter(image: str) -> str: import os import shutil import tempfile from doctr.io import DocumentFile from doctr.models import ocr_predictor with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp: shutil.copy(image, tmp.name) temp_image_path = tmp.name model = ocr_predictor(pretrained=True) doc = DocumentFile.from_images(temp_image_path) result = model(doc) data = result.export() combined_text = "\n".join( word["value"] for page in data["pages"] for block in page.get("blocks", []) for line in block.get("lines", []) for word in line.get("words", []) if "value" in word ) if os.path.exists(temp_image_path): os.remove(temp_image_path) return combined_text return cls._convert(file_path, converter)