# Source code for label_processing.text_recognition

# Standard-library and third-party imports
from __future__ import annotations
import os
import copy
import cv2
import shutil
import math
import pytesseract as py
import numpy as np
from typing import Union, Tuple, Optional
from deskew import determine_skew
from enum import Enum
from pathlib import Path
import warnings

# Import the necessary module from the 'label_processing' module package
from label_processing import utils

# Constants
# Default `config` string for the Tesseract class (page segmentation / engine mode flags)
CONFIG = r"--psm 6 --oem 3"  # Configuration for OCR
# Default `lang` string for the Tesseract class; language codes are joined with '+'
LANGUAGES = "eng+deu+fra+ita+spa+por"  # Specifying languages used for OCR
# Bounds passed to deskew.determine_skew as min_angle / max_angle in get_skew_angle
MIN_SKEW_ANGLE = -10
MAX_SKEW_ANGLE = 10



def find_tesseract() -> None:
    """
    Locate the tesseract executable and register it with pytesseract.

    The executable is looked up on the PATH with ``shutil.which``. When it is
    found, its path is stored in ``pytesseract.pytesseract.tesseract_cmd``.

    Raises:
        FileNotFoundError: If no ``tesseract`` executable is found on the PATH.
    """
    tesseract_path = shutil.which("tesseract")
    if not tesseract_path:
        # Adjacent literals are concatenated verbatim, so the first one must
        # end with a space to keep the two sentences apart.
        raise FileNotFoundError(
            "Could not find tesseract on your machine! "
            "Please read the README for instructions!"
        )
    py.pytesseract.tesseract_cmd = tesseract_path



# ---------------------Image Preprocessing---------------------#



class ImageProcessor:
    """
    A class for image preprocessing and other image actions.
    """

    def __init__(
        self, image: np.ndarray, path: str, blocksize: int = None, c_value: int = None
    ):
        """
        Initialize an instance of Image class.

        Args:
            image (np.ndarray): The image data as a NumPy array.
            path (str): The path to the image file.
            blocksize (int, optional): The blocksize for thresholding. Defaults to None.
            c_value (int, optional): The c_value for thresholding. Defaults to None.
        """
        self.image = image
        self.path = Path(path)
        self.filename = self.path.name
        self.blocksize: Optional[int] = blocksize
        self.c_value: Optional[int] = c_value

    @property
    def blocksize(self) -> int:
        return self._blocksize

    @blocksize.setter
    def blocksize(self, value: int | None) -> None:
        if value is not None:
            if value <= 1 or value % 2 == 0:
                raise ValueError(
                    "Value for blocksize has to be at least 3 and needs\
                    to be odd"
                )
        self._blocksize = value

    @property
    def c_value(self) -> Optional[int]:
        """Constant handed to adaptive thresholding, or None if unset."""
        return self._c_value

    @c_value.setter
    def c_value(self, value: Optional[int]) -> None:
        # Stored as given, without validation; __init__ passes None by default.
        self._c_value = value

    @property
    def image(self) -> np.ndarray:
        """The image data currently held by this instance."""
        return self._image

    @image.setter
    def image(self, image: np.ndarray) -> None:
        # Stored by reference; no copy or validation is performed here.
        self._image = image

    @property
    def path(self) -> str | Path:
        """Path of the image file (``__init__`` stores a ``pathlib.Path`` here)."""
        return self._path

    @path.setter
    def path(self, path: str | Path) -> None:
        # Stored as given; note that `filename` is not refreshed by this setter.
        self._path = path


    def copy_this(self) -> ImageProcessor:
        """
        Creates a copy of the current Image instance.

        Returns:
            ImageProcessor: A copy of the current Image instance.
        """
        return copy.copy(self)



    @staticmethod
    def read_image(path: str | Path) -> ImageProcessor:
        """
        Load an image file with OpenCV and wrap it in an ImageProcessor.

        Args:
            path (str | Path): Location of the image file to load.

        Returns:
            ImageProcessor: A new instance holding the loaded pixels and ``path``.
        """
        # NOTE(review): per the OpenCV docs, imread returns None instead of
        # raising when the file cannot be read - confirm callers handle that.
        pixels = cv2.imread(str(path))
        return ImageProcessor(pixels, path)



    def get_grayscale(self) -> ImageProcessor:
        """
        Convert the image to grayscale.

        The image is treated as BGR, which is the channel order produced by
        ``cv2.imread`` (used by ``read_image``) and the order ``get_skew_angle``
        already assumes. Converting with an RGB code would swap the red and
        blue luminance weights.

        Returns:
            ImageProcessor: A copy of this instance holding the grayscale image.
        """
        image = cv2.cvtColor(self.image, cv2.COLOR_BGR2GRAY)
        image_instance = self.copy_this()
        image_instance.image = image
        return image_instance



    def blur(self, ksize: tuple[int, int] = (5, 5)) -> ImageProcessor:
        """
        Smooth the image with a Gaussian filter.

        Args:
            ksize (tuple[int, int], optional): Kernel size forwarded to
                ``cv2.GaussianBlur``. Defaults to (5, 5).

        Returns:
            ImageProcessor: A copy of this instance holding the blurred image.
        """
        blurred = self.copy_this()
        # sigmaX is 0, so OpenCV is left to choose the sigma itself.
        blurred.image = cv2.GaussianBlur(self.image, ksize, 0)
        return blurred



    def remove_noise(self) -> ImageProcessor:
        """
        Reduce noise by applying a median filter with aperture size 5.

        Returns:
            ImageProcessor: A copy of this instance holding the filtered image.
        """
        denoised = self.copy_this()
        denoised.image = cv2.medianBlur(self.image, 5)
        return denoised



    def apply_clahe(
        self, clip_limit: float = 2.0, tile_grid_size: tuple[int, int] = (8, 8)
    ) -> ImageProcessor:
        """
        Enhance local contrast with CLAHE.

        Contrast Limited Adaptive Histogram Equalization is intended for images
        with uneven illumination or low contrast, such as aged specimen labels.
        preprocessing() calls this after get_grayscale(); presumably a
        single-channel image is expected - TODO confirm.

        Args:
            clip_limit (float, optional): Contrast-limiting threshold forwarded
                as ``clipLimit``. Defaults to 2.0.
            tile_grid_size (tuple[int, int], optional): Grid size forwarded as
                ``tileGridSize``. Defaults to (8, 8).

        Returns:
            ImageProcessor: A copy of this instance holding the equalized image.
        """
        equalizer = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid_size)
        enhanced = self.copy_this()
        enhanced.image = equalizer.apply(self.image)
        return enhanced



    def normalize_illumination(self) -> ImageProcessor:
        """
        Even out the lighting of the image with a morphological background estimate.

        The background is estimated by a morphological closing with a 19x19
        elliptical kernel. The image is then subtracted from that estimate and
        the difference is min-max scaled to the 0-255 range.

        NOTE(review): the result is ``background - image``, so regions darker
        than their surroundings come out bright - confirm this polarity is what
        the later thresholding step expects.

        Returns:
            ImageProcessor: A copy of this instance holding the normalized image.
        """
        structuring_element = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (19, 19))
        estimated_background = cv2.morphologyEx(
            self.image, cv2.MORPH_CLOSE, structuring_element
        )

        # Difference to the background estimate, stretched to the full 0-255 range
        difference = cv2.subtract(estimated_background, self.image)
        stretched = cv2.normalize(difference, None, 0, 255, cv2.NORM_MINMAX)

        normalized = self.copy_this()
        normalized.image = stretched
        return normalized



    def thresholding(self, thresh_mode: Enum) -> ImageProcessor:
        """
        Perform thresholding on the image.

        For the adaptive modes, ``self.blocksize`` and ``self.c_value`` are used
        when they are set (not None). Otherwise the per-mode defaults apply:
        73 / 16 for ADAPTIVE_GAUSSIAN and 35 / 17 for ADAPTIVE_MEAN.

        Args:
            thresh_mode (Threshmode): The thresholding mode to use
                (OTSU, ADAPTIVE_MEAN, or ADAPTIVE_GAUSSIAN).

        Returns:
            ImageProcessor: A copy of this instance holding the thresholded image.

        Raises:
            ValueError: If ``thresh_mode`` is not one of the supported modes.
        """
        if thresh_mode == Threshmode.OTSU:
            image = cv2.threshold(
                self.image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
            )[1]
        elif thresh_mode == Threshmode.ADAPTIVE_GAUSSIAN:
            # Compare against None so an explicit c_value of 0 is honoured
            # instead of being replaced by the default.
            gaussian_blocksize = 73 if self.blocksize is None else self.blocksize
            gaussian_c = 16 if self.c_value is None else self.c_value

            image = cv2.adaptiveThreshold(
                self.image,
                255,
                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY,
                gaussian_blocksize,
                gaussian_c,
            )
        elif thresh_mode == Threshmode.ADAPTIVE_MEAN:
            mean_blocksize = 35 if self.blocksize is None else self.blocksize
            mean_c = 17 if self.c_value is None else self.c_value

            image = cv2.adaptiveThreshold(
                self.image,
                255,
                cv2.ADAPTIVE_THRESH_MEAN_C,
                cv2.THRESH_BINARY,
                mean_blocksize,
                mean_c,
            )
        else:
            # Without this branch `image` would be unbound below and the caller
            # would see an UnboundLocalError with no hint about the cause.
            raise ValueError(f"Unsupported thresholding mode: {thresh_mode!r}")

        image_instance = self.copy_this()
        image_instance.image = image
        return image_instance



    def dilate(self) -> ImageProcessor:
        """
        Apply one iteration of morphological dilation with a 5x5 kernel of ones.

        Returns:
            ImageProcessor: A copy of this instance holding the dilated image.
        """
        structuring_element = np.ones((5, 5), np.uint8)
        dilated = self.copy_this()
        dilated.image = cv2.dilate(self.image, structuring_element, iterations=1)
        return dilated



    def erode(self) -> ImageProcessor:
        """
        Apply one iteration of morphological erosion with a 5x5 kernel of ones.

        Returns:
            ImageProcessor: A copy of this instance holding the eroded image.
        """
        structuring_element = np.ones((5, 5), np.uint8)
        eroded = self.copy_this()
        eroded.image = cv2.erode(self.image, structuring_element, iterations=1)
        return eroded


    @staticmethod
    def _rotate(
        image: np.ndarray,
        angle: float | np.floating,
        background: Union[int, Tuple[int, int, int]],
    ) -> np.ndarray:
        """
        Rotate an image by the given angle, enlarging the canvas to fit the result.

        Args:
            image (np.ndarray): Image loaded in with OpenCV.
            angle (float): Angle in degrees with which the picture should be rotated.
            background (Union[int, Tuple[int, int, int]]): Color used to fill the
                border areas exposed by the rotation (passed as ``borderValue``).

        Returns:
            np.ndarray: Rotated image.
        """
        # NOTE: image.shape[:2] is (rows, cols), so `old_width` actually holds the
        # row count and `old_height` the column count. Every expression below uses
        # the swapped names consistently, so do not "fix" one name in isolation.
        old_width, old_height = image.shape[:2]
        angle_radian = math.radians(angle)
        # Extents of the bounding box of the rotated image (same swapped naming)
        width = abs(np.sin(angle_radian) * old_height) + abs(
            np.cos(angle_radian) * old_width
        )
        height = abs(np.sin(angle_radian) * old_width) + abs(
            np.cos(angle_radian) * old_height
        )

        # shape[1::-1] yields (cols, rows), i.e. the (x, y) order OpenCV expects
        image_center = tuple(np.array(image.shape[1::-1]) / 2)
        rot_mat = cv2.getRotationMatrix2D(image_center, angle, 1.0)
        # Shift the translation terms so the result is centered on the larger canvas
        rot_mat[1, 2] += (width - old_width) / 2
        rot_mat[0, 2] += (height - old_height) / 2
        return cv2.warpAffine(
            image,
            rot_mat,
            (int(round(height)), int(round(width))),
            borderValue=background,
        )


    def get_skew_angle(self) -> Optional[np.float64]:
        """
        Estimate the skew angle of the image.

        The image is converted from BGR to grayscale and handed to
        ``deskew.determine_skew``, restricted to the range
        MIN_SKEW_ANGLE..MAX_SKEW_ANGLE.

        Returns:
            Optional[np.float64]: The skew angle in degrees or None if it couldn't be determined.
        """
        gray = cv2.cvtColor(self.image, cv2.COLOR_BGR2GRAY)
        return determine_skew(gray, max_angle=MAX_SKEW_ANGLE, min_angle=MIN_SKEW_ANGLE)



    def deskew(self, angle: Optional[np.float64]) -> ImageProcessor:
        """
        Rotate the image to deskew it.

        Args:
            angle (Optional[np.float64]): The skew angle to use for deskewing.

        Returns:
            ImageProcessor: An instance representing the deskewed image.
        """
        if angle is None:
            # Handle the case where angle is None, e.g., log a message or skip deskewing
            print(
                f"Warning: Skew angle for file {self.filename} could not be determined. Skipping deskewing."
            )
            return self

        # If angle is not None, proceed with deskewing
        image = self._rotate(self.image, angle, (255, 255, 255))
        image_instance = self.copy_this()
        image_instance.image = image
        return image_instance



    def preprocessing(
        self,
        thresh_mode: Threshmode,
        use_clahe: bool = False,
        normalize_illum: bool = False,
        clahe_clip_limit: float = 2.0,
        clahe_tile_grid_size: tuple[int, int] = (8, 8),
    ) -> ImageProcessor:
        """
        Run the full preprocessing pipeline on the image.

        Steps, in order: grayscale conversion, optional illumination
        normalization, optional CLAHE, Gaussian blur, thresholding, deskewing.
        The skew angle is measured on the untouched image first. If it cannot
        be determined, a warning is printed and the instance is returned
        without any processing.

        Args:
            thresh_mode (Threshmode): The thresholding mode to use (OTSU, ADAPTIVE_MEAN, or ADAPTIVE_GAUSSIAN).
            use_clahe (bool, optional): Apply CLAHE for contrast enhancement. Defaults to False.
            normalize_illum (bool, optional): Apply illumination normalization. Defaults to False.
            clahe_clip_limit (float, optional): CLAHE contrast limiting threshold. Defaults to 2.0.
            clahe_tile_grid_size (tuple[int, int], optional): CLAHE grid size. Defaults to (8, 8).

        Returns:
            ImageProcessor: The preprocessed image, or this unchanged instance
            when no skew angle could be determined.
        """
        # The angle must come from the original image, before any filtering
        skew = self.get_skew_angle()
        if skew is None:
            print(
                "Warning: Skew angle could not be determined. Skipping preprocessing."
            )
            return self

        processed = self.get_grayscale()
        if normalize_illum:
            processed = processed.normalize_illumination()
        if use_clahe:
            processed = processed.apply_clahe(
                clip_limit=clahe_clip_limit, tile_grid_size=clahe_tile_grid_size
            )

        # Each step returns a new instance, so the remaining steps can be chained
        return processed.blur().thresholding(thresh_mode=thresh_mode).deskew(skew)


    # ---------------------Read QR-Code---------------------#


    def read_qr_code(self) -> Optional[str]:
        """
        Tries to identify if a picture has a QR-code and then reads and returns it.

        Returns:
            Optional[str]: Decoded QR-code text as a str or None if there is no QR-code found.
        """
        try:
            detect = cv2.QRCodeDetector()
            value = detect.detectAndDecode(self.image)[0]
            return value if value else None
        except Exception as e:
            print(f"An error occurred while detecting and decoding QR code: {e}")
            return None



    def save_image(self, dir_path: str | Path, appendix: Optional[str] = None) -> None:
        """
        Write the image into ``dir_path``.

        With an ``appendix`` the file name is built by
        ``utils.generate_filename`` (extension "jpg"); otherwise the original
        file name is reused. Any exception is caught and reported with
        ``print`` rather than propagated.

        Args:
            dir_path (str | Path): The directory the image is written to.
            appendix (str, optional): Text used to derive a new file name. Defaults to None.
        """
        try:
            target_name = (
                utils.generate_filename(self.filename, appendix, extension="jpg")
                if appendix
                else self.filename
            )
            cv2.imwrite(os.path.join(dir_path, target_name), self.image)
        except Exception as e:
            print(f"An error occurred while saving the image: {e}")





class Threshmode(Enum):
    """
    The thresholding strategies understood by ImageProcessor.thresholding.

    Members:
        OTSU (1): Global threshold chosen with Otsu's method.
        ADAPTIVE_MEAN (2): Adaptive threshold based on the neighbourhood mean.
        ADAPTIVE_GAUSSIAN (3): Adaptive threshold based on a Gaussian-weighted
            neighbourhood.
    """

    OTSU = 1
    ADAPTIVE_MEAN = 2
    ADAPTIVE_GAUSSIAN = 3

    @classmethod
    def eval(cls, threshmode: int) -> Optional[Threshmode]:
        """
        Map an integer code to the matching member.

        Args:
            threshmode (int): 1, 2 or 3.

        Returns:
            Optional[Threshmode]: The member whose value equals ``threshmode``,
            or None when no member matches.
        """
        for member in cls:
            if threshmode == member.value:
                return member
        # Unknown codes yield None rather than raising
        return None




# ---------------------OCR Tesseract---------------------#



class Tesseract:
    """
    Runs pytesseract OCR on the image held by an ImageProcessor.
    """

    def __init__(
        self, languages=LANGUAGES, config=CONFIG, image: Optional[ImageProcessor] = None
    ) -> None:
        """
        Initialize the Tesseract OCR processor.

        Args:
            languages (str, optional): Languages passed to pytesseract as ``lang``. Defaults to LANGUAGES.
            config (str, optional): Additional custom configuration flags passed to pytesseract as ``config``. Defaults to CONFIG.
            image (ImageProcessor, optional): The image to process. Defaults to None.
        """
        self.config = config
        self.languages = languages
        self.image = image

    @property
    def image(self) -> Optional[ImageProcessor]:
        """The ImageProcessor to run OCR on, or None if none has been set."""
        return self._image

    @image.setter
    def image(self, img: Optional[ImageProcessor]) -> None:
        self._image = img

    @staticmethod
    def _process_string(result_raw: str) -> str:
        """
        Flatten OCR output onto a single line by replacing newlines with spaces.

        Args:
            result_raw (str): Raw string from pytesseract output.

        Returns:
            str: The string with every '\\n' replaced by a space.
        """
        return result_raw.replace("\n", " ")

    def image_to_string(self) -> dict[str, str | float]:
        """
        Run OCR on the current image and summarize the result.

        Only entries with a confidence above 0 contribute to the transcript
        and to the mean confidence.

        Returns:
            dict[str, str | float]: A dictionary with the keys "ID" (the image
            filename), "text" (the transcript) and "confidence" (mean
            confidence scaled to 0-1 and rounded to 3 decimals; 0.0 when no
            entry qualified).
        """
        data = py.image_to_data(
            self.image.image,
            lang=self.languages,
            config=self.config,
            output_type=py.Output.DICT,
        )

        text_parts = []
        confidences = []
        for word, raw_conf in zip(data["text"], data["conf"]):
            # NOTE(review): confidences may arrive as ints, floats or
            # float-formatted strings such as "96.06" depending on the
            # pytesseract/Tesseract versions - confirm. Going through float()
            # accepts all of these, and int() truncates as before.
            conf = int(float(raw_conf))
            if conf > 0:
                text_parts.append(word)
                confidences.append(conf)

        transcript = self._process_string(" ".join(text_parts))

        # Confidences are on a 0-100 scale; report the mean on a 0-1 scale
        mean_confidence = (
            (sum(confidences) / len(confidences) / 100.0) if confidences else 0.0
        )

        return {
            "ID": self.image.filename,
            "text": transcript,
            "confidence": round(mean_confidence, 3),
        }