# Source code for label_processing.ocr_vision

# Import standard-library and third-party libraries
from __future__ import annotations
import io
import os
from google.cloud import vision

# Import the utilities module from the 'label_processing' package
import label_processing.utils



[docs]
class VisionApi:
    """
    Class for interacting with the Google Cloud Vision API for OCR tasks on images.
    """

    def __init__(
        self, path: str, image: bytes, credentials: str, encoding: str
    ) -> None:
        """
        Initialize the VisionApi instance.

        Args:
            path (str): Path to the image file.
            image (bytes): Image content in bytes.
            credentials (str): Path to the credentials JSON file.
            encoding (str): Encoding for the result ('ascii' or 'utf8').
        """
        self.image = image
        self.path = path
        self.encoding = encoding
        self.credentials = credentials
        self.client = self._initialize_client(credentials)

    @staticmethod
    def _initialize_client(credentials: str) -> vision.ImageAnnotatorClient:
        """
        SECURITY: Initialize the Google Vision API client with secure credential handling.

        Args:
            credentials (str): Path to the credentials JSON file.

        Returns:
            vision.ImageAnnotatorClient: Initialized Google Vision API client.

        Raises:
            Exception: If credentials file is not secure or accessible.
        """
        # SECURITY: Validate credentials file exists and has proper permissions
        if not os.path.exists(credentials):
            raise Exception(
                f"SECURITY ERROR: Credentials file not found: {credentials}"
            )

        # SECURITY: Check file permissions (should not be world-readable)
        file_stat = os.stat(credentials)
        if file_stat.st_mode & 0o044:  # Check if group or other readable
            print(
                f"SECURITY WARNING: Credentials file {credentials} has overly permissive permissions"
            )

        # SECURITY: Temporarily set environment variable, then clear it
        original_creds = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
        try:
            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials
            client = vision.ImageAnnotatorClient()

            # SECURITY: Clear credentials from environment immediately after client creation
            if original_creds is None:
                os.environ.pop("GOOGLE_APPLICATION_CREDENTIALS", None)
            else:
                os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = original_creds

            return client
        except Exception as e:
            # SECURITY: Ensure credentials are cleared even if client creation fails
            if original_creds is None:
                os.environ.pop("GOOGLE_APPLICATION_CREDENTIALS", None)
            else:
                os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = original_creds
            raise Exception(
                f"SECURITY ERROR: Failed to initialize Google Vision client: {e}"
            )


[docs]
    @staticmethod
    def read_image(path: str, credentials: str, encoding: str = "utf8") -> VisionApi:
        """
        Read an image file and return an instance of the VisionApi class.

        Args:
            path (str): Path to the image file.
            credentials (str): Path to the credentials JSON file.
            encoding (str, optional): Encoding for the result ('ascii' or 'utf8'). Defaults to 'utf8'.

        Returns:
            VisionApi: Instance of the VisionApi class.
        """
        with io.open(path, "rb") as image_file:
            image = image_file.read()
        return VisionApi(path, image, credentials, encoding)



[docs]
    def process_string(self, result_raw: str) -> str:
        """
        Process the Google Vision OCR output, replacing newlines with spaces and encoding as specified.

        Args:
            result_raw (str): Raw output string directly from Google Vision.

        Returns:
            str: Processed string.
        """
        processed = result_raw.replace("\n", " ")
        if self.encoding == "ascii":
            processed = processed.encode("ascii", "ignore").decode()
        return processed



[docs]
    def vision_ocr(self) -> dict[str, object]:
        """
        Perform the actual API call, handle errors, and return the processed transcription.

        Raises:
            Exception: If the API response carries an error message.

        Returns:
            dict[str, object]: Entry with three keys:
                "ID" (the image file name), "text" (the processed transcript,
                or a single space when nothing was detected) and
                "bounding_boxes" (one dict per annotation, mapping the
                annotation text to its list of vertex coordinates). If the
                text matches the NURI format check, the entry is passed
                through ``label_processing.utils.replace_nuri`` first.
        """
        vision_image = vision.Image(content=self.image)
        response = self.client.text_detection(image=vision_image)

        # Check for an API-level error before touching the payload, so a
        # failed request is reported instead of being processed as "no text".
        if response.error.message:
            raise Exception(
                f"{response.error.message}\nFor more info on error messages, "
                "check:  https://cloud.google.com/apis/design/errors"
            )

        annotations = list(response.text_annotations)

        # One entry per annotation: its text mapped to "(x,y)" vertex strings.
        bounding_boxes = [
            {
                annotation.description: [
                    {"word": f"({vertex.x},{vertex.y})"}
                    for vertex in annotation.bounding_poly.vertices
                ]
            }
            for annotation in annotations
        ]

        # The first annotation is taken as the transcript of the whole image;
        # a single space stands in when nothing was detected.
        if annotations:
            text = self.process_string(str(annotations[0].description))
        else:
            text = " "

        filename = os.path.basename(self.path)
        entry = {"ID": filename, "text": text, "bounding_boxes": bounding_boxes}
        if label_processing.utils.check_nuri_format(text):
            entry = label_processing.utils.replace_nuri(entry)
        return entry