from __future__ import annotations

import io
import os
from typing import Any

# Import third-party libraries
from google.cloud import vision

# Import the necessary module from the 'label_processing' module package
import label_processing.utils
class VisionApi:
    """
    Class for interacting with the Google Cloud Vision API for OCR tasks on images.

    An instance holds the raw image bytes, the path they were read from, the
    requested output encoding, the credentials path, and a Vision client built
    from those credentials.
    """

    def __init__(
        self, path: str, image: bytes, credentials: str, encoding: str
    ) -> None:
        """
        Initialize the VisionApi instance.

        Args:
            path (str): Path to the image file.
            image (bytes): Image content in bytes.
            credentials (str): Path to the credentials JSON file.
            encoding (str): Encoding for the result ('ascii' or 'utf8').

        Raises:
            Exception: If the Vision client cannot be created from
                ``credentials`` (see ``_initialize_client``).
        """
        self.image = image
        self.path = path
        self.encoding = encoding
        self.credentials = credentials
        self.client = self._initialize_client(credentials)

    @staticmethod
    def _initialize_client(credentials: str) -> vision.ImageAnnotatorClient:
        """
        SECURITY: Initialize the Google Vision API client with secure credential handling.

        The credentials path is exposed through the
        ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable only while the
        client is being constructed; the previous value (or absence) of the
        variable is restored afterwards, whether construction succeeds or not.

        Args:
            credentials (str): Path to the credentials JSON file.

        Returns:
            vision.ImageAnnotatorClient: Initialized Google Vision API client.

        Raises:
            Exception: If the credentials file cannot be found, or if the
                client cannot be created.
        """
        # SECURITY: a single stat() both proves the file is reachable and
        # yields its mode bits, avoiding a separate exists()/stat() race.
        try:
            file_stat = os.stat(credentials)
        except (OSError, ValueError) as err:
            raise Exception(
                f"SECURITY ERROR: Credentials file not found: {credentials}"
            ) from err

        # SECURITY: 0o044 = group-read (0o040) | other-read (0o004).
        # This only warns; it does not block client creation.
        if file_stat.st_mode & 0o044:
            print(
                f"SECURITY WARNING: Credentials file {credentials} has overly permissive permissions"
            )

        env_key = "GOOGLE_APPLICATION_CREDENTIALS"
        original_creds = os.environ.get(env_key)
        try:
            # SECURITY: Temporarily set environment variable for client creation.
            os.environ[env_key] = credentials
            return vision.ImageAnnotatorClient()
        except Exception as e:
            raise Exception(
                f"SECURITY ERROR: Failed to initialize Google Vision client: {e}"
            ) from e
        finally:
            # SECURITY: Always put the environment back the way it was found,
            # on success and on failure alike.
            if original_creds is None:
                os.environ.pop(env_key, None)
            else:
                os.environ[env_key] = original_creds

    @staticmethod
    def read_image(path: str, credentials: str, encoding: str = "utf8") -> VisionApi:
        """
        Read an image file and return an instance of the VisionApi class.

        Args:
            path (str): Path to the image file.
            credentials (str): Path to the credentials JSON file.
            encoding (str, optional): Encoding for the result ('ascii' or 'utf8').
                Defaults to 'utf8'.

        Returns:
            VisionApi: Instance of the VisionApi class.

        Raises:
            OSError: If the image file cannot be opened or read.
            Exception: If the Vision client cannot be created.
        """
        with open(path, "rb") as image_file:
            image = image_file.read()
        return VisionApi(path, image, credentials, encoding)

    def process_string(self, result_raw: str) -> str:
        """
        Process the Google Vision OCR output, replacing newlines with spaces
        and encoding as specified.

        Args:
            result_raw (str): Raw output string directly from Google Vision.

        Returns:
            str: Processed string. When ``self.encoding`` is ``"ascii"``,
            every non-ASCII character is dropped; for any other value the
            text is returned with only the newline replacement applied.
        """
        processed = result_raw.replace("\n", " ")
        if self.encoding == "ascii":
            processed = processed.encode("ascii", "ignore").decode()
        return processed

    def vision_ocr(self) -> dict[str, Any]:
        """
        Perform the actual API call, handle errors, and return the processed transcription.

        Raises:
            Exception: If the API response carries an error message.

        Returns:
            dict[str, Any]: Dictionary with the keys ``"ID"`` (base name of
            the image path), ``"text"`` (processed description of the first
            annotation, or ``" "`` when there are no annotations) and
            ``"bounding_boxes"`` (one ``{description: [{"word": "(x,y)"}, ...]}``
            mapping per annotation). If the text matches the NURI format as
            judged by ``label_processing.utils.check_nuri_format``, the entry
            is passed through ``label_processing.utils.replace_nuri`` first.
        """
        vision_image = vision.Image(content=self.image)
        response = self.client.text_detection(image=vision_image)

        # Fail fast: do not process annotations of a response that reports an error.
        if response.error.message:
            raise Exception(
                f"{response.error.message}\nFor more info on error messages, "
                "check: https://cloud.google.com/apis/design/errors"
            )

        annotations = list(response.text_annotations)

        bounding_boxes = []
        for annotation in annotations:
            vertices = [
                {"word": f"({vertex.x},{vertex.y})"}
                for vertex in annotation.bounding_poly.vertices
            ]
            bounding_boxes.append({annotation.description: vertices})

        # Only the first annotation's description is used as the transcript.
        if annotations:
            transcript = self.process_string(str(annotations[0].description))
        else:
            transcript = " "

        entry = {
            "ID": os.path.basename(self.path),
            "text": transcript,
            "bounding_boxes": bounding_boxes,
        }
        if label_processing.utils.check_nuri_format(entry["text"]):
            entry = label_processing.utils.replace_nuri(entry)
        return entry