Source code for label_postprocessing.ocr_postprocessing

# Import standard-library and third-party libraries
import json
import nltk
from nltk import word_tokenize

nltk.download("punkt", quiet=True)
import string
import pandas as pd
import re
from typing import List, Dict

# Constants
# Matches one non-ASCII character that has a literal space on each side; the
# two surrounding spaces are part of the match. Non-ASCII characters that are
# directly adjacent to other characters are NOT matched by this pattern.
NON_ASCII = re.compile(" [^\x00-\x7f] ")
# Matches a run of two or more consecutive characters that are neither ASCII
# letters, digits (`\d`), nor whitespace (`\s`). A single such character on
# its own is not matched.
NON_ALPHA_NUM = re.compile(r"[^a-zA-Z\d\s]{2,}")
# Matches a single pipe character.
PIPE = re.compile("[|]")



[docs]
def count_mean_token_length(tokens: List[str]) -> float:
    """
    Calculates the mean length of tokens in a list.

    Args:
        tokens (list): List of tokens.

    Returns:
        float: Mean token length rounded to two decimal places, or 0.0 when
            the list is empty.
    """
    if not tokens:
        # Return a float (not the int 0) so the result type matches the
        # annotation; this also avoids a division by zero below.
        return 0.0
    total_length = sum(map(len, tokens))
    return round(total_length / len(tokens), 2)




[docs]
def is_plausible_prediction(transcript: str) -> bool:
    """
    Checks if a transcript is a plausible prediction based on the average token length.

    The transcript is tokenized with ``word_tokenize``, tokens consisting only of
    punctuation characters are discarded, and the transcript is considered
    plausible when the mean length of the remaining tokens is at least 2.

    Args:
        transcript (str): Input transcript.

    Returns:
        bool: True if the transcript is plausible, False otherwise (including
            when tokenization raises an error, which is printed).
    """
    try:
        tokens = word_tokenize(transcript)
        punctuation_chars = set(string.punctuation)
        # Check character by character: a plain `token in string.punctuation`
        # is a substring test, so multi-character punctuation tokens such as
        # "..." (or quote tokens the tokenizer may emit) would be kept as words.
        tokens_no_punct = [
            token
            for token in tokens
            if not all(char in punctuation_chars for char in token)
        ]
        return count_mean_token_length(tokens_no_punct) >= 2
    except Exception as e:
        # Best-effort: any failure is reported and treated as "not plausible".
        print(f"Error checking plausible prediction: {e}")
        return False




[docs]
def correct_transcript(transcript: str) -> str:
    """
    Cleans up a transcript by applying a fixed sequence of removals.

    In order: every match of NON_ASCII is replaced by a single space, every
    match of NON_ALPHA_NUM and of PIPE is removed, the degree symbol,
    apostrophes and commas are removed, and finally trailing periods are
    stripped from the end of the string.

    Args:
        transcript (str): Input transcript.

    Returns:
        str: Corrected transcript. If an error occurs, it is printed and the
            transcript is returned as it was at that point.
    """
    try:
        # Pattern-based clean-up, in the same order as listed above
        cleaned = NON_ASCII.sub(" ", transcript)
        cleaned = NON_ALPHA_NUM.sub("", cleaned)
        cleaned = PIPE.sub("", cleaned)

        # Drop individual unwanted symbols one at a time
        for symbol in ("°", "'", ","):
            cleaned = cleaned.replace(symbol, "")

        # Strip every period at the end of the string
        return cleaned.rstrip(".")
    except Exception as e:
        print(f"Error correcting transcript: {e}")
        return transcript




[docs]
def is_nuri(transcript: str) -> bool:
    """
    Tells whether a transcript is treated as a Nuri, i.e. begins with "http".

    The comparison is case-sensitive and no leading whitespace is skipped.

    Args:
        transcript (str): Input transcript.

    Returns:
        bool: True if the transcript begins with "http", False otherwise.
    """
    nuri_prefix = "http"
    return transcript.startswith(nuri_prefix)




[docs]
def is_empty(transcript: str) -> bool:
    """
    Tells whether a transcript contains nothing but whitespace.

    Args:
        transcript (str): Input transcript.

    Returns:
        bool: True if nothing remains after stripping leading and trailing
            whitespace, False otherwise.
    """
    stripped = transcript.strip()
    return not stripped




[docs]
def save_transcripts(transcripts: Dict, file_name: str) -> None:
    """
    Writes a dictionary of transcripts to a CSV file.

    The dictionary is turned into a DataFrame with one row per key (the keys
    become the index) before being written. Any error is printed, not raised.

    Args:
        transcripts (dict): Dictionary of transcripts.
        file_name (str): Name of the output CSV file.
    """
    try:
        frame = pd.DataFrame.from_dict(transcripts, orient="index")
        frame.to_csv(file_name)
    except Exception as e:
        print(f"Error saving transcripts to CSV: {e}")




[docs]
def save_json(transcripts: List[Dict], file_name: str) -> None:
    """
    Writes transcripts to a JSON file, indented by four spaces.

    Any error while opening the file or serializing is printed, not raised.

    Args:
        transcripts (list): List of transcripts.
        file_name (str): Name of the output JSON file.
    """
    try:
        with open(file_name, "w") as json_handle:
            json.dump(transcripts, json_handle, indent=4)
    except Exception as err:
        print(f"Error saving JSON file: {err}")




[docs]
def process_ocr_output(ocr_output: str) -> None:
    """
    Processes OCR output, categorizing and saving transcripts based on Nuri, empty, plausible, and corrected.

    The input file is read as JSON and iterated; each entry's "text" and "ID"
    values are looked up and the entry is put into the first matching category:

    * Nuri (text starts with "http")  -> "nuris.csv"
    * empty text                      -> "empty_transcripts.csv"
    * plausible text                  -> "plausible_transcripts.json" (as is)
      and "corrected_transcripts.json" (after ``correct_transcript``)

    Entries matching none of the categories are not written anywhere. All
    output files are written relative to the current working directory. Any
    error is printed, not raised.

    Args:
        ocr_output (str): OCR output file path.
    """
    try:
        nuri_labels, empty_labels, plausible_labels, clean_labels = {}, {}, [], []

        # JSON text is UTF-8 by specification; decode it explicitly instead of
        # relying on the platform's locale-dependent default encoding.
        with open(ocr_output, "r", encoding="utf-8") as f:
            labels = json.load(f)

        for label in labels:
            # Treat a JSON null (or otherwise falsy) text like a missing one so
            # a single bad entry does not abort the whole run.
            text = label.get("text") or ""
            label_id = label.get("ID", "")

            if is_nuri(text):
                nuri_labels[label_id] = text
            elif is_empty(text):
                empty_labels[label_id] = ""
            elif is_plausible_prediction(text):
                plausible_labels.append({"ID": label_id, "text": text})
                clean_labels.append({"ID": label_id, "text": correct_transcript(text)})

        save_transcripts(nuri_labels, "nuris.csv")
        save_transcripts(empty_labels, "empty_transcripts.csv")
        save_json(plausible_labels, "plausible_transcripts.json")
        save_json(clean_labels, "corrected_transcripts.json")
    except Exception as e:
        # Best-effort top-level boundary: report the failure instead of raising.
        print(f"Error processing OCR output: {e}")