Source code for label_evaluation.redundancy

# Import standard-library module (warnings is part of the stdlib, not third-party)
import warnings

# Install a global "ignore" filter: every warning category is silenced from
# the moment this module is imported (module-level side effect).
warnings.filterwarnings("ignore")



[docs]
def clean_data(data: list[dict]) -> list[dict]:
    """
    Normalize the transcription text of each entry for duplicate detection.

    For every entry that has a string under the ``"text"`` key, the text is
    lowercased and reduced to its alphanumeric characters (punctuation and
    all whitespace, including tabs and newlines, are dropped). Entries whose
    normalized text contains ``"http"`` are excluded, as are entries without
    a ``"text"`` key or whose ``"text"`` value is not a string.

    The input dictionaries are never modified: each kept entry is returned
    as a shallow copy carrying the normalized text, with all other keys
    preserved.

    Args:
        data (list of dict): List of dictionaries with labels' transcription.

    Returns:
        list of dict: New list of normalized copies of the kept entries.
        An empty list is returned (and the error is printed) if the input
        cannot be processed at all, e.g. it is not iterable.
    """
    try:
        cleaned_data = []
        for item in data:
            if "text" not in item:
                continue
            text = item["text"]
            # A single malformed entry (e.g. text=None) must not discard
            # the whole dataset, so skip it instead of raising.
            if not isinstance(text, str):
                continue
            # Keeping only alphanumerics removes punctuation and every kind
            # of whitespace (spaces, tabs, newlines) in one pass.
            cleaned_text = "".join(ch for ch in text.lower() if ch.isalnum())
            if "http" in cleaned_text:
                continue
            # Copy so the caller's dictionaries keep their original text.
            cleaned_data.append({**item, "text": cleaned_text})
        return cleaned_data
    except Exception as e:
        # Deliberate best-effort: report the problem and return no entries.
        print(f"Error cleaning data: {e}")
        return []




[docs]
def redundancy(data: list[dict]) -> list[dict]:
    """
    Collect the entries whose normalized text repeats an earlier entry.

    The input is first passed through ``clean_data``; the first occurrence
    of each text is treated as the original and every later occurrence is
    reported as a duplicate.

    Args:
        data (list of dict): List of dictionaries with labels' transcription.

    Returns:
        list of dict: Cleaned entries whose text was already seen earlier in
        the list. An empty list is returned if an error occurs.
    """
    try:
        seen_texts = set()
        repeated = []
        for entry in clean_data(data):
            key = entry["text"]
            if key in seen_texts:
                # Second or later occurrence of this text.
                repeated.append(entry)
            else:
                seen_texts.add(key)
        return repeated
    except Exception as e:
        print(f"Error identifying redundant entries: {e}")
        return []




[docs]
def per_redundancy(data: list[dict]) -> int:
    """
    Compute the share of duplicated transcriptions as a whole percentage.

    The dataset is cleaned with ``clean_data``; the number of duplicates
    reported by ``redundancy`` is then divided by the number of cleaned
    entries and rounded to the nearest integer percentage.

    Args:
        data (list of dict): List of dictionaries with labels' transcription.

    Returns:
        int: Percentage of redundant text, or 0 when no entries remain after
        cleaning or when an error occurs.
    """
    try:
        cleaned_entries = clean_data(data)
        repeated_entries = redundancy(cleaned_entries)
        total = len(cleaned_entries)
        repeated = len(repeated_entries)
        if total > 0:
            return round((repeated / total) * 100)
        # Nothing left after cleaning: avoid dividing by zero.
        return 0
    except Exception as e:
        print(f"Error calculating redundancy percentage: {e}")
        return 0