Spaces:
Running
Running
#!/usr/bin/env python3
"""
Script to calculate total minutes of audio data assigned to each annotator.

This script queries the database to find all audio files assigned to each annotator
through AnnotationInterval ranges, loads the actual audio files to calculate their
durations, and reports the total minutes per annotator.
"""
import argparse | |
import sys | |
import os | |
import time | |
from typing import Dict, List, Tuple | |
from sqlalchemy import and_ | |
from sqlalchemy.exc import OperationalError | |
# Add project root to Python path | |
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) | |
if project_root not in sys.path: | |
sys.path.insert(0, project_root) | |
from utils.database import get_db, get_db_readonly | |
from utils.cloud_server_audio_loader import CloudServerAudioLoader | |
from data.models import Annotator, AnnotationInterval, TTSData | |
from utils.logger import Logger | |
from utils.sentry_integration import capture_custom_event | |
import sentry_sdk | |
from config import conf | |
log = Logger() | |
def get_assigned_tts_data_for_annotator(
    db,
    annotator_id: int,
    max_retries: int = 3,
    retry_delay: float = 5,
) -> List[TTSData]:
    """
    Get all TTSData items assigned to a specific annotator through AnnotationInterval ranges.

    Every interval of the annotator that has both ``start_index`` and
    ``end_index`` set selects the TTSData rows whose ``id`` lies in that
    inclusive range. Rows matched by more than one interval are returned only
    once, so overlapping intervals do not inflate file counts or durations.

    Args:
        db: Database session
        annotator_id: ID of the annotator
        max_retries: Total number of attempts made when the MySQL connection
            is lost (values below 1 are treated as 1)
        retry_delay: Seconds to wait between attempts

    Returns:
        List of unique TTSData objects assigned to the annotator, in the order
        they were first encountered (empty if the annotator has no intervals)

    Raises:
        OperationalError: If the error is not a lost-connection error, or the
            connection is still lost on the final attempt
    """
    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        try:
            intervals = (
                db.query(AnnotationInterval)
                .filter(AnnotationInterval.annotator_id == annotator_id)
                .all()
            )

            # Keyed by primary key so overlapping intervals cannot add the
            # same row twice; dicts preserve first-seen order.
            assigned_by_id: Dict[int, TTSData] = {}
            for interval in intervals:
                if interval.start_index is None or interval.end_index is None:
                    continue  # open-ended interval: nothing to select
                rows_in_range = db.query(TTSData).filter(
                    and_(
                        TTSData.id >= interval.start_index,
                        TTSData.id <= interval.end_index,
                    )
                ).all()
                for row in rows_in_range:
                    assigned_by_id.setdefault(row.id, row)
            return list(assigned_by_id.values())
        except OperationalError as e:
            connection_lost = "Lost connection to MySQL server" in str(e)
            if not connection_lost or attempt >= attempts:
                raise
            log.warning(f"Database connection lost, retrying in {retry_delay} seconds... (attempt {attempt}/{attempts})")
            time.sleep(retry_delay)
            # Reset the failed transaction so the session can be reused
            db.rollback()
def calculate_audio_duration_seconds(filename: str, loader: CloudServerAudioLoader) -> float:
    """
    Calculate the duration of an audio file in seconds.

    The duration is the number of entries along the first axis of the loaded
    samples divided by the sample rate. This is the same computation for mono
    and multi-channel data.
    NOTE(review): assumes multi-channel samples are laid out as
    (frames, channels) -- confirm against CloudServerAudioLoader.load_audio.

    Args:
        filename: Name of the audio file
        loader: CloudServerAudioLoader instance

    Returns:
        Duration in seconds, or 0.0 if file cannot be loaded
    """
    try:
        sample_rate, samples = loader.load_audio(filename)
        # len() gives the size of the first axis for 1-D and N-D arrays alike.
        return len(samples) / sample_rate
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole report.
        log.warning(f"Failed to load audio file '{filename}': {e}")
        # The scope keyword understood by sentry_sdk is ``extras``; ``extra``
        # is not accepted, so the context was not being attached.
        sentry_sdk.capture_exception(e, extras={
            'operation': 'calculate_audio_duration',
            'filename': filename
        })
        return 0.0
def _fetch_active_annotators(annotator_name: str = None) -> List[Tuple[int, str]]:
    """Return (id, name) pairs of active annotators, optionally limited to one name."""
    with get_db_readonly() as db:
        query = db.query(Annotator).filter(Annotator.is_active == True)  # noqa: E712 (SQL expression)
        if annotator_name:
            query = query.filter(Annotator.name == annotator_name)
        # Copy plain values out while the session is still open.
        return [(annotator.id, annotator.name) for annotator in query.all()]


def _fetch_assigned_filenames(annotator_id: int) -> list:
    """Return the filenames of all TTSData rows assigned to the annotator."""
    with get_db_readonly() as db:
        assigned = get_assigned_tts_data_for_annotator(db, annotator_id) or []
        # Read the attribute inside the session so no ORM object is touched
        # after the session has been closed.
        return [tts_data.filename for tts_data in assigned]


def _measure_files(filenames, loader) -> Tuple[float, int, int]:
    """Return (total seconds, successful file count, failed file count)."""
    total_seconds = 0.0
    succeeded = 0
    failed = 0
    for filename in filenames:
        duration = calculate_audio_duration_seconds(filename, loader)
        if duration > 0:
            total_seconds += duration
            succeeded += 1
        else:
            # A duration of 0.0 is what the duration helper returns on failure.
            failed += 1
    return total_seconds, succeeded, failed


def _log_summary_report(results) -> Tuple[int, float, float]:
    """Log the per-annotator table and return (total files, total minutes, total hours)."""
    log.info("=" * 60)
    log.info("SUMMARY REPORT")
    log.info("=" * 60)
    log.info(f"{'Annotator':<20} {'Files':<8} {'Minutes':<12} {'Hours':<8}")
    log.info("-" * 60)
    total_files = 0
    total_minutes = 0.0
    for name, file_count, minutes in results:
        hours = minutes / 60.0
        log.info(f"{name:<20} {file_count:<8} {minutes:<12.2f} {hours:<8.2f}")
        total_files += file_count
        total_minutes += minutes
    log.info("-" * 60)
    total_hours = total_minutes / 60.0
    log.info(f"{'TOTAL':<20} {total_files:<8} {total_minutes:<12.2f} {total_hours:<8.2f}")
    log.info("=" * 60)
    return total_files, total_minutes, total_hours


def calculate_annotator_audio_minutes(annotator_name: str = None):
    """
    Calculate and report the total minutes of audio assigned to each annotator.

    For every active annotator (or only the named one) the assigned audio
    files are loaded, their durations are summed, and a per-annotator line
    plus a summary table are written to the log. A completion event is sent
    through ``capture_custom_event`` at the end.

    Args:
        annotator_name: Optional name of specific annotator to calculate for
            (None or empty means all active annotators)

    Raises:
        Exception: Any error raised during the calculation is logged, reported
            to Sentry and re-raised
    """
    try:
        # Initialize audio loader
        loader = CloudServerAudioLoader(conf.FTP_URL)

        annotators = _fetch_active_annotators(annotator_name)
        if not annotators:
            if annotator_name:
                log.error(f"No active annotator found with name: {annotator_name}")
            else:
                log.info("No active annotators found.")
            return

        log.info("--- Annotator Audio Duration Report ---")
        log.info("Calculating total minutes of assigned audio per annotator...")
        log.info("")

        total_annotators = len(annotators)
        annotator_results = []
        # Distinct loop names keep the ``annotator_name`` argument intact.
        for idx, (annotator_id, current_name) in enumerate(annotators, 1):
            log.info(f"Processing annotator {idx}/{total_annotators}: {current_name} (ID: {annotator_id})")

            # Short-lived session per annotator; audio loading happens after
            # it is closed and only uses the plain filenames.
            filenames = _fetch_assigned_filenames(annotator_id)
            if not filenames:
                log.info(f" No audio files assigned to {current_name}")
                annotator_results.append((current_name, 0, 0.0))
                continue

            log.info(f" Calculating duration for {len(filenames)} assigned audio files...")
            total_duration_seconds, successful_files, failed_files = _measure_files(filenames, loader)
            total_minutes = total_duration_seconds / 60.0

            log.info(f" Successfully processed: {successful_files} files")
            if failed_files > 0:
                log.warning(f" Failed to process: {failed_files} files")
            log.info(f" Total duration: {total_duration_seconds:.2f} seconds ({total_minutes:.2f} minutes)")
            # The file count is the number of assigned files, including any
            # that failed to load.
            annotator_results.append((current_name, len(filenames), total_minutes))
            log.info("")

        total_files, total_minutes, total_hours = _log_summary_report(annotator_results)

        # Capture analytics event
        capture_custom_event(
            'annotator_audio_calculation_completed',
            {
                'total_annotators': total_annotators,
                'total_files_processed': total_files,
                'total_minutes': total_minutes,
                'total_hours': total_hours
            }
        )
    except Exception as e:
        log.error(f"Failed to calculate annotator audio minutes: {e}")
        # ``extras`` is the scope keyword sentry_sdk accepts; ``extra`` is not.
        sentry_sdk.capture_exception(e, extras={
            'operation': 'calculate_annotator_audio_minutes'
        })
        raise
def main():
    """Parse the command line and run the report for one annotator or for all of them."""
    cli = argparse.ArgumentParser(
        description="Calculate total minutes of audio data assigned to each annotator"
    )
    cli.add_argument(
        '--annotator',
        type=str,
        help="Calculate for a specific annotator by name (optional, calculates for all if not specified)"
    )
    requested_name = cli.parse_args().annotator

    # No (or empty) --annotator value means the report covers everyone.
    if not requested_name:
        log.info("Calculating audio minutes for all annotators")
        calculate_annotator_audio_minutes()
        return

    log.info(f"Calculating audio minutes for annotator: {requested_name}")
    calculate_annotator_audio_minutes(requested_name)


if __name__ == "__main__":
    main()