teachingAssistant / src /infrastructure /stt /legacy_compatibility.py
Michael Hu
add more logs
fdc056d
"""Legacy compatibility functions for STT functionality."""
import logging
from pathlib import Path
from typing import Union
from .provider_factory import STTProviderFactory
from ...domain.models.audio_content import AudioContent
from ...domain.exceptions import SpeechRecognitionException
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
def transcribe_audio(audio_path: Union[str, Path], model_name: str = "parakeet") -> str:
    """
    Transcribe an audio file to text through the legacy STT entry point.

    Kept so that callers written against the original utils/stt.py interface
    keep working. The file is read into an AudioContent with placeholder
    metadata and handed to a provider obtained from STTProviderFactory.

    Args:
        audio_path: Location of the audio file to transcribe
        model_name: Name of the STT model/provider to use (whisper or parakeet)

    Returns:
        str: The ``text`` attribute of the provider's transcription result

    Raises:
        SpeechRecognitionException: On any failure, including a missing file.
            The original error is attached as the cause.
    """
    logger.info(f"Starting transcription for: {audio_path} using {model_name} model")

    try:
        source = Path(audio_path)
        if not source.exists():
            raise SpeechRecognitionException(f"Audio file not found: {source}")

        with open(source, 'rb') as stream:
            raw_bytes = stream.read()

        # Unknown or missing extensions are treated as wav.
        extension = source.suffix.lower().lstrip('.')
        fmt = extension if extension in ('wav', 'mp3', 'flac', 'ogg') else 'wav'

        # The metadata below is a placeholder, not measured from the file.
        # The duration estimate divides the byte count by 16000 * 2
        # (presumably 16 kHz with 2 bytes per sample -- rough only) and is
        # never allowed to drop below 1.0.
        shared_fields = dict(
            data=raw_bytes,
            format=fmt,
            sample_rate=16000,
            filename=source.name,
        )
        estimated_seconds = max(1.0, len(raw_bytes) / (16000 * 2))
        try:
            content = AudioContent(duration=estimated_seconds, **shared_fields)
        except ValueError:
            # Validation rejected the estimate; retry with the minimum duration.
            content = AudioContent(duration=1.0, **shared_fields)

        # Prefer the requested provider; otherwise let the factory pick one.
        try:
            stt_provider = STTProviderFactory.create_provider(model_name)
        except SpeechRecognitionException:
            logger.warning(f"Requested provider {model_name} not available, using fallback")
            stt_provider = STTProviderFactory.create_provider_with_fallback(model_name)

        default_model = stt_provider.get_default_model()
        transcription = stt_provider.transcribe(content, default_model)
        transcript = transcription.text

        logger.info(f"Transcription completed: {transcript}")
        return transcript

    except Exception as exc:
        # Single boundary: every failure above is logged with its traceback
        # and re-raised as the domain exception.
        failure = f"Transcription failed: {str(exc)}"
        logger.error(failure, exc_info=True)
        raise SpeechRecognitionException(failure) from exc
def create_audio_content_from_file(audio_path: Union[str, Path]) -> AudioContent:
    """
    Create AudioContent from an audio file with proper metadata detection.

    When pydub can be imported, the sample rate and duration are taken from
    the loaded AudioSegment. When it cannot, a warning is logged and
    placeholder metadata (16000 Hz, 1.0 s) is used instead.

    Args:
        audio_path: Path to the audio file

    Returns:
        AudioContent: The audio content object

    Raises:
        SpeechRecognitionException: If the file cannot be processed, on either
            the pydub path or the placeholder path. The original error is
            attached as the cause.
    """
    try:
        audio_path = Path(audio_path)

        # The ImportError handling is nested inside the outer try on purpose.
        # An exception raised inside an `except` clause is not seen by sibling
        # clauses of the same `try`, so a fallback living in an outer
        # `except ImportError:` would let file or validation errors escape
        # without being converted to SpeechRecognitionException.
        try:
            from pydub import AudioSegment

            # Load audio file to get metadata
            audio_segment = AudioSegment.from_file(audio_path)
        except ImportError:
            logger.warning("pydub not available, using placeholder metadata")
            sample_rate = 16000  # Default
            duration = 1.0  # Placeholder
        else:
            sample_rate = audio_segment.frame_rate
            duration = len(audio_segment) / 1000.0  # Convert ms to seconds

        # Read raw audio data
        with open(audio_path, 'rb') as f:
            audio_data = f.read()

        # Determine format from the extension; anything unrecognised is
        # treated as wav.
        audio_format = audio_path.suffix.lower().lstrip('.')
        if audio_format not in ('wav', 'mp3', 'flac', 'ogg'):
            audio_format = 'wav'

        return AudioContent(
            data=audio_data,
            format=audio_format,
            sample_rate=sample_rate,
            duration=duration,
            filename=audio_path.name,
        )

    except Exception as e:
        raise SpeechRecognitionException(f"Failed to create AudioContent from file: {str(e)}") from e