# teachingAssistant/src/infrastructure/stt/legacy_compatibility.py
# Michael Hu - Migrate existing STT providers to infrastructure layer (1be582a)
# 5.31 kB
"""Legacy compatibility functions for STT functionality."""
import logging
from pathlib import Path
from typing import Union
from .provider_factory import STTProviderFactory
from ...domain.models.audio_content import AudioContent
from ...domain.exceptions import SpeechRecognitionException
logger = logging.getLogger(__name__)
def transcribe_audio(audio_path: Union[str, Path], model_name: str = "parakeet") -> str:
    """
    Convert audio file to text using specified STT model (legacy interface).

    This function maintains backward compatibility with the original
    utils/stt.py interface.

    Args:
        audio_path: Path to input audio file
        model_name: Name of the STT model/provider to use (whisper or parakeet)

    Returns:
        str: Transcribed English text

    Raises:
        SpeechRecognitionException: If the audio file does not exist or
            transcription fails. Domain exceptions raised along the way are
            propagated unchanged; any other error is wrapped with a
            "Transcription failed: ..." message and chained as the cause.
    """
    logger.info("Starting transcription for: %s using %s model", audio_path, model_name)

    try:
        # Convert path to Path object
        audio_path = Path(audio_path)
        if not audio_path.exists():
            raise SpeechRecognitionException(f"Audio file not found: {audio_path}")

        # Read the raw file bytes that will back the AudioContent
        audio_data = audio_path.read_bytes()

        # Determine audio format from file extension; unknown extensions
        # are labelled 'wav' as a default fallback
        audio_format = audio_path.suffix.lower().lstrip('.')
        if audio_format not in ('wav', 'mp3', 'flac', 'ogg'):
            audio_format = 'wav'

        # Create AudioContent (we'll use reasonable placeholder values).
        # The provider will handle the actual audio analysis during preprocessing.
        try:
            audio_content = AudioContent(
                data=audio_data,
                format=audio_format,
                sample_rate=16000,  # Standard rate for STT
                # Rough estimate: bytes / (16000 samples/s * 2); presumably
                # assumes 2 bytes per sample, never below 1 second
                duration=max(1.0, len(audio_data) / (16000 * 2)),
                filename=audio_path.name,
            )
        except ValueError:
            # If validation fails, retry with the minimal placeholder duration
            audio_content = AudioContent(
                data=audio_data,
                format=audio_format,
                sample_rate=16000,
                duration=1.0,  # Minimum valid duration
                filename=audio_path.name,
            )

        # Get the appropriate provider
        try:
            provider = STTProviderFactory.create_provider(model_name)
        except SpeechRecognitionException:
            # Requested provider could not be created; let the factory pick a fallback
            logger.warning(f"Requested provider {model_name} not available, using fallback")
            provider = STTProviderFactory.create_provider_with_fallback(model_name)

        # Get the default model for the provider
        model = provider.get_default_model()

        # Transcribe audio
        text_content = provider.transcribe(audio_content, model)
        result = text_content.text

        logger.info("Transcription completed: %s", result)
        return result

    except SpeechRecognitionException as e:
        # Already a domain error (e.g. "Audio file not found"): log it and
        # re-raise as-is rather than wrapping it in a second
        # "Transcription failed: ..." message.
        logger.error("Transcription failed: %s", e, exc_info=True)
        raise
    except Exception as e:
        logger.error("Transcription failed: %s", e, exc_info=True)
        raise SpeechRecognitionException(f"Transcription failed: {str(e)}") from e
def create_audio_content_from_file(audio_path: Union[str, Path]) -> AudioContent:
    """
    Create AudioContent from an audio file with proper metadata detection.

    When pydub can be imported, the sample rate and duration are read from
    the decoded audio. Otherwise placeholder metadata (16000 Hz, 1.0 s) is
    used and a warning is logged.

    Args:
        audio_path: Path to the audio file

    Returns:
        AudioContent: The audio content object

    Raises:
        SpeechRecognitionException: If file cannot be processed. This covers
            both the pydub path and the placeholder fallback path.
    """
    try:
        audio_path = Path(audio_path)

        # Read raw audio data
        audio_data = audio_path.read_bytes()

        # Determine format from the extension; unknown extensions become 'wav'
        audio_format = audio_path.suffix.lower().lstrip('.')
        if audio_format not in ('wav', 'mp3', 'flac', 'ogg'):
            audio_format = 'wav'

        # The pydub probe lives inside the outer try so that errors on the
        # fallback path are wrapped too. Previously the fallback ran inside
        # the `except ImportError` handler, where I/O or validation errors
        # bypassed the sibling `except Exception` clause and escaped raw.
        try:
            from pydub import AudioSegment
        except ImportError:
            # Fallback without pydub
            logger.warning("pydub not available, using placeholder metadata")
            sample_rate = 16000  # Default
            duration = 1.0  # Placeholder
        else:
            # Load audio file to get metadata
            audio_segment = AudioSegment.from_file(audio_path)
            sample_rate = audio_segment.frame_rate
            duration = len(audio_segment) / 1000.0  # Convert ms to seconds

        return AudioContent(
            data=audio_data,
            format=audio_format,
            sample_rate=sample_rate,
            duration=duration,
            filename=audio_path.name,
        )

    except Exception as e:
        raise SpeechRecognitionException(f"Failed to create AudioContent from file: {str(e)}") from e