Spaces:
Build error
Build error
"""Legacy compatibility functions for STT functionality.""" | |
import logging | |
from pathlib import Path | |
from typing import Union | |
from .provider_factory import STTProviderFactory | |
from ...domain.models.audio_content import AudioContent | |
from ...domain.exceptions import SpeechRecognitionException | |
logger = logging.getLogger(__name__) | |
def transcribe_audio(audio_path: Union[str, Path], model_name: str = "parakeet") -> str:
    """
    Convert an audio file to text using the specified STT model (legacy interface).

    This function maintains backward compatibility with the original
    utils/stt.py interface.

    Args:
        audio_path: Path to the input audio file.
        model_name: Name of the STT model/provider to use (whisper or parakeet).

    Returns:
        str: The transcribed text (the ``text`` attribute of the provider result).

    Raises:
        SpeechRecognitionException: If the audio file does not exist or the
            transcription fails. Domain exceptions raised during processing
            propagate unchanged; any other exception is wrapped and chained.
    """
    logger.info("Starting transcription for: %s using %s model", audio_path, model_name)
    try:
        audio_path = Path(audio_path)
        if not audio_path.exists():
            raise SpeechRecognitionException(f"Audio file not found: {audio_path}")

        # Raw file bytes are handed to the provider as-is.
        audio_data = audio_path.read_bytes()

        # Determine the audio format from the file extension; anything
        # unrecognised is treated as WAV.
        audio_format = audio_path.suffix.lower().lstrip('.')
        if audio_format not in ('wav', 'mp3', 'flac', 'ogg'):
            audio_format = 'wav'  # Default fallback

        # Build AudioContent with placeholder metadata. The duration is a
        # rough estimate that assumes 16 kHz, 2 bytes per sample, and is
        # never reported as less than one second.
        try:
            audio_content = AudioContent(
                data=audio_data,
                format=audio_format,
                sample_rate=16000,  # Standard rate for STT
                duration=max(1.0, len(audio_data) / (16000 * 2)),
                filename=audio_path.name,
            )
        except ValueError:
            # The estimate was rejected by validation; retry with the
            # minimum duration.
            audio_content = AudioContent(
                data=audio_data,
                format=audio_format,
                sample_rate=16000,
                duration=1.0,
                filename=audio_path.name,
            )

        # Prefer the requested provider, falling back to any available one.
        try:
            provider = STTProviderFactory.create_provider(model_name)
        except SpeechRecognitionException:
            logger.warning(f"Requested provider {model_name} not available, using fallback")
            provider = STTProviderFactory.create_provider_with_fallback(model_name)

        model = provider.get_default_model()
        text_content = provider.transcribe(audio_content, model)
        result = text_content.text
        logger.info("Transcription completed: %s", result)
        return result
    except SpeechRecognitionException as e:
        # Already a domain error (e.g. missing file or provider failure):
        # log it, but do not wrap it in a second SpeechRecognitionException.
        logger.error("Transcription failed: %s", e)
        raise
    except Exception as e:
        logger.error("Transcription failed: %s", e, exc_info=True)
        raise SpeechRecognitionException(f"Transcription failed: {str(e)}") from e
def create_audio_content_from_file(audio_path: Union[str, Path]) -> AudioContent:
    """
    Create AudioContent from an audio file, detecting metadata when possible.

    When pydub can be imported, the sample rate and duration are read from
    the decoded audio. Otherwise placeholder metadata (16000 Hz, 1.0 s) is
    used and a warning is logged.

    Args:
        audio_path: Path to the audio file.

    Returns:
        AudioContent: The audio content object.

    Raises:
        SpeechRecognitionException: If the file cannot be processed, on
            either the pydub path or the placeholder path.
    """
    try:
        audio_path = Path(audio_path)

        # pydub is optional. Only the import itself is guarded, so that a
        # failure while reading the file in the fallback path is still
        # converted to SpeechRecognitionException by the outer handler.
        try:
            from pydub import AudioSegment
        except ImportError:
            AudioSegment = None
            logger.warning("pydub not available, using placeholder metadata")

        if AudioSegment is not None:
            # Load the audio file to get real metadata.
            audio_segment = AudioSegment.from_file(audio_path)
            sample_rate = audio_segment.frame_rate
            duration = len(audio_segment) / 1000.0  # Convert ms to seconds
        else:
            sample_rate = 16000  # Default
            duration = 1.0  # Placeholder

        # Read the raw file bytes.
        audio_data = audio_path.read_bytes()

        # Determine the format from the extension; unknown extensions are
        # treated as WAV.
        audio_format = audio_path.suffix.lower().lstrip('.')
        if audio_format not in ('wav', 'mp3', 'flac', 'ogg'):
            audio_format = 'wav'

        return AudioContent(
            data=audio_data,
            format=audio_format,
            sample_rate=sample_rate,
            duration=duration,
            filename=audio_path.name,
        )
    except Exception as e:
        raise SpeechRecognitionException(f"Failed to create AudioContent from file: {str(e)}") from e