Spaces:

DroolingPanda
/

teachingAssistant

Build error

teachingAssistant / src / infrastructure / stt / legacy_compatibility.py

Michael Hu

Migrate existing STT providers to infrastructure layer

1be582a 26 days ago

5.31 kB

	"""Legacy compatibility functions for STT functionality."""

	import logging
	from pathlib import Path
	from typing import Union

	from .provider_factory import STTProviderFactory
	from ...domain.models.audio_content import AudioContent
	from ...domain.exceptions import SpeechRecognitionException

	# Module-level logger, named after this module via __name__.
	logger = logging.getLogger(__name__)


	def transcribe_audio(audio_path: Union[str, Path], model_name: str = "parakeet") -> str:
	    """
	    Transcribe an audio file to text through an STT provider (legacy entry point).

	    Kept for backward compatibility with the original utils/stt.py interface.

	    Args:
	        audio_path: Location of the audio file to transcribe
	        model_name: Name of the STT model/provider to request (whisper or parakeet)

	    Returns:
	        str: The text attribute of the provider's transcription result

	    Raises:
	        SpeechRecognitionException: If any step fails; every failure, including
	            a missing file, is logged and re-raised with a
	            "Transcription failed: ..." message
	    """
	    logger.info(f"Starting transcription for: {audio_path} using {model_name} model")

	    try:
	        source = Path(audio_path)
	        if not source.exists():
	            raise SpeechRecognitionException(f"Audio file not found: {source}")

	        raw_bytes = source.read_bytes()

	        # The extension decides the format; anything unrecognised is treated as wav
	        extension = source.suffix.lower().lstrip('.')
	        audio_format = extension if extension in ('wav', 'mp3', 'flac', 'ogg') else 'wav'

	        # Metadata is placeholder only: a fixed 16000 sample rate and a duration
	        # guessed from the byte count, never below 1.0
	        shared_fields = {
	            'data': raw_bytes,
	            'format': audio_format,
	            'sample_rate': 16000,
	            'filename': source.name,
	        }
	        try:
	            content = AudioContent(
	                duration=max(1.0, len(raw_bytes) / (16000 * 2)),
	                **shared_fields,
	            )
	        except ValueError:
	            # Validation rejected the estimate; retry once with duration=1.0
	            content = AudioContent(duration=1.0, **shared_fields)

	        # Ask for the requested provider first, then let the factory pick a fallback
	        try:
	            stt_provider = STTProviderFactory.create_provider(model_name)
	        except SpeechRecognitionException:
	            logger.warning(f"Requested provider {model_name} not available, using fallback")
	            stt_provider = STTProviderFactory.create_provider_with_fallback(model_name)

	        default_model = stt_provider.get_default_model()
	        transcript = stt_provider.transcribe(content, default_model).text

	        logger.info(f"Transcription completed: {transcript}")
	        return transcript

	    except Exception as e:
	        reason = str(e)
	        logger.error(f"Transcription failed: {reason}", exc_info=True)
	        raise SpeechRecognitionException(f"Transcription failed: {reason}") from e


	def create_audio_content_from_file(audio_path: Union[str, Path]) -> AudioContent:
	    """
	    Create AudioContent from an audio file, using pydub for metadata when available.

	    When pydub can be imported, sample_rate and duration come from the loaded
	    AudioSegment. When it cannot, a warning is logged and placeholder values
	    (sample_rate=16000, duration=1.0) are used instead.

	    Args:
	        audio_path: Path to the audio file

	    Returns:
	        AudioContent: The audio content object

	    Raises:
	        SpeechRecognitionException: If the file cannot be read or processed,
	            on the pydub path and on the placeholder fallback path alike
	    """
	    # Probe only the import for ImportError. Doing the fallback work inside the
	    # ImportError handler would let its failures (missing file, validation
	    # errors) bypass the wrapping handler below and escape as raw exceptions.
	    try:
	        from pydub import AudioSegment
	    except ImportError:
	        AudioSegment = None
	        logger.warning("pydub not available, using placeholder metadata")

	    try:
	        audio_path = Path(audio_path)

	        if AudioSegment is not None:
	            # Load audio file to get metadata
	            audio_segment = AudioSegment.from_file(audio_path)
	            sample_rate = audio_segment.frame_rate
	            duration = len(audio_segment) / 1000.0  # Convert ms to seconds
	        else:
	            sample_rate = 16000  # Default
	            duration = 1.0  # Placeholder

	        # Read raw audio data
	        with open(audio_path, 'rb') as f:
	            audio_data = f.read()

	        # Determine format from the extension; unknown extensions fall back to wav
	        audio_format = audio_path.suffix.lower().lstrip('.')
	        if audio_format not in ('wav', 'mp3', 'flac', 'ogg'):
	            audio_format = 'wav'

	        return AudioContent(
	            data=audio_data,
	            format=audio_format,
	            sample_rate=sample_rate,
	            duration=duration,
	            filename=audio_path.name,
	        )

	    except Exception as e:
	        raise SpeechRecognitionException(f"Failed to create AudioContent from file: {str(e)}") from e