File size: 5,308 Bytes
1be582a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
"""Legacy compatibility functions for STT functionality."""

import logging
from pathlib import Path
from typing import Union

from .provider_factory import STTProviderFactory
from ...domain.models.audio_content import AudioContent
from ...domain.exceptions import SpeechRecognitionException

logger = logging.getLogger(__name__)


def transcribe_audio(audio_path: Union[str, Path], model_name: str = "parakeet") -> str:
    """
    Transcribe an audio file to text through an STT provider (legacy interface).

    Kept so that code written against the original utils/stt.py interface
    continues to work unchanged.

    Args:
        audio_path: Location of the audio file to transcribe
        model_name: Provider name handed to STTProviderFactory (whisper or parakeet)

    Returns:
        str: The ``text`` attribute of the provider's transcription result

    Raises:
        SpeechRecognitionException: On any failure, including a missing file;
            the underlying error is chained as the cause
    """
    logger.info(f"Starting transcription for: {audio_path} using {model_name} model")

    try:
        # Normalise to a Path so existence, suffix and name are available
        audio_path = Path(audio_path)
        if not audio_path.exists():
            raise SpeechRecognitionException(f"Audio file not found: {audio_path}")

        # Load the entire file into memory
        raw_bytes = audio_path.read_bytes()

        # The extension decides the format; anything unrecognised is treated as wav
        extension = audio_path.suffix.lower().lstrip('.')
        container = extension if extension in ('wav', 'mp3', 'flac', 'ogg') else 'wav'

        # Fields that are identical for both construction attempts below.
        # The metadata is a placeholder; the original comment notes that the
        # provider performs the real audio analysis during preprocessing.
        shared_fields = {
            'data': raw_bytes,
            'format': container,
            'sample_rate': 16000,  # Standard rate for STT
            'filename': audio_path.name,
        }
        try:
            # Size-based guess floored at one second
            # (presumably assumes 16-bit mono at 16 kHz -- TODO confirm)
            estimated_duration = max(1.0, len(raw_bytes) / (16000 * 2))
            audio_content = AudioContent(duration=estimated_duration, **shared_fields)
        except ValueError:
            # The estimate was rejected: retry with the minimum valid duration
            audio_content = AudioContent(duration=1.0, **shared_fields)

        # Resolve the provider, letting the factory pick a substitute if the
        # requested one cannot be created
        try:
            stt_provider = STTProviderFactory.create_provider(model_name)
        except SpeechRecognitionException:
            logger.warning(f"Requested provider {model_name} not available, using fallback")
            stt_provider = STTProviderFactory.create_provider_with_fallback(model_name)

        # Run the transcription with the provider's default model
        default_model = stt_provider.get_default_model()
        transcription = stt_provider.transcribe(audio_content, default_model)
        result = transcription.text

        logger.info(f"Transcription completed: {result}")
        return result

    except Exception as e:
        logger.error(f"Transcription failed: {str(e)}", exc_info=True)
        raise SpeechRecognitionException(f"Transcription failed: {str(e)}") from e


def create_audio_content_from_file(audio_path: Union[str, Path]) -> AudioContent:
    """
    Create AudioContent from an audio file, detecting metadata when possible.

    When pydub can be imported, the sample rate and duration are read from
    the decoded file. Otherwise a warning is logged and placeholder metadata
    (16000 Hz, 1.0 s) is used. In both cases the file's raw bytes are stored
    as the content data, and an extension other than wav/mp3/flac/ogg is
    reported as 'wav'.

    Args:
        audio_path: Path to the audio file

    Returns:
        AudioContent: The audio content object

    Raises:
        SpeechRecognitionException: If the file cannot be read, decoded or
            validated, with or without pydub; the underlying error is
            chained as the cause
    """
    try:
        audio_path = Path(audio_path)

        # Only the import itself is guarded by ImportError. Running the
        # fallback inside an ``except ImportError`` handler would let its own
        # failures bypass the SpeechRecognitionException wrapper below.
        try:
            from pydub import AudioSegment
        except ImportError:
            AudioSegment = None
            logger.warning("pydub not available, using placeholder metadata")

        if AudioSegment is not None:
            # Load audio file to get the actual metadata
            audio_segment = AudioSegment.from_file(audio_path)
            sample_rate = audio_segment.frame_rate
            duration = len(audio_segment) / 1000.0  # Convert ms to seconds
        else:
            sample_rate = 16000  # Default
            duration = 1.0  # Placeholder

        # Read raw audio data
        with open(audio_path, 'rb') as f:
            audio_data = f.read()

        # Determine format from the extension, defaulting to wav
        audio_format = audio_path.suffix.lower().lstrip('.')
        if audio_format not in ('wav', 'mp3', 'flac', 'ogg'):
            audio_format = 'wav'

        return AudioContent(
            data=audio_data,
            format=audio_format,
            sample_rate=sample_rate,
            duration=duration,
            filename=audio_path.name
        )

    except Exception as e:
        raise SpeechRecognitionException(f"Failed to create AudioContent from file: {str(e)}") from e