File size: 4,982 Bytes
1be582a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fdc056d
1be582a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fdc056d
1be582a
 
 
 
 
 
 
 
 
 
 
 
 
 
fdc056d
1be582a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fdc056d
1be582a
 
fdc056d
1be582a
 
 
 
 
fdc056d
1be582a
fdc056d
1be582a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fdc056d
1be582a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
"""Whisper STT provider implementation."""

import logging
from pathlib import Path
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from ...domain.models.audio_content import AudioContent
    from ...domain.models.text_content import TextContent

from ..base.stt_provider_base import STTProviderBase
from ...domain.exceptions import SpeechRecognitionException

logger = logging.getLogger(__name__)


class WhisperSTTProvider(STTProviderBase):
    """Whisper STT provider using the faster-whisper implementation."""

    def __init__(self):
        """Initialize the Whisper STT provider and select device settings."""
        super().__init__(
            provider_name="Whisper",
            supported_languages=["en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "zh"]
        )
        self.model = None               # lazily-loaded faster_whisper.WhisperModel
        self._loaded_model_name = None  # name of the model currently held in self.model
        self._device = None             # "cuda" or "cpu"
        self._compute_type = None       # "float16" on GPU, "int8" on CPU
        self._initialize_device_settings()

    def _initialize_device_settings(self):
        """Select the inference device and a matching compute type."""
        try:
            import torch
            self._device = "cuda" if torch.cuda.is_available() else "cpu"
        except ImportError:
            # torch is optional; without it we can only run on CPU.
            self._device = "cpu"

        # float16 keeps GPU memory usage low; int8 is the fast CPU quantization.
        self._compute_type = "float16" if self._device == "cuda" else "int8"
        logger.info(
            "Whisper provider initialized with device: %s, compute_type: %s",
            self._device, self._compute_type,
        )

    def _perform_transcription(self, audio_path: Path, model: str) -> str:
        """
        Perform transcription using faster-whisper.

        Args:
            audio_path: Path to the preprocessed audio file
            model: The Whisper model to use (e.g., 'large-v3', 'medium', 'small')

        Returns:
            str: The transcribed text

        Raises:
            SpeechRecognitionException: On any failure, via the base class's
                _handle_provider_error (assumed to wrap and re-raise —
                TODO confirm against STTProviderBase).
        """
        try:
            # Reload only when nothing is loaded yet or a different model is
            # requested. We track the name ourselves because WhisperModel does
            # not reliably expose the name it was constructed with; probing
            # `model_size_or_path` via getattr caused a reload on every call.
            if self.model is None or self._loaded_model_name != model:
                self._load_model(model)

            logger.info("Starting Whisper transcription with model %s", model)

            # `segments` is a lazy generator; `info` carries detection metadata.
            segments, info = self.model.transcribe(
                str(audio_path),
                beam_size=5,
                language="en",  # TODO: make configurable; forcing "en" bypasses auto-detection
                task="transcribe"
            )

            logger.info(
                "Detected language '%s' with probability %s",
                info.language, info.language_probability,
            )

            # Consume the generator; join with single spaces instead of
            # quadratic string concatenation.
            parts = []
            for segment in segments:
                parts.append(segment.text)
                logger.info("[%.2fs -> %.2fs] %s", segment.start, segment.end, segment.text)

            result = " ".join(parts).strip()
            logger.info("Whisper transcription completed successfully")
            return result

        except Exception as e:
            # Delegates wrapping/re-raising to the shared base-class handler.
            self._handle_provider_error(e, "transcription")

    def _load_model(self, model_name: str):
        """
        Load the Whisper model and record its name for the reuse check.

        Args:
            model_name: Name of the model to load

        Raises:
            SpeechRecognitionException: If faster-whisper is missing or the
                model fails to load.
        """
        try:
            from faster_whisper import WhisperModel as FasterWhisperModel

            logger.info("Loading Whisper model: %s", model_name)
            logger.info("Using device: %s, compute_type: %s", self._device, self._compute_type)

            self.model = FasterWhisperModel(
                model_name,
                device=self._device,
                compute_type=self._compute_type
            )
            # Remember what is loaded so _perform_transcription can skip
            # redundant (and expensive) reloads of the same model.
            self._loaded_model_name = model_name

            logger.info("Whisper model %s loaded successfully", model_name)

        except ImportError as e:
            # Keep the cached state consistent on failure.
            self.model = None
            self._loaded_model_name = None
            raise SpeechRecognitionException(
                "faster-whisper not available. Please install with: pip install faster-whisper"
            ) from e
        except Exception as e:
            self.model = None
            self._loaded_model_name = None
            raise SpeechRecognitionException(f"Failed to load Whisper model {model_name}: {str(e)}") from e

    def is_available(self) -> bool:
        """
        Check if the Whisper provider is available.

        Uses find_spec so availability can be probed without paying the cost
        of actually importing the faster-whisper package.

        Returns:
            bool: True if faster-whisper is available, False otherwise
        """
        import importlib.util

        if importlib.util.find_spec("faster_whisper") is None:
            logger.warning("faster-whisper not available")
            return False
        return True

    def get_available_models(self) -> list[str]:
        """
        Get list of available Whisper models.

        Returns:
            list[str]: List of available model names
        """
        return [
            "tiny",
            "tiny.en",
            "base",
            "base.en",
            "small",
            "small.en",
            "medium",
            "medium.en",
            "large-v1",
            "large-v2",
            "large-v3"
        ]

    def get_default_model(self) -> str:
        """
        Get the default model for this provider.

        Returns:
            str: Default model name
        """
        return "large-v3"