Michael Hu committed on
Commit 60bd17d · 1 Parent(s): 27972f7

refactor tts

README.md CHANGED
@@ -10,3 +10,80 @@ pinned: false
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+# Speech Recognition Module Refactoring
+
+## Overview
+
+The speech recognition module (`utils/stt.py`) has been refactored to support multiple ASR (Automatic Speech Recognition) models. The implementation now follows a factory pattern that allows easy switching between different speech recognition models while maintaining a consistent interface.
+
+## Supported Models
+
+### 1. Whisper (Default)
+- Based on OpenAI's Whisper Large-v3 model
+- High accuracy for general speech recognition
+- No additional installation required
+
+### 2. Parakeet
+- NVIDIA's Parakeet-TDT-0.6B model
+- Optimized for real-time transcription
+- Requires additional installation (see below)
+
+## Installation
+
+### For Parakeet Support
+
+To use the Parakeet model, you need to install the NeMo Toolkit:
+
+```bash
+pip install -U 'nemo_toolkit[asr]'
+```
+
+Alternatively, you can use the provided requirements file:
+
+```bash
+pip install -r requirements-parakeet.txt
+```
+
+## Usage
+
+### In the Web Application
+
+The web application now includes a dropdown menu to select the ASR model. Simply choose your preferred model before uploading an audio file.
+
+### Programmatic Usage
+
+```python
+from utils.stt import transcribe_audio
+
+# Using the default Whisper model
+text = transcribe_audio("path/to/audio.wav")
+
+# Using the Parakeet model
+text = transcribe_audio("path/to/audio.wav", model_name="parakeet")
+```
+
+### Direct Model Access
+
+For more advanced usage, you can directly access the model classes:
+
+```python
+from utils.stt import ASRFactory
+
+# Get a specific model instance
+whisper_model = ASRFactory.get_model("whisper")
+parakeet_model = ASRFactory.get_model("parakeet")
+
+# Use the model directly
+text = whisper_model.transcribe("path/to/audio.wav")
+```
+
+## Architecture
+
+The refactored code follows these design patterns:
+
+1. **Abstract Base Class**: `ASRModel` defines the interface for all speech recognition models
+2. **Factory Pattern**: `ASRFactory` creates the appropriate model instance based on the requested model name
+3. **Strategy Pattern**: Different model implementations can be swapped at runtime
+
+This architecture makes it easy to add support for additional ASR models in the future.
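
A minimal sketch of how the `ASRModel` interface, `ASRFactory`, and `transcribe_audio` described above could fit together. Only those names and the model identifiers come from the README; the concrete classes (`WhisperASR`, `ParakeetASR`) and their bodies are illustrative assumptions, not the actual contents of `utils/stt.py`:

```python
# Illustrative sketch only -- not the actual utils/stt.py implementation.
from abc import ABC, abstractmethod


class ASRModel(ABC):
    """Interface implemented by every speech recognition backend."""

    @abstractmethod
    def transcribe(self, audio_path: str) -> str:
        """Return the transcript of the audio file at audio_path."""


class WhisperASR(ASRModel):  # hypothetical concrete strategy
    def transcribe(self, audio_path: str) -> str:
        raise NotImplementedError("call the Whisper Large-v3 pipeline here")


class ParakeetASR(ASRModel):  # hypothetical concrete strategy
    def transcribe(self, audio_path: str) -> str:
        raise NotImplementedError("call Parakeet-TDT-0.6B via the NeMo toolkit here")


class ASRFactory:
    """Factory that maps a model name to an ASRModel instance."""

    _registry = {"whisper": WhisperASR, "parakeet": ParakeetASR}

    @classmethod
    def get_model(cls, model_name: str = "whisper") -> ASRModel:
        return cls._registry[model_name]()


def transcribe_audio(audio_path: str, model_name: str = "whisper") -> str:
    """Convenience wrapper mirroring the README usage examples."""
    return ASRFactory.get_model(model_name).transcribe(audio_path)
```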
utils/tts.py CHANGED
@@ -3,130 +3,53 @@ import logging
 # Configure logging
 logger = logging.getLogger(__name__)
 
-# Import from the new factory pattern implementation
-from utils.tts_factory import get_tts_engine, generate_speech, TTSFactory
-from utils.tts_engines import get_available_engines
-
-# For backward compatibility
-from utils.tts_engines import KOKORO_AVAILABLE, KOKORO_SPACE_AVAILABLE, DIA_AVAILABLE
-
-# Backward compatibility class
-class TTSEngine:
-    """Legacy TTSEngine class for backward compatibility
-
-    This class is maintained for backward compatibility with existing code.
-    New code should use the factory pattern implementation directly.
-    """
-
-    def __init__(self, lang_code='z'):
-        """Initialize TTS Engine using the factory pattern
-
-        Args:
-            lang_code (str): Language code ('a' for US English, 'b' for British English,
-                             'j' for Japanese, 'z' for Mandarin Chinese)
-        """
-        logger.info("Initializing legacy TTSEngine wrapper")
-        logger.info(f"Available engines - Kokoro: {KOKORO_AVAILABLE}, Dia: {DIA_AVAILABLE}")
-
-        # Create the appropriate engine using the factory
-        self._engine = TTSFactory.create_engine(lang_code=lang_code)
-
-        # Set engine_type for backward compatibility
-        engine_class = self._engine.__class__.__name__
-        if 'Kokoro' in engine_class and 'Space' in engine_class:
-            self.engine_type = "kokoro_space"
-        elif 'Kokoro' in engine_class:
-            self.engine_type = "kokoro"
-        elif 'Dia' in engine_class:
-            self.engine_type = "dia"
-        else:
-            self.engine_type = "dummy"
-
-        # Set pipeline and client attributes for backward compatibility
-        self.pipeline = getattr(self._engine, 'pipeline', None)
-        self.client = getattr(self._engine, 'client', None)
-
-        logger.info(f"Legacy TTSEngine wrapper initialized with engine type: {self.engine_type}")
-
-    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
-        """Generate speech from text using available TTS engine
-
-        Args:
-            text (str): Input text to synthesize
-            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.)
-            speed (float): Speech speed multiplier (0.5 to 2.0)
-
-        Returns:
-            str: Path to the generated audio file
-        """
-        logger.info(f"Legacy TTSEngine wrapper calling generate_speech for text length: {len(text)}")
-        return self._engine.generate_speech(text, voice, speed)
-
-    def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0):
-        """Generate speech from text and yield each segment
-
-        Args:
-            text (str): Input text to synthesize
-            voice (str): Voice ID to use
-            speed (float): Speech speed multiplier
-
-        Yields:
-            tuple: (sample_rate, audio_data) pairs for each segment
-        """
-        logger.info(f"Legacy TTSEngine wrapper calling generate_speech_stream for text length: {len(text)}")
-        yield from self._engine.generate_speech_stream(text, voice, speed)
-
-    # For backward compatibility
-    def _generate_dummy_audio(self, output_path):
-        """Generate a dummy audio file with a simple sine wave (backward compatibility)
-
-        Args:
-            output_path (str): Path to save the dummy audio file
-
-        Returns:
-            str: Path to the generated dummy audio file
-        """
-        from utils.tts_base import DummyTTSEngine
-        dummy_engine = DummyTTSEngine()
-        return dummy_engine.generate_speech("", "", 1.0)
-
-    # For backward compatibility
-    def _generate_dummy_audio_stream(self):
-        """Generate dummy audio chunks (backward compatibility)
-
-        Yields:
-            tuple: (sample_rate, audio_data) pairs for each dummy segment
-        """
-        from utils.tts_base import DummyTTSEngine
-        dummy_engine = DummyTTSEngine()
-        yield from dummy_engine.generate_speech_stream("", "", 1.0)
-
-# Import the new implementations from tts_base
-# These functions are already defined in tts_base.py and imported at the top of this file
-# They are kept here as comments for reference
-
-# def get_tts_engine(lang_code='a'):
-#     """Get or create TTS engine instance
-#
-#     Args:
-#         lang_code (str): Language code for the pipeline
-#
-#     Returns:
-#         TTSEngineBase: Initialized TTS engine instance
-#     """
-#     # Implementation moved to tts_base.py
-#     pass
-
-# def generate_speech(text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
-#     """Public interface for TTS generation
-#
-#     Args:
-#         text (str): Input text to synthesize
-#         voice (str): Voice ID to use
-#         speed (float): Speech speed multiplier
-#
-#     Returns:
-#         str: Path to generated audio file
-#     """
-#     # Implementation moved to tts_base.py
-#     pass
+# Import the factory pattern implementation
+from utils.tts_factory import TTSFactory
+
+# Import base classes
+from utils.tts_base import TTSEngineBase, DummyTTSEngine
+
+# Import engine-specific modules
+from utils.tts_engines import (
+    get_available_engines,
+    create_engine,
+    KokoroTTSEngine,
+    KokoroSpaceTTSEngine,
+    DiaTTSEngine
+)
+
+# Import legacy functions for backward compatibility
+from utils.tts_kokoro import generate_speech as kokoro_generate_speech
+from utils.tts_kokoro_space import generate_speech as kokoro_space_generate_speech
+from utils.tts_dia import generate_speech as dia_generate_speech
+
+# Convenience function to get the best available TTS engine
+def get_best_engine(lang_code: str = 'z') -> TTSEngineBase:
+    """Get the best available TTS engine
+
+    Args:
+        lang_code (str): Language code for the engine
+
+    Returns:
+        TTSEngineBase: An instance of the best available TTS engine
+    """
+    return TTSFactory.create_engine(None, lang_code)
+
+# Legacy function for backward compatibility
+def generate_speech(text: str, language: str = "z", voice: str = "af_heart", speed: float = 1.0) -> str:
+    """Generate speech using the best available TTS engine
+
+    This is a legacy function maintained for backward compatibility.
+    New code should use the factory pattern implementation directly.
+
+    Args:
+        text (str): Input text to synthesize
+        language (str): Language code
+        voice (str): Voice ID to use
+        speed (float): Speech speed multiplier
+
+    Returns:
+        str: Path to the generated audio file
+    """
+    engine = get_best_engine(language)
+    return engine.generate_speech(text, voice, speed)
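
A short usage sketch against the refactored `utils/tts.py` surface shown above; which engine actually backs the call depends on which optional packages are installed, and the sample text and printed paths are arbitrary examples:

```python
# Hedged example: uses only names exported by utils/tts.py above.
from utils.tts import generate_speech, get_best_engine, get_available_engines

print(get_available_engines())  # engines usable in this environment

# Legacy path: a single call picks the best available engine internally
wav_path = generate_speech("你好，世界", language="z", voice="af_heart", speed=1.0)
print(wav_path)

# Factory path: hold on to an engine instance and reuse it for several calls
engine = get_best_engine(lang_code="z")
wav_path = engine.generate_speech("你好，世界", "af_heart", 1.0)
print(wav_path)
```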
utils/tts_base.py CHANGED
@@ -143,10 +143,4 @@ class DummyTTSEngine(TTSEngineBase):
             t = np.linspace(0, duration, int(sample_rate * duration), False)
             freq = 440 + (i * 220)  # Different frequency for each chunk
             tone = np.sin(2 * np.pi * freq * t) * 0.3
-            yield sample_rate, tone
-
-
-# Factory functionality moved to tts_factory.py to avoid circular imports
-
-
-# Note: Backward compatibility functions moved to tts_factory.py
+            yield sample_rate, tone
utils/tts_factory.py CHANGED
@@ -49,70 +49,4 @@ class TTSFactory:
 
         # Fall back to dummy engine
         logger.warning("No TTS engines available, falling back to dummy engine")
-        return DummyTTSEngine(lang_code)
-
-
-# Backward compatibility function
-def get_tts_engine(lang_code: str = 'a') -> TTSEngineBase:
-    """Get or create TTS engine instance (backward compatibility function)
-
-    Args:
-        lang_code (str): Language code for the pipeline
-
-    Returns:
-        TTSEngineBase: Initialized TTS engine instance
-    """
-    logger.info(f"Requesting TTS engine with language code: {lang_code}")
-    try:
-        import streamlit as st
-        logger.info("Streamlit detected, using cached TTS engine")
-        @st.cache_resource
-        def _get_engine():
-            logger.info("Creating cached TTS engine instance")
-            engine = TTSFactory.create_engine(lang_code=lang_code)
-            logger.info(f"Cached TTS engine created with type: {engine.__class__.__name__}")
-            return engine
-
-        engine = _get_engine()
-        logger.info(f"Retrieved TTS engine from cache with type: {engine.__class__.__name__}")
-        return engine
-    except ImportError:
-        logger.info("Streamlit not available, creating direct TTS engine instance")
-        engine = TTSFactory.create_engine(lang_code=lang_code)
-        logger.info(f"Direct TTS engine created with type: {engine.__class__.__name__}")
-        return engine
-
-
-# Backward compatibility function
-def generate_speech(text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
-    """Public interface for TTS generation (backward compatibility function)
-
-    Args:
-        text (str): Input text to synthesize
-        voice (str): Voice ID to use
-        speed (float): Speech speed multiplier
-
-    Returns:
-        str: Path to generated audio file
-    """
-    logger.info(f"Public generate_speech called with text length: {len(text)}, voice: {voice}, speed: {speed}")
-    try:
-        # Get the TTS engine
-        logger.info("Getting TTS engine instance")
-        engine = get_tts_engine()
-        logger.info(f"Using TTS engine type: {engine.__class__.__name__}")
-
-        # Generate speech
-        logger.info("Calling engine.generate_speech")
-        output_path = engine.generate_speech(text, voice, speed)
-        logger.info(f"Speech generation complete, output path: {output_path}")
-        return output_path
-    except Exception as e:
-        logger.error(f"Error in public generate_speech function: {str(e)}", exc_info=True)
-        logger.error(f"Error type: {type(e).__name__}")
-        if hasattr(e, '__traceback__'):
-            tb = e.__traceback__
-            while tb.tb_next:
-                tb = tb.tb_next
-            logger.error(f"Error occurred in file: {tb.tb_frame.f_code.co_filename}, line {tb.tb_lineno}")
-        raise
+        return DummyTTSEngine(lang_code)
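
The removed `get_tts_engine` above also handled Streamlit resource caching. A minimal sketch of how a caller could keep that behaviour on top of the remaining factory API; the `cached_tts_engine` name and its placement are assumptions, not part of this commit:

```python
# Hypothetical caller-side helper mirroring the removed get_tts_engine logic.
from utils.tts_base import TTSEngineBase
from utils.tts_factory import TTSFactory


def cached_tts_engine(lang_code: str = 'z') -> TTSEngineBase:
    """Return a TTS engine, cached per process when Streamlit is available."""
    try:
        import streamlit as st

        @st.cache_resource
        def _build(code: str) -> TTSEngineBase:
            return TTSFactory.create_engine(None, code)

        return _build(lang_code)
    except ImportError:
        # No Streamlit: build a fresh engine straight from the factory
        return TTSFactory.create_engine(None, lang_code)
```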
utils/tts_kokoro.py ADDED
@@ -0,0 +1,106 @@
+import os
+import time
+import logging
+import numpy as np
+import soundfile as sf
+from typing import Optional, Tuple, Generator
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Constants
+DEFAULT_SAMPLE_RATE = 24000
+
+# Global model instance (lazy loaded)
+_pipeline = None
+
+
+def _get_pipeline(lang_code: str = 'z'):
+    """Lazy-load the Kokoro pipeline to avoid loading it until needed"""
+    global _pipeline
+    if _pipeline is None:
+        logger.info("Loading Kokoro pipeline...")
+        try:
+            # Import Kokoro
+            from kokoro import KPipeline
+
+            # Initialize the pipeline
+            logger.info(f"Initializing Kokoro pipeline with language code: {lang_code}")
+            _pipeline = KPipeline(lang_code=lang_code)
+
+            # Log pipeline details
+            logger.info("Kokoro pipeline loaded successfully")
+            logger.info(f"Pipeline type: {type(_pipeline).__name__}")
+        except ImportError as import_err:
+            logger.error(f"Import error loading Kokoro pipeline: {import_err}")
+            logger.error("This may indicate missing dependencies")
+            raise
+        except Exception as e:
+            logger.error(f"Error loading Kokoro pipeline: {e}", exc_info=True)
+            logger.error(f"Error type: {type(e).__name__}")
+            raise
+    return _pipeline
+
+
+def generate_speech(text: str, language: str = "z", voice: str = "af_heart", speed: float = 1.0) -> str:
+    """Public interface for TTS generation using Kokoro model
+
+    This is a legacy function maintained for backward compatibility.
+    New code should use the factory pattern implementation directly.
+
+    Args:
+        text (str): Input text to synthesize
+        language (str): Language code ('a' for US English, 'b' for British English,
+                        'j' for Japanese, 'z' for Mandarin Chinese)
+        voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.)
+        speed (float): Speech speed multiplier (0.5 to 2.0)
+
+    Returns:
+        str: Path to the generated audio file
+    """
+    logger.info(f"Legacy Kokoro generate_speech called with text length: {len(text)}")
+
+    # Use the new implementation via factory pattern
+    from utils.tts_engines import KokoroTTSEngine
+
+    try:
+        # Create a Kokoro engine and generate speech
+        kokoro_engine = KokoroTTSEngine(language)
+        return kokoro_engine.generate_speech(text, voice, speed)
+    except Exception as e:
+        logger.error(f"Error in legacy Kokoro generate_speech: {str(e)}", exc_info=True)
+        # Fall back to dummy TTS
+        from utils.tts_base import DummyTTSEngine
+        dummy_engine = DummyTTSEngine()
+        return dummy_engine.generate_speech(text)
+
+
+def generate_speech_stream(text: str, language: str = "z", voice: str = "af_heart", speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
+    """Generate speech stream using Kokoro TTS engine
+
+    Args:
+        text (str): Input text to synthesize
+        language (str): Language code
+        voice (str): Voice ID to use
+        speed (float): Speech speed multiplier
+
+    Yields:
+        tuple: (sample_rate, audio_data) pairs for each segment
+    """
+    logger.info(f"Generating speech stream with Kokoro for text length: {len(text)}")
+
+    try:
+        # Get the Kokoro pipeline
+        pipeline = _get_pipeline(language)
+
+        # Generate speech stream
+        generator = pipeline(text, voice=voice, speed=speed)
+        for _, _, audio in generator:
+            yield DEFAULT_SAMPLE_RATE, audio
+    except Exception as e:
+        logger.error(f"Error in Kokoro generate_speech_stream: {str(e)}", exc_info=True)
+        # Fall back to dummy TTS
+        from utils.tts_base import DummyTTSEngine
+        dummy_engine = DummyTTSEngine()
+        yield from dummy_engine.generate_speech_stream(text)
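
A brief sketch of consuming the streaming interface above: the yielded `(sample_rate, audio)` segments are concatenated and written with soundfile. The output path and sample text are arbitrary examples, and the fallback rate simply mirrors `DEFAULT_SAMPLE_RATE`:

```python
# Hedged example: stitch generate_speech_stream() segments into one WAV file.
import os

import numpy as np
import soundfile as sf

from utils.tts_kokoro import generate_speech_stream

chunks = []
sample_rate = 24000  # fallback; overwritten by the rate each segment reports
for sample_rate, audio in generate_speech_stream("你好，世界", language="z", voice="af_heart"):
    chunks.append(np.asarray(audio, dtype=np.float32))

if chunks:
    os.makedirs("temp/outputs", exist_ok=True)
    sf.write("temp/outputs/kokoro_stream_demo.wav", np.concatenate(chunks), sample_rate)
```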
utils/tts_kokoro_space.py ADDED
@@ -0,0 +1,100 @@
+import os
+import time
+import logging
+import numpy as np
+import soundfile as sf
+from typing import Optional, Tuple, Generator
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Constants
+DEFAULT_SAMPLE_RATE = 24000
+
+# Global client instance (lazy loaded)
+_client = None
+
+
+def _get_client():
+    """Lazy-load the Kokoro Space client to avoid loading it until needed"""
+    global _client
+    if _client is None:
+        logger.info("Loading Kokoro Space client...")
+        try:
+            # Import gradio client
+            from gradio_client import Client
+
+            # Initialize the client
+            logger.info("Initializing Kokoro Space client")
+            _client = Client("Remsky/Kokoro-TTS-Zero")
+
+            # Log client details
+            logger.info("Kokoro Space client loaded successfully")
+            logger.info(f"Client type: {type(_client).__name__}")
+        except ImportError as import_err:
+            logger.error(f"Import error loading Kokoro Space client: {import_err}")
+            logger.error("This may indicate missing dependencies")
+            raise
+        except Exception as e:
+            logger.error(f"Error loading Kokoro Space client: {e}", exc_info=True)
+            logger.error(f"Error type: {type(e).__name__}")
+            raise
+    return _client
+
+
+def generate_speech(text: str, language: str = "z", voice: str = "af_nova", speed: float = 1.0) -> str:
+    """Public interface for TTS generation using Kokoro Space
+
+    This is a legacy function maintained for backward compatibility.
+    New code should use the factory pattern implementation directly.
+
+    Args:
+        text (str): Input text to synthesize
+        language (str): Language code (not used in Kokoro Space, kept for API compatibility)
+        voice (str): Voice ID to use (e.g., 'af_nova', 'af_bella', etc.)
+        speed (float): Speech speed multiplier (0.5 to 2.0)
+
+    Returns:
+        str: Path to the generated audio file
+    """
+    logger.info(f"Legacy Kokoro Space generate_speech called with text length: {len(text)}")
+
+    # Use the new implementation via factory pattern
+    from utils.tts_engines import KokoroSpaceTTSEngine
+
+    try:
+        # Create a Kokoro Space engine and generate speech
+        kokoro_space_engine = KokoroSpaceTTSEngine(language)
+        return kokoro_space_engine.generate_speech(text, voice, speed)
+    except Exception as e:
+        logger.error(f"Error in legacy Kokoro Space generate_speech: {str(e)}", exc_info=True)
+        # Fall back to dummy TTS
+        from utils.tts_base import DummyTTSEngine
+        dummy_engine = DummyTTSEngine()
+        return dummy_engine.generate_speech(text)
+
+
+def _create_output_dir() -> str:
+    """Create output directory for audio files
+
+    Returns:
+        str: Path to the output directory
+    """
+    output_dir = "temp/outputs"
+    os.makedirs(output_dir, exist_ok=True)
+    return output_dir
+
+
+def _generate_output_path(prefix: str = "output") -> str:
+    """Generate a unique output path for audio files
+
+    Args:
+        prefix (str): Prefix for the output filename
+
+    Returns:
+        str: Path to the output file
+    """
+    output_dir = _create_output_dir()
+    timestamp = int(time.time())
+    return f"{output_dir}/{prefix}_{timestamp}.wav"