# Captured from a web file viewer; the lines below the imports are the file itself.
# Viewer status at capture time: "Spaces: Build error". File size: 4,326 bytes.
# Blame column alternated between commits 7495571 and 3ed3b5a.
# (The viewer's line-number gutter, 1-124, was page chrome and is omitted.)
import logging
import os
import time
import numpy as np
import soundfile as sf
from typing import Optional, Generator, Tuple, List
from abc import ABC, abstractmethod
# Module-level logger named after this module. This line only obtains the
# logger; it does not attach handlers or set a level.
logger = logging.getLogger(__name__)
class TTSBase(ABC):
    """Base class for all TTS engines.

    This abstract class defines the interface that all TTS engines must
    implement: ``generate_speech`` (writes audio to a file and returns its
    path) and ``generate_speech_stream`` (yields audio segments). It also
    provides ``_generate_output_path`` so subclasses build output file names
    the same way.
    """

    def __init__(self, lang_code: str = 'z'):
        """Initialize the TTS engine.

        Args:
            lang_code (str): Language code for the engine. It is stored on the
                instance as ``self.lang_code``; this base class does not
                interpret it.
        """
        self.lang_code = lang_code

    @abstractmethod
    def generate_speech(
        self, text: str, voice: str = 'default', speed: float = 1.0
    ) -> Optional[str]:
        """Generate speech from text.

        Args:
            text (str): Input text to synthesize.
            voice (str): Voice ID to use.
            speed (float): Speech speed multiplier.

        Returns:
            Optional[str]: Path to the generated audio file, or None if
            generation fails.
        """

    @abstractmethod
    def generate_speech_stream(
        self, text: str, voice: str = 'default', speed: float = 1.0
    ) -> Generator[Tuple[int, np.ndarray], None, None]:
        """Generate a speech stream from text.

        Args:
            text (str): Input text to synthesize.
            voice (str): Voice ID to use.
            speed (float): Speech speed multiplier.

        Yields:
            tuple: ``(sample_rate, audio_data)`` pairs, one per segment.
        """

    def _generate_output_path(self, prefix: str = "tts", extension: str = "wav") -> str:
        """Generate a unique output path for an audio file.

        The file itself is not created; only the ``output`` directory under
        the current working directory is created if it is missing.

        Args:
            prefix (str): Prefix for the filename.
            extension (str): File extension, with or without a leading dot.

        Returns:
            str: Path of the form
            ``<cwd>/output/<prefix>_<ms timestamp>_<random hex>.<extension>``.
        """
        timestamp = int(time.time() * 1000)
        # A millisecond timestamp alone repeats when two calls land in the same
        # millisecond, which would make the later write overwrite the earlier
        # file. A short random suffix keeps the paths distinct.
        nonce = os.urandom(4).hex()
        # Accept ".wav" as well as "wav" without producing a double dot.
        suffix = extension.lstrip(".")
        filename = f"{prefix}_{timestamp}_{nonce}.{suffix}"
        output_dir = os.path.join(os.getcwd(), "output")
        os.makedirs(output_dir, exist_ok=True)
        return os.path.join(output_dir, filename)
class DummyTTS(TTSBase):
    """Dummy TTS engine that generates sine wave audio.

    This class is used as a fallback when no other TTS engine is available.
    It does not synthesize speech: it produces a fixed-pitch tone whose
    length grows with the length of the input text, up to a cap.
    """

    # Parameters of the placeholder tone, shared by the file and stream paths.
    _SAMPLE_RATE = 24000          # samples per second
    _TONE_FREQUENCY_HZ = 440      # pitch of the sine wave
    _AMPLITUDE = 0.5              # peak amplitude of the sine wave
    _CHARS_PER_SECOND = 20        # rough approximation of speech rate
    _MAX_DURATION_SECONDS = 10    # upper bound on the tone length

    def _synthesize_tone(self, text: str) -> Tuple[int, np.ndarray]:
        """Build the placeholder sine wave for *text*.

        Only the length of ``text`` is used. An empty string gives a duration
        of zero and therefore a zero-length array.

        Args:
            text (str): Input text; its length sets the tone duration.

        Returns:
            tuple: ``(sample_rate, audio_data)``.
        """
        duration = min(len(text) / self._CHARS_PER_SECOND, self._MAX_DURATION_SECONDS)
        sample_count = int(self._SAMPLE_RATE * duration)
        t = np.linspace(0, duration, sample_count, endpoint=False)
        audio = self._AMPLITUDE * np.sin(2 * np.pi * self._TONE_FREQUENCY_HZ * t)
        return self._SAMPLE_RATE, audio

    def generate_speech(self, text: str, voice: str = 'default', speed: float = 1.0) -> str:
        """Generate a dummy sine wave audio file.

        Args:
            text (str): Input text; only its length is used.
            voice (str): Voice ID (not used).
            speed (float): Speech speed multiplier (not used).

        Returns:
            str: Path to the generated audio file.
        """
        logger.info("Generating dummy speech for text length: %s", len(text))
        sample_rate, audio = self._synthesize_tone(text)

        # Save to file
        output_path = self._generate_output_path(prefix="dummy")
        sf.write(output_path, audio, sample_rate)
        logger.info("Generated dummy audio: %s", output_path)
        return output_path

    def generate_speech_stream(
        self, text: str, voice: str = 'default', speed: float = 1.0
    ) -> Generator[Tuple[int, np.ndarray], None, None]:
        """Generate a dummy sine wave audio stream.

        The whole tone is yielded as a single segment.

        Args:
            text (str): Input text; only its length is used.
            voice (str): Voice ID (not used).
            speed (float): Speech speed multiplier (not used).

        Yields:
            tuple: ``(sample_rate, audio_data)`` pairs.
        """
        logger.info("Generating dummy speech stream for text length: %s", len(text))
        sample_rate, audio = self._synthesize_tone(text)
        yield sample_rate, audio