Spaces:
Running
Running
File size: 4,142 Bytes
9874d4d 9f79da5 9874d4d 9f79da5 9874d4d 9f79da5 9874d4d 9f79da5 9874d4d 9f79da5 9874d4d 9f79da5 9874d4d 9f79da5 9874d4d 9f79da5 9874d4d 9f79da5 9874d4d 9f79da5 9874d4d 9f79da5 9874d4d 9f79da5 9874d4d 9f79da5 9874d4d 9f79da5 9874d4d 9f79da5 9874d4d 9f79da5 9874d4d 9f79da5 9874d4d 9f79da5 9874d4d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
"""
ElevenLabs TTS Implementation
"""
import httpx
from typing import Optional, Dict
from tts_interface import TTSInterface
from utils.logger import log_info, log_error, log_debug, log_warning
class ElevenLabsTTS(TTSInterface):
    """ElevenLabs text-to-speech provider.

    Posts text to ``{base_url}/text-to-speech/{voice_id}`` and returns the
    response body as audio data.
    """

    # NOTE(review): the leading glyphs in the log messages below ("π", "π€",
    # "β") look like mis-decoded emoji; confirm against the upstream file
    # before normalising them.

    # Request defaults, used when the caller does not override them via kwargs.
    _DEFAULT_MODEL_ID = "eleven_multilingual_v2"
    _DEFAULT_OUTPUT_FORMAT = "mp3_44100_128"
    _REQUEST_TIMEOUT_SECONDS = 30.0
    _DEFAULT_VOICE_SETTINGS = {
        "stability": 1,
        "similarity_boost": 0.85,
        "style": 0.7,
        "speed": 1.14,
        "use_speaker_boost": True,
    }

    def __init__(self, api_key: str):
        """Store the API key and provider defaults.

        Args:
            api_key: ElevenLabs API key; surrounding whitespace is stripped.
        """
        super().__init__()
        self.api_key = api_key.strip()
        self.base_url = "https://api.elevenlabs.io/v1"
        self.default_voice_id = "2thYbn2sOGtiTwd9QwWH"  # Avencia

        # ElevenLabs preprocessing needs
        self.preprocessing_flags = {
            "PREPROCESS_NUMBERS",   # Large numbers
            "PREPROCESS_CURRENCY",  # Currency amounts
            "PREPROCESS_TIME",      # Time format
            "PREPROCESS_CODES",     # PNR/codes
            "PREPROCESS_PHONE",     # Phone numbers
        }

        # Log only a masked preview, built from the *stripped* key so the
        # preview and the length check match what is actually sent.
        key = self.api_key
        masked_key = f"{key[:4]}...{key[-4:]}" if len(key) > 8 else "***"
        log_debug(f"π ElevenLabsTTS initialized with key: {masked_key}")

    async def synthesize(self, text: str, voice_id: Optional[str] = None, **kwargs) -> bytes:
        """Convert text to speech using the ElevenLabs API.

        Args:
            text: Text to synthesize.
            voice_id: ElevenLabs voice id; falls back to ``default_voice_id``
                when ``None`` or empty.
            **kwargs: Optional overrides:
                ``model_id`` (default ``"eleven_multilingual_v2"``),
                ``voice_settings`` (dict replacing the default settings),
                ``output_format`` (default ``"mp3_44100_128"``).

        Returns:
            The audio payload from the HTTP response body.

        Raises:
            httpx.HTTPStatusError: The API answered with a 4xx/5xx status.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        try:
            voice = voice_id or self.default_voice_id
            url = f"{self.base_url}/text-to-speech/{voice}"

            headers = {
                "xi-api-key": self.api_key,
                "Content-Type": "application/json",
            }

            # Copy the default settings so a caller mutating the payload can
            # never alter the shared class-level defaults.
            data = {
                "text": text,
                "model_id": kwargs.get("model_id", self._DEFAULT_MODEL_ID),
                "voice_settings": kwargs.get(
                    "voice_settings", dict(self._DEFAULT_VOICE_SETTINGS)
                ),
            }

            # output_format travels as a query parameter, not in the JSON body.
            params = {
                "output_format": kwargs.get(
                    "output_format", self._DEFAULT_OUTPUT_FORMAT
                )
            }

            log_debug(f"π€ Calling ElevenLabs TTS for {len(text)} characters")

            async with httpx.AsyncClient(timeout=self._REQUEST_TIMEOUT_SECONDS) as client:
                response = await client.post(
                    url,
                    headers=headers,
                    json=data,
                    params=params,
                )
                response.raise_for_status()

                audio_data = response.content  # This should be bytes

                # Defensive fallback: if a string ever shows up, try to treat
                # it as base64-encoded audio.
                if isinstance(audio_data, str):
                    log_warning("ElevenLabs returned string instead of bytes")
                    # Imported locally because the module's import block does
                    # not provide base64.
                    import base64

                    try:
                        audio_data = base64.b64decode(audio_data)
                    except ValueError as decode_error:
                        # binascii.Error (bad padding/alphabet) subclasses
                        # ValueError. Stay best-effort: keep the payload as
                        # received, but do not hide the failure.
                        log_warning(
                            f"Could not base64-decode ElevenLabs payload: {decode_error}"
                        )

                log_debug(f"β ElevenLabs TTS returned {len(audio_data)} bytes")
                log_debug(f"Audio data type: {type(audio_data)}")

                return audio_data

        except httpx.HTTPStatusError as e:
            log_error(f"β ElevenLabs API error: {e.response.status_code} - {e.response.text}")
            raise
        except Exception as e:
            log_error("β TTS synthesis error", e)
            raise

    def get_supported_voices(self) -> Dict[str, str]:
        """Return a static mapping of voice id to display name.

        This is a hard-coded default list; the full list can be fetched from
        the API.
        """
        return {
            "2thYbn2sOGtiTwd9QwWH": "Avencia (Female - Turkish)",
            "21m00Tcm4TlvDq8ikWAM": "Rachel (Female)",
            "EXAVITQu4vr4xnSDxMaL": "Bella (Female)",
            "ErXwobaYiN019PkySvjV": "Antoni (Male)",
            "VR6AewLTigWG4xSOukaG": "Arnold (Male)",
            "pNInz6obpgDQGcFmaJgB": "Adam (Male)",
            "yoZ06aMxZJJ28mfd3POQ": "Sam (Male)",
        }

    def get_provider_name(self) -> str:
        """Return the provider identifier string."""
        return "elevenlabs"