File size: 4,142 Bytes
9874d4d
 
 
 
 
 
9f79da5
9874d4d
 
 
9f79da5
9874d4d
 
 
 
 
9f79da5
9874d4d
 
 
 
 
 
 
 
9f79da5
9874d4d
 
 
9f79da5
9874d4d
 
 
 
 
9f79da5
9874d4d
 
 
 
9f79da5
9874d4d
 
 
 
 
 
9f79da5
 
9874d4d
 
 
 
9f79da5
9874d4d
 
 
 
 
9f79da5
9874d4d
9f79da5
9874d4d
 
 
 
 
 
 
9f79da5
9874d4d
 
9f79da5
9874d4d
 
 
 
 
 
 
 
9f79da5
9874d4d
 
9f79da5
9874d4d
9f79da5
9874d4d
 
 
 
 
 
9f79da5
9874d4d
 
 
 
 
 
 
 
 
 
 
9f79da5
9874d4d
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
"""

ElevenLabs TTS Implementation

"""
import httpx
from typing import Optional, Dict
from tts_interface import TTSInterface
from utils.logger import log_info, log_error, log_debug, log_warning

class ElevenLabsTTS(TTSInterface):
    """TTS provider that synthesizes speech through the ElevenLabs HTTP API.

    Each call to :meth:`synthesize` issues one POST request to
    ``{base_url}/text-to-speech/{voice_id}`` and returns the response body.
    """

    def __init__(self, api_key: str):
        """Store the API key and the provider defaults.

        Args:
            api_key: ElevenLabs API key. Surrounding whitespace is stripped
                before the key is stored.
        """
        super().__init__()
        self.api_key = api_key.strip()
        self.base_url = "https://api.elevenlabs.io/v1"
        self.default_voice_id = "2thYbn2sOGtiTwd9QwWH"  # Avencia

        # Text preprocessing steps wanted for this provider.
        # NOTE(review): presumably consumed by the base class or the caller;
        # nothing in this class reads the set - confirm.
        self.preprocessing_flags = {
            "PREPROCESS_NUMBERS",    # Large numbers
            "PREPROCESS_CURRENCY",   # Currency amounts
            "PREPROCESS_TIME",       # Time format
            "PREPROCESS_CODES",      # PNR/codes
            "PREPROCESS_PHONE"       # Phone numbers
        }

        # Mask the stripped key (the one actually sent), so the debug line
        # never shows padding whitespace in place of the real key edges.
        key = self.api_key
        masked_key = f"{key[:4]}...{key[-4:]}" if len(key) > 8 else "***"
        log_debug(f"πŸ”‘ ElevenLabsTTS initialized with key: {masked_key}")

    async def synthesize(self, text: str, voice_id: Optional[str] = None, **kwargs) -> bytes:
        """Convert text to speech using the ElevenLabs API.

        Args:
            text: Text to synthesize.
            voice_id: ElevenLabs voice id. Falls back to
                ``self.default_voice_id`` when ``None`` or empty.
            **kwargs: Optional overrides:
                ``model_id`` (default ``"eleven_multilingual_v2"``),
                ``voice_settings`` (dict replacing the default settings),
                ``output_format`` (default ``"mp3_44100_128"``, sent as a
                query parameter).

        Returns:
            The response body returned by the API.

        Raises:
            httpx.HTTPStatusError: The API answered with an error status;
                the status code and body are logged before re-raising.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        try:
            voice = voice_id or self.default_voice_id
            url = f"{self.base_url}/text-to-speech/{voice}"

            headers = {
                "xi-api-key": self.api_key,
                "Content-Type": "application/json"
            }

            # Request body; callers may override the model and voice settings.
            data = {
                "text": text,
                "model_id": kwargs.get("model_id", "eleven_multilingual_v2"),
                "voice_settings": kwargs.get("voice_settings", {
                    "stability": 1,
                    "similarity_boost": 0.85,
                    "style": 0.7,
                    "speed": 1.14,
                    "use_speaker_boost": True
                })
            }

            # The output format travels in the query string, not in the body.
            params = {"output_format": kwargs.get("output_format", "mp3_44100_128")}

            log_debug(f"🎀 Calling ElevenLabs TTS for {len(text)} characters")

            async with httpx.AsyncClient(timeout=30.0) as client:
                response = await client.post(
                    url,
                    headers=headers,
                    json=data,
                    params=params
                )

                response.raise_for_status()
                audio_data = response.content  # This should be bytes

                # Defensive fallback: if a str ever shows up, try to treat it
                # as base64. Best effort only - on failure the value is
                # returned as received, but the failure is no longer silent.
                if isinstance(audio_data, str):
                    log_warning("ElevenLabs returned string instead of bytes")
                    # Local import: this fallback is the only user of base64.
                    import base64
                    try:
                        audio_data = base64.b64decode(audio_data)
                    except ValueError:
                        # binascii.Error (bad padding) is a ValueError
                        # subclass; non-ASCII input raises ValueError too.
                        log_warning("ElevenLabs string response is not valid base64; returning it unchanged")

                log_debug(f"βœ… ElevenLabs TTS returned {len(audio_data)} bytes")
                log_debug(f"Audio data type: {type(audio_data)}")

                return audio_data

        except httpx.HTTPStatusError as e:
            log_error(f"❌ ElevenLabs API error: {e.response.status_code} - {e.response.text}")
            raise
        except Exception as e:
            log_error("❌ TTS synthesis error", e)
            raise

    def get_supported_voices(self) -> Dict[str, str]:
        """Return a built-in mapping of voice id to display name.

        This is a fixed default list; the full list can be fetched from the
        API. A new dict is built on every call.
        """
        return {
            "2thYbn2sOGtiTwd9QwWH": "Avencia (Female - Turkish)",
            "21m00Tcm4TlvDq8ikWAM": "Rachel (Female)",
            "EXAVITQu4vr4xnSDxMaL": "Bella (Female)",
            "ErXwobaYiN019PkySvjV": "Antoni (Male)",
            "VR6AewLTigWG4xSOukaG": "Arnold (Male)",
            "pNInz6obpgDQGcFmaJgB": "Adam (Male)",
            "yoZ06aMxZJJ28mfd3POQ": "Sam (Male)",
        }

    def get_provider_name(self) -> str:
        """Return the provider identifier string, ``"elevenlabs"``."""
        return "elevenlabs"