Spaces:

mgbam
/

docappointemet

Runtime error

App Files Files Community

mgbam commited on 2 days ago

Commit

1effb75

verified ·

1 Parent(s): 228cd9c

Update gemini_tts.py

Browse files

Files changed (1) hide show

gemini_tts.py +96 -147

gemini_tts.py CHANGED Viewed

@@ -12,16 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import google.generativeai as genai
 import os
 import struct
 import re
 import logging
-from cache import cache
-# Add these imports for MP3 conversion
 from pydub import AudioSegment
-import io
 # --- Constants ---
 GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
@@ -29,184 +29,133 @@ GENERATE_SPEECH = os.environ.get("GENERATE_SPEECH", "false").lower() == "true"
 TTS_MODEL = "gemini-2.5-flash-preview-tts"
 DEFAULT_RAW_AUDIO_MIME = "audio/L16;rate=24000"
-# --- Configuration ---
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 genai.configure(api_key=GEMINI_API_KEY)
 class TTSGenerationError(Exception):
-    """Custom exception for TTS generation failures."""
     pass
-# --- Helper functions for audio processing ---
-def parse_audio_mime_type(mime_type: str) -> dict[str, int | None]:
     """
-    Parses bits per sample and rate from an audio MIME type string.
-    e.g., "audio/L16;rate=24000" -> {"bits_per_sample": 16, "rate": 24000}
     """
-    bits_per_sample = 16  # Default
-    rate = 24000          # Default
-    parts = mime_type.split(";")
-    for param in parts:
         param = param.strip().lower()
         if param.startswith("rate="):
             try:
-                rate_str = param.split("=", 1)[1]
-                rate = int(rate_str)
-            except (ValueError, IndexError):
-                pass # Keep default if parsing fails
-        elif re.match(r"audio/l\d+", param): # Matches audio/L<digits>
-             try:
-                bits_str = param.split("l",1)[1]
-                bits_per_sample = int(bits_str)
-             except (ValueError, IndexError):
-                pass # Keep default
     return {"bits_per_sample": bits_per_sample, "rate": rate}
 def convert_to_wav(audio_data: bytes, mime_type: str) -> bytes:
-    """
-    Generates a WAV file header for the given raw audio data and parameters.
-    Assumes mono audio.
-    """
-    parameters = parse_audio_mime_type(mime_type)
-    bits_per_sample = parameters["bits_per_sample"]
-    sample_rate = parameters["rate"]
-    num_channels = 1  # Mono
-    data_size = len(audio_data)
-    bytes_per_sample = bits_per_sample // 8
     block_align = num_channels * bytes_per_sample
-    byte_rate = sample_rate * block_align
     chunk_size = 36 + data_size
     header = struct.pack(
         "<4sI4s4sIHHIIHH4sI",
-        b"RIFF", chunk_size, b"WAVE", b"fmt ",
-        16, 1, num_channels, sample_rate, byte_rate, block_align,
-        bits_per_sample, b"data", data_size
     )
     return header + audio_data
-# --- End of helper functions ---
-def _synthesize_gemini_tts_impl(text: str, gemini_voice_name: str) -> tuple[bytes, str]:
-    """
-    Synthesizes English text using the Gemini API via the google-genai library.
-    Returns a tuple: (processed_audio_data_bytes, final_mime_type).
-    Raises TTSGenerationError on failure.
-    """
     if not GENERATE_SPEECH:
-        # This should ideally not be hit if the logic outside this function is correct,
-        # but as a safeguard, we raise an error.
-        raise TTSGenerationError(
-            "GENERATE_SPEECH is not set. Please set it in your environment variables to generate speech."
-        )
     try:
         model = genai.GenerativeModel(TTS_MODEL)
-        generation_config = {
-            "response_modalities": ["AUDIO"],
-            "speech_config": {
-                "voice_config": {
-                    "prebuilt_voice_config": {
-                        "voice_name": gemini_voice_name
-                    }
-                }
-            }
-        }
         response = model.generate_content(
             contents=[text],
-            generation_config=generation_config,
         )
         audio_part = response.candidates[0].content.parts[0]
-        audio_data_bytes = audio_part.inline_data.data
-        final_mime_type = audio_part.inline_data.mime_type
     except Exception as e:
-        error_message = f"An unexpected error occurred with google-genai: {e}"
-        logging.error(error_message)
-        raise TTSGenerationError(error_message) from e
-    if not audio_data_bytes:
-        error_message = "No audio data was successfully retrieved or decoded."
-        logging.error(error_message)
-        raise TTSGenerationError(error_message)
-    # --- Audio processing ---
-    if final_mime_type:
-        final_mime_type_lower = final_mime_type.lower()
-        needs_wav_conversion = any(p in final_mime_type_lower for p in ("audio/l16", "audio/l24", "audio/l8")) or \
-                               not final_mime_type_lower.startswith(("audio/wav", "audio/mpeg", "audio/ogg", "audio/opus"))
-        if needs_wav_conversion:
-            processed_audio_data = convert_to_wav(audio_data_bytes, final_mime_type)
-            processed_audio_mime = "audio/wav"
-        else:
-            processed_audio_data = audio_data_bytes
-            processed_audio_mime = final_mime_type
-    else:
-        logging.warning("MIME type not determined. Assuming raw audio and attempting WAV conversion (defaulting to %s).", DEFAULT_RAW_AUDIO_MIME)
-        processed_audio_data = convert_to_wav(audio_data_bytes, DEFAULT_RAW_AUDIO_MIME)
-        processed_audio_mime = "audio/wav"
-    # --- MP3 compression ---
-    if processed_audio_data:
-        try:
-            # Load audio into AudioSegment
-            audio_segment = AudioSegment.from_file(io.BytesIO(processed_audio_data), format="wav")
-            mp3_buffer = io.BytesIO()
-            audio_segment.export(mp3_buffer, format="mp3")
-            mp3_bytes = mp3_buffer.getvalue()
-            return mp3_bytes, "audio/mpeg"
-        except Exception as e:
-            logging.warning("MP3 compression failed: %s. Falling back to WAV.", e)
-            # Fallback to WAV if MP3 conversion fails
-            return processed_audio_data, processed_audio_mime
-    else:
-        error_message = "Audio processing failed."
-        logging.error(error_message)
-        raise TTSGenerationError(error_message)
-# Always create the memoized function first, so we can access its .key() method
-_memoized_tts_func = cache.memoize()(_synthesize_gemini_tts_impl)
 if GENERATE_SPEECH:
-    def synthesize_gemini_tts_with_error_handling(*args, **kwargs) -> tuple[bytes | None, str | None]:
-        """
-        A wrapper for the memoized TTS function that catches errors and returns (None, None).
-        This makes the audio generation more resilient to individual failures.
-        """
         try:
-            # Attempt to get the audio from the cache or by generating it.
-            return _memoized_tts_func(*args, **kwargs)
         except TTSGenerationError as e:
-            # If generation fails, log the error and return None, None.
-            logging.error("Handled TTS Generation Error: %s. Continuing without audio for this segment.", e)
             return None, None
-    synthesize_gemini_tts = synthesize_gemini_tts_with_error_handling
 else:
-    # When not generating speech, create a read-only function that only
-    # checks the cache and does not generate new audio.
-    def read_only_synthesize_gemini_tts(*args, **kwargs):
-        """
-        Checks cache for a result, but never calls the underlying TTS function.
-        This is a 'read-only' memoization check.
-        """
-        # Generate the cache key using the memoized function's key method.
-        key = _memoized_tts_func.__cache_key__(*args, **kwargs)
-        # Check the cache directly using the generated key.
-        _sentinel = object()
-        result = cache.get(key, default=_sentinel)
-        if result is not _sentinel:
-            return result  # Cache hit
-        # Cache miss
-        logging.info("GENERATE_SPEECH is false and no cached result found for key: %s", key)
         return None, None
-    synthesize_gemini_tts = read_only_synthesize_gemini_tts

 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
 import struct
 import re
 import logging
+import io
+from typing import Optional, Dict, Tuple, Union
+import google.generativeai as genai
 from pydub import AudioSegment
+from cache import cache
 # --- Constants ---
 GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
 TTS_MODEL = "gemini-2.5-flash-preview-tts"
 DEFAULT_RAW_AUDIO_MIME = "audio/L16;rate=24000"
+# --- Setup ---
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 genai.configure(api_key=GEMINI_API_KEY)
 class TTSGenerationError(Exception):
+    """Raised when Gemini TTS generation fails."""
     pass
+def parse_audio_mime_type(mime_type: str) -> Dict[str, int]:
     """
+    Extracts bits_per_sample and sampling rate from a MIME string.
+    e.g. "audio/L16;rate=24000" → {"bits_per_sample": 16, "rate": 24000}
     """
+    bits_per_sample = 16
+    rate = 24000
+    for param in mime_type.split(";"):
         param = param.strip().lower()
         if param.startswith("rate="):
             try:
+                rate = int(param.split("=", 1)[1])
+            except ValueError:
+                pass
+        elif re.match(r"audio/l\d+", param):
+            try:
+                bits_per_sample = int(param.split("l", 1)[1])
+            except ValueError:
+                pass
     return {"bits_per_sample": bits_per_sample, "rate": rate}
 def convert_to_wav(audio_data: bytes, mime_type: str) -> bytes:
+    """Wrap raw PCM bytes in a WAV header for mono audio."""
+    params = parse_audio_mime_type(mime_type)
+    bits = params["bits_per_sample"]
+    rate = params["rate"]
+    num_channels = 1
+    bytes_per_sample = bits // 8
     block_align = num_channels * bytes_per_sample
+    byte_rate = rate * block_align
+    data_size = len(audio_data)
     chunk_size = 36 + data_size
     header = struct.pack(
         "<4sI4s4sIHHIIHH4sI",
+        b"RIFF",
+        chunk_size,
+        b"WAVE",
+        b"fmt ",
+        16,
+        1,
+        num_channels,
+        rate,
+        byte_rate,
+        block_align,
+        bits,
+        b"data",
+        data_size,
     )
     return header + audio_data
+@cache.memoize()
+def _synthesize_gemini_tts_impl(text: str, gemini_voice_name: str) -> Tuple[bytes, str]:
+    """Core function to request audio from Gemini TTS (cached)."""
     if not GENERATE_SPEECH:
+        raise TTSGenerationError("GENERATE_SPEECH not enabled in environment.")
     try:
         model = genai.GenerativeModel(TTS_MODEL)
         response = model.generate_content(
             contents=[text],
+            generation_config={
+                "response_modalities": ["AUDIO"],
+                "speech_config": {
+                    "voice_config": {
+                        "prebuilt_voice_config": {"voice_name": gemini_voice_name}
+                    }
+                }
+            },
         )
         audio_part = response.candidates[0].content.parts[0]
+        raw_data = audio_part.inline_data.data
+        mime = audio_part.inline_data.mime_type
     except Exception as e:
+        logging.error("Gemini TTS API error: %s", e)
+        raise TTSGenerationError(f"TTS request failed: {e}")
+    if not raw_data:
+        raise TTSGenerationError("Empty audio data from Gemini.")
+    # Convert raw audio to WAV if needed
+    mime_lower = mime.lower() if mime else ""
+    if mime_lower and (
+        mime_lower.startswith("audio/l")
+        or not mime_lower.startswith(("audio/wav", "audio/mpeg", "audio/ogg", "audio/opus"))
+    ):
+        raw_data = convert_to_wav(raw_data, mime_lower)
+        mime = "audio/wav"
+    elif not mime:
+        logging.warning("MIME missing; defaulting to WAV")
+        raw_data = convert_to_wav(raw_data, DEFAULT_RAW_AUDIO_MIME)
+        mime = "audio/wav"
+    # Attempt MP3 compression
+    try:
+        segment = AudioSegment.from_file(io.BytesIO(raw_data), format="wav")
+        buf = io.BytesIO()
+        segment.export(buf, format="mp3")
+        return buf.getvalue(), "audio/mpeg"
+    except Exception as e:
+        logging.warning("MP3 conversion failed (%s); returning WAV", e)
+        return raw_data, mime
+# Choose wrapper based on GENERATE_SPEECH flag
 if GENERATE_SPEECH:
+    def synthesize_gemini_tts(text: str, voice: str) -> Tuple[Optional[bytes], Optional[str]]:
         try:
+            return _synthesize_gemini_tts_impl(text, voice)
         except TTSGenerationError as e:
+            logging.error("TTS failed: %s; skipping audio", e)
             return None, None
 else:
+    def synthesize_gemini_tts(text: str, voice: str) -> Tuple[Optional[bytes], Optional[str]]:
+        key = _synthesize_gemini_tts_impl.__cache_key__(text, voice)
+        result = cache.get(key)
+        if result is not None:
+            return result
+        logging.info("No cached audio; speech disabled.")
         return None, None