Spaces:

Edmon02
/

SpeechT5_hy

Runtime error

File size: 3,736 Bytes

3f1840e

"""
SpeechT5 Armenian TTS - Ultra-Compatible Version
===============================================

Maximum compatibility version for HuggingFace Spaces.
"""

import gradio as gr
import numpy as np
import logging
import os
import sys
import warnings

# Silence every Python warning process-wide (no category or module filter is
# given, so this applies to all warnings raised after this point).
warnings.filterwarnings("ignore")

# Configure the root logger to emit WARNING and above only, then create the
# module-level logger used by the functions below.
logging.basicConfig(level=logging.WARNING)  # Reduce log noise
logger = logging.getLogger(__name__)

def safe_tts(text):
    """
    Synthesize speech for ``text`` without ever raising to the caller.

    Blank or non-string input yields the output of ``generate_silence()``.
    Otherwise the TTS pipeline is imported, built on first use (and cached as
    the ``pipeline`` attribute of this function), and asked to synthesize the
    text. Any exception along the way is logged as a warning and the output
    of ``generate_fallback_audio(text)`` is returned instead.

    Returns:
        The ``(sample_rate, audio)`` pair produced by the pipeline, or the
        silence / fallback result described above.
    """
    # Guard clause: only non-blank strings proceed to synthesis.
    has_usable_text = isinstance(text, str) and bool(text.strip())
    if not has_usable_text:
        return generate_silence()

    try:
        # Put the ``src`` directory next to this file on the import path
        # before importing the pipeline.
        app_root = os.path.dirname(os.path.abspath(__file__))
        bundled_src = os.path.join(app_root, 'src')
        if bundled_src not in sys.path:
            sys.path.insert(0, bundled_src)

        from src.pipeline import TTSPipeline

        # Build the pipeline lazily; the instance lives on the function
        # object so later calls reuse it.
        if not hasattr(safe_tts, 'pipeline'):
            safe_tts.pipeline = TTSPipeline(
                model_checkpoint="Edmon02/TTS_NB_2",
                max_chunk_length=200,
                use_mixed_precision=True,
            )
            safe_tts.pipeline.optimize_for_production()

        synthesis_options = {
            "text": text,
            "speaker": "BDL",
            "enable_chunking": True,
            "apply_audio_processing": True,
        }
        sample_rate, waveform = safe_tts.pipeline.synthesize(**synthesis_options)
        return sample_rate, waveform

    except Exception as exc:
        # Deliberate catch-all: the UI must always receive some audio.
        logger.warning(f"TTS failed, using fallback: {exc}")
        return generate_fallback_audio(text)

def generate_silence():
    """Return ``(16000, samples)`` where ``samples`` is 8000 zeroed int16 values."""
    sample_rate = 16000
    sample_count = sample_rate // 2  # 8000 samples
    silent_samples = np.zeros(sample_count, dtype=np.int16)
    return sample_rate, silent_samples

def generate_fallback_audio(text):
    """Generate a simple placeholder tone whose length depends on ``text``.

    The tone lasts 0.08 s per character of ``text``, capped at 3 s, at a
    16 kHz sample rate. It is a 440 Hz sine scaled to 0.2 of full scale;
    when ``text`` is longer than 10 characters an 880 Hz sine at 0.1 of
    full scale is mixed in.

    Args:
        text: The text that could not be synthesized. Only its length is
            used. ``None`` or an empty value produces silence.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``samples`` is a 1-D
        ``int16`` NumPy array. For zero-length input this is whatever
        ``generate_silence()`` returns.
    """
    sr = 16000
    # Treat None / empty input as zero characters instead of failing on len().
    n_chars = len(text) if text else 0
    duration = min(n_chars * 0.08, 3.0)
    samples = int(duration * sr)

    if samples == 0:
        return generate_silence()

    # Sample instants are k / sr. A linspace over [0, duration] would include
    # the endpoint and stretch the spacing to duration / (samples - 1), which
    # slightly detunes the generated tone.
    t = np.arange(samples) / sr
    frequency = 440  # A4
    audio = np.sin(2 * np.pi * frequency * t) * 0.2

    # Add an octave-up component so longer text sounds different.
    if n_chars > 10:
        audio += np.sin(2 * np.pi * 880 * t) * 0.1

    # Scale the [-1, 1] float signal to the int16 range.
    return sr, (audio * 32767).astype(np.int16)

# Create the interface using the most basic approach
def create_interface():
    """Build the Gradio interface that exposes ``safe_tts``.

    Returns:
        A ``gr.Interface`` configured with the ``"text"`` input shortcut, the
        ``"audio"`` output shortcut, a title, a description and three example
        phrases.
    """
    # Example inputs offered to the user in the UI.
    sample_phrases = [
        "Բարև ձեզ",
        "Ինչպե՞ս եք",
        "Շնորհակալություն",
    ]

    # Component string shortcuts are used instead of component objects.
    return gr.Interface(
        fn=safe_tts,
        inputs="text",
        outputs="audio",
        title="Armenian Text-to-Speech",
        description="Enter Armenian text to generate speech.",
        examples=sample_phrases,
    )

# Main execution
if __name__ == "__main__":
    # Runs only when this file is executed as a script, not when imported.
    try:
        # Build the regular interface and serve it.
        app = create_interface()
        app.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,
            quiet=True  # Reduce noise
        )
    except Exception as e:
        # Building or launching the regular interface failed: report the
        # error on stdout and try a reduced interface instead.
        print(f"Failed to launch: {e}")
        # Emergency fallback - this interface is wired straight to the tone
        # generator and never calls safe_tts. Empty input is replaced with
        # the literal "test" before the tone is generated.
        emergency_app = gr.Interface(
            fn=lambda x: generate_fallback_audio(x or "test"),
            inputs="text",
            outputs="audio",
            title="Armenian TTS (Emergency Mode)"
        )
        # NOTE(review): this reuses the same host and port as the failed
        # launch; if the original failure was a port-binding problem this
        # call will presumably fail too, and nothing catches that.
        emergency_app.launch(server_name="0.0.0.0", server_port=7860)