Spaces:

Edmon02
/

SpeechT5_hy

Runtime error

File size: 4,038 Bytes

3f1840e

"""
Armenian TTS - Minimal HF Spaces Version
=======================================

Absolutely minimal version to avoid all possible compatibility issues.
"""

import gradio as gr
import numpy as np
import logging
import os
import sys

# Simple logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def setup_pipeline():
    """Build the TTS pipeline, degrading gracefully when it cannot be created.

    The directory containing this file and its ``src`` sub-directory are made
    importable, then ``TTSPipeline`` is imported inside the ``try`` so that a
    missing or broken dependency is handled here instead of aborting the
    import of this module.

    Returns:
        tuple: ``(pipeline, True)`` when the pipeline was created and
        ``optimize_for_production()`` completed; ``(None, False)`` if any
        step raised.
    """
    try:
        current_dir = os.path.dirname(os.path.abspath(__file__))
        src_path = os.path.join(current_dir, 'src')

        # ``from src.pipeline import ...`` is resolved against the directory
        # that CONTAINS ``src``, so that directory has to be importable as
        # well. ``src`` itself is inserted last so it keeps the front
        # position it had before.
        for path in (current_dir, src_path):
            if path not in sys.path:
                sys.path.insert(0, path)

        from src.pipeline import TTSPipeline

        pipeline = TTSPipeline(
            model_checkpoint="Edmon02/TTS_NB_2",
            max_chunk_length=200,
            use_mixed_precision=True
        )
        pipeline.optimize_for_production()
        logger.info("TTS pipeline initialized successfully")
        return pipeline, True

    except Exception as e:
        # Deliberate catch-all: the caller falls back to test mode. Log the
        # full traceback so the cause of the fallback is visible in the logs.
        logger.exception("Pipeline initialization failed: %s", e)
        return None, False

def _fallback_tone(text):
    """Return ``(sample_rate, int16 samples)`` of a decaying 440 Hz tone sized to *text*."""
    sample_rate = 16000
    duration = min(len(text) * 0.08, 4.0)  # 80 ms per character, capped at 4 s
    samples = int(duration * sample_rate)

    if samples <= 0:
        # Defensive guard: nothing to render, hand back half a second of silence
        return sample_rate, np.zeros(8000, dtype=np.int16)

    t = np.linspace(0, duration, samples)
    frequency = 440  # A4 note

    # Fundamental plus two weaker harmonics; the summed peak stays below 0.35,
    # so the int16 conversion below cannot overflow.
    audio = np.sin(2 * np.pi * frequency * t) * 0.2
    audio += np.sin(2 * np.pi * frequency * 2 * t) * 0.1
    audio += np.sin(2 * np.pi * frequency * 3 * t) * 0.05

    # Exponential decay envelope
    audio *= np.exp(-t * 2)

    return sample_rate, (audio * 32767).astype(np.int16)


def tts_process(text):
    """Convert *text* to audio, never raising to the caller.

    Args:
        text: Text to synthesize. Anything that is not a non-blank ``str``
            is treated as empty input.

    Returns:
        tuple: ``(sample_rate, samples)``.
        * Empty/invalid input: one second of int16 silence at 16 kHz.
        * Pipeline unavailable: a short placeholder tone whose length grows
          with the text length.
        * Pipeline available: whatever ``tts_pipeline.synthesize`` returns.
        * Synthesis raised: half a second of int16 silence at 16 kHz.
    """
    # Check the type first: evaluating the truthiness of an arbitrary object
    # (e.g. a multi-element array) can itself raise.
    if not isinstance(text, str) or not text.strip():
        return 16000, np.zeros(16000, dtype=np.int16)

    text = text.strip()

    # No usable pipeline: answer with the placeholder tone
    if not pipeline_available or tts_pipeline is None:
        logger.info(f"Using fallback for text: {text[:30]}...")
        return _fallback_tone(text)

    try:
        logger.info(f"Synthesizing: {text[:50]}...")

        sample_rate, audio = tts_pipeline.synthesize(
            text=text,
            speaker="BDL",
            enable_chunking=True,
            apply_audio_processing=True
        )

        logger.info(f"Successfully generated {len(audio)} samples")
        return sample_rate, audio

    except Exception as e:
        # Deliberate catch-all so the UI keeps working; keep the traceback in
        # the logs and answer with silence.
        logger.exception("TTS synthesis failed: %s", e)
        return 16000, np.zeros(8000, dtype=np.int16)

# Build the pipeline a single time at import; tts_process reads these globals
logger.info("Initializing Armenian TTS application...")
tts_pipeline, pipeline_available = setup_pipeline()

# Page heading and blurb reflect whether the real pipeline could be created
title, description = (
    (
        "🇦🇲 Armenian Text-to-Speech (Ready)",
        "Convert Armenian text to speech using SpeechT5.",
    )
    if pipeline_available
    else (
        "🇦🇲 Armenian TTS (Test Mode)",
        "TTS system in test mode - will generate simple audio tones.",
    )
)

# Bare-bones Gradio UI: one text input, one audio output
app = gr.Interface(
    fn=tts_process,
    inputs="text",
    outputs="audio",
    title=title,
    description=description,
    examples=["Բարև ձեզ", "Շնորհակալություն", "Ինչպե՞ս եք"],
)

# Start the web server only when this file is executed as a script
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    app.launch(**launch_options)