Spaces:

KarhuSuomi
/

google-tts

Running

File size: 2,910 Bytes

import io
import json
import os

import gradio as gr
import pysrt
from dotenv import load_dotenv
from google.cloud import texttospeech
from google.oauth2 import service_account
from pydub import AudioSegment

# Load environment variables (python-dotenv) before get_google_client()
# reads the Google credentials from the environment.
load_dotenv()

def get_google_client():
    """Build a Text-to-Speech client from credentials held in the environment.

    The service-account key is read as a JSON string from
    ``GOOGLE_CREDENTIALS_JSON``; the value of ``GOOGLE_QUOTA_PROJECT_ID`` is
    forwarded to the client as its ``quota_project_id`` option.

    Returns:
        A ``texttospeech.TextToSpeechClient`` using those credentials.

    Raises:
        ValueError: If ``GOOGLE_CREDENTIALS_JSON`` is unset or empty.
    """
    raw_key = os.getenv("GOOGLE_CREDENTIALS_JSON")
    if not raw_key:
        raise ValueError("Google Cloud credentials not configured")

    # The key is stored inline as JSON text rather than as a path to a file.
    sa_credentials = service_account.Credentials.from_service_account_info(
        json.loads(raw_key)
    )
    options = {"quota_project_id": os.getenv("GOOGLE_QUOTA_PROJECT_ID")}

    return texttospeech.TextToSpeechClient(
        credentials=sa_credentials,
        client_options=options,
    )

# Module-level Text-to-Speech client, created once at import time and used by
# srt_to_speech. Importing this module raises ValueError when
# GOOGLE_CREDENTIALS_JSON is not set (see get_google_client).
client = get_google_client()

def srt_to_speech(srt_file, language_code, voice_name, gender):
    """Synthesize an SRT subtitle file into one MP3 that follows its timeline.

    Each subtitle's text is sent to Google Cloud Text-to-Speech through the
    module-level ``client``. The resulting clip is appended to the output,
    preceded by enough silence for it to begin at the subtitle's start time.
    If earlier speech already runs past that start time, the clip is appended
    immediately with no padding.

    Args:
        srt_file: The uploaded SRT file, either an object exposing its path
            as ``.name`` or the path itself.
        language_code: Language code for the voice selection, e.g. "fi-FI".
        voice_name: Voice name for the voice selection,
            e.g. "fi-FI-Standard-A".
        gender: Name of a ``texttospeech.SsmlVoiceGender`` member
            ("FEMALE", "MALE" or "NEUTRAL").

    Returns:
        Path of the exported MP3 file, ``"output_with_timeline.mp3"``.

    Raises:
        gr.Error: If no file was supplied.
        KeyError: If ``gender`` is not a ``SsmlVoiceGender`` member name.
    """
    if srt_file is None:
        raise gr.Error("No SRT file uploaded")

    # Accept both upload objects (path in ``.name``) and plain path strings.
    srt_path = getattr(srt_file, "name", srt_file)
    subs = pysrt.open(srt_path)

    # Voice and encoding are identical for every subtitle, so build them once.
    voice = texttospeech.VoiceSelectionParams(
        language_code=language_code,
        name=voice_name,
        ssml_gender=texttospeech.SsmlVoiceGender[gender],
    )
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )

    final_audio = AudioSegment.empty()

    for sub in subs:
        if not sub.text.strip():
            # Nothing to speak for a blank cue; skip the API call entirely.
            continue

        response = client.synthesize_speech(
            input=texttospeech.SynthesisInput(text=sub.text),
            voice=voice,
            audio_config=audio_config,
        )

        # Pad with silence up to this subtitle's start time. Both
        # ``sub.start.ordinal`` and ``len(final_audio)`` are in milliseconds,
        # which is also the unit ``AudioSegment.silent`` expects, so no
        # scaling is applied. Measuring against the audio produced so far
        # (rather than the previous subtitle's end time) keeps clips anchored
        # to the timeline even when spoken length differs from the cue length,
        # and does not depend on the SRT index numbers being sequential.
        gap_ms = sub.start.ordinal - len(final_audio)
        if gap_ms > 0:
            final_audio += AudioSegment.silent(duration=gap_ms)

        # Decode the MP3 bytes in memory: no temp files to collide between
        # requests or to leak when an exception interrupts the loop.
        final_audio += AudioSegment.from_file(
            io.BytesIO(response.audio_content), format="mp3"
        )

    # Write the assembled track.
    output_file = "output_with_timeline.mp3"
    final_audio.export(output_file, format="mp3")
    return output_file

# Gradio interface. The four inputs are passed positionally to srt_to_speech
# as (srt_file, language_code, voice_name, gender); the returned file path is
# rendered by the Audio output.
iface = gr.Interface(
    fn=srt_to_speech,
    inputs=[
        gr.File(label="Lataa SRT/tiedosto"),
        gr.Dropdown(["fi-FI", "en-US"], label="语言代码", value="fi-FI"),
        gr.Dropdown(["fi-FI-Standard-A", "fi-FI-Wavenet-A", "en-US-Standard-A"], 
                   label="Äänen nimi", value="fi-FI-Standard-A"),
        # Label fixed: "Sukupuoli" is the Finnish word for gender.
        gr.Dropdown(["FEMALE", "MALE", "NEUTRAL"], label="Sukupuoli", value="FEMALE")
    ],
    outputs=gr.Audio(label="Luotu äänitetty tekstitys"),
    title="SRT-äänityökalu",
    description="Lataa SRT-tekstitystiedosto ja luo äänitetty versio ajoituksella"
)

if __name__ == "__main__":
    # Server-side rendering is disabled through launch()'s ``ssr_mode``
    # keyword (available in Gradio 5+). It has to be an argument to launch():
    # a separate assignment placed after the call has no effect on the server.
    iface.launch(ssr_mode=False)