Spaces:

SohomToom
/

DocToAudioConverted

Sleeping

App Files Files Community

DocToAudioConverted / app.py

SohomToom

Update app.py

68f40ec verified 4 months ago

raw

history blame

2.53 kB

	import os
	import tempfile
	import zipfile
	from docx import Document
	from TTS.api import TTS
	from pydub import AudioSegment
	import gradio as gr

	# Maps the label shown in the "Choose Voice" dropdown to the model name
	# that is handed to TTS(model_name=...).
	VOICE_MODELS = {
	    "Jenny (Expressive Female)": "tts_models/en/jenny/jenny",
	    "LJSpeech (Standard Female)": "tts_models/en/ljspeech/vits",
	    "VCTK (Multiple Speakers)": "tts_models/en/vctk/vits",
	}

	def update_speaker_choices(selected_voice):
	    """Return the speaker labels to offer for the chosen voice.

	    Only the "VCTK (Multiple Speakers)" entry yields the three placeholder
	    speaker labels; any other voice yields a single default entry.
	    """
	    is_multi_speaker = selected_voice == "VCTK (Multiple Speakers)"
	    if not is_multi_speaker:
	        return ["Default Speaker"]
	    # Placeholder labels; replace with the model's real speaker names or indices.
	    return [f"Speaker {number}" for number in range(1, 4)]

	def _speaker_index(speaker_name):
	    """Parse a zero-based index from a label such as "Speaker 2" (default 0)."""
	    if not speaker_name or not speaker_name.startswith("Speaker"):
	        return 0
	    try:
	        return max(int(speaker_name.split()[-1]) - 1, 0)
	    except ValueError:
	        # Label without a trailing number (e.g. just "Speaker").
	        return 0


	def docx_to_wav_zip(doc_file, selected_voice, speaker_name):
	    """Synthesize the text of a .docx file to WAV and return a zip of it.

	    Args:
	        doc_file: The uploaded document, either an object exposing the file
	            path as ``.name`` or the path string itself.
	        selected_voice: A key of ``VOICE_MODELS``.
	        speaker_name: A label such as "Speaker 2"; anything else selects the
	            first speaker. Only used when the loaded model reports speakers.

	    Returns:
	        Path of a zip archive containing the generated WAV file.

	    Raises:
	        gr.Error: If no file was uploaded, the voice is unknown, or the
	            document contains no text.
	    """
	    if doc_file is None:
	        raise gr.Error("Please upload a .docx file.")
	    if selected_voice not in VOICE_MODELS:
	        raise gr.Error("Please choose one of the listed voices.")

	    # Accept both a temp-file wrapper (path in ``.name``) and a plain path string.
	    doc_path = getattr(doc_file, "name", doc_file)

	    # Extract text from the .docx before loading the model, so an empty
	    # document fails fast without paying the model start-up cost.
	    document = Document(doc_path)
	    full_text = "\n".join(
	        para.text for para in document.paragraphs if para.text.strip()
	    )
	    if not full_text:
	        raise gr.Error("The uploaded document contains no text to read.")

	    # Load the selected TTS model
	    tts = TTS(model_name=VOICE_MODELS[selected_voice], progress_bar=False, gpu=False)

	    # Multi-speaker models are addressed by speaker *name* through the
	    # ``speaker`` argument; single-speaker models get no speaker argument.
	    synth_kwargs = {}
	    speakers = getattr(tts, "speakers", None)
	    if speakers:
	        position = min(_speaker_index(speaker_name), len(speakers) - 1)
	        synth_kwargs["speaker"] = speakers[position]

	    # Reserve a temp path for the WAV; the zip sits next to it with the
	    # same stem (only the extension is swapped).
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
	        wav_path = tmp_wav.name
	    zip_path = os.path.splitext(wav_path)[0] + ".zip"

	    try:
	        tts.tts_to_file(text=full_text, file_path=wav_path, **synth_kwargs)

	        # WAV-to-WAV round trip through pydub; the output stays WAV
	        # (no MP3 conversion happens here).
	        sound = AudioSegment.from_wav(wav_path)
	        sound.export(wav_path, format="wav")

	        # Zip the files
	        with zipfile.ZipFile(zip_path, "w") as zipf:
	            zipf.write(wav_path, os.path.basename(wav_path))
	    finally:
	        # The WAV was created with delete=False; remove it once it has been
	        # archived (or synthesis failed) so temp files do not accumulate.
	        if os.path.exists(wav_path):
	            os.remove(wav_path)

	    return zip_path

	# Gradio UI: a document upload plus two dropdowns in, one zip file out.
	# The speaker dropdown is filled once, with the VCTK speaker labels.
	interface = gr.Interface(
	    fn=docx_to_wav_zip,
	    inputs=[
	        gr.File(label="Upload .docx File"),
	        gr.Dropdown(
	            label="Choose Voice",
	            choices=list(VOICE_MODELS),
	            value="Jenny (Expressive Female)",
	        ),
	        gr.Dropdown(
	            label="Choose Speaker",
	            choices=update_speaker_choices("VCTK (Multiple Speakers)"),
	            value="Speaker 1",
	        ),
	    ],
	    outputs=gr.File(label="Download Zip File"),
	    title="Realistic Voiceover from DOCX (Multiple Voices)",
	    description=(
	        "Upload a .docx file, choose a realistic voice, "
	        "and pick a speaker to generate a voiceover in WAV format."
	    ),
	)

	# Start the web app only when this file is run as a script.
	if __name__ == "__main__":
	    interface.launch()