Spaces:
Sleeping
Sleeping
File size: 9,019 Bytes
9e3182d 235e7c7 bebc496 9e3182d 68f40ec bebc496 b9d2659 235e7c7 c849c89 9e3182d 6e7a5e3 9e3182d 235e7c7 9e3182d 6ee4112 235e7c7 6ee4112 235e7c7 b9d2659 235e7c7 b9d2659 235e7c7 b9d2659 a5f420b b9d2659 235e7c7 b9d2659 235e7c7 b9d2659 a5f420b 235e7c7 b9bf9b2 9e3182d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 |
import os
os.environ["NUMBA_DISABLE_CACHE"] = "1"
import os
import gradio as gr
from docx import Document
from TTS.api import TTS
import tempfile
import zipfile
from io import BytesIO
import re
# Identifier of the multi-speaker VCTK VITS voice model.
VOICE_MODEL = "tts_models/en/vctk/vits"
# Per-speaker metadata keyed by numeric speaker ID. Each entry records the
# speaker's age, gender ("F"/"M") and accent. The table is used to build the
# dropdown labels; the synthesis functions prepend "p" to the ID to form the
# speaker name passed to the TTS model.
# NOTE(review): the table was embedded from an external speaker-info file whose
# provenance is not visible here - confirm the IDs against the model's speakers.
SPEAKER_METADATA = {
    300: { "age": 23, "gender": "F", "accent": "American"},
    271: { "age": 19, "gender": "M", "accent": "Scottish"},
    287: { "age": 23, "gender": "M", "accent": "English"},
    262: { "age": 23, "gender": "F", "accent": "Scottish"},
    284: { "age": 20, "gender": "M", "accent": "Scottish"},
    297: { "age": 20, "gender": "F", "accent": "American"},
    227: { "age": 38, "gender": "M", "accent": "English"},
    246: { "age": 22, "gender": "M", "accent": "Scottish"},
    225: { "age": 23, "gender": "F", "accent": "English"},
    259: { "age": 23, "gender": "M", "accent": "English"},
    252: { "age": 22, "gender": "M", "accent": "Scottish"},
    231: { "age": 23, "gender": "F", "accent": "English"},
    266: { "age": 22, "gender": "F", "accent": "Irish"},
    241: { "age": 21, "gender": "M", "accent": "Scottish"},
    312: { "age": 19, "gender": "F", "accent": "Canadian"},
    329: { "age": 23, "gender": "F", "accent": "American"},
    232: { "age": 23, "gender": "M", "accent": "English"},
    305: { "age": 19, "gender": "F", "accent": "American"},
    311: { "age": 21, "gender": "M", "accent": "American"},
    301: { "age": 23, "gender": "F", "accent": "American"},
    304: { "age": 22, "gender": "M", "accent": "NorthernIrish"},
    310: { "age": 21, "gender": "F", "accent": "American"},
    260: { "age": 21, "gender": "M", "accent": "Scottish"},
    315: { "age": 18, "gender": "M", "accent": "American"},
    374: { "age": 28, "gender": "M", "accent": "Australian"},
    364: { "age": 23, "gender": "M", "accent": "Irish"},
    269: { "age": 20, "gender": "F", "accent": "English"},
    345: { "age": 22, "gender": "M", "accent": "American"},
    326: { "age": 26, "gender": "M", "accent": "Australian"},
    343: { "age": 27, "gender": "F", "accent": "Canadian"},
    230: { "age": 22, "gender": "F", "accent": "English"},
    376: { "age": 22, "gender": "M", "accent": "Indian"},
    240: { "age": 21, "gender": "F", "accent": "English"},
    298: { "age": 19, "gender": "M", "accent": "Irish"},
    272: { "age": 23, "gender": "M", "accent": "Scottish"},
    248: { "age": 23, "gender": "F", "accent": "Indian"},
    264: { "age": 23, "gender": "F", "accent": "Scottish"},
    250: { "age": 22, "gender": "F", "accent": "English"},
    292: { "age": 23, "gender": "M", "accent": "NorthernIrish"},
    237: { "age": 22, "gender": "M", "accent": "Scottish"},
    363: { "age": 22, "gender": "M", "accent": "Canadian"},
    313: { "age": 24, "gender": "F", "accent": "Irish"},
    285: { "age": 21, "gender": "M", "accent": "Scottish"},
    268: { "age": 23, "gender": "F", "accent": "English"},
    302: { "age": 20, "gender": "M", "accent": "Canadian"},
    261: { "age": 26, "gender": "F", "accent": "NorthernIrish"},
    336: { "age": 18, "gender": "F", "accent": "SouthAfrican"},
    288: { "age": 22, "gender": "F", "accent": "Irish"},
    226: { "age": 22, "gender": "M", "accent": "English"},
    277: { "age": 23, "gender": "F", "accent": "English"},
    360: { "age": 19, "gender": "M", "accent": "American"},
    257: { "age": 24, "gender": "F", "accent": "English"},
    254: { "age": 21, "gender": "M", "accent": "English"},
    339: { "age": 21, "gender": "F", "accent": "American"},
    323: { "age": 19, "gender": "F", "accent": "SouthAfrican"},
    255: { "age": 19, "gender": "M", "accent": "Scottish"},
    249: { "age": 22, "gender": "F", "accent": "Scottish"},
    293: { "age": 22, "gender": "F", "accent": "NorthernIrish"},
    244: { "age": 22, "gender": "F", "accent": "English"},
    245: { "age": 25, "gender": "M", "accent": "Irish"},
    361: { "age": 19, "gender": "F", "accent": "American"},
    314: { "age": 26, "gender": "F", "accent": "SouthAfrican"},
    308: { "age": 18, "gender": "F", "accent": "American"},
    229: { "age": 23, "gender": "F", "accent": "English"},
    341: { "age": 26, "gender": "F", "accent": "American"},
    275: { "age": 23, "gender": "M", "accent": "Scottish"},
    263: { "age": 22, "gender": "M", "accent": "Scottish"},
    253: { "age": 22, "gender": "F", "accent": "Welsh"},
    299: { "age": 25, "gender": "F", "accent": "American"},
    316: { "age": 20, "gender": "M", "accent": "Canadian"},
    282: { "age": 23, "gender": "F", "accent": "English"},
    362: { "age": 29, "gender": "F", "accent": "American"},
    294: { "age": 33, "gender": "F", "accent": "American"},
    274: { "age": 22, "gender": "M", "accent": "English"},
    279: { "age": 23, "gender": "M", "accent": "English"},
    281: { "age": 29, "gender": "M", "accent": "Scottish"},
    286: { "age": 23, "gender": "M", "accent": "English"},
    258: { "age": 22, "gender": "M", "accent": "English"},
    247: { "age": 22, "gender": "M", "accent": "Scottish"},
    351: { "age": 21, "gender": "F", "accent": "NorthernIrish"},
    283: { "age": 24, "gender": "F", "accent": "Irish"},
    334: { "age": 18, "gender": "M", "accent": "American"},
    333: { "age": 19, "gender": "F", "accent": "American"},
    295: { "age": 23, "gender": "F", "accent": "Irish"},
    330: { "age": 26, "gender": "F", "accent": "American"},
    335: { "age": 25, "gender": "F", "accent": "NewZealand"},
    228: { "age": 22, "gender": "F", "accent": "English"},
    267: { "age": 23, "gender": "F", "accent": "English"},
    273: { "age": 18, "gender": "F", "accent": "English"}
}
# Static list of speakers for the dropdown, e.g. "300 - F - American (Age 23)".
# The label must start with the numeric speaker ID followed by a space: the
# synthesis functions recover the ID with `selected_speaker.split(" ")[0]`.
SPEAKER_CHOICES = [
    f"{sid} - {data['gender']} - {data['accent']} (Age {data['age']})"
    for sid, data in SPEAKER_METADATA.items()
]

# VCTK model (multi-speaker). Derived from VOICE_MODEL instead of repeating the
# literal so the two constants cannot drift apart.
MODEL_NAME = VOICE_MODEL
# Shared TTS instance, created once at import time and run on CPU (gpu=False).
tts = TTS(model_name=MODEL_NAME, progress_bar=False, gpu=False)
# Extract plain text from a .docx file, stripping raw http(s) URLs
def extract_text_ignoring_hyperlinks(docx_file):
    """Return the text of each non-blank paragraph with raw URLs removed.

    Args:
        docx_file: The document to read. Either a filesystem path (``str`` or
            ``os.PathLike``), which is what a ``gr.File(type="filepath")``
            input delivers, or an upload wrapper that exposes the path as
            ``.name``.

    Returns:
        A list of strings in document order, one per paragraph that contains
        non-whitespace text. Every ``http://`` / ``https://`` URL found in the
        paragraph text is removed and the result is trimmed, so a paragraph
        that consisted only of a URL yields an empty string (callers are
        expected to skip those).
    """
    # A plain path has no usable ``.name`` (str has none, and Path.name is only
    # the basename), so paths are passed through as-is and only wrapper
    # objects are unwrapped.
    if isinstance(docx_file, (str, os.PathLike)):
        source = os.fspath(docx_file)
    else:
        source = getattr(docx_file, "name", docx_file)

    doc = Document(source)
    url_pattern = re.compile(r'https?://\S+')  # compiled once, reused per paragraph

    text_blocks = []
    for para in doc.paragraphs:
        raw_text = para.text
        if raw_text.strip():
            text_blocks.append(url_pattern.sub('', raw_text).strip())
    return text_blocks
# Generate sample audio for preview
def generate_sample_audio(sample_text, selected_speaker):
    """Synthesize a short preview clip and return the path of the WAV file.

    Args:
        sample_text: Text to speak. ``None``, empty and whitespace-only values
            are rejected.
        selected_speaker: Dropdown label whose first space-separated token is
            the numeric speaker ID (e.g. ``"300 - F - American (Age 23)"``).

    Returns:
        Path of a temporary ``.wav`` file holding the synthesized audio. The
        file is intentionally left on disk so the caller can serve it.

    Raises:
        gr.Error: If ``sample_text`` is missing or blank.
    """
    # ``not sample_text`` also covers None, which would otherwise crash on
    # ``.strip()`` instead of producing the user-facing error.
    if not sample_text or not sample_text.strip():
        raise gr.Error("Sample text cannot be empty.")

    sid = selected_speaker.split(" ")[0]  # Extract speaker ID

    # Reserve a unique file name, then close the handle before the TTS engine
    # writes to that path.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
        wav_path = tmp_wav.name

    try:
        tts.tts_to_file(text=sample_text, speaker="p" + sid, file_path=wav_path)
    except Exception:
        # Do not leave an orphaned temp file behind when synthesis fails.
        try:
            os.remove(wav_path)
        except OSError:
            pass
        raise
    return wav_path
# Main conversion function
def docx_to_zipped_wavs(doc_file, selected_speaker):
    """Convert each paragraph of a .docx file to a WAV and bundle them in a ZIP.

    Args:
        doc_file: The uploaded document, passed straight through to
            ``extract_text_ignoring_hyperlinks``.
        selected_speaker: Dropdown label whose first space-separated token is
            the numeric speaker ID (e.g. ``"300 - F - American (Age 23)"``).

    Returns:
        Path of a temporary ``.zip`` file containing one WAV per synthesized
        paragraph. The archive is left on disk so the caller can serve it.
        If synthesis fails part-way, the archive holds the paragraphs that
        were completed before the failure (possibly none).

    Raises:
        gr.Error: If no document was supplied.
    """
    if doc_file is None:
        raise gr.Error("Please upload a .docx file first.")

    sid = selected_speaker.split(" ")[0]
    paragraphs = extract_text_ignoring_hyperlinks(doc_file)

    audio_files = []  # WAVs that were synthesized completely
    temp_paths = []   # every temp WAV created, finished or not, for cleanup
    try:
        try:
            for i, para in enumerate(paragraphs):
                if not para:
                    continue
                # Reserve a unique name; the paragraph index stays in the
                # suffix so the order can be recovered from the file names.
                with tempfile.NamedTemporaryFile(suffix=f"_{i}.wav", delete=False) as tmp_wav:
                    wav_path = tmp_wav.name
                temp_paths.append(wav_path)
                tts.tts_to_file(text=para, speaker="p" + sid, file_path=wav_path)
                audio_files.append(wav_path)
        except Exception as e:
            # Deliberate best effort: keep whatever was produced so far.
            print("Connection interrupted, returning partial result.", str(e))

        # Write the archive straight to its final temp file rather than
        # building it in memory and copying it afterwards.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as final_zip:
            zip_path = final_zip.name
        with zipfile.ZipFile(zip_path, "w") as zipf:
            for wav_path in audio_files:
                zipf.write(wav_path, arcname=os.path.basename(wav_path))
    finally:
        # The WAVs now live inside the archive; remove the temp copies so
        # repeated conversions do not fill the disk.
        for wav_path in temp_paths:
            try:
                os.remove(wav_path)
            except OSError:
                pass  # already gone or not removable - nothing more to do
    return zip_path
# Gradio UI
# NOTE(review): the indentation of this section was lost in the copy this
# review was made from; the nesting below (which components sit inside each
# gr.Row) is the most plausible reconstruction - confirm against the real file.
with gr.Blocks() as interface:
    gr.Markdown("""# Multi-Paragraph Voiceover Generator
Upload a `.docx` file and convert each paragraph to audio. You can also try a short sample first.
""")
    with gr.Row():
        # sample_text is currently not wired to any handler: the sample
        # button and its click binding are commented out below.
        sample_text = gr.Textbox(label="Sample Text (Max 500 chars)", max_lines=4, lines=3, max_length=500)
        # The selected label is passed to the handlers, which read the
        # speaker ID from its first space-separated token.
        speaker_dropdown = gr.Dropdown(label="Select Speaker", choices=SPEAKER_CHOICES, value=SPEAKER_CHOICES[0])
    # sample_button = gr.Button("Generate Sample Audio")
    # sample_audio = gr.Audio(label="Sample Audio", type="filepath")
    with gr.Row():
        # type="filepath": the handler receives the upload as a path.
        docx_input = gr.File(label="Upload .docx File", type="filepath")
        convert_button = gr.Button("Generate WAV Zip")
    final_output = gr.File(label="Download ZIP of WAVs")
    # sample_button.click(fn=generate_sample_audio, inputs=[sample_text, speaker_dropdown], outputs=sample_audio)
    # Run the conversion and offer the resulting ZIP path for download.
    convert_button.click(fn=docx_to_zipped_wavs, inputs=[docx_input, speaker_dropdown], outputs=final_output)
# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()
|