# Source captured from a Hugging Face Space file view.
# Space status at capture time: Runtime error
# File size: 4,415 bytes; revision 787037e
import gradio as gr
import numpy as np
import torch
import os
import time
from scipy.io import wavfile
# Explicitly import Bark components
from bark import generate_audio, SAMPLE_RATE
from bark.generation import preload_models, load_model, generate_text_semantic
class VoiceCloningApp:
    """Backend for the Gradio UI: stores a reference recording and synthesizes speech with Bark.

    Every file the app writes goes into a ``working_files`` directory that is
    created next to this source file.
    """

    def __init__(self):
        """Create the working directory and preload the Bark models.

        Raises:
            RuntimeError: If ``preload_models()`` fails. The original
                exception is chained as the cause.
        """
        # Create working directory
        self.base_dir = os.path.dirname(os.path.abspath(__file__))
        self.working_dir = os.path.join(self.base_dir, "working_files")
        os.makedirs(self.working_dir, exist_ok=True)

        # Explicit model loading with error handling
        try:
            print("Attempting to load Bark models...")
            preload_models()
            print("Bark models loaded successfully.")
        except Exception as e:
            print(f"Error loading Bark models: {e}")
            # Chain the cause so the underlying traceback is not lost.
            raise RuntimeError(f"Could not load Bark models: {e}") from e

    def process_reference_audio(self, audio_data):
        """Peak-normalize a recording and save it as a WAV file.

        Args:
            audio_data: A ``(sample_rate, samples)`` pair, or ``None`` when
                no audio was supplied.

        Returns:
            A status message: the success text, a prompt to provide audio,
            or an error description.
        """
        if audio_data is None:
            return "Please provide an audio input"

        try:
            # Unpack audio data
            sample_rate, audio_array = audio_data

            # Work in float32: taking abs() of int16 samples overflows for
            # -32768, which would yield a wrong peak value.
            samples = np.asarray(audio_array, dtype=np.float32)
            if samples.size == 0:
                return "Please provide an audio input"

            # Normalize to a peak of 1.0; silent input is left untouched
            # because dividing by a zero peak would fill the file with NaNs.
            peak = np.max(np.abs(samples))
            if peak > 0:
                samples = samples / peak

            # Save reference audio
            filename = f"reference_{int(time.time())}.wav"
            filepath = os.path.join(self.working_dir, filename)
            wavfile.write(filepath, sample_rate, samples)

            return "✅ Audio captured successfully!"
        except Exception as e:
            # UI boundary: report the failure as text instead of raising.
            return f"Error processing audio: {str(e)}"

    def generate_speech(self, text):
        """Generate speech for ``text`` with Bark and save it as a WAV file.

        Args:
            text: The text to speak.

        Returns:
            ``(filepath, None)`` on success, or ``(None, message)`` when the
            text is empty/blank or generation fails.
        """
        if not text or not text.strip():
            return None, "Please enter some text to speak"

        try:
            # Local import: the file-level import block does not bring this
            # name into scope.
            from bark import semantic_to_waveform

            print(f"Generating speech for text: {text}")

            # Step 1: text -> semantic tokens
            semantic_tokens = generate_text_semantic(
                text,
                history_prompt=None,
                temp=0.7,
                min_eos_p=0.05,
            )

            # Step 2: semantic tokens -> waveform. generate_audio() expects
            # raw text and has no `temp` keyword, so the token-level API is
            # the matching call for already-generated semantic tokens.
            audio_array = semantic_to_waveform(
                semantic_tokens,
                history_prompt=None,
                temp=0.7,
            )

            # Save generated audio
            filename = f"generated_speech_{int(time.time())}.wav"
            filepath = os.path.join(self.working_dir, filename)
            wavfile.write(filepath, SAMPLE_RATE, audio_array)

            return filepath, None
        except Exception as e:
            # UI boundary: log and hand the message back for display.
            print(f"Speech generation error: {e}")
            return None, f"Error generating speech: {str(e)}"
def create_interface():
    """Build the two-column Gradio UI and wire it to a ``VoiceCloningApp``.

    The left column captures a reference recording and shows the processing
    result; the right column takes text and shows the generated audio plus
    any error message.

    Returns:
        The ``gr.Blocks`` interface; the caller is responsible for launching it.
    """
    app = VoiceCloningApp()

    # Use the most basic Gradio theme to avoid font issues
    with gr.Blocks() as interface:
        # Heading emoji restored from a mis-decoded byte sequence.
        gr.Markdown("# 🎙️ Voice Cloning App")

        with gr.Row():
            with gr.Column():
                gr.Markdown("## 1. Capture Reference Voice")
                reference_audio = gr.Audio(sources=["microphone", "upload"], type="numpy")
                process_btn = gr.Button("Process Reference Voice")
                process_output = gr.Textbox(label="Processing Result")

            with gr.Column():
                gr.Markdown("## 2. Generate Speech")
                text_input = gr.Textbox(label="Enter Text to Speak")
                generate_btn = gr.Button("Generate Speech")
                audio_output = gr.Audio(label="Generated Speech")
                error_output = gr.Textbox(label="Errors", visible=True)

        # Bind functions
        process_btn.click(
            fn=app.process_reference_audio,
            inputs=reference_audio,
            outputs=process_output,
        )
        # generate_speech returns (audio path, error text), matching the
        # order of the two output components.
        generate_btn.click(
            fn=app.generate_speech,
            inputs=text_input,
            outputs=[audio_output, error_output],
        )

    return interface
if __name__ == "__main__":
    # Script entry point: build the UI, then serve it with the options below.
    launch_options = {
        "share": False,
        "debug": True,
        "show_error": True,
        "server_name": '0.0.0.0',
        "server_port": 7860,
    }
    interface = create_interface()
    interface.launch(**launch_options)
|