# AIVoice6 / app.py
# Hugging Face Space file — uploaded by dschandra ("Update app.py"),
# revision 46ee408 (verified), 2.09 kB.
# Import required libraries
import os

from bark import generate_audio
from faster_whisper import WhisperModel
from flask import Flask, request, jsonify
from scipy.io import wavfile
from transformers import pipeline
# Initialize Flask app
app = Flask(__name__)

# Load models once at import time so every request reuses them.
# NOTE(review): device="cuda" + float16 requires an NVIDIA GPU; this line
# raises on CPU-only hosts — confirm the deployment target.
speech_model = WhisperModel("tiny", device="cuda", compute_type="float16")
# NOTE(review): "gpt-3.5-turbo" is an OpenAI API model, not a Hugging Face
# hub checkpoint — transformers.pipeline() cannot download it, so this line
# will fail at startup. Verify the intended hub model id (e.g. "gpt2").
nlp_model = pipeline("text-generation", model="gpt-3.5-turbo")
@app.route('/process_audio', methods=['POST'])
def process_audio():
    """Process an uploaded audio file end-to-end.

    Pipeline: transcribe the upload with Whisper, generate a text reply
    with the text-generation model, then synthesize the reply to speech
    with Bark and save it as a WAV file.

    Returns:
        200 with JSON {"transcription", "response_text",
        "response_audio_path"} on success; 500 with {"error": <message>}
        on any failure.
    """
    try:
        # Ensure the scratch directory exists before saving anything
        # (previously the first request failed with FileNotFoundError).
        os.makedirs("./temp", exist_ok=True)

        # Step 1: Receive the audio file from the user.
        # basename() strips any directory components from the client-supplied
        # filename to prevent path traversal outside ./temp.
        audio_file = request.files['audio']
        audio_path = f"./temp/{os.path.basename(audio_file.filename)}"
        audio_file.save(audio_path)

        # Step 2: Transcribe the audio to text
        transcription = transcribe_audio(audio_path)

        # Step 3: Generate a response based on the transcription
        response_text = generate_response(transcription)

        # Step 4: Synthesize speech from the response text.
        # bark.generate_audio returns a raw NumPy float array, not a pydub
        # AudioSegment — it has no .export(). Write it with scipy instead.
        response_audio = synthesize_speech(response_text)
        response_audio_path = "./temp/response_audio.wav"
        # 24000 Hz is Bark's fixed output sample rate.
        wavfile.write(response_audio_path, 24000, response_audio)

        return jsonify({
            "transcription": transcription,
            "response_text": response_text,
            "response_audio_path": response_audio_path
        })
    except Exception as e:
        # Request boundary: surface any failure as a JSON 500 response.
        return jsonify({"error": str(e)}), 500
def transcribe_audio(audio_path):
    """Transcribe an audio file to text with the Whisper model.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The full transcription as a single space-joined string.
    """
    segments, _info = speech_model.transcribe(audio_path)
    return " ".join(segment.text for segment in segments)
def generate_response(user_input):
    """Generate a text reply to *user_input* with the text-generation model.

    Args:
        user_input: The prompt text (typically a transcription).

    Returns:
        The generated text of the first candidate, capped at 100 tokens,
        sampled stochastically.
    """
    candidates = nlp_model(user_input, max_length=100, do_sample=True)
    best = candidates[0]
    return best['generated_text']
def synthesize_speech(text):
    """Synthesize speech audio from *text* using Bark.

    Args:
        text: The text to speak.

    Returns:
        The raw audio array produced by bark.generate_audio.
    """
    return generate_audio(text)
# Run the app when executed directly (not when imported).
if __name__ == "__main__":
    # NOTE(review): debug=True enables the Werkzeug debugger/reloader —
    # unsafe on a publicly reachable 0.0.0.0 bind; disable in production.
    app.run(debug=True, host="0.0.0.0", port=5000)