|
|
|
import os

from bark import SAMPLE_RATE, generate_audio
from faster_whisper import WhisperModel
from flask import Flask, request, jsonify
from scipy.io import wavfile
from transformers import pipeline
|
|
|
|
|
app = Flask(__name__)

# Scratch directory used by /process_audio for uploads and synthesized replies.
os.makedirs("./temp", exist_ok=True)

# Whisper "tiny" on GPU in fp16 for low-latency transcription.
# NOTE(review): device="cuda" will fail on CPU-only hosts — confirm deployment target.
speech_model = WhisperModel("tiny", device="cuda", compute_type="float16")

# BUG FIX: "gpt-3.5-turbo" is an OpenAI API model, not a Hugging Face checkpoint,
# so pipeline() raised at startup. "gpt2" is a real HF causal-LM checkpoint.
nlp_model = pipeline("text-generation", model="gpt2")
|
|
|
@app.route('/process_audio', methods=['POST'])
def process_audio():
    """Full voice round-trip: upload audio -> transcribe -> reply -> synthesize.

    Expects a multipart POST with an ``audio`` file field. Returns JSON with
    the transcription, the generated reply text, and the path of the
    synthesized response WAV. Errors are reported as JSON with HTTP 400/500.
    """
    try:
        if 'audio' not in request.files:
            return jsonify({"error": "missing 'audio' file field"}), 400
        audio_file = request.files['audio']

        # basename() strips directory components so a crafted filename
        # (e.g. "../../etc/passwd") cannot escape the ./temp directory.
        safe_name = os.path.basename(audio_file.filename or "upload.wav")
        os.makedirs("./temp", exist_ok=True)
        audio_path = f"./temp/{safe_name}"
        audio_file.save(audio_path)

        transcription = transcribe_audio(audio_path)
        response_text = generate_response(transcription)
        response_audio = synthesize_speech(response_text)

        # BUG FIX: Bark returns a raw numpy array, which has no .export()
        # method (that is pydub API) — write it as a WAV at Bark's native
        # sample rate instead.
        response_audio_path = "./temp/response_audio.wav"
        wavfile.write(response_audio_path, SAMPLE_RATE, response_audio)

        return jsonify({
            "transcription": transcription,
            "response_text": response_text,
            "response_audio_path": response_audio_path
        })
    except Exception as e:
        # Top-level route boundary: surface the failure as JSON, not a bare 500.
        return jsonify({"error": str(e)}), 500
|
|
|
def transcribe_audio(audio_path):
    """Run Whisper speech-to-text on the file at *audio_path*.

    Returns the full transcript as one space-joined string.
    """
    segments, _info = speech_model.transcribe(audio_path)
    return " ".join(segment.text for segment in segments)
|
|
|
def generate_response(user_input):
    """
    Generate text response using GPT-3.5-turbo.
    """
    # Sampled generation, capped at 100 tokens including the prompt.
    outputs = nlp_model(user_input, max_length=100, do_sample=True)
    first_candidate = outputs[0]
    return first_candidate['generated_text']
|
|
|
def synthesize_speech(text):
    """Synthesize *text* into speech with Bark.

    Returns the raw waveform array produced by ``generate_audio``.
    """
    return generate_audio(text)
|
|
|
|
|
if __name__ == "__main__":
    # SECURITY FIX: debug=True on a publicly bound host (0.0.0.0) exposes the
    # Werkzeug interactive debugger, which allows arbitrary code execution.
    # Debug mode must now be opted into explicitly via FLASK_DEBUG=1.
    debug = os.environ.get("FLASK_DEBUG", "0") == "1"
    app.run(debug=debug, host="0.0.0.0", port=5000)
|
|