File size: 2,093 Bytes
46ee408
 
 
 
 
408a4d8
46ee408
408a4d8
cce0f39
46ee408
 
 
f73965a
46ee408
408a4d8
 
46ee408
 
 
 
cce0f39
46ee408
 
de42bf3
46ee408
 
30e1c24
46ee408
 
30e1c24
46ee408
 
 
cce0f39
46ee408
 
 
 
 
408a4d8
46ee408
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
408a4d8
46ee408
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# Import required libraries
import os
import uuid

from bark import SAMPLE_RATE
from bark import generate_audio
from faster_whisper import WhisperModel
from flask import Flask, request, jsonify
from scipy.io import wavfile
from transformers import pipeline

# Initialize Flask app
app = Flask(__name__)

# Load models once at import time so every request reuses the same instances.
# Each setting can be overridden through an environment variable; the defaults
# are the values that were previously hard-coded.
speech_model = WhisperModel(
    os.environ.get("WHISPER_MODEL_SIZE", "tiny"),
    device=os.environ.get("WHISPER_DEVICE", "cuda"),
    compute_type=os.environ.get("WHISPER_COMPUTE_TYPE", "float16"),
)
# NOTE(review): "gpt-3.5-turbo" is the name of an OpenAI API model; confirm
# that transformers can resolve it as a checkpoint, or set NLP_MODEL_NAME to a
# model id that the text-generation pipeline can load.
nlp_model = pipeline(
    "text-generation",
    model=os.environ.get("NLP_MODEL_NAME", "gpt-3.5-turbo"),
)

@app.route('/process_audio', methods=['POST'])
def process_audio():
    """
    Handle a voice request: transcribe it, generate a reply, synthesize speech.

    Expects a multipart/form-data POST carrying the recording in the
    ``audio`` file field.

    Returns:
        On success, a JSON object with ``transcription``, ``response_text``
        and ``response_audio_path`` (server-side path of the written WAV).
        A JSON ``error`` with status 400 when no audio file was uploaded, or
        with status 500 when any processing step raises.
    """
    audio_file = request.files.get('audio')
    if audio_file is None or not audio_file.filename:
        # A missing upload is a client error, not a server failure.
        return jsonify({"error": "No audio file provided in the 'audio' field"}), 400

    temp_dir = "./temp"
    try:
        # Step 1: Store the upload. The directory may not exist on first run.
        os.makedirs(temp_dir, exist_ok=True)
        # The client-supplied filename is untrusted (it may contain "../" or
        # an absolute path), so only a plain alphanumeric extension is kept
        # and the file itself gets a random name.
        extension = os.path.splitext(os.path.basename(audio_file.filename))[1]
        if not extension[1:].isalnum():
            extension = ""
        audio_path = f"{temp_dir}/{uuid.uuid4().hex}{extension}"
        audio_file.save(audio_path)

        # Step 2: Transcribe the audio to text
        transcription = transcribe_audio(audio_path)

        # Step 3: Generate a response based on the transcription
        response_text = generate_response(transcription)

        # Step 4: Synthesize speech from the response text
        response_audio = synthesize_speech(response_text)

        # Bark returns a raw sample array, which has no export() method;
        # write it as a WAV file at Bark's own sample rate instead.
        response_audio_path = f"{temp_dir}/response_audio.wav"
        wavfile.write(response_audio_path, SAMPLE_RATE, response_audio)

        return jsonify({
            "transcription": transcription,
            "response_text": response_text,
            "response_audio_path": response_audio_path
        })
    except Exception as e:
        # Top-level request boundary: report the failure as JSON.
        return jsonify({"error": str(e)}), 500

def transcribe_audio(audio_path):
    """
    Transcribe an audio file with the module-level Whisper model.

    Args:
        audio_path: Path of the audio file handed to ``speech_model.transcribe``.

    Returns:
        The text of all segments joined by single spaces.
    """
    segments, _info = speech_model.transcribe(audio_path)
    # Segment texts may carry their own leading/trailing whitespace; strip
    # each one and skip empties so the join never yields doubled or leading
    # spaces.
    pieces = (segment.text.strip() for segment in segments)
    return " ".join(piece for piece in pieces if piece)

def generate_response(user_input):
    """
    Generate a text reply with the module-level text-generation pipeline.

    Args:
        user_input: Prompt text to respond to.

    Returns:
        The newly generated text only, without the prompt echoed back.
    """
    response = nlp_model(
        user_input,
        # max_length would also count the prompt tokens, so a long prompt
        # would leave no room to generate; bound only the new tokens.
        max_new_tokens=100,
        do_sample=True,
        # By default the pipeline returns prompt + continuation; the caller
        # wants just the reply.
        return_full_text=False,
    )
    return response[0]['generated_text']

def synthesize_speech(text):
    """
    Turn ``text`` into speech audio by delegating to Bark's ``generate_audio``.

    Returns whatever ``generate_audio`` produces for the given text.
    """
    return generate_audio(text)

# Run the app
if __name__ == "__main__":
    # The server listens on all interfaces, and Flask's debug mode enables an
    # interactive debugger that can execute arbitrary code, so debug mode must
    # be opted into explicitly (FLASK_DEBUG=1) rather than being always on.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in (
        "1", "true", "yes", "on",
    )
    app.run(debug=debug_enabled, host="0.0.0.0", port=5000)