import subprocess
import sys

subprocess.run(["python", "-m", "pip", "install", "--upgrade", "pip"])
subprocess.run(["pip", "install", "gradio", "--upgrade"])
subprocess.run(["pip", "install", "soundfile"])
subprocess.run(["pip", "install", "numpy"])
subprocess.run(["pip", "install", "pydub"])
subprocess.run(["pip", "install", "openai"])

import gradio as gr
import openai

# Set your OpenAI API key
openai.api_key = "YOUR_OPENAI_API_KEY"

# The Whisper model hosted by the OpenAI API is named "whisper-1"
whisper_model = "whisper-1"

# Define the function for ASR
def transcribe_audio(audio_path):
    # With type="filepath", Gradio saves the recording and passes its path
    if audio_path is None:
        return ""

    # Perform ASR using OpenAI's hosted Whisper endpoint; the API accepts
    # the audio file directly, so no local conversion or resampling is needed
    with open(audio_path, "rb") as audio_file:
        response = openai.Audio.transcribe(whisper_model, audio_file)

    # Extract the transcribed text from the response
    return response["text"].strip()

# Define the Gradio interface; the callback must be passed at construction time
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Textbox(),
    live=True,
)

# Launch the Gradio app
iface.launch()
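
# For reference, a minimal sketch of the same transcription call against the
# openai>=1.0 SDK, if pinning the pre-1.0 package above is not an option
# (assumes the v1 client interface; adapt transcribe_audio accordingly):
#
# from openai import OpenAI
# client = OpenAI(api_key="YOUR_OPENAI_API_KEY")
# with open("clip.wav", "rb") as f:
#     text = client.audio.transcriptions.create(model="whisper-1", file=f).text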