whisper_fileStream

Runtime error

App Files Files Community

mskov

Firefly777a committed on Mar 9, 2023

Commit

1575629

1 Parent(s): ed8df2e

Major changes to the app to allow prompt engineering (#2)

Browse files

- Major changes to the app to allow prompt engineering (ad35daaa7d851793c89104eff8bf4912e5c2dc76)

Co-authored-by: Maddie <[email protected]>

Files changed (1) hide show

app.py +47 -82

app.py CHANGED Viewed

@@ -1,14 +1,10 @@
 '''
-This script calls the ada model from openai api to predict the next few words.
 '''
 import os
-os.system("pip install --upgrade pip")
 from pprint import pprint
-os.system("pip install git+https://github.com/openai/whisper.git")
 import sys
-print("Sys: ", sys.executable)
-os.system("pip install openai")
 import openai
 import gradio as gr
 import whisper
@@ -17,68 +13,47 @@ import torch
 from transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer
 import time
-# import streaming.py
-# from next_word_prediction import GPT2
-#gpt2 = AutoModelForCausalLM.from_pretrained("gpt2", return_dict_in_generate=True)
-#tokenizer = AutoTokenizer.from_pretrained("gpt2")
-### /code snippet
-# get gpt2 model
-#generator = pipeline('text-generation', model='gpt2')
-# whisper model specification
-model = whisper.load_model("tiny")
-def inference(audio, state=""):
-    #time.sleep(2)
-    #text = p(audio)["text"]
-    #state += text + " "
-    # load audio data
-    audio = whisper.load_audio(audio)
-    # ensure sample is in correct format for inference
-    audio = whisper.pad_or_trim(audio)
-    # generate a log-mel spetrogram of the audio data
-    mel = whisper.log_mel_spectrogram(audio).to(model.device)
-    _, probs = model.detect_language(mel)
-    # decode audio data
-    options = whisper.DecodingOptions(fp16 = False)
-    # transcribe speech to text
-    result = whisper.decode(model, mel, options)
-    print("result pre gp model from whisper: ", result, ".text ", result.text, "and the data type: ", type(result.text))
-    PROMPT = """The following is an incomplete transcript of a brief conversation. Predict a list of the next most probable words to complete the sentence.
-Some examples:
-Transcript1: Tomorrow night we're going out to
-Predictions1: the movies, a restaurant, a baseball game, the theater, a party for a friend
-Transcript2: I would like to order a cheeseburger with a side of
-Predictions2: french fries, milkshake, apple slices, salad, extra catsup
-Transcript3: My friend Savanah is
-Predictions3: an electrical engineer, a marine biologist, a classical musician
-Transcript4: I need to buy a birthday
-Predictions4: present, gift, cake, card
-Transcript5: """
-    text = PROMPT + result.text + "Prediction5: "
-    openai.api_key = os.environ["Openai_APIkey"]
     response = openai.Completion.create(
-                        model="text-ada-001",
-                        #model="text-curie-001",
                         prompt=text,
-                        temperature=1,
                         max_tokens=8,
                         n=5)
@@ -96,27 +71,17 @@ Transcript5: """
         infers = list(map(lambda x: x.replace("\n", ""), temp))
         #infered = list(map(lambda x: x.split(','), infers))
-    # result.text
-    #return getText, gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
-    return result.text, state, infers
 # get audio from microphone
 gr.Interface(
-        fn=inference,
-    inputs=[
-        gr.inputs.Audio(source="microphone", type="filepath"),
-        "state"
-    ],
-    outputs=[
-        "textbox",
-        "state",
-        "textbox"
-    ],
-    live=True).launch()

 '''
+This script calls the model from openai api to predict the next few words.
 '''
 import os
 from pprint import pprint
 import sys
 import openai
 import gradio as gr
 import whisper
 from transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer
 import time
+EXAMPLE_PROMPT = """This is a tool for helping someone with memory issues remember the next word.
+The predictions follow a few rules:
+1) The predictions are suggestions of ways to continue the transcript as if someone forgot what the next word was.
+2) The predictions do not repeat themselves.
+3) The predictions focus on suggesting nouns, adjectives, and verbs.
+4) The predictions are related to the context in the transcript.
+EXAMPLES:
+Transcript: Tomorrow night we're going out to
+Prediction: The Movies, A Restaurant, A Baseball Game, The Theater, A Party for a friend
+Transcript: I would like to order a cheeseburger with a side of
+Prediction: Frnech fries, Milkshake, Apple slices, Side salad, Extra katsup
+Transcript: My friend Savanah is
+Prediction: An elecrical engineer, A marine biologist, A classical musician
+Transcript: I need to buy a birthday
+Prediction: Present, Gift, Cake, Card
+Transcript: """
+# whisper model specification
+asr_model = whisper.load_model("tiny")
+openai.api_key = os.environ["Openai_APIkey"]
+# Transcribe function
+def transcribe(audio_file):
+    print("Transcribing")
+    transcription = asr_model.transcribe(audio_file)["text"]
+    return transcription
+def debug_inference(audio, prompt, model, temperature, state=""):
+    # Transcribe with Whisper
+    print("The audio is:", audio)
+    transcript = transcribe(audio)
+    text = prompt + transcript + "\nPrediction: "
     response = openai.Completion.create(
+                        model=model,
                         prompt=text,
+                        temperature=temperature,
                         max_tokens=8,
                         n=5)
         infers = list(map(lambda x: x.replace("\n", ""), temp))
         #infered = list(map(lambda x: x.split(','), infers))
+    return transcript, state, infers, text
 # get audio from microphone
 gr.Interface(
+    fn=debug_inference,
+    inputs=[gr.inputs.Audio(source="microphone", type="filepath"),
+            gr.inputs.Textbox(lines=15, placeholder="Enter a prompt here"),
+            gr.inputs.Dropdown(["text-ada-001", "text-davinci-002", "text-davinci-003", "gpt-3.5-turbo"], label="Model"),
+            gr.inputs.Slider(minimum=0.0, maximum=1.0, default=0.8, step=0.1, label="Temperature"),
+            "state"
+            ],
+    outputs=["textbox","state","textbox", "textbox"],
+    # examples=[["example_in-the-mood-to-eat.m4a", EXAMPLE_PROMPT, "text-ada-001", 0.8, ""],["","","",0.9,""]],
+    live=False).launch()