Spaces:

etchen
/

phonolearn

Sleeping

etchen committed on Jun 14

Commit

e91b2cd

verified ·

1 Parent(s): 7038cf7

add whisper

Files changed (1) hide show

app.py CHANGED Viewed

@@ -4,55 +4,29 @@ import numpy as np
 # import spaces #[uncomment to use ZeroGPU]
 import torch
 device = "cuda" if torch.cuda.is_available() else "cpu"
-model_repo_id = "stabilityai/sdxl-turbo"  # Replace to the model you would like to use
 if torch.cuda.is_available():
     torch_dtype = torch.float16
 else:
     torch_dtype = torch.float32
-pipe = DiffusionPipeline.from_pretrained(model_repo_id, torch_dtype=torch_dtype)
-pipe = pipe.to(device)
-MAX_SEED = np.iinfo(np.int32).max
-MAX_IMAGE_SIZE = 1024
 # @spaces.GPU #[uncomment to use ZeroGPU]
 def infer(
-    prompt,
-    negative_prompt,
-    seed,
-    randomize_seed,
-    width,
-    height,
-    guidance_scale,
-    num_inference_steps,
-    progress=gr.Progress(track_tqdm=True),
 ):
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
-    generator = torch.Generator().manual_seed(seed)
-    image = pipe(
-        prompt=prompt,
-        negative_prompt=negative_prompt,
-        guidance_scale=guidance_scale,
-        num_inference_steps=num_inference_steps,
-        width=width,
-        height=height,
-        generator=generator,
-    ).images[0]
-    return image, seed
 with gr.Blocks(css=css) as demo:
     gr.Markdown(" # PhonoLearn")
     input_audio = gr.Audio(
         sources=["microphone", "upload"]
     )
 if __name__ == "__main__":
     demo.launch()

 # import spaces #[uncomment to use ZeroGPU]
 import torch
+from transformers import pipeline
 device = "cuda" if torch.cuda.is_available() else "cpu"
+model_repo_id = "openai/whisper-tiny"
 if torch.cuda.is_available():
     torch_dtype = torch.float16
 else:
     torch_dtype = torch.float32
+pipe = pipeline(task="automatic-speech-recognition", model=model_repo_id, device=device)
 # @spaces.GPU #[uncomment to use ZeroGPU]
 def infer(
+    audio
 ):
+    return pipe(audio, generate_kwargs={'language': 'chinese'})['text']
 with gr.Blocks(css=css) as demo:
     gr.Markdown(" # PhonoLearn")
     input_audio = gr.Audio(
         sources=["microphone", "upload"]
     )
 if __name__ == "__main__":
     demo.launch()