etchen committed on
Commit
e91b2cd
·
verified ·
1 Parent(s): 7038cf7

add whisper

Browse files
Files changed (1) hide show
  1. app.py +7 -33
app.py CHANGED
@@ -4,55 +4,29 @@ import numpy as np
4
  # import spaces #[uncomment to use ZeroGPU]
5
  import torch
6
 
 
 
7
  device = "cuda" if torch.cuda.is_available() else "cpu"
8
- model_repo_id = "stabilityai/sdxl-turbo" # Replace to the model you would like to use
9
 
10
  if torch.cuda.is_available():
11
  torch_dtype = torch.float16
12
  else:
13
  torch_dtype = torch.float32
14
 
15
- pipe = DiffusionPipeline.from_pretrained(model_repo_id, torch_dtype=torch_dtype)
16
- pipe = pipe.to(device)
17
-
18
- MAX_SEED = np.iinfo(np.int32).max
19
- MAX_IMAGE_SIZE = 1024
20
-
21
 
22
  # @spaces.GPU #[uncomment to use ZeroGPU]
23
  def infer(
24
- prompt,
25
- negative_prompt,
26
- seed,
27
- randomize_seed,
28
- width,
29
- height,
30
- guidance_scale,
31
- num_inference_steps,
32
- progress=gr.Progress(track_tqdm=True),
33
  ):
34
- if randomize_seed:
35
- seed = random.randint(0, MAX_SEED)
36
-
37
- generator = torch.Generator().manual_seed(seed)
38
-
39
- image = pipe(
40
- prompt=prompt,
41
- negative_prompt=negative_prompt,
42
- guidance_scale=guidance_scale,
43
- num_inference_steps=num_inference_steps,
44
- width=width,
45
- height=height,
46
- generator=generator,
47
- ).images[0]
48
-
49
- return image, seed
50
 
51
  with gr.Blocks(css=css) as demo:
52
  gr.Markdown(" # PhonoLearn")
53
  input_audio = gr.Audio(
54
  sources=["microphone", "upload"]
55
  )
56
-
57
  if __name__ == "__main__":
58
  demo.launch()
 
4
  # import spaces #[uncomment to use ZeroGPU]
5
  import torch
6
 
7
+ from transformers import pipeline
8
+
9
  device = "cuda" if torch.cuda.is_available() else "cpu"
10
+ model_repo_id = "openai/whisper-tiny"
11
 
12
  if torch.cuda.is_available():
13
  torch_dtype = torch.float16
14
  else:
15
  torch_dtype = torch.float32
16
 
17
+ pipe = pipeline(task="automatic-speech-recognition", model=model_repo_id, device=device)
 
 
 
 
 
18
 
19
  # @spaces.GPU #[uncomment to use ZeroGPU]
20
  def infer(
21
+ audio
 
 
 
 
 
 
 
 
22
  ):
23
+ return pipe(audio, generate_kwargs={'language': 'chinese'})['text']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  with gr.Blocks(css=css) as demo:
26
  gr.Markdown(" # PhonoLearn")
27
  input_audio = gr.Audio(
28
  sources=["microphone", "upload"]
29
  )
30
+
31
  if __name__ == "__main__":
32
  demo.launch()