jsbeaudry committed on
Commit bd1095d · verified · 1 Parent(s): 13e0865

Update app.py

Files changed (1)
  app.py +22 -35
app.py CHANGED
@@ -1,46 +1,33 @@
- from transformers import pipeline
- import gradio as gr
- from unsloth import FastModel
- from transformers import WhisperForConditionalGeneration
  import torch
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+ from datasets import load_dataset

- model, tokenizer = FastModel.from_pretrained(
-     model_name = "jsbeaudry/creole-speech-to-text",
-     dtype = None,  # Leave as None for auto detection
-     load_in_4bit = False,  # Set to True to do 4bit quantization which reduces memory
-     auto_model = WhisperForConditionalGeneration,
-     whisper_language = "Haitian",
-     whisper_task = "transcribe",
-     # token = "hf_...",  # use one if using gated models like meta-llama/Llama-2-7b-hf
- )
- # Reuse the previously created pipeline object
- # pipe = pipeline(model)  # This line caused the error
-
- # Initialize the pipeline correctly
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+ model_id = "openai/whisper-large-v3"
+
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
+     model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
+ )
+ model.to(device)
+
+ processor = AutoProcessor.from_pretrained(model_id)

  pipe = pipeline(
      "automatic-speech-recognition",
      model=model,
-     tokenizer=tokenizer.tokenizer,
-     feature_extractor=tokenizer.feature_extractor,
-     processor=tokenizer,
-     return_language=True,
-     torch_dtype=torch.float16
+     tokenizer=processor.tokenizer,
+     feature_extractor=processor.feature_extractor,
+     chunk_length_s=30,
+     batch_size=16,  # batch size for inference - set based on your device
+     torch_dtype=torch_dtype,
+     device=device,
  )

- def transcribe(audio):
-     # Use the 'pipe' pipeline
-     text = pipe(audio)["text"]
-     return text
-
-
- iface = gr.Interface(
-     fn=transcribe,
-     inputs=gr.Audio(type="filepath"),
-     outputs="text",
-     title="Whisper medium Creole",
-     description="Realtime demo for Haitian Creole speech recognition using a fine-tuned medium small model.",
- )
-
- iface.launch()
+ dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
+ sample = dataset[0]["audio"]
+
+ result = pipe(sample)
+ print(result["text"])
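
One behavioral difference worth noting: the removed version pinned decoding to Haitian Creole (whisper_language = "Haitian"), while the new pipeline lets Whisper auto-detect the language of each clip. If pinning is still wanted, the language can be passed through the pipeline's generate_kwargs; a minimal sketch, assuming the pipe built in the added code above (check the exact language string against Whisper's supported language names):

# Illustrative only, not part of this commit: force Haitian Creole transcription
result = pipe(sample, generate_kwargs={"language": "haitian creole", "task": "transcribe"})
print(result["text"])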
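
The rewrite also drops the Gradio interface. If the Space still needs a UI on top of the new pipeline, a minimal sketch of reattaching one, mirroring the removed code (the transcribe helper, title, and description below are illustrative, not part of this commit):

# Illustrative only, not part of this commit: reuse the new `pipe` in a Gradio UI
import gradio as gr

def transcribe(audio_path):
    # gr.Audio(type="filepath") hands the recorded/uploaded file path to the pipeline
    return pipe(audio_path)["text"]

iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Whisper large-v3 demo",
    description="Speech recognition demo built on the pipeline defined above.",
)

iface.launch()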