Nick021402 committed on
Commit
d914104
·
verified ·
1 Parent(s): 67dd37a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -21
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import gradio as gr
2
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
3
  import torch
4
- import numpy as np
5
 
6
  # Load pretrained model and processor
7
  processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
@@ -11,28 +11,15 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
11
  model.to(device)
12
 
13
  # Transcription function
14
- def transcribe(audio):
15
- if audio is None:
16
  return "Please upload or record an audio file."
17
 
18
- # audio is a tuple: (numpy_array, sample_rate)
19
- if isinstance(audio, tuple):
20
- audio_np, sample_rate = audio
21
- else:
22
- return "Invalid audio input."
23
 
24
- # Convert to mono if stereo
25
- if len(audio_np.shape) > 1:
26
- audio_np = np.mean(audio_np, axis=1)
27
-
28
- # Resample to 16000 Hz if necessary
29
- if sample_rate != 16000:
30
- import librosa
31
- audio_np = librosa.resample(audio_np, orig_sr=sample_rate, target_sr=16000)
32
- sample_rate = 16000
33
-
34
- # Process and run model
35
- input_values = processor(audio_np, sampling_rate=sample_rate, return_tensors="pt").input_values.to(device)
36
  with torch.no_grad():
37
  logits = model(input_values).logits
38
  predicted_ids = torch.argmax(logits, dim=-1)
@@ -45,7 +32,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
45
  gr.Markdown("Upload or record your voice, and this app will transcribe what you say.")
46
 
47
  with gr.Row():
48
- audio_input = gr.Audio(label="🎤 Record or Upload Your Voice", type="numpy", streaming=False)
49
  output_text = gr.Textbox(label="📝 Transcribed Text")
50
 
51
  transcribe_button = gr.Button("Transcribe")
 
1
  import gradio as gr
2
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
3
  import torch
4
+ import librosa
5
 
6
  # Load pretrained model and processor
7
  processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
 
11
  model.to(device)
12
 
13
  # Transcription function
14
+ def transcribe(audio_path):
15
+ if audio_path is None:
16
  return "Please upload or record an audio file."
17
 
18
+ # Load audio file and resample to 16kHz mono
19
+ audio_np, sample_rate = librosa.load(audio_path, sr=16000)
 
 
 
20
 
21
+ # Process and transcribe
22
+ input_values = processor(audio_np, sampling_rate=16000, return_tensors="pt").input_values.to(device)
 
 
 
 
 
 
 
 
 
 
23
  with torch.no_grad():
24
  logits = model(input_values).logits
25
  predicted_ids = torch.argmax(logits, dim=-1)
 
32
  gr.Markdown("Upload or record your voice, and this app will transcribe what you say.")
33
 
34
  with gr.Row():
35
+ audio_input = gr.Audio(label="🎤 Record or Upload Your Voice", type="filepath")
36
  output_text = gr.Textbox(label="📝 Transcribed Text")
37
 
38
  transcribe_button = gr.Button("Transcribe")