vaibhavs10 committed on
Commit
12b6ee7
·
1 Parent(s): 68ffe75

Adding YT transcription code

Browse files
Files changed (1) hide show
  1. app.py +58 -41
app.py CHANGED
@@ -1,66 +1,83 @@
1
  import gradio as gr
2
- import torch
3
- from transformers import WhisperForConditionalGeneration, WhisperProcessor
4
- import librosa
5
- import soundfile
6
 
7
  MODEL_NAME = "openai/whisper-small"
8
- lang = "ja"
9
 
10
  device = "cuda" if torch.cuda.is_available() else "cpu"
11
 
12
- model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
13
- processor = WhisperProcessor.from_pretrained(MODEL_NAME)
14
-
15
-
16
- def load_and_fix_data(input_file):
17
- speech, sample_rate = librosa.load(input_file)
18
- if len(speech.shape) > 1:
19
- speech = speech[:,0] + speech[:,1]
20
- if sample_rate !=16000:
21
- speech = librosa.resample(speech, sample_rate,16000)
22
- speech = librosa.to_mono(speech)
23
- return speech
24
 
25
 
26
- def transcribe(Microphone, File_Upload):
27
  warn_output = ""
28
- if (Microphone is not None) and (File_Upload is not None):
29
- warn_output = "WARNING: You've uploaded an audio file and used the microphone. " \
30
- "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
31
- file = Microphone
 
 
32
 
33
- elif (Microphone is None) and (File_Upload is None):
34
  return "ERROR: You have to either use the microphone or upload an audio file"
35
 
36
- elif Microphone is not None:
37
- file = Microphone
38
- else:
39
- file = File_Upload
40
-
41
- speech_data = load_and_fix_data(file)
42
 
43
- inputs = processor(speech_data, return_tensors="pt", sampling_rate=16_000).input_features.to(device)
44
- forced_decoder_ids = processor.get_decoder_prompt_ids(language=lang, task="transcribe")
45
-
46
- predicted_ids = model.generate(inputs, max_length=480_000, forced_decoder_ids=forced_decoder_ids)
47
- text = processor.batch_decode(predicted_ids, skip_special_tokens=True, normalize=True)[0]
48
 
49
  return warn_output + text
50
 
51
 
52
- iface = gr.Interface(
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  fn=transcribe,
54
  inputs=[
55
- gr.inputs.Audio(source="microphone", type='filepath', optional=True),
56
- gr.inputs.Audio(source="upload", type='filepath', optional=True),
57
  ],
58
  outputs="text",
59
  layout="horizontal",
60
  theme="huggingface",
61
- title="[WFTE] Whisper model showcase",
62
- description="Demo for showcasing fine-tuned OpenAI Whisper models from WFTE.",
63
- allow_flagging='never',
64
  )
65
 
66
- iface.launch(enable_queue=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
import pytube as pt
import torch
from transformers import pipeline

# Hugging Face Hub checkpoint used for every transcription request.
MODEL_NAME = "openai/whisper-small"

# `torch` has to be imported for this check: without it the module fails
# with a NameError before the app can start.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Single shared speech-recognition pipeline, built once at import time.
# chunk_length_s=30 lets the pipeline process long recordings in
# 30-second chunks.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)
 
 
 
 
 
 
15
 
16
 
17
def transcribe(microphone, file_upload):
    """Transcribe a microphone recording or an uploaded audio file.

    Args:
        microphone: File path of the recorded audio, or ``None`` when
            nothing was recorded.
        file_upload: File path of the uploaded audio, or ``None`` when
            nothing was uploaded.

    Returns:
        The transcribed text. When both inputs are supplied the text is
        prefixed with a warning and the microphone recording is used.
        When neither is supplied an error message is returned instead.
    """
    # Guard first: with no input there is nothing to hand to the model.
    if microphone is None and file_upload is None:
        return "ERROR: You have to either use the microphone or upload an audio file"

    warn_output = ""
    if microphone is not None and file_upload is not None:
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )

    # The microphone recording always takes precedence over the upload.
    audio_path = microphone if microphone is not None else file_upload

    text = pipe(audio_path)["text"]

    return warn_output + text
34
 
35
 
36
def yt_transcribe(yt_url):
    """Download the audio track of a YouTube video and transcribe it.

    Args:
        yt_url: URL of the YouTube video, as typed into the textbox.

    Returns:
        The transcribed text, or an ``"ERROR: ..."`` message when the URL
        is empty or the video exposes no audio-only stream.
    """
    # Local import keeps this function self-contained.
    import tempfile

    if not yt_url or not yt_url.strip():
        return "ERROR: You have to paste a YouTube URL"

    yt = pt.YouTube(yt_url.strip())

    # first() yields None instead of raising IndexError when the video has
    # no audio-only stream.
    stream = yt.streams.filter(only_audio=True).first()
    if stream is None:
        return "ERROR: No audio-only stream is available for this video"

    # Each request downloads into its own temporary directory, so concurrent
    # requests cannot overwrite each other's audio, and the file is removed
    # as soon as transcription finishes.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # NOTE(review): audio-only streams are usually not MP3 containers;
        # the original file name is kept as-is - confirm the pipeline's
        # decoder does not rely on the extension.
        audio_path = stream.download(output_path=tmp_dir, filename="audio.mp3")
        text = pipe(audio_path)["text"]

    return text
45
+
46
+
47
# Top-level container that hosts both demos as tabs.
demo = gr.Blocks()

# Tab 1: transcribe a microphone recording or an uploaded audio file.
mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath", optional=True),
        gr.inputs.Audio(source="upload", type="filepath", optional=True),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="Whisper Audio Transcribe",
    description="Transcribe long audio/ microphone input (powered by 🤗transformers) with a click of a button!",
    allow_flagging="never",
)

# Tab 2: transcribe the audio track of a YouTube video.
# The interface gets its own name so that it does not rebind (and thereby
# hide) the `yt_transcribe` function it wraps.
yt_interface = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.inputs.Textbox(
            lines=1, placeholder="Paste a URL to YT video here", label="yt_url"
        )
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="Whisper YT Transcribe",
    description="Transcribe long YouTube videos (powered by 🤗transformers) with a click of a button!",
    allow_flagging="never",
)

with demo:
    gr.TabbedInterface(
        [mf_transcribe, yt_interface], ["Audio Transcribe", "YouTube Transcribe"]
    )

demo.launch(enable_queue=True)