Bagda committed
Commit 4b4cac8 · verified · 1 Parent(s): 5435791

Update app.py

Files changed (1)
  1. app.py +9 -28
app.py CHANGED
@@ -17,47 +17,28 @@ demo = gr.Interface(
 
 demo.launch()
 
-
 from pytube import YouTube
+from moviepy.editor import VideoFileClip
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
+import librosa
 
+# Step 1: Download YouTube video as audio
 video_url = "https://www.youtube.com/watch?v=YOUR_VIDEO_ID"
 yt = YouTube(video_url)
 stream = yt.streams.filter(only_audio=True).first()
 stream.download(filename="video_audio.mp4")
 
-from moviepy.editor import VideoFileClip
-
+# Step 2: Extract audio as WAV
 video = VideoFileClip("video_audio.mp4")
 audio = video.audio
 audio.write_audiofile("output_audio.wav")
 
-
-
-from transformers import WhisperProcessor, WhisperForConditionalGeneration
-import torch
-from datasets import load_dataset
-
-# Load model and processor
+# Step 3: Speech-to-text with Whisper-Small
 processor = WhisperProcessor.from_pretrained("openai/whisper-small")
 model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
-
-# Optional: Use GPU if available
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model = model.to(device)
-
-# Load sample audio (here using a dummy dataset; you can also use your own audio file)
-ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-sample = ds[0]["audio"]
-
-# Prepare audio input
-input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
-input_features = input_features.to(device)
-
-# Generate transcription
+audio, sr = librosa.load("output_audio.wav", sr=16000)
+input_features = processor(audio, sampling_rate=sr, return_tensors="pt").input_features
 predicted_ids = model.generate(input_features)
-
-# Decode transcription
-transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
 print(transcription)
 
-
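For reference, a minimal sketch of how the same download → extract → transcribe pipeline from the updated app.py could be wrapped in a function and served through the gr.Interface mentioned in the hunk header. This is not part of the commit; the function name transcribe_youtube and the text input/output wiring are illustrative assumptions.

# Sketch only: wraps the pipeline from the updated app.py in a callable for Gradio.
import gradio as gr
import librosa
from pytube import YouTube
from moviepy.editor import VideoFileClip
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Load the Whisper-Small checkpoint once at startup
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

def transcribe_youtube(video_url: str) -> str:
    # Step 1: download the audio-only stream of the video
    yt = YouTube(video_url)
    yt.streams.filter(only_audio=True).first().download(filename="video_audio.mp4")
    # Step 2: extract a WAV track from the downloaded file
    VideoFileClip("video_audio.mp4").audio.write_audiofile("output_audio.wav")
    # Step 3: resample to 16 kHz and run Whisper speech-to-text
    audio, sr = librosa.load("output_audio.wav", sr=16000)
    input_features = processor(audio, sampling_rate=sr, return_tensors="pt").input_features
    predicted_ids = model.generate(input_features)
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

demo = gr.Interface(fn=transcribe_youtube, inputs="text", outputs="text")
demo.launch()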