muhtasham committed
Commit a6eeb9b · 1 Parent(s): 88d0fe2
Files changed (2):
  1. app.py +24 -59
  2. requirements.txt +0 -2
app.py CHANGED
```diff
@@ -1,12 +1,16 @@
-import torch
 import gradio as gr
 import subprocess
 import datetime
 import tempfile
-from transformers import pipeline
+import requests
 from loguru import logger
+from os import getenv
 
-MODEL_NAME = "muhtasham/whisper-tg"
+API_URL = getenv("API_URL")
+headers = {
+    "Accept": "application/json",
+    "Content-Type": "audio/flac"
+}
 
 def format_time(seconds):
     """Convert seconds to SRT time format (HH:MM:SS,mmm).
@@ -66,7 +70,7 @@ def generate_srt(chunks):
     for i, chunk in enumerate(chunks, 1):
         start_time = format_time(chunk["timestamp"][0])
         end_time = format_time(chunk["timestamp"][1])
-        text = chunk["text"].strip()
+        text = chunk.get("text", "").strip()
         srt_content.append(f"{i}\n{start_time} --> {end_time}\n{text}\n\n")
     return "".join(srt_content)
 
@@ -106,35 +110,13 @@ def check_ffmpeg():
 # Initialize ffmpeg check
 check_ffmpeg()
 
-# Use T4 GPU if available, otherwise fallback to CPU
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-logger.info(f"Using device: {device}")
-
-def create_pipeline():
-    """Create a new pipeline with optimized settings for T4 GPU.
-
-    Returns:
-        transformers.Pipeline: Configured speech recognition pipeline.
-    """
-    return pipeline(
-        task="automatic-speech-recognition",
-        model=MODEL_NAME,
-        device=device,
-    )
-
-# Initialize pipeline once
-pipe = create_pipeline()
-logger.info(f"Pipeline initialized: {pipe}")
-
-def transcribe(inputs, return_timestamps, generate_subs, batch_size, chunk_length_s):
-    """Transcribe audio input using Whisper model.
+def transcribe(inputs, return_timestamps, generate_subs):
+    """Transcribe audio input using Whisper model via Hugging Face Inference API.
 
     Args:
         inputs (str): Path to audio file to transcribe.
         return_timestamps (bool): Whether to include timestamps in output.
         generate_subs (bool): Whether to generate SRT subtitles.
-        batch_size (int): Number of chunks to process in parallel.
-        chunk_length_s (int): Length of audio chunks in seconds.
 
     Returns:
         tuple: (formatted_result, srt_file, correction_text)
@@ -152,27 +134,20 @@ def transcribe(inputs, return_timestamps, generate_subs, batch_size, chunk_lengt
     try:
         logger.info(f"Processing audio file: {inputs}")
 
-        # Calculate optimal chunk and stride lengths based on input
-        stride_length_s = chunk_length_s / 6
+        # Read the audio file
+        with open(inputs, "rb") as f:
+            data = f.read()
 
-        # Clear CUDA cache before processing
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-            logger.debug("Cleared CUDA cache before processing")
+        # Send request to API
+        response = requests.post(API_URL, headers=headers, data=data)
+        response.raise_for_status()  # Raise an exception for bad status codes
+        result = response.json()
 
-        # Process audio with dynamic chunking
-        result = pipe(
-            inputs,
-            batch_size=batch_size,
-            chunk_length_s=chunk_length_s,
-            stride_length_s=stride_length_s,
-            return_timestamps="word" if return_timestamps else False
-        )
-        logger.debug(f"Pipeline result: {result}")
+        logger.debug(f"API response: {result}")
 
         # Format response as JSON
         formatted_result = {
-            "text": result["text"]
+            "text": result.get("text", "")
         }
 
         chunks = []
@@ -208,17 +183,11 @@ def transcribe(inputs, return_timestamps, generate_subs, batch_size, chunk_lengt
         srt_file = save_srt_to_file(srt_content)
         logger.info("SRT subtitles generated successfully")
 
-        # Clear CUDA cache after processing
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-            logger.debug("Cleared CUDA cache after processing")
-
         return formatted_result, srt_file, ""  # Return empty string for correction textbox
+    except requests.exceptions.RequestException as e:
+        logger.exception(f"API request failed: {str(e)}")
+        raise gr.Error(f"Failed to transcribe audio: API request failed - {str(e)}")
     except Exception as e:
-        # Ensure CUDA cache is cleared even if there's an error
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-            logger.debug("Cleared CUDA cache after error")
         logger.exception(f"Error during transcription: {str(e)}")
         raise gr.Error(f"Failed to transcribe audio: {str(e)}")
 
@@ -232,8 +201,6 @@ mf_transcribe = gr.Interface(
         gr.Audio(sources="microphone", type="filepath"),
         gr.Checkbox(label="Include timestamps", value=True),
         gr.Checkbox(label="Generate subtitles", value=True),
-        gr.Slider(minimum=1, maximum=64, value=8, step=1, label="Batch Size"),
-        gr.Slider(minimum=5, maximum=30, value=30, step=5, label="Chunk Length (seconds)"),
     ],
     outputs=[
         gr.JSON(label="Transcription", open=True),
@@ -242,7 +209,7 @@ mf_transcribe = gr.Interface(
     title="Whisper Large V3 Turbo: Transcribe Audio",
     description=(
         "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
-        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
+        f" checkpoint [{API_URL}](https://huggingface.co/{API_URL}) and 🤗 Transformers to transcribe audio files"
         " of arbitrary length."
     )
 )
@@ -253,8 +220,6 @@ file_transcribe = gr.Interface(
         gr.Audio(sources="upload", type="filepath", label="Audio file"),
         gr.Checkbox(label="Include timestamps", value=True),
         gr.Checkbox(label="Generate subtitles", value=True),
-        gr.Slider(minimum=1, maximum=32, value=8, step=1, label="Batch Size"),
-        gr.Slider(minimum=5, maximum=30, value=15, step=5, label="Chunk Length (seconds)"),
     ],
     outputs=[
         gr.JSON(label="Transcription", open=True),
@@ -263,7 +228,7 @@ file_transcribe = gr.Interface(
     title="Whisper Large V3: Transcribe Audio",
     description=(
         "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
-        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
+        f" checkpoint [{API_URL}](https://huggingface.co/{API_URL}) and 🤗 Transformers to transcribe audio files"
         " of arbitrary length."
     )
 )
```
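The rewritten `transcribe` above replaces local pipeline inference with a plain HTTP round trip: read the audio bytes, POST them to `API_URL`, parse the JSON reply. A minimal standalone sketch of that round trip follows; the endpoint URL, the `sample.flac` filename, and the optional bearer token are placeholders (the commit itself reads `API_URL` from the environment and sends no `Authorization` header), and the response is only assumed to carry a `text` field, matching what the handler reads.

```python
# Standalone sketch of the HTTP round trip transcribe() now performs.
# API_URL, HF_TOKEN, and sample.flac are placeholders, not part of the commit.
import os
import requests

API_URL = os.getenv("API_URL", "https://example.invalid/whisper-endpoint")
headers = {
    "Accept": "application/json",
    "Content-Type": "audio/flac",
}
if os.getenv("HF_TOKEN"):  # some endpoints require auth; the commit's headers carry none
    headers["Authorization"] = f"Bearer {os.environ['HF_TOKEN']}"

with open("sample.flac", "rb") as f:  # raw bytes, exactly as app.py sends them
    audio_bytes = f.read()

response = requests.post(API_URL, headers=headers, data=audio_bytes, timeout=120)
response.raise_for_status()  # surface 4xx/5xx as exceptions, as app.py does
print(response.json().get("text", ""))  # mirrors app.py's defensive .get("text", "")
```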
 
 
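One detail worth noting in the `generate_srt` hunk: `chunk["text"]` became `chunk.get("text", "")`, so a chunk the API returns without a `text` field yields an empty cue instead of a `KeyError`. Below is a self-contained sketch of the SRT path under that change; `format_time` is restated from its docstring (its body is not part of this diff), and the sample chunks are invented for illustration.

```python
# Restated from app.py's docstring: convert seconds to SRT time (HH:MM:SS,mmm).
def format_time(seconds: float) -> str:
    ms = int(round(seconds * 1000))
    h, ms = divmod(ms, 3_600_000)
    m, ms = divmod(ms, 60_000)
    s, ms = divmod(ms, 1_000)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

# The post-commit generate_srt: tolerates chunks with no "text" key.
def generate_srt(chunks):
    srt_content = []
    for i, chunk in enumerate(chunks, 1):
        start_time = format_time(chunk["timestamp"][0])
        end_time = format_time(chunk["timestamp"][1])
        text = chunk.get("text", "").strip()
        srt_content.append(f"{i}\n{start_time} --> {end_time}\n{text}\n\n")
    return "".join(srt_content)

chunks = [
    {"timestamp": (0.0, 1.5), "text": " Salom"},
    {"timestamp": (1.5, 3.0)},  # no "text": now an empty cue, not a KeyError
]
print(generate_srt(chunks))
```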
requirements.txt CHANGED
```diff
@@ -1,4 +1,2 @@
-transformers
 loguru
-torch
 gradio
```
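With `torch` and `transformers` gone from requirements.txt, inference no longer runs in-process, so the app can be smoke-tested without a GPU by pointing `API_URL` at a local stub. The sketch below is hypothetical; only the `{"text": ...}` response shape comes from the code in this commit.

```python
# Hypothetical local stub: accepts the POSTed audio bytes and answers with
# the {"text": ...} JSON shape that transcribe() expects from the real API.
import json
import threading
from http.server import BaseHTTPRequestHandler, HTTPServer

class StubWhisperHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        length = int(self.headers.get("Content-Length", 0))
        self.rfile.read(length)  # discard the audio; a real endpoint transcribes it
        payload = json.dumps({"text": f"stub transcription of {length} bytes"})
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.end_headers()
        self.wfile.write(payload.encode("utf-8"))

server = HTTPServer(("127.0.0.1", 8765), StubWhisperHandler)
threading.Thread(target=server.serve_forever, daemon=True).start()
# Launch the app with API_URL=http://127.0.0.1:8765 set in the environment.
```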