muhtasham committed on
Commit 22095b0 · 1 Parent(s): c5741b3

feat: enhance transcription with configurable parameters and feedback system


- Add configurable batch size (1-32) and chunk length (5-60s) parameters
- Implement comprehensive feedback system with quick rating and detailed corrections
- Switch to local pipeline processing with GPU support
- Add logging for better debugging
- Improve error handling and user feedback

Files changed (1)
app.py +126 -54
app.py CHANGED
@@ -1,5 +1,7 @@
+ import spaces
+ import torch
  import gradio as gr
- import requests
+ from transformers import pipeline
  import subprocess
  from loguru import logger
  import datetime
@@ -7,7 +9,7 @@ import datetime
  # Configure loguru
  logger.add("app.log", rotation="500 MB", level="DEBUG")

- API_URL = "https://skdpcqcdd929o4k3.us-east-1.aws.endpoints.huggingface.cloud"
+ MODEL_NAME = "muhtasham/whisper-tg"

  def format_time(seconds):
      """Convert seconds to SRT time format (HH:MM:SS,mmm)"""
@@ -40,44 +42,35 @@ def check_ffmpeg():
  # Initialize ffmpeg check
  check_ffmpeg()

- def transcribe(inputs, return_timestamps, generate_subs):
+ device = 0 if torch.cuda.is_available() else "cpu"
+ logger.info(f"Using device: {device}")
+
+ def create_pipeline(chunk_length_s):
+     """Create a new pipeline with specified chunk length"""
+     return pipeline(
+         task="automatic-speech-recognition",
+         model=MODEL_NAME,
+         chunk_length_s=chunk_length_s,
+         device=device,
+     )
+
+ # Initialize default pipeline
+ pipe = create_pipeline(30)
+ logger.info(f"Pipeline initialized: {pipe}")
+
+ @spaces.GPU
+ def transcribe(inputs, return_timestamps, generate_subs, batch_size, chunk_length_s):
      if inputs is None:
          logger.warning("No audio file submitted")
          raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

-     headers = {
-         "Accept": "application/json",
-         "Content-Type": "audio/flac"
-     }
-     logger.debug(f"Using headers: {headers}")
-
      try:
-         logger.info(f"Reading audio file: {inputs}")
-         with open(inputs, "rb") as f:
-             data = f.read()
-         logger.debug(f"Audio file size: {len(data)} bytes")
-
-         # Add parameters to request
-         params = {
-             "return_timestamps": return_timestamps
-         }
-         logger.debug(f"Request parameters: {params}")
-
-         logger.info("Sending request to API")
-         response = requests.post(API_URL, headers=headers, data=data, params=params)
-         logger.debug(f"API Response status: {response.status_code}")
-
-         result = response.json()
-         logger.debug(f"API Response: {result}")
+         logger.info(f"Processing audio file: {inputs}")
+         # Create new pipeline with specified chunk length
+         current_pipe = create_pipeline(chunk_length_s)
+         result = current_pipe(inputs, batch_size=batch_size, return_timestamps=return_timestamps)
+         logger.debug(f"Pipeline result: {result}")

-         if "error" in result:
-             logger.error(f"API returned error: {result['error']}")
-             raise gr.Error(f"API Error: {result['error']}")
-
-         if "text" not in result:
-             logger.error("No transcription text in response")
-             raise gr.Error("No transcription text in response")
-
          # Format response as JSON
          formatted_result = {
              "text": result["text"]
@@ -98,13 +91,14 @@ def transcribe(inputs, return_timestamps, generate_subs):
                      "text": text,
                      "timestamp": [start_time, end_time]
                  }
-                 formatted_result["chunks"] = chunks
                  chunks.append(chunk_data)
              else:
                  logger.warning(f"Invalid timestamp in chunk {i}: {chunk}")
          except Exception as chunk_error:
              logger.error(f"Error processing chunk {i}: {str(chunk_error)}")
              continue
+
+         formatted_result["chunks"] = chunks
          logger.info(f"Successfully processed transcription with {len(chunks)} chunks")

          # Generate subtitles if requested
@@ -121,12 +115,18 @@ def transcribe(inputs, return_timestamps, generate_subs):

  demo = gr.Blocks(theme=gr.themes.Ocean())

+ # Create flagging callback with custom options
+ flagging_callback = gr.CSVLogger()
+
+ # Define interfaces first
  mf_transcribe = gr.Interface(
      fn=transcribe,
      inputs=[
          gr.Audio(sources="microphone", type="filepath"),
          gr.Checkbox(label="Include timestamps", value=True),
          gr.Checkbox(label="Generate subtitles", value=True),
+         gr.Slider(minimum=1, maximum=32, value=8, step=1, label="Batch Size"),
+         gr.Slider(minimum=5, maximum=30, value=15, step=5, label="Chunk Length (seconds)"),
      ],
      outputs=[
          gr.JSON(label="Transcription", open=True),
@@ -134,16 +134,11 @@ mf_transcribe = gr.Interface(
      ],
      title="Whisper Large V3 Turbo: Transcribe Audio",
      description=(
-         "Transcribe long-form microphone or audio inputs with the click of a button! "
-         "Generate subtitles for your videos in SRT format."
+         "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
+         f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
+         " of arbitrary length."
      ),
-     flagging_mode="manual",
-     flagging_options=[
-         "Incorrect text",
-         "Incorrect timestamp",
-         "Other issue"
-     ],
-     flagging_dir="flagged_data"
+     flagging_mode="manual"
  )

  file_transcribe = gr.Interface(
@@ -152,6 +147,8 @@ file_transcribe = gr.Interface(
          gr.Audio(sources="upload", type="filepath", label="Audio file"),
          gr.Checkbox(label="Include timestamps", value=True),
          gr.Checkbox(label="Generate subtitles", value=True),
+         gr.Slider(minimum=1, maximum=32, value=8, step=1, label="Batch Size"),
+         gr.Slider(minimum=10, maximum=60, value=30, step=5, label="Chunk Length (seconds)"),
      ],
      outputs=[
          gr.JSON(label="Transcription", open=True),
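
`gr.Interface` passes its `inputs` components to `fn` positionally, so the two sliders added in each tab line up with the new `batch_size` and `chunk_length_s` parameters of `transcribe`. With the file tab's defaults above, the call is equivalent to (illustrative path, not from the commit):

```python
# Equivalent direct call with the file tab's default widget values;
# "meeting.wav" is an illustrative path.
transcribe(
    "meeting.wav",  # gr.Audio upload (filepath)
    True,           # "Include timestamps"
    True,           # "Generate subtitles"
    8,              # "Batch Size" default
    30,             # "Chunk Length (seconds)" default
)
```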
@@ -159,20 +156,95 @@ file_transcribe = gr.Interface(
      ],
      title="Whisper Large V3: Transcribe Audio",
      description=(
-         "Transcribe long-form microphone or audio inputs with the click of a button! "
-         "Generate subtitles for your videos in SRT format."
+         "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
+         f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
+         " of arbitrary length."
      ),
-     flagging_mode="manual",
-     flagging_options=[
-         "Incorrect text",
-         "Incorrect timestamp",
-         "Other issue"
-     ],
-     flagging_dir="flagged_data"
+     flagging_mode="manual"
  )

+ # Then set up the demo with the interfaces
  with demo:
-     gr.TabbedInterface([mf_transcribe, file_transcribe], ["Microphone", "Audio file"])
+     with gr.TabbedInterface([mf_transcribe, file_transcribe], ["Microphone", "Audio file"]) as tabs:
+         with gr.Row():
+             with gr.Column():
+                 # Quick feedback
+                 feedback_rating = gr.Radio(
+                     choices=["👍 Good", "👎 Bad"],
+                     label="Was this transcription accurate?",
+                     value="👍 Good"
+                 )
+
+                 # Detailed feedback
+                 with gr.Accordion("Detailed Feedback", open=False):
+                     flag_type = gr.Radio(
+                         choices=[
+                             "Text Issue",
+                             "Timestamp Issue",
+                             "Missing Content",
+                             "Other Issue"
+                         ],
+                         label="What type of issue did you find?",
+                         value="Text Issue"
+                     )
+
+                     # Correction submission
+                     with gr.Row():
+                         with gr.Column():
+                             gr.Markdown("### Original")
+                             original_text = gr.Textbox(
+                                 label="Original text",
+                                 interactive=False,
+                                 lines=2
+                             )
+                         with gr.Column():
+                             gr.Markdown("### Correction")
+                             corrected_text = gr.Textbox(
+                                 label="Corrected text",
+                                 placeholder="Enter the correct text here",
+                                 lines=2
+                             )
+
+                     # Timestamp correction
+                     with gr.Row():
+                         with gr.Column():
+                             gr.Markdown("### Original Timestamp")
+                             original_timestamp = gr.Textbox(
+                                 label="Original timestamp",
+                                 interactive=False,
+                                 lines=1
+                             )
+                         with gr.Column():
+                             gr.Markdown("### Corrected Timestamp")
+                             corrected_timestamp = gr.Textbox(
+                                 label="Corrected timestamp (HH:MM:SS,mmm)",
+                                 placeholder="00:00:00,000",
+                                 lines=1
+                             )
+
+                     flag_details = gr.Textbox(
+                         label="Additional notes",
+                         placeholder="Any other details about the issue...",
+                         lines=3
+                     )
+
+                     flag_button = gr.Button("Submit Feedback")
+
+         # Setup flagging callback with all feedback components
+         flagging_callback.setup(
+             [tabs, feedback_rating, flag_type, original_text, corrected_text,
+              original_timestamp, corrected_timestamp, flag_details],
+             "flagged_data"
+         )
+
+         # Handle flag submission
+         flag_button.click(
+             lambda *args: flagging_callback.flag(list(args)),
+             [tabs, feedback_rating, flag_type, original_text, corrected_text,
+              original_timestamp, corrected_timestamp, flag_details],
+             None,
+             preprocess=False
+         )

  logger.info("Starting Gradio interface")
  demo.queue().launch(ssr_mode=False)
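
On the feedback wiring above: `gr.CSVLogger` is Gradio's CSV flagging callback; `setup(components, flagging_dir)` registers the components and `flag(...)` appends one row per submission to a CSV log inside the directory (replacing the old `flagging_options`/`flagging_dir` arguments that this commit removes). A stripped-down sketch of the same pattern, with two hypothetical components standing in for the app's eight:

```python
import gradio as gr

# Minimal sketch of the CSVLogger pattern used above; `rating` and
# `notes` are hypothetical stand-ins for the app's components.
callback = gr.CSVLogger()

with gr.Blocks() as demo:
    rating = gr.Radio(["👍 Good", "👎 Bad"], label="Rating")
    notes = gr.Textbox(label="Notes")
    submit = gr.Button("Submit Feedback")

    callback.setup([rating, notes], "flagged_data")  # rows land in flagged_data/log.csv
    submit.click(
        lambda *args: callback.flag(list(args)),
        [rating, notes],
        None,
        preprocess=False,
    )
```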
 