app.py  +24 -59
requirements.txt  +0 -2
app.py
CHANGED
@@ -1,12 +1,16 @@
-import torch
 import gradio as gr
 import subprocess
 import datetime
 import tempfile
-
+import requests
 from loguru import logger
+from os import getenv
 
-
+API_URL = getenv("API_URL")
+headers = {
+    "Accept": "application/json",
+    "Content-Type": "audio/flac"
+}
 
 def format_time(seconds):
     """Convert seconds to SRT time format (HH:MM:SS,mmm).
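The new module-level setup replaces local inference with a remote call: API_URL is read from the Space's environment (typically a Space secret), and each request ships raw FLAC bytes. As a rough, self-contained sketch of querying such an endpoint outside the app (the endpoint URL and the Authorization header are assumptions, not part of this commit):

import requests
from os import getenv

# Assumed example endpoint; the Space reads the real value from the API_URL secret.
API_URL = getenv("API_URL", "https://api-inference.huggingface.co/models/openai/whisper-large-v3-turbo")
headers = {
    "Accept": "application/json",
    "Content-Type": "audio/flac",
    # Hosted inference endpoints usually require a token (assumption; not in this diff):
    "Authorization": f"Bearer {getenv('HF_TOKEN', '')}",
}

with open("sample.flac", "rb") as f:
    response = requests.post(API_URL, headers=headers, data=f.read())
response.raise_for_status()
print(response.json().get("text", ""))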
@@ -66,7 +70,7 @@ def generate_srt(chunks):
     for i, chunk in enumerate(chunks, 1):
         start_time = format_time(chunk["timestamp"][0])
         end_time = format_time(chunk["timestamp"][1])
-        text = chunk
+        text = chunk.get("text", "").strip()
         srt_content.append(f"{i}\n{start_time} --> {end_time}\n{text}\n\n")
     return "".join(srt_content)
 
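For a concrete picture of what generate_srt emits, a minimal sketch follows; format_time is reconstructed from its docstring (HH:MM:SS,mmm) and the sample chunk mirrors the dicts this loop consumes, so both are illustrative assumptions rather than code from this commit:

def format_time(seconds):
    """Sketch of the SRT time formatter described by its docstring."""
    total_ms = int(seconds * 1000)
    hours, rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    secs, ms = divmod(rem, 1_000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{ms:03d}"

chunks = [{"timestamp": (0.0, 2.5), "text": " Hello there."}]
# The loop above turns this into one SRT block:
# 1
# 00:00:00,000 --> 00:00:02,500
# Hello there.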
@@ -106,35 +110,13 @@ def check_ffmpeg():
 # Initialize ffmpeg check
 check_ffmpeg()
 
-
-
-logger.info(f"Using device: {device}")
-
-def create_pipeline():
-    """Create a new pipeline with optimized settings for T4 GPU.
-
-    Returns:
-        transformers.Pipeline: Configured speech recognition pipeline.
-    """
-    return pipeline(
-        task="automatic-speech-recognition",
-        model=MODEL_NAME,
-        device=device,
-    )
-
-# Initialize pipeline once
-pipe = create_pipeline()
-logger.info(f"Pipeline initialized: {pipe}")
-
-def transcribe(inputs, return_timestamps, generate_subs, batch_size, chunk_length_s):
-    """Transcribe audio input using Whisper model.
+def transcribe(inputs, return_timestamps, generate_subs):
+    """Transcribe audio input using Whisper model via Hugging Face Inference API.
 
     Args:
         inputs (str): Path to audio file to transcribe.
         return_timestamps (bool): Whether to include timestamps in output.
         generate_subs (bool): Whether to generate SRT subtitles.
-        batch_size (int): Number of chunks to process in parallel.
-        chunk_length_s (int): Length of audio chunks in seconds.
 
     Returns:
         tuple: (formatted_result, srt_file, correction_text)
@@ -152,27 +134,20 @@ def transcribe(inputs, return_timestamps, generate_subs, batch_size, chunk_lengt
     try:
         logger.info(f"Processing audio file: {inputs}")
 
-        #
-
+        # Read the audio file
+        with open(inputs, "rb") as f:
+            data = f.read()
 
-        #
-
-
-
+        # Send request to API
+        response = requests.post(API_URL, headers=headers, data=data)
+        response.raise_for_status()  # Raise an exception for bad status codes
+        result = response.json()
 
-
-        result = pipe(
-            inputs,
-            batch_size=batch_size,
-            chunk_length_s=chunk_length_s,
-            stride_length_s=stride_length_s,
-            return_timestamps="word" if return_timestamps else False
-        )
-        logger.debug(f"Pipeline result: {result}")
+        logger.debug(f"API response: {result}")
 
         # Format response as JSON
         formatted_result = {
-            "text": result
+            "text": result.get("text", "")
         }
 
         chunks = []
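The rest of the function (unchanged by this commit) consumes result downstream; a hypothetical response shape for illustration, where the "chunks" field is an assumption inferred from the timestamp handling that follows:

# Hypothetical API response (assumed shape, not from this commit):
result = {
    "text": " Hello there. General Kenobi.",
    "chunks": [
        {"timestamp": (0.0, 1.2), "text": " Hello there."},
        {"timestamp": (1.2, 2.8), "text": " General Kenobi."},
    ],
}
formatted_result = {"text": result.get("text", "")}  # as in the new code above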
@@ -208,17 +183,11 @@ def transcribe(inputs, return_timestamps, generate_subs, batch_size, chunk_lengt
             srt_file = save_srt_to_file(srt_content)
             logger.info("SRT subtitles generated successfully")
 
-        # Clear CUDA cache after processing
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-            logger.debug("Cleared CUDA cache after processing")
-
         return formatted_result, srt_file, ""  # Return empty string for correction textbox
+    except requests.exceptions.RequestException as e:
+        logger.exception(f"API request failed: {str(e)}")
+        raise gr.Error(f"Failed to transcribe audio: API request failed - {str(e)}")
     except Exception as e:
-        # Ensure CUDA cache is cleared even if there's an error
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-            logger.debug("Cleared CUDA cache after error")
         logger.exception(f"Error during transcription: {str(e)}")
         raise gr.Error(f"Failed to transcribe audio: {str(e)}")
 
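One detail worth noting about the two except clauses: requests raises HTTPError from raise_for_status() and ConnectionError or Timeout from the call itself, and all of these subclass RequestException, so network and HTTP failures take the first branch while anything else (for example file I/O errors) falls through to the generic handler:

import requests

# Both facts below hold for the requests library:
assert issubclass(requests.exceptions.HTTPError, requests.exceptions.RequestException)
assert issubclass(requests.exceptions.ConnectionError, requests.exceptions.RequestException)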
@@ -232,8 +201,6 @@ mf_transcribe = gr.Interface(
         gr.Audio(sources="microphone", type="filepath"),
         gr.Checkbox(label="Include timestamps", value=True),
         gr.Checkbox(label="Generate subtitles", value=True),
-        gr.Slider(minimum=1, maximum=64, value=8, step=1, label="Batch Size"),
-        gr.Slider(minimum=5, maximum=30, value=30, step=5, label="Chunk Length (seconds)"),
     ],
     outputs=[
         gr.JSON(label="Transcription", open=True),
@@ -242,7 +209,7 @@ mf_transcribe = gr.Interface(
     title="Whisper Large V3 Turbo: Transcribe Audio",
     description=(
         "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
-        f" checkpoint [{
+        f" checkpoint [{API_URL}](https://huggingface.co/{API_URL}) and 🤗 Transformers to transcribe audio files"
         " of arbitrary length."
     )
 )
@@ -253,8 +220,6 @@ file_transcribe = gr.Interface(
         gr.Audio(sources="upload", type="filepath", label="Audio file"),
         gr.Checkbox(label="Include timestamps", value=True),
         gr.Checkbox(label="Generate subtitles", value=True),
-        gr.Slider(minimum=1, maximum=32, value=8, step=1, label="Batch Size"),
-        gr.Slider(minimum=5, maximum=30, value=15, step=5, label="Chunk Length (seconds)"),
     ],
     outputs=[
         gr.JSON(label="Transcription", open=True),
@@ -263,7 +228,7 @@ file_transcribe = gr.Interface(
     title="Whisper Large V3: Transcribe Audio",
     description=(
         "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
-        f" checkpoint [{
+        f" checkpoint [{API_URL}](https://huggingface.co/{API_URL}) and 🤗 Transformers to transcribe audio files"
         " of arbitrary length."
     )
 )
requirements.txt
CHANGED
@@ -1,4 +1,2 @@
-transformers
 loguru
-torch
 gradio