# whisper-tg / app.py
# (Hugging Face Space source — author: muhtasham, commit 0147bd5 "WIP", 5.04 kB;
#  the raw/history/blame links from the page chrome were stripped from this header.)
import gradio as gr
import requests
import subprocess
from loguru import logger

# Configure loguru: append to app.log, rotate once the file reaches 500 MB,
# and capture everything from DEBUG level up.
logger.add("app.log", rotation="500 MB", level="DEBUG")

# Dedicated Hugging Face Inference Endpoint that serves the Whisper model
# (see the Interface titles below for the advertised model variant).
API_URL = "https://skdpcqcdd929o4k3.us-east-1.aws.endpoints.huggingface.cloud"
def check_ffmpeg():
    """Verify that an executable ffmpeg binary is available.

    Raises:
        gr.Error: if ffmpeg is missing or fails to run, so the app refuses
            to start without its audio-processing dependency.
    """
    try:
        # Running `ffmpeg -version` is a cheap probe that both finds the
        # binary and confirms it actually executes.
        subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError) as probe_error:
        logger.error(f"ffmpeg check failed: {str(probe_error)}")
        raise gr.Error("ffmpeg is not installed. Please install ffmpeg to use this application.")
    else:
        logger.info("ffmpeg check passed successfully")


# Fail fast at import time rather than on the first transcription request.
check_ffmpeg()
def transcribe(inputs):
    """Transcribe an audio file via the remote Whisper inference endpoint.

    Args:
        inputs: Filesystem path to the recorded/uploaded audio file, as
            supplied by ``gr.Audio(type="filepath")``. ``None`` when the user
            submitted without providing audio.

    Returns:
        dict with keys:
            "text":   full transcription string from the API,
            "chunks": list of ``{"text": str, "timestamp": [start, end]}``.

    Raises:
        gr.Error: on missing input, API/HTTP failure, or malformed response.
    """
    if inputs is None:
        logger.warning("No audio file submitted")
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    # NOTE(review): the payload is always sent as audio/flac even though
    # Gradio may hand us other container formats; the endpoint appears to
    # accept this — confirm against the endpoint's handler.
    headers = {
        "Accept": "application/json",
        "Content-Type": "audio/flac"
    }
    logger.debug(f"Using headers: {headers}")

    try:
        logger.info(f"Reading audio file: {inputs}")
        with open(inputs, "rb") as f:
            data = f.read()
        logger.debug(f"Audio file size: {len(data)} bytes")

        # Ask the endpoint to return segment timestamps alongside the text.
        params = {
            "return_timestamps": True
        }
        logger.debug(f"Request parameters: {params}")

        logger.info("Sending request to API")
        # Fix: a timeout keeps the worker from hanging forever on a stuck
        # or scaled-to-zero endpoint (long-form audio can legitimately take
        # minutes, hence the generous bound).
        response = requests.post(API_URL, headers=headers, data=data, params=params, timeout=600)
        logger.debug(f"API Response status: {response.status_code}")

        try:
            result = response.json()
        except ValueError:
            # Fix: a non-JSON body (e.g. a gateway error page) used to surface
            # as a raw JSONDecodeError; report the HTTP status instead.
            logger.error(f"Non-JSON response (status {response.status_code}): {response.text[:500]}")
            raise gr.Error(f"API request failed with status {response.status_code}")
        logger.debug(f"API Response: {result}")

        if "error" in result:
            logger.error(f"API returned error: {result['error']}")
            raise gr.Error(f"API Error: {result['error']}")
        if "text" not in result:
            logger.error("No transcription text in response")
            raise gr.Error("No transcription text in response")

        # Format response as JSON for the gr.JSON output component.
        formatted_result = {
            "text": result["text"],
            "chunks": _format_chunks(result)
        }
        logger.info(f"Successfully processed transcription with {len(formatted_result['chunks'])} chunks")
        return formatted_result
    except gr.Error:
        # Fix: user-facing errors raised above were previously caught by the
        # generic handler below and double-wrapped; re-raise them untouched.
        raise
    except Exception as e:
        logger.exception(f"Error during transcription: {str(e)}")
        raise gr.Error(f"Failed to transcribe audio: {str(e)}")


def _format_chunks(result):
    """Normalize the API's chunk list into ``[{"text", "timestamp"}]`` entries.

    Chunks with missing timestamps are skipped (with a warning); when the
    response carries no "chunks" key at all, a single chunk covering the
    whole text from 0.0 is synthesized.
    """
    chunks = []
    if "chunks" in result:
        logger.info(f"Processing {len(result['chunks'])} chunks")
        for i, chunk in enumerate(result["chunks"]):
            logger.debug(f"Processing chunk {i}: {chunk}")
            try:
                start_time = chunk.get("timestamp", [None, None])[0]
                end_time = chunk.get("timestamp", [None, None])[1]
                text = chunk.get("text", "").strip()
                if start_time is not None and end_time is not None:
                    chunks.append({
                        "text": text,
                        "timestamp": [start_time, end_time]
                    })
                else:
                    logger.warning(f"Invalid timestamp in chunk {i}: {chunk}")
            except Exception as chunk_error:
                # Best-effort: a malformed chunk is logged and dropped rather
                # than failing the whole transcription.
                logger.error(f"Error processing chunk {i}: {str(chunk_error)}")
                continue
    else:
        logger.info("No chunks found, using single chunk")
        chunks.append({
            "text": result["text"],
            "timestamp": [0.0, None]
        })
    return chunks
# Ocean-themed Blocks container that will host the two interface tabs.
demo = gr.Blocks(theme=gr.themes.Ocean())


def _build_interface(audio_input):
    """Build a transcription Interface around the given audio input component.

    Both tabs share the same transcribe fn, JSON output, title, description
    and manual-flagging configuration; only the audio source component
    differs, so the common setup lives here once.
    """
    return gr.Interface(
        fn=transcribe,
        inputs=[audio_input],
        outputs=[
            gr.JSON(label="Transcription", open=True),
        ],
        title="Whisper Large V3 Turbo: Transcribe Audio",
        description=(
            "Transcribe long-form microphone or audio inputs with the click of a button! "
        ),
        flagging_mode="manual",
        flagging_options=[
            "Incorrect text",
            "Incorrect timestamp",
            "Other issue"
        ],
        flagging_dir="flagged_data"
    )


# Microphone-recording tab.
mf_transcribe = _build_interface(gr.Audio(sources="microphone", type="filepath"))

# File-upload tab. Fix: its title previously read "Whisper Large V3" while the
# microphone tab said "Whisper Large V3 Turbo"; both now use the Turbo title.
file_transcribe = _build_interface(gr.Audio(sources="upload", type="filepath", label="Audio file"))
# Mount both interfaces as tabs inside the Blocks container.
with demo:
    gr.TabbedInterface([mf_transcribe, file_transcribe], ["Microphone", "Audio file"])

logger.info("Starting Gradio interface")
# queue() enables request queuing for long-running transcriptions;
# ssr_mode=False disables server-side rendering of the UI.
demo.queue().launch(ssr_mode=False)