# whisper-tg / app.py
# (Hugging Face Space source — author: muhtasham, commit 0147bd5 "WIP", 5.04 kB;
#  the raw/history/blame links from the page chrome were stripped from this header.)
import gradio as gr
import requests
import subprocess
from loguru import logger

# Configure loguru: append to app.log, rotate once the file reaches 500 MB,
# and capture everything from DEBUG level up.
logger.add("app.log", rotation="500 MB", level="DEBUG")

# Dedicated Hugging Face Inference Endpoint that serves the Whisper model
# (see the Interface titles below for the advertised model variant).
API_URL = "https://skdpcqcdd929o4k3.us-east-1.aws.endpoints.huggingface.cloud"
def check_ffmpeg():
    """Verify that an executable ffmpeg binary is available.

    Raises:
        gr.Error: if ffmpeg is missing or fails to run, so the app refuses
            to start without its audio-processing dependency.
    """
    try:
        # Running `ffmpeg -version` is a cheap probe that both finds the
        # binary and confirms it actually executes.
        subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError) as probe_error:
        logger.error(f"ffmpeg check failed: {str(probe_error)}")
        raise gr.Error("ffmpeg is not installed. Please install ffmpeg to use this application.")
    else:
        logger.info("ffmpeg check passed successfully")


# Fail fast at import time rather than on the first transcription request.
check_ffmpeg()
def transcribe(inputs):
    """Transcribe an audio file via the remote Whisper inference endpoint.

    Args:
        inputs: Filesystem path to the recorded/uploaded audio file, as
            supplied by ``gr.Audio(type="filepath")``. ``None`` when the user
            submitted without providing audio.

    Returns:
        dict with keys:
            "text":   full transcription string from the API,
            "chunks": list of ``{"text": str, "timestamp": [start, end]}``.

    Raises:
        gr.Error: on missing input, API/HTTP failure, or malformed response.
    """
    if inputs is None:
        logger.warning("No audio file submitted")
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    # NOTE(review): the payload is always sent as audio/flac even though
    # Gradio may hand us other container formats; the endpoint appears to
    # accept this — confirm against the endpoint's handler.
    headers = {
        "Accept": "application/json",
        "Content-Type": "audio/flac"
    }
    logger.debug(f"Using headers: {headers}")

    try:
        logger.info(f"Reading audio file: {inputs}")
        with open(inputs, "rb") as f:
            data = f.read()
        logger.debug(f"Audio file size: {len(data)} bytes")

        # Ask the endpoint to return segment timestamps alongside the text.
        params = {
            "return_timestamps": True
        }
        logger.debug(f"Request parameters: {params}")

        logger.info("Sending request to API")
        # Fix: a timeout keeps the worker from hanging forever on a stuck
        # or scaled-to-zero endpoint (long-form audio can legitimately take
        # minutes, hence the generous bound).
        response = requests.post(API_URL, headers=headers, data=data, params=params, timeout=600)
        logger.debug(f"API Response status: {response.status_code}")

        try:
            result = response.json()
        except ValueError:
            # Fix: a non-JSON body (e.g. a gateway error page) used to surface
            # as a raw JSONDecodeError; report the HTTP status instead.
            logger.error(f"Non-JSON response (status {response.status_code}): {response.text[:500]}")
            raise gr.Error(f"API request failed with status {response.status_code}")
        logger.debug(f"API Response: {result}")

        if "error" in result:
            logger.error(f"API returned error: {result['error']}")
            raise gr.Error(f"API Error: {result['error']}")
        if "text" not in result:
            logger.error("No transcription text in response")
            raise gr.Error("No transcription text in response")

        # Format response as JSON for the gr.JSON output component.
        formatted_result = {
            "text": result["text"],
            "chunks": _format_chunks(result)
        }
        logger.info(f"Successfully processed transcription with {len(formatted_result['chunks'])} chunks")
        return formatted_result
    except gr.Error:
        # Fix: user-facing errors raised above were previously caught by the
        # generic handler below and double-wrapped; re-raise them untouched.
        raise
    except Exception as e:
        logger.exception(f"Error during transcription: {str(e)}")
        raise gr.Error(f"Failed to transcribe audio: {str(e)}")


def _format_chunks(result):
    """Normalize the API's chunk list into ``[{"text", "timestamp"}]`` entries.

    Chunks with missing timestamps are skipped (with a warning); when the
    response carries no "chunks" key at all, a single chunk covering the
    whole text from 0.0 is synthesized.
    """
    chunks = []
    if "chunks" in result:
        logger.info(f"Processing {len(result['chunks'])} chunks")
        for i, chunk in enumerate(result["chunks"]):
            logger.debug(f"Processing chunk {i}: {chunk}")
            try:
                start_time = chunk.get("timestamp", [None, None])[0]
                end_time = chunk.get("timestamp", [None, None])[1]
                text = chunk.get("text", "").strip()
                if start_time is not None and end_time is not None:
                    chunks.append({
                        "text": text,
                        "timestamp": [start_time, end_time]
                    })
                else:
                    logger.warning(f"Invalid timestamp in chunk {i}: {chunk}")
            except Exception as chunk_error:
                # Best-effort: a malformed chunk is logged and dropped rather
                # than failing the whole transcription.
                logger.error(f"Error processing chunk {i}: {str(chunk_error)}")
                continue
    else:
        logger.info("No chunks found, using single chunk")
        chunks.append({
            "text": result["text"],
            "timestamp": [0.0, None]
        })
    return chunks
# Ocean-themed Blocks container that will host the two interface tabs.
demo = gr.Blocks(theme=gr.themes.Ocean())


def _build_interface(audio_input):
    """Build a transcription Interface around the given audio input component.

    Both tabs share the same transcribe fn, JSON output, title, description
    and manual-flagging configuration; only the audio source component
    differs, so the common setup lives here once.
    """
    return gr.Interface(
        fn=transcribe,
        inputs=[audio_input],
        outputs=[
            gr.JSON(label="Transcription", open=True),
        ],
        title="Whisper Large V3 Turbo: Transcribe Audio",
        description=(
            "Transcribe long-form microphone or audio inputs with the click of a button! "
        ),
        flagging_mode="manual",
        flagging_options=[
            "Incorrect text",
            "Incorrect timestamp",
            "Other issue"
        ],
        flagging_dir="flagged_data"
    )


# Microphone-recording tab.
mf_transcribe = _build_interface(gr.Audio(sources="microphone", type="filepath"))

# File-upload tab. Fix: its title previously read "Whisper Large V3" while the
# microphone tab said "Whisper Large V3 Turbo"; both now use the Turbo title.
file_transcribe = _build_interface(gr.Audio(sources="upload", type="filepath", label="Audio file"))
# Mount both interfaces as tabs inside the Blocks container.
with demo:
    gr.TabbedInterface([mf_transcribe, file_transcribe], ["Microphone", "Audio file"])

logger.info("Starting Gradio interface")
# queue() enables request queuing for long-running transcriptions;
# ssr_mode=False disables server-side rendering of the UI.
demo.queue().launch(ssr_mode=False)