WhisperCap / app.py
import os
import math
import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from moviepy.editor import VideoFileClip
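
# WhisperCap: a Gradio app that transcribes a video's audio track with
# Whisper large-v3, streams progress updates to the UI, and can optionally
# produce an SRT subtitle file for download.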


def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
    # Use the GPU with float16 when available; fall back to CPU with float32.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    # The model is (re)loaded on every call; hoisting this to module level
    # would avoid the reload cost on repeated runs.
    model_id = "openai/whisper-large-v3"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=10,  # shorter chunk length to prevent overly long captions
        batch_size=2,
        return_timestamps=True,
        torch_dtype=torch_dtype,
        device=device,
    )
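
    # chunk_length_s above controls the pipeline's internal long-form chunking;
    # the outer loop below also splits the audio into 10-second pieces so that
    # per-chunk progress can be reported (Gradio streams each yield to the UI).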

    if video_file is None:
        yield "Error: No video file provided.", None
        return

    video_path = video_file.name if hasattr(video_file, 'name') else video_file
    try:
        video = VideoFileClip(video_path)
    except Exception as e:
        yield f"Error processing video file: {str(e)}", None
        return

    audio = video.audio
    duration = video.duration
    n_chunks = math.ceil(duration / 10)  # split into 10-second chunks
    transcription_txt = ""
    transcription_srt = []
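
    # For each 10-second slice: write it to a temporary WAV file, run the ASR
    # pipeline on it, then shift the chunk-relative timestamps into video time.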
    for i in range(n_chunks):
        start = i * 10
        end = min((i + 1) * 10, duration)
        audio_chunk = audio.subclip(start, end)
        temp_file_path = f"temp_audio_{i}.wav"
        audio_chunk.write_audiofile(temp_file_path, codec='pcm_s16le')
        # The pipeline accepts a file path directly, so no open() is needed.
        result = pipe(temp_file_path, generate_kwargs={"language": language})
        transcription_txt += result["text"]
        if transcribe_to_srt:
            for chunk in result["chunks"]:
                start_time, end_time = chunk["timestamp"]
                if start_time is not None and end_time is not None:
                    transcription_srt.append({
                        "start": start_time + i * 10,  # offset into video time
                        "end": end_time + i * 10,
                        "text": chunk["text"]
                    })
                else:
                    print(f"Warning: Invalid timestamp for chunk: {chunk}")
        os.remove(temp_file_path)
        yield f"Progress: {int(((i + 1) / n_chunks) * 100)}%", None

    video.close()  # release the file handles held by moviepy

    output = ""
    srt_file_path = None
    if transcribe_to_text:
        output += "Text Transcription:\n" + transcription_txt + "\n\n"
    if transcribe_to_srt:
        output += "SRT Transcription:\n"
        srt_content = ""
        for i, sub in enumerate(transcription_srt, 1):
            srt_entry = f"{i}\n{format_time(sub['start'])} --> {format_time(sub['end'])}\n{sub['text']}\n\n"
            output += srt_entry
            srt_content += srt_entry
        srt_file_path = "transcription.srt"
        with open(srt_file_path, "w", encoding="utf-8") as srt_file:
            srt_file.write(srt_content)
        output += f"\nSRT file saved as: {srt_file_path}"
    yield output, srt_file_path


def format_time(seconds):
    # Render seconds as an SRT timestamp: HH:MM:SS,mmm (comma decimal separator).
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    return f"{int(h):02d}:{int(m):02d}:{s:06.3f}".replace('.', ',')
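
# Example: format_time(3661.5) -> "01:01:01,500"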


iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Video(),
        gr.Checkbox(label="Transcribe to Text"),
        gr.Checkbox(label="Transcribe to SRT"),
        gr.Dropdown(choices=['en', 'he', 'it', 'es', 'fr', 'de', 'zh', 'ar'], label="Language")
    ],
    outputs=[
        gr.Textbox(label="Transcription Output"),
        gr.File(label="Download SRT")
    ],
    title="WhisperCap Video Transcription",
    description="Upload a video file to transcribe its audio using Whisper. You can download the SRT file if generated.",
)

iface.launch(share=True)
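
# Note: share=True also publishes a temporary public Gradio link; omit it to
# serve the app locally only.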