import streamlit as st
from transformers import pipeline
from pydub import AudioSegment
import pysrt
import os

st.title("Speech-to-Text with Transformers")

with st.expander("README"):
    st.write(
        "This tool transcribes audio files using Hugging Face Transformers. Upload an "
        "audio file, choose a model size, and optionally translate the output to English. "
        "An SRT subtitle file is generated for download, suitable for use as a subtitle "
        "track (e.g., via Import Subtitles in DaVinci Resolve)."
    )

# Upload audio file
uploaded_file = st.file_uploader("Upload Audio File", type=["mp3", "wav"])

# Model selection
# Note: on Hugging Face Spaces, larger models may require more resources (e.g., a GPU).
# "tiny", "base", "small", and "medium" are common Whisper sizes.
model_size = st.selectbox(
    "Model Size (select a smaller model for faster inference or limited resources)",
    ("openai/whisper-tiny", "openai/whisper-base", "openai/whisper-small", "openai/whisper-medium"),
)
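# For reference, approximate parameter counts for these Whisper checkpoints:
# tiny ~39M, base ~74M, small ~244M, medium ~769M.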

# Should the output be translated to English?
translate = st.checkbox("Would you like a translation to English?")

# Information about resource usage on Hugging Face Spaces
st.info(
    "When running on Hugging Face Spaces, model inference is limited by the Space's "
    "compute resources. Larger models consume more memory and take longer to run."
)

@st.cache_resource
def load_whisper_pipeline(model_name):
    """
    Loads the Hugging Face Whisper ASR pipeline.

    Decorated with st.cache_resource so the model is loaded once and reused across
    reruns rather than reloaded on every widget interaction. The pipeline is told
    to return timestamps so long-form audio can be split into subtitle segments.
    """
    st.info(f"Loading {model_name} model... This may take a moment.")
    # return_timestamps=True is needed for audio longer than 30 seconds
    return pipeline("automatic-speech-recognition", model=model_name, return_timestamps=True)
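
# A possible extension (not used here): on a GPU-backed Space, pipeline(...) also
# accepts device=0 to run inference on the GPU and chunk_length_s=30 to enable
# chunked long-form inference; both are standard transformers pipeline arguments.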

def transcribe_with_transformers(audio_file_path, model_name, translate_to_english):
    """
    Transcribes audio with the Hugging Face Transformers pipeline and generates an SRT file.
    """
    try:
        asr_pipeline = load_whisper_pipeline(model_name)
        st.info("Transcribing audio... Please wait.")

        # Configure generation arguments for translation if requested
        generate_kwargs = {}
        if translate_to_english:
            generate_kwargs["task"] = "translate"

        # Pass the audio file path and any generation arguments to the pipeline
        prediction = asr_pipeline(audio_file_path, generate_kwargs=generate_kwargs)
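        # With return_timestamps=True, the pipeline output has the shape:
        #   {"text": "<full transcript>",
        #    "chunks": [{"timestamp": (start_s, end_s), "text": "<segment>"}, ...]}
        # where either timestamp component can be None if the model could not resolve it.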
        transcribed_text = prediction["text"]
        st.subheader("Full Transcription Output:")
        st.write(transcribed_text)

        srt_content = pysrt.SubRipFile()

        def seconds_to_srt_time(total_seconds):
            """Convert seconds to pysrt.SubRipTime, e.g. 93.5 -> 00:01:33,500."""
            hours = int(total_seconds / 3600)
            minutes = int((total_seconds % 3600) / 60)
            seconds = int(total_seconds % 60)
            milliseconds = int((total_seconds - int(total_seconds)) * 1000)
            return pysrt.SubRipTime(hours, minutes, seconds, milliseconds)

        # The 'chunks' key is present when return_timestamps=True was set
        if "chunks" in prediction:
            for i, chunk in enumerate(prediction["chunks"]):
                start_time_seconds = chunk["timestamp"][0] if chunk["timestamp"][0] is not None else 0.0
                # Fall back to a one-second segment when the end timestamp is missing
                end_time_seconds = chunk["timestamp"][1] if chunk["timestamp"][1] is not None else start_time_seconds + 1.0
                item = pysrt.SubRipItem(
                    index=i + 1,
                    start=seconds_to_srt_time(start_time_seconds),
                    end=seconds_to_srt_time(end_time_seconds),
                    text=chunk["text"].strip(),  # Whisper segments often carry a leading space
                )
                srt_content.append(item)
        else:
            st.warning("Could not retrieve segmented timestamps. Generating a single subtitle entry.")
            # Fallback: create one subtitle entry spanning the whole file.
            # Less useful than segmented output, but it guarantees some output
            # even when timestamps are missing.
            audio_duration_seconds = 0
            try:
                # pydub needs ffmpeg (or libav) on the system to decode the file
                audio = AudioSegment.from_file(audio_file_path)
                audio_duration_seconds = audio.duration_seconds
            except Exception:
                # Very rough estimate if pydub fails: ~0.1 s per transcribed character
                audio_duration_seconds = len(transcribed_text) * 0.1
            item = pysrt.SubRipItem(
                index=1,
                start=pysrt.SubRipTime(0, 0, 0, 0),
                end=seconds_to_srt_time(audio_duration_seconds),
                text=transcribed_text,
            )
            srt_content.append(item)
        srt_file_path = "audio.srt"
        srt_content.save(srt_file_path, encoding="utf-8")

        st.success("Transcription complete. Download the subtitle file below.")
        with open(srt_file_path, "rb") as f:
            st.download_button("Download Subtitle in SRT Format", f, file_name="audio.srt")
        os.remove(srt_file_path)
    except Exception as e:
        st.error(f"Error during transcription: {str(e)}")
        st.info(
            "Common issues: unsupported file format, model loading failure "
            "(check the Hugging Face Space logs), or audio too large for the available memory."
        )

if uploaded_file is not None:
    # Save the uploaded file to a temporary location for the transformers pipeline.
    # The pipeline also accepts file-like objects or raw bytes, but a temp file is robust.
    # Keep the original extension so downstream decoders can identify the format.
    # Note: uploaded_file.type yields a MIME subtype ("mpeg" for mp3), which is not a
    # reliable file extension, so derive it from the uploaded file's name instead.
    extension = os.path.splitext(uploaded_file.name)[1]
    temp_file_name = "temp_audio_file" + extension
    with open(temp_file_name, "wb") as f:
        f.write(uploaded_file.getbuffer())

    transcribe_with_transformers(temp_file_name, model_size, translate)

    # Clean up the temporary file
    if os.path.exists(temp_file_name):
        os.remove(temp_file_name)
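
# Expected Space dependencies (an assumption; pin versions as needed in your
# requirements.txt): streamlit, transformers, torch, pydub, pysrt. ffmpeg must
# also be installed on the system (packages.txt on Hugging Face Spaces) since
# both the ASR pipeline and pydub use it to decode audio.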