# speech-to-text / app.py
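"""Streamlit app that transcribes uploaded audio with a Hugging Face Whisper
pipeline and exports the result as an SRT subtitle file."""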
import streamlit as st
from transformers import pipeline
from pydub import AudioSegment
import pysrt
import os
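# Run locally with: streamlit run app.py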
st.title("Speech-to-Text with Transformers")
with st.expander("README"):
    st.write(
        "This tool transcribes audio files using Hugging Face Transformers. "
        "Upload an audio file, choose a model size, and optionally translate "
        "to English. An SRT file will be generated for download, suitable for "
        "use as a subtitle file (e.g., via DaVinci Resolve's Import Subtitles)."
    )
# Upload audio file
uploaded_file = st.file_uploader("Upload Audio File", type=["mp3", "wav"])
# Model selection
# Note: For Hugging Face Spaces, larger models might require more resources (GPU).
# "tiny", "base", "small", "medium" are common Whisper sizes.
model_size = st.selectbox(
    "Model Size (select a smaller model for faster inference or limited resources)",
    ("openai/whisper-tiny", "openai/whisper-base", "openai/whisper-small", "openai/whisper-medium"),
)
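# Larger checkpoints such as "openai/whisper-large-v3" also exist, but they
# are likely too heavy for a free CPU Space, so they are not offered here.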
# Should we translate to English?
translate = st.checkbox("Would you like a translation to English?")
# Information about resource usage on Hugging Face Spaces
st.info("When running on Hugging Face Spaces, model inference is limited by the space's compute resources. Larger models will consume more resources and time.")

@st.cache_resource
def load_whisper_pipeline(model_name):
    """
    Load the Hugging Face Whisper ASR pipeline.

    st.cache_resource avoids reloading the model on every Streamlit rerun.
    return_timestamps=True makes the pipeline return per-segment timestamps
    for long-form audio (inputs longer than 30 seconds).
    """
    st.info(f"Loading {model_name} model... This may take a moment.")
    # Set return_timestamps=True to handle audio longer than 30 seconds
    return pipeline("automatic-speech-recognition", model=model_name, return_timestamps=True)
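
# Note: pipeline() runs on CPU by default. On a GPU-backed Space you could
# pass device=0 to the pipeline() call above, and chunk_length_s=30 to enable
# chunked long-form inference; both are standard pipeline arguments.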

def transcribe_with_transformers(audio_file_path, model_name, translate_to_english):
    """
    Transcribe audio using the Hugging Face Transformers pipeline and generate an SRT file.
    """
    try:
        asr_pipeline = load_whisper_pipeline(model_name)
        st.info("Transcribing audio... Please wait.")
        # Configure generation arguments for translation if requested
        generate_kwargs = {}
        if translate_to_english:
            generate_kwargs["task"] = "translate"

        # Pass the audio file path and any generation arguments to the pipeline
        prediction = asr_pipeline(audio_file_path, generate_kwargs=generate_kwargs)
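        # With return_timestamps=True the pipeline returns a dict shaped like
        #   {"text": "full transcript",
        #    "chunks": [{"timestamp": (0.0, 5.24), "text": "segment"}, ...]},
        # where timestamps are floats in seconds and either bound may be None.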
        transcribed_text = prediction["text"]

        st.subheader("Full Transcription Output:")
        st.write(transcribed_text)
        srt_content = pysrt.SubRipFile()

        # Define the time converter once, outside the loop (the original code
        # redefined it on every iteration).
        def seconds_to_srt_time(total_seconds):
            """Convert float seconds to a pysrt.SubRipTime.

            Example: seconds_to_srt_time(3661.5) -> 01:01:01,500 in the SRT.
            """
            hours = int(total_seconds // 3600)
            minutes = int((total_seconds % 3600) // 60)
            seconds = int(total_seconds % 60)
            milliseconds = int(round((total_seconds - int(total_seconds)) * 1000))
            return pysrt.SubRipTime(hours, minutes, seconds, milliseconds)

        # The "chunks" key is present when return_timestamps=True was set
        if "chunks" in prediction:
            for i, chunk in enumerate(prediction["chunks"]):
                start_time_seconds = chunk["timestamp"][0] if chunk["timestamp"][0] is not None else 0.0
                # Default to one second after the start if the end timestamp is missing
                end_time_seconds = (
                    chunk["timestamp"][1]
                    if chunk["timestamp"][1] is not None
                    else start_time_seconds + 1.0
                )
                item = pysrt.SubRipItem(
                    index=i + 1,
                    start=seconds_to_srt_time(start_time_seconds),
                    end=seconds_to_srt_time(end_time_seconds),
                    text=chunk["text"].strip(),
                )
                srt_content.append(item)
        else:
            st.warning("Could not retrieve segmented timestamps. Generating a single subtitle entry.")
            # Fallback: create one subtitle entry spanning the whole file.
            # Less ideal, but it guarantees some output without timestamps.
            try:
                audio = AudioSegment.from_file(audio_file_path)
                audio_duration_seconds = audio.duration_seconds
            except Exception:
                # Very rough estimate when pydub cannot read the file
                audio_duration_seconds = len(transcribed_text) * 0.1
            item = pysrt.SubRipItem(
                index=1,
                start=pysrt.SubRipTime(0, 0, 0, 0),
                # Use the converter so durations over a minute roll over into
                # minutes/hours instead of producing an invalid seconds field
                end=seconds_to_srt_time(audio_duration_seconds),
                text=transcribed_text,
            )
            srt_content.append(item)
        srt_file_path = "audio.srt"
        srt_content.save(srt_file_path, encoding="utf-8")
        st.success("Transcription successful! Download the subtitle file below.")
        with open(srt_file_path, "rb") as f:
            st.download_button("Download Subtitle in SRT Format", f, file_name="audio.srt")
        os.remove(srt_file_path)
    except Exception as e:
        st.error(f"Error during transcription: {str(e)}")
        st.info("Common issues: unsupported file format, model loading failure (check the Space logs), or audio too large for the available memory.")

if uploaded_file is not None:
    # Save the upload to a temporary file; the pipeline also accepts bytes or
    # file-like objects, but a real file path is the most robust input.
    # Use the original filename's extension so ffmpeg/pydub can identify the
    # format: the MIME subtype is unreliable here (mp3 uploads report
    # "audio/mpeg", and ".mpeg" is not a valid audio extension).
    extension = os.path.splitext(uploaded_file.name)[1].lower() or ".wav"
    temp_file_name = "temp_audio_file" + extension
    with open(temp_file_name, "wb") as f:
        f.write(uploaded_file.getbuffer())
    audio_file_path = temp_file_name

    transcribe_with_transformers(audio_file_path, model_size, translate)

    # Clean up the temporary file
    if os.path.exists(audio_file_path):
        os.remove(audio_file_path)
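
# Assumed dependencies (e.g., in the Space's requirements.txt): streamlit,
# transformers, torch, pydub, and pysrt, plus a system ffmpeg install for
# audio decoding.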