import streamlit as st
from transformers import pipeline
from pydub import AudioSegment
import pysrt
import os

st.title("Speech-to-Text with Transformers")

with st.expander("README"):
    st.write(
        "This tool transcribes audio files using Hugging Face Transformers. Upload an "
        "audio file, choose a model size, and optionally translate the output to English. "
        "An SRT subtitle file is generated for download, suitable for use as a subtitle "
        "track (e.g., via Import Subtitles in DaVinci Resolve)."
    )

# Upload audio file
uploaded_file = st.file_uploader("Upload Audio File", type=["mp3", "wav"])

# Model selection
# Note: on Hugging Face Spaces, larger models may require more resources (e.g., a GPU).
# "tiny", "base", "small", and "medium" are common Whisper sizes.
model_size = st.selectbox(
    "Model Size (select a smaller model for faster inference or limited resources)",
    ("openai/whisper-tiny", "openai/whisper-base", "openai/whisper-small", "openai/whisper-medium"),
)
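# For reference, approximate parameter counts for these Whisper checkpoints:
# tiny ~39M, base ~74M, small ~244M, medium ~769M.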

# Should the output be translated to English?
translate = st.checkbox("Would you like a translation to English?")

# Information about resource usage on Hugging Face Spaces
st.info(
    "When running on Hugging Face Spaces, model inference is limited by the Space's "
    "compute resources. Larger models consume more memory and take longer to run."
)

@st.cache_resource
def load_whisper_pipeline(model_name):
    """
    Loads the Hugging Face Whisper ASR pipeline.

    Decorated with st.cache_resource so the model is loaded once and reused across
    reruns rather than reloaded on every widget interaction. The pipeline is told
    to return timestamps so long-form audio can be split into subtitle segments.
    """
    st.info(f"Loading {model_name} model... This may take a moment.")
    # return_timestamps=True is needed for audio longer than 30 seconds
    return pipeline("automatic-speech-recognition", model=model_name, return_timestamps=True)
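
# A possible extension (not used here): on a GPU-backed Space, pipeline(...) also
# accepts device=0 to run inference on the GPU and chunk_length_s=30 to enable
# chunked long-form inference; both are standard transformers pipeline arguments.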

def transcribe_with_transformers(audio_file_path, model_name, translate_to_english):
    """
    Transcribes audio with the Hugging Face Transformers pipeline and generates an SRT file.
    """
    try:
        asr_pipeline = load_whisper_pipeline(model_name)
        st.info("Transcribing audio... Please wait.")

        # Configure generation arguments for translation if requested
        generate_kwargs = {}
        if translate_to_english:
            generate_kwargs["task"] = "translate"

        # Pass the audio file path and any generation arguments to the pipeline
        prediction = asr_pipeline(audio_file_path, generate_kwargs=generate_kwargs)
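        # With return_timestamps=True, the pipeline output has the shape:
        #   {"text": "<full transcript>",
        #    "chunks": [{"timestamp": (start_s, end_s), "text": "<segment>"}, ...]}
        # where either timestamp component can be None if the model could not resolve it.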
        transcribed_text = prediction["text"]
        st.subheader("Full Transcription Output:")
        st.write(transcribed_text)

        srt_content = pysrt.SubRipFile()

        def seconds_to_srt_time(total_seconds):
            """Convert seconds to pysrt.SubRipTime, e.g. 93.5 -> 00:01:33,500."""
            hours = int(total_seconds / 3600)
            minutes = int((total_seconds % 3600) / 60)
            seconds = int(total_seconds % 60)
            milliseconds = int((total_seconds - int(total_seconds)) * 1000)
            return pysrt.SubRipTime(hours, minutes, seconds, milliseconds)

        # The 'chunks' key is present when return_timestamps=True was set
        if "chunks" in prediction:
            for i, chunk in enumerate(prediction["chunks"]):
                start_time_seconds = chunk["timestamp"][0] if chunk["timestamp"][0] is not None else 0.0
                # Fall back to a one-second segment when the end timestamp is missing
                end_time_seconds = chunk["timestamp"][1] if chunk["timestamp"][1] is not None else start_time_seconds + 1.0
                item = pysrt.SubRipItem(
                    index=i + 1,
                    start=seconds_to_srt_time(start_time_seconds),
                    end=seconds_to_srt_time(end_time_seconds),
                    text=chunk["text"].strip(),  # Whisper segments often carry a leading space
                )
                srt_content.append(item)
        else:
            st.warning("Could not retrieve segmented timestamps. Generating a single subtitle entry.")
            # Fallback: create one subtitle entry spanning the whole file.
            # Less useful than segmented output, but it guarantees some output
            # even when timestamps are missing.
            audio_duration_seconds = 0
            try:
                # pydub needs ffmpeg (or libav) on the system to decode the file
                audio = AudioSegment.from_file(audio_file_path)
                audio_duration_seconds = audio.duration_seconds
            except Exception:
                # Very rough estimate if pydub fails: ~0.1 s per transcribed character
                audio_duration_seconds = len(transcribed_text) * 0.1
            item = pysrt.SubRipItem(
                index=1,
                start=pysrt.SubRipTime(0, 0, 0, 0),
                end=seconds_to_srt_time(audio_duration_seconds),
                text=transcribed_text,
            )
            srt_content.append(item)
        srt_file_path = "audio.srt"
        srt_content.save(srt_file_path, encoding="utf-8")

        st.success("Transcription complete. Download the subtitle file below.")
        with open(srt_file_path, "rb") as f:
            st.download_button("Download Subtitle in SRT Format", f, file_name="audio.srt")
        os.remove(srt_file_path)
    except Exception as e:
        st.error(f"Error during transcription: {str(e)}")
        st.info(
            "Common issues: unsupported file format, model loading failure "
            "(check the Hugging Face Space logs), or audio too large for the available memory."
        )

if uploaded_file is not None:
    # Save the uploaded file to a temporary location for the transformers pipeline.
    # The pipeline also accepts file-like objects or raw bytes, but a temp file is robust.
    # Keep the original extension so downstream decoders can identify the format.
    # Note: uploaded_file.type yields a MIME subtype ("mpeg" for mp3), which is not a
    # reliable file extension, so derive it from the uploaded file's name instead.
    extension = os.path.splitext(uploaded_file.name)[1]
    temp_file_name = "temp_audio_file" + extension
    with open(temp_file_name, "wb") as f:
        f.write(uploaded_file.getbuffer())

    transcribe_with_transformers(temp_file_name, model_size, translate)

    # Clean up the temporary file
    if os.path.exists(temp_file_name):
        os.remove(temp_file_name)
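
# Expected Space dependencies (an assumption; pin versions as needed in your
# requirements.txt): streamlit, transformers, torch, pydub, pysrt. ffmpeg must
# also be installed on the system (packages.txt on Hugging Face Spaces) since
# both the ASR pipeline and pydub use it to decode audio.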