import streamlit as st
from transformers import pipeline
from pydub import AudioSegment
import pysrt
import os
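# Assumed runtime dependencies (not pinned in this file): streamlit, transformers,
# torch, pydub, and pysrt, plus an ffmpeg binary on PATH so pydub can decode mp3/wav.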
st.title("Speech-to-Text with Transformers")
with st.expander("README"):
    st.write(
        "This tool transcribes audio files using Hugging Face Transformers. "
        "Upload an audio file, choose your model size, and optionally translate to English. "
        "An SRT subtitle file is generated and can be downloaded, suitable for use as a "
        "subtitle file (e.g., in DaVinci Resolve via Import Subtitles)."
    )
# Upload audio file
uploaded_file = st.file_uploader("Upload Audio File", type=["mp3", "wav"])
# Model selection
# Note: For Hugging Face Spaces, larger models might require more resources (GPU).
# "tiny", "base", "small", "medium" are common Whisper sizes.
model_size = st.selectbox(
    "Model Size (select a smaller model for faster inference or limited resources)",
    ("openai/whisper-tiny", "openai/whisper-base", "openai/whisper-small", "openai/whisper-medium")
)
# Should we translate to English?
translate = st.checkbox("Would you like a translation to English?")
# Information about resource usage on Hugging Face Spaces
st.info("When running on Hugging Face Spaces, model inference is limited by the space's compute resources. Larger models will consume more resources and time.")
@st.cache_resource
def load_whisper_pipeline(model_name):
"""
Loads the Hugging Face Whisper ASR pipeline.
Uses st.cache_resource to avoid reloading the model on every rerun.
We explicitly tell the pipeline to return timestamps for long-form audio.
"""
st.info(f"Loading {model_name} model... This may take a moment.")
# Set return_timestamps=True to handle audio longer than 30 seconds
return pipeline("automatic-speech-recognition", model=model_name, return_timestamps=True)
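# Note (assumption, not part of the original app): for very long recordings the ASR
# pipeline can also be created with explicit chunked inference, e.g.
#   pipeline("automatic-speech-recognition", model=model_name,
#            chunk_length_s=30, return_timestamps=True)
# which processes the audio in 30-second windows; the exact long-form behaviour
# depends on the installed transformers version.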
def transcribe_with_transformers(audio_file_path, model_name, translate_to_english):
"""
Transcribes audio using the Hugging Face Transformers pipeline and generates an SRT.
"""
try:
asr_pipeline = load_whisper_pipeline(model_name)
st.info("Transcribing audio... Please wait.")
# Configure generation arguments for translation if requested
generate_kwargs = {}
if translate_to_english:
generate_kwargs["task"] = "translate"
# Pass the audio file path and any generation arguments to the pipeline
prediction = asr_pipeline(audio_file_path, generate_kwargs=generate_kwargs)
transcribed_text = prediction["text"]
st.subheader("Full Transcription Output:")
st.write(transcribed_text)
        srt_content = pysrt.SubRipFile()

        # Helper to convert float seconds to pysrt.SubRipTime
        def seconds_to_srt_time(total_seconds):
            hours = int(total_seconds // 3600)
            minutes = int((total_seconds % 3600) // 60)
            seconds = int(total_seconds % 60)
            milliseconds = int((total_seconds - int(total_seconds)) * 1000)
            return pysrt.SubRipTime(hours, minutes, seconds, milliseconds)

        # The 'chunks' key is present when return_timestamps=True was set
        if "chunks" in prediction:
            for i, chunk in enumerate(prediction["chunks"]):
                start_time_seconds = chunk["timestamp"][0] if chunk["timestamp"][0] is not None else 0.0
                # Default to one second after the start if the end timestamp is None
                end_time_seconds = chunk["timestamp"][1] if chunk["timestamp"][1] is not None else start_time_seconds + 1.0

                item = pysrt.SubRipItem(
                    index=i + 1,
                    start=seconds_to_srt_time(start_time_seconds),
                    end=seconds_to_srt_time(end_time_seconds),
                    text=chunk["text"].strip()
                )
                srt_content.append(item)
        else:
            st.warning("Could not retrieve segmented timestamps. Generating a single subtitle entry.")
            # Fallback: create a single subtitle entry spanning the whole audio.
            # Less useful than per-chunk entries, but ensures some output even
            # when timestamps are missing.
            audio_duration_seconds = 0.0
            try:
                audio = AudioSegment.from_file(audio_file_path)
                audio_duration_seconds = audio.duration_seconds
            except Exception:
                # Very rough estimate if pydub cannot read the file
                audio_duration_seconds = len(transcribed_text) * 0.1

            item = pysrt.SubRipItem(
                index=1,
                start=pysrt.SubRipTime(0, 0, 0, 0),
                end=seconds_to_srt_time(audio_duration_seconds),
                text=transcribed_text
            )
            srt_content.append(item)
srt_file_path = "audio.srt"
srt_content.save(srt_file_path, encoding='utf-8')
st.success("Transcription successful! Download subtitle file?")
with open(srt_file_path, "rb") as f:
st.download_button("Download Subtitle in SRT Format", f, file_name="audio.srt")
os.remove(srt_file_path)
except Exception as e:
st.error(f"Error during transcription: {str(e)}")
st.info("Common issues: File format not supported, model loading failed (check Hugging Face Space logs), or audio too large for available memory.")
if uploaded_file is not None:
    # Save the upload to a temporary file for the transformers pipeline.
    # The pipeline can also accept file-like objects or bytes, but a real file is robust,
    # and a proper extension helps pydub/ffmpeg identify the format.
    # Use the original filename's extension; the MIME subtype can differ from it
    # (e.g. mp3 uploads report "audio/mpeg").
    file_extension = os.path.splitext(uploaded_file.name)[1] or "." + uploaded_file.type.split("/")[-1]
    temp_file_name = "temp_audio_file" + file_extension
    with open(temp_file_name, "wb") as f:
        f.write(uploaded_file.getbuffer())
    audio_file_path = temp_file_name

    transcribe_with_transformers(audio_file_path, model_size, translate)

    # Clean up the temporary file
    if os.path.exists(audio_file_path):
        os.remove(audio_file_path)
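# To try this locally (assuming the dependencies noted near the imports are installed
# and this file is saved as app.py), the app would typically be started with:
#   streamlit run app.py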