import streamlit as st
from transformers import pipeline
from pydub import AudioSegment
import pysrt
import os

st.title("Speech-to-Text with Transformers")

with st.expander("README"):
    st.write(
        "This tool transcribes audio files using Hugging Face Transformers (Whisper). "
        "Upload an audio file, choose a model size, and optionally translate the result to English. "
        "An SRT subtitle file is generated for download and can be imported into editors "
        "such as DaVinci Resolve (Import Subtitle)."
    )

# Upload audio file
uploaded_file = st.file_uploader("Upload Audio File", type=["mp3", "wav"])

# Model selection
# Note: on Hugging Face Spaces, larger models may require more resources (GPU).
# "tiny", "base", "small", "medium" are common Whisper sizes.
model_size = st.selectbox(
    "Model Size (select a smaller model for faster inference or limited resources)",
    ("openai/whisper-tiny", "openai/whisper-base", "openai/whisper-small", "openai/whisper-medium"),
)

# Should we translate to English?
translate = st.checkbox("Would you like a translation to English?")

# Information about resource usage on Hugging Face Spaces
st.info(
    "When running on Hugging Face Spaces, model inference is limited by the Space's compute resources. "
    "Larger models consume more memory and take longer."
)


@st.cache_resource
def load_whisper_pipeline(model_name):
    """
    Loads the Hugging Face Whisper ASR pipeline.

    Uses st.cache_resource to avoid reloading the model on every rerun.
    The pipeline is told to return timestamps so long-form audio can be segmented.
    """
    st.info(f"Loading {model_name} model... This may take a moment.")
    # return_timestamps=True is required to handle audio longer than 30 seconds
    # and provides the per-chunk timestamps used for the SRT output.
    return pipeline("automatic-speech-recognition", model=model_name, return_timestamps=True)


def seconds_to_srt_time(total_seconds):
    """Converts a float number of seconds to a pysrt.SubRipTime."""
    hours = int(total_seconds / 3600)
    minutes = int((total_seconds % 3600) / 60)
    seconds = int(total_seconds % 60)
    milliseconds = int((total_seconds - int(total_seconds)) * 1000)
    return pysrt.SubRipTime(hours, minutes, seconds, milliseconds)


def transcribe_with_transformers(audio_file_path, model_name, translate_to_english):
    """
    Transcribes audio using the Hugging Face Transformers pipeline and generates an SRT file.
    """
    try:
        asr_pipeline = load_whisper_pipeline(model_name)
        st.info("Transcribing audio... Please wait.")

        # Configure generation arguments for translation if requested
        generate_kwargs = {}
        if translate_to_english:
            generate_kwargs["task"] = "translate"

        # Pass the audio file path and any generation arguments to the pipeline
        prediction = asr_pipeline(audio_file_path, generate_kwargs=generate_kwargs)

        transcribed_text = prediction["text"]
        st.subheader("Full Transcription Output:")
        st.write(transcribed_text)

        srt_content = pysrt.SubRipFile()

        # The 'chunks' key is present when return_timestamps=True was set
        if "chunks" in prediction:
            for i, chunk in enumerate(prediction["chunks"]):
                # A chunk timestamp can be None at either end; fall back to sensible defaults.
                start_time_seconds = chunk["timestamp"][0] if chunk["timestamp"][0] is not None else 0.0
                end_time_seconds = (
                    chunk["timestamp"][1]
                    if chunk["timestamp"][1] is not None
                    else start_time_seconds + 1.0  # Default if the end timestamp is missing
                )

                item = pysrt.SubRipItem(
                    index=i + 1,
                    start=seconds_to_srt_time(start_time_seconds),
                    end=seconds_to_srt_time(end_time_seconds),
                    text=chunk["text"],
                )
                srt_content.append(item)
        else:
            st.warning("Could not retrieve segmented timestamps. Generating a single subtitle entry.")
            # Fallback: create a single subtitle entry if chunks are not available.
            # This is less ideal but ensures some output even if timestamps are missing.
            try:
                audio = AudioSegment.from_file(audio_file_path)
                audio_duration_seconds = audio.duration_seconds
            except Exception:
                # Estimate the duration if pydub fails (very rough estimate)
                audio_duration_seconds = len(transcribed_text) * 0.1

            item = pysrt.SubRipItem(
                index=1,
                start=pysrt.SubRipTime(0, 0, 0, 0),
                end=seconds_to_srt_time(audio_duration_seconds),
                text=transcribed_text,
            )
            srt_content.append(item)

        srt_file_path = "audio.srt"
        srt_content.save(srt_file_path, encoding="utf-8")
        st.success("Transcription successful! Download the subtitle file below.")

        with open(srt_file_path, "rb") as f:
            st.download_button("Download Subtitle in SRT Format", f, file_name="audio.srt")

        os.remove(srt_file_path)

    except Exception as e:
        st.error(f"Error during transcription: {str(e)}")
        st.info(
            "Common issues: unsupported file format, model loading failure "
            "(check the Hugging Face Space logs), or audio too large for the available memory."
        )


if uploaded_file is not None:
    # Save the uploaded file to a temporary location for the transformers pipeline.
    # The pipeline can also accept file-like objects or bytes, but saving to a temp file is robust.
    # Keep the original file extension so pydub/ffmpeg can identify the format.
    temp_file_name = "temp_audio_file" + os.path.splitext(uploaded_file.name)[1]
    with open(temp_file_name, "wb") as f:
        f.write(uploaded_file.getbuffer())

    audio_file_path = temp_file_name
    transcribe_with_transformers(audio_file_path, model_size, translate)

    # Clean up the temporary file
    if os.path.exists(audio_file_path):
        os.remove(audio_file_path)