import streamlit as st
from transformers import pipeline
from pydub import AudioSegment
import pysrt
import os

st.title("Speech-to-Text with Transformers")

with st.expander("README"):
  st.write("This tool transcribes audio files using Hugging Face Transformers. Upload an audio file, choose your model size, and optionally translate to English. A WebVTT/SRT file will be generated and can be downloaded. This is suitable for use as a subtitle file (e.g., in DaVinci Resolve Import Subtitles).")

# Upload audio file
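# st.file_uploader returns an UploadedFile (a BytesIO subclass), or None until a file is chosen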
uploaded_file = st.file_uploader("Upload Audio File", type=["mp3", "wav"])

# Model selection
# Note: For Hugging Face Spaces, larger models might require more resources (GPU).
# "tiny", "base", "small", "medium" are common Whisper sizes.
model_size = st.selectbox(
    "Model Size (select a smaller model for faster inference or limited resources)",
    ("openai/whisper-tiny", "openai/whisper-base", "openai/whisper-small", "openai/whisper-medium")
)
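# Approximate parameter counts: tiny ~39M, base ~74M, small ~244M, medium ~769M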

# Should we translate to English?
translate = st.checkbox("Would you like a translation to English?")

# Information about resource usage on Hugging Face Spaces
st.info("When running on Hugging Face Spaces, model inference is limited by the space's compute resources. Larger models will consume more resources and time.")

@st.cache_resource
def load_whisper_pipeline(model_name):
    """
    Loads the Hugging Face Whisper ASR pipeline.
    Uses st.cache_resource to avoid reloading the model on every rerun.
    We explicitly tell the pipeline to return timestamps for long-form audio.
    """
    st.info(f"Loading {model_name} model... This may take a moment.")
    # Set return_timestamps=True to handle audio longer than 30 seconds
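    # For very long recordings, chunk_length_s=30 could also be passed here to
    # enable chunked inference (faster, at some cost in accuracy at chunk boundaries).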
    return pipeline("automatic-speech-recognition", model=model_name, return_timestamps=True)

def transcribe_with_transformers(audio_file_path, model_name, translate_to_english):
    """
    Transcribes audio using the Hugging Face Transformers pipeline and generates an SRT.
    """
    try:
        asr_pipeline = load_whisper_pipeline(model_name)

        st.info("Transcribing audio... Please wait.")
        
        # Configure generation arguments for translation if requested
        generate_kwargs = {}
        if translate_to_english:
            generate_kwargs["task"] = "translate"

        # Pass the audio file path and any generation arguments to the pipeline
        prediction = asr_pipeline(audio_file_path, generate_kwargs=generate_kwargs)

        transcribed_text = prediction["text"]
        st.subheader("Full Transcription Output:")
        st.write(transcribed_text)

        srt_content = pysrt.SubRipFile()

        def seconds_to_srt_time(total_seconds):
            """Convert a float number of seconds to a pysrt.SubRipTime."""
            hours = int(total_seconds / 3600)
            minutes = int((total_seconds % 3600) / 60)
            seconds = int(total_seconds % 60)
            milliseconds = int((total_seconds - int(total_seconds)) * 1000)
            return pysrt.SubRipTime(hours, minutes, seconds, milliseconds)

        # The 'chunks' key is present when return_timestamps=True was set
        if "chunks" in prediction:
            for i, chunk in enumerate(prediction["chunks"]):
                start_time_seconds = chunk["timestamp"][0] if chunk["timestamp"][0] is not None else 0.0
                # Whisper sometimes returns None for the final end timestamp; default to start + 1s
                end_time_seconds = chunk["timestamp"][1] if chunk["timestamp"][1] is not None else start_time_seconds + 1.0

                item = pysrt.SubRipItem(
                    index=i + 1,
                    start=seconds_to_srt_time(start_time_seconds),
                    end=seconds_to_srt_time(end_time_seconds),
                    text=chunk["text"].strip()
                )
                srt_content.append(item)
        else:
            st.warning("Could not retrieve segmented timestamps. Generating a single subtitle entry.")
            # Fallback: create one subtitle spanning the whole file. Less useful
            # than per-chunk timing, but it guarantees some output.
            try:
                audio = AudioSegment.from_file(audio_file_path)
                audio_duration_seconds = audio.duration_seconds
            except Exception:
                # If pydub cannot read the file, make a very rough estimate
                # (assume roughly 10 characters of text per second of speech)
                audio_duration_seconds = len(transcribed_text) * 0.1

            item = pysrt.SubRipItem(
                index=1,
                start=pysrt.SubRipTime(0, 0, 0, 0),
                end=seconds_to_srt_time(audio_duration_seconds),
                text=transcribed_text
            )
            srt_content.append(item)

        srt_file_path = "audio.srt"
        srt_content.save(srt_file_path, encoding='utf-8')
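        # pysrt writes standard SRT blocks: index, "HH:MM:SS,mmm --> HH:MM:SS,mmm", text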

        st.success("Transcription successful! Download subtitle file?")
        with open(srt_file_path, "rb") as f:
            st.download_button("Download Subtitle in SRT Format", f, file_name="audio.srt")
        os.remove(srt_file_path)

    except Exception as e:
        st.error(f"Error during transcription: {str(e)}")
        st.info("Common issues: File format not supported, model loading failed (check Hugging Face Space logs), or audio too large for available memory.")


if uploaded_file is not None:
    # Save the uploaded file to a temporary location for the transformers pipeline.
    # The pipeline can also accept file-like objects or bytes, but a real file on disk is robust.
    # Use the original filename's extension: uploaded_file.type is a MIME type
    # ("audio/mpeg" for .mp3), so deriving the suffix from it would mislabel the file.
    temp_file_name = "temp_audio_file" + os.path.splitext(uploaded_file.name)[1]
    with open(temp_file_name, "wb") as f:
        f.write(uploaded_file.getbuffer())

    audio_file_path = temp_file_name

    transcribe_with_transformers(audio_file_path, model_size, translate)

    # Clean up the temporary file
    if os.path.exists(audio_file_path):
        os.remove(audio_file_path)
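
# Minimal sketch for running locally (assumes this file is saved as app.py and
# that ffmpeg is on PATH for audio decoding):
#   pip install streamlit transformers torch pydub pysrt
#   streamlit run app.py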