# Ai-audio / app.py
import gradio as gr
from transformers import pipeline
from pydub import AudioSegment
import os
# Load a smaller Whisper model for faster transcription
model = pipeline("automatic-speech-recognition", model="openai/whisper-base")
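
# Note (assumptions, not part of the original code): pydub relies on an ffmpeg/libav
# installation to decode most audio formats, and the pipeline above can optionally be
# created with device=0 to run on a GPU if one is available.
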
def split_audio(filepath, chunk_length_ms=30000):
"""Split audio into chunks of `chunk_length_ms` milliseconds."""
audio = AudioSegment.from_file(filepath)
chunks = []
for i in range(0, len(audio), chunk_length_ms):
chunk = audio[i:i + chunk_length_ms]
chunk_path = f"chunk_{i}.wav"
chunk.export(chunk_path, format="wav")
chunks.append(chunk_path)
return chunks
def transcribe_audio(audio_file):
    # Split the audio into chunks
    chunks = split_audio(audio_file)

    # Transcribe each chunk and collect results
    transcriptions = []
    detected_language = None
    for chunk in chunks:
        # Enable language detection and transcription (language=None lets Whisper detect it)
        result = model(chunk, generate_kwargs={"task": "transcribe", "language": None})
        transcriptions.append(result["text"])

        # Extract the detected language from the result, if the pipeline exposes it
        if "language" in result:
            detected_language = result["language"]

        os.remove(chunk)  # Clean up chunk files

    # Combine all transcriptions into one
    full_transcription = " ".join(transcriptions)

    # If no language was detected, set a default message
    if detected_language is None:
        detected_language = "unknown (language not detected)"

    # Return transcription and detected language
    return f"Detected Language: {detected_language}\n\nTranscription:\n{full_transcription}"
# Define the Gradio interface
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath", label="Upload Audio File"),
    outputs=gr.Textbox(label="Transcription and Detected Language"),
    title="Audio Transcription with Automatic Language Detection",
    description="Upload an audio file, and the system will automatically detect the language and transcribe it."
)
# Launch the Gradio interface
iface.launch()
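
# Optional: iface.launch(share=True) would expose a temporary public URL when running locally.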