Spaces:

Merlintxu
/

Wav2Txt

Build error

App Files Files Community

Wav2Txt / app.py

Merlintxu

Update app.py

5d729bc verified over 1 year ago

raw

history blame

7.31 kB

	import gradio as gr
	from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
	import torch
	import librosa
	import subprocess
	from langdetect import detect_langs
	import os
	import warnings
	from transformers import logging
	import math
	import json
	from pyannote.audio import Pipeline

	# Silence every Python warning and restrict transformers' own logger to
	# errors. `logging` here is `transformers.logging` (see the imports above),
	# not the standard-library module of the same name.
	warnings.filterwarnings("ignore")
	logging.set_verbosity_error()

	# Candidate speech-recognition checkpoints per ISO 639-1 language code.
	# Order matters: the first entry of a list is the one used by default.
	MODELS = {
	    "es": [
	        "openai/whisper-large-v3",
	        "facebook/wav2vec2-large-xlsr-53-spanish",
	        "jonatasgrosman/wav2vec2-xls-r-1b-spanish",
	    ],
	    "en": [
	        "openai/whisper-large-v3",
	        "facebook/wav2vec2-large-960h",
	        # The base 960h checkpoint is published under the "facebook"
	        # organisation; the previous "microsoft/..." id does not appear to
	        # exist on the Hub (NOTE(review): confirm before release).
	        "facebook/wav2vec2-base-960h",
	    ],
	    "pt": [
	        "facebook/wav2vec2-large-xlsr-53-portuguese",
	        "openai/whisper-medium",
	        "jonatasgrosman/wav2vec2-large-xlsr-53-portuguese",
	    ],
	}

	def convert_audio_to_wav(audio_path):
	    """Convert an audio file to 16 kHz mono WAV with ffmpeg.

	    Args:
	        audio_path: Path of the input file, in any format ffmpeg can decode.

	    Returns:
	        The output path, always ``"converted_audio.wav"`` relative to the
	        current working directory.

	    Raises:
	        subprocess.CalledProcessError: If ffmpeg exits with a non-zero
	            status; its stderr output is attached to the exception.
	    """
	    wav_path = "converted_audio.wav"
	    command = [
	        "ffmpeg",
	        "-nostdin",  # never wait for keyboard input inside a server process
	        # Overwrite an existing output file. Without this flag ffmpeg refuses
	        # to replace a file left by an earlier call, and the stale audio
	        # would be reused silently.
	        "-y",
	        "-i", audio_path,
	        "-ac", "1",
	        "-ar", "16000",
	        wav_path,
	    ]
	    # check=True turns a failed conversion into an exception instead of
	    # returning a path to a missing or outdated file.
	    subprocess.run(
	        command,
	        stdin=subprocess.DEVNULL,
	        stdout=subprocess.DEVNULL,
	        stderr=subprocess.PIPE,
	        check=True,
	    )
	    return wav_path

	def detect_language(audio_path):
	    """Guess the spoken language of an audio file.

	    The first 30 seconds are transcribed with ``openai/whisper-base`` and the
	    resulting text is classified with ``langdetect``.

	    Args:
	        audio_path: Path of an audio file readable by ``librosa``.

	    Returns:
	        The language code of the most probable language as reported by
	        ``langdetect`` (for example ``"es"``, ``"en"`` or ``"pt"``). When the
	        best guess is Spanish or Portuguese and the two scores are within 0.2
	        of each other, ``"es"`` is returned.

	    Note:
	        ``detect_langs`` is called on whatever text Whisper produced; if the
	        transcript is empty (e.g. silence) langdetect presumably raises, and
	        that exception is propagated unchanged -- TODO confirm.
	    """
	    speech, _ = librosa.load(audio_path, sr=16000, duration=30)

	    processor = WhisperProcessor.from_pretrained("openai/whisper-base")
	    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")

	    input_features = processor(
	        speech, sampling_rate=16000, return_tensors="pt"
	    ).input_features
	    predicted_ids = model.generate(input_features)
	    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

	    langs = detect_langs(transcription)
	    best = max(langs, key=lambda candidate: candidate.prob)

	    es_confidence = next((lang.prob for lang in langs if lang.lang == "es"), 0)
	    pt_confidence = next((lang.prob for lang in langs if lang.lang == "pt"), 0)

	    # Spanish and Portuguese text is easily confused, so prefer Spanish when
	    # the two scores are close. The tie-break must only apply when one of them
	    # is actually the best guess: otherwise both scores are typically 0 for,
	    # say, English text, the difference is 0, and every language would be
	    # reported as Spanish.
	    if best.lang in ("es", "pt") and abs(es_confidence - pt_confidence) < 0.2:
	        return "es"

	    return best.lang

	def diarize_audio(wav_audio):
	    """Run pyannote speaker diarization on an audio file.

	    Args:
	        wav_audio: Path of the audio file to analyse.

	    Returns:
	        The object returned by the ``pyannote/speaker-diarization`` pipeline
	        for this file.

	    Raises:
	        RuntimeError: If the pretrained pipeline could not be loaded.
	    """
	    # Named `diarizer` rather than `pipeline` so the transformers `pipeline`
	    # factory imported at module level is not shadowed inside this function.
	    diarizer = Pipeline.from_pretrained("pyannote/speaker-diarization")

	    # pyannote reports a failed download (the pipeline is gated on the Hub)
	    # by returning None rather than raising -- NOTE(review): behaviour of the
	    # installed pyannote.audio version should be confirmed. Fail here with a
	    # clear message instead of "'NoneType' object is not callable".
	    if diarizer is None:
	        raise RuntimeError(
	            "Could not load 'pyannote/speaker-diarization'. Accept the "
	            "pipeline's user conditions on the Hugging Face Hub and make an "
	            "access token available to this process."
	        )

	    return diarizer(wav_audio)

	# wav2vec2-style encoders cannot process inputs shorter than their
	# convolutional receptive field (400 samples, i.e. 25 ms at 16 kHz), so a
	# trailing sliver shorter than this is skipped rather than sent to the model.
	_MIN_CHUNK_SAMPLES = 400


	def _iter_audio_chunks(speech, rate, chunk_duration):
	    """Yield ``(start_s, end_s, samples)`` windows covering the whole signal."""
	    duration = len(speech) / rate
	    # ceil, not int: truncating the duration drops the final partial second
	    # (and the entire clip when it is shorter than one second).
	    for start in range(0, math.ceil(duration), chunk_duration):
	        end = min(start + chunk_duration, duration)
	        yield start, end, speech[int(start * rate):int(end * rate)]


	def _merge_with_diarization(transcriptions, diarization):
	    """Attach each chunk text to the speaker turns that contain its start time."""
	    merged = []
	    # itertracks(yield_label=True) yields (segment, track, label); the segment
	    # carries `start` / `end` already expressed in seconds, so they must not
	    # be divided by the sample rate.
	    for turn, _track, speaker in diarization.itertracks(yield_label=True):
	        texts = [
	            text for timestamp, text in transcriptions
	            if turn.start <= timestamp <= turn.end
	        ]
	        merged.append((turn.start, turn.end, speaker, " ".join(texts).strip()))
	    return merged


	def transcribe_audio_stream(audio, model_name, diarization):
	    """Transcribe an audio file chunk by chunk, streaming partial results.

	    Args:
	        audio: Path of the input audio file (converted to WAV internally).
	        model_name: Hugging Face model id. Ids containing ``"whisper"`` are
	            run through ``WhisperForConditionalGeneration`` in 30-second
	            chunks; anything else goes through the transformers
	            ``automatic-speech-recognition`` pipeline in 10-second chunks.
	        diarization: Diarization result exposing
	            ``itertracks(yield_label=True)``.

	    Yields:
	        ``(transcriptions, progress)`` after every chunk. ``transcriptions``
	        is the same, growing list of ``(chunk_start_seconds, text)`` tuples
	        and ``progress`` is a percentage in ``[0, 100]``.

	    Returns:
	        A list of ``(start, end, speaker, text)`` tuples, one per diarization
	        turn. Because this is a generator the value is delivered through
	        ``StopIteration.value`` (or ``yield from``); a plain ``for`` loop
	        discards it.

	    Note:
	        A chunk's text is attached to a turn only when the chunk *start*
	        falls inside that turn, which is coarse for long chunks.
	        NOTE(review): consider word-level timestamps for a finer merge.
	    """
	    wav_audio = convert_audio_to_wav(audio)
	    speech, rate = librosa.load(wav_audio, sr=16000)
	    duration = len(speech) / rate

	    # Both back-ends are reduced to a `transcribe_chunk(samples) -> str`
	    # callable so that a single chunk loop serves either of them.
	    if "whisper" in model_name:
	        processor = WhisperProcessor.from_pretrained(model_name)
	        model = WhisperForConditionalGeneration.from_pretrained(model_name)
	        chunk_duration = 30  # seconds

	        def transcribe_chunk(chunk):
	            input_features = processor(
	                chunk, sampling_rate=rate, return_tensors="pt"
	            ).input_features
	            predicted_ids = model.generate(input_features)
	            return processor.batch_decode(
	                predicted_ids, skip_special_tokens=True
	            )[0]
	    else:
	        transcriber = pipeline("automatic-speech-recognition", model=model_name)
	        chunk_duration = 10  # seconds

	        def transcribe_chunk(chunk):
	            return transcriber(chunk)["text"]

	    transcriptions = []
	    for start, end, chunk in _iter_audio_chunks(speech, rate, chunk_duration):
	        if len(chunk) < _MIN_CHUNK_SAMPLES:
	            continue
	        transcriptions.append((start, transcribe_chunk(chunk)))
	        yield transcriptions, min(100, (end / duration) * 100)

	    # Merge diarization results with transcription
	    return _merge_with_diarization(transcriptions, diarization)

	def detect_and_select_model(audio):
	    """Detect the language of an audio file and list the models for it.

	    Args:
	        audio: Path of the input audio file.

	    Returns:
	        A ``(language, model_options)`` tuple. ``model_options`` is the
	        ``MODELS`` entry for the detected language, or the English entry when
	        the language has no entry of its own.
	    """
	    language = detect_language(convert_audio_to_wav(audio))
	    fallback_key = "en"
	    chosen_key = language if language in MODELS else fallback_key
	    return language, MODELS[chosen_key]

	def save_transcription(transcriptions, file_format):
	    """Write a speaker-labelled transcription to disk.

	    Args:
	        transcriptions: Iterable of ``(start, end, speaker, text)`` tuples.
	        file_format: ``"txt"`` for one ``[start-end] speaker: text`` line per
	            entry, or ``"json"`` for a JSON array of the entries.

	    Returns:
	        The path of the written file: ``"transcription.txt"`` or
	        ``"transcription.json"`` in the current working directory.

	    Raises:
	        ValueError: If ``file_format`` is neither ``"txt"`` nor ``"json"``.
	    """
	    # UTF-8 is forced because the transcripts contain accented Spanish and
	    # Portuguese characters and the platform default encoding may not be able
	    # to represent them.
	    if file_format == "txt":
	        path = "transcription.txt"
	        with open(path, "w", encoding="utf-8") as f:
	            f.writelines(
	                f"[{start}-{end}] {speaker}: {text}\n"
	                for start, end, speaker, text in transcriptions
	            )
	        return path

	    if file_format == "json":
	        path = "transcription.json"
	        with open(path, "w", encoding="utf-8") as f:
	            # Keep accented characters readable instead of \uXXXX escapes.
	            json.dump(transcriptions, f, ensure_ascii=False)
	        return path

	    # Previously an unknown format fell through and returned None, which
	    # callers would then treat as a file path.
	    raise ValueError(
	        f"Unsupported file_format: {file_format!r} (expected 'txt' or 'json')"
	    )

	def combined_interface(audio):
	    """Gradio handler: detect language, diarize, transcribe and export.

	    This is a generator; every yielded tuple has eight items, in this order:
	    detected language, model dropdown update, selected model, transcription
	    text, progress (0-100), status message, TXT file path, JSON file path.
	    The two file paths are ``None`` until the transcription has finished.

	    Args:
	        audio: Path of the uploaded audio file.

	    Yields:
	        One eight-item tuple when processing starts, one per transcribed
	        chunk, and a final one on completion. If anything fails, a single
	        tuple carrying the error message in the status slot is yielded
	        instead.
	    """
	    try:
	        language, model_options = detect_and_select_model(audio)
	        selected_model = model_options[0]
	        # The dropdown is created without choices, so they have to be sent as
	        # an update; a bare list would be interpreted as the selected value.
	        model_choices = gr.update(choices=model_options, value=selected_model)

	        yield language, model_choices, selected_model, "", 0, "Initializing...", None, None

	        wav_audio = convert_audio_to_wav(audio)
	        diarization = diarize_audio(wav_audio)

	        transcriptions_text = ""
	        stream = transcribe_audio_stream(audio, selected_model, diarization)
	        # The generator is driven by hand because its *return value* (the
	        # speaker-labelled result) is only reachable via StopIteration.value;
	        # a plain `for` loop would throw it away.
	        while True:
	            try:
	                partial_transcriptions, progress = next(stream)
	            except StopIteration as finished:
	                speaker_transcriptions = finished.value or []
	                break

	            # Partial results are (chunk_start, text) pairs; speaker labels
	            # only exist once the whole file has been processed.
	            transcriptions_text = "\n".join(
	                f"[{timestamp}] {text}"
	                for timestamp, text in partial_transcriptions
	            )
	            progress_int = math.floor(progress)
	            status = f"Transcribing... {progress_int}% complete"
	            yield (
	                language, model_choices, selected_model,
	                transcriptions_text, progress_int, status, None, None,
	            )

	        # Keep the chunk-level text on screen if no speaker turn was found.
	        if speaker_transcriptions:
	            transcriptions_text = "\n".join(
	                f"[{start}-{end}] {speaker}: {text}"
	                for start, end, speaker, text in speaker_transcriptions
	            )

	        txt_path = save_transcription(speaker_transcriptions, "txt")
	        json_path = save_transcription(speaker_transcriptions, "json")

	        yield (
	            language, model_choices, selected_model,
	            transcriptions_text, 100, "Transcription complete!",
	            txt_path, json_path,
	        )

	    except Exception as e:
	        # UI boundary: report the failure in the status field instead of
	        # letting the request die with a traceback.
	        yield (
	            "", gr.update(choices=[], value=None), "",
	            "An error occurred during processing.", 0, f"Error: {e}",
	            None, None,
	        )

	    finally:
	        # Clean up temporary files, on failure as well as on success.
	        try:
	            os.remove("converted_audio.wav")
	        except FileNotFoundError:
	            pass

	# Single-page UI: one audio upload in, eight output widgets driven by the
	# tuples that `combined_interface` yields (same order as listed below).
	iface = gr.Interface(
	    fn=combined_interface,
	    inputs=gr.Audio(type="filepath"),
	    outputs=[
	        gr.Textbox(label="Detected Language"),
	        gr.Dropdown(label="Available Models", choices=[]),
	        gr.Textbox(label="Selected Model"),
	        gr.Textbox(label="Transcription", lines=10),
	        gr.Slider(minimum=0, maximum=100, label="Progress", interactive=False),
	        gr.Textbox(label="Status"),
	        # No initial `value`: transcription.txt / transcription.json do not
	        # exist when the app starts, and pointing a File component at a
	        # missing path presumably fails while the UI is being built.
	        gr.File(label="Download Transcription (TXT)", type="filepath", interactive=True),
	        gr.File(label="Download Transcription (JSON)", type="filepath", interactive=True),
	    ],
	    title="Multilingual Audio Transcriber with Real-time Display, Timestamps, and Speaker Diarization",
	    description="Upload an audio file to detect the language, select the transcription model, and get the transcription with timestamps and speaker labels in real-time. Download the transcription as TXT or JSON. Optimized for Spanish, English, and Portuguese.",
	    live=True,
	)

	# Script entry point: enable request queuing, then start the web server.
	if __name__ == "__main__":
	    queued_app = iface.queue()
	    queued_app.launch()