Spaces:

101Frost
/

wav2vec2

Sleeping

App Files Files Community

wav2vec2 / app.py

101Frost

Create app.py

dd8edb5 verified 4 months ago

raw

history blame

4.35 kB

	import gradio as gr
	from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
	import librosa
	import torch
	import epitran
	import re
	import difflib
	import editdistance
	from jiwer import wer
	import json

	# Load the speech model and the transliterator once, at import time, so that
	# every call to analyze_phonemes reuses the same objects instead of reloading them.
	model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
	# The processor prepares raw audio for the model and decodes predicted ids back to text.
	processor = Wav2Vec2Processor.from_pretrained(model_name)
	model = Wav2Vec2ForCTC.from_pretrained(model_name)
	# Epitran instance for the 'ara-Arab' code; its transliterate() output is what
	# the rest of this file treats as IPA phoneme strings.
	epi = epitran.Epitran('ara-Arab')

	def clean_phonemes(ipa):
	    """Return *ipa* without diacritic or length-marker characters.

	    Every character in the range U+064B..U+0652, and the IPA length
	    mark U+02D0, is deleted; all other characters are kept in order.
	    """
	    marks_to_drop = re.compile(r'[\u064B-\u0652\u02D0]')
	    stripped = marks_to_drop.sub('', ipa)
	    return stripped

	def _text_to_phonemes(text):
	    """Transliterate each whitespace-separated word of *text* into a list of cleaned symbols."""
	    return [list(clean_phonemes(epi.transliterate(word))) for word in text.split()]


	def _accuracy_pct(errors, length):
	    """Return ``1 - errors / length`` as a percentage rounded to 2 places, floored at 0."""
	    return round(max(0.0, 1 - errors / max(1, length)) * 100, 2)


	def analyze_phonemes(language, reference_text, audio_file):
	    """Compare a recording with a reference text, word by word, at phoneme level.

	    The reference text and the model's transcription of the audio are both
	    transliterated with the module-level ``epi`` object, then the words are
	    paired by position and scored with an edit distance over their symbols.

	    Args:
	        language: Value of the language dropdown. Currently unused; the
	            module loads a single Arabic model.
	        reference_text: The text the speaker was expected to say.
	        audio_file: The uploaded audio, either an object exposing the file
	            path as ``.name`` or the path itself.

	    Returns:
	        A JSON string with ``reference_text``, ``transcription``,
	        ``word_alignment`` (one entry per word position) and ``metrics``.

	    Raises:
	        gr.Error: If the reference text is blank or no audio was uploaded.
	    """
	    from itertools import zip_longest

	    # Fail fast with a readable message: a blank reference has nothing to
	    # score, and a missing upload would otherwise surface as an AttributeError.
	    if not reference_text or not reference_text.strip():
	        raise gr.Error("Reference text is empty.")
	    if audio_file is None:
	        raise gr.Error("Please upload an audio file.")

	    ref_phonemes = _text_to_phonemes(reference_text)

	    # Accept both a tempfile-like object (path in .name) and a plain path.
	    audio_path = getattr(audio_file, "name", audio_file)
	    audio, _ = librosa.load(audio_path, sr=16000)
	    input_values = processor(audio, sampling_rate=16000, return_tensors="pt").input_values

	    # Greedy decoding: take the highest-scoring id per frame and let the
	    # processor turn the ids back into text.
	    with torch.no_grad():
	        logits = model(input_values).logits
	        pred_ids = torch.argmax(logits, dim=-1)
	    transcription = processor.batch_decode(pred_ids)[0].strip()

	    obs_phonemes = _text_to_phonemes(transcription)

	    results = {
	        "reference_text": reference_text,
	        "transcription": transcription,
	        "word_alignment": [],
	        "metrics": {}
	    }

	    total_phoneme_errors = 0
	    total_phoneme_length = 0
	    correct_words = 0
	    total_word_length = len(ref_phonemes)

	    # Words are paired by position. zip_longest (rather than zip) keeps the
	    # unmatched tail: a reference word with no counterpart counts as fully
	    # deleted, and an extra transcribed word counts as fully inserted.
	    word_pairs = zip_longest(ref_phonemes, obs_phonemes, fillvalue=[])
	    for word_index, (ref, obs) in enumerate(word_pairs):
	        edits = editdistance.eval(ref, obs)

	        # Keep only the non-matching opcodes; '-' stands for an empty side.
	        opcodes = difflib.SequenceMatcher(None, ref, obs).get_opcodes()
	        error_details = [
	            {
	                "type": tag.upper(),
	                "reference": ''.join(ref[i1:i2]) or '-',
	                "observed": ''.join(obs[j1:j2]) or '-'
	            }
	            for tag, i1, i2, j1, j2 in opcodes
	            if tag != 'equal'
	        ]

	        results["word_alignment"].append({
	            "word_index": word_index,
	            "reference_phonemes": ''.join(ref),
	            "observed_phonemes": ''.join(obs),
	            "edit_distance": edits,
	            # Floored at 0: more edits than reference symbols would go negative.
	            "accuracy": _accuracy_pct(edits, len(ref)),
	            "is_correct": edits == 0,
	            "errors": error_details
	        })

	        total_phoneme_errors += edits
	        total_phoneme_length += len(ref)
	        if edits == 0:
	            correct_words += 1

	    # Rates are relative to the reference; the error rate may exceed 100 when
	    # there are many insertions, while accuracies are floored at 0.
	    phoneme_acc = _accuracy_pct(total_phoneme_errors, total_phoneme_length)
	    phoneme_er = round((total_phoneme_errors / max(1, total_phoneme_length)) * 100, 2)
	    word_acc = round((correct_words / max(1, total_word_length)) * 100, 2)
	    word_er = round(((total_word_length - correct_words) / max(1, total_word_length)) * 100, 2)
	    text_wer = round(wer(reference_text, transcription) * 100, 2)

	    results["metrics"] = {
	        "word_accuracy": word_acc,
	        "word_error_rate": word_er,
	        "phoneme_accuracy": phoneme_acc,
	        "phoneme_error_rate": phoneme_er,
	        "asr_word_error_rate": text_wer
	    }

	    return json.dumps(results, indent=2, ensure_ascii=False)

	# Assemble the web UI: three inputs (language, reference text, audio upload)
	# feed analyze_phonemes, whose result is shown in a JSON viewer.
	language_choice = gr.Dropdown(["Arabic"], label="Language", value="Arabic")
	reference_box = gr.Textbox(label="Reference Text", value="فَبِأَيِّ آلَاءِ رَبِّكُمَا تُكَذِّبَانِ")
	audio_upload = gr.File(label="Upload Audio File", type="file")
	results_view = gr.JSON(label="Phoneme Alignment Results")

	demo = gr.Interface(
	    fn=analyze_phonemes,
	    inputs=[language_choice, reference_box, audio_upload],
	    outputs=results_view,
	    title="Arabic Phoneme Alignment Analysis",
	    description="Compare audio pronunciation with reference text at phoneme level",
	)

	# Start serving the interface.
	demo.launch()