"""Gradio demo that compares spoken audio with a reference text at phoneme level.

The reference text and the ASR transcription of the uploaded audio are both
converted to IPA with Epitran, compared word by word (by position), and the
result is returned as a JSON report.
"""

import difflib
import json
import re

import editdistance
import epitran
import gradio as gr
import librosa
import torch
from jiwer import wer
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Sampling rate used both when loading audio and when calling the processor.
SAMPLE_RATE = 16000

# Arabic harakat (U+064B-U+0652) and the IPA length mark (U+02D0).
_STRIP_MARKS_RE = re.compile(r'[\u064B-\u0652\u02D0]')

DEFAULT_TEXTS = {
    "Arabic": "فَبِأَيِّ آلَاءِ رَبِّكُمَا تُكَذِّبَانِ",
    "English": "The quick brown fox jumps over the lazy dog",
}


def _load_language(asr_checkpoint, epitran_code):
    """Load the processor, CTC model and Epitran instance for one language."""
    return {
        "processor": Wav2Vec2Processor.from_pretrained(asr_checkpoint),
        "model": Wav2Vec2ForCTC.from_pretrained(asr_checkpoint),
        "epitran": epitran.Epitran(epitran_code),
    }


# Load both models at startup
MODELS = {
    "Arabic": _load_language(
        "jonatasgrosman/wav2vec2-large-xlsr-53-arabic", "ara-Arab"
    ),
    "English": _load_language(
        "facebook/wav2vec2-large-960h-lv60-self", "eng-Latn"
    ),
}


def clean_phonemes(ipa):
    """Remove diacritics and length markers from phonemes."""
    return _STRIP_MARKS_RE.sub('', ipa)


def _text_to_phonemes(epi, text):
    """Return one list of cleaned IPA characters per whitespace-separated word."""
    return [list(clean_phonemes(epi.transliterate(word))) for word in text.split()]


def _resolve_audio_path(audio_file):
    """Return a filesystem path for the uploaded audio.

    Accepts either a plain path string or an object exposing the path as
    ``.name`` (what the original code expected from the upload component).

    Raises:
        gr.Error: if nothing was uploaded.
    """
    if audio_file is None:
        raise gr.Error("Please upload an audio file.")
    if isinstance(audio_file, str):
        return audio_file
    return audio_file.name


def _align_word(index, ref, obs):
    """Build the alignment report for one reference/observed phoneme pair."""
    edits = editdistance.eval(ref, obs)
    # Clamp at 0: an observed word much longer than the reference would
    # otherwise produce a negative percentage.
    accuracy = max(0.0, round((1 - edits / max(1, len(ref))) * 100, 2))

    error_details = []
    opcodes = difflib.SequenceMatcher(None, ref, obs).get_opcodes()
    for tag, i1, i2, j1, j2 in opcodes:
        if tag == 'equal':
            continue
        error_details.append({
            "type": tag.upper(),
            "reference": ''.join(ref[i1:i2]) or '-',
            "observed": ''.join(obs[j1:j2]) or '-',
        })

    return {
        "word_index": index,
        "reference_phonemes": ''.join(ref),
        "observed_phonemes": ''.join(obs),
        "edit_distance": edits,
        "accuracy": accuracy,
        "is_correct": edits == 0,
        "errors": error_details,
    }


def analyze_phonemes(language, reference_text, audio_file):
    """Transcribe the audio and score it against the reference text.

    Args:
        language: Key into ``MODELS`` ("Arabic" or "English").
        reference_text: Text the speaker was expected to read.
        audio_file: Uploaded audio, as a path string or an object whose
            ``.name`` is the path.

    Returns:
        A JSON string with the language, both texts, a per-word phoneme
        alignment and aggregate word/phoneme/ASR metrics.

    Raises:
        gr.Error: for an unknown language, blank reference text or a
            missing audio upload.
    """
    if language not in MODELS:
        raise gr.Error(f"Unsupported language: {language!r}")
    if not reference_text or not reference_text.strip():
        raise gr.Error("Please enter the reference text.")
    audio_path = _resolve_audio_path(audio_file)

    # Get the appropriate model components
    lang_models = MODELS[language]
    processor = lang_models["processor"]
    model = lang_models["model"]
    epi = lang_models["epitran"]

    # Convert reference text to phonemes
    ref_phonemes = _text_to_phonemes(epi, reference_text)

    # Process audio file
    audio, _ = librosa.load(audio_path, sr=SAMPLE_RATE)
    input_values = processor(
        audio, sampling_rate=SAMPLE_RATE, return_tensors="pt"
    ).input_values

    # Get transcription
    with torch.no_grad():
        logits = model(input_values).logits
    pred_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(pred_ids)[0].strip()

    # Convert transcription to phonemes
    obs_phonemes = _text_to_phonemes(epi, transcription)

    # Words are aligned by position. Pad the observed side with empty words so
    # reference words missing from the transcription are scored as fully wrong
    # instead of being dropped from the phoneme totals. Observed words beyond
    # the reference length are ignored, as before.
    total_word_length = len(ref_phonemes)
    missing = max(0, total_word_length - len(obs_phonemes))
    padded_obs = obs_phonemes[:total_word_length] + [[] for _ in range(missing)]

    word_alignment = [
        _align_word(index, ref, obs)
        for index, (ref, obs) in enumerate(zip(ref_phonemes, padded_obs))
    ]

    # Calculate metrics
    total_phoneme_errors = sum(entry["edit_distance"] for entry in word_alignment)
    total_phoneme_length = sum(len(ref) for ref in ref_phonemes)
    correct_words = sum(1 for entry in word_alignment if entry["is_correct"])

    phoneme_ratio = total_phoneme_errors / max(1, total_phoneme_length)
    phoneme_acc = max(0.0, round((1 - phoneme_ratio) * 100, 2))
    phoneme_er = round(phoneme_ratio * 100, 2)
    word_acc = round((correct_words / max(1, total_word_length)) * 100, 2)
    word_er = round(
        ((total_word_length - correct_words) / max(1, total_word_length)) * 100, 2
    )
    # Compare case-insensitively so that a difference in letter case between
    # the reference and the ASR output is not counted as a word error.
    # NOTE(review): the English checkpoint appears to emit upper-case text -
    # confirm against the model card.
    text_wer = round(wer(reference_text.lower(), transcription.lower()) * 100, 2)

    # Prepare results in JSON format
    results = {
        "language": language,
        "reference_text": reference_text,
        "transcription": transcription,
        "word_alignment": word_alignment,
        "metrics": {
            "word_accuracy": word_acc,
            "word_error_rate": word_er,
            "phoneme_accuracy": phoneme_acc,
            "phoneme_error_rate": phoneme_er,
            "asr_word_error_rate": text_wer,
        },
    }
    return json.dumps(results, indent=2, ensure_ascii=False)


# Create Gradio interface with language-specific default text
def get_default_text(language):
    """Return the sample reference sentence for *language*, or "" if unknown."""
    return DEFAULT_TEXTS.get(language, "")


# NOTE(review): the original indentation was lost; the Row is assumed to hold
# the language selector and the reference text box - confirm the layout.
with gr.Blocks() as demo:
    gr.Markdown("# Multilingual Phoneme Alignment Analysis")
    gr.Markdown("Compare audio pronunciation with reference text at phoneme level")

    with gr.Row():
        language = gr.Dropdown(
            ["Arabic", "English"],
            label="Language",
            value="Arabic",
        )
        reference_text = gr.Textbox(
            label="Reference Text",
            value=get_default_text("Arabic"),
        )

    # NOTE(review): type="file" may not be accepted by every Gradio release -
    # verify against the installed version. analyze_phonemes accepts either a
    # path string or an object with ``.name``.
    audio_input = gr.File(label="Upload Audio File", type="file")
    submit_btn = gr.Button("Analyze")
    output = gr.JSON(label="Phoneme Alignment Results")

    # Update default text when language changes
    language.change(
        fn=get_default_text,
        inputs=language,
        outputs=reference_text,
    )

    submit_btn.click(
        fn=analyze_phonemes,
        inputs=[language, reference_text, audio_input],
        outputs=output,
    )

demo.launch()