import gradio as gr
import torch
import librosa
from transformers import Wav2Vec2Processor, AutoModelForCTC
import zipfile
import os
import firebase_admin
from firebase_admin import credentials, firestore
from datetime import datetime
import json
import tempfile

# Initialize Firebase from the service-account JSON stored in the
# `firebase_creds` environment variable (required at startup)
firebase_config = json.loads(os.environ["firebase_creds"])
cred = credentials.Certificate(firebase_config)
firebase_admin.initialize_app(cred)
db = firestore.client()

# Load the ASR model and processor
MODEL_NAME = "eleferrand/xlsr53_Amis"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = AutoModelForCTC.from_pretrained(MODEL_NAME)

# UI strings for English and Traditional Chinese
LANGUAGE = {
    "en": {
        "title": "ASR Demo with Editable Transcription",
        "step1": "Step 1: Audio Upload & Transcription",
        "audio_input": "Audio Input",
        "transcribe_btn": "Transcribe Audio",
        "step2": "Step 2: Review & Edit Transcription",
        "original_text": "Original Transcription",
        "corrected_text": "Corrected Transcription",
        "transcription_placeholder": "Transcription will appear here...",
        "step3": "Step 3: User Information",
        "age_label": "Age",
        "native_speaker": "Native Amis Speaker",
        "step4": "Step 4: Save & Download",
        "save_btn": "Save Correction to Database",
        "save_status": "Save Status",
        "download_btn": "Download Results (ZIP)",
        "status_placeholder": "Status messages will appear here...",
        "toggle_lang": "中文/English"
    },
    "zh": {
        "title": "可編輯轉寫的語音辨識演示",
        "step1": "步驟一: 音頻上傳與轉寫",
        "audio_input": "音頻輸入",
        "transcribe_btn": "開始轉寫",
        "step2": "步驟二: 校對與編輯轉寫結果",
        "original_text": "原始轉寫結果",
        "corrected_text": "校正後文本",
        "transcription_placeholder": "轉寫結果將顯示在此處...",
        "step3": "步驟三: 用戶資訊",
        "age_label": "年齡",
        "native_speaker": "阿美族母語者",
        "step4": "步驟四: 保存與下載",
        "save_btn": "保存校正結果至數據庫",
        "save_status": "保存狀態",
        "download_btn": "下載結果(ZIP壓縮檔)",
        "status_placeholder": "狀態訊息將顯示在此處...",
        "toggle_lang": "English/中文"
    }
}

current_lang = gr.State(value="en")


def transcribe(audio_file):
    """Transcribe an audio file with the CTC model, resampling to 16 kHz."""
    try:
        audio, rate = librosa.load(audio_file, sr=16000)
        input_values = processor(audio, sampling_rate=16000, return_tensors="pt").input_values
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]
        return transcription.replace("[UNK]", "")
    except Exception as e:
        return f"Error processing file: {e}"


def transcribe_both(audio_file):
    """Return the transcription twice (read-only original + editable copy) plus timing."""
    start_time = datetime.now()
    transcription = transcribe(audio_file)
    processing_time = (datetime.now() - start_time).total_seconds()
    return transcription, transcription, processing_time


def store_correction(original_transcription, corrected_transcription, audio_file,
                     processing_time, age, native_speaker):
    """Save the original/corrected pair, audio metadata, and user info to Firestore."""
    try:
        audio_metadata = {}
        if audio_file and os.path.exists(audio_file):
            audio, sr = librosa.load(audio_file, sr=16000)
            duration = librosa.get_duration(y=audio, sr=sr)
            file_size = os.path.getsize(audio_file)
            audio_metadata = {'duration': duration, 'file_size': file_size}
        combined_data = {
            'original_text': original_transcription,
            'corrected_text': corrected_transcription,
            'timestamp': datetime.now().isoformat(),
            'processing_time': processing_time,
            'audio_metadata': audio_metadata,
            'audio_url': None,
            'model_name': MODEL_NAME,
            'user_info': {
                'native_amis_speaker': native_speaker,
                'age': age
            }
        }
        db.collection('transcriptions').add(combined_data)
        return "Correction saved successfully!"
except Exception as e: return f"Error saving correction: {e}" def prepare_download(audio_file, original_transcription, corrected_transcription): if audio_file is None: return None tmp_zip = tempfile.NamedTemporaryFile(delete=False, suffix=".zip") tmp_zip.close() with zipfile.ZipFile(tmp_zip.name, "w") as zf: if os.path.exists(audio_file): zf.write(audio_file, arcname="audio.wav") orig_txt = "original_transcription.txt" with open(orig_txt, "w", encoding="utf-8") as f: f.write(original_transcription) zf.write(orig_txt, arcname="original_transcription.txt") os.remove(orig_txt) corr_txt = "corrected_transcription.txt" with open(corr_txt, "w", encoding="utf-8") as f: f.write(corrected_transcription) zf.write(corr_txt, arcname="corrected_transcription.txt") os.remove(corr_txt) return tmp_zip.name def toggle_language(lang): new_lang = "zh" if lang == "en" else "en" lang_dict = LANGUAGE[new_lang] return [ gr.Markdown.update(value=f"

{lang_dict['title']}

"), gr.Markdown.update(value=f"### {lang_dict['step1']}"), gr.Audio.update(label=lang_dict['audio_input']), gr.Button.update(value=lang_dict['transcribe_btn']), gr.Markdown.update(value=f"### {lang_dict['step2']}"), gr.Textbox.update(label=lang_dict['original_text'], placeholder=lang_dict['transcription_placeholder']), gr.Textbox.update(label=lang_dict['corrected_text'], placeholder=lang_dict['transcription_placeholder']), gr.Markdown.update(value=f"### {lang_dict['step3']}"), gr.Slider.update(label=lang_dict['age_label']), gr.Checkbox.update(label=lang_dict['native_speaker']), gr.Markdown.update(value=f"### {lang_dict['step4']}"), gr.Button.update(value=lang_dict['save_btn']), gr.Textbox.update(label=lang_dict['save_status'], placeholder=lang_dict['status_placeholder']), gr.Button.update(value=lang_dict['download_btn']), gr.File.update(label=lang_dict['download_btn']), gr.Button.update(value=lang_dict['toggle_lang']), new_lang ] with gr.Blocks(css=""" .container { max-width: 800px; margin: auto; padding: 20px; font-family: Arial, sans-serif; } .header { text-align: center; margin-bottom: 30px; } .section { margin-bottom: 30px; padding: 15px; border: 1px solid #ddd; border-radius: 8px; background-color: #f9f9f9; } .section h3 { margin-top: 0; margin-bottom: 15px; text-align: center; } .button-row { display: flex; justify-content: center; gap: 10px; flex-wrap: wrap; } .lang-toggle { position: absolute; top: 20px; right: 20px; } @media (max-width: 600px) { .gradio-row { flex-direction: column; } } """) as demo: current_lang.render() with gr.Column(elem_classes="container"): with gr.Row(): title_md = gr.Markdown(elem_classes="header") lang_btn = gr.Button(LANGUAGE['en']['toggle_lang'], elem_classes="lang-toggle") # Step 1 with gr.Column(elem_classes="section"): step1_md = gr.Markdown() with gr.Row(): audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath") transcribe_button = gr.Button(variant="primary") proc_time_state = gr.State() # Step 2 with gr.Column(elem_classes="section"): step2_md = gr.Markdown() with gr.Row(): original_text = gr.Textbox(interactive=False, lines=5) corrected_text = gr.Textbox(interactive=True, lines=5) # Step 3 with gr.Column(elem_classes="section"): step3_md = gr.Markdown() with gr.Row(): age_input = gr.Slider(minimum=0, maximum=100, step=1, value=25) native_speaker_input = gr.Checkbox(value=True) # Step 4 with gr.Column(elem_classes="section"): step4_md = gr.Markdown() with gr.Row(elem_classes="button-row"): save_button = gr.Button(variant="primary") save_status = gr.Textbox(interactive=False) with gr.Row(elem_classes="button-row"): download_button = gr.Button() download_output = gr.File() lang_btn.click( toggle_language, inputs=current_lang, outputs=[ title_md, step1_md, audio_input, transcribe_button, step2_md, original_text, corrected_text, step3_md, age_input, native_speaker_input, step4_md, save_button, save_status, download_button, download_output, lang_btn, current_lang ] ) transcribe_button.click( transcribe_both, inputs=audio_input, outputs=[original_text, corrected_text, proc_time_state] ) save_button.click( store_correction, inputs=[original_text, corrected_text, audio_input, proc_time_state, age_input, native_speaker_input], outputs=save_status ) download_button.click( prepare_download, inputs=[audio_input, original_text, corrected_text], outputs=download_output ) demo.load( toggle_language, inputs=current_lang, outputs=[ title_md, step1_md, audio_input, transcribe_button, step2_md, original_text, corrected_text, step3_md, age_input, 
                 native_speaker_input, step4_md, save_button, save_status,
                 download_button, download_output, lang_btn, current_lang]
    )

demo.launch(share=True)
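# For reference, each call to store_correction writes one document to the
# `transcriptions` collection shaped roughly like this (values illustrative):
#
# {
#     "original_text": "...",
#     "corrected_text": "...",
#     "timestamp": "2024-01-01T12:00:00",
#     "processing_time": 3.2,
#     "audio_metadata": {"duration": 5.1, "file_size": 163840},
#     "audio_url": None,
#     "model_name": "eleferrand/xlsr53_Amis",
#     "user_info": {"native_amis_speaker": True, "age": 25}
# }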