Spaces:

DeepLearning101
/

ASR-ANNOTATION

Sleeping

File size: 5,038 Bytes

a4e56dc
 
 
 
 
7094a64
 
 
a4e56dc
10c215a
a4e56dc
 
 
824f441
a4e56dc
 
 
824f441
a4e56dc
 
824f441
a4e56dc
9f26f14
967a3f6
e67d60b
e80b2c1
26b8c3d
 
016a16c
 
26b8c3d
 
 
 
 
016a16c
 
 
 
e67d60b
90d7852
7f23ec0
 
 
 
90d7852
a4e56dc
af25a31
 
1e25bc2
 
 
 
 
7094a64

import torch
import gradio as gr
import whisper
import os

# Sanity check that the intended whisper package was imported: list what
# the module exposes.
print("Whisper module contents:", dir(whisper))

# Load the Whisper "large-v2" checkpoint once at import time, on the GPU
# when CUDA is available and on the CPU otherwise.
model = whisper.load_model(
    "large-v2",
    device="cpu" if not torch.cuda.is_available() else "cuda",
)

def transcribe(audio_file):
    """Transcribe an audio file with the Whisper model and save the text.

    The transcript is written to ``txt/<audio base name>_transcript.txt``
    relative to the current working directory; the ``txt`` directory is
    created on demand.

    Args:
        audio_file: Path of the audio file to transcribe. ``None`` or an
            empty string is treated as "no input" (this presumably happens
            when the button is clicked before any audio is uploaded).

    Returns:
        A ``(text, transcript_file_path)`` tuple. When no audio was given,
        ``("", None)`` is returned and nothing is transcribed or written.
    """
    # Bail out before touching the model or the filesystem: passing None on
    # would crash inside model.transcribe / os.path.basename.
    if not audio_file:
        return "", None

    result = model.transcribe(audio_file)
    text = result["text"]

    base_name = os.path.splitext(os.path.basename(audio_file))[0]
    os.makedirs("txt", exist_ok=True)
    transcript_file_path = f"txt/{base_name}_transcript.txt"
    # Explicit UTF-8: the platform default encoding may be unable to
    # represent non-ASCII transcripts (e.g. Chinese), which would raise
    # UnicodeEncodeError or produce a mis-encoded download.
    with open(transcript_file_path, "w", encoding="utf-8") as file:
        file.write(text)
    return text, transcript_file_path

# Static HTML fragments shown at the top of the page; each one is passed
# verbatim to gr.HTML when the UI is built. They are user-facing text, so
# their content is part of the app's behavior.

# Main page heading.
TITLE = """<h1>ASR 語音語料辨識修正工具</h1>"""
# Sub-heading: a single row of links to the author's sites.
SUBTITLE = """<h2><a href='https://deep-learning-101.github.io' target='_blank'>deep-learning-101.github.io</a> | <a href='https://www.twman.org/AI' target='_blank'> AI </a> | <a href='https://www.twman.org' target='_blank'>TonTon Huang Ph.D.</a> | <a href='https://blog.twman.org/p/deeplearning101.html' target='_blank'>手把手帶你一起踩AI坑</a><br></h2>"""
# Link list: community pages followed by blog articles, one entry per line,
# separated with <br>.
LINKS = """
<a href='https://github.com/Deep-Learning-101' target='_blank'>Deep Learning 101 Github</a> | <a href='http://deeplearning101.twman.org' target='_blank'>Deep Learning 101</a> | <a href='https://www.facebook.com/groups/525579498272187/' target='_blank'>台灣人工智慧社團 FB</a> | <a href='https://www.youtube.com/c/DeepLearning101' target='_blank'>YouTube</a><br>
<a href='https://blog.twman.org/2025/04/AI-Robot.html' target='_blank'>AI 陪伴機器人：2025 趨勢分析技術突破、市場潛力與未來展望</a> | <a href='https://blog.twman.org/2025/04/FinanceGenAI.html' target='_blank'>金融科技新浪潮：生成式 AI (GenAI) 應用場景、效益與導入挑戰</a><br>
<a href='https://blog.twman.org/2025/03/AIAgent.html' target='_blank'>避開 AI Agent 開發陷阱：常見問題、挑戰與解決方案 (實戰經驗)</a>：探討多種 AI 代理人工具的應用經驗與挑戰，分享實用經驗與工具推薦。<br>
<a href='https://blog.twman.org/2024/08/LLM.html' target='_blank'>白話文手把手帶你科普 GenAI</a>：淺顯介紹生成式人工智慧核心概念，強調硬體資源和數據的重要性。<br>
<a href='https://blog.twman.org/2024/09/LLM.html' target='_blank'>大型語言模型直接就打完收工？</a>：回顧 LLM 領域探索歷程，討論硬體升級對 AI 開發的重要性。<br>
<a href='https://blog.twman.org/2024/07/RAG.html' target='_blank'>檢索增強生成不是萬靈丹：挑戰與優化技巧</a>：探討 RAG 技術應用與挑戰，提供實用經驗分享和工具建議。<br>
<a href='https://blog.twman.org/2024/02/LLM.html' target='_blank'>大型語言模型 (LLM) 入門完整指南：原理、應用與未來 (2025 版)</a>：探討多種 LLM 工具的應用與挑戰，強調硬體資源的重要性。<br>
<a href='https://blog.twman.org/2023/04/GPT.html' target='_blank'>解析探索大型語言模型：模型發展歷史、訓練及微調技術的 VRAM 估算</a>：探討 LLM 的發展與應用，強調硬體資源在開發中的關鍵作用。。<br>
<a href='https://blog.twman.org/2024/11/diffusion.html' target='_blank'>Diffusion Model 完全解析：從原理、應用到實作 (AI 圖像生成)</a>：深入探討影像生成與分割技術的應用，強調硬體資源的重要性。<br>
<a href='https://blog.twman.org/2024/02/asr-tts.html' target='_blank'>ASR/TTS 開發避坑指南：語音辨識與合成的常見挑戰與對策</a>：探討 ASR 和 TTS 技術應用中的問題，強調數據質量的重要性。<br>
<a href='https://blog.twman.org/2021/04/NLP.html' target='_blank'>那些自然語言處理 (Natural Language Processing, NLP) 踩的坑</a>：分享 NLP 領域的實踐經驗，強調數據質量對模型效果的影響。<br>
<a href='https://blog.twman.org/2021/04/ASR.html' target='_blank'>那些語音處理 (Speech Processing) 踩的坑</a>：分享語音處理領域的實務經驗，強調資料品質對模型效果的影響。<br>
<a href='https://blog.twman.org/2023/07/wsl.html' target='_blank'>用PPOCRLabel來幫PaddleOCR做OCR的微調和標註</a><br>
<a href='https://blog.twman.org/2023/07/HugIE.html' target='_blank'>基於機器閱讀理解和指令微調的統一信息抽取框架之診斷書醫囑資訊擷取分析</a><br>
"""

# Build the page: three HTML header fragments, an upload/submit row, then the
# transcript text area and the downloadable transcript file.
with gr.Blocks(
    css=(
        ".container { max-width: 800px; margin: auto; } "
        ".gradio-app { background-color: #f0f0f0; } "
        "button { background-color: #4CAF50; color: white; }"
    )
) as demo:
    gr.HTML(value=TITLE)
    gr.HTML(value=SUBTITLE)
    gr.HTML(value=LINKS)

    with gr.Row():
        # type="filepath" makes the Audio component hand the callback a
        # path on disk, which is what transcribe() expects.
        audio_input = gr.Audio(type="filepath", label="上載你的音頻")
        submit_button = gr.Button(value="語音識別")
    output_text = gr.TextArea(label="識別結果")
    download_link = gr.File(label="下載轉錄文件")

    # transcribe() returns (text, transcript path); route them to the text
    # area and the file download component respectively.
    submit_button.click(
        transcribe,
        audio_input,
        [output_text, download_link],
    )

# Start the web server for the app.
demo.launch()