File size: 1,721 Bytes
e3a6e38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import gradio as gr
import uuid
import os
import requests
import base64
from server import (
    on_click_metrics as server_metrics,
    process_audio as server_process_audio
)

# Directory that process_audio writes response.wav, asr.txt and llm.txt into.
# It is created at import time so the writes never hit a missing directory.
TTS_OUTPUT_DIR = "./tmp"
os.makedirs(TTS_OUTPUT_DIR, exist_ok=True)


def process_audio(audio_path):
    """Run the server pipeline on one recording and persist its outputs.

    Passes ``audio_path`` to ``server_process_audio`` and expects a mapping
    with the keys ``"audio"`` (base64-encoded bytes), ``"asr_text"`` and
    ``"llm_text"``. The decoded audio is written to ``response.wav`` and the
    two texts to ``asr.txt`` / ``llm.txt``, all inside ``TTS_OUTPUT_DIR``.

    Args:
        audio_path: Forwarded unchanged to ``server_process_audio``.

    Returns:
        A ``(summary, wav_path)`` tuple: a multi-line string showing both
        texts, and the path of the written ``response.wav``.

    Raises:
        KeyError: If the server result lacks one of the expected keys.
    """
    result = server_process_audio(audio_path)

    # Decode before opening the file so a malformed payload does not
    # truncate the previous response.
    audio_data = base64.b64decode(result["audio"])

    # NOTE(review): the file names are fixed, so every call overwrites the
    # previous call's outputs.
    wav_path = f"{TTS_OUTPUT_DIR}/response.wav"
    with open(wav_path, "wb") as f:
        f.write(audio_data)

    asr_text = result["asr_text"]
    llm_text = result["llm_text"]

    # Write the texts as UTF-8 explicitly: without ``encoding`` Python uses
    # the locale default, which can fail or mis-encode non-ASCII text.
    with open(f"{TTS_OUTPUT_DIR}/asr.txt", "w", encoding="utf-8") as f:
        f.write(asr_text)
    with open(f"{TTS_OUTPUT_DIR}/llm.txt", "w", encoding="utf-8") as f:
        f.write(llm_text)

    return f"""
asr_text: {asr_text}
llm_text: {llm_text}
""", wav_path


def on_click_metrics():
    """Return the server's metrics payload as text.

    Calls ``server_metrics`` and decodes the ``content`` attribute of its
    result as UTF-8.

    Returns:
        The decoded metrics string.
    """
    return server_metrics().content.decode("utf-8")


# Build the Gradio UI. Components are laid out in the order they are created
# inside the nested Row/Column contexts.
with gr.Blocks() as demo:
    # Top row: character image on the left, voice interaction on the right.
    with gr.Row():
        with gr.Column(scale=1):
            gr.Image(value="character.png", show_label=False)  # Show the character illustration
        with gr.Column(scale=2):
            # The microphone recording is handed to the callback as a file path.
            mic = gr.Audio(sources=["microphone"], type="filepath", label="Mic")
            text_output = gr.Textbox(label="transcription")
            audio_output = gr.Audio(label="audio", autoplay=True)  

            # When the mic value changes, run process_audio and send its two
            # return values to the text box and the audio player.
            mic.change(fn=process_audio, inputs=[mic], outputs=[text_output, audio_output])
    # Second row: on-demand metrics fetched via on_click_metrics.
    with gr.Row():
        metrics_button = gr.Button("compute metrics")
        metrics_output = gr.Textbox(label="Metrics", lines=3)
        metrics_button.click(fn=on_click_metrics, inputs=[], outputs=[metrics_output])

    # NOTE(review): this textbox is not wired to any event in the visible code.
    with gr.Row():
        log = gr.Textbox(label="logs", lines=5)

# Launch at import time with share=True, which asks Gradio for a share link.
demo.launch(share=True)
# Alternative without a share link:
# demo.launch()