File size: 7,558 Bytes
92dd616
 
 
6bdc489
b3067c5
92dd616
6bdc489
92dd616
 
 
 
 
 
 
 
 
 
 
 
 
 
a609646
92dd616
 
 
 
4792427
 
92dd616
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4792427
92dd616
 
 
 
 
4792427
92dd616
 
 
 
 
 
 
 
 
 
4792427
92dd616
 
 
 
 
 
 
 
4792427
92dd616
 
4792427
92dd616
 
4792427
92dd616
 
 
4792427
 
 
 
 
 
 
92dd616
4792427
 
92dd616
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import os, asyncio, json, tempfile, websockets, pdfplumber
import gradio as gr
import openai
from dotenv import load_dotenv

# ─── 0. Initialization ──────────────────────────────────────────
# Load variables from a local .env file into the process environment.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    # Fail fast at import time instead of on the first API call.
    raise RuntimeError("OPENAI_API_KEY 가 .env 에 없습니다!")

# Language names offered in the source/target dropdowns.
LANG = ["Korean", "English", "Japanese", "Chinese",
        "Thai", "Russian", "Vietnamese", "Spanish", "French"]
# TTS voice per language: "nova" for Korean/Japanese/Chinese, "alloy" otherwise.
VOICE = {name: ("nova" if name in ("Korean", "Japanese", "Chinese") else "alloy")
         for name in LANG}
# Fixed target languages of the four-language realtime tab.
FOUR = ["English", "Chinese", "Thai", "Russian"]
# WebSocket endpoint used for streaming speech-to-text.
# NOTE(review): confirm this endpoint exists and matches the message format
# used by ws_stt_generator.
WS_URL = "wss://api.openai.com/v1/audio/transcriptions/stream"

# ─── 1. Shared GPT translation / TTS ────────────────────────────
async def gpt_translate(text, src, tgt):
    """Translate *text* from language *src* into language *tgt*.

    Sends one chat-completion request to ``gpt-3.5-turbo``.

    Args:
        text: Text to translate.
        src: Name of the source language (e.g. "Korean").
        tgt: Name of the target language (e.g. "English").

    Returns:
        The model's reply with surrounding whitespace stripped, or ``""``
        when *text* is empty/whitespace-only or the reply has no content.
    """
    if not text or not text.strip():
        # Nothing to translate: skip the API round trip.
        return ""
    # The context manager closes the client's HTTP resources on exit.
    async with openai.AsyncClient() as client:
        rsp = await client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system",
                 "content": f"Translate {src} → {tgt}. Return only the text."},
                {"role": "user", "content": text},
            ],
            temperature=0.3,
            max_tokens=2048,
        )
    # message.content can be None; guard before stripping.
    return (rsp.choices[0].message.content or "").strip()

async def gpt_tts(text, lang):
    """Synthesize *text* with the ``tts-1`` model and save it as an MP3 file.

    Args:
        text: Text to speak; only the first 4096 characters are sent.
        lang: Language name used to look up a voice in ``VOICE``. Languages
            missing from the table fall back to "alloy".

    Returns:
        Path of a temporary ``.mp3`` file. The file is created with
        ``delete=False``, so removing it is left to the caller.
    """
    # The context manager closes the client's HTTP resources on exit.
    async with openai.AsyncClient() as client:
        rsp = await client.audio.speech.create(
            model="tts-1", voice=VOICE.get(lang, "alloy"), input=text[:4096])
        audio = rsp.content
    # `with` closes the handle even if the write fails.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        tmp.write(audio)
    return tmp.name

# ─── 2. PDF translation ─────────────────────────────────────────
def translate_pdf(file, src, tgt):
    """Extract text from the first five pages of a PDF and translate it.

    Args:
        file: Value of the Gradio ``File`` component. Depending on the Gradio
            version/configuration this is a filesystem path (str or
            path-like) or an object exposing the path as ``.name``. Falsy
            when nothing was uploaded.
        src: Name of the source language.
        tgt: Name of the target language.

    Returns:
        ``(extracted_text, translated_text)``. When no file was given or no
        text could be extracted, ``(warning_message, "")``.
    """
    if not file:
        return "⚠️ PDF 업로드 필요", ""
    # Accept both a plain path and a tempfile-like wrapper.
    path = file if isinstance(file, (str, os.PathLike)) else file.name
    with pdfplumber.open(path) as pdf:
        # extract_text() may return None for a page; treat that as empty.
        text = "\n".join(p.extract_text() or "" for p in pdf.pages[:5]).strip()
    if not text:
        return "⚠️ 텍스트 추출 실패", ""
    return text, asyncio.run(gpt_translate(text, src, tgt))

# ─── 3. WebSocket STT helper ────────────────────────────────────
async def ws_stt_generator(audio_queue: asyncio.Queue):
    """Stream queued audio chunks to the STT WebSocket and yield transcripts.

    A background task takes items from *audio_queue* and sends each one over
    the socket. ``None`` in the queue is the end-of-stream flag: the task
    sends ``{"terminate": true}`` and stops. Meanwhile every message received
    from the server is parsed as JSON.

    Args:
        audio_queue: Queue of audio chunks to send; ``None`` ends the stream.

    Yields:
        ``(text, final)`` for each received message that carries a "text"
        field; ``final`` defaults to ``False`` when the field is absent.
    """
    # NOTE(review): recent `websockets` releases name this keyword
    # `additional_headers`; confirm against the installed version.
    async with websockets.connect(
        WS_URL,
        extra_headers={"Authorization": f"Bearer {openai.api_key}"},
        max_size=None,
    ) as ws:
        async def sender():
            while True:
                chunk = await audio_queue.get()
                if chunk is None:  # end-of-stream flag
                    await ws.send(json.dumps({"terminate": True}))
                    break
                await ws.send(chunk)

        # Keep a reference: the event loop holds tasks only weakly, so an
        # unreferenced task may be garbage-collected before it finishes.
        send_task = asyncio.create_task(sender())
        try:
            async for msg in ws:
                data = json.loads(msg)
                text = data.get("text") if isinstance(data, dict) else None
                if text is None:
                    # Not a transcript payload; skip it instead of crashing
                    # the whole stream with a KeyError.
                    continue
                yield text, data.get("final", False)
        finally:
            # Stop forwarding audio once the receive loop is over.
            send_task.cancel()

# ─── 4. Gradio stream handlers ──────────────────────────────────
async def realtime_single(mic, src, tgt, state):
    """Gradio stream handler: live transcript plus one live translation.

    Each call enqueues one microphone chunk. On the first chunk of a session
    a background task is started that consumes ``ws_stt_generator`` and, for
    every transcript piece, appends it to ``state["orig"]`` and its
    translation to ``state["trans"]``.

    Args:
        mic: Latest audio chunk, or ``None`` when the stream has ended.
            Either raw bytes or a ``(sample_rate, array)`` tuple, in which
            case the array's raw bytes are sent.
        src: Name of the spoken language.
        tgt: Name of the translation language.
        state: Session dict ``{"queue", "task", "orig", "trans"}``, or
            ``None`` on the very first call.

    Returns:
        ``(orig, trans, state)`` with the text accumulated so far.
    """
    if state is None:
        state = {"queue": asyncio.Queue(), "task": None, "orig": "", "trans": ""}

    task = state["task"]
    if task is not None and task.done():
        # The previous session is over; use a fresh queue and task so a new
        # recording is not fed into a queue nobody reads any more.
        state["queue"] = asyncio.Queue()
        state["task"] = task = None

    if mic is None:  # stream ended
        if task is not None:
            # Only signal termination to a running session; a stray flag in
            # an idle queue would end the next session immediately.
            await state["queue"].put(None)
        return state["orig"], state["trans"], state

    # First chunk of a session: start the WebSocket consumer task.
    if task is None:
        queue = state["queue"]

        async def run_ws():
            # NOTE(review): the "final" flag is ignored, so every received
            # piece is appended; confirm the server does not resend partials.
            async for text, _final in ws_stt_generator(queue):
                state["orig"] += (" " if state["orig"] else "") + text
                add = await gpt_translate(text, src, tgt)
                state["trans"] += (" " if state["trans"] else "") + add

        state["task"] = asyncio.create_task(run_ws())

    # NOTE(review): a tuple is assumed to be Gradio's (sample_rate, ndarray)
    # format; confirm the server accepts the raw sample bytes.
    chunk = mic[1].tobytes() if isinstance(mic, tuple) else mic
    await state["queue"].put(chunk)
    return state["orig"], state["trans"], state

async def realtime_four(mic, src, state):
    """Gradio stream handler: live transcript plus four live translations.

    Each call enqueues one microphone chunk. On the first chunk of a session
    a background task is started that consumes ``ws_stt_generator`` and, for
    every transcript piece, appends it to ``state["orig"]`` and its
    translation into each language of ``FOUR`` to ``state[<language>]``.

    Args:
        mic: Latest audio chunk, or ``None`` when the stream has ended.
            Either raw bytes or a ``(sample_rate, array)`` tuple, in which
            case the array's raw bytes are sent.
        src: Name of the spoken language.
        state: Session dict with keys "queue", "task", "orig", "English",
            "Chinese", "Thai" and "Russian", or ``None`` on the first call.

    Returns:
        ``(orig, English, Chinese, Thai, Russian, state)``.
    """
    if state is None:
        state = {"queue": asyncio.Queue(), "task": None,
                 "orig": "", "English": "", "Chinese": "", "Thai": "", "Russian": ""}

    def snapshot():
        # Output tuple in the order the UI wires its textboxes.
        return tuple(state[k] for k in
                     ["orig", "English", "Chinese", "Thai", "Russian"]) + (state,)

    task = state["task"]
    if task is not None and task.done():
        # The previous session is over; use a fresh queue and task so a new
        # recording is not fed into a queue nobody reads any more.
        state["queue"] = asyncio.Queue()
        state["task"] = task = None

    if mic is None:  # stream ended
        if task is not None:
            # Only signal termination to a running session; a stray flag in
            # an idle queue would end the next session immediately.
            await state["queue"].put(None)
        return snapshot()

    if task is None:
        queue = state["queue"]

        async def run_ws():
            async for text, _ in ws_stt_generator(queue):
                state["orig"] += (" " if state["orig"] else "") + text
                # The translations are independent, so request them concurrently.
                results = await asyncio.gather(
                    *(gpt_translate(text, src, lang) for lang in FOUR))
                for lang, add in zip(FOUR, results):
                    state[lang] += (" " if state[lang] else "") + add

        state["task"] = asyncio.create_task(run_ws())

    # NOTE(review): a tuple is assumed to be Gradio's (sample_rate, ndarray)
    # format; confirm the server accepts the raw sample bytes.
    chunk = mic[1].tobytes() if isinstance(mic, tuple) else mic
    await state["queue"].put(chunk)
    return snapshot()

# ─── 5. UI ──────────────────────────────────────────────────────
async def _translate_audio(audio_path, src, tgt):
    """Transcribe an audio file, translate the transcript and speak the result.

    Args:
        audio_path: Path of the recorded/uploaded audio file, or a falsy
            value when nothing was provided.
        src: Name of the spoken language.
        tgt: Name of the translation language.

    Returns:
        ``(transcript, translation, tts_path)`` for the three outputs of the
        audio tab; ``(warning_message, "", None)`` when there is no audio or
        no speech was recognized.
    """
    if not audio_path:
        return "⚠️ 오디오 입력 필요", "", None
    async with openai.AsyncClient() as client:
        with open(audio_path, "rb") as fh:
            rsp = await client.audio.transcriptions.create(
                model="whisper-1", file=fh)
    text = (rsp.text or "").strip()
    if not text:
        return "⚠️ 음성 인식 실패", "", None
    translated = await gpt_translate(text, src, tgt)
    # Skip speech synthesis when there is nothing to speak.
    tts_path = await gpt_tts(translated, tgt) if translated else None
    return text, translated, tts_path


with gr.Blocks(title="SMARTok Demo") as demo:
    with gr.Tabs():
        # Tab 1 – audio translation
        with gr.TabItem("🎙️ 오디오"):
            src1 = gr.Dropdown(LANG, value="Korean", label="입력")
            tgt1 = gr.Dropdown(LANG, value="English", label="출력")
            aud1 = gr.Audio(sources=["microphone", "upload"], type="filepath")
            btn1 = gr.Button("번역")
            o1 = gr.Textbox(label="원문")
            t1 = gr.Textbox(label="번역")
            a1 = gr.Audio(label="TTS", type="filepath", autoplay=True)
            # The handler must return one value per output component; the
            # former placeholder returned a function object instead.
            btn1.click(_translate_audio, [aud1, src1, tgt1], [o1, t1, a1])

        # Tab 2 – PDF translation
        with gr.TabItem("📄 PDF"):
            src2 = gr.Dropdown(LANG, value="Korean", label="입력")
            tgt2 = gr.Dropdown(LANG, value="English", label="출력")
            pdf = gr.File(file_types=[".pdf"])
            btn2 = gr.Button("번역")
            o2 = gr.Textbox(label="추출 원문", lines=15)
            t2 = gr.Textbox(label="번역 결과", lines=15)
            btn2.click(translate_pdf, [pdf, src2, tgt2], [o2, t2])

        # Tab 3 – realtime, one target language
        with gr.TabItem("⏱️ 실시간 1"):
            src3 = gr.Dropdown(LANG, value="Korean", label="입력")
            tgt3 = gr.Dropdown(LANG, value="English", label="출력")
            mic3 = gr.Audio(sources=["microphone"], streaming=True)
            o3 = gr.Textbox(label="원문(실시간)", lines=8)
            t3 = gr.Textbox(label="번역(실시간)", lines=8)
            st3 = gr.State()
            # The mic component must be the first input: realtime_single
            # takes (mic, src, tgt, state).
            mic3.stream(realtime_single, inputs=[mic3, src3, tgt3, st3],
                        outputs=[o3, t3, st3])

        # Tab 4 – realtime, four target languages
        # NOTE(review): the original globe emoji was not recoverable from the
        # mis-encoded source; 🌏 is a stand-in.
        with gr.TabItem("🌏 실시간 4"):
            src4 = gr.Dropdown(LANG, value="Korean", label="입력")
            mic4 = gr.Audio(sources=["microphone"], streaming=True)
            o4 = gr.Textbox(label="원문", lines=8)
            e4 = gr.Textbox(label="English", lines=8)
            c4 = gr.Textbox(label="Chinese(简体)", lines=8)
            th4 = gr.Textbox(label="Thai", lines=8)
            r4 = gr.Textbox(label="Russian", lines=8)
            st4 = gr.State()
            # The mic component must be the first input: realtime_four
            # takes (mic, src, state).
            mic4.stream(realtime_four, inputs=[mic4, src4, st4],
                        outputs=[o4, e4, c4, th4, r4, st4])

# Start the Gradio server on all interfaces, port 7860, with debug enabled.
demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
    debug=True,
)