import os, asyncio, json, tempfile, websockets, pdfplumber
import gradio as gr
import openai
from dotenv import load_dotenv
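# Assumed runtime dependencies (inferred from the imports above): openai, gradio,
# websockets, pdfplumber, python-dotenv — pin exact versions in requirements.txt as needed.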
# ─── 0. Initialization ──────────────────────────────────────
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError("OPENAI_API_KEY is missing from .env!")
LANG = ["Korean","English","Japanese","Chinese",
"Thai","Russian","Vietnamese","Spanish","French"]
VOICE = {l: ("nova" if l in ["Korean","Japanese","Chinese"] else "alloy")
for l in LANG}
FOUR = ["English","Chinese","Thai","Russian"]
WS_URL = "wss://api.openai.com/v1/audio/transcriptions/stream"
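# NOTE: streaming transcription endpoint as assumed by this demo; verify it against
# the provider's current realtime / streaming STT API before relying on it.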
# ─── 1. Shared GPT translation / TTS ────────────────────────
async def gpt_translate(text, src, tgt):
    rsp = await openai.AsyncClient().chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "system",
                   "content": f"Translate {src} → {tgt}. Return only the text."},
                  {"role": "user", "content": text}],
        temperature=0.3, max_tokens=2048)
    return rsp.choices[0].message.content.strip()
async def gpt_tts(text, lang):
    rsp = await openai.AsyncClient().audio.speech.create(
        model="tts-1", voice=VOICE[lang], input=text[:4096])
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    tmp.write(rsp.content)
    tmp.close()
    return tmp.name
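# A minimal sketch (not wired into the UI below) of the audio tab's intended flow,
# assuming the OpenAI Whisper transcription API (model "whisper-1"): transcribe the
# recorded/uploaded file, translate the text, then synthesize TTS for the result.
async def translate_audio(path, src, tgt):
    with open(path, "rb") as f:
        stt = await openai.AsyncClient().audio.transcriptions.create(
            model="whisper-1", file=f)
    original = stt.text.strip()
    translated = await gpt_translate(original, src, tgt)
    return original, translated, await gpt_tts(translated, tgt)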
# ─── 2. PDF translation ─────────────────────────────────────
def translate_pdf(file, src, tgt):
    if not file:
        return "⚠️ Please upload a PDF file", ""
    with pdfplumber.open(file.name) as pdf:
        # Only the first five pages are extracted and translated
        text = "\n".join(p.extract_text() or "" for p in pdf.pages[:5]).strip()
    if not text:
        return "⚠️ Text extraction failed", ""
    return text, asyncio.run(gpt_translate(text, src, tgt))
# ─── 3. WebSocket STT helper ────────────────────────────────
async def ws_stt_generator(audio_queue: asyncio.Queue):
"""
๋ฐฑ๊ทธ๋ผ์ด๋ ํ์คํฌ:
- audio_queue ๋ก๋ถํฐ chunk(bytes) ์์
- WS ๋ก ์ ์ก, ์๋ฒ event ์์ โ yield (partial text, final?)
"""
async with websockets.connect(
WS_URL,
extra_headers={"Authorization": f"Bearer {openai.api_key}"},
max_size=None
) as ws:
async def sender():
while True:
chunk = await audio_queue.get()
if chunk is None: # ์ข
๋ฃ ํ๋๊ทธ
await ws.send(json.dumps({"terminate": True}))
break
await ws.send(chunk)
asyncio.create_task(sender())
async for msg in ws:
data = json.loads(msg)
yield data["text"], data.get("final", False)
# ─── 4. Gradio streaming handlers ───────────────────────────
async def realtime_single(mic, src, tgt, state):
"""
mic: bytes chunk (Gradio ์๋)
state: {"queue": Queue, "task": Task, "orig": str, "trans": str}
"""
if state is None:
state = {"queue": asyncio.Queue(), "task": None, "orig":"", "trans":""}
if mic is None: # ์คํธ๋ฆผ ์ข
๋ฃ
await state["queue"].put(None)
return state["orig"], state["trans"], state
# ์ฒซ ํธ์ถ์ด๋ฉด WS ํ์คํฌ ์์
if state["task"] is None:
async def run_ws():
async for text, final in ws_stt_generator(state["queue"]):
state["orig"] += (" " if state["orig"] else "") + text
add = await gpt_translate(text, src, tgt)
state["trans"] += (" " if state["trans"] else "") + add
state["task"] = asyncio.create_task(run_ws())
# ๋ง์ดํฌ chunk enqueue
await state["queue"].put(mic)
return state["orig"], state["trans"], state
async def realtime_four(mic, src, state):
    if state is None:
        state = {"queue": asyncio.Queue(), "task": None,
                 "orig": "", "English": "", "Chinese": "", "Thai": "", "Russian": ""}

    if mic is None:
        await state["queue"].put(None)
        return tuple(state[k] for k in
                     ["orig", "English", "Chinese", "Thai", "Russian"]) + (state,)

    if state["task"] is None:
        async def run_ws():
            async for text, _ in ws_stt_generator(state["queue"]):
                state["orig"] += (" " + text)
                for lang in FOUR:
                    state[lang] += (" " + await gpt_translate(text, src, lang))
        state["task"] = asyncio.create_task(run_ws())

    await state["queue"].put(mic)
    return tuple(state[k] for k in
                 ["orig", "English", "Chinese", "Thai", "Russian"]) + (state,)
# ─── 5. UI ──────────────────────────────────────────────────
with gr.Blocks(title="SMARTok Demo") as demo:
    with gr.Tabs():

        # Tab 1 – audio translation
        with gr.TabItem("🎙️ Audio"):
            src1 = gr.Dropdown(LANG, value="Korean", label="Input")
            tgt1 = gr.Dropdown(LANG, value="English", label="Output")
            aud1 = gr.Audio(sources=["microphone", "upload"], type="filepath")
            btn1 = gr.Button("Translate")
            o1 = gr.Textbox(label="Original")
            t1 = gr.Textbox(label="Translation")
            a1 = gr.Audio(label="TTS", type="filepath", autoplay=True)
            # Dummy handler kept as a placeholder (the original wiring was a stub);
            # translate_audio above sketches what this click could call instead.
            btn1.click(lambda a, s, t: ("", "", None),
                       [aud1, src1, tgt1], [o1, t1, a1])
        # Tab 2 – PDF translation
        with gr.TabItem("📄 PDF"):
            src2 = gr.Dropdown(LANG, value="Korean", label="Input")
            tgt2 = gr.Dropdown(LANG, value="English", label="Output")
            pdf = gr.File(file_types=[".pdf"])
            btn2 = gr.Button("Translate")
            o2 = gr.Textbox(label="Extracted text", lines=15)
            t2 = gr.Textbox(label="Translation result", lines=15)
            btn2.click(translate_pdf, [pdf, src2, tgt2], [o2, t2])
        # Tab 3 – real-time, single target language
        with gr.TabItem("⏱️ Real-time 1"):
            src3 = gr.Dropdown(LANG, value="Korean", label="Input")
            tgt3 = gr.Dropdown(LANG, value="English", label="Output")
            mic3 = gr.Audio(sources=["microphone"], streaming=True)
            o3 = gr.Textbox(label="Original (live)", lines=8)
            t3 = gr.Textbox(label="Translation (live)", lines=8)
            st3 = gr.State()
            # The streaming component itself is passed as an input so the handler
            # receives the audio chunk as its first argument.
            mic3.stream(realtime_single, inputs=[mic3, src3, tgt3, st3],
                        outputs=[o3, t3, st3])
        # Tab 4 – real-time, four target languages
        with gr.TabItem("🌐 Real-time 4"):
            src4 = gr.Dropdown(LANG, value="Korean", label="Input")
            mic4 = gr.Audio(sources=["microphone"], streaming=True)
            o4 = gr.Textbox(label="Original", lines=8)
            e4 = gr.Textbox(label="English", lines=8)
            c4 = gr.Textbox(label="Chinese (Simplified)", lines=8)
            th4 = gr.Textbox(label="Thai", lines=8)
            r4 = gr.Textbox(label="Russian", lines=8)
            st4 = gr.State()
            mic4.stream(realtime_four, inputs=[mic4, src4, st4],
                        outputs=[o4, e4, c4, th4, r4, st4])
demo.launch(server_name="0.0.0.0",server_port=7860,debug=True)