# voice-trans / app.py
# Hugging Face file-viewer header (not part of the program):
# openfree's picture · Update app.py · 92dd616 verified · raw · history blame · 7.56 kB
import os, asyncio, json, tempfile, websockets, pdfplumber
import gradio as gr
import openai
from dotenv import load_dotenv
# ─── 0. Initialization ───────────────────────────────────────────────
# Read environment variables (including OPENAI_API_KEY) via python-dotenv
# before the key is looked up.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    # Fail fast at import time: every feature below needs the API key.
    # Message (Korean): "OPENAI_API_KEY is missing from .env!"
    raise RuntimeError("OPENAI_API_KEY 가 .env 에 없습니다!")

# Languages offered in every language dropdown of the UI.
LANG = [
    "Korean", "English", "Japanese", "Chinese",
    "Thai", "Russian", "Vietnamese", "Spanish", "French",
]

# TTS voice per language: "nova" for Korean/Japanese/Chinese, "alloy" otherwise.
VOICE = {
    lang: ("nova" if lang in {"Korean", "Japanese", "Chinese"} else "alloy")
    for lang in LANG
}

# Fixed target languages of the four-language realtime tab.
FOUR = ["English", "Chinese", "Thai", "Russian"]

# WebSocket endpoint that ws_stt_generator connects to.
WS_URL = "wss://api.openai.com/v1/audio/transcriptions/stream"
# ─── 1. Shared GPT translation / TTS ─────────────────────────────────
async def gpt_translate(text, src, tgt):
    """Translate *text* from *src* to *tgt* with the chat completions API.

    Args:
        text: Source text to translate.
        src: Source language name, interpolated into the system prompt.
        tgt: Target language name, interpolated into the system prompt.

    Returns:
        The model's reply with surrounding whitespace stripped. An empty
        string when *text* is empty/blank or the reply carries no content.
    """
    if not text or not text.strip():
        # Nothing to translate; skip the network round trip.
        return ""

    # Use the client as a context manager so the connections opened for
    # this call are released instead of lingering until garbage collection.
    async with openai.AsyncClient() as client:
        rsp = await client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                    "role": "system",
                    "content": f"Translate {src} → {tgt}. Return only the text.",
                },
                {"role": "user", "content": text},
            ],
            temperature=0.3,
            max_tokens=2048,
        )

    # `content` can be None; calling .strip() on it would raise.
    content = rsp.choices[0].message.content
    return (content or "").strip()
async def gpt_tts(text, lang):
    """Synthesize *text* as speech and return the path of an mp3 file.

    Args:
        text: Text to speak; only the first 4096 characters are sent.
        lang: Language name used to pick the voice from VOICE. Languages
            missing from VOICE fall back to "alloy".

    Returns:
        Path of a temporary ``.mp3`` file. The file is created with
        ``delete=False``, so removing it is the caller's responsibility.
    """
    voice = VOICE.get(lang, "alloy")

    # Context-managed client: connections are released when the call is done.
    async with openai.AsyncClient() as client:
        rsp = await client.audio.speech.create(
            model="tts-1", voice=voice, input=text[:4096]
        )
        audio = rsp.content

    # `with` guarantees the handle is closed even if the write fails.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        tmp.write(audio)
    return tmp.name
# ─── 2. PDF translation ──────────────────────────────────────────────
def translate_pdf(file, src, tgt, max_pages=5):
    """Extract text from the first pages of a PDF and translate it.

    Args:
        file: Uploaded PDF, either an object exposing the path as ``.name``
            or the path itself.
        src: Source language name.
        tgt: Target language name.
        max_pages: Number of leading pages to read (default 5).

    Returns:
        ``(extracted_text, translated_text)``. When no file is given or no
        text can be extracted, ``(warning_message, "")`` instead.
    """
    if not file:
        # Korean: "PDF upload required"
        return "⚠️ PDF 업로드 필요", ""

    # Accept both file-like uploads (with .name) and plain path strings.
    path = getattr(file, "name", file)
    with pdfplumber.open(path) as pdf:
        # extract_text() may return None for pages without a text layer.
        text = "\n".join(
            page.extract_text() or "" for page in pdf.pages[:max_pages]
        ).strip()

    if not text:
        # Korean: "text extraction failed"
        return "⚠️ 텍스트 추출 실패", ""

    # This handler is synchronous, so drive the async translation here.
    return text, asyncio.run(gpt_translate(text, src, tgt))
# ─── 3. WebSocket STT helper ─────────────────────────────────────────
async def ws_stt_generator(audio_queue: asyncio.Queue):
    """Stream audio chunks to the STT WebSocket and yield transcripts.

    A background sender task forwards every item taken from *audio_queue*
    to the socket; ``None`` in the queue is the termination flag and makes
    the sender emit ``{"terminate": true}`` and stop. Meanwhile each JSON
    message received from the server is decoded and yielded.

    Args:
        audio_queue: Queue of audio chunks, terminated by ``None``.

    Yields:
        ``(text, final)`` tuples, where ``final`` is the message's "final"
        field (``False`` when absent). Messages without a "text" field are
        skipped.
    """
    # NOTE(review): newer `websockets` releases name this keyword
    # `additional_headers` instead of `extra_headers` — confirm against the
    # installed version.
    async with websockets.connect(
        WS_URL,
        extra_headers={"Authorization": f"Bearer {openai.api_key}"},
        max_size=None,
    ) as ws:

        async def sender():
            while True:
                chunk = await audio_queue.get()
                if chunk is None:  # termination flag
                    await ws.send(json.dumps({"terminate": True}))
                    break
                await ws.send(chunk)

        # Hold a reference: the event loop keeps only weak references to
        # tasks, so an unreferenced task may be collected mid-execution.
        send_task = asyncio.create_task(sender())
        try:
            async for msg in ws:
                data = json.loads(msg)
                text = data.get("text")
                if text is None:
                    # Not a transcript message; nothing to yield.
                    continue
                yield text, data.get("final", False)
        finally:
            # Do not leave the sender blocked on the queue once the socket
            # is done; also retrieve its outcome so nothing is left pending.
            send_task.cancel()
            await asyncio.gather(send_task, return_exceptions=True)
# ─── 4. Gradio stream handlers ───────────────────────────────────────
async def realtime_single(mic, src, tgt, state):
    """Streaming handler: live transcript plus one running translation.

    Each call enqueues one microphone chunk. A background task, started
    on the first chunk, feeds the queue to ws_stt_generator and appends
    every recognized text and its translation to the state.

    Args:
        mic: Audio chunk, or ``None`` to signal the end of the stream.
            NOTE(review): the original docstring says "bytes"; depending on
            the gr.Audio ``type`` Gradio may deliver another representation
            — confirm what the WebSocket endpoint expects.
        src: Source language name.
        tgt: Target language name.
        state: Per-session dict
            ``{"queue": Queue, "task": Task, "orig": str, "trans": str,
            "src": str, "tgt": str}`` or ``None`` on the first call.

    Returns:
        ``(original_text, translated_text, state)``.

    Raises:
        RuntimeError: If the background speech-to-text task has failed.
    """
    if state is None:
        state = {"queue": asyncio.Queue(), "task": None, "orig": "", "trans": ""}

    # Keep the latest language pair in the state so that changing a
    # dropdown mid-stream affects the next recognized utterance.
    state["src"], state["tgt"] = src, tgt

    if mic is None:  # end of stream
        await state["queue"].put(None)
        return state["orig"], state["trans"], state

    task = state["task"]
    if task is not None and task.done():
        failure = None if task.cancelled() else task.exception()
        if failure is not None:
            # Surface the failure instead of silently dropping audio.
            raise RuntimeError("speech-to-text stream failed") from failure
        # Previous session finished; start the next one on a clean queue.
        state["queue"] = asyncio.Queue()
        task = None

    # First chunk of a session: start the WebSocket task.
    if task is None:
        queue = state["queue"]

        async def run_ws():
            async for text, _final in ws_stt_generator(queue):
                state["orig"] += (" " if state["orig"] else "") + text
                add = await gpt_translate(text, state["src"], state["tgt"])
                state["trans"] += (" " if state["trans"] else "") + add

        state["task"] = asyncio.create_task(run_ws())

    # Enqueue the microphone chunk for the sender.
    await state["queue"].put(mic)
    return state["orig"], state["trans"], state
async def realtime_four(mic, src, state):
    """Streaming handler: live transcript plus four running translations.

    Works like realtime_single, but every recognized text is translated
    into each language of FOUR.

    Args:
        mic: Audio chunk, or ``None`` to signal the end of the stream.
        src: Source language name.
        state: Per-session dict holding "queue", "task", "src", "orig" and
            one accumulated string per target language, or ``None`` on the
            first call.

    Returns:
        ``(orig, English, Chinese, Thai, Russian, state)``.

    Raises:
        RuntimeError: If the background speech-to-text task has failed.
    """
    # Output order is fixed by the UI wiring, so it is spelled out here.
    fields = ("orig", "English", "Chinese", "Thai", "Russian")

    def append(current, piece):
        # Space-separate pieces without a leading space on the first one.
        return current + (" " if current else "") + piece

    if state is None:
        state = {"queue": asyncio.Queue(), "task": None,
                 "orig": "", "English": "", "Chinese": "", "Thai": "", "Russian": ""}

    # Keep the latest source language so a mid-stream change takes effect.
    state["src"] = src

    if mic is None:  # end of stream
        await state["queue"].put(None)
        return tuple(state[k] for k in fields) + (state,)

    task = state["task"]
    if task is not None and task.done():
        failure = None if task.cancelled() else task.exception()
        if failure is not None:
            # Surface the failure instead of silently dropping audio.
            raise RuntimeError("speech-to-text stream failed") from failure
        # Previous session finished; start the next one on a clean queue.
        state["queue"] = asyncio.Queue()
        task = None

    if task is None:
        queue = state["queue"]

        async def run_ws():
            async for text, _final in ws_stt_generator(queue):
                state["orig"] = append(state["orig"], text)
                # The four translations are independent: run them concurrently.
                pieces = await asyncio.gather(
                    *(gpt_translate(text, state["src"], lang) for lang in FOUR)
                )
                for lang, piece in zip(FOUR, pieces):
                    state[lang] = append(state[lang], piece)

        state["task"] = asyncio.create_task(run_ws())

    await state["queue"].put(mic)
    return tuple(state[k] for k in fields) + (state,)
# ─── 5. UI ───────────────────────────────────────────────────────────
# Build the four-tab Gradio interface and launch the server.
with gr.Blocks(title="SMARTok Demo") as demo:
    with gr.Tabs():
        # Tab 1 – audio translation
        with gr.TabItem("🎙️ 오디오"):
            src1 = gr.Dropdown(LANG, value="Korean", label="입력")
            tgt1 = gr.Dropdown(LANG, value="English", label="출력")
            aud1 = gr.Audio(sources=["microphone", "upload"], type="filepath")
            btn1 = gr.Button("번역")
            o1 = gr.Textbox(label="원문")
            t1 = gr.Textbox(label="번역")
            a1 = gr.Audio(label="TTS", type="filepath", autoplay=True)
            # Placeholder kept from the original ("dummy, kept for now"):
            # this tab has no audio pipeline yet. It must still return one
            # value per output component, so it yields three empty values
            # rather than a single function object.
            # TODO: wire up speech-to-text + gpt_translate + gpt_tts.
            btn1.click(lambda a, s, t: ("", "", None),
                       [aud1, src1, tgt1], [o1, t1, a1])

        # Tab 2 – PDF translation
        with gr.TabItem("📄 PDF"):
            src2 = gr.Dropdown(LANG, value="Korean", label="입력")
            tgt2 = gr.Dropdown(LANG, value="English", label="출력")
            pdf = gr.File(file_types=[".pdf"])
            btn2 = gr.Button("번역")
            o2 = gr.Textbox(label="추출 원문", lines=15)
            t2 = gr.Textbox(label="번역 결과", lines=15)
            btn2.click(translate_pdf, [pdf, src2, tgt2], [o2, t2])

        # Tab 3 – realtime, one target language
        with gr.TabItem("⏱️ 실시간 1"):
            src3 = gr.Dropdown(LANG, value="Korean", label="입력")
            tgt3 = gr.Dropdown(LANG, value="English", label="출력")
            mic3 = gr.Audio(sources=["microphone"], streaming=True)
            o3 = gr.Textbox(label="원문(실시간)", lines=8)
            t3 = gr.Textbox(label="번역(실시간)", lines=8)
            st3 = gr.State()
            # The handler signature is (mic, src, tgt, state): the microphone
            # component must be the first input.
            mic3.stream(realtime_single, inputs=[mic3, src3, tgt3, st3],
                        outputs=[o3, t3, st3])

        # Tab 4 – realtime, four target languages
        # NOTE(review): the globe emoji variant in this tab title could not
        # be recovered from the garbled source — confirm the intended one.
        with gr.TabItem("🌏 실시간 4"):
            src4 = gr.Dropdown(LANG, value="Korean", label="입력")
            mic4 = gr.Audio(sources=["microphone"], streaming=True)
            o4 = gr.Textbox(label="원문", lines=8)
            e4 = gr.Textbox(label="English", lines=8)
            c4 = gr.Textbox(label="Chinese(简体)", lines=8)
            th4 = gr.Textbox(label="Thai", lines=8)
            r4 = gr.Textbox(label="Russian", lines=8)
            st4 = gr.State()
            # The handler signature is (mic, src, state).
            mic4.stream(realtime_four, inputs=[mic4, src4, st4],
                        outputs=[o4, e4, c4, th4, r4, st4])

demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)