openfree committed · verified
Commit 5897b48 · 1 Parent(s): a609646

Update app.py

Files changed (1):
  1. app.py +133 -57
app.py CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-import openai, os, io, tempfile, wave, time, threading
+import openai, os, io, tempfile, wave, time
 from dotenv import load_dotenv
 
 # =============== Common initialization ================================
@@ -9,55 +9,72 @@ if not api_key:
     raise RuntimeError("Set OPENAI_API_KEY in your .env file!")
 client = openai.OpenAI(api_key=api_key)
 
+# ---------- Supported languages ----------------------------------------
 LANGUAGES = [
     "Korean", "English", "Japanese", "Chinese",
     "Thai", "Russian", "Vietnamese",
     "Spanish", "French"
 ]
-LANG_CODE = {"Korean":"ko","English":"en","Japanese":"ja","Chinese":"zh",
-             "Thai":"th","Russian":"ru","Vietnamese":"vi",
-             "Spanish":"es","French":"fr"}
-VOICE = {lang: ("nova" if lang in ["Korean","Japanese","Chinese"] else "alloy")
-         for lang in LANGUAGES}
-
-# ---------------- Shared translation / synthesis ------------------------
-def _gpt_translate(text, src, tgt):
+LANG_CODE = {
+    "Korean": "ko", "English": "en", "Japanese": "ja", "Chinese": "zh",
+    "Thai": "th", "Russian": "ru", "Vietnamese": "vi",
+    "Spanish": "es", "French": "fr"
+}
+VOICE = {
+    lang: ("nova" if lang in ["Korean", "Japanese", "Chinese"] else "alloy")
+    for lang in LANGUAGES
+}
+
+# ---------- Shared utilities --------------------------------------------
+def _gpt_translate(text: str, src: str, tgt: str) -> str:
     rsp = client.chat.completions.create(
         model="gpt-3.5-turbo",
         messages=[
-            {"role":"system",
-             "content":f"You are a professional translator. Translate {src} to {tgt}. "
-                       f"Only give the translated text."},
-            {"role":"user","content":text}
+            {
+                "role": "system",
+                "content": (
+                    f"You are a professional translator. Translate the following {src} text to {tgt}. "
+                    f"Only provide the translation without additional commentary."
+                )
+            },
+            {"role": "user", "content": text}
         ],
-        temperature=0.3,max_tokens=2048)
+        temperature=0.3,
+        max_tokens=2048
+    )
     return rsp.choices[0].message.content.strip()
 
-def _tts(text, lang):
-    out = client.audio.speech.create(model="tts-1",
-                                     voice=VOICE.get(lang,"alloy"),
-                                     input=text[:4096])
-    tmp = tempfile.NamedTemporaryFile(delete=False,suffix=".mp3")
-    tmp.write(out.content); tmp.close()
+def _tts(text: str, lang: str) -> str:
+    out = client.audio.speech.create(
+        model="tts-1",
+        voice=VOICE.get(lang, "alloy"),
+        input=text[:4096]
+    )
+    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
+    tmp.write(out.content)
+    tmp.close()
     return tmp.name
 
 # =============== 1) Shared mic/file processing =========================
 def translate_audio(audio_path, src, tgt):
     """wav/mp3 path -> (original, translation, path of translated TTS)"""
-    with open(audio_path,"rb") as f:
+    with open(audio_path, "rb") as f:
         stt = client.audio.transcriptions.create(
             model="whisper-1",
             file=f,
-            language=LANG_CODE.get(src))
+            language=LANG_CODE.get(src)
+        )
     original = stt.text.strip()
     if not original:
         return "⚠️ Speech recognition failed", "", None
+
     translated = _gpt_translate(original, src, tgt)
     tts_path = _tts(translated, tgt)
     return original, translated, tts_path
 
 # =============== 2) Real-time streaming (beta) =========================
-STREAM_CHUNK_SEC = 4          # call Whisper every 4 seconds
+STREAM_CHUNK_SEC = 4  # call Whisper every 4 seconds
+
 def stream_generator(mic_stream, src, tgt):
     """generator: yields (accumulated original, accumulated translation) per chunk"""
     buffer = io.BytesIO()
@@ -68,67 +85,126 @@ def stream_generator(mic_stream, src, tgt):
         chunk = mic_stream.recv()            # bytes
         if chunk is None:                    # stream ended
             break
+
         if not wav_header:
-            wav_header = chunk[:44]          # WAV header
+            wav_header = chunk[:44]          # WAV header (PCM 16 kHz, 16-bit mono)
         buffer.write(chunk)
-        # process once at least STREAM_CHUNK_SEC of audio has accumulated
-        if buffer.getbuffer().nbytes > 16000*2*STREAM_CHUNK_SEC:   # 16 kHz 16-bit mono
+
+        if buffer.getbuffer().nbytes > 16000 * 2 * STREAM_CHUNK_SEC:
             wav_bytes = wav_header + buffer.getvalue()
-            with tempfile.NamedTemporaryFile(delete=False,suffix=".wav") as tmp:
-                tmp.write(wav_bytes); tmp.close()
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
+                tmp.write(wav_bytes)
+                tmp.close()
             o, t, _ = translate_audio(tmp.name, src, tgt)
+
             original_acc += " " + o
             translated_acc += " " + t
             yield original_acc.strip(), translated_acc.strip()
             buffer = io.BytesIO()            # reset buffer
-    # last remaining buffer
+
+    # process any remaining data
     if buffer.getbuffer().nbytes:
         wav_bytes = wav_header + buffer.getvalue()
-        with tempfile.NamedTemporaryFile(delete=False,suffix=".wav") as tmp:
-            tmp.write(wav_bytes); tmp.close()
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
+            tmp.write(wav_bytes)
+            tmp.close()
         o, t, _ = translate_audio(tmp.name, src, tgt)
-        yield (original_acc+" "+o).strip(), (translated_acc+" "+t).strip()
+
+        yield (original_acc + " " + o).strip(), (translated_acc + " " + t).strip()
+
+# =============== 3) Simultaneous 4-language translation =================
+FOUR_LANGS = ["English", "Chinese", "Thai", "Russian"]
+
+def translate_audio_four(audio_path, src):
+    """One STT pass, then translate into four languages (En/Zh/Th/Ru) at once"""
+    with open(audio_path, "rb") as f:
+        stt = client.audio.transcriptions.create(
+            model="whisper-1",
+            file=f,
+            language=LANG_CODE.get(src)
+        )
+    original = stt.text.strip()
+    if not original:
+        return ["⚠️ Speech recognition failed"] + [""] * 4
+
+    outputs = [original]
+    for lang in FOUR_LANGS:
+        outputs.append(_gpt_translate(original, src, lang))
+    return outputs  # 5 items in total (original + 4 languages)
 
 # =============== Gradio UI ==========================================
 with gr.Blocks(title="SMARTok Demo", theme=gr.themes.Soft()) as app:
     with gr.Tabs():
         # ① Mic translation (batch after recording)
         with gr.TabItem("🎙️ Mic Translation"):
-            src1 = gr.Dropdown(LANGUAGES,value="Korean",label="Input")
-            tgt1 = gr.Dropdown(LANGUAGES,value="English",label="Output")
-            mic1 = gr.Audio(sources=["microphone"],type="filepath",label="🎤 Record, then Stop")
-            btn1 = gr.Button("Translate")
-            stt1 = gr.Textbox(label="Original",lines=5)
-            tlt1 = gr.Textbox(label="Translation",lines=5)
-            out1 = gr.Audio(label="TTS",type="filepath",autoplay=True)
-            btn1.click(translate_audio,inputs=[mic1,src1,tgt1],
-                       outputs=[stt1,tlt1,out1])
+            src1 = gr.Dropdown(LANGUAGES, value="Korean", label="Input")
+            tgt1 = gr.Dropdown(LANGUAGES, value="English", label="Output")
+            mic1 = gr.Audio(sources=["microphone"], type="filepath", label="🎤 Record, then Stop")
+            btn1 = gr.Button("Translate")
+            stt1 = gr.Textbox(label="Original", lines=5)
+            tlt1 = gr.Textbox(label="Translation", lines=5)
+            out1 = gr.Audio(label="TTS", type="filepath", autoplay=True)
+
+            btn1.click(
+                translate_audio,
+                inputs=[mic1, src1, tgt1],
+                outputs=[stt1, tlt1, out1]
+            )
 
         # ② Audio file translation
         with gr.TabItem("🎧 File Translation"):
-            src2 = gr.Dropdown(LANGUAGES,value="Korean",label="Input")
-            tgt2 = gr.Dropdown(LANGUAGES,value="English",label="Output")
-            file2= gr.Audio(sources=["upload"],type="filepath",label="Upload an audio file")
+            src2 = gr.Dropdown(LANGUAGES, value="Korean", label="Input")
+            tgt2 = gr.Dropdown(LANGUAGES, value="English", label="Output")
+            file2 = gr.Audio(sources=["upload"], type="filepath", label="Upload an audio file")
             btn2 = gr.Button("Translate")
-            stt2 = gr.Textbox(label="Original",lines=5)
-            tlt2 = gr.Textbox(label="Translation",lines=5)
-            out2 = gr.Audio(label="TTS",type="filepath",autoplay=True)
-            btn2.click(translate_audio,inputs=[file2,src2,tgt2],
-                       outputs=[stt2,tlt2,out2])
+            stt2 = gr.Textbox(label="Original", lines=5)
+            tlt2 = gr.Textbox(label="Translation", lines=5)
+            out2 = gr.Audio(label="TTS", type="filepath", autoplay=True)
+
+            btn2.click(
+                translate_audio,
+                inputs=[file2, src2, tgt2],
+                outputs=[stt2, tlt2, out2]
+            )
 
         # ③ Real-time streaming transcription & translation (Beta)
         with gr.TabItem("⏱️ Real-time Translation (Beta)"):
-            gr.Markdown("With the microphone on, captions refresh every 3-4 seconds.")
-            src3 = gr.Dropdown(LANGUAGES,value="Korean",label="Input")
-            tgt3 = gr.Dropdown(LANGUAGES,value="English",label="Output")
-            mic3 = gr.Audio(sources=["microphone"],streaming=True,label="🎤 Live")
-            stt3 = gr.Textbox(label="Original (live)",lines=8)
-            tlt3 = gr.Textbox(label="Translation (live)",lines=8)
+            gr.Markdown("With the microphone on, captions refresh every 3~4 seconds.")
+            src3 = gr.Dropdown(LANGUAGES, value="Korean", label="Input")
+            tgt3 = gr.Dropdown(LANGUAGES, value="English", label="Output")
+            mic3 = gr.Audio(sources=["microphone"], streaming=True, label="🎤 Live")
+            stt3 = gr.Textbox(label="Original (live)", lines=8)
+            tlt3 = gr.Textbox(label="Translation (live)", lines=8)
 
             def gen(audio, src_lang, tgt_lang):
                 yield from stream_generator(audio, src_lang, tgt_lang)
 
-            mic3.stream(gen, inputs=[src3,tgt3], outputs=[stt3,tlt3])
+            mic3.stream(gen, inputs=[src3, tgt3], outputs=[stt3, tlt3])
+
+        # ④ Simultaneous 4-language translation
+        with gr.TabItem("🌍 4 Languages at Once"):
+            gr.Markdown("Translates the input speech into **English / Chinese (Simplified) / Thai / Russian** at the same time.")
+            src4 = gr.Dropdown(LANGUAGES, value="Korean", label="Input language")
+            aud4 = gr.Audio(
+                sources=["microphone", "upload"],
+                type="filepath",
+                label="🎤 Record or upload a file"
+            )
+            btn4 = gr.Button("Translate")
+
+            with gr.Row():
+                org4 = gr.Textbox(label="Original", lines=4)
+                en4 = gr.Textbox(label="English", lines=4)
+                zh4 = gr.Textbox(label="Chinese (Simplified)", lines=4)
+                th4 = gr.Textbox(label="Thai", lines=4)
+                ru4 = gr.Textbox(label="Russian", lines=4)
+
+            btn4.click(
+                translate_audio_four,
+                inputs=[aud4, src4],
+                outputs=[org4, en4, zh4, th4, ru4]
+            )
 
+# ===================== Run ==============================================
 if __name__ == "__main__":
-    app.launch(server_name="0.0.0.0",server_port=7860,share=False,debug=True)
+    app.launch(server_name="0.0.0.0", server_port=7860, share=False, debug=True)
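For a quick local check of the `translate_audio_four` helper introduced in this commit, a minimal sketch (not part of the commit) could be run from the project directory. It assumes `app.py` is importable as-is, `OPENAI_API_KEY` is set in `.env`, and `sample.wav` stands in for any short recording:

```python
# Minimal sketch: exercise the new 4-language helper outside the Gradio UI.
# Assumes OPENAI_API_KEY is configured in .env and "sample.wav" is a hypothetical test clip.
from app import translate_audio_four, FOUR_LANGS

results = translate_audio_four("sample.wav", "Korean")   # [original, en, zh, th, ru]
original, translations = results[0], results[1:]

print("Original:", original)
for lang, text in zip(FOUR_LANGS, translations):
    print(f"{lang}: {text}")
```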