seawolf2357 committed on
Commit
e9bd082
·
verified ·
1 Parent(s): 0d8a2ef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -13
app.py CHANGED
@@ -546,7 +546,12 @@ HTML_CONTENT = """<!DOCTYPE html>
546
 
547
  try {
548
  const stream = await navigator.mediaDevices.getUserMedia({
549
- audio: true
 
 
 
 
 
550
  });
551
  setupAudioVisualization(stream);
552
  stream.getTracks().forEach(track => {
@@ -556,7 +561,11 @@ HTML_CONTENT = """<!DOCTYPE html>
556
  peerConnection.addEventListener('track', (evt) => {
557
  if (audioOutput.srcObject !== evt.streams[0]) {
558
  audioOutput.srcObject = evt.streams[0];
559
- audioOutput.play();
 
 
 
 
560
  }
561
  });
562
 
@@ -677,7 +686,19 @@ HTML_CONTENT = """<!DOCTYPE html>
677
  }
678
  }
679
 
 
 
 
680
  function addMessage(role, content) {
 
 
 
 
 
 
 
 
 
681
  const messageDiv = document.createElement('div');
682
  messageDiv.classList.add('message', role);
683
  messageDiv.textContent = content;
@@ -810,6 +831,8 @@ class OpenAIHandler(AsyncStreamHandler):
810
  self.keep_alive_task = None
811
  self.last_activity = datetime.now()
812
  self.connection_active = True
 
 
813
  logger.info(f"Handler created with web_search_enabled={web_search_enabled}, webrtc_id={webrtc_id}")
814
 
815
  def copy(self):
@@ -895,7 +918,15 @@ class OpenAIHandler(AsyncStreamHandler):
895
 
896
  # Define the web search function
897
  tools = []
898
- instructions = "You are a helpful assistant. Respond in Korean when the user speaks Korean."
 
 
 
 
 
 
 
 
899
 
900
  if self.web_search_enabled and self.search_client:
901
  tools = [{
@@ -919,7 +950,13 @@ class OpenAIHandler(AsyncStreamHandler):
919
 
920
  instructions = (
921
  "You are a helpful assistant with web search capabilities. "
922
- "IMPORTANT: You MUST use the web_search function for ANY of these topics:\n"
 
 
 
 
 
 
923
  "- Weather (๋‚ ์”จ, ๊ธฐ์˜จ, ๋น„, ๋ˆˆ)\n"
924
  "- News (๋‰ด์Šค, ์†Œ์‹)\n"
925
  "- Current events (ํ˜„์žฌ, ์ตœ๊ทผ, ์˜ค๋Š˜, ์ง€๊ธˆ)\n"
@@ -927,20 +964,27 @@ class OpenAIHandler(AsyncStreamHandler):
927
  "- Sports scores or results\n"
928
  "- Any question about 2024 or 2025\n"
929
  "- Any time-sensitive information\n\n"
930
- "When in doubt, USE web_search. It's better to search and provide accurate information "
931
- "than to guess or use outdated information. Always respond in Korean when the user speaks Korean."
932
  )
933
 
934
  try:
935
  async with self.client.beta.realtime.connect(
936
  model="gpt-4o-mini-realtime-preview-2024-12-17"
937
  ) as conn:
938
- # Update session with tools
939
  session_update = {
940
- "turn_detection": {"type": "server_vad"},
 
 
 
 
 
941
  "instructions": instructions,
942
  "tools": tools,
943
- "tool_choice": "auto" if tools else "none"
 
 
 
944
  }
945
 
946
  await conn.session.update(session=session_update)
@@ -954,14 +998,34 @@ class OpenAIHandler(AsyncStreamHandler):
954
  async for event in self.connection:
955
  self.last_activity = datetime.now()
956
 
957
- # Debug logging for function calls
958
- if event.type.startswith("response.function_call"):
959
- logger.debug(f"Function event: {event.type}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
960
 
961
  if event.type == "response.audio_transcript.done":
962
  await self.output_queue.put(AdditionalOutputs(event))
963
 
964
  elif event.type == "response.audio.delta":
 
 
 
 
 
965
  await self.output_queue.put(
966
  (
967
  self.output_sample_rate,
@@ -1008,7 +1072,7 @@ class OpenAIHandler(AsyncStreamHandler):
1008
  "output": search_results
1009
  }
1010
  )
1011
- await self.connection.response.create()
1012
 
1013
  except Exception as e:
1014
  logger.error(f"Function call error: {e}")
 
546
 
547
  try {
548
  const stream = await navigator.mediaDevices.getUserMedia({
549
+ audio: {
550
+ echoCancellation: true, // ์—์ฝ” ์ œ๊ฑฐ
551
+ noiseSuppression: true, // ๋…ธ์ด์ฆˆ ์ œ๊ฑฐ
552
+ autoGainControl: true, // ์ž๋™ ๊ฒŒ์ธ ์ œ์–ด
553
+ sampleRate: 24000
554
+ }
555
  });
556
  setupAudioVisualization(stream);
557
  stream.getTracks().forEach(track => {
 
561
  peerConnection.addEventListener('track', (evt) => {
562
  if (audioOutput.srcObject !== evt.streams[0]) {
563
  audioOutput.srcObject = evt.streams[0];
564
+ // ์ž๋™ ์žฌ์ƒ ์‹œ ๋ณผ๋ฅจ ์กฐ์ ˆ๋กœ ํ”ผ๋“œ๋ฐฑ ๋ฐฉ์ง€
565
+ audioOutput.volume = 0.8;
566
+ audioOutput.play().catch(e => {
567
+ console.error('Audio play error:', e);
568
+ });
569
  }
570
  });
571
 
 
686
  }
687
  }
688
 
689
+ let lastMessageContent = '';
690
+ let lastMessageTime = 0;
691
+
692
  function addMessage(role, content) {
693
+ // ์ค‘๋ณต ๋ฉ”์‹œ์ง€ ๋ฐฉ์ง€
694
+ const now = Date.now();
695
+ if (content === lastMessageContent && (now - lastMessageTime) < 1000) {
696
+ return; // 1์ดˆ ์ด๋‚ด ๊ฐ™์€ ๋ฉ”์‹œ์ง€ ๋ฌด์‹œ
697
+ }
698
+
699
+ lastMessageContent = content;
700
+ lastMessageTime = now;
701
+
702
  const messageDiv = document.createElement('div');
703
  messageDiv.classList.add('message', role);
704
  messageDiv.textContent = content;
 
831
  self.keep_alive_task = None
832
  self.last_activity = datetime.now()
833
  self.connection_active = True
834
+ self.response_in_progress = False # ์‘๋‹ต ์ค‘๋ณต ๋ฐฉ์ง€
835
+ self.last_response_time = datetime.now() # ๋งˆ์ง€๋ง‰ ์‘๋‹ต ์‹œ๊ฐ„
836
  logger.info(f"Handler created with web_search_enabled={web_search_enabled}, webrtc_id={webrtc_id}")
837
 
838
  def copy(self):
 
918
 
919
  # Define the web search function
920
  tools = []
921
+ instructions = (
922
+ "You are a helpful assistant. Respond in Korean when the user speaks Korean. "
923
+ "IMPORTANT RULES:\n"
924
+ "1. Wait for the user to finish speaking before responding\n"
925
+ "2. Keep responses concise and to the point\n"
926
+ "3. Do not continue speaking if the user interrupts\n"
927
+ "4. Stop immediately when you finish answering the question\n"
928
+ "5. Do not add unnecessary elaboration or follow-up questions unless asked"
929
+ )
930
 
931
  if self.web_search_enabled and self.search_client:
932
  tools = [{
 
950
 
951
  instructions = (
952
  "You are a helpful assistant with web search capabilities. "
953
+ "IMPORTANT RULES:\n"
954
+ "1. Wait for the user to finish speaking before responding\n"
955
+ "2. Keep responses concise and to the point\n"
956
+ "3. Do not continue speaking if the user interrupts\n"
957
+ "4. Stop immediately when you finish answering the question\n"
958
+ "5. Do not add unnecessary elaboration or follow-up questions unless asked\n\n"
959
+ "WEB SEARCH RULES: You MUST use the web_search function for ANY of these topics:\n"
960
  "- Weather (๋‚ ์”จ, ๊ธฐ์˜จ, ๋น„, ๋ˆˆ)\n"
961
  "- News (๋‰ด์Šค, ์†Œ์‹)\n"
962
  "- Current events (ํ˜„์žฌ, ์ตœ๊ทผ, ์˜ค๋Š˜, ์ง€๊ธˆ)\n"
 
964
  "- Sports scores or results\n"
965
  "- Any question about 2024 or 2025\n"
966
  "- Any time-sensitive information\n\n"
967
+ "When in doubt, USE web_search for accuracy. Always respond in Korean when the user speaks Korean."
 
968
  )
969
 
970
  try:
971
  async with self.client.beta.realtime.connect(
972
  model="gpt-4o-mini-realtime-preview-2024-12-17"
973
  ) as conn:
974
+ # Update session with tools - VAD ์„ค์ • ๊ฐ•ํ™”
975
  session_update = {
976
+ "turn_detection": {
977
+ "type": "server_vad",
978
+ "threshold": 0.5, # ๋ฏผ๊ฐ๋„ ์กฐ์ •
979
+ "prefix_padding_ms": 300, # ์Œ์„ฑ ์‹œ์ž‘ ์ „ ํŒจ๋”ฉ
980
+ "silence_duration_ms": 500 # ์นจ๋ฌต ๊ฐ์ง€ ์‹œ๊ฐ„ ์ฆ๊ฐ€
981
+ },
982
  "instructions": instructions,
983
  "tools": tools,
984
+ "tool_choice": "auto" if tools else "none",
985
+ "voice": "echo", # ์Œ์„ฑ ์„ค์ •
986
+ "temperature": 0.7, # ์‘๋‹ต ๋‹ค์–‘์„ฑ ๊ฐ์†Œ
987
+ "max_response_output_tokens": "inf" # ๋ฌดํ•œ ์‘๋‹ต ๋ฐฉ์ง€
988
  }
989
 
990
  await conn.session.update(session=session_update)
 
998
  async for event in self.connection:
999
  self.last_activity = datetime.now()
1000
 
1001
+ # Debug logging
1002
+ if event.type in ["response.audio_transcript.done", "response.done", "response.created", "input_audio_buffer.speech_started", "input_audio_buffer.speech_stopped"]:
1003
+ logger.info(f"Event: {event.type}")
1004
+
1005
+ # ์‚ฌ์šฉ์ž ์Œ์„ฑ ๊ฐ์ง€
1006
+ if event.type == "input_audio_buffer.speech_started":
1007
+ logger.info("User started speaking")
1008
+ self.response_in_progress = False # ์‚ฌ์šฉ์ž๊ฐ€ ๋งํ•˜๋ฉด ์‘๋‹ต ์ค‘๋‹จ
1009
+
1010
+ # ์‘๋‹ต ์‹œ์ž‘/์ข…๋ฃŒ ์ถ”์ 
1011
+ if event.type == "response.created":
1012
+ self.response_in_progress = True
1013
+ self.last_response_time = datetime.now()
1014
+ logger.info("Response started")
1015
+
1016
+ if event.type == "response.done":
1017
+ self.response_in_progress = False
1018
+ logger.info("Response completed")
1019
 
1020
  if event.type == "response.audio_transcript.done":
1021
  await self.output_queue.put(AdditionalOutputs(event))
1022
 
1023
  elif event.type == "response.audio.delta":
1024
+ # ์‘๋‹ต ์ค‘๋ณต ๋ฐฉ์ง€ - ๋„ˆ๋ฌด ๋น ๋ฅธ ์—ฐ์† ์‘๋‹ต ์ฐจ๋‹จ
1025
+ time_since_last = (datetime.now() - self.last_response_time).total_seconds()
1026
+ if time_since_last < 0.1: # 100ms ์ด๋‚ด ์‘๋‹ต ๋ฌด์‹œ
1027
+ continue
1028
+
1029
  await self.output_queue.put(
1030
  (
1031
  self.output_sample_rate,
 
1072
  "output": search_results
1073
  }
1074
  )
1075
+ # response.create() ์ œ๊ฑฐ - ์ž๋™์œผ๋กœ ์‘๋‹ต ์ƒ์„ฑ๋จ
1076
 
1077
  except Exception as e:
1078
  logger.error(f"Function call error: {e}")