guillaumefrd commited on
Commit
b527097
·
1 Parent(s): 3568413

use openai client for image query and ASR

Browse files
langgraph_dir/config.py CHANGED
@@ -1 +1,3 @@
1
- OPENAI_MODEL_NAME = "gpt-4.1-nano"
 
 
 
1
# Model used by the LangGraph agent for all OpenAI chat calls.
# Alternatives kept for reference, with measured benchmark results where known:
# OPENAI_MODEL_NAME = "gpt-4.1-nano"  # Overall Score: 10.0% (2/20 correct)
OPENAI_MODEL_NAME = "gpt-4.1-mini"
# OPENAI_MODEL_NAME = "gpt-4.1"
langgraph_dir/custom_tools.py CHANGED
@@ -1,5 +1,8 @@
 
1
  from langchain_core.tools import tool
2
  from huggingface_hub import InferenceClient
 
 
3
 
4
  # --- Basic operations --- #
5
 
@@ -82,44 +85,98 @@ def query_image(query: str, image_url: str) -> str:
82
  image_url (str): the URL to the image
83
  """
84
 
85
- client = InferenceClient(provider="nebius")
 
 
86
  try:
87
- completion = client.chat.completions.create(
88
- # model="google/gemma-3-27b-it",
89
- model="Qwen/Qwen2.5-VL-72B-Instruct",
90
- messages=[
91
- {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  "role": "user",
93
  "content": [
 
94
  {
95
- "type": "text",
96
- "text": query
97
  },
98
- {
99
- "type": "image_url",
100
- "image_url": {
101
- "url": image_url
102
- }
103
- }
104
- ]
105
- }
106
- ],
107
- max_tokens=512,
108
- )
109
- return completion.choices[0].message
110
 
111
  except Exception as e:
112
  return f"query_image failed: {e}"
113
 
 
114
  @tool
115
- def automatic_speech_recognition(file_url: str) -> str:
116
  """Transcribe an audio file to text
117
 
118
  Args:
119
  file_url (str): the URL to the audio file
 
120
  """
121
- client = InferenceClient(provider="fal-ai")
 
 
 
122
  try:
123
- return client.automatic_speech_recognition(file_url, model="openai/whisper-large-v3")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  except Exception as e:
125
  return f"automatic_speech_recognition failed: {e}"
 
1
+ import requests
2
  from langchain_core.tools import tool
3
  from huggingface_hub import InferenceClient
4
+ from openai import OpenAI
5
+
6
 
7
  # --- Basic operations --- #
8
 
 
85
  image_url (str): the URL to the image
86
  """
87
 
88
+ # PROVIDER = 'huggingface'
89
+ PROVIDER = 'openai'
90
+
91
  try:
92
+ if PROVIDER == 'huggingface':
93
+ client = InferenceClient(provider="nebius")
94
+ completion = client.chat.completions.create(
95
+ # model="google/gemma-3-27b-it",
96
+ model="Qwen/Qwen2.5-VL-72B-Instruct",
97
+ messages=[
98
+ {
99
+ "role": "user",
100
+ "content": [
101
+ {
102
+ "type": "text",
103
+ "text": query
104
+ },
105
+ {
106
+ "type": "image_url",
107
+ "image_url": {
108
+ "url": image_url
109
+ }
110
+ }
111
+ ]
112
+ }
113
+ ],
114
+ max_tokens=512,
115
+ )
116
+ return completion.choices[0].message
117
+
118
+ elif PROVIDER == 'openai':
119
+ client = OpenAI()
120
+
121
+ response = client.responses.create(
122
+ model="gpt-4.1-mini",
123
+ input=[{
124
  "role": "user",
125
  "content": [
126
+ {"type": "input_text", "text": query},
127
  {
128
+ "type": "input_image",
129
+ "image_url": image_url,
130
  },
131
+ ],
132
+ }],
133
+ )
134
+
135
+ return response.output_text
136
+
137
+ else:
138
+ raise AttributeError(f'PROVIDER must be "openai" or "huggingface", received "{PROVIDER}"')
 
 
 
 
139
 
140
  except Exception as e:
141
  return f"query_image failed: {e}"
142
 
143
+
144
@tool
def automatic_speech_recognition(file_url: str, file_extension: str) -> str:
    """Transcribe an audio file to text

    Args:
        file_url (str): the URL to the audio file
        file_extension (str): the file extension, e.g. mp3

    Returns:
        str: the transcription text, or an error message starting with
        "automatic_speech_recognition failed:" if anything goes wrong.
    """
    import os
    import tempfile

    # PROVIDER = 'huggingface'
    PROVIDER = 'openai'

    try:
        if PROVIDER == 'huggingface':
            client = InferenceClient(provider="fal-ai")
            return client.automatic_speech_recognition(file_url, model="openai/whisper-large-v3")

        elif PROVIDER == 'openai':
            # download the audio file (timeout so a dead URL cannot hang the agent)
            response = requests.get(file_url, timeout=60)
            response.raise_for_status()

            # Whisper infers the format from the filename, so keep the extension.
            # Use a unique temp file instead of a fixed "tmp.<ext>" name: avoids
            # collisions between concurrent calls and leftover files on disk.
            suffix = '.' + file_extension.replace('.', '')
            fd, tmp_path = tempfile.mkstemp(suffix=suffix)
            try:
                with os.fdopen(fd, 'wb') as file:
                    file.write(response.content)

                client = OpenAI()
                # context manager closes the handle even if the API call raises
                with open(tmp_path, 'rb') as audio_file:
                    transcription = client.audio.transcriptions.create(
                        model="whisper-1",
                        file=audio_file,
                    )
                return transcription.text
            finally:
                # best-effort cleanup of the temporary audio file
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass

        else:
            raise AttributeError(f'PROVIDER must be "openai" or "huggingface", received "{PROVIDER}"')

    except Exception as e:
        return f"automatic_speech_recognition failed: {e}"