agents_final_assignment

Sleeping

App Files Files Community

guillaumefrd committed on May 3

Commit

b527097

1 Parent(s): 3568413

use openai client for image query and ASR

Browse files

Files changed (2) hide show

langgraph_dir/config.py +3 -1
langgraph_dir/custom_tools.py +80 -23

langgraph_dir/config.py CHANGED Viewed

	@@ -1 +1,3 @@
1	- OPENAI_MODEL_NAME = "gpt-4.1-nano"

+# OPENAI_MODEL_NAME = "gpt-4.1-nano" # Overall Score: 10.0% (2/20 correct)
+OPENAI_MODEL_NAME = "gpt-4.1-mini"
+# OPENAI_MODEL_NAME = "gpt-4.1"

langgraph_dir/custom_tools.py CHANGED Viewed

@@ -1,5 +1,8 @@
 from langchain_core.tools import tool
 from huggingface_hub import InferenceClient
 # --- Basic operations --- #
@@ -82,44 +85,98 @@ def query_image(query: str, image_url: str) -> str:
         image_url (str): the URL to the image
     """
-    client = InferenceClient(provider="nebius")
     try:
-        completion = client.chat.completions.create(
-            # model="google/gemma-3-27b-it",
-            model="Qwen/Qwen2.5-VL-72B-Instruct",
-            messages=[
-                {
                     "role": "user",
                     "content": [
                         {
-                            "type": "text",
-                            "text": query
                         },
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": image_url
-                            }
-                        }
-                    ]
-                }
-            ],
-            max_tokens=512,
-        )
-        return completion.choices[0].message
     except Exception as e:
         return f"query_image failed: {e}"
 @tool
-def automatic_speech_recognition(file_url: str) -> str:
     """Transcribe an audio file to text
     Args:
         file_url (str): the URL to the audio file
     """
-    client = InferenceClient(provider="fal-ai")
     try:
-        return client.automatic_speech_recognition(file_url, model="openai/whisper-large-v3")
     except Exception as e:
         return f"automatic_speech_recognition failed: {e}"

+import requests
 from langchain_core.tools import tool
 from huggingface_hub import InferenceClient
+from openai import OpenAI
 # --- Basic operations --- #
         image_url (str): the URL to the image
     """
+    # PROVIDER = 'huggingface'
+    PROVIDER = 'openai'
     try:
+        if PROVIDER == 'huggingface':
+            client = InferenceClient(provider="nebius")
+            completion = client.chat.completions.create(
+                # model="google/gemma-3-27b-it",
+                model="Qwen/Qwen2.5-VL-72B-Instruct",
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": query
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": image_url
+                                }
+                            }
+                        ]
+                    }
+                ],
+                max_tokens=512,
+            )
+            return completion.choices[0].message
+        elif PROVIDER == 'openai':
+            client = OpenAI()
+            response = client.responses.create(
+                model="gpt-4.1-mini",
+                input=[{
                     "role": "user",
                     "content": [
+                        {"type": "input_text", "text": query},
                         {
+                            "type": "input_image",
+                            "image_url": image_url,
                         },
+                    ],
+                }],
+            )
+            return response.output_text
+        else:
+            raise AttributeError(f'PROVIDER must be "openai" or "huggingface", received "{PROVIDER}"')
     except Exception as e:
         return f"query_image failed: {e}"
 @tool
+def automatic_speech_recognition(file_url: str, file_extension: str) -> str:
     """Transcribe an audio file to text
     Args:
         file_url (str): the URL to the audio file
+        file_extension (str): the file extension, e.g. mp3
     """
+    # PROVIDER = 'huggingface'
+    PROVIDER = 'openai'
     try:
+        if PROVIDER == 'huggingface':
+            client = InferenceClient(provider="fal-ai")
+            return client.automatic_speech_recognition(file_url, model="openai/whisper-large-v3")
+        elif PROVIDER == 'openai':
+            # download the audio file
+            response = requests.get(file_url)
+            response.raise_for_status()
+            # write to disk
+            file_extension = file_extension.replace('.','')
+            with open(f'tmp.{file_extension}', 'wb') as file:
+                file.write(response.content)
+            audio_file = open(f'tmp.{file_extension}', "rb")
+            client = OpenAI()
+            transcription = client.audio.transcriptions.create(
+                model="whisper-1",
+                file=audio_file
+            )
+            return transcription.text
+        else:
+            raise AttributeError(f'PROVIDER must be "openai" or "huggingface", received "{PROVIDER}"')
     except Exception as e:
         return f"automatic_speech_recognition failed: {e}"