import os
import requests
from langchain_core.tools import tool
from huggingface_hub import InferenceClient
from openai import OpenAI
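
# NOTE: OpenAI() reads the OPENAI_API_KEY environment variable and
# InferenceClient picks up HF_TOKEN; both are assumed to be set before
# the API-backed tools below are called.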
# --- Basic operations --- #
@tool
def multiply(a: float, b: float) -> float:
"""Multiplies two numbers.
Args:
a (float): the first number
b (float): the second number
"""
return a * b
@tool
def add(a: float, b: float) -> float:
"""Adds two numbers.
Args:
a (float): the first number
b (float): the second number
"""
return a + b
@tool
def subtract(a: float, b: float) -> float:
"""Subtracts two numbers.
Args:
a (float): the first number
b (float): the second number
"""
return a - b
@tool
def divide(a: float, b: float) -> float:
"""Divides two numbers.
Args:
        a (float): the first number
        b (float): the second number
    """
    if b == 0:
        raise ValueError("Cannot divide by zero.")
return a / b
@tool
def modulus(a: int, b: int) -> int:
"""Get the modulus of two numbers.
Args:
a (int): the first number
b (int): the second number
"""
return a % b
@tool
def power(a: float, b: float) -> float:
"""Get the power of two numbers.
Args:
a (float): the first number
b (float): the second number
"""
return a**b
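
# Usage note (illustrative, not part of the original file): LangChain @tool
# objects are called via .invoke() with a dict of arguments, e.g.
#   multiply.invoke({"a": 6.0, "b": 7.0})  # -> 42.0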
# --- Functions --- #
@tool
def query_image(query: str, image_url: str) -> str:
"""Ask anything about an image using a Vision Language Model
Args:
        query (str): the query about the image, e.g. "How many people are in the image?"
image_url (str): the URL to the image
"""
# PROVIDER = 'huggingface'
PROVIDER = 'openai'
try:
if PROVIDER == 'huggingface':
client = InferenceClient(provider="nebius")
completion = client.chat.completions.create(
# model="google/gemma-3-27b-it",
model="Qwen/Qwen2.5-VL-72B-Instruct",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": query
},
{
"type": "image_url",
"image_url": {
"url": image_url
}
}
]
}
],
max_tokens=512,
)
            return completion.choices[0].message.content
elif PROVIDER == 'openai':
client = OpenAI()
response = client.responses.create(
model="gpt-4.1-mini",
input=[{
"role": "user",
"content": [
{"type": "input_text", "text": query},
{
"type": "input_image",
"image_url": image_url,
},
],
}],
)
return response.output_text
else:
            raise ValueError(f'PROVIDER must be "openai" or "huggingface", received "{PROVIDER}"')
except Exception as e:
return f"query_image failed: {e}"
@tool
def automatic_speech_recognition(file_url: str, file_extension: str) -> str:
"""Transcribe an audio file to text
Args:
file_url (str): the URL to the audio file
file_extension (str): the file extension, e.g. mp3
"""
# PROVIDER = 'huggingface'
PROVIDER = 'openai'
try:
if PROVIDER == 'huggingface':
client = InferenceClient(provider="fal-ai")
return client.automatic_speech_recognition(file_url, model="openai/whisper-large-v3")
elif PROVIDER == 'openai':
# download the audio file
response = requests.get(file_url)
response.raise_for_status()
            # write to disk under a temporary name, keeping the extension
            file_extension = file_extension.replace('.', '')
            tmp_path = f'tmp.{file_extension}'
            with open(tmp_path, 'wb') as file:
                file.write(response.content)
            try:
                client = OpenAI()
                # open with a context manager so the file handle is closed after upload
                with open(tmp_path, 'rb') as audio_file:
                    transcription = client.audio.transcriptions.create(
                        model="whisper-1",
                        file=audio_file,
                    )
                return transcription.text
            finally:
                os.remove(tmp_path)  # clean up the temporary file
else:
            raise ValueError(f'PROVIDER must be "openai" or "huggingface", received "{PROVIDER}"')
except Exception as e:
return f"automatic_speech_recognition failed: {e}"