import os
import requests
from langchain_core.tools import tool
from huggingface_hub import InferenceClient
from openai import OpenAI
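
# NOTE: OpenAI() reads the OPENAI_API_KEY environment variable and
# InferenceClient picks up HF_TOKEN; both are assumed to be set before
# the API-backed tools below are called.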
# --- Basic operations --- #
@tool
def multiply(a: float, b: float) -> float:
"""Multiplies two numbers.
Args:
a (float): the first number
b (float): the second number
"""
return a * b
@tool
def add(a: float, b: float) -> float:
"""Adds two numbers.
Args:
a (float): the first number
b (float): the second number
"""
return a + b
@tool
def subtract(a: float, b: float) -> float:
"""Subtracts two numbers.
Args:
a (float): the first number
b (float): the second number
"""
return a - b
@tool
def divide(a: float, b: float) -> float:
"""Divides two numbers.
Args:
        a (float): the first number
        b (float): the second number
    """
    if b == 0:
        raise ValueError("Cannot divide by zero.")
return a / b
@tool
def modulus(a: int, b: int) -> int:
"""Get the modulus of two numbers.
Args:
a (int): the first number
b (int): the second number
"""
return a % b
@tool
def power(a: float, b: float) -> float:
"""Get the power of two numbers.
Args:
a (float): the first number
b (float): the second number
"""
return a**b
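
# Usage note (illustrative, not part of the original file): LangChain @tool
# objects are called via .invoke() with a dict of arguments, e.g.
#   multiply.invoke({"a": 6.0, "b": 7.0})  # -> 42.0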
# --- Functions --- #
@tool
def query_image(query: str, image_url: str) -> str:
"""Ask anything about an image using a Vision Language Model
Args:
        query (str): the query about the image, e.g. "How many people are in the image?"
image_url (str): the URL to the image
"""
# PROVIDER = 'huggingface'
PROVIDER = 'openai'
try:
if PROVIDER == 'huggingface':
client = InferenceClient(provider="nebius")
completion = client.chat.completions.create(
# model="google/gemma-3-27b-it",
model="Qwen/Qwen2.5-VL-72B-Instruct",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": query
},
{
"type": "image_url",
"image_url": {
"url": image_url
}
}
]
}
],
max_tokens=512,
)
            return completion.choices[0].message.content
elif PROVIDER == 'openai':
client = OpenAI()
response = client.responses.create(
model="gpt-4.1-mini",
input=[{
"role": "user",
"content": [
{"type": "input_text", "text": query},
{
"type": "input_image",
"image_url": image_url,
},
],
}],
)
return response.output_text
else:
            raise ValueError(f'PROVIDER must be "openai" or "huggingface", received "{PROVIDER}"')
except Exception as e:
return f"query_image failed: {e}"
@tool
def automatic_speech_recognition(file_url: str, file_extension: str) -> str:
"""Transcribe an audio file to text
Args:
file_url (str): the URL to the audio file
file_extension (str): the file extension, e.g. mp3
"""
# PROVIDER = 'huggingface'
PROVIDER = 'openai'
try:
if PROVIDER == 'huggingface':
client = InferenceClient(provider="fal-ai")
return client.automatic_speech_recognition(file_url, model="openai/whisper-large-v3")
elif PROVIDER == 'openai':
# download the audio file
response = requests.get(file_url)
response.raise_for_status()
            # write to disk under a temporary name, keeping the extension
            file_extension = file_extension.replace('.', '')
            tmp_path = f'tmp.{file_extension}'
            with open(tmp_path, 'wb') as file:
                file.write(response.content)
            try:
                client = OpenAI()
                # open with a context manager so the file handle is closed after upload
                with open(tmp_path, 'rb') as audio_file:
                    transcription = client.audio.transcriptions.create(
                        model="whisper-1",
                        file=audio_file,
                    )
                return transcription.text
            finally:
                os.remove(tmp_path)  # clean up the temporary file
else:
            raise ValueError(f'PROVIDER must be "openai" or "huggingface", received "{PROVIDER}"')
except Exception as e:
return f"automatic_speech_recognition failed: {e}"