# Agent tools: image Q&A (VLM) and speech-to-text via the Hugging Face Inference API.
from huggingface_hub import InferenceClient
from llama_index.core.tools import FunctionTool
# --- Functions --- #
def query_image(query: str, image_url: str) -> str:
    """Ask anything about an image using a Vision Language Model.

    Args:
        query (str): the query about the image, e.g. how many persons are on the image?
        image_url (str): the URL to the image

    Returns:
        str: the model's answer, or an error description string on failure.
    """
    try:
        # Client construction can also raise (bad provider/token); keep it
        # inside the try so the function always honors its string contract.
        client = InferenceClient(provider="nebius")
        completion = client.chat.completions.create(
            # model="google/gemma-3-27b-it",
            model="Qwen/Qwen2.5-VL-72B-Instruct",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": query},
                        {"type": "image_url", "image_url": {"url": image_url}},
                    ],
                }
            ],
            max_tokens=512,
        )
        # `.message` is a message object, not a string — return its text
        # content so the declared `-> str` return type holds.
        return completion.choices[0].message.content
    except Exception as e:
        # Broad catch is deliberate: tool output is fed back to an LLM agent,
        # so a readable error string beats an unhandled exception.
        return f"query_image failed: {e}"
def automatic_speech_recognition(file_url: str) -> str:
    """Transcribe an audio file to text.

    Args:
        file_url (str): the URL to the audio file

    Returns:
        str: the transcription, or an error description string on failure.
    """
    try:
        # Client construction can also raise; keep it inside the try so the
        # function always honors its string contract.
        client = InferenceClient(provider="fal-ai")
        # The call returns an AutomaticSpeechRecognitionOutput object, not a
        # string — return its `.text` field so `-> str` holds.
        output = client.automatic_speech_recognition(
            file_url, model="openai/whisper-large-v3"
        )
        return output.text
    except Exception as e:
        # Broad catch is deliberate: tool output is fed back to an LLM agent.
        return f"automatic_speech_recognition failed: {e}"
### --- Tool instances ---
# Wrap the plain functions as LlamaIndex FunctionTools so an agent can invoke
# them; name/description are derived from each function's signature/docstring.
query_image_tool = FunctionTool.from_defaults(query_image)
automatic_speech_recognition_tool = FunctionTool.from_defaults(automatic_speech_recognition)