|
from huggingface_hub import InferenceClient |
|
from llama_index.core.tools import FunctionTool |
|
|
|
|
|
|
|
def query_image(query: str, image_url: str) -> str:
    """Ask anything about an image using a Vision Language Model

    Args:
        query (str): the query about the image, e.g. how many persons are on the image?
        image_url (str): the URL to the image

    Returns:
        str: the text of the model's answer, or a message starting with
            "query_image failed: " if the request raised an exception.
    """
    client = InferenceClient(provider="nebius")

    # A single user turn carrying two content parts: the question text and
    # a reference to the image by URL.
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": query,
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url,
                    },
                },
            ],
        },
    ]

    try:
        completion = client.chat.completions.create(
            model="Qwen/Qwen2.5-VL-72B-Instruct",
            messages=messages,
            max_tokens=512,
        )
        # Return only the reply text: the message object itself is not a str,
        # which is what the signature promises. Fall back to an empty string
        # when the reply carries no text content.
        return completion.choices[0].message.content or ""
    except Exception as e:
        # Failures are reported as text instead of being raised, so the
        # caller always receives a string.
        return f"query_image failed: {e}"
|
|
|
|
|
def automatic_speech_recognition(file_url: str) -> str:
    """Transcribe an audio file to text

    Args:
        file_url (str): the URL to the audio file

    Returns:
        str: the transcribed text, or a message starting with
            "automatic_speech_recognition failed: " if the request raised
            an exception.
    """
    client = InferenceClient(provider="fal-ai")
    try:
        result = client.automatic_speech_recognition(
            file_url,
            model="openai/whisper-large-v3",
        )
        # The client returns a structured result; only its transcription
        # text is handed back so the return value matches the declared str.
        return result.text
    except Exception as e:
        # Failures are reported as text instead of being raised, so the
        # caller always receives a string.
        return f"automatic_speech_recognition failed: {e}"
|
|
|
|
|
|
|
|
|
|
|
# Wrap each helper function in a FunctionTool; the two tools are independent
# of each other.
automatic_speech_recognition_tool = FunctionTool.from_defaults(
    fn=automatic_speech_recognition,
)
query_image_tool = FunctionTool.from_defaults(fn=query_image)