|
"""Tools to handle multimodal understandig.""" |
|
|
|
import os |
|
import io |
|
import re |
|
import requests |
|
|
|
import librosa |
|
import soundfile as sf |
|
import pandas as pd |
|
from llama_index.core.tools import FunctionTool |
|
from huggingface_hub import InferenceClient |
|
from transformers import pipeline |
|
|
|
# Base URL of the remote API; files are addressed as "<base>/files/<id>".
DEFAULT_API_URL: str = "https://agents-course-unit4-scoring.hf.space"
|
|
|
def transcribe_audio(file_id: str) -> str:
    """
    Transcribe an English audio file identified by its id.

    The file is downloaded with ``_get_file``, decoded with ``soundfile``,
    downmixed to mono, resampled to 16 kHz when necessary and then passed to
    the ``openai/whisper-tiny`` speech-recognition pipeline.

    Args:
        file_id: Id of the audio file to fetch and transcribe.

    Returns:
        The stripped transcription text, or an error message string when the
        file cannot be fetched or decoded as audio.
    """
    target_sr = 16000

    try:
        audio, sr = sf.read(_get_file(file_id))
        # soundfile yields a (frames, channels) array for multi-channel files.
        # Average the channels so resampling runs along the time axis and the
        # ASR pipeline receives the single-channel signal it expects.
        if audio.ndim > 1:
            audio = audio.mean(axis=1)
        if sr != target_sr:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
    except Exception:
        # Narrower than a bare ``except`` so KeyboardInterrupt/SystemExit
        # still propagate.
        return "Error: Invalid file. This file is either not an audio file or the id does not exist."

    asr = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

    # Clips longer than 25 seconds are transcribed with timestamps enabled.
    if (len(audio) / target_sr) > 25:
        output = asr(audio, return_timestamps=True)
    else:
        output = asr(audio)

    return output["text"].strip()
|
|
|
def transcribe_audio_hf(file_id: str) -> str:
    """
    Transcribe an audio file identified by its id via the HF Inference API.

    The raw bytes of the file are sent to the ``openai/whisper-small`` model
    through ``InferenceClient`` using the ``HF_TOKEN`` environment variable
    as API key.

    Args:
        file_id: Id of the audio file to fetch and transcribe.

    Returns:
        The transcription text, or an error message string when the file
        cannot be fetched.
    """
    try:
        audio_bytes = _get_file(file_id).read()
    except Exception:
        # Narrower than a bare ``except`` so KeyboardInterrupt/SystemExit
        # still propagate.
        return "Error: Invalid file. This file is either not an audio file or the id does not exist."

    client = InferenceClient(
        provider="hf-inference",
        api_key=os.getenv("HF_TOKEN"),
    )

    output = client.automatic_speech_recognition(audio_bytes, model="openai/whisper-small")
    # The client may hand back a structured result carrying the transcript in
    # ``.text`` rather than a plain string; unwrap it so the declared ``str``
    # return type holds either way.
    return getattr(output, "text", output)
|
|
|
def get_transcription_tool():
    """Build the ``FunctionTool`` that wraps ``transcribe_audio``."""
    tool_description = "Transcribes an audio file identified by its id."
    transcription_tool = FunctionTool.from_defaults(
        fn=transcribe_audio,
        description=tool_description,
    )
    return transcription_tool
|
|
|
|
|
def answer_image_question(question: str, file_id: str) -> str:
    """
    Ask a vision-language model a question about an image.

    The image is referenced by URL (``DEFAULT_API_URL`` + ``/files/<file_id>``)
    and sent together with the question to ``Qwen/Qwen2.5-VL-32B-Instruct``
    through the HF Inference API. Any ``<think>`` section is stripped from
    the reply via ``remove_think``.

    Args:
        question: The question to ask about the image.
        file_id: Id of the image file.

    Returns:
        The model's answer with the think part removed.
    """
    hf_client = InferenceClient(
        provider="hf-inference",
        api_key=os.getenv("HF_TOKEN"),
    )

    # A single user turn carrying two parts: the question text and the image.
    text_part = {"type": "text", "text": question}
    image_part = {
        "type": "image_url",
        "image_url": {"url": DEFAULT_API_URL + f"/files/{file_id}"},
    }
    user_turn = {"role": "user", "content": [text_part, image_part]}

    response = hf_client.chat.completions.create(
        model="Qwen/Qwen2.5-VL-32B-Instruct",
        messages=[user_turn],
        max_tokens=512,
    )

    raw_answer = response.choices[0].message.content
    return remove_think(raw_answer)
|
|
|
def get_image_qa_tool():
    """Build the ``FunctionTool`` that wraps ``answer_image_question``."""
    tool_description = "Answer a question about a given image. The image is identified by a file id."
    image_qa_tool = FunctionTool.from_defaults(
        fn=answer_image_question,
        description=tool_description,
    )
    return image_qa_tool
|
|
|
def read_excel(file_id: str) -> str:
    """
    Load the Excel file with the given id and return it as markdown.

    The file is fetched with ``_get_file``, parsed with ``pandas.read_excel``
    and rendered through ``DataFrame.to_markdown``.
    """
    workbook_bytes = _get_file(file_id)
    return pd.read_excel(workbook_bytes).to_markdown()
|
|
|
def get_excel_tool():
    """Build the ``FunctionTool`` that wraps ``read_excel``."""
    tool_description = "Convert an excel file that is identified by its file id into a markdown string."
    excel_tool = FunctionTool.from_defaults(
        fn=read_excel,
        description=tool_description,
    )
    return excel_tool
|
|
|
def analyse_excel(file_id: str) -> str:
    """
    Compute summary statistics for the Excel file with the given id.

    The file is fetched with ``_get_file``, parsed with ``pandas.read_excel``
    and summarised with ``DataFrame.describe``.

    Args:
        file_id: Id of the Excel file to analyse.

    Returns:
        The ``describe()`` table rendered as a markdown string.
    """
    file_io = _get_file(file_id)
    df = pd.read_excel(file_io)
    # describe() yields a DataFrame; render it (as read_excel does) so the
    # declared ``str`` return type actually holds.
    return df.describe().to_markdown()
|
|
|
def get_excel_analysis_tool():
    """
    Build the ``FunctionTool`` that wraps ``analyse_excel``.

    Returns:
        A tool whose description advertises per-column statistics for an
        Excel file identified by its file id.
    """
    return FunctionTool.from_defaults(
        # Must be analyse_excel: the description promises statistics, not a
        # markdown dump of the whole sheet.
        fn=analyse_excel,
        description="Analyse an excel file that is identified by its file id and get common statistics such as mean or max per column."
    )
|
|
|
def read_csv(file_id: str) -> str:
    """
    Load the CSV file with the given id and return it as markdown.

    The file is fetched with ``_get_file``, parsed with ``pandas.read_csv``
    and rendered through ``DataFrame.to_markdown``.
    """
    csv_bytes = _get_file(file_id)
    return pd.read_csv(csv_bytes).to_markdown()
|
|
|
def get_csv_tool():
    """
    Build the ``FunctionTool`` that wraps ``read_csv``.

    Returns:
        A tool whose description advertises converting a CSV file,
        identified by its file id, into a markdown string.
    """
    return FunctionTool.from_defaults(
        # Must be read_csv: read_excel would try to parse the CSV as a
        # workbook.
        fn=read_csv,
        description="Convert a csv file that is identified by its file id into a markdown string."
    )
|
|
|
def analyse_csv(file_id: str) -> str:
    """
    Compute summary statistics for the CSV file with the given id.

    The file is fetched with ``_get_file``, parsed with ``pandas.read_csv``
    and summarised with ``DataFrame.describe``.

    Args:
        file_id: Id of the CSV file to analyse.

    Returns:
        The ``describe()`` table rendered as a markdown string.
    """
    file_io = _get_file(file_id)
    df = pd.read_csv(file_io)
    # describe() yields a DataFrame; render it (as read_csv does) so the
    # declared ``str`` return type actually holds.
    return df.describe().to_markdown()
|
|
|
def get_csv_analysis_tool():
    """
    Build the ``FunctionTool`` that wraps ``analyse_csv``.

    Returns:
        A tool whose description advertises per-column statistics for a CSV
        file identified by its file id.
    """
    return FunctionTool.from_defaults(
        # Must be analyse_csv: the description promises statistics for a CSV,
        # not a markdown dump of an Excel sheet.
        fn=analyse_csv,
        description="Analyse a csv file that is identified by its file id and get common statistics such as mean or max per column."
    )
|
|
|
def watch_video(video_url: str) -> str:
    """
    Placeholder video tool.

    The ``video_url`` argument is not used; a fixed message is returned that
    instructs the caller to answer with "I don't know".
    """
    not_supported_notice = "You are not able to watch a Video yet. Reply with 'I don't know' to the question."
    return not_supported_notice
|
|
|
def get_video_tool():
    """Build the ``FunctionTool`` that wraps ``watch_video``."""
    tool_description = "Watch a video and get a content description as a string."
    video_tool = FunctionTool.from_defaults(
        fn=watch_video,
        description=tool_description,
    )
    return video_tool
|
|
|
def _get_file(task_id: str, timeout: float = 30.0) -> io.BytesIO:
    """
    Download a file from the API and return it as an in-memory byte stream.

    Args:
        task_id: Id of the file/task; requested from
            ``DEFAULT_API_URL + "/files/<task_id>"``.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout the request could block indefinitely.

    Returns:
        A ``BytesIO`` holding the response body, positioned at the start.

    Raises:
        FileNotFoundError: If the server answers with a status other than 200.
    """
    res = requests.get(DEFAULT_API_URL + f"/files/{task_id}", timeout=timeout)
    if res.status_code != 200:
        raise FileNotFoundError("Invalid file or task id.")
    return io.BytesIO(res.content)
|
|
|
def remove_think(output: str) -> str:
    """
    Remove every ``<think>...</think>`` section from an LLM output.

    Args:
        output: Raw model output; may be empty or ``None``.

    Returns:
        The output without think sections and with surrounding whitespace
        stripped. Falsy inputs (``""``, ``None``) are returned unchanged.
    """
    if output:
        # DOTALL lets "." span newlines, since think sections are usually
        # multi-line. The non-greedy match keeps text between two separate
        # think sections.
        return re.sub(r"<think>.*?</think>", "", output, flags=re.DOTALL).strip()
    return output
|
|
|
def read_txt_or_py_file(file_id: str) -> str:
    """
    Read a python or txt file as plain text and return its content.

    Args:
        file_id: Id of the file to fetch via ``_get_file``.

    Returns:
        The decoded file content, or an error message string when the file
        cannot be fetched or its bytes cannot be decoded as text.
    """
    error_message = "Error: Invalid file. This file is either not a .py/.txt file or the id does not exist."

    try:
        raw_bytes = _get_file(file_id).read()
    except Exception:
        # Narrower than a bare ``except`` so KeyboardInterrupt/SystemExit
        # still propagate.
        return error_message

    try:
        return raw_bytes.decode()
    except UnicodeDecodeError:
        # Undecodable bytes mean this is not a plain-text file, which is
        # exactly the case the error message describes.
        return error_message
|
|
|
def get_read_file_tool():
    """Build the ``FunctionTool`` that wraps ``read_txt_or_py_file``."""
    tool_description = "Read a python or txt file as plain text and return its content."
    read_file_tool = FunctionTool.from_defaults(
        fn=read_txt_or_py_file,
        description=tool_description,
    )
    return read_file_tool
|
|