"""Tools to handle multimodal understanding.

The helpers in this module download a task attachment from the scoring API
(``DEFAULT_API_URL``) by its file id and turn it into text an agent can use:
audio transcription, image question answering, spreadsheet/CSV rendering and
plain-text reading. Each ``get_*_tool`` function wraps one helper in a
llama-index ``FunctionTool``.
"""

import functools
import io
import os
import re

import librosa
import pandas as pd
import requests
import soundfile as sf
from huggingface_hub import InferenceClient
from llama_index.core.tools import FunctionTool
from transformers import pipeline

DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

# Upper bound for a single attachment download so a stalled server cannot
# block the agent forever.
_REQUEST_TIMEOUT_SECONDS = 30

# The audio is resampled to this rate before it is handed to the ASR pipeline.
_ASR_SAMPLE_RATE = 16000
# Clips longer than this many seconds are transcribed with timestamps enabled.
_ASR_LONG_FORM_SECONDS = 25

_INVALID_AUDIO_MESSAGE = (
    "Error: Invalid file. This file is either not an audio file or the id does not exist."
)
_INVALID_TEXT_MESSAGE = (
    "Error: Invalid file. This file is either not a .py/.txt file or the id does not exist."
)
_VIDEO_UNAVAILABLE_MESSAGE = (
    "You are not able to watch a Video yet. Reply with 'I don't know' to the question."
)

# TODO our Gemini Key cannot be used to watch videos. Flip this switch once a
# key with video access is available.
_VIDEO_TOOL_ENABLED = False

# A reasoning block emitted by some chat models. DOTALL lets the block span
# several lines; the non-greedy quantifier keeps separate blocks separate.
_THINK_BLOCK_RE = re.compile(r"<think>.*?</think>", re.DOTALL)


@functools.lru_cache(maxsize=1)
def _get_asr_pipeline():
    """Build the local Whisper pipeline once and reuse it on later calls."""
    return pipeline("automatic-speech-recognition", model="openai/whisper-tiny")


def transcribe_audio(file_id: str) -> str:
    """Transcribe an English audio file identified by its id.

    The file is downloaded, downmixed to mono, resampled to 16 kHz and run
    through a local ``openai/whisper-tiny`` pipeline.

    Args:
        file_id: Id of the attachment on the scoring API.

    Returns:
        The stripped transcription, or an error message string when the file
        cannot be downloaded or decoded as audio.
    """
    try:
        audio, sr = sf.read(_get_file(file_id))
        # Multi-channel files are decoded as (frames, channels); average the
        # channels so resampling and the pipeline both see a 1-D signal.
        if audio.ndim > 1:
            audio = audio.mean(axis=1)
        if sr != _ASR_SAMPLE_RATE:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=_ASR_SAMPLE_RATE)
    except Exception:
        return _INVALID_AUDIO_MESSAGE

    asr = _get_asr_pipeline()
    # NOTE(review): timestamps are requested for long clips, presumably to
    # switch Whisper into long-form decoding -- confirm against transformers.
    if len(audio) / _ASR_SAMPLE_RATE > _ASR_LONG_FORM_SECONDS:
        output = asr(audio, return_timestamps=True)
    else:
        output = asr(audio)
    return output["text"].strip()


def transcribe_audio_hf(file_id: str) -> str:
    """Transcribe an audio file identified by its id via the HF Inference API.

    Args:
        file_id: Id of the attachment on the scoring API.

    Returns:
        The stripped transcription, or an error message string when the file
        cannot be downloaded. Errors raised by the inference call itself are
        not caught.
    """
    try:
        audio_bytes = _get_file(file_id).read()
    except Exception:
        return _INVALID_AUDIO_MESSAGE

    client = InferenceClient(
        provider="hf-inference",
        api_key=os.getenv("HF_TOKEN"),
    )
    output = client.automatic_speech_recognition(
        audio_bytes, model="openai/whisper-small"
    )
    # The client returns a structured result; hand back only its text so the
    # declared ``str`` return type holds (a plain string passes through as-is).
    return str(getattr(output, "text", output)).strip()


def get_transcription_tool():
    """Wrap :func:`transcribe_audio` in a ``FunctionTool``."""
    return FunctionTool.from_defaults(
        fn=transcribe_audio,
        description="Transcribes an audio file identified by its id.",
    )


def answer_image_question(question: str, file_id: str) -> str:
    """Answer a question about an image identified by its id.

    The image is passed to the vision model by URL; it is not downloaded here.

    Args:
        question: The question to ask about the image.
        file_id: Id of the image attachment on the scoring API.

    Returns:
        The model's answer with any ``<think>`` block removed.
    """
    client = InferenceClient(
        provider="hf-inference",
        api_key=os.getenv("HF_TOKEN"),
    )
    completion = client.chat.completions.create(
        model="Qwen/Qwen2.5-VL-32B-Instruct",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {
                        "type": "image_url",
                        "image_url": {"url": _build_file_url(file_id)},
                    },
                ],
            }
        ],
        max_tokens=512,
    )
    return remove_think(completion.choices[0].message.content)


def get_image_qa_tool():
    """Wrap :func:`answer_image_question` in a ``FunctionTool``."""
    return FunctionTool.from_defaults(
        fn=answer_image_question,
        description="Answer a question about a given image. The image is identified by a file id.",
    )


def read_excel(file_id: str) -> str:
    """Download an Excel file by id and render it as a markdown table.

    Raises:
        FileNotFoundError: If the file cannot be downloaded.
    """
    df = pd.read_excel(_get_file(file_id))
    return df.to_markdown()


def get_excel_tool():
    """Wrap :func:`read_excel` in a ``FunctionTool``."""
    return FunctionTool.from_defaults(
        fn=read_excel,
        description="Convert an excel file that is identified by its file id into a markdown string.",
    )


def analyse_excel(file_id: str) -> str:
    """Download an Excel file by id and return its summary statistics.

    Returns:
        The ``DataFrame.describe()`` table rendered as markdown.

    Raises:
        FileNotFoundError: If the file cannot be downloaded.
    """
    df = pd.read_excel(_get_file(file_id))
    return df.describe().to_markdown()


def get_excel_analysis_tool():
    """Wrap :func:`analyse_excel` in a ``FunctionTool``."""
    return FunctionTool.from_defaults(
        fn=analyse_excel,
        description="Analyse an excel file that is identified by its file id and get common statistics such as mean or max per column.",
    )


def read_csv(file_id: str) -> str:
    """Download a CSV file by id and render it as a markdown table.

    Raises:
        FileNotFoundError: If the file cannot be downloaded.
    """
    df = pd.read_csv(_get_file(file_id))
    return df.to_markdown()


def get_csv_tool():
    """Wrap :func:`read_csv` in a ``FunctionTool``."""
    return FunctionTool.from_defaults(
        fn=read_csv,
        description="Convert a csv file that is identified by its file id into a markdown string.",
    )


def analyse_csv(file_id: str) -> str:
    """Download a CSV file by id and return its summary statistics.

    Returns:
        The ``DataFrame.describe()`` table rendered as markdown.

    Raises:
        FileNotFoundError: If the file cannot be downloaded.
    """
    df = pd.read_csv(_get_file(file_id))
    return df.describe().to_markdown()


def get_csv_analysis_tool():
    """Wrap :func:`analyse_csv` in a ``FunctionTool``."""
    return FunctionTool.from_defaults(
        fn=analyse_csv,
        description="Analyse a csv file that is identified by its file id and get common statistics such as mean or max per column.",
    )


def watch_video(question: str, youtube_url: str) -> str:
    """Answer a question about a YouTube video identified by its url.

    The tool is currently switched off (``_VIDEO_TOOL_ENABLED``) and always
    returns a message telling the agent to answer "I don't know".

    Args:
        question: The question to ask about the video.
        youtube_url: URL of the video.

    Returns:
        The model's answer, or the fallback message when the tool is disabled
        or the request fails.
    """
    if not _VIDEO_TOOL_ENABLED:
        return _VIDEO_UNAVAILABLE_MESSAGE

    try:
        # Imported lazily so the module loads without the Gemini SDK.
        from google import genai
        from google.genai import types

        # Read the key from the environment rather than passing a literal.
        client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))
        response = client.models.generate_content(
            model="models/gemini-2.0-flash",
            contents=types.Content(
                parts=[
                    types.Part(file_data=types.FileData(file_uri=youtube_url)),
                    types.Part(text=question),
                ]
            ),
        )
    except Exception as e:
        print(e)
        return _VIDEO_UNAVAILABLE_MESSAGE
    return str(response.text)


def get_video_tool():
    """Wrap :func:`watch_video` in a ``FunctionTool``."""
    return FunctionTool.from_defaults(
        fn=watch_video,
        description="Answer a question about a YouTube video identified by its url.",
    )


def _build_file_url(task_id: str) -> str:
    """Return the download URL of the attachment belonging to ``task_id``."""
    return DEFAULT_API_URL + f"/files/{task_id}"


def _get_file(task_id: str) -> io.BytesIO:
    """Download an attachment into memory.

    Args:
        task_id: Id of the attachment on the scoring API.

    Returns:
        An in-memory binary stream positioned at the start of the content.

    Raises:
        FileNotFoundError: If the server answers with a non-200 status.
    """
    res = requests.get(_build_file_url(task_id), timeout=_REQUEST_TIMEOUT_SECONDS)
    if res.status_code != 200:
        raise FileNotFoundError("Invalid file or task id.")
    return io.BytesIO(res.content)


def remove_think(output: str) -> str:
    """Remove the ``<think>...</think>`` part of an LLM output.

    Args:
        output: Raw model output; may be empty or ``None``.

    Returns:
        The output without reasoning blocks, stripped of surrounding
        whitespace. Falsy input is returned unchanged.
    """
    if output:
        # Only the tagged block is removed; matching everything would wipe
        # the answer itself.
        return _THINK_BLOCK_RE.sub("", output).strip()
    return output


def read_txt_or_py_file(file_id: str) -> str:
    """Read a python or txt file as plain text and return its content.

    Args:
        file_id: Id of the attachment on the scoring API.

    Returns:
        The UTF-8 decoded file content, or an error message string when the
        file cannot be downloaded or is not valid UTF-8 text.
    """
    try:
        return _get_file(file_id).read().decode("utf-8")
    except Exception:
        return _INVALID_TEXT_MESSAGE


def get_read_file_tool():
    """Wrap :func:`read_txt_or_py_file` in a ``FunctionTool``."""
    return FunctionTool.from_defaults(
        fn=read_txt_or_py_file,
        description="Read a python or txt file as plain text and return its content.",
    )