Martin Bär committed on
Commit
6fec0c8
·
1 Parent(s): 2fa94b3

Add multimodality tools

Browse files
Files changed (4) hide show
  1. app.py +2 -1
  2. basic_agent.py +16 -22
  3. multimodality_tools.py +155 -0
  4. requirements.txt +8 -1
app.py CHANGED
@@ -12,6 +12,7 @@ from basic_agent import BasicAgent
12
  # --- Constants ---
13
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
14
 
 
15
  Settings.llm = None # disable LLM for Index Retrieval
16
  Settings.chunk_size = 512 # Smaller chunk size for retrieval
17
 
@@ -78,7 +79,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
78
  print(f"Skipping item with missing task_id or question: {item}")
79
  continue
80
  try:
81
- submitted_answer = agent(question_text)
82
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
83
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
84
  except Exception as e:
 
12
  # --- Constants ---
13
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
14
 
15
+ # For Llamaindex's LoadAndSearchTool
16
  Settings.llm = None # disable LLM for Index Retrieval
17
  Settings.chunk_size = 512 # Smaller chunk size for retrieval
18
 
 
79
  print(f"Skipping item with missing task_id or question: {item}")
80
  continue
81
  try:
82
+ submitted_answer = agent(question_text, task_id)
83
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
84
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
85
  except Exception as e:
basic_agent.py CHANGED
@@ -1,18 +1,16 @@
1
- from llama_index.core.agent.workflow import AgentWorkflow
2
- from llama_index.core.workflow import Context
3
  from llama_index.core.tools import FunctionTool
4
  from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
5
  from llama_index.tools.duckduckgo import DuckDuckGoSearchToolSpec
6
  from llama_index.tools.wikipedia import WikipediaToolSpec
7
- from llama_index.core.tools.tool_spec.load_and_search import LoadAndSearchToolSpec
8
- from llama_index.readers.web import SimpleWebPageReader
9
- from llama_index.core.tools.ondemand_loader_tool import OnDemandLoaderTool
10
  from langfuse.llama_index import LlamaIndexInstrumentor
11
  from llama_index.llms.ollama import Ollama
12
- from llama_index.core.agent.workflow import ReActAgent, FunctionAgent
 
 
 
13
 
14
  class BasicAgent:
15
- def __init__(self, ollama=False, langfuse=True):
16
  if not ollama:
17
  llm = HuggingFaceInferenceAPI(model_name="Qwen/Qwen2.5-Coder-32B-Instruct")
18
  else:
@@ -28,9 +26,6 @@ class BasicAgent:
28
  tool_spec = DuckDuckGoSearchToolSpec()
29
  search_tool = FunctionTool.from_defaults(tool_spec.duckduckgo_full_search)
30
 
31
- wiki_spec = WikipediaToolSpec()
32
- wiki_search_tool = wiki_spec.to_tool_list()[1]
33
-
34
  # Convert into a LoadAndSearchToolSpec because the wikipedia search tool returns
35
  # entire Wikipedia pages and this can pollute the context window of the LLM
36
  wiki_spec = WikipediaToolSpec()
@@ -38,18 +33,13 @@ class BasicAgent:
38
 
39
  # Convert into a LoadAndSearchToolSpec because the wikipedia search tool returns
40
  # entire Wikipedia pages and this can pollute the context window of the LLM
41
-
42
  # TODO this does not work so well. We need to make the retriever return the top 5 chunks or sth.
43
- wiki_search_tool_las = LoadAndSearchToolSpec.from_defaults(wiki_search_tool).to_tool_list()
44
-
45
- webpage_tool = OnDemandLoaderTool.from_defaults(
46
- SimpleWebPageReader(html_to_text=True),
47
- name="Webpage search tool",
48
- description="A tool for loading the content of a webpage and querying it for information",
49
- )
50
 
51
- self.agent = AgentWorkflow.from_tools_or_functions( # ReActAgent(
52
- tools=[search_tool], # webpage_tool does not work properly - cookies etc
 
 
53
  llm=llm,
54
  verbose=True,
55
  system_prompt = (
@@ -68,8 +58,12 @@ class BasicAgent:
68
 
69
  # self.ctx = Context(self.agent)
70
 
71
- async def __call__(self, question: str) -> str:
72
- response = await self.agent.run(user_msg=question) # ctx=self.ctx)
 
 
 
 
73
 
74
  if self.langfuse:
75
  self.instrumentor.flush()
 
 
 
1
  from llama_index.core.tools import FunctionTool
2
  from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
3
  from llama_index.tools.duckduckgo import DuckDuckGoSearchToolSpec
4
  from llama_index.tools.wikipedia import WikipediaToolSpec
 
 
 
5
  from langfuse.llama_index import LlamaIndexInstrumentor
6
  from llama_index.llms.ollama import Ollama
7
+ from llama_index.core.agent.workflow import FunctionAgent
8
+
9
+ from multimodality_tools import get_image_qa_tool, get_transcription_tool, \
10
+ get_excel_analysis_tool, get_excel_tool, get_csv_analysis_tool, get_csv_tool
11
 
12
  class BasicAgent:
13
+ def __init__(self, ollama=False, langfuse=False):
14
  if not ollama:
15
  llm = HuggingFaceInferenceAPI(model_name="Qwen/Qwen2.5-Coder-32B-Instruct")
16
  else:
 
26
  tool_spec = DuckDuckGoSearchToolSpec()
27
  search_tool = FunctionTool.from_defaults(tool_spec.duckduckgo_full_search)
28
 
 
 
 
29
  # Convert into a LoadAndSearchToolSpec because the wikipedia search tool returns
30
  # entire Wikipedia pages and this can pollute the context window of the LLM
31
  wiki_spec = WikipediaToolSpec()
 
33
 
34
  # Convert into a LoadAndSearchToolSpec because the wikipedia search tool returns
35
  # entire Wikipedia pages and this can pollute the context window of the LLM
 
36
  # TODO this does not work so well. We need to make the retriever return the top 5 chunks or sth.
37
+ # wiki_search_tool_las = LoadAndSearchToolSpec.from_defaults(wiki_search_tool).to_tool_list()
 
 
 
 
 
 
38
 
39
+ self.agent = FunctionAgent(
40
+ tools=[search_tool, wiki_search_tool, get_image_qa_tool(),
41
+ get_transcription_tool(), get_excel_analysis_tool(), get_excel_tool(),
42
+ get_csv_analysis_tool(), get_csv_tool()],
43
  llm=llm,
44
  verbose=True,
45
  system_prompt = (
 
58
 
59
  # self.ctx = Context(self.agent)
60
 
61
+ async def __call__(self, question: str, task_id: str = None) -> str:
62
+ file_str = ""
63
+ if task_id:
64
+ file_str = f'\nIf you need to load a file, do so by providing the id "{task_id}".'
65
+
66
+ response = await self.agent.run(user_msg=question + file_str) # ctx=self.ctx)
67
 
68
  if self.langfuse:
69
  self.instrumentor.flush()
multimodality_tools.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tools to handle multimodal understanding."""
2
+
3
+ import os
4
+ import io
5
+ import requests
6
+
7
+ import librosa
8
+ import soundfile as sf
9
+ import pandas as pd
10
+ from llama_index.core.tools import FunctionTool
11
+ from huggingface_hub import InferenceClient
12
+ from transformers import pipeline
13
+
14
+ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
15
+
16
def transcribe_audio(file_id: str) -> str:
    """
    Transcribes an English audio file identified by its id.

    Args:
        file_id: Id of the file, fetched through ``_get_file``.

    Returns:
        The stripped transcription text, or an error message string when the
        file cannot be downloaded or decoded as audio.
    """
    try:
        audio, sr = sf.read(_get_file(file_id))
        # soundfile yields a (frames, channels) array for multi-channel files,
        # while the ASR pipeline only accepts a single-channel 1-D signal.
        # Downmix before resampling so the time axis is the one resampled.
        if audio.ndim > 1:
            audio = audio.mean(axis=1)
        if sr != 16000:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
    except Exception:
        # Report the failure as text so the calling agent can react to it
        # instead of crashing; unlike a bare ``except`` this does not swallow
        # KeyboardInterrupt / SystemExit.
        return "Error: Invalid file. This file is either not an audio file or the id does not exist."

    asr = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

    # Decoding options such as the language are forwarded to generate() via
    # ``generate_kwargs``; the pipeline call does not take ``language`` directly.
    output = asr(audio, generate_kwargs={"language": "en"})

    return output["text"].strip()
32
+
33
def transcribe_audio_hf(file_id: str) -> str:
    """
    Transcribes an audio file identified by its id using the HF Inference API.

    Args:
        file_id: Id of the file, fetched through ``_get_file``.

    Returns:
        The transcription text, or an error message string when the file
        cannot be downloaded.
    """
    try:
        audio_bytes = _get_file(file_id).read()
    except Exception:
        # Report the failure as text so the calling agent can react to it;
        # unlike a bare ``except`` this lets KeyboardInterrupt propagate.
        return "Error: Invalid file. This file is either not an audio file or the id does not exist."

    client = InferenceClient(
        provider="hf-inference",
        api_key=os.getenv("HF_TOKEN"),
    )

    output = client.automatic_speech_recognition(audio_bytes, model="openai/whisper-small")
    # The client returns a structured output object carrying the transcript in
    # ``text``; unwrap it so the function really returns ``str`` as annotated.
    # A plain string (older client behaviour) is passed through unchanged.
    return output if isinstance(output, str) else output.text
50
+
51
def get_transcription_tool():
    """Wrap ``transcribe_audio`` in a ``FunctionTool`` for use by an agent."""
    tool_description = "Transcribes an audio file identified by its id."
    return FunctionTool.from_defaults(fn=transcribe_audio, description=tool_description)
56
+
57
+
58
def answer_image_question(question: str, file_id: str) -> str:
    """
    Answers questions about an image identified by its id.

    The image is not downloaded here: its URL under ``DEFAULT_API_URL`` is
    sent, together with the question, in a single user chat message.

    Args:
        question: The question to ask about the image.
        file_id: Id used to build the image URL ``<DEFAULT_API_URL>/files/<file_id>``.

    Returns:
        The message content of the first completion choice.
    """
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"{DEFAULT_API_URL}/files/{file_id}"},
    }
    text_part = {"type": "text", "text": question}
    user_message = {"role": "user", "content": [text_part, image_part]}

    hf_client = InferenceClient(provider="hf-inference", api_key=os.getenv("HF_TOKEN"))
    response = hf_client.chat.completions.create(
        model="Qwen/Qwen2.5-VL-32B-Instruct",
        messages=[user_message],
        max_tokens=512,
    )

    first_choice = response.choices[0]
    return first_choice.message.content
90
+
91
def get_image_qa_tool():
    """Wrap ``answer_image_question`` in a ``FunctionTool`` for use by an agent."""
    tool_description = "Answer a question about a given image. The image is identified by a file id."
    return FunctionTool.from_defaults(fn=answer_image_question, description=tool_description)
96
+
97
def read_excel(file_id: str) -> str:
    """Fetch the Excel file with the given id and render it as a markdown table."""
    return pd.read_excel(_get_file(file_id)).to_markdown()
101
+
102
def get_excel_tool():
    """Wrap ``read_excel`` in a ``FunctionTool`` for use by an agent."""
    tool_description = "Convert an excel file that is identified by its file id into a markdown string."
    return FunctionTool.from_defaults(fn=read_excel, description=tool_description)
107
+
108
def analyse_excel(file_id: str) -> str:
    """
    Compute summary statistics for an Excel file identified by its id.

    Args:
        file_id: Id of the file, fetched through ``_get_file``.

    Returns:
        The output of ``DataFrame.describe()`` rendered as a markdown table.
    """
    file_io = _get_file(file_id)
    df = pd.read_excel(file_io)
    # describe() yields a DataFrame; render it as markdown so the function
    # returns ``str`` as annotated, in the same format ``read_excel`` uses.
    return df.describe().to_markdown()
112
+
113
def get_excel_analysis_tool():
    """Wrap ``analyse_excel`` in a ``FunctionTool`` for use by an agent."""
    return FunctionTool.from_defaults(
        # Must be the statistics function: wrapping ``read_excel`` here would
        # make this tool a duplicate of the plain Excel-to-markdown tool.
        fn=analyse_excel,
        description="Analyse an excel file that is identified by its file id and get common statistics such as mean or max per column."
    )
118
+
119
def read_csv(file_id: str) -> str:
    """Fetch the CSV file with the given id and render it as a markdown table."""
    return pd.read_csv(_get_file(file_id)).to_markdown()
123
+
124
def get_csv_tool():
    """Wrap ``read_csv`` in a ``FunctionTool`` for use by an agent."""
    return FunctionTool.from_defaults(
        # Must be the CSV reader: ``read_excel`` parses the bytes as an Excel
        # workbook, which is the wrong parser for CSV content.
        fn=read_csv,
        description="Convert a csv file that is identified by its file id into a markdown string."
    )
129
+
130
def analyse_csv(file_id: str) -> str:
    """
    Compute summary statistics for a CSV file identified by its id.

    Args:
        file_id: Id of the file, fetched through ``_get_file``.

    Returns:
        The output of ``DataFrame.describe()`` rendered as a markdown table.
    """
    file_io = _get_file(file_id)
    df = pd.read_csv(file_io)
    # describe() yields a DataFrame; render it as markdown so the function
    # returns ``str`` as annotated, in the same format ``read_csv`` uses.
    return df.describe().to_markdown()
134
+
135
def get_csv_analysis_tool():
    """Wrap ``analyse_csv`` in a ``FunctionTool`` for use by an agent."""
    return FunctionTool.from_defaults(
        # Must be the CSV statistics function: ``read_excel`` neither parses
        # CSV content nor computes any statistics.
        fn=analyse_csv,
        description="Analyse a csv file that is identified by its file id and get common statistics such as mean or max per column."
    )
140
+
141
def watch_video(video_url: str) -> str:
    """
    Placeholder for a video understanding tool.

    Args:
        video_url: URL of the video; currently ignored.

    Returns:
        A fixed instruction telling the agent that it cannot watch videos and
        should answer "I don't know".
    """
    return "You are not able to watch a Video yet. Reply with 'I don't know' to the question."
143
+
144
def get_video_tool():
    """Wrap ``watch_video`` in a ``FunctionTool`` for use by an agent."""
    tool_description = "Watch a video and get a content description as a string."
    return FunctionTool.from_defaults(fn=watch_video, description=tool_description)
149
+
150
def _get_file(task_id: str) -> io.BytesIO:
    """
    Download the file attached to a task and return it as an in-memory buffer.

    Args:
        task_id: Id appended to ``<DEFAULT_API_URL>/files/`` to build the URL.

    Returns:
        A ``BytesIO`` holding the response body.

    Raises:
        FileNotFoundError: If the server answers with a status other than 200.
    """
    # Without a timeout a stalled server would block the calling tool forever.
    res = requests.get(DEFAULT_API_URL + f"/files/{task_id}", timeout=30)
    if res.status_code != 200:
        raise FileNotFoundError("Invalid file or task id.")
    file_like = io.BytesIO(res.content)
    return file_like
requirements.txt CHANGED
@@ -7,4 +7,11 @@ llama_index-tools-wikipedia
7
  llama-index-embeddings-huggingface
8
  llama-index-readers-web
9
  llama-index-llms-ollama
10
- langfuse
 
 
 
 
 
 
 
 
7
  llama-index-embeddings-huggingface
8
  llama-index-readers-web
9
  llama-index-llms-ollama
10
+ langfuse
11
+ tabulate
12
+ soundfile
13
+ librosa
14
+ pillow
15
+ pandas
16
+ huggingface_hub
17
+ transformers