guillaumefrd committed on
Commit
cb5664b
·
1 Parent(s): 79ec94d

add custom tools to query image and do ASR

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. app.py +17 -2
  3. config.py +8 -2
  4. custom_tools.py +63 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
app.py CHANGED
@@ -11,8 +11,9 @@ from llama_index.tools.wikipedia import WikipediaToolSpec
11
  from llama_index.tools.duckduckgo import DuckDuckGoSearchToolSpec
12
  from llama_index.tools.code_interpreter import CodeInterpreterToolSpec
13
 
14
- from prompt import custom_react_system_header_str
15
  from config import HF_MODEL_NAME, HF_PROVIDER
 
 
16
 
17
  # (Keep Constants as is)
18
  # --- Constants ---
@@ -43,9 +44,10 @@ class LLamaIndexAgent:
43
 
44
  # tools definition
45
  tool_spec_list = []
46
- # tool_spec_list += WikipediaToolSpec().to_tool_list()
47
  tool_spec_list += DuckDuckGoSearchToolSpec().to_tool_list()
48
  tool_spec_list += CodeInterpreterToolSpec().to_tool_list()
 
49
 
50
  # agent definition
51
  self.agent = ReActAgent(llm=llm, tools=tool_spec_list)
@@ -151,10 +153,23 @@ async def run_and_submit_all(profile: gr.OAuthProfile | None):
151
  for item in questions_data:
152
  task_id = item.get("task_id")
153
  question_text = item.get("question")
 
154
  if not task_id or question_text is None:
155
  print(f"Skipping item with missing task_id or question: {item}")
156
  continue
157
  try:
 
 
 
 
 
 
 
 
 
 
 
 
158
  submitted_answer = await agent(question_text)
159
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
160
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
 
11
  from llama_index.tools.duckduckgo import DuckDuckGoSearchToolSpec
12
  from llama_index.tools.code_interpreter import CodeInterpreterToolSpec
13
 
 
14
  from config import HF_MODEL_NAME, HF_PROVIDER
15
+ from prompt import custom_react_system_header_str
16
+ from custom_tools import query_image_tool, automatic_speech_recognition_tool
17
 
18
  # (Keep Constants as is)
19
  # --- Constants ---
 
44
 
45
  # tools definition
46
  tool_spec_list = []
47
+ tool_spec_list += WikipediaToolSpec().to_tool_list()
48
  tool_spec_list += DuckDuckGoSearchToolSpec().to_tool_list()
49
  tool_spec_list += CodeInterpreterToolSpec().to_tool_list()
50
+ tool_spec_list += [query_image_tool, automatic_speech_recognition_tool]
51
 
52
  # agent definition
53
  self.agent = ReActAgent(llm=llm, tools=tool_spec_list)
 
153
  for item in questions_data:
154
  task_id = item.get("task_id")
155
  question_text = item.get("question")
156
+ file_name = item.get("file_name")
157
  if not task_id or question_text is None:
158
  print(f"Skipping item with missing task_id or question: {item}")
159
  continue
160
  try:
161
+ if file_name:
162
+ # add the URL of the data source to the question (so that the agent can deal with it)
163
+ file_url = f"{DEFAULT_API_URL}/files/{task_id}"
164
+ question_text += f"\nFile URL: {file_url}"
165
+ # get the extension of the file to help the agent
166
+ try:
167
+ ext = file_name.split('.')[-1]
168
+ question_text += f" (.{ext} file)"
169
+ except:
170
+ pass
171
+
172
+ # call the agent
173
  submitted_answer = await agent(question_text)
174
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
175
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
config.py CHANGED
@@ -1,2 +1,8 @@
1
- HF_MODEL_NAME = "google/gemma-3-27b-it" # multimodal (need to handle image input)
2
- HF_PROVIDER = "nebius"
 
 
 
 
 
 
 
1
+ HF_MODEL_NAME = "google/gemma-3-27b-it"
2
+ HF_PROVIDER = "nebius"
3
+
4
+ # HF_MODEL_NAME = "Qwen/Qwen2.5-Coder-32B-Instruct"
5
+ # HF_PROVIDER = "hf-inference"
6
+
7
+ # HF_MODEL_NAME = "Qwen/Qwen3-32B"
8
+ # HF_PROVIDER = "hf-inference"
custom_tools.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # custom tools
2
+
3
+ from huggingface_hub import InferenceClient
4
+ from llama_index.core.tools import FunctionTool
5
+
6
+ # --- Functions --- #
7
+
8
def query_image(query: str, image_url: str) -> str:
    """Ask anything about an image using a Vision Language Model

    Args:
        query (str): the query about the image, e.g. how many persons are on the image?
        image_url (str): the URL to the image

    Returns:
        str: the text of the model's reply, or a message starting with
            "query_image failed:" when the request raises or the reply
            carries no text.
    """
    client = InferenceClient(provider="nebius")

    # A single user turn carrying both the text question and the image reference.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": query},
                {"type": "image_url", "image_url": {"url": image_url}},
            ],
        }
    ]

    try:
        completion = client.chat.completions.create(
            # "google/gemma-3-27b-it" was left commented out here as an alternative model.
            model="Qwen/Qwen2.5-VL-72B-Instruct",
            messages=messages,
            max_tokens=512,
        )
        # Return the reply text rather than the message object itself:
        # the function is declared to return str.
        answer = completion.choices[0].message.content
    except Exception as e:
        # Best effort: report the failure as text instead of raising.
        return f"query_image failed: {e}"

    if answer is None:
        return "query_image failed: the model returned no text content"
    return answer
44
+
45
+
46
def automatic_speech_recognition(file_url: str) -> str:
    """Transcribe an audio file to text

    Args:
        file_url (str): the URL to the audio file

    Returns:
        str: the transcribed text, or a message starting with
            "automatic_speech_recognition failed:" when the request raises.
    """
    client = InferenceClient(provider="fal-ai")
    try:
        result = client.automatic_speech_recognition(file_url, model="openai/whisper-large-v3")
        # The client returns an output object; the transcription itself is
        # in its `text` field, which is what a str-returning function must return.
        return result.text
    except Exception as e:
        # Best effort: report the failure as text instead of raising.
        return f"automatic_speech_recognition failed: {e}"
57
+
58
+
59
+
60
+ ### --- Tool instance ---
61
+
62
# Wrap each plain function in a FunctionTool instance.
automatic_speech_recognition_tool = FunctionTool.from_defaults(fn=automatic_speech_recognition)
query_image_tool = FunctionTool.from_defaults(fn=query_image)