robinsmits committed on
Commit
b800e08
Β·
1 Parent(s): 401718e

Multi Agent Setup

Browse files
Files changed (5) hide show
  1. README.md +9 -7
  2. agents.py +100 -0
  3. app.py +234 -0
  4. requirements.txt +14 -0
  5. tooling.py +302 -0
README.md CHANGED
@@ -1,13 +1,15 @@
1
  ---
2
- title: Agents Course Final Project
3
- emoji: 🏒
4
- colorFrom: red
5
- colorTo: gray
6
  sdk: gradio
7
- sdk_version: 5.33.2
8
  app_file: app.py
9
  pinned: false
10
- license: mit
 
 
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Template Final Assignment
3
+ emoji: πŸ•΅πŸ»β€β™‚οΈ
4
+ colorFrom: indigo
5
+ colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 5.25.2
8
  app_file: app.py
9
  pinned: false
10
+ hf_oauth: true
11
+ # optional, default duration is 8 hours/480 minutes. Max duration is 30 days/43200 minutes.
12
+ hf_oauth_expiration_minutes: 480
13
  ---
14
 
15
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
agents.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import Modules
2
+ import os
3
+ import pandas as pd
4
+ import torch
5
+ from smolagents import LiteLLMModel, OpenAIServerModel
6
+ from smolagents import (ToolCallingAgent,
7
+ CodeAgent,
8
+ DuckDuckGoSearchTool,
9
+ VisitWebpageTool,
10
+ WikipediaSearchTool,
11
+ FinalAnswerTool,
12
+ PythonInterpreterTool)
13
+
14
+ # Custom Modules
15
+ from tooling import (vision_language_tool,
16
+ read_excel_tool,
17
+ speech_to_text_tool,
18
+ youtube_captions_tool)
19
+
20
# Agent Model
# Single shared LLM instance reused by every agent created below.
# NOTE(review): requires the OPENAI_KEY environment variable (Space secret);
# os.getenv returns None when it is missing — confirm it is configured.
model = OpenAIServerModel(model_id = "gpt-4.1",
                          api_key = os.getenv('OPENAI_KEY'))
23
+
24
# Create Vision Agent
def create_vision_agent():
    """Build the ToolCallingAgent that answers questions about images.

    Returns:
        ToolCallingAgent: agent wrapping vision_language_tool, intended to be
        attached to the manager agent as a managed agent.
    """
    # The description below is read by the manager LLM when deciding whether
    # to delegate to this agent; grammar fixed so the instruction is clear.
    return ToolCallingAgent(model = model,
                            tools = [FinalAnswerTool(),
                                     vision_language_tool],
                            name = 'vision_agent',
                            planning_interval = 2,
                            verbosity_level = 2,
                            max_steps = 4,
                            provide_run_summary = True,
                            description = """
                            A team member that will use a vision language model to answer a question about an image.
                            Ask it for all your questions that require answering a question about a picture or image.
                            Provide the file name of the image and the specific question that you want it to answer.
                            """)
40
+
41
# Create Web Agent
def create_web_agent():
    """Return the CodeAgent specialised in web search and page reading."""
    # Tools for searching, visiting and parsing web content.
    web_tools = [
        FinalAnswerTool(),
        DuckDuckGoSearchTool(max_results = 15),
        VisitWebpageTool(max_output_length = 75000),
        WikipediaSearchTool(user_agent = "FinalAssignmentResearchBot ([email protected])",
                            language = "en",
                            content_type = "text",
                            extract_format = "WIKI"),
    ]

    # Modules the generated code snippets are allowed to import.
    allowed_imports = ["json", "pandas", "re", "bs4", "requests",
                       "numpy", "math", "xml", "scikit-learn"]

    # Read by the manager agent when deciding to delegate to this agent.
    agent_description = """
            A team member that will use various tools to search for websites, to visit websites and to parse and read information from websites.
            Every question that requires to retrieve information from the internet to be answered must be answered by using the web_agent.
            The gathered information to create the final answer will be reported back to the manager_agent.
            """

    return CodeAgent(model = model,
                     tools = web_tools,
                     additional_authorized_imports = allowed_imports,
                     name = 'web_agent',
                     planning_interval = 3,
                     verbosity_level = 2,
                     max_steps = 12,
                     provide_run_summary = True,
                     description = agent_description)
71
+
72
# Create Manager Agent
def create_manager_agent():
    """Assemble the top-level CodeAgent that orchestrates the managed agents."""
    # Delegate agents the manager can hand work to.
    vision_agent = create_vision_agent()
    web_agent = create_web_agent()

    # Tools the manager can call directly itself.
    manager_tools = [FinalAnswerTool(),
                     PythonInterpreterTool(),
                     speech_to_text_tool,
                     youtube_captions_tool,
                     read_excel_tool]

    # Modules the generated code snippets are allowed to import.
    allowed_imports = ['json', 'pandas', 're', 'bs4', 'requests',
                       'numpy', 'math', 'xml', 'scikit-learn']

    # Return Manager Agent
    return CodeAgent(model = model,
                     tools = manager_tools,
                     name = 'manager_agent',
                     additional_authorized_imports = allowed_imports,
                     planning_interval = 3,
                     verbosity_level = 2,
                     stream_outputs = True,
                     max_steps = 12,
                     provide_run_summary = True,
                     managed_agents = [vision_agent, web_agent])
app.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import requests
4
+ import inspect
5
+ import pandas as pd
6
+ import gc
7
+ import json
8
+
9
+ # Custom
10
+ from tooling import (check_for_file_name_and_return_prompt,
11
+ get_manager_agent_prompt,
12
+ gradio_main_instructions)
13
+ from agents import create_manager_agent
14
+
15
+
16
+ # --- Constants ---
17
+ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
18
+
19
def run_and_submit_all( profile: gr.OAuthProfile | None):
    """
    Fetches all questions, runs the manager agent on them, submits all answers,
    and displays the results.

    Args:
        profile: OAuth profile of the logged-in Hugging Face user, or None
            when nobody is logged in.

    Returns:
        tuple: (status message str, pandas DataFrame of per-question results,
        or None for the DataFrame when the run aborted early).
    """
    # --- Determine HF Space Runtime URL and Repo URL ---
    space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code

    if profile:
        username = f"{profile.username}"
        print(f"User logged in: {username}")
    else:
        print("User not logged in.")
        return "Please Login to Hugging Face with the button.", None

    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"

    # 1. Instantiate Agent
    try:
        # Create Manager Agent
        manager_agent = create_manager_agent()
    except Exception as e:
        print(f"Error instantiating agent: {e}")
        return f"Error initializing agent: {e}", None

    # In the case of an app running as a Hugging Face space, this link points
    # toward your codebase (useful for others so please keep it public).
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
    print(agent_code)

    # 2. Fetch Questions
    print(f"Fetching questions from: {questions_url}")
    try:
        response = requests.get(questions_url, timeout=15)
        response.raise_for_status()
        questions_data = response.json()
        if not questions_data:
            print("Fetched questions list is empty.")
            return "Fetched questions list is empty or invalid format.", None
        print(f"Fetched {len(questions_data)} questions.")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching questions: {e}")
        return f"Error fetching questions: {e}", None
    except requests.exceptions.JSONDecodeError as e:
        print(f"Error decoding JSON response from questions endpoint: {e}")
        print(f"Response text: {response.text[:500]}")
        return f"Error decoding server response for questions: {e}", None
    except Exception as e:
        print(f"An unexpected error occurred fetching questions: {e}")
        return f"An unexpected error occurred fetching questions: {e}", None

    # 3. Run your Agent
    results_log = []
    answers_payload = []
    print(f"Running agent on {len(questions_data)} questions...")
    for index, item in enumerate(questions_data):
        print(f"Running question {index} {item.get('question')}")

        task_id = item.get("task_id")
        question_text = item.get("question")
        # BUGFIX: a missing "file_name" key yields None, which previously
        # passed the `!= ''` check and crashed the prompt helper; normalise
        # it to the empty string.
        file_name = item.get("file_name") or ''

        # BUGFIX: validate the item BEFORE downloading its file, so invalid
        # items do not trigger a pointless download.
        if not task_id or question_text is None:
            print(f"Skipping item with missing task_id or question: {item}")
            continue

        # File Check
        file_prompt = check_for_file_name_and_return_prompt(file_name)

        # File Download
        if file_name != '':
            # GET /files/{task_id}: Download a specific file associated with a given task ID.
            files_url = f"{api_url}/files/{task_id}"
            print(f"Fetching files for task_id: {task_id}")
            try:
                response = requests.get(files_url, stream=True, timeout=30)
                response.raise_for_status()

                # Save file to disk in chunks so large files do not need to
                # fit in memory at once.
                with open(file_name, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:  # filter out keep-alive chunks
                            f.write(chunk)
                print(f"File '{file_name}' downloaded and saved successfully.")

            except requests.exceptions.RequestException as e:
                print(f"Request error while fetching files: {e}")
                return f"Request error while fetching files: {e}", None
            except Exception as e:
                print(f"An unexpected error occurred while saving the file: {e}")
                return f"An unexpected error occurred while saving the file: {e}", None

        ################################################################################
        ###### RUN MANAGER AGENT
        ################################################################################
        try:
            # Run Manager Agent
            submitted_answer = manager_agent.run(get_manager_agent_prompt(question_text, file_prompt))

            # The scoring API only accepts str/int/float answers (a list is
            # rejected with HTTP 422 — see the log at the bottom of this
            # file), so stringify container results.
            if isinstance(submitted_answer, (list, dict)):
                submitted_answer = str(submitted_answer)

            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
        except Exception as e:
            print(f"Error running agent on task {task_id}: {e}")
            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})

    #################################################################################
    # Writing the list of dictionaries to a plain text file (overwriting the existing file)
    with open('results_log.txt', 'w') as file:
        json.dump(results_log, file, indent=4)

    if not answers_payload:
        print("Agent did not produce any answers to submit.")
        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)

    # 4. Prepare Submission
    submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
    status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
    print(status_update)

    # 5. Submit
    print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
    try:
        response = requests.post(submit_url, json=submission_data, timeout=60)
        response.raise_for_status()
        result_data = response.json()
        final_status = (
            f"Submission Successful!\n"
            f"User: {result_data.get('username')}\n"
            f"Overall Score: {result_data.get('score', 'N/A')}% "
            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
            f"Message: {result_data.get('message', 'No message received.')}"
        )
        print("Submission successful.")
        results_df = pd.DataFrame(results_log)
        return final_status, results_df
    except requests.exceptions.HTTPError as e:
        error_detail = f"Server responded with status {e.response.status_code}."
        try:
            error_json = e.response.json()
            error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
        except requests.exceptions.JSONDecodeError:
            error_detail += f" Response: {e.response.text[:500]}"
        status_message = f"Submission Failed: {error_detail}"
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df
    except requests.exceptions.Timeout:
        status_message = "Submission Failed: The request timed out."
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df
    except requests.exceptions.RequestException as e:
        status_message = f"Submission Failed: Network error - {e}"
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df
    except Exception as e:
        status_message = f"An unexpected error occurred during submission: {e}"
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df
189
+
190
+
191
# --- Build Gradio Interface using Blocks ---
# Simple one-button UI: log in, run the agent over all questions, show results.
with gr.Blocks() as demo:
    gr.Markdown("# Basic Agent Evaluation Runner")
    gr.Markdown(gradio_main_instructions)
    gr.LoginButton()

    run_button = gr.Button("Run Evaluation & Submit All Answers")

    status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)

    # Removed max_rows=10 from DataFrame constructor
    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)

    # No `inputs=` needed: gr.LoginButton makes the OAuth profile available
    # to run_and_submit_all via its gr.OAuthProfile parameter.
    run_button.click(fn = run_and_submit_all,
                     outputs = [status_output, results_table])

if __name__ == "__main__":
    print("\n" + "-"*30 + " App Starting " + "-"*30)
    # Check for SPACE_HOST and SPACE_ID at startup for information
    space_host_startup = os.getenv("SPACE_HOST")
    space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup

    if space_host_startup:
        print(f"✅ SPACE_HOST found: {space_host_startup}")
        print(f"   Runtime URL should be: https://{space_host_startup}.hf.space")
    else:
        print("ℹ️  SPACE_HOST environment variable not found (running locally?).")

    if space_id_startup: # Print repo URLs if SPACE_ID is found
        print(f"✅ SPACE_ID found: {space_id_startup}")
        print(f"   Repo URL: https://huggingface.co/spaces/{space_id_startup}")
        print(f"   Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
    else:
        print("ℹ️  SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")

    print("-"*(60 + len(" App Starting ")) + "\n")

    print("Launching Gradio Interface for Basic Agent Evaluation...")
    demo.launch(debug=True, share=False)


# NOTE(review): leftover log of a real failed submission, kept for reference —
# it documents why run_and_submit_all stringifies list/dict answers before
# submitting (the scoring API rejects non-scalar answers with HTTP 422).
"""
Submission Failed: Server responded with status 422. Detail: [{'type': 'string_type', 'loc': ['body', 'answers', 13, 'submitted_answer', 'str'], 'msg': 'Input should be a valid string', 'input': ['45', '50', '67', '89']}, {'type': 'int_type', 'loc': ['body', 'answers', 13, 'submitted_answer', 'int'], 'msg': 'Input should be a valid integer', 'input': ['45', '50', '67', '89']}, {'type': 'float_type', 'loc': ['body', 'answers', 13, 'submitted_answer', 'float'], 'msg': 'Input should be a valid number', 'input': ['45', '50', '67', '89']}]
"""
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio[oauth]
2
+ numpy
3
+ openpyxl
4
+ pandas
5
+ requests
6
+ smolagents[all]
7
+ autoawq
8
+ transformers==4.51.3
9
+ scikit-learn
10
+ wikipedia-api
11
+ num2words==0.5.14
12
+ yt-dlp
13
+ librosa
14
+ soundfile
tooling.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://github.com/huggingface/smolagents/blob/v1.17.0/src/smolagents/default_tools.py#L479
2
+
3
+ # Import Modules
4
+ import os
5
+ import pandas as pd
6
+ import yt_dlp
7
+ import re
8
+
9
+ # Smolagents
10
+ import torch
11
+ from transformers import AutoProcessor, AutoModelForVision2Seq
12
+ from smolagents import (tool)
13
+ from smolagents.tools import PipelineTool
14
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
15
+ import librosa
16
+ import numpy as np
17
+
18
+
19
# Markdown shown at the top of the Gradio app (user-facing text).
# Typos fixed: broken quoting around "Submit" and "seperate" -> "separate".
gradio_main_instructions = """
**Instructions:**

1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.

---
**Disclaimers:**
Once you click the "Submit" button, it can take quite some time (this is the time for the agent to go through all the questions).
This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a separate action or even to answer the questions in async.
"""
31
+
32
def get_manager_agent_prompt(question_text, file_prompt):
    """
    Assemble the full prompt handed to the manager agent for one question.

    Args:
        question_text: The benchmark question to answer.
        file_prompt: File-information section produced by
            check_for_file_name_and_return_prompt (or the no-file notice).

    Returns:
        The formatted prompt string (typos in the answer-format rules fixed:
        "depending of" -> "depending on", "than make" -> "then make").
    """
    return f"""
    # Objective:
    Your task is to analyze the following question and to provide a final answer.

    {file_prompt}

    # Question:
    {question_text}

    # Final Answer requirements:
    The final answer should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
    If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
    If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
    If you are asked for a comma separated list, apply the above rules depending on whether the element to be put in the list is a number or a string.

    !! Note !! If the question itself mentions specific instructions for how the answer should be formatted then make absolutely sure those are also applied to the answer!!
    """
50
+
51
def check_for_file_name_and_return_prompt(file_name):
    """
    Build the file-information section of the manager agent prompt.

    Args:
        file_name: Name of the file attached to the question; '' or None when
            the question has no attachment.

    Returns:
        str: a prompt fragment describing the attached file and how to use it,
        or a no-file notice when there is no attachment. Always returns a
        string (previously an unrecognised extension fell through and returned
        None, which rendered as "None" inside the prompt).
    """
    # BUGFIX: treat None like '' so a missing "file_name" key cannot crash
    # the substring checks below.
    if not file_name:
        return 'For this question there is no file with additional information available.'

    # Detect File Type
    if '.xlsx' in file_name:
        file_type = 'Excel Sheet'
        return f"""
        # File Information
        For this question there is a file named "{file_name}" with additional information related to the question available.
        The specific file is of type: {file_type}.
        The file is already downloaded and available for use.
        Load the file based on the file name with the pandas python library or use the read_excel_tool. Choose what works best for you.
        Carefully load the file and use its content in the best and correct way possible to help you answer the question."""
    elif '.csv' in file_name:
        file_type = 'CSV File'
        return f"""
        # File Information
        For this question there is a file named "{file_name}" with additional information related to the question available.
        The specific file is of type: {file_type}.
        The file is already downloaded and available for use.
        Load the file based on the file name with the pandas python library.
        Carefully load the file and use its content in the best and correct way possible to help you answer the question."""
    elif '.mp3' in file_name:
        file_type = 'MP3 Audio File'
        return f"""
        # File Information
        For this question there is a file named '{file_name}' with additional information related to the question available.
        The specific file is of type: {file_type}.
        The file is already downloaded and available for use with the available tools to load the specific file.
        Carefully load the file and use its content in the best and correct way possible to help you answer the question.
        If the file name mentioned specifically in the question is different from the following file name '{file_name}' then keep using the following file name: '{file_name}'.
        """
    elif '.png' in file_name:
        file_type = 'PNG Image File'
        return f"""
        # File Information
        For this question there is a file named "{file_name}" with additional information related to the question available.
        The specific file is of type: {file_type}.
        The file is already downloaded and available for use. Use the 'vision_agent' to load the file and answer the question.
        Make sure to pass the file name and question!!"""
    elif '.py' in file_name:
        file_type = 'Python Script File'
        # Inline the script so the model can read it without tooling.
        with open(file_name, "r") as py_file:
            python_script_contents = py_file.read()
        return f"""
        # File Information
        For this question there is a file named '{file_name}' with additional information related to the question available.
        The specific file is of type: {file_type}.
        The file is already downloaded and available for use with the available tools to load the specific file.

        As an extra service below is the content of the Python Script File also visible.

        # Python Script File Content
        ```
        {python_script_contents}
        ```
        """

    # BUGFIX: generic fallback for any other extension instead of implicitly
    # returning None.
    return f"""
    # File Information
    For this question there is a file named "{file_name}" with additional information related to the question available.
    The file is already downloaded and available for use with the available tools to load the specific file.
    Carefully load the file and use its content in the best and correct way possible to help you answer the question."""
109
+
110
# Create Models for Vision Tool
# Loaded once at import time so every vision_language_tool call reuses them.
# NOTE(review): device is hard-coded to "cuda" — .to(device) will fail on a
# CPU-only machine; confirm the Space actually runs on GPU hardware.
device = "cuda"
vision_model_path = "ibm-granite/granite-vision-3.2-2b"
vision_processor = AutoProcessor.from_pretrained(vision_model_path)
vision_model = AutoModelForVision2Seq.from_pretrained(vision_model_path,
                                                      torch_dtype = torch.bfloat16).to(device)
116
+
117
@tool
def vision_language_tool(question: str, file_name: str) -> str:
    """
    This vision language tool will load any image based on the provided file_name and will answer the question that is provided.
    Args:
        question: A string that contains the question that we need to answer about the image.
        file_name: A string containing the image file name.
    Returns:
        A string containing the answer to the question.
    """

    prompt = f"""
    You are provided with an image.

    Answer the following question about the image very specifically and in detail:

    {question}"""
    # Debug: show which files are present in the working directory.
    print(f"vlt: {os.listdir('./')}")
    conversation = [
        {
            "role": "user",
            "content": [{"type": "image", "url": file_name}, {"type": "text", "text": prompt}],
        },
    ]
    inputs = vision_processor.apply_chat_template(conversation,
                                                  add_generation_prompt = True,
                                                  tokenize = True,
                                                  return_dict = True,
                                                  return_tensors = "pt").to(device)

    # Autoregressively complete the prompt.
    model_output = vision_model.generate(**inputs,
                                         max_new_tokens = 1024,
                                         temperature = 0.2,
                                         do_sample = True,
                                         top_p = 0.975,
                                         top_k = 75,
                                         min_p = 0.05,
                                         repetition_penalty = 1.15)
    # BUGFIX: generate() on this decoder-style VLM returns prompt + completion
    # in one sequence; decoding model_output[0] in full echoed the prompt back
    # as part of the answer. Decode only the newly generated tokens.
    generated_tokens = model_output[0][inputs["input_ids"].shape[-1]:]
    answer = vision_processor.decode(generated_tokens, skip_special_tokens = True)

    return answer
160
+
161
@tool
def speech_to_text_tool(file_name: str) -> str:
    """
    This speech to text tool will use the provided file name to load an mp3 audio file and output a transcription of the audio file as a text string.
    Args:
        file_name: A string containing the audio file name.
    Returns:
        A string containing the transcribed text of the audio file.
    """

    # Load model and processor (whisper-small runs acceptably on CPU here).
    model_name = "openai/whisper-small"
    processor = WhisperProcessor.from_pretrained(model_name)
    model = WhisperForConditionalGeneration.from_pretrained(model_name).to('cpu')
    model.config.forced_decoder_ids = None

    # Load and resample audio to 16kHz mono — Whisper's expected input format.
    speech_array, sampling_rate = librosa.load(file_name, sr = 16000, mono=True)

    # BUGFIX: an empty audio file previously crashed on chunks[-1] below.
    if len(speech_array) == 0:
        return ""

    # Define chunk size: 30 seconds at 16kHz = 480000 samples (Whisper window)
    chunk_size = 30 * 16000  # 480000

    # Split into 30-second chunks
    chunks = [
        speech_array[i:i+chunk_size]
        for i in range(0, len(speech_array), chunk_size)
    ]

    # Pad last chunk if it's shorter
    if len(chunks[-1]) < chunk_size:
        chunks[-1] = np.pad(chunks[-1], (0, chunk_size - len(chunks[-1])))

    # Prepare input features in batch
    input_features = processor(chunks, sampling_rate=16000, return_tensors="pt").input_features

    # Generate predictions in batch
    predicted_ids = model.generate(input_features)

    # Decode all chunks and concatenate
    transcribed_texts = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    full_transcription = " ".join([t.strip() for t in transcribed_texts])

    return full_transcription
204
+
205
@tool
def youtube_captions_tool(youtube_video_url: str) -> str:
    """
    This youtube captions tool will use a youtube video url to retrieve the captions and output them as a string containing the conversations in the video.
    Args:
        youtube_video_url: A string containing the url for a youtube video from which the captions will be retrieved.
    Returns:
        A string containing the captions of the youtube video url.
    """

    # yt-dlp writes the caption track to caption.<lang>.vtt; the media itself
    # is not downloaded (skip_download).
    outtmpl = "caption.%(ext)s"
    ydl_opts = {
        'writesubtitles': True,        # manually uploaded captions
        'writeautomaticsub': True,     # fall back to auto-generated captions
        'subtitleslangs': ['en'],
        'skip_download': True,
        'outtmpl': outtmpl,
        'quiet': True
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(youtube_video_url, download=True)
    # The language suffix depends on the video's caption track
    # (e.g. "en" vs "en-US"), so probe both known names.
    vtt_filename = None
    for ext in ('en.vtt', 'en-US.vtt'):
        if os.path.isfile(f'caption.{ext}'):
            vtt_filename = f'caption.{ext}'
            break
    if not vtt_filename:
        raise FileNotFoundError("Could not find English captions (.vtt) after download.")
    with open(vtt_filename, encoding='utf-8') as f:
        vtt_content = f.read()
    os.remove(vtt_filename)  # clean up the temporary caption file

    # Remove headers and unnecessary metadata
    vtt_content = re.sub(r'WEBVTT.*?\n', '', vtt_content, flags=re.DOTALL)
    vtt_content = re.sub(r'^Kind:.*\n?', '', vtt_content, flags=re.MULTILINE)
    vtt_content = re.sub(r'^Language:.*\n?', '', vtt_content, flags=re.MULTILINE)
    vtt_content = re.sub(r'^NOTE.*\n?', '', vtt_content, flags=re.MULTILINE)
    vtt_content = re.sub(r'X-TIMESTAMP.*', '', vtt_content)
    vtt_content = re.sub(r'\[.*?\]', '', vtt_content)  # e.g. [Music], [Applause]
    vtt_content = re.sub(r'<.*?>', '', vtt_content) # Remove tags like <c> and <00:00:01.000>

    # Split by lines, remove lines that are timestamps, metadata, or blank
    cleaned_lines = []
    last_line = None
    for line in vtt_content.splitlines():
        line = line.strip()
        if not line:
            continue # Skip blank lines
        if re.match(r'^\d{2}:\d{2}:\d{2}\.\d{3} -->', line):
            continue # Skip timestamps
        if re.match(r'^\d+$', line):
            continue # Skip sequence numbers
        if 'align:' in line or 'position:' in line:
            # Remove align/position metadata but keep the actual text
            line = re.sub(r'align:[^\s]+', '', line)
            line = re.sub(r'position:[^\s]+', '', line)
            line = line.strip()
            if not line:
                continue
        if line == last_line:
            continue # Deduplicate consecutive lines (auto-captions repeat text)
        cleaned_lines.append(line)
        last_line = line
    captions = '\n'.join(cleaned_lines).strip()

    return captions
271
+
272
@tool
def read_excel_tool(file_name: str) -> str:
    """
    This read excel tool will use the provided file name to load an Excel file into a Pandas DataFrame and output the various information as a text string.
    Args:
        file_name: A string containing the Excel file name.
    Returns:
        A string containing the structured output from a Pandas DataFrame after reading the Excel file.
    """
    # Read Excel File
    df = pd.read_excel(file_name)

    # Build the report string handed back to the agent.
    # BUGFIX: df.describe was missing its call parentheses, so the report
    # embedded the bound-method repr instead of the summary statistics.
    excel_string = f"""
    # Summary
    The text below contains the information from the Excel File that has been loaded into a Pandas DataFrame.

    ## DataFrame Shape
    {df.shape}

    ## DataFrame Columns
    {df.columns}

    ## DataFrame Describe
    {df.describe()}

    ## DataFrame Head
    {df.head(25)}
    """

    return excel_string