wt002 committed on
Commit
2b9b092
·
verified ·
1 Parent(s): 622f98e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +397 -50
app.py CHANGED
@@ -1,70 +1,417 @@
1
  import os
2
- import time
3
-
4
  import gradio as gr
5
- import pandas as pd
6
  import requests
7
- from smolagents import tool, Tool, CodeAgent, DuckDuckGoSearchTool, LiteLLMModel, HfApiModel, VisitWebpageTool, SpeechToTextTool, FinalAnswerTool
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
 
 
9
 
10
- from agent import (
11
- analyze_audio_file,
12
- analyze_image_file,
13
- analyze_xlsx_file,
14
- analyze_youtube_video,
15
- download_file_of_task_id,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  )
17
 
18
- # --- Constants ---
19
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
- # --- Basic Agent Definition ---
22
- # ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
23
  class BasicAgent:
24
  def __init__(self):
25
- print("BasicAgent initialized.")
26
-
27
- # Initialize the model
28
- model = LiteLLMModel(model_id=os.getenv("MODEL_ID"),
29
- api_key=os.getenv("GOOGLE_API_KEY"))
30
 
31
- # Initialize the searchs tool
32
- duck_duck_go_search_tool = DuckDuckGoSearchTool()
33
- wikipedia_search_tool = WikipediaSearchTool()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
- # Initialize Agent
36
  self.agent = CodeAgent(
37
- model = model,
38
- tools=[download_file_of_task_id, analyze_audio_file, analyze_image_file,
39
- analyze_xlsx_file, duck_duck_go_search_tool, wikipedia_search_tool,
40
- analyze_youtube_video, FinalAnswerTool()]
 
 
 
 
 
 
 
 
 
 
41
  )
 
 
 
42
 
43
- def __call__(self, question: str, task_id: str) -> str:
44
- task = f"""
45
- You are a general AI assistant.
46
- I will ask you a question and you can use 8 steps to answer it.
47
- You can use the tools I provided to you to answer the question.
48
- Every time you use a tool, the number of steps will decrease by one.
49
- If you have a list of possible pages to visit, prefer the wikipedia ones.
50
- If a page does not allow visit, skip it.
51
- Report your thoughts, and finish your answer with the following template:
52
- FINAL ANSWER: [YOUR FINAL ANSWER].
53
- YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
54
- If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
55
- If the answer is a number, represent it with digits.
56
- If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
57
- If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
58
 
59
- The taskid is {task_id} in case you need to get extra files, use taskid and not name of the file
60
- and the question is {question}
61
- """
62
-
63
- fixed_answer = self.agent.run(task)
64
- print(f"Agent returning fixed answer: {fixed_answer}")
65
- time.sleep(50)
66
 
67
- return fixed_answer
68
 
69
 
70
 
 
1
  import os
 
 
2
  import gradio as gr
 
3
  import requests
4
+ import inspect
5
+ import pandas as pd
6
+ from smolagents import tool, Tool, CodeAgent, DuckDuckGoSearchTool, HfApiModel, VisitWebpageTool, SpeechToTextTool, FinalAnswerTool
7
+ from dotenv import load_dotenv
8
+ import heapq
9
+ from collections import Counter
10
+ import re
11
+ from io import BytesIO
12
+ from youtube_transcript_api import YouTubeTranscriptApi
13
+ from langchain_community.tools.tavily_search import TavilySearchResults
14
+ from langchain_community.document_loaders import WikipediaLoader
15
+ from langchain_community.utilities import WikipediaAPIWrapper
16
+ from langchain_community.document_loaders import ArxivLoader
17
+
18
+ # (Keep Constants as is)
19
+ # --- Constants ---
20
+ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
21
+
22
+ #Load environment variables
23
+ load_dotenv()
24
+
25
+ import io
26
+ import contextlib
27
+ import traceback
28
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
29
+ from smolagents import Tool, CodeAgent, DuckDuckGoSearchTool, FinalAnswerTool, HfApiModel
30
+
31
+
32
class CodeLlamaTool(Tool):
    """Tool that asks Code Llama 7B Instruct to answer a question, optionally via code.

    The model is prompted to emit Python inside ``<tool>...</tool>`` and/or a
    direct answer inside ``<final>...</final>``. When a ``<tool>`` block is
    present it is executed and its captured stdout is returned; otherwise the
    ``<final>`` text is returned.
    """

    name = "code_llama_tool"
    description = "Solves reasoning/code questions using Meta Code Llama 7B Instruct"

    inputs = {
        "question": {
            "type": "string",
            "description": "The question requiring code-based or reasoning-based solution",
        }
    }
    output_type = "string"

    def __init__(self):
        # Tool.__init__ sets up the lazy-initialisation flag that __call__ reads.
        super().__init__()
        self.model_id = "codellama/CodeLlama-7b-Instruct-hf"
        # Weights are loaded in setup() so that merely constructing the tool
        # (e.g. while building the agent) does not load the model.
        self.tokenizer = None
        self.model = None
        self.pipeline = None

    def setup(self):
        """Load tokenizer, model and generation pipeline (runs once, on first use)."""
        token = os.getenv("HF_TOKEN")

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, token=token)
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            device_map="auto",
            torch_dtype="auto",
            token=token,
        )
        # NOTE(review): temperature presumably only matters when sampling is
        # enabled - confirm whether do_sample=True was intended here.
        self.pipeline = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            max_new_tokens=512,
            temperature=0.2,
            truncation=True,
        )
        self.is_initialized = True

    def forward(self, question: str) -> str:
        """Generate an answer for ``question`` and return the extracted result.

        Args:
            question: The natural-language question to solve.

        Returns:
            A string starting with ``FINAL ANSWER`` on success, otherwise a
            diagnostic message containing the raw model output.
        """
        # Covers direct forward() calls that bypass Tool.__call__.
        if not self.is_initialized:
            self.setup()

        prompt = f"""You are an AI that uses Python code to answer questions.

Question: {question}

Instructions:
- If solving requires code, use a block like <tool>code</tool>.
- Always end with <final>FINAL ANSWER</final> containing the final number or string.

Example:
Question: What is 5 * sqrt(36)?
Answer:
<tool>
import math
print(5 * math.sqrt(36))
</tool>
<final>30.0</final>

Answer:"""

        # The prompt itself contains <tool>/<final> tags; asking for the
        # completion only keeps the parser from matching the prompt's example.
        outputs = self.pipeline(prompt, return_full_text=False)
        return self.parse_and_execute(outputs[0]["generated_text"])

    @staticmethod
    def _between(text: str, open_tag: str, close_tag: str):
        """Return the stripped text between the first tag pair, or None if absent."""
        _, found_open, rest = text.partition(open_tag)
        if not found_open:
            return None
        inner, found_close, _ = rest.partition(close_tag)
        return inner.strip() if found_close else None

    def parse_and_execute(self, response: str) -> str:
        """Extract the answer from a model completion.

        Args:
            response: Model completion text (without the prompt).

        Returns:
            The stdout of the executed ``<tool>`` block if there is one, else
            the ``<final>`` text, else a message containing the raw response.
        """
        try:
            # A code block takes precedence over a directly stated answer.
            code = self._between(response, "<tool>", "</tool>")
            if code is not None:
                result = self._run_code(code)
                return f"FINAL ANSWER (code output): {result}"

            final = self._between(response, "<final>", "</final>")
            if final is not None:
                return f"FINAL ANSWER: {final}"

            return f"Could not extract final answer.\n\n{response}"

        except Exception as e:
            return f"Error in parse_and_execute: {str(e)}\n\nFull response:\n{response}"

    def _run_code(self, code: str) -> str:
        """Execute ``code`` and return what it printed, or a traceback string.

        SECURITY: this runs model-generated code with ``exec`` in the current
        process, with no sandboxing. Only use it in an isolated environment.
        """
        buffer = io.StringIO()
        try:
            with contextlib.redirect_stdout(buffer):
                exec(code, {})
            return buffer.getvalue().strip()
        except Exception:
            return f"Error executing code:\n{traceback.format_exc()}"
113
+
114
+
115
+
116
+ #from smolagents import Tool
117
+ #from langchain_community.document_loaders import WikipediaLoader
118
+
119
class WikiSearchTool(Tool):
    """Tool that looks a query up on Wikipedia and returns the page texts."""

    name = "wiki_search"
    description = "Search Wikipedia for a query and return up to 2 results."
    inputs = {
        "query": {"type": "string", "description": "The search term for Wikipedia."}
    }
    output_type = "string"

    def forward(self, query: str) -> str:
        """Return up to two Wikipedia documents for ``query`` as tagged text.

        Args:
            query: The search term.

        Returns:
            The documents wrapped in ``<Document ...>...</Document>`` and
            separated by ``---`` lines, or a short notice when nothing matched.
        """
        search_docs = WikipediaLoader(query=query, load_max_docs=2).load()
        if not search_docs:
            # Tell the agent explicitly instead of handing it an empty string.
            return f"No Wikipedia results found for: {query}"

        blocks = []
        for doc in search_docs:
            source = doc.metadata.get("source", "Wikipedia")
            page = doc.metadata.get("page", "")
            # Opening tag is not self-closing, so it pairs with </Document>.
            blocks.append(
                f'<Document source="{source}" page="{page}">\n{doc.page_content}\n</Document>'
            )
        return "\n\n---\n\n".join(blocks)
137
+
138
+
139
+
140
+
141
class StringReverseTool(Tool):
    """Tool that reverses a string, for questions written backwards."""

    name = "reverse_message"
    description = "When you received a strange text, try to reverse it and perform action described in reversed message."
    inputs = {
        "message": {
            "type": "string",
            "description": "A message, which looks like strange and can be reversed to get actions to execute.",
        }
    }
    output_type = "string"

    def __init__(self):
        # The base initialiser must run: it sets the is_initialized flag that
        # Tool.__call__ checks before dispatching to forward().
        super().__init__()

    def forward(self, message: str) -> str:
        """Return ``message`` with its characters in reverse order.

        Args:
            message: The text to reverse.
        """
        return message[::-1]
157
+
158
class KeywordsExtractorTool(Tool):
    """Extracts top 5 keywords from a given text based on frequency."""

    name = "keywords_extractor"
    description = "This tool returns the 5 most frequent keywords occur in provided block of text."

    inputs = {
        "text": {
            "type": "string",
            "description": "Text to analyze for keywords.",
        }
    }
    output_type = "string"

    def forward(self, text: str) -> str:
        """Return the five most frequent non-trivial words of ``text``.

        Words are lower-cased and a small set of very common English words is
        ignored.

        Args:
            text: The text to analyse.

        Returns:
            A comma-separated ``word (count)`` list, most frequent first; an
            empty string when the text has no countable words; or an error
            message if extraction fails.
        """
        try:
            stop_words = {'a', 'and', 'of', 'is', 'in', 'to', 'the'}
            words = re.findall(r'\b\w+\b', text.lower())
            counts = Counter(w for w in words if w not in stop_words)
            # Render as text because the tool declares output_type "string".
            return ", ".join(f"{word} ({n})" for word, n in counts.most_common(5))
        except Exception as e:
            return f"Error during extracting most common words: {e}"
185
+
186
@tool
def parse_excel_to_json(task_id: str) -> dict:
    """
    For a given task_id fetch the attached Excel file and return its content as structured data.
    Only the first 20 non-empty rows of each sheet are returned. Nothing is written to disk.

    Args:
        task_id: An task ID to fetch.

    Returns:
        {
            "task_id": str,
            "sheets": {
                "SheetName1": [ {col1: val1, col2: val2, ...}, ... ],
                ...
            },
            "status": "Success" | error description
        }
    """
    # Use the module-level base URL so the endpoint is defined in one place.
    url = f"{DEFAULT_API_URL}/files/{task_id}"

    try:
        response = requests.get(url, timeout=100)
        if response.status_code != 200:
            return {"task_id": task_id, "sheets": {}, "status": f"{response.status_code} - Failed"}

        workbook = pd.ExcelFile(BytesIO(response.content))
        json_sheets = {
            # Drop fully empty rows, then keep a 20-row preview per sheet.
            sheet: workbook.parse(sheet).dropna(how="all").head(20).to_dict(orient="records")
            for sheet in workbook.sheet_names
        }

        return {
            "task_id": task_id,
            "sheets": json_sheets,
            "status": "Success",
        }

    except Exception as e:
        return {
            "task_id": task_id,
            "sheets": {},
            "status": f"Error in parsing Excel file: {str(e)}",
        }
231
+
232
+
233
+
234
class VideoTranscriptionTool(Tool):
    """Fetch transcripts from YouTube videos"""

    name = "transcript_video"
    description = "Fetch text transcript from YouTube movies with optional timestamps"
    inputs = {
        "url": {"type": "string", "description": "YouTube video URL or ID"},
        "include_timestamps": {"type": "boolean", "description": "If timestamps should be included in output", "nullable": True},
    }
    output_type = "string"

    @staticmethod
    def _extract_video_id(url: str):
        """Return the video ID found in ``url``, or None if none can be found.

        Accepts ``youtube.com/watch?...v=ID`` URLs, ``youtu.be/ID`` short links
        and bare 11-character IDs.
        """
        from urllib.parse import parse_qs, urlparse

        candidate = url.strip()
        if "youtube.com/watch" in candidate:
            # urlparse needs a scheme to split off the query string reliably.
            with_scheme = candidate if "://" in candidate else f"https://{candidate}"
            ids = parse_qs(urlparse(with_scheme).query).get("v")
            return ids[0] if ids else None
        if "youtu.be/" in candidate:
            video_id = candidate.split("youtu.be/", 1)[1]
            for separator in "?&#/":
                video_id = video_id.split(separator, 1)[0]
            return video_id or None
        if len(candidate) == 11:  # Direct ID
            return candidate
        return None

    def forward(self, url: str, include_timestamps: bool = False) -> str:
        """Return the transcript of a YouTube video as text.

        Args:
            url: YouTube video URL or bare video ID.
            include_timestamps: When true, each transcript part goes on its own
                line prefixed with ``[m:ss]``; otherwise parts are joined with
                single spaces.

        Returns:
            The transcript text, or an error message when the URL is invalid
            or the transcript cannot be fetched.
        """
        video_id = self._extract_video_id(url)
        if video_id is None:
            return f"YouTube URL or ID: {url} is invalid!"

        try:
            # NOTE(review): newer youtube-transcript-api releases appear to
            # replace this static call with an instance-based API - confirm
            # against the pinned version.
            transcription = YouTubeTranscriptApi.get_transcript(video_id)

            if include_timestamps:
                lines = []
                for part in transcription:
                    minutes, seconds = divmod(int(part['start']), 60)
                    lines.append(f"[{minutes}:{seconds:02d}] {part['text']}")
                return "\n".join(lines)

            return " ".join(part['text'] for part in transcription)

        except Exception as e:
            return f"Error in extracting YouTube transcript: {str(e)}"
269
+
270
+
271
+
272
+ import os
273
+ import base64
274
+ import requests
275
+ import google.generativeai as genai
276
+ from PIL import Image
277
+ from io import BytesIO
278
+ from smolagents import (
279
+ CodeAgent,
280
+ ToolCallingAgent,
281
+ InferenceClientModel,
282
+ WebSearchTool,
283
+ HfApiModel,
284
+ DuckDuckGoSearchTool,
285
+ FinalAnswerTool,
286
+ tool
287
  )
288
 
289
+ # Configure Gemini
290
+ genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
291
+
292
+ # Define image analysis tool
293
@tool
def analyze_image(image_input: str) -> str:
    """
    Analyzes an image using AI vision and returns a detailed description of it.

    Args:
        image_input: The image to analyze, given as an http/https URL, a base64 data URI starting with "data:image", or a local file path.
    """
    try:
        # Resolve every supported input form to raw image bytes.
        if image_input.startswith(('http://', 'https://')):
            # A timeout keeps a stalled server from hanging the agent forever.
            response = requests.get(image_input, timeout=30)
            response.raise_for_status()
            raw_bytes = response.content
        elif image_input.startswith('data:image'):
            raw_bytes = base64.b64decode(image_input.split(',', 1)[1])
        elif os.path.exists(image_input):
            with open(image_input, "rb") as img_file:
                raw_bytes = img_file.read()
        else:
            return "Invalid image input"

        # Re-encode as JPEG so the declared MIME type is always correct.
        # Converting to RGB first avoids failures on images with alpha/palette.
        jpeg_buffer = BytesIO()
        Image.open(BytesIO(raw_bytes)).convert("RGB").save(jpeg_buffer, format="JPEG")

        # NOTE(review): 'gemini-pro-vision' may no longer be served - confirm
        # the model name against the current Gemini model list.
        model = genai.GenerativeModel('gemini-pro-vision')
        response = model.generate_content([
            "Analyze this image thoroughly. Describe all significant elements, text, objects, and context.",
            # Inline image part as a mime_type/data blob.
            {"mime_type": "image/jpeg", "data": jpeg_buffer.getvalue()},
        ])
        return response.text

    except Exception as e:
        return f"Image analysis error: {str(e)}"
337
 
 
 
338
class BasicAgent:
    """Question-answering agent built from two smolagents agents.

    ``self.agent`` is a ``CodeAgent`` holding the full tool set and an extended
    system prompt; ``self.web_agent`` is a ``ToolCallingAgent`` limited to web
    search, page visiting and image analysis. ``__call__`` chooses between
    them from keywords in the question.
    """

    # Questions containing any of these substrings go to the web agent.
    _IMAGE_KEYWORDS = ("image", "picture", "photo", "screenshot", "diagram")

    def __init__(self):
        # Accept either variable name; elsewhere this file reads HF_TOKEN.
        token = os.environ.get("HF_API_TOKEN") or os.environ.get("HF_TOKEN")
        model = HfApiModel(
            temperature=0.1,
            token=token,
        )

        # Existing tools
        search_tool = DuckDuckGoSearchTool()
        wiki_search_tool = WikiSearchTool()
        str_reverse_tool = StringReverseTool()
        keywords_extract_tool = KeywordsExtractorTool()
        speech_to_text_tool = SpeechToTextTool()
        visit_webpage_tool = VisitWebpageTool()
        final_answer_tool = FinalAnswerTool()
        video_transcription_tool = VideoTranscriptionTool()
        code_llama_tool = CodeLlamaTool()

        # Answer-format rules appended to the CodeAgent's default system prompt.
        system_prompt = """
You are my general AI assistant. Your task is to answer the question I asked.
First, provide an explanation of your reasoning, step by step, to arrive at the answer.
Then, return your final answer in a single line, formatted as follows: "FINAL ANSWER: [YOUR FINAL ANSWER]".
[YOUR FINAL ANSWER] should be a number, a string, or a comma-separated list of numbers and/or strings, depending on the question.
If the answer is a number, do not use commas or units (e.g., $, %) unless specified.
If the answer is a string, do not use articles or abbreviations (e.g., for cities), and write digits in plain text unless specified.
If the answer is a comma-separated list, apply the above rules for each element based on whether it is a number or a string.
"""

        # Create web agent with image analysis capability
        self.web_agent = ToolCallingAgent(
            tools=[
                WebSearchTool(),
                visit_webpage_tool,
                analyze_image,  # Add image analysis to web agent
            ],
            model=model,
            max_steps=10,
            name="web_search_agent",
            description="Runs web searches and analyzes images",
        )

        # Create main agent with image analysis
        self.agent = CodeAgent(
            model=model,
            tools=[
                search_tool,
                wiki_search_tool,
                str_reverse_tool,
                keywords_extract_tool,
                speech_to_text_tool,
                visit_webpage_tool,
                final_answer_tool,
                video_transcription_tool,
                code_llama_tool,
                analyze_image,  # Add to main agent too
            ],
            add_base_tools=True,
        )

        # Update system prompt
        self.agent.prompt_templates["system_prompt"] = self.agent.prompt_templates["system_prompt"] + system_prompt

    def __call__(self, question: str) -> str:
        """Answer ``question`` with whichever agent fits it.

        Args:
            question: The question text.

        Returns:
            Whatever the selected agent's ``run`` returns.
        """
        print(f"Agent received question (first 50 chars): {question[:50]}...")

        lowered = question.lower()
        # First try web agent for image-based queries
        if any(keyword in lowered for keyword in self._IMAGE_KEYWORDS):
            print("Using web agent for image-related query")
            answer = self.web_agent.run(question)
        else:
            print("Using main agent")
            answer = self.agent.run(question)

        print(f"Agent returning answer: {answer}")
        return answer
 
 
 
 
 
414
 
 
415
 
416
 
417