luke9705 committed on
Commit
6a5d072
·
1 Parent(s): 3ee00d0

Refactor app.py: Update imports, enhance load_file return structure, and add generate_audio tool

Browse files
Files changed (1) hide show
  1. app.py +43 -28
app.py CHANGED
@@ -3,7 +3,7 @@ import os
3
  import base64
4
  import pandas as pd
5
  from PIL import Image
6
- from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel, VisitWebpageTool, OpenAIServerModel, tool
7
  from typing import Optional
8
  import requests
9
  from io import BytesIO
@@ -12,7 +12,7 @@ from pathlib import Path
12
  import openai
13
  from openai import OpenAI
14
  import pdfplumber
15
- import anthropic
16
 
17
 
18
  ## utilities and class definition
@@ -44,7 +44,7 @@ def load_file(path: str) -> list | dict:
44
  if image is not None:
45
  return [image]
46
  elif ext.endswith(".mp3") or ext.endswith(".wav"):
47
- return {"raw document text": text, "audio path": path}
48
  else:
49
  return {"raw document text": text, "file path": path}
50
 
@@ -62,19 +62,8 @@ def check_format(answer: str | list, *args, **kwargs) -> list:
62
  return [answer]
63
  elif isinstance(answer, dict):
64
  raise TypeError(f"Final answer must be a list, not a dict. Please check the answer format.")
65
-
66
- class Claude:
67
- def __init__(self):
68
- self.client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
69
 
70
- def generate(self, prompt: str):
71
- message = self.client.messages.create(
72
- model="claude-sonnet-4-20250514",
73
- max_tokens=20000,
74
- temperature=1,
75
- messages=[{"role": "user", "content": prompt}]
76
- )
77
- return message.content
78
  ## tools definition
79
  @tool
80
  def download_images(image_urls: str) -> list:
@@ -83,7 +72,7 @@ def download_images(image_urls: str) -> list:
83
  Args:
84
  image_urls: comma‐separated list of URLs to download
85
  Returns:
86
- List of PIL.Image.Image objects
87
  """
88
  urls = [u.strip() for u in image_urls.split(",") if u.strip()] # strip() removes whitespaces
89
  images = []
@@ -99,7 +88,11 @@ def download_images(image_urls: str) -> list:
99
 
100
  except Exception as e:
101
  print(f"Failed to download from {url}: {e}")
102
- return images
 
 
 
 
103
 
104
  @tool # since they gave us OpenAI API credits, we can keep using it
105
  def transcribe_audio(audio_path: str) -> str:
@@ -113,10 +106,10 @@ def transcribe_audio(audio_path: str) -> str:
113
  client = openai.Client(api_key=os.getenv("OPENAI_API_KEY"))
114
  with open(audio_path, "rb") as audio: # to modify path because it is arriving from gradio
115
  transcript = client.audio.transcriptions.create(
116
- file=audio,
117
- model="whisper-1",
118
- response_format="text",
119
- )
120
  print(transcript)
121
  try:
122
  return transcript
@@ -157,18 +150,34 @@ def generate_image(prompt: str, neg_prompt: str) -> Image.Image:
157
 
158
  return gr.Image(value=image, label="Generated Image")
159
 
160
- """@tool
161
- def generate_audio(prompt: str) -> object:
162
- space = smolagents.load_tool(
163
-
164
- )"""
 
 
 
165
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
 
168
  ## agent definition
169
  class Agent:
170
  def __init__(self, ):
171
- client = HfApiModel("deepseek-ai/DeepSeek-R1", provider="nebius", api_key=os.getenv("NEBIUS_API_KEY"))
172
  """client = OpenAIServerModel(
173
  model_id="claude-sonnet-4-20250514",
174
  api_base="https://api.anthropic.com/v1/",
@@ -176,7 +185,12 @@ class Agent:
176
  )"""
177
  self.agent = CodeAgent(
178
  model=client,
179
- tools=[DuckDuckGoSearchTool(max_results=5), VisitWebpageTool(max_output_length=20000), generate_image, download_images, transcribe_audio],
 
 
 
 
 
180
  additional_authorized_imports=["pandas", "PIL", "io"],
181
  planning_interval=3,
182
  max_steps=6,
@@ -223,6 +237,7 @@ def respond(message: str, history : dict, web_search: bool = False):
223
  else:
224
  file = load_file(files[0])
225
  message = agent(text, files=file, conversation_history=history)
 
226
  # output
227
  print("Agent response:", message)
228
 
 
3
  import base64
4
  import pandas as pd
5
  from PIL import Image
6
+ from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel, VisitWebpageTool, OpenAIServerModel, tool, Tool
7
  from typing import Optional
8
  import requests
9
  from io import BytesIO
 
12
  import openai
13
  from openai import OpenAI
14
  import pdfplumber
15
+ import numpy as np
16
 
17
 
18
  ## utilities and class definition
 
44
  if image is not None:
45
  return [image]
46
  elif ext.endswith(".mp3") or ext.endswith(".wav"):
47
+ return {"audio": text, "audio path": path}
48
  else:
49
  return {"raw document text": text, "file path": path}
50
 
 
62
  return [answer]
63
  elif isinstance(answer, dict):
64
  raise TypeError(f"Final answer must be a list, not a dict. Please check the answer format.")
 
 
 
 
65
 
66
+
 
 
 
 
 
 
 
67
  ## tools definition
68
  @tool
69
  def download_images(image_urls: str) -> list:
 
72
  Args:
73
  image_urls: comma‐separated list of URLs to download
74
  Returns:
75
+ List of PIL.Image.Image objects wrapped by gr.Image
76
  """
77
  urls = [u.strip() for u in image_urls.split(",") if u.strip()] # strip() removes whitespaces
78
  images = []
 
88
 
89
  except Exception as e:
90
  print(f"Failed to download from {url}: {e}")
91
+
92
+ wrapped = []
93
+ for img in images:
94
+ wrapped.append(gr.Image(value=img))
95
+ return wrapped
96
 
97
  @tool # since they gave us OpenAI API credits, we can keep using it
98
  def transcribe_audio(audio_path: str) -> str:
 
106
  client = openai.Client(api_key=os.getenv("OPENAI_API_KEY"))
107
  with open(audio_path, "rb") as audio: # to modify path because it is arriving from gradio
108
  transcript = client.audio.transcriptions.create(
109
+ file=audio,
110
+ model="whisper-1",
111
+ response_format="text",
112
+ )
113
  print(transcript)
114
  try:
115
  return transcript
 
150
 
151
  return gr.Image(value=image, label="Generated Image")
152
 
153
+ @tool
154
+ def generate_audio(prompt: str, duration: int, sample: Optional[list[int, np.ndarray]] = None) -> gr.Component:
155
+ """
156
+ Generate audio from a text prompt using MusicGen.
157
+ Args:
158
+ prompt: The text prompt to generate the audio from.
159
+ duration: Duration of the generated audio in seconds.
160
+ sample: Optional audio sample to guide generation.
161
 
162
+ Returns:
163
+ gr.Component: The generated audio as a Gradio Audio component.
164
+ """
165
+ client = Tool.from_space(
166
+ space_id="luke9705/MusicGen_custom",
167
+ token=os.environ.get('HF_TOKEN'),
168
+ name="Sound_Generator",
169
+ description="Generate music or sound effects from a text prompt using MusicGen."
170
+ )
171
+ sound = client(prompt, duration, sample)
172
+
173
+ return gr.Audio(value=sound)
174
+
175
 
176
 
177
  ## agent definition
178
  class Agent:
179
  def __init__(self, ):
180
+ client = HfApiModel("deepseek-ai/DeepSeek-R1-0528", provider="nebius", api_key=os.getenv("NEBIUS_API_KEY"))
181
  """client = OpenAIServerModel(
182
  model_id="claude-sonnet-4-20250514",
183
  api_base="https://api.anthropic.com/v1/",
 
185
  )"""
186
  self.agent = CodeAgent(
187
  model=client,
188
+ tools=[DuckDuckGoSearchTool(max_results=5),
189
+ VisitWebpageTool(max_output_length=20000),
190
+ generate_image,
191
+ generate_audio,
192
+ download_images,
193
+ transcribe_audio],
194
  additional_authorized_imports=["pandas", "PIL", "io"],
195
  planning_interval=3,
196
  max_steps=6,
 
237
  else:
238
  file = load_file(files[0])
239
  message = agent(text, files=file, conversation_history=history)
240
+
241
  # output
242
  print("Agent response:", message)
243