MultiAgent_System_for_Screenplay_Creation

Running

App Files Files Community

luke9705 committed on Jun 6

Commit

6a5d072

1 Parent(s): 3ee00d0

Refactor app.py: Update imports, enhance load_file return structure, and add generate_audio tool

Browse files

Files changed (1) hide show

app.py +43 -28

app.py CHANGED Viewed

@@ -3,7 +3,7 @@ import os
 import base64
 import pandas as pd
 from PIL import Image
-from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel, VisitWebpageTool, OpenAIServerModel, tool
 from typing import Optional
 import requests
 from io import BytesIO
@@ -12,7 +12,7 @@ from pathlib import Path
 import openai
 from openai import OpenAI
 import pdfplumber
-import anthropic
 ## utilities and class definition
@@ -44,7 +44,7 @@ def load_file(path: str) -> list | dict:
     if image is not None:
         return [image]
     elif ext.endswith(".mp3") or ext.endswith(".wav"):
-        return {"raw document text": text, "audio path": path}
     else:
         return {"raw document text": text, "file path": path}
@@ -62,19 +62,8 @@ def check_format(answer: str | list, *args, **kwargs) -> list:
         return [answer]
     elif isinstance(answer, dict):
         raise TypeError(f"Final answer must be a list, not a dict. Please check the answer format.")
-class Claude:
-    def __init__(self):
-        self.client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
-    def generate(self, prompt: str):
-        message = self.client.messages.create(
-            model="claude-sonnet-4-20250514",
-            max_tokens=20000,
-            temperature=1,
-            messages=[{"role": "user", "content": prompt}]
-            )
-        return message.content
 ## tools definition
 @tool
 def download_images(image_urls: str) -> list:
@@ -83,7 +72,7 @@ def download_images(image_urls: str) -> list:
     Args:
         image_urls: comma‐separated list of URLs to download
     Returns:
-        List of PIL.Image.Image objects
     """
     urls = [u.strip() for u in image_urls.split(",") if u.strip()] # strip() removes whitespaces
     images = []
@@ -99,7 +88,11 @@ def download_images(image_urls: str) -> list:
         except Exception as e:
             print(f"Failed to download from {url}: {e}")
-    return images
 @tool # since they gave us OpenAI API credits, we can keep using it
 def transcribe_audio(audio_path: str) -> str:
@@ -113,10 +106,10 @@ def transcribe_audio(audio_path: str) -> str:
     client = openai.Client(api_key=os.getenv("OPENAI_API_KEY"))
     with open(audio_path, "rb") as audio:                 # to modify path because it is arriving from gradio
         transcript = client.audio.transcriptions.create(
-        file=audio,
-        model="whisper-1",
-        response_format="text",
-    )
     print(transcript)
     try:
         return transcript
@@ -157,18 +150,34 @@ def generate_image(prompt: str, neg_prompt: str) -> Image.Image:
     return gr.Image(value=image, label="Generated Image")
-"""@tool
-def generate_audio(prompt: str) -> object:
-    space = smolagents.load_tool(
-    )"""
 ## agent definition
 class Agent:
     def __init__(self, ):
-        client = HfApiModel("deepseek-ai/DeepSeek-R1", provider="nebius", api_key=os.getenv("NEBIUS_API_KEY"))
         """client = OpenAIServerModel(
             model_id="claude-sonnet-4-20250514",
             api_base="https://api.anthropic.com/v1/",
@@ -176,7 +185,12 @@ class Agent:
         )"""
         self.agent = CodeAgent(
             model=client,
-            tools=[DuckDuckGoSearchTool(max_results=5), VisitWebpageTool(max_output_length=20000), generate_image, download_images, transcribe_audio],
             additional_authorized_imports=["pandas", "PIL", "io"],
             planning_interval=3,
             max_steps=6,
@@ -223,6 +237,7 @@ def respond(message: str, history : dict, web_search: bool = False):
         else:
             file = load_file(files[0])
             message = agent(text, files=file, conversation_history=history)
     # output
     print("Agent response:", message)

 import base64
 import pandas as pd
 from PIL import Image
+from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel, VisitWebpageTool, OpenAIServerModel, tool, Tool
 from typing import Optional
 import requests
 from io import BytesIO
 import openai
 from openai import OpenAI
 import pdfplumber
+import numpy as np
 ## utilities and class definition
     if image is not None:
         return [image]
     elif ext.endswith(".mp3") or ext.endswith(".wav"):
+        return {"audio": text, "audio path": path}
     else:
         return {"raw document text": text, "file path": path}
         return [answer]
     elif isinstance(answer, dict):
         raise TypeError(f"Final answer must be a list, not a dict. Please check the answer format.")
 ## tools definition
 @tool
 def download_images(image_urls: str) -> list:
     Args:
         image_urls: comma‐separated list of URLs to download
     Returns:
+        List of PIL.Image.Image objects wrapped by gr.Image
     """
     urls = [u.strip() for u in image_urls.split(",") if u.strip()] # strip() removes whitespaces
     images = []
         except Exception as e:
             print(f"Failed to download from {url}: {e}")
+    wrapped = []
+    for img in images:
+        wrapped.append(gr.Image(value=img))
+    return wrapped
 @tool # since they gave us OpenAI API credits, we can keep using it
 def transcribe_audio(audio_path: str) -> str:
     client = openai.Client(api_key=os.getenv("OPENAI_API_KEY"))
     with open(audio_path, "rb") as audio:                 # to modify path because it is arriving from gradio
         transcript = client.audio.transcriptions.create(
+            file=audio,
+            model="whisper-1",
+            response_format="text",
+            )
     print(transcript)
     try:
         return transcript
     return gr.Image(value=image, label="Generated Image")
+@tool
+def generate_audio(prompt: str, duration: int, sample: Optional[list[int, np.ndarray]] = None) -> gr.Component:
+    """
+    Generate audio from a text prompt using MusicGen.
+    Args:
+        prompt: The text prompt to generate the audio from.
+        duration: Duration of the generated audio in seconds.
+        sample: Optional audio sample to guide generation.
+    Returns:
+        gr.Component: The generated audio as a Gradio Audio component.
+    """
+    client = Tool.from_space(
+        space_id="luke9705/MusicGen_custom",
+        token=os.environ.get('HF_TOKEN'),
+        name="Sound_Generator",
+        description="Generate music or sound effects from a text prompt using MusicGen."
+    )
+    sound = client(prompt, duration, sample)
+    return gr.Audio(value=sound)
 ## agent definition
 class Agent:
     def __init__(self, ):
+        client = HfApiModel("deepseek-ai/DeepSeek-R1-0528", provider="nebius", api_key=os.getenv("NEBIUS_API_KEY"))
         """client = OpenAIServerModel(
             model_id="claude-sonnet-4-20250514",
             api_base="https://api.anthropic.com/v1/",
         )"""
         self.agent = CodeAgent(
             model=client,
+            tools=[DuckDuckGoSearchTool(max_results=5),
+                   VisitWebpageTool(max_output_length=20000),
+                   generate_image,
+                   generate_audio,
+                   download_images,
+                   transcribe_audio],
             additional_authorized_imports=["pandas", "PIL", "io"],
             planning_interval=3,
             max_steps=6,
         else:
             file = load_file(files[0])
             message = agent(text, files=file, conversation_history=history)
     # output
     print("Agent response:", message)