Spaces:
Sleeping
Sleeping
File size: 8,372 Bytes
5e87361 903ecf8 eff95ca 5a5e484 80fa97f 399d0c1 5e87361 23c0e5d 5e87361 72f1afc 5e87361 72f1afc 5e87361 72f1afc 5e87361 72f1afc 5e87361 72f1afc 5e87361 72f1afc 5e87361 72f1afc 5e87361 72f1afc 5e87361 72f1afc 5e87361 72f1afc 5e87361 72f1afc 5e87361 72f1afc 903ecf8 23c0e5d 903ecf8 eff95ca 5a5e484 eff95ca 903ecf8 5a5e484 72f1afc 80fa97f 757decb eff95ca 903ecf8 eff95ca 903ecf8 5e87361 eff95ca 5e87361 5a5e484 5e87361 399d0c1 5a5e484 80fa97f 5a5e484 5e87361 399d0c1 757decb 5e87361 23c0e5d 5e87361 399d0c1 23c0e5d 903ecf8 a19db28 23c0e5d a19db28 5720b20 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 |
import os
from typing import Optional

from dotenv import load_dotenv
from langchain_mcp_adapters.client import MultiServerMCPClient
from langchain_openai import ChatOpenAI
from langgraph.prebuilt import create_react_agent
from pydantic import BaseModel, Field
class AgentOutput(BaseModel):
    # Structured result schema; AudioAgent.build_agent passes this class as
    # `response_format` and run_agent reads `output_audio_files` back from it.
    # NOTE(review): documented with comments instead of a docstring on purpose;
    # pydantic includes a model docstring in the generated JSON schema, which
    # would change what is sent to the model.
    output_audio_files: list[str] = Field(
        default=[],
        description="The output audio files.",
    )
# System prompt handed to create_react_agent as `prompt`. It restricts the
# assistant to audio-related topics, lists the tool capabilities together with
# their stated limitations (mono-only fades, English-only transcription), and
# prescribes a five-phase workflow: analysis, planning, execution, validation,
# response. The text is sent to the model verbatim, so edits change behavior.
system_prompt = """You are an expert Audio Processing Assistant with specialized capabilities in audio manipulation, analysis, and editing. Your primary purpose is to help users with audio-related tasks and provide knowledgeable assistance in the audio domain.
## Core Behavior Guidelines:
### Conversation Scope:
- ONLY engage in conversations related to audio processing, audio editing, sound engineering, music production, audio analysis, audio formats, and related audio technologies
- If a user asks about topics outside the audio domain, politely decline and redirect them back to audio-related assistance
- Be conversational, friendly, and helpful when discussing audio topics
- Share your expertise about audio concepts, techniques, and best practices when relevant
- If user doesn't provide input files, look for old messages to find input files. If many messages, look for the most recent one or ask the user to choose one of them.
### System Capabilities & Limitations:
**Audio Manipulation Capabilities:**
- Merge multiple audio files into continuous tracks
- Cut/trim specific sections with precise timing
- Adjust volume levels (increase/decrease)
- Normalize audio levels for consistency
- Apply fade-in/fade-out effects (LIMITATION: mono channel only)
- Change playback speed with pitch adjustment
- Reverse audio for creative effects
- Remove silence from beginning/end
**Analysis & Transcription:**
- Transcribe speech to text (LIMITATION: English only)
- Analyze audio properties (duration, sample rate, bit depth, etc.)
**Important Limitations to Communicate:**
- Fade effects only work on mono audio files
- Transcription is currently limited to English language
- Always inform users of these limitations when relevant to their request
### Enhanced Audio Processing Workflow:
When a user requests audio processing and provides input files, follow this structured approach:
1. **ANALYSIS PHASE:**
- Analyze the user's request to understand their goals
- Examine the provided input audio files (check format, properties, etc.)
- Identify what audio processing operations are needed
- Check for any limitations that might affect the request (mono vs stereo, language, etc.)
2. **PLANNING PHASE:**
- Create a clear, step-by-step plan for the audio processing task
- Explain your plan to the user before execution
- Inform users of any limitations that apply to their specific request
- Ask for clarification if the request is ambiguous or could be interpreted multiple ways
- Suggest alternative approaches if limitations prevent the exact request
3. **EXECUTION PHASE:**
- Use the available audio tools to implement your plan
- Process the audio files according to the planned steps
- For multi-step processes, provide progress updates
- Handle any errors gracefully and explain what went wrong
- If a step fails, try alternative approaches when possible
4. **VALIDATION PHASE:**
- Verify that the processed audio meets the user's requirements
- Check the quality and correctness of the output
- Test that the processing achieved the desired results
- If results don't meet expectations, explain why and suggest improvements
5. **RESPONSE PHASE:**
- Provide a clear summary of what was accomplished
- Include the output audio files in your response
- Explain any compromises made due to limitations
- Offer additional suggestions or next steps if relevant
- Ask if the user needs any adjustments or further processing
### File Management Best Practices:
- When processing multiple files, clearly identify which files are being processed
- If users reference files from previous conversations, help them identify the correct files
- Suggest optimal file organization when working with multiple audio tracks
- Recommend appropriate formats for different use cases
### User Communication:
- Always explain what you're about to do before starting complex operations
- Use clear, non-technical language unless the user demonstrates technical knowledge
- Provide time estimates for longer operations when possible
- If you encounter errors, explain them in user-friendly terms and suggest solutions
- When limitations prevent exact requests, offer the closest possible alternatives
## Available Context:
- You have access to input_audio_files when provided by the user
- You can generate output_audio_files through your audio processing tools
- Use your tools effectively to analyze, edit, convert, and manipulate audio
- Maintain context across conversations to reference previous work
## Response Format:
- Always provide helpful, accurate information about audio topics
- When processing audio, be transparent about your process and results
- Include relevant technical details when appropriate, but keep explanations accessible
- Maintain a professional yet approachable tone
- Structure responses clearly with headings when presenting complex information
Remember: Stay focused on audio-related assistance, communicate limitations clearly, and use your specialized tools to help users achieve their audio processing goals efficiently and effectively. Always prioritize user understanding and satisfaction over technical perfection."""
# Template for one user turn, filled in with str.format().
# Placeholders: {user_input} is the user's text; {input_audio_files} is the
# list of input file references joined with newlines by the caller.
user_prompt = """
User Request: {user_input}
Input Audio Files: {input_audio_files}
"""
# Template used to replay a previous assistant turn from the chat history,
# filled in with str.format().
# Placeholders: {final_response} is the assistant's earlier text;
# {output_audio_files} is the newline-joined list of files it produced.
assistant_prompt = """
Assistant Response: {final_response}
Output Audio Files: {output_audio_files}
"""
class AudioAgent:
    """Chat agent that answers audio requests using tools from an MCP server.

    The agent is a LangGraph ReAct agent built lazily on the first call to
    ``run_agent``. It combines a ``ChatOpenAI`` model, the tools returned by
    the MCP client, the module-level ``system_prompt`` and the ``AgentOutput``
    structured-response schema.

    Configuration is read from the environment (after ``load_dotenv()``):
    ``OPENAI_API_KEY`` is used when no ``api_key`` argument is given, and
    ``MCP_SERVER`` supplies the URL of the SSE tool server.
    """

    def __init__(
        self,
        model_name: str = "gpt-4.1",
        temperature: float = 0.3,
        api_key: Optional[str] = None,
    ):
        """Store the model settings and create the MCP client.

        Args:
            model_name: Name of the OpenAI chat model to use.
            temperature: Sampling temperature passed to the chat model.
            api_key: OpenAI API key; falls back to ``OPENAI_API_KEY``.
        """
        load_dotenv()
        self.model_name = model_name
        self.temperature = temperature
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        self.server_url = os.getenv("MCP_SERVER")
        # Not used inside this class; kept so existing readers of the
        # attribute keep working.
        self.graph = None
        self._client = MultiServerMCPClient({
            "audio-tools": {"url": self.server_url, "transport": "sse"}
        })
        # Built on first use by run_agent().
        self.agent = None

    @staticmethod
    def _join_files(files) -> str:
        """Return the file references joined by newlines ('' for None/empty)."""
        return "\n".join(files or [])

    async def build_agent(self):
        """Create the ReAct agent from the model and the MCP tools.

        Returns:
            The agent object returned by ``create_react_agent``.

        Raises:
            ValueError: If no OpenAI API key or no MCP server URL is configured.
        """
        if not self.api_key:
            raise ValueError("OpenAI API key is required")
        # Fail early with a clear message instead of letting a None URL
        # surface as an obscure connection error inside the MCP client.
        if not self.server_url:
            raise ValueError(
                "MCP server URL is required (set the MCP_SERVER environment variable)"
            )
        tools = await self._client.get_tools()
        llm = ChatOpenAI(
            model=self.model_name,
            temperature=self.temperature,
            api_key=self.api_key,
        )
        return create_react_agent(
            model=llm,
            tools=tools,
            prompt=system_prompt,
            response_format=AgentOutput,
        )

    async def run_agent(
        self,
        user_input: str,
        input_audio_files: list[str],
        history: Optional[list] = None,
    ):
        """Run one conversation turn and return the reply plus output files.

        Args:
            user_input: The user's current request text.
            input_audio_files: File references for the current request;
                ``None`` is treated as an empty list.
            history: Optional earlier turns. Each item is a mapping with
                ``"role"`` and ``"content"``; user items may carry
                ``"input_files"`` and assistant items ``"output_files"``.
                Items with any other role are skipped.

        Returns:
            A dict with ``"final_response"`` (content of the last message
            returned by the agent) and ``"output_audio_files"`` (taken from
            the agent's structured response).
        """
        if self.agent is None:
            self.agent = await self.build_agent()

        messages = []
        for msg in history or []:
            role = msg["role"]
            if role == "user":
                content = user_prompt.format(
                    user_input=msg["content"],
                    input_audio_files=self._join_files(msg.get("input_files")),
                )
            elif role == "assistant":
                content = assistant_prompt.format(
                    final_response=msg["content"],
                    output_audio_files=self._join_files(msg.get("output_files")),
                )
            else:
                # Only user and assistant turns are replayed to the model.
                continue
            messages.append({"role": role, "content": content})

        current_input = user_prompt.format(
            user_input=user_input,
            input_audio_files=self._join_files(input_audio_files),
        )
        messages.append({"role": "user", "content": current_input})

        res = await self.agent.ainvoke({"messages": messages})
        return {
            "final_response": res["messages"][-1].content,
            "output_audio_files": res["structured_response"].output_audio_files,
        }