YigitSekerci committed on
Commit
399d0c1
·
1 Parent(s): 9cce053

migrate to complicated agent

src/agent.py CHANGED
@@ -1,14 +1,75 @@
1
  import asyncio
 
2
  from dotenv import load_dotenv
3
 
 
4
  from langchain_mcp_adapters.client import MultiServerMCPClient
5
- from langgraph.prebuilt import create_react_agent
6
- from langgraph.graph.graph import CompiledGraph
7
 
8
  class AudioAgent:
9
  """
10
- Wraps a LangGraph REACT agent over your MCP audio-tools,
11
- exposing both one-shot and streaming chat methods.
12
  """
13
 
14
  def __init__(
@@ -20,71 +81,184 @@ class AudioAgent:
20
  self.model_name = model_name
21
  self.server_url = server_url
22
 
23
- # SSE client for your audio tools
24
  self._client = MultiServerMCPClient({
25
  "audio-tools": {"url": self.server_url, "transport": "sse"}
26
  })
27
 
28
- self._agent = None
 
29
 
30
  @property
31
  def is_initialized(self) -> bool:
32
- return self._agent is not None
33
 
34
  async def initialize(self) -> None:
35
- """Fetch tools from MCP and build a streaming-capable LangGraph REACT agent."""
36
  if self.is_initialized:
37
  return
38
 
39
- tools = await self._client.get_tools()
40
- if not tools:
 
41
  raise RuntimeError("No tools available from MCP server")
42
 
43
- self._agent: CompiledGraph = create_react_agent(
44
- model=self.model_name,
45
- tools=tools,
46
- prompt="""
47
- You are a helpful assistant that can use the following tools to help the user.
48
- """
49
  )
50
 
51
- def process_user_input(self, user_input: str):
52
- """
53
- Process user input and return a prompt for the agent.
54
- """
55
- return {"messages": [{"role": "user", "content": user_input}]}
56
-
57
- async def chat(self, prompt: str) -> str:
58
  """
59
- One-shot chat: returns the full LLM + tool-augmented reply.
60
  """
61
  if not self.is_initialized:
62
  await self.initialize()
63
- return await self._agent.ainvoke(self.process_user_input(prompt))
64
 
65
  async def stream_chat(self, prompt: str):
66
  """
67
- Streaming chat: prints tokens live and returns the full reply at the end.
68
  """
69
  if not self.is_initialized:
70
  await self.initialize()
71
 
72
- async for msg, metadata in self._agent.astream(
73
- self.process_user_input(prompt),
74
- stream_mode="messages"
75
- ):
76
- if msg.content:
77
- yield msg.content, metadata["langgraph_node"]
78
 
79
  async def main():
 
80
  agent = AudioAgent()
81
- # one-shot example
82
- reply = await agent.chat("Hi! What audio tools are available?")
83
- print("β†’", reply)
84
 
85
- # streaming example
86
- async for msg, node in agent.stream_chat("Explain how audio normalization works."):
87
- print(msg, end="", flush=True)
88
 
89
  if __name__ == "__main__":
90
  asyncio.run(main())
 
1
  import asyncio
2
+ from typing import Dict, Any, TypedDict, Annotated, List
3
  from dotenv import load_dotenv
4
 
5
+ from langchain_core.messages import BaseMessage
6
  from langchain_mcp_adapters.client import MultiServerMCPClient
7
+ from langgraph.graph import StateGraph, END
8
+ from langgraph.graph.message import add_messages
9
+ from langgraph.checkpoint.memory import MemorySaver
10
+
11
+ from .nodes import (
12
+ router_node,
13
+ chat_node,
14
+ script_generator_node,
15
+ planner_node,
16
+ audio_processor_node,
17
+ validator_node,
18
+ final_response_node
19
+ )
20
+
21
+
22
+ class AudioProcessingState(TypedDict):
23
+ """State schema for the audio processing graph."""
24
+
25
+ # Chat history
26
+ messages: Annotated[List[BaseMessage], add_messages]
27
+
28
+ # Audio files provided by user
29
+ audio_files: List[str]
30
+
31
+ # User's processing request
32
+ user_request: str
33
+
34
+ # Processing type determined by router
35
+ processing_type: str
36
+
37
+ # Generated scripts with timestamps
38
+ scripts: Dict[str, Any]
39
+
40
+ # Execution plan created by planner
41
+ execution_plan: List[Dict[str, Any]]
42
+
43
+ # Processing results
44
+ processed_files: Dict[str, str]
45
+
46
+ # Processing steps completed
47
+ completed_steps: List[str]
48
+
49
+ # Final output
50
+ final_audio_url: str
51
+ final_response: str
52
+
53
+ # Error handling
54
+ errors: List[str]
55
+ needs_reprocessing: bool
56
+
57
+ # Metadata
58
+ processing_metadata: Dict[str, Any]
59
+
60
 
61
  class AudioAgent:
62
  """
63
+ Advanced LangGraph-based audio processing agent with custom nodes.
64
+
65
+ Handles audio file processing through a sophisticated workflow:
66
+ 1. Router - Determines processing type
67
+ 2. Chat or Audio Processing Pipeline
68
+ 3. Script Generation - Creates timestamped transcripts
69
+ 4. Planning - Creates execution plan
70
+ 5. Processing - Executes audio tools
71
+ 6. Validation - Checks results and determines reprocessing
72
+ 7. Final Response - Formats output for user
73
  """
74
 
75
  def __init__(
 
81
  self.model_name = model_name
82
  self.server_url = server_url
83
 
84
+ # SSE client for audio tools
85
  self._client = MultiServerMCPClient({
86
  "audio-tools": {"url": self.server_url, "transport": "sse"}
87
  })
88
 
89
+ self._graph = None
90
+ self._tools = []
91
 
92
  @property
93
  def is_initialized(self) -> bool:
94
+ return self._graph is not None
95
 
96
  async def initialize(self) -> None:
97
+ """Initialize the LangGraph workflow with audio tools."""
98
  if self.is_initialized:
99
  return
100
 
101
+ # Get tools from MCP server
102
+ self._tools = await self._client.get_tools()
103
+ if not self._tools:
104
  raise RuntimeError("No tools available from MCP server")
105
 
106
+ # Build the graph
107
+ self._graph = self._build_graph()
108
+
109
+ def _build_graph(self) -> StateGraph:
110
+ """Build the LangGraph workflow."""
111
+
112
+ # Create the state graph
113
+ workflow = StateGraph(AudioProcessingState)
114
+
115
+ # Add nodes
116
+ workflow.add_node("router", router_node)
117
+ workflow.add_node("chat", chat_node)
118
+ workflow.add_node("script_generator", self._script_generator_with_tools)
119
+ workflow.add_node("planner", planner_node)
120
+ workflow.add_node("audio_processor", self._audio_processor_with_tools)
121
+ workflow.add_node("validator", validator_node)
122
+ workflow.add_node("final_response", final_response_node)
123
+
124
+ # Set entry point
125
+ workflow.set_entry_point("router")
126
+
127
+ # Add conditional edges based on processing type
128
+ workflow.add_conditional_edges(
129
+ "router",
130
+ self._route_processing_type,
131
+ {
132
+ "chat": "chat",
133
+ "audio_processing": "script_generator",
134
+ "dialogue_generation": "script_generator"
135
+ }
136
+ )
137
+
138
+ # Chat flow
139
+ workflow.add_edge("chat", "final_response")
140
+
141
+ # Audio processing flow
142
+ workflow.add_edge("script_generator", "planner")
143
+ workflow.add_edge("planner", "audio_processor")
144
+ workflow.add_edge("audio_processor", "validator")
145
+
146
+ # Validation flow with conditional reprocessing
147
+ workflow.add_conditional_edges(
148
+ "validator",
149
+ self._check_reprocessing_need,
150
+ {
151
+ "reprocess": "planner", # Go back to planning
152
+ "complete": "final_response"
153
+ }
154
  )
155
+
156
+ # Final response leads to end
157
+ workflow.add_edge("final_response", END)
158
+
159
+ # Compile with memory for conversation history
160
+ memory = MemorySaver()
161
+ return workflow.compile(checkpointer=memory)
162
 
163
+ async def _script_generator_with_tools(self, state: Dict[str, Any]) -> Dict[str, Any]:
164
+ """Script generator node with tools access."""
165
+ return await script_generator_node(state, self._tools)
166
+
167
+ async def _audio_processor_with_tools(self, state: Dict[str, Any]) -> Dict[str, Any]:
168
+ """Audio processor node with tools access."""
169
+ return await audio_processor_node(state, self._tools)
170
+
171
+ def _route_processing_type(self, state: Dict[str, Any]) -> str:
172
+ """Route based on processing type."""
173
+ return state.get("processing_type", "chat")
174
+
175
+ def _check_reprocessing_need(self, state: Dict[str, Any]) -> str:
176
+ """Check if reprocessing is needed."""
177
+ if state.get("needs_reprocessing", False):
178
+ return "reprocess"
179
+ return "complete"
180
+
181
+ def process_user_input(self, user_input: str) -> Dict[str, Any]:
182
+ """Process user input and create initial state."""
183
+ from langchain_core.messages import HumanMessage
184
+
185
+ return {
186
+ "messages": [HumanMessage(content=user_input)],
187
+ "audio_files": [],
188
+ "user_request": "",
189
+ "processing_type": "",
190
+ "scripts": {},
191
+ "execution_plan": [],
192
+ "processed_files": {},
193
+ "completed_steps": [],
194
+ "final_audio_url": "",
195
+ "final_response": "",
196
+ "errors": [],
197
+ "needs_reprocessing": False,
198
+ "processing_metadata": {}
199
+ }
200
+
201
+ async def chat(self, prompt: str) -> Dict[str, Any]:
202
  """
203
+ One-shot chat: returns the full processing result.
204
  """
205
  if not self.is_initialized:
206
  await self.initialize()
207
+
208
+ config = {"configurable": {"thread_id": "audio_agent_session"}}
209
+ initial_state = self.process_user_input(prompt)
210
+
211
+ result = await self._graph.ainvoke(initial_state, config)
212
+ return result
213
 
214
  async def stream_chat(self, prompt: str):
215
  """
216
+ Streaming chat: yields intermediate results as processing continues.
217
  """
218
  if not self.is_initialized:
219
  await self.initialize()
220
 
221
+ config = {"configurable": {"thread_id": "audio_agent_session"}}
222
+ initial_state = self.process_user_input(prompt)
223
+
224
+ async for chunk in self._graph.astream(initial_state, config):
225
+ # Extract the node name and content
226
+ for node_name, node_output in chunk.items():
227
+ if node_name == "__end__":
228
+ continue
229
+
230
+ # Get the latest message if available
231
+ messages = node_output.get("messages", [])
232
+ if messages and hasattr(messages[-1], 'content'):
233
+ content = messages[-1].content
234
+ if content:
235
+ yield content, node_name
236
+
237
+ # Also yield final audio URL if available
238
+ final_audio_url = node_output.get("final_audio_url", "")
239
+ if final_audio_url:
240
+ yield f"\n🎡 **Audio Ready**: [{final_audio_url}]({final_audio_url})", node_name
241
+
242
 
243
  async def main():
244
+ """Test the agent with various scenarios."""
245
  agent = AudioAgent()
246
+
247
+ # Test 1: Chat about capabilities
248
+ print("=== Test 1: Chat Query ===")
249
+ result = await agent.chat("What audio tools are available?")
250
+ print("Final Response:", result.get("final_response", ""))
251
+ print()
252
+
253
+ # Test 2: Audio processing request
254
+ print("=== Test 2: Audio Processing ===")
255
+ audio_request = "Process this audio file https://example.com/audio.mp3 - remove filler words and normalize volume"
256
+
257
+ print("Streaming response:")
258
+ async for content, node in agent.stream_chat(audio_request):
259
+ print(f"[{node}] {content[:100]}..." if len(content) > 100 else f"[{node}] {content}")
260
+ print()
261
 
 
 
 
262
 
263
  if __name__ == "__main__":
264
  asyncio.run(main())
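Reviewer note: chat() and stream_chat() above hard-code thread_id "audio_agent_session", so every caller shares a single MemorySaver conversation history. A minimal per-session sketch follows; it reuses the methods from this diff, but the session_id argument and the src.agent import path are illustrative assumptions, not part of this commit.

import asyncio
from src.agent import AudioAgent  # assumed import path for the module shown above

async def chat_in_session(agent: AudioAgent, session_id: str, prompt: str):
    """Same flow as AudioAgent.chat, but with a caller-supplied checkpointer thread."""
    if not agent.is_initialized:
        await agent.initialize()
    config = {"configurable": {"thread_id": session_id}}  # one MemorySaver thread per session
    # _graph is the compiled workflow built in _build_graph()
    return await agent._graph.ainvoke(agent.process_user_input(prompt), config)

# Example: asyncio.run(chat_in_session(AudioAgent(), "user-42", "What audio tools are available?"))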
src/nodes/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ """
2
+ Audio processing graph nodes.
3
+ """
4
+
5
+ from .router import router_node
6
+ from .chat import chat_node
7
+ from .script_generator import script_generator_node
8
+ from .planner import planner_node
9
+ from .audio_processor import audio_processor_node
10
+ from .validator import validator_node
11
+ from .final_response import final_response_node
12
+
13
+ __all__ = [
14
+ "router_node",
15
+ "chat_node",
16
+ "script_generator_node",
17
+ "planner_node",
18
+ "audio_processor_node",
19
+ "validator_node",
20
+ "final_response_node"
21
+ ]
src/nodes/audio_processor.py ADDED
@@ -0,0 +1,169 @@
1
+ """
2
+ Audio processor node for executing planned audio processing steps.
3
+ """
4
+
5
+ from typing import Dict, Any, List
6
+ from langchain_core.messages import AIMessage
7
+
8
+ async def audio_processor_node(state: Dict[str, Any], tools: list) -> Dict[str, Any]:
9
+ """
10
+ Execute the planned audio processing steps using available tools.
11
+ """
12
+
13
+ execution_plan = state.get("execution_plan", [])
14
+ processed_files = state.get("processed_files", {})
15
+ completed_steps = state.get("completed_steps", [])
16
+ errors = state.get("errors", [])
17
+
18
+ if not execution_plan:
19
+ return {
20
+ "processed_files": processed_files,
21
+ "completed_steps": completed_steps,
22
+ "errors": errors + ["No execution plan available"],
23
+ "messages": state.get("messages", [])
24
+ }
25
+
26
+ # Create tool lookup
27
+ tool_lookup = {tool.name: tool for tool in tools}
28
+
29
+ # Execute each step in the plan
30
+ current_file_urls = {} # Track current URL for each original file
31
+
32
+ for step in execution_plan:
33
+ try:
34
+ step_name = step.get("step", "unknown")
35
+ tool_name = step.get("tool", "")
36
+ params = step.get("params", {})
37
+ description = step.get("description", "")
38
+
39
+ if tool_name not in tool_lookup:
40
+ if tool_name == "manual_combine":
41
+ # Handle manual combine step
42
+ result = handle_manual_combine(step, current_file_urls)
43
+ if result:
44
+ processed_files.update(result)
45
+ completed_steps.append(f"βœ… {description}")
46
+ else:
47
+ errors.append(f"❌ {description} - Manual combination needed")
48
+ else:
49
+ errors.append(f"❌ Tool '{tool_name}' not available for step: {description}")
50
+ continue
51
+
52
+ # Get the tool and execute
53
+ tool = tool_lookup[tool_name]
54
+
55
+ # Update file URL if this file has been processed before
56
+ original_file = params.get("audio_file", "")
57
+ if original_file in current_file_urls:
58
+ params["audio_file"] = current_file_urls[original_file]
59
+
60
+ # Execute the tool
61
+ result = await tool.ainvoke(params)
62
+
63
+ # Extract new file URL from result if available
64
+ new_file_url = extract_file_url_from_result(result, original_file)
65
+ if new_file_url and new_file_url != params["audio_file"]:
66
+ current_file_urls[original_file] = new_file_url
67
+ processed_files[original_file] = new_file_url
68
+
69
+ completed_steps.append(f"βœ… {description}")
70
+
71
+ except Exception as e:
72
+ error_msg = f"❌ Failed step '{step.get('description', 'unknown')}': {str(e)}"
73
+ errors.append(error_msg)
74
+
75
+ # Create processing summary
76
+ processing_summary = create_processing_summary(completed_steps, errors, processed_files)
77
+ messages = state.get("messages", [])
78
+ messages.append(AIMessage(content=processing_summary))
79
+
80
+ # Determine if reprocessing is needed
81
+ needs_reprocessing = len(errors) > 0 and len(completed_steps) > 0
82
+
83
+ return {
84
+ "processed_files": processed_files,
85
+ "completed_steps": completed_steps,
86
+ "errors": errors,
87
+ "needs_reprocessing": needs_reprocessing,
88
+ "messages": messages
89
+ }
90
+
91
+
92
+ def extract_file_url_from_result(result, original_file: str) -> str:
93
+ """Extract the new file URL from tool result."""
94
+
95
+ if hasattr(result, 'artifact') and result.artifact:
96
+ # If result has artifact with file info
97
+ if hasattr(result.artifact, 'url'):
98
+ return result.artifact.url
99
+ elif hasattr(result.artifact, 'path'):
100
+ return result.artifact.path
101
+
102
+ if hasattr(result, 'content'):
103
+ content = result.content
104
+ # Look for URLs in the content
105
+ import re
106
+ url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+\.(?:mp3|wav|m4a|flac|aac|ogg)'  # non-capturing group so findall returns the full URL
107
+ urls = re.findall(url_pattern, content, re.IGNORECASE)
108
+ if urls:
109
+ return urls[0]
110
+
111
+ # If no new URL found, return the original
112
+ return original_file
113
+
114
+
115
+ def handle_manual_combine(step: Dict[str, Any], current_file_urls: Dict[str, str]) -> Dict[str, str]:
116
+ """Handle manual file combination step."""
117
+
118
+ files = step.get("params", {}).get("files", [])
119
+
120
+ if len(files) < 2:
121
+ return {}
122
+
123
+ # For now, just return a placeholder combined file
124
+ # In a real implementation, this would call a combine tool
125
+ combined_url = f"combined_dialogue_{len(files)}_files.mp3"
126
+
127
+ return {"combined_dialogue": combined_url}
128
+
129
+
130
+ def create_processing_summary(completed_steps: List[str], errors: List[str], processed_files: Dict[str, str]) -> str:
131
+ """Create a summary of the processing results."""
132
+
133
+ summary = "πŸ”§ **Audio Processing Complete**\n\n"
134
+
135
+ # Completed steps
136
+ if completed_steps:
137
+ summary += f"**βœ… Completed Steps ({len(completed_steps)}):**\n"
138
+ for step in completed_steps[-5:]: # Show last 5 steps
139
+ summary += f"- {step}\n"
140
+ if len(completed_steps) > 5:
141
+ summary += f"- ... and {len(completed_steps) - 5} more steps\n"
142
+ summary += "\n"
143
+
144
+ # Processed files
145
+ if processed_files:
146
+ summary += "**🎡 Processed Audio Files:**\n"
147
+ for original, processed in processed_files.items():
148
+ filename = original.split('/')[-1] if '/' in original else original
149
+ processed_filename = processed.split('/')[-1] if '/' in processed else processed
150
+ summary += f"- {filename} β†’ {processed_filename}\n"
151
+ summary += "\n"
152
+
153
+ # Errors
154
+ if errors:
155
+ summary += f"**⚠️ Issues Encountered ({len(errors)}):**\n"
156
+ for error in errors[-3:]: # Show last 3 errors
157
+ summary += f"- {error}\n"
158
+ if len(errors) > 3:
159
+ summary += f"- ... and {len(errors) - 3} more issues\n"
160
+ summary += "\n"
161
+
162
+ if processed_files and not errors:
163
+ summary += "πŸŽ‰ **All processing completed successfully!**"
164
+ elif processed_files and errors:
165
+ summary += "⚠️ **Processing completed with some issues. Validation recommended.**"
166
+ else:
167
+ summary += "❌ **Processing failed. Please check the issues above.**"
168
+
169
+ return summary
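Reviewer note: audio_processor_node consumes the step dictionaries produced by planner_node. A sketch of the expected shape, with illustrative values (the tool name must match one exposed by the MCP server):

example_step = {
    "step": "normalize_https://example.com/audio.mp3",
    "tool": "apply_normalization",  # looked up in tool_lookup by name
    "params": {"audio_file": "https://example.com/audio.mp3", "target_level": -3},
    "description": "Normalize audio levels",
}
# The loop above then effectively runs:
# await tool_lookup[example_step["tool"]].ainvoke(example_step["params"])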
src/nodes/chat.py ADDED
@@ -0,0 +1,132 @@
1
+ """
2
+ Chat node for handling general questions and conversations.
3
+ """
4
+
5
+ from typing import Dict, Any
6
+ from langchain_core.messages import AIMessage
7
+
8
+
9
+ def chat_node(state: Dict[str, Any]) -> Dict[str, Any]:
10
+ """
11
+ Handle general chat messages and questions about audio capabilities.
12
+ """
13
+
14
+ user_request = state.get("user_request", "")
15
+
16
+ # Generate response based on the user's question
17
+ response = generate_chat_response(user_request)
18
+
19
+ # Add AI response to messages
20
+ messages = state.get("messages", [])
21
+ messages.append(AIMessage(content=response))
22
+
23
+ return {
24
+ "messages": messages,
25
+ "final_response": response
26
+ }
27
+
28
+
29
+ def generate_chat_response(user_request: str) -> str:
30
+ """Generate appropriate chat response."""
31
+
32
+ user_lower = user_request.lower()
33
+
34
+ # Audio tools information
35
+ if any(keyword in user_lower for keyword in ["tools", "available", "capabilities", "what can"]):
36
+ return """
37
+ 🎡 **Audio Agent Capabilities**
38
+
39
+ I can help you process and improve audio files using these tools:
40
+
41
+ **📊 Analysis & Information:**
42
+ - Get audio duration and metadata
43
+ - Generate timestamped transcriptions
44
+ - Analyze audio properties
45
+
46
+ **✂️ Audio Editing:**
47
+ - Cut and trim audio segments
48
+ - Remove silence from recordings
49
+ - Apply fade in/out effects
50
+ - Reverse audio playback
51
+
52
+ **🔧 Audio Enhancement:**
53
+ - Normalize audio levels (-20dB to 0dB)
54
+ - Adjust volume/gain (-20dB to +20dB)
55
+ - Change playback speed (0.25x to 4x)
56
+
57
+ **🎭 Advanced Processing:**
58
+ - Remove filler words from speech
59
+ - Combine multiple audio files into dialogue
60
+ - Create professional audio workflows
61
+
62
+ To get started, simply provide audio file URLs and describe what you'd like me to do!
63
+ """
64
+
65
+ # How to use instructions
66
+ if any(keyword in user_lower for keyword in ["how", "use", "start", "begin"]):
67
+ return """
68
+ 🚀 **How to Use the Audio Agent**
69
+
70
+ 1. **Provide Audio Files**: Share URLs to your audio files (mp3, wav, m4a, etc.)
71
+
72
+ 2. **Describe Your Goal**: Tell me what you want to achieve:
73
+ - "Remove filler words and improve audio quality"
74
+ - "Cut this audio from 30 seconds to 2 minutes"
75
+ - "Combine these files into a dialogue"
76
+ - "Normalize the volume and add fade effects"
77
+
78
+ 3. **Let Me Work**: I'll automatically:
79
+ - Generate timestamped transcripts
80
+ - Create an execution plan
81
+ - Process your audio step by step
82
+ - Provide you with the improved audio file
83
+
84
+ **Example**:
85
+ "Here's my recording: https://example.com/audio.mp3 - please remove filler words and normalize the volume"
86
+ """
87
+
88
+ # Filler words explanation
89
+ if any(keyword in user_lower for keyword in ["filler", "um", "uh", "like"]):
90
+ return """
91
+ 🗣️ **Filler Word Removal**
92
+
93
+ I can help remove common filler words like "um", "uh", "like", "you know", etc. from your audio.
94
+
95
+ **Process**:
96
+ 1. I'll transcribe your audio with timestamps
97
+ 2. Identify filler words and their locations
98
+ 3. Remove those segments from the audio
99
+ 4. Apply smooth transitions to maintain natural flow
100
+
101
+ **Benefits**:
102
+ - More professional-sounding recordings
103
+ - Improved clarity and pace
104
+ - Better listener engagement
105
+
106
+ Just provide your audio file and mention "remove filler words" in your request!
107
+ """
108
+
109
+ # General greeting or unclear request
110
+ if any(keyword in user_lower for keyword in ["hello", "hi", "help"]) or len(user_request.strip()) < 10:
111
+ return """
112
+ 👋 **Hello! I'm your Audio Processing Assistant**
113
+
114
+ I specialize in improving and processing audio files. I can:
115
+
116
+ - Remove filler words and improve speech clarity
117
+ - Cut, trim, and edit audio segments
118
+ - Normalize volume and apply professional effects
119
+ - Combine multiple files into conversations
120
+ - Generate timestamped transcriptions
121
+
122
+ **Ready to enhance your audio?** Just share your audio file URLs and tell me what you'd like me to do!
123
+
124
+ Type "what tools are available?" to see all my capabilities.
125
+ """
126
+
127
+ # Default response for other questions
128
+ return """
129
+ I'm here to help with audio processing! While I can chat about audio-related topics, my specialty is improving audio files.
130
+
131
+ If you have audio files you'd like me to process, just share the URLs and describe what you need. Otherwise, feel free to ask me about my audio processing capabilities!
132
+ """
src/nodes/final_response.py ADDED
@@ -0,0 +1,205 @@
1
+ """
2
+ Final response node for formatting the final response to the user.
3
+ """
4
+
5
+ from typing import Dict, Any
6
+ from langchain_core.messages import AIMessage
7
+
8
+
9
+ def final_response_node(state: Dict[str, Any]) -> Dict[str, Any]:
10
+ """
11
+ Generate the final response to the user with processing results and audio files.
12
+ """
13
+
14
+ processing_type = state.get("processing_type", "")
15
+ processed_files = state.get("processed_files", {})
16
+ scripts = state.get("scripts", {})
17
+ errors = state.get("errors", [])
18
+ processing_metadata = state.get("processing_metadata", {})
19
+ user_request = state.get("user_request", "")
20
+
21
+ # Generate final response based on processing type
22
+ if processing_type == "chat":
23
+ # Chat responses are already handled in chat_node
24
+ final_response = state.get("final_response", "")
25
+ else:
26
+ final_response = create_audio_processing_response(
27
+ user_request, processed_files, scripts, errors, processing_metadata
28
+ )
29
+
30
+ # Add final response to messages
31
+ messages = state.get("messages", [])
32
+ if not any(msg.content == final_response for msg in messages if hasattr(msg, 'content')):
33
+ messages.append(AIMessage(content=final_response))
34
+
35
+ # Set final audio URL if available
36
+ final_audio_url = get_final_audio_url(processed_files, processing_type)
37
+
38
+ return {
39
+ "final_response": final_response,
40
+ "final_audio_url": final_audio_url,
41
+ "messages": messages
42
+ }
43
+
44
+
45
+ def create_audio_processing_response(
46
+ user_request: str,
47
+ processed_files: Dict[str, str],
48
+ scripts: Dict[str, Any],
49
+ errors: list,
50
+ processing_metadata: Dict[str, Any]
51
+ ) -> str:
52
+ """Create comprehensive audio processing response."""
53
+
54
+ response = "🎡 **Audio Processing Complete!**\n\n"
55
+
56
+ # User request summary
57
+ response += f"**Your Request**: {user_request}\n\n"
58
+
59
+ # Processing results
60
+ if processed_files:
61
+ response += "**βœ… Successfully Processed Files:**\n"
62
+ for i, (original, processed) in enumerate(processed_files.items(), 1):
63
+ original_name = get_filename_from_url(original)
64
+ processed_name = get_filename_from_url(processed)
65
+
66
+ response += f"{i}. **{original_name}**\n"
67
+ response += f" πŸ”— **Download**: [{processed_name}]({processed})\n\n"
68
+
69
+ # Add script info if available
70
+ if original in scripts:
71
+ script_data = scripts[original]
72
+ filler_count = len(script_data.get("filler_words", []))
73
+ if filler_count > 0:
74
+ response += f" πŸ“ Removed {filler_count} filler words\n"
75
+ response += f" πŸ“„ Transcript available\n\n"
76
+
77
+ # Processing summary
78
+ validation_results = processing_metadata.get("validation_results", {})
79
+ if validation_results:
80
+ completion_rate = validation_results.get("step_completion_rate", 0)
81
+ overall_status = validation_results.get("overall_status", "unknown")
82
+
83
+ response += f"**πŸ“Š Processing Summary:**\n"
84
+ response += f"- Status: {overall_status.replace('_', ' ').title()}\n"
85
+ response += f"- Completion: {completion_rate:.1%}\n"
86
+ response += f"- Files processed: {len(processed_files)}\n\n"
87
+
88
+ # Improvements made
89
+ improvements = extract_improvements_from_processing(processed_files, scripts, processing_metadata)
90
+ if improvements:
91
+ response += "**πŸ”§ Improvements Applied:**\n"
92
+ for improvement in improvements:
93
+ response += f"- {improvement}\n"
94
+ response += "\n"
95
+
96
+ # Recommendations
97
+ recommendations = validation_results.get("recommendations", [])
98
+ if recommendations:
99
+ response += "**πŸ’‘ Recommendations:**\n"
100
+ for rec in recommendations[:3]: # Show top 3
101
+ response += f"- {rec}\n"
102
+ response += "\n"
103
+
104
+ # Errors (if any)
105
+ if errors:
106
+ response += "**⚠️ Issues Encountered:**\n"
107
+ for error in errors[-2:]: # Show last 2 errors
108
+ response += f"- {error}\n"
109
+ response += "\n"
110
+
111
+ # Call to action
112
+ if processed_files:
113
+ response += "πŸŽ‰ **Your enhanced audio files are ready!** "
114
+ response += "Click the download links above to get your improved audio.\n\n"
115
+ response += "Need further adjustments? Just let me know what else you'd like me to do!"
116
+ else:
117
+ response += "❌ **Processing unsuccessful.** "
118
+ response += "Please check your audio file URLs and try again, or ask for help with a different approach."
119
+
120
+ return response
121
+
122
+
123
+ def get_final_audio_url(processed_files: Dict[str, str], processing_type: str) -> str:
124
+ """Get the final audio URL to return to the user."""
125
+
126
+ if not processed_files:
127
+ return ""
128
+
129
+ # For dialogue generation, look for combined file
130
+ if processing_type == "dialogue_generation":
131
+ for original, processed in processed_files.items():
132
+ if "combined" in processed or "dialogue" in processed:
133
+ return processed
134
+
135
+ # For single file processing, return the processed file
136
+ if len(processed_files) == 1:
137
+ return list(processed_files.values())[0]
138
+
139
+ # For multiple files, return the first one (or could be user's choice)
140
+ return list(processed_files.values())[0] if processed_files else ""
141
+
142
+
143
+ def get_filename_from_url(url: str) -> str:
144
+ """Extract filename from URL or path."""
145
+ if not url:
146
+ return "unknown_file"
147
+
148
+ # Extract filename from URL
149
+ if '/' in url:
150
+ filename = url.split('/')[-1]
151
+ else:
152
+ filename = url
153
+
154
+ # Remove query parameters if present
155
+ if '?' in filename:
156
+ filename = filename.split('?')[0]
157
+
158
+ return filename or "processed_audio"
159
+
160
+
161
+ def extract_improvements_from_processing(
162
+ processed_files: Dict[str, str],
163
+ scripts: Dict[str, Any],
164
+ processing_metadata: Dict[str, Any]
165
+ ) -> list:
166
+ """Extract list of improvements made during processing."""
167
+
168
+ improvements = []
169
+
170
+ # Check for filler word removal
171
+ total_fillers = 0
172
+ for script_data in scripts.values():
173
+ filler_words = script_data.get("filler_words", [])
174
+ total_fillers += len(filler_words)
175
+
176
+ if total_fillers > 0:
177
+ improvements.append(f"Removed {total_fillers} filler words for cleaner speech")
178
+
179
+ # Check for audio enhancement
180
+ if processed_files:
181
+ improvements.append("Enhanced audio quality and consistency")
182
+ improvements.append("Optimized volume levels and normalization")
183
+
184
+ # Check for silence removal
185
+ validation_results = processing_metadata.get("validation_results", {})
186
+ recommendations = validation_results.get("recommendations", [])
187
+
188
+ if any("silence" in rec.lower() for rec in recommendations):
189
+ improvements.append("Removed unnecessary silence and gaps")
190
+
191
+ if any("fade" in rec.lower() for rec in recommendations):
192
+ improvements.append("Added professional fade effects")
193
+
194
+ if any("cut" in rec.lower() for rec in recommendations):
195
+ improvements.append("Precisely cut and trimmed audio segments")
196
+
197
+ # Default improvements if files were processed
198
+ if processed_files and not improvements:
199
+ improvements.extend([
200
+ "Applied professional audio processing",
201
+ "Improved overall audio quality",
202
+ "Optimized for better listening experience"
203
+ ])
204
+
205
+ return improvements
src/nodes/planner.py ADDED
@@ -0,0 +1,284 @@
1
+ """
2
+ Planner node for creating execution plans for audio processing.
3
+ """
4
+
5
+ from typing import Dict, Any, List
6
+ from langchain_core.messages import AIMessage
7
+
8
+
9
+ def planner_node(state: Dict[str, Any]) -> Dict[str, Any]:
10
+ """
11
+ Create an execution plan for audio processing based on user request and scripts.
12
+ """
13
+
14
+ user_request = state.get("user_request", "")
15
+ audio_files = state.get("audio_files", [])
16
+ scripts = state.get("scripts", {})
17
+ processing_type = state.get("processing_type", "")
18
+
19
+ # Create execution plan based on processing type and user request
20
+ if processing_type == "dialogue_generation":
21
+ execution_plan = create_dialogue_plan(user_request, audio_files, scripts)
22
+ else:
23
+ execution_plan = create_audio_processing_plan(user_request, audio_files, scripts)
24
+
25
+ # Create plan summary message
26
+ plan_summary = create_plan_summary(execution_plan)
27
+ messages = state.get("messages", [])
28
+ messages.append(AIMessage(content=plan_summary))
29
+
30
+ return {
31
+ "execution_plan": execution_plan,
32
+ "messages": messages
33
+ }
34
+
35
+
36
+ def create_audio_processing_plan(user_request: str, audio_files: List[str], scripts: Dict[str, Any]) -> List[Dict[str, Any]]:
37
+ """Create execution plan for single file audio processing."""
38
+
39
+ plan = []
40
+ user_lower = user_request.lower()
41
+
42
+ for audio_file in audio_files:
43
+ file_plan = []
44
+
45
+ # Step 1: Update audio info
46
+ file_plan.append({
47
+ "step": f"update_info_{audio_file}",
48
+ "tool": "update_audio_info",
49
+ "params": {"audio_file": audio_file},
50
+ "description": f"Update audio information for {audio_file}"
51
+ })
52
+
53
+ # Step 2: Update duration info
54
+ file_plan.append({
55
+ "step": f"update_duration_{audio_file}",
56
+ "tool": "update_duration_info",
57
+ "params": {"audio_file": audio_file},
58
+ "description": f"Update duration information for {audio_file}"
59
+ })
60
+
61
+ # Step 3: Process based on user request
62
+
63
+ # Filler word removal (via silence trimming and cutting)
64
+ if any(keyword in user_lower for keyword in ["filler", "remove", "clean", "improve"]):
65
+ # First, trim silence
66
+ file_plan.append({
67
+ "step": f"trim_silence_{audio_file}",
68
+ "tool": "apply_silence_trimming",
69
+ "params": {"audio_file": audio_file, "threshold_db": -40},
70
+ "description": f"Remove silence and filler segments from {audio_file}"
71
+ })
72
+
73
+ # Apply filler word removal via cutting (using script data)
74
+ if audio_file in scripts and scripts[audio_file].get("filler_words"):
75
+ file_plan.append({
76
+ "step": f"remove_fillers_{audio_file}",
77
+ "tool": "process_cut_audio",
78
+ "params": {"audio_file": audio_file, "_start_time": 0, "_end_time": 100},
79
+ "description": f"Remove filler words from {audio_file}",
80
+ "filler_data": scripts[audio_file]["filler_words"]
81
+ })
82
+
83
+ # Audio cutting/trimming
84
+ if any(keyword in user_lower for keyword in ["cut", "trim", "segment"]):
85
+ # Extract time ranges if specified
86
+ start_time, end_time = extract_time_range(user_request)
87
+ file_plan.append({
88
+ "step": f"cut_audio_{audio_file}",
89
+ "tool": "process_cut_audio",
90
+ "params": {"audio_file": audio_file, "_start_time": start_time, "_end_time": end_time},
91
+ "description": f"Cut audio from {start_time}s to {end_time}s"
92
+ })
93
+
94
+ # Volume/normalization adjustments
95
+ if any(keyword in user_lower for keyword in ["normalize", "volume", "loud", "quiet", "level"]):
96
+ if "normalize" in user_lower:
97
+ target_level = extract_target_level(user_request)
98
+ file_plan.append({
99
+ "step": f"normalize_{audio_file}",
100
+ "tool": "apply_normalization",
101
+ "params": {"audio_file": audio_file, "target_level": target_level},
102
+ "description": f"Normalize audio to {target_level}dB"
103
+ })
104
+ else:
105
+ gain_db = extract_gain_value(user_request)
106
+ file_plan.append({
107
+ "step": f"adjust_volume_{audio_file}",
108
+ "tool": "apply_volume_adjustment",
109
+ "params": {"audio_file": audio_file, "gain_db": gain_db},
110
+ "description": f"Adjust volume by {gain_db}dB"
111
+ })
112
+
113
+ # Speed adjustments
114
+ if any(keyword in user_lower for keyword in ["speed", "fast", "slow", "tempo"]):
115
+ speed_factor = extract_speed_factor(user_request)
116
+ file_plan.append({
117
+ "step": f"adjust_speed_{audio_file}",
118
+ "tool": "apply_speed_adjustment",
119
+ "params": {"audio_file": audio_file, "speed_factor": speed_factor},
120
+ "description": f"Adjust speed to {speed_factor}x"
121
+ })
122
+
123
+ # Fade effects
124
+ if any(keyword in user_lower for keyword in ["fade", "smooth", "transition"]):
125
+ fade_in, fade_out = extract_fade_values(user_request)
126
+ file_plan.append({
127
+ "step": f"apply_fades_{audio_file}",
128
+ "tool": "apply_fades",
129
+ "params": {"audio_file": audio_file, "fade_in_ms": fade_in, "fade_out_ms": fade_out},
130
+ "description": f"Apply fade in ({fade_in}ms) and fade out ({fade_out}ms)"
131
+ })
132
+
133
+ # If no specific processing mentioned, apply default enhancement
134
+ if len(file_plan) <= 2: # Only info updates
135
+ file_plan.extend([
136
+ {
137
+ "step": f"enhance_{audio_file}",
138
+ "tool": "apply_silence_trimming",
139
+ "params": {"audio_file": audio_file, "threshold_db": -40},
140
+ "description": f"Remove silence from {audio_file}"
141
+ },
142
+ {
143
+ "step": f"normalize_{audio_file}",
144
+ "tool": "apply_normalization",
145
+ "params": {"audio_file": audio_file, "target_level": -3},
146
+ "description": f"Normalize audio levels"
147
+ }
148
+ ])
149
+
150
+ plan.extend(file_plan)
151
+
152
+ return plan
153
+
154
+
155
+ def create_dialogue_plan(user_request: str, audio_files: List[str], scripts: Dict[str, Any]) -> List[Dict[str, Any]]:
156
+ """Create execution plan for dialogue generation from multiple files."""
157
+
158
+ plan = []
159
+
160
+ # Step 1: Process each file individually first
161
+ for audio_file in audio_files:
162
+ # Update info
163
+ plan.append({
164
+ "step": f"update_info_{audio_file}",
165
+ "tool": "update_audio_info",
166
+ "params": {"audio_file": audio_file},
167
+ "description": f"Update audio info for {audio_file}"
168
+ })
169
+
170
+ # Clean up the audio
171
+ plan.append({
172
+ "step": f"cleanup_{audio_file}",
173
+ "tool": "apply_silence_trimming",
174
+ "params": {"audio_file": audio_file, "threshold_db": -40},
175
+ "description": f"Clean silence from {audio_file}"
176
+ })
177
+
178
+ # Normalize levels
179
+ plan.append({
180
+ "step": f"normalize_{audio_file}",
181
+ "tool": "apply_normalization",
182
+ "params": {"audio_file": audio_file, "target_level": -6},
183
+ "description": f"Normalize {audio_file} for dialogue"
184
+ })
185
+
186
+ # Add fades for smooth transitions
187
+ plan.append({
188
+ "step": f"fade_{audio_file}",
189
+ "tool": "apply_fades",
190
+ "params": {"audio_file": audio_file, "fade_in_ms": 200, "fade_out_ms": 200},
191
+ "description": f"Add fades to {audio_file}"
192
+ })
193
+
194
+ # Step 2: Combine files (this would need a combine tool, but we'll note it)
195
+ plan.append({
196
+ "step": "combine_dialogue",
197
+ "tool": "manual_combine", # This would need to be implemented
198
+ "params": {"files": audio_files},
199
+ "description": "Combine processed files into dialogue",
200
+ "note": "This step requires manual combination or a dedicated combine tool"
201
+ })
202
+
203
+ return plan
204
+
205
+
206
+ def extract_time_range(user_request: str) -> tuple:
207
+ """Extract start and end times from user request."""
208
+ import re
209
+
210
+     # Look for time patterns like "30 seconds to 2 minutes" or "1:30 to 3:45";
+     # values are returned in seconds, matching how the plan steps describe them.
+     time_pattern = r'(\d+)(?::(\d+))?\s*(seconds?|minutes?|s|m)?\s*to\s*(\d+)(?::(\d+))?\s*(seconds?|minutes?|s|m)?'
+     match = re.search(time_pattern, user_request.lower())
+
+     if match:
+         def to_seconds(value, colon_part, unit):
+             if colon_part:  # "mm:ss" form
+                 return int(value) * 60 + int(colon_part)
+             if unit and unit.startswith("m"):  # e.g. "2 minutes"
+                 return int(value) * 60
+             return int(value)  # bare number or explicit seconds
+
+         s_val, s_colon, s_unit, e_val, e_colon, e_unit = match.groups()
+         return to_seconds(s_val, s_colon, s_unit), to_seconds(e_val, e_colon, e_unit)
+
+     # Default range (seconds)
+     return 0, 30
222
+
223
+
224
+ def extract_target_level(user_request: str) -> float:
225
+ """Extract target normalization level."""
226
+ import re
227
+ match = re.search(r'-?(\d+(?:\.\d+)?)\s*db', user_request.lower())
228
+ if match:
229
+ return -abs(float(match.group(1))) # Ensure negative
230
+ return -3 # Default
231
+
232
+
233
+ def extract_gain_value(user_request: str) -> float:
234
+ """Extract gain adjustment value."""
235
+ import re
236
+ match = re.search(r'([+-]?\d+(?:\.\d+)?)\s*db', user_request.lower())
237
+ if match:
238
+ return float(match.group(1))
239
+ return 0 # Default
240
+
241
+
242
+ def extract_speed_factor(user_request: str) -> float:
243
+ """Extract speed factor."""
244
+ import re
245
+ match = re.search(r'(\d+(?:\.\d+)?)\s*x', user_request.lower())
246
+ if match:
247
+ return float(match.group(1))
248
+
249
+ if any(word in user_request.lower() for word in ["fast", "faster", "quick"]):
250
+ return 1.5
251
+ elif any(word in user_request.lower() for word in ["slow", "slower"]):
252
+ return 0.75
253
+
254
+ return 1.0 # Default
255
+
256
+
257
+ def extract_fade_values(user_request: str) -> tuple:
258
+ """Extract fade in/out values."""
259
+ import re
260
+ match = re.search(r'(\d+)\s*ms', user_request.lower())
261
+ if match:
262
+ value = int(match.group(1))
263
+ return value, value
264
+ return 100, 100 # Default
265
+
266
+
267
+ def create_plan_summary(execution_plan: List[Dict[str, Any]]) -> str:
268
+ """Create a summary of the execution plan."""
269
+
270
+ if not execution_plan:
271
+ return "❌ **No execution plan could be created**"
272
+
273
+ summary = "πŸ“‹ **Execution Plan Created**\n\n"
274
+ summary += f"**Total Steps**: {len(execution_plan)}\n\n"
275
+
276
+ for i, step in enumerate(execution_plan, 1):
277
+ tool_name = step.get("tool", "unknown")
278
+ description = step.get("description", "No description")
279
+
280
+ summary += f"**{i}.** `{tool_name}`\n"
281
+ summary += f" {description}\n\n"
282
+
283
+ summary += "βœ… **Ready to execute plan...**"
284
+ return summary
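Reviewer note: with the seconds-based parsing in extract_time_range above, the planner helpers behave roughly as follows (illustrative inputs and outputs; the import path is assumed):

from src.nodes.planner import extract_time_range, extract_target_level, extract_speed_factor  # assumed path

print(extract_time_range("cut this from 1:30 to 3:45"))        # -> (90, 225)
print(extract_time_range("cut from 30 seconds to 2 minutes"))  # -> (30, 120)
print(extract_time_range("just clean it up"))                  # -> (0, 30), the default window
print(extract_target_level("normalize to -6 dB"))              # -> -6.0
print(extract_speed_factor("make it 1.5x faster"))             # -> 1.5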
src/nodes/router.py ADDED
@@ -0,0 +1,97 @@
1
+ """
2
+ Router node to determine processing type based on user input.
3
+ """
4
+
5
+ import re
6
+ from typing import Dict, Any, List
7
+ from langchain_core.messages import HumanMessage, AIMessage
8
+
9
+
10
+ def router_node(state: Dict[str, Any]) -> Dict[str, Any]:
11
+ """
12
+ Route the conversation based on user input.
13
+
14
+ Determines if this is:
15
+ - A general chat question
16
+ - Audio processing request
17
+ - Dialogue generation request
18
+ """
19
+
20
+ # Get the latest user message
21
+ latest_message = None
22
+ for msg in reversed(state.get("messages", [])):
23
+ if isinstance(msg, HumanMessage):
24
+ latest_message = msg
25
+ break
26
+
27
+ if not latest_message:
28
+ return {
29
+ "processing_type": "chat",
30
+ "user_request": "",
31
+ "audio_files": []
32
+ }
33
+
34
+ user_content = latest_message.content.lower()
35
+
36
+ # Extract audio file URLs/paths from the message
37
+ audio_files = extract_audio_files(latest_message.content)
38
+
39
+ # Determine processing type
40
+ processing_type = determine_processing_type(user_content, audio_files)
41
+
42
+ return {
43
+ "processing_type": processing_type,
44
+ "user_request": latest_message.content,
45
+ "audio_files": audio_files,
46
+ "errors": [],
47
+ "needs_reprocessing": False,
48
+ "completed_steps": [],
49
+ "scripts": {},
50
+ "processed_files": {},
51
+ "processing_metadata": {}
52
+ }
53
+
54
+
55
+ def extract_audio_files(content: str) -> List[str]:
56
+ """Extract audio file URLs or paths from user message."""
57
+
58
+ # Look for URLs (http/https)
59
+ url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+\.(?:mp3|wav|m4a|flac|aac|ogg)'  # non-capturing group so findall returns full URLs
60
+ urls = re.findall(url_pattern, content, re.IGNORECASE)
61
+
62
+ # Look for file paths
63
+ path_pattern = r'[^\s<>"{}|\\^`\[\]]+\.(?:mp3|wav|m4a|flac|aac|ogg)'  # non-capturing group so findall returns full paths
64
+ paths = re.findall(path_pattern, content, re.IGNORECASE)
65
+
66
+ # Combine and deduplicate
67
+ audio_files = list(set(urls + [path for path in paths if not path.startswith('http')]))
68
+
69
+ return audio_files
70
+
71
+
72
+ def determine_processing_type(content: str, audio_files: List[str]) -> str:
73
+ """Determine the type of processing needed."""
74
+
75
+ # If no audio files, it's a chat
76
+ if not audio_files:
77
+ # Check if user is asking about audio tools or capabilities
78
+ audio_keywords = [
79
+ 'audio', 'sound', 'music', 'voice', 'recording', 'transcript',
80
+ 'cut', 'trim', 'normalize', 'volume', 'fade', 'speed', 'reverse'
81
+ ]
82
+
83
+ if any(keyword in content for keyword in audio_keywords):
84
+ return "chat" # User asking about audio capabilities
85
+
86
+ return "chat"
87
+
88
+ # If audio files are present, determine processing type
89
+ dialogue_keywords = [
90
+ 'dialogue', 'conversation', 'combine', 'merge', 'mix',
91
+ 'discussion', 'interview'
92
+ ]
93
+
94
+ if any(keyword in content for keyword in dialogue_keywords):
95
+ return "dialogue_generation"
96
+
97
+ return "audio_processing"
src/nodes/script_generator.py ADDED
@@ -0,0 +1,159 @@
1
+ """
2
+ Script generator node for creating timestamped transcripts.
3
+ """
4
+
5
+ from typing import Dict, Any
6
+ from langchain_core.messages import AIMessage
7
+
8
+
9
+ async def script_generator_node(state: Dict[str, Any], tools: list) -> Dict[str, Any]:
10
+ """
11
+ Generate timestamped scripts for all audio files using transcription tools.
12
+ """
13
+
14
+ audio_files = state.get("audio_files", [])
15
+
16
+ if not audio_files:
17
+ return {
18
+ "scripts": {},
19
+ "errors": ["No audio files provided for transcription"]
20
+ }
21
+
22
+ scripts = {}
23
+ errors = []
24
+ completed_steps = state.get("completed_steps", [])
25
+
26
+ # Get transcription tools
27
+ transcribe_tool = None
28
+ update_transcription_tool = None
29
+
30
+ for tool in tools:
31
+ if tool.name == "transcribe_audio_sync":
32
+ transcribe_tool = tool
33
+ elif tool.name == "update_transcription_info":
34
+ update_transcription_tool = tool
35
+
36
+ if not transcribe_tool:
37
+ return {
38
+ "scripts": {},
39
+ "errors": ["Transcription tool not available"]
40
+ }
41
+
42
+ # Process each audio file
43
+ for audio_file in audio_files:
44
+ try:
45
+ # Update transcription info first if tool is available
46
+ if update_transcription_tool:
47
+ await update_transcription_tool.ainvoke({"audio_file": audio_file})
48
+
49
+ # Generate transcript with timestamps
50
+ transcript_result = await transcribe_tool.ainvoke({"audio_file": audio_file})
51
+
52
+ # Parse the transcript result
53
+ if hasattr(transcript_result, 'content'):
54
+ transcript_content = transcript_result.content
55
+ else:
56
+ transcript_content = str(transcript_result)
57
+
58
+ scripts[audio_file] = {
59
+ "transcript": transcript_content,
60
+ "timestamps": extract_timestamps(transcript_content),
61
+ "filler_words": identify_filler_words(transcript_content)
62
+ }
63
+
64
+ completed_steps.append(f"Transcribed: {audio_file}")
65
+
66
+ except Exception as e:
67
+ errors.append(f"Failed to transcribe {audio_file}: {str(e)}")
68
+
69
+ # Create response message
70
+ script_summary = create_script_summary(scripts)
71
+ messages = state.get("messages", [])
72
+ messages.append(AIMessage(content=script_summary))
73
+
74
+ return {
75
+ "scripts": scripts,
76
+ "completed_steps": completed_steps,
77
+ "errors": errors,
78
+ "messages": messages
79
+ }
80
+
81
+
82
+ def extract_timestamps(transcript_content: str) -> list:
83
+ """Extract timestamp information from transcript."""
84
+ # This is a simplified implementation
85
+ # In a real scenario, the transcription tool would provide proper timestamps
86
+
87
+ timestamps = []
88
+ lines = transcript_content.split('\n')
89
+
90
+ for i, line in enumerate(lines):
91
+ if line.strip():
92
+ # Estimate timestamps based on line position
93
+ start_time = i * 3.0 # Rough estimate of 3 seconds per line
94
+ end_time = start_time + 3.0
95
+
96
+ timestamps.append({
97
+ "start": start_time,
98
+ "end": end_time,
99
+ "text": line.strip()
100
+ })
101
+
102
+ return timestamps
103
+
104
+
105
+ def identify_filler_words(transcript_content: str) -> list:
106
+ """Identify filler words and their approximate positions."""
107
+
108
+ filler_words = [
109
+ "um", "uh", "like", "you know", "so", "well", "actually",
110
+ "basically", "literally", "I mean", "sort of", "kind of"
111
+ ]
112
+
113
+ found_fillers = []
114
+ words = transcript_content.lower().split()
115
+
116
+ for i, word in enumerate(words):
117
+ # Clean the word (remove punctuation) and build a two-word phrase for multi-word fillers
+ clean_word = word.strip('.,!?;:"()[]{}')
+ next_word = words[i + 1].strip('.,!?;:"()[]{}') if i + 1 < len(words) else ""
+
+ # Match single-word fillers ("um") and two-word fillers ("you know", "kind of")
+ if clean_word in filler_words or f"{clean_word} {next_word}" in filler_words:
121
+ found_fillers.append({
122
+ "word": clean_word,
123
+ "position": i,
124
+ "context": " ".join(words[max(0, i-2):min(len(words), i+3)])
125
+ })
126
+
127
+ return found_fillers
128
+
129
+
130
+ def create_script_summary(scripts: Dict[str, Any]) -> str:
131
+ """Create a summary of the generated scripts."""
132
+
133
+ if not scripts:
134
+ return "❌ **Script Generation Failed**\n\nNo transcripts could be generated."
135
+
136
+ summary = "πŸ“ **Transcripts Generated Successfully**\n\n"
137
+
138
+ for file_url, script_data in scripts.items():
139
+ filename = file_url.split('/')[-1] if '/' in file_url else file_url
140
+ transcript = script_data.get("transcript", "")
141
+ filler_count = len(script_data.get("filler_words", []))
142
+ timestamp_count = len(script_data.get("timestamps", []))
143
+
144
+ summary += f"**🎡 {filename}**\n"
145
+ summary += f"- Transcript length: {len(transcript)} characters\n"
146
+ summary += f"- Timestamps: {timestamp_count} segments\n"
147
+ summary += f"- Filler words detected: {filler_count}\n\n"
148
+
149
+ # Show first few lines of transcript
150
+ lines = transcript.split('\n')[:3]
151
+ if lines:
152
+ summary += "**Preview:**\n"
153
+ for line in lines:
154
+ if line.strip():
155
+ summary += f"> {line.strip()}\n"
156
+ summary += "\n"
157
+
158
+ summary += "βœ… **Ready for execution planning...**"
159
+ return summary
src/nodes/validator.py ADDED
@@ -0,0 +1,241 @@
1
+ """
2
+ Validator node for checking processing results and determining if reprocessing is needed.
3
+ """
4
+
5
+ from typing import Dict, Any, List
6
+ from langchain_core.messages import AIMessage
7
+
8
+
9
+ def validator_node(state: Dict[str, Any]) -> Dict[str, Any]:
10
+ """
11
+ Validate the processing results and determine if reprocessing is needed.
12
+ """
13
+
14
+ processed_files = state.get("processed_files", {})
15
+ errors = state.get("errors", [])
16
+ completed_steps = state.get("completed_steps", [])
17
+ execution_plan = state.get("execution_plan", [])
18
+ needs_reprocessing = state.get("needs_reprocessing", False)
19
+
20
+ # Perform validation checks
21
+ validation_results = perform_validation_checks(
22
+ processed_files, errors, completed_steps, execution_plan
23
+ )
24
+
25
+ # Determine if reprocessing is needed
26
+ should_reprocess = determine_reprocessing_need(validation_results, needs_reprocessing)
27
+
28
+ # Create validation summary
29
+ validation_summary = create_validation_summary(validation_results, should_reprocess)
30
+ messages = state.get("messages", [])
31
+ messages.append(AIMessage(content=validation_summary))
32
+
33
+ return {
34
+ "needs_reprocessing": should_reprocess,
35
+ "processing_metadata": {
36
+ "validation_results": validation_results,
37
+ "validation_timestamp": get_current_timestamp()
38
+ },
39
+ "messages": messages
40
+ }
41
+
42
+
43
+ def perform_validation_checks(
44
+ processed_files: Dict[str, str],
45
+ errors: List[str],
46
+ completed_steps: List[str],
47
+ execution_plan: List[Dict[str, Any]]
48
+ ) -> Dict[str, Any]:
49
+ """Perform comprehensive validation of processing results."""
50
+
51
+ validation_results = {
52
+ "overall_status": "unknown",
53
+ "file_processing_success": {},
54
+ "step_completion_rate": 0,
55
+ "critical_errors": [],
56
+ "warnings": [],
57
+ "recommendations": []
58
+ }
59
+
60
+ # Check file processing success
61
+ for original_file in processed_files.keys():
62
+ processed_url = processed_files[original_file]
63
+
64
+ if processed_url and processed_url != original_file:
65
+ validation_results["file_processing_success"][original_file] = "success"
66
+ else:
67
+ validation_results["file_processing_success"][original_file] = "failed"
68
+
69
+ # Calculate step completion rate
70
+ total_steps = len(execution_plan)
71
+ if total_steps > 0:
72
+ successful_steps = len([step for step in completed_steps if step.startswith("✅")])
73
+ validation_results["step_completion_rate"] = successful_steps / total_steps
74
+
75
+ # Analyze errors for critical issues
76
+ critical_keywords = ["tool not available", "failed to transcribe", "connection", "timeout"]
77
+ for error in errors:
78
+ error_lower = error.lower()
79
+ if any(keyword in error_lower for keyword in critical_keywords):
80
+ validation_results["critical_errors"].append(error)
81
+ else:
82
+ validation_results["warnings"].append(error)
83
+
84
+ # Generate recommendations
85
+ validation_results["recommendations"] = generate_recommendations(
86
+ processed_files, errors, completed_steps, validation_results["step_completion_rate"]
87
+ )
88
+
89
+ # Determine overall status
90
+ if validation_results["step_completion_rate"] >= 0.8 and not validation_results["critical_errors"]:
91
+ validation_results["overall_status"] = "success"
92
+ elif validation_results["step_completion_rate"] >= 0.5:
93
+ validation_results["overall_status"] = "partial_success"
94
+ else:
95
+ validation_results["overall_status"] = "failed"
96
+
97
+ return validation_results
98
+
99
+
100
+ def determine_reprocessing_need(validation_results: Dict[str, Any], current_needs_reprocessing: bool) -> bool:
101
+ """Determine if reprocessing is needed based on validation results."""
102
+
103
+ overall_status = validation_results.get("overall_status", "unknown")
104
+ step_completion_rate = validation_results.get("step_completion_rate", 0)
105
+ critical_errors = validation_results.get("critical_errors", [])
106
+
107
+ # Don't reprocess if we're already in a reprocessing cycle to avoid loops
108
+ if current_needs_reprocessing:
109
+ return False
110
+
111
+ # Reprocess if there are critical errors and some steps succeeded
112
+ if critical_errors and step_completion_rate > 0.2:
113
+ return True
114
+
115
+ # Reprocess if completion rate is low but not zero
116
+ if 0.1 < step_completion_rate < 0.7:
117
+ return True
118
+
119
+ # Don't reprocess if everything failed (likely a fundamental issue)
120
+ if step_completion_rate <= 0.1:
121
+ return False
122
+
123
+ # Don't reprocess if mostly successful
124
+ if step_completion_rate >= 0.8:
125
+ return False
126
+
127
+ return False
128
+
129
+
130
+ def generate_recommendations(
131
+ processed_files: Dict[str, str],
132
+ errors: List[str],
133
+ completed_steps: List[str],
134
+ completion_rate: float
135
+ ) -> List[str]:
136
+ """Generate recommendations based on processing results."""
137
+
138
+ recommendations = []
139
+
140
+ # File-specific recommendations
141
+ if not processed_files:
142
+ recommendations.append("No audio files were successfully processed. Check file URLs and format compatibility.")
143
+ elif len(processed_files) == 1:
144
+ recommendations.append("Single file processed. Consider adding fade effects or normalization for better quality.")
145
+ else:
146
+ recommendations.append(f"Multiple files processed ({len(processed_files)}). Consider combining them for dialogue if needed.")
147
+
148
+ # Error-based recommendations
149
+ if any("transcribe" in error.lower() for error in errors):
150
+ recommendations.append("Transcription issues detected. Verify audio quality and format.")
151
+
152
+ if any("tool not available" in error.lower() for error in errors):
153
+ recommendations.append("Some tools were unavailable. Check MCP server connection.")
154
+
155
+ if any("normalize" in step for step in completed_steps):
156
+ recommendations.append("Audio levels normalized. Consider adjusting volume manually if needed.")
157
+
158
+ # Completion rate recommendations
159
+ if completion_rate < 0.5:
160
+ recommendations.append("Low completion rate. Consider simplifying the processing request.")
161
+ elif completion_rate > 0.9:
162
+ recommendations.append("Processing highly successful! Audio should be significantly improved.")
163
+
164
+ # Quality recommendations
165
+ filler_steps = [step for step in completed_steps if "filler" in step.lower()]
166
+ if filler_steps:
167
+ recommendations.append("Filler words processed. Review the audio for natural flow.")
168
+
169
+ cut_steps = [step for step in completed_steps if "cut" in step.lower()]
170
+ if cut_steps:
171
+ recommendations.append("Audio segments cut. Verify timing and transitions.")
172
+
173
+ return recommendations
174
+
175
+
176
+ def get_current_timestamp() -> str:
177
+ """Get current timestamp for metadata."""
178
+ import datetime
179
+ return datetime.datetime.now().isoformat()
180
+
181
+
182
+ def create_validation_summary(validation_results: Dict[str, Any], should_reprocess: bool) -> str:
183
+ """Create a summary of validation results."""
184
+
185
+ overall_status = validation_results.get("overall_status", "unknown")
186
+ completion_rate = validation_results.get("step_completion_rate", 0)
187
+ critical_errors = validation_results.get("critical_errors", [])
188
+ warnings = validation_results.get("warnings", [])
189
+ recommendations = validation_results.get("recommendations", [])
190
+
191
+ # Status emoji and header
192
+ status_emoji = {
193
+ "success": "βœ…",
194
+ "partial_success": "⚠️",
195
+ "failed": "❌",
196
+ "unknown": "❓"
197
+ }.get(overall_status, "❓")
198
+
199
+ summary = f"{status_emoji} **Validation Results**\n\n"
200
+
201
+ # Overall status
202
+ summary += f"**Overall Status**: {overall_status.replace('_', ' ').title()}\n"
203
+ summary += f"**Completion Rate**: {completion_rate:.1%}\n\n"
204
+
205
+ # Critical errors
206
+ if critical_errors:
207
+ summary += f"**🚨 Critical Issues ({len(critical_errors)}):**\n"
208
+ for error in critical_errors[:3]: # Show first 3
209
+ summary += f"- {error}\n"
210
+ if len(critical_errors) > 3:
211
+ summary += f"- ... and {len(critical_errors) - 3} more\n"
212
+ summary += "\n"
213
+
214
+ # Warnings
215
+ if warnings:
216
+ summary += f"**⚠️ Warnings ({len(warnings)}):**\n"
217
+ for warning in warnings[:2]: # Show first 2
218
+ summary += f"- {warning}\n"
219
+ if len(warnings) > 2:
220
+ summary += f"- ... and {len(warnings) - 2} more\n"
221
+ summary += "\n"
222
+
223
+ # Recommendations
224
+ if recommendations:
225
+ summary += "**πŸ’‘ Recommendations:**\n"
226
+ for rec in recommendations[:3]: # Show first 3
227
+ summary += f"- {rec}\n"
228
+ if len(recommendations) > 3:
229
+ summary += f"- ... and {len(recommendations) - 3} more\n"
230
+ summary += "\n"
231
+
232
+ # Reprocessing decision
233
+ if should_reprocess:
234
+ summary += "πŸ”„ **Reprocessing recommended** to address issues and improve results."
235
+ else:
236
+ if overall_status == "success":
237
+ summary += "πŸŽ‰ **Processing complete!** No reprocessing needed."
238
+ else:
239
+ summary += "⏹️ **Processing complete.** Reprocessing not recommended."
240
+
241
+ return summary
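Taken together, the helpers above could be exercised like this; the dictionary is sample data assumed for illustration, not the output of an actual validation pass:

```python
# Sample validation output (illustrative values, not from a real run).
validation_results = {
    "overall_status": "success",
    "step_completion_rate": 1.0,
    "critical_errors": [],
    "warnings": ["Minor clipping detected near the end of file2.wav"],
    "recommendations": ["Processing highly successful! Audio should be significantly improved."],
}

should_reprocess = determine_reprocessing_need(validation_results, current_needs_reprocessing=False)
print(create_validation_summary(validation_results, should_reprocess))
# Renders a "βœ… Validation Results" block ending with
# "πŸŽ‰ **Processing complete!** No reprocessing needed."
```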
src/state.py ADDED
@@ -0,0 +1,46 @@
1
+ """
2
+ Graph state definition for the audio processing agent.
3
+ """
4
+
5
+ from typing import List, Dict, Any, Optional, Annotated, TypedDict
6
+ from langchain_core.messages import BaseMessage
7
+ from langgraph.graph.message import add_messages
8
+
9
+
10
+ class AudioProcessingState(TypedDict):
11
+ """State schema for the audio processing graph."""
12
+
13
+ # Chat history
14
+ messages: Annotated[List[BaseMessage], add_messages]
15
+
16
+ # Audio files provided by user
17
+ audio_files: List[str] # URLs or paths to audio files
18
+
19
+ # User's processing request
20
+ user_request: str
21
+
22
+ # Processing type determined by router
23
+ processing_type: str # "chat", "audio_processing", "dialogue_generation"
24
+
25
+ # Generated scripts with timestamps
26
+ scripts: Dict[str, Any] # {file_url: {transcript: str, timestamps: List}}
27
+
28
+ # Execution plan created by planner
29
+ execution_plan: List[Dict[str, Any]] # List of tool calls with parameters
30
+
31
+ # Processing results
32
+ processed_files: Dict[str, str] # {original_url: processed_url}
33
+
34
+ # Processing steps completed
35
+ completed_steps: List[str]
36
+
37
+ # Final output
38
+ final_audio_url: Optional[str]
39
+ final_response: str
40
+
41
+ # Error handling
42
+ errors: List[str]
43
+ needs_reprocessing: bool
44
+
45
+ # Metadata
46
+ processing_metadata: Dict[str, Any]
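A rough sketch of how a schema like this is usually handed to a LangGraph `StateGraph`; the stub node, edge layout, and import path below are assumptions for illustration, not the graph wired up in this commit:

```python
from langgraph.graph import StateGraph, END

from src.state import AudioProcessingState  # import path assumed from this repo layout


def router_stub(state: AudioProcessingState) -> dict:
    # Nodes return partial state updates; the add_messages reducer merges the messages key.
    return {"processing_type": "chat"}


builder = StateGraph(AudioProcessingState)
builder.add_node("router", router_stub)
builder.set_entry_point("router")
builder.add_edge("router", END)
graph = builder.compile()
```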
src/ui.py CHANGED
@@ -1,25 +1,41 @@
1
  import asyncio
2
  import gradio as gr
3
- from gradio import ChatMessage
4
  from .agent import AudioAgent
5
 
6
  # Global agent instance
7
  agent = AudioAgent()
8
 
9
- def user_input(user_message, history):
10
  """
11
- Handle user input and add to chat history
12
  """
13
- if not user_message.strip():
14
- return "", history
15
 
16
  # Add user message to history
17
- history.append({"role": "user", "content": user_message})
18
- return "", history
19
 
20
  async def bot_response(history):
21
  """
22
- Generate bot response with streaming, organizing content by graph nodes
23
  """
24
  if not history or history[-1]["role"] != "user":
25
  return
@@ -36,27 +52,37 @@ async def bot_response(history):
36
  yield history
37
 
38
  # Track current node and organize content by nodes
39
- current_content = ""
40
- current_node = None
41
  nodes_content = {}
 
42
 
43
  # Stream the response
44
  async for chunk, node_name in agent.stream_chat(user_message):
45
- # If we encounter a new node, update the display structure
46
- if node_name != current_node:
47
- current_node = node_name
48
- if node_name not in nodes_content:
49
- nodes_content[node_name] = ""
50
 
51
  # Add chunk to the current node's content
 
 
 
52
  if chunk:
53
  nodes_content[node_name] += chunk
54
 
55
  # Build the formatted content with node headers
56
  formatted_content = ""
 
57
  for node, content in nodes_content.items():
58
  if content.strip(): # Only show nodes that have content
59
- formatted_content += f"**πŸ”§ {node}**\n\n{content}\n\n"
60
 
61
  # Update the chat history
62
  history[-1]["content"] = formatted_content.rstrip()
@@ -70,6 +96,32 @@ async def bot_response(history):
70
  history.append({"role": "assistant", "content": f"❌ **Error**: {str(e)}"})
71
  yield history
72
 
73
  def bot_response_sync(history):
74
  """
75
  Synchronous wrapper for the async bot response
@@ -88,35 +140,115 @@ def bot_response_sync(history):
88
 
89
  def create_interface():
90
  """
91
- Create and return the Gradio interface
92
  """
93
- with gr.Blocks(title="Audio Agent Chatbot") as demo:
94
- gr.Markdown("# 🎡 Audio Agent Chatbot")
95
- gr.Markdown("Chat with your audio agent! Ask about available tools or audio processing.")
96
-
97
- chatbot = gr.Chatbot(
98
- type="messages",
99
- height=500,
100
- show_copy_button=True,
101
- show_share_button=False
102
- )
103
 
104
  with gr.Row():
105
  msg = gr.Textbox(
106
- label="Your Message",
107
- placeholder="Ask about audio tools or processing...",
108
- lines=2,
109
  scale=4
110
  )
111
- send_btn = gr.Button("Send", variant="primary", scale=1)
112
 
113
- clear_btn = gr.Button("Clear Chat", variant="secondary")
114
 
115
  # Handle user input and bot response
 
 
 
116
  msg.submit(
117
- user_input,
118
- [msg, chatbot],
119
- [msg, chatbot],
120
  queue=False
121
  ).then(
122
  bot_response_sync,
@@ -125,9 +257,9 @@ def create_interface():
125
  )
126
 
127
  send_btn.click(
128
- user_input,
129
- [msg, chatbot],
130
- [msg, chatbot],
131
  queue=False
132
  ).then(
133
  bot_response_sync,
@@ -137,14 +269,29 @@ def create_interface():
137
 
138
  # Clear chat
139
  clear_btn.click(
140
- lambda: [],
141
  None,
142
- chatbot,
143
  queue=False
144
  )
145
 
146
  return demo
147
 
148
  if __name__ == "__main__":
149
  demo = create_interface()
150
- demo.launch(share=False, server_name="0.0.0.0", server_port=7861)
 
 
 
 
 
 
1
  import asyncio
2
  import gradio as gr
3
+ from typing import List, Tuple
4
  from .agent import AudioAgent
5
 
6
  # Global agent instance
7
  agent = AudioAgent()
8
 
9
+ def user_input(user_message, audio_files, history):
10
  """
11
+ Handle user input with text and audio files
12
  """
13
+ if not user_message.strip() and not audio_files:
14
+ return "", [], history
15
+
16
+ # Process audio files into URLs/paths
17
+ audio_file_paths = []
18
+ if audio_files:
19
+ for audio_file in audio_files:
20
+ if hasattr(audio_file, 'name'):
21
+ audio_file_paths.append(audio_file.name)
22
+ else:
23
+ audio_file_paths.append(str(audio_file))
24
+
25
+ # Create combined message with audio files
26
+ if audio_file_paths:
27
+ audio_list = "\n".join([f"Audio file: {path}" for path in audio_file_paths])
28
+ combined_message = f"{user_message}\n\n{audio_list}" if user_message.strip() else audio_list
29
+ else:
30
+ combined_message = user_message
31
 
32
  # Add user message to history
33
+ history.append({"role": "user", "content": combined_message})
34
+ return "", [], history
35
 
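For reference, with one uploaded file and a text prompt, `user_input` appends a combined message along these lines; the `FakeUpload` class and temp path are hypothetical stand-ins for what Gradio passes in:

```python
# Hypothetical upload object whose .name points at a Gradio temp file.
class FakeUpload:
    name = "/tmp/gradio/abc123/interview.wav"

history = []
user_input("Remove filler words", [FakeUpload()], history)

# history[-1] is now:
# {"role": "user",
#  "content": "Remove filler words\n\nAudio file: /tmp/gradio/abc123/interview.wav"}
```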
36
  async def bot_response(history):
37
  """
38
+ Generate bot response with streaming, organized by graph nodes
39
  """
40
  if not history or history[-1]["role"] != "user":
41
  return
 
52
  yield history
53
 
54
  # Track current node and organize content by nodes
 
 
55
  nodes_content = {}
56
+ processed_audio_urls = []
57
 
58
  # Stream the response
59
  async for chunk, node_name in agent.stream_chat(user_message):
60
+ # Check if this chunk contains an audio URL
61
+ if "Audio Ready" in chunk and "http" in chunk:
62
+ processed_audio_urls.append(chunk)
63
+ continue
 
64
 
65
  # Add chunk to the current node's content
66
+ if node_name not in nodes_content:
67
+ nodes_content[node_name] = ""
68
+
69
  if chunk:
70
  nodes_content[node_name] += chunk
71
 
72
  # Build the formatted content with node headers
73
  formatted_content = ""
74
+
75
  for node, content in nodes_content.items():
76
  if content.strip(): # Only show nodes that have content
77
+ node_emoji = get_node_emoji(node)
78
+ formatted_content += f"**{node_emoji} {format_node_name(node)}**\n\n{content}\n\n"
79
+
80
+ # Add processed audio URLs at the end
81
+ if processed_audio_urls:
82
+ formatted_content += "**🎡 Processed Audio Files:**\n"
83
+ for audio_url in processed_audio_urls:
84
+ formatted_content += f"{audio_url}\n"
85
+ formatted_content += "\n"
86
 
87
  # Update the chat history
88
  history[-1]["content"] = formatted_content.rstrip()
 
96
  history.append({"role": "assistant", "content": f"❌ **Error**: {str(e)}"})
97
  yield history
98
 
99
+ def get_node_emoji(node_name: str) -> str:
100
+ """Get emoji for different node types."""
101
+ node_emojis = {
102
+ "router": "πŸ”€",
103
+ "chat": "πŸ’¬",
104
+ "script_generator": "πŸ“",
105
+ "planner": "πŸ“‹",
106
+ "audio_processor": "πŸ”§",
107
+ "validator": "βœ…",
108
+ "final_response": "🎯"
109
+ }
110
+ return node_emojis.get(node_name, "βš™οΈ")
111
+
112
+ def format_node_name(node_name: str) -> str:
113
+ """Format node name for display."""
114
+ name_mapping = {
115
+ "router": "Routing Request",
116
+ "chat": "Chat Response",
117
+ "script_generator": "Generating Transcripts",
118
+ "planner": "Creating Execution Plan",
119
+ "audio_processor": "Processing Audio",
120
+ "validator": "Validating Results",
121
+ "final_response": "Final Results"
122
+ }
123
+ return name_mapping.get(node_name, node_name.replace("_", " ").title())
124
+
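As a small usage sketch, a chunk streamed from the `script_generator` node ends up under a "πŸ“ Generating Transcripts" header, while an unmapped name (the hypothetical `noise_removal` below) falls back to the defaults:

```python
print(get_node_emoji("script_generator"), format_node_name("script_generator"))
# πŸ“ Generating Transcripts

print(get_node_emoji("noise_removal"), format_node_name("noise_removal"))
# βš™οΈ Noise Removal  (unmapped names get a gear icon and a title-cased label)
```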
125
  def bot_response_sync(history):
126
  """
127
  Synchronous wrapper for the async bot response
 
140
 
141
  def create_interface():
142
  """
143
+ Create and return the enhanced Gradio interface
144
  """
145
+ with gr.Blocks(
146
+ title="Audio Agent - Professional Audio Processing",
147
+ theme=gr.themes.Soft(),
148
+ css="""
149
+ .audio-upload-area {
150
+ border: 2px dashed #ccc;
151
+ border-radius: 10px;
152
+ padding: 20px;
153
+ text-align: center;
154
+ margin: 10px 0;
155
+ }
156
+ .processed-audio {
157
+ background: #f0f9ff;
158
+ border: 1px solid #0891b2;
159
+ border-radius: 8px;
160
+ padding: 15px;
161
+ margin: 10px 0;
162
+ }
163
+ """
164
+ ) as demo:
165
+
166
+ gr.Markdown("""
167
+ # 🎡 Audio Agent - Professional Audio Processing
168
+
169
+ Upload audio files and describe what you want to achieve. I can remove filler words,
170
+ normalize volume, cut segments, combine files, and much more!
171
+
172
+ **Supported formats**: MP3, WAV, M4A, FLAC, AAC, OGG
173
+ """)
174
+
175
+ with gr.Row():
176
+ with gr.Column(scale=2):
177
+ chatbot = gr.Chatbot(
178
+ type="messages",
179
+ height=400,
180
+ show_copy_button=True,
181
+ show_share_button=False,
182
+ avatar_images=(None, "🎡"),
183
+ bubble_full_width=False
184
+ )
185
+
186
+ with gr.Column(scale=1):
187
+ gr.Markdown("### 🎡 Upload Audio Files")
188
+
189
+ audio_files = gr.File(
190
+ file_count="multiple",
191
+ file_types=["audio"],
192
+ label="Select Audio Files",
193
+ height=150
194
+ )
195
+
196
+ gr.Markdown("""
197
+ **Quick Examples:**
198
+ - "Remove filler words and normalize volume"
199
+ - "Cut this audio from 30 seconds to 2 minutes"
200
+ - "Combine these files into a dialogue"
201
+ - "Apply fade effects and enhance quality"
202
+ """)
203
 
204
  with gr.Row():
205
  msg = gr.Textbox(
206
+ label="Describe what you want to do",
207
+ placeholder="e.g., 'Remove filler words and improve audio quality' or 'What tools are available?'",
208
+ lines=3,
209
  scale=4
210
  )
211
+ send_btn = gr.Button("πŸš€ Process Audio", variant="primary", scale=1, size="lg")
212
 
213
+ with gr.Row():
214
+ clear_btn = gr.Button("πŸ—‘οΈ Clear Chat", variant="secondary")
215
+ examples_btn = gr.Button("πŸ’‘ Show Examples", variant="secondary")
216
+
217
+ # Examples section (initially hidden)
218
+ examples_section = gr.Markdown(
219
+ """
220
+ ### πŸ“š Example Requests
221
+
222
+ **Audio Enhancement:**
223
+ - "Clean up this recording - remove filler words and background noise"
224
+ - "Normalize the volume and add fade effects"
225
+ - "Make this audio sound more professional"
226
+
227
+ **Audio Editing:**
228
+ - "Cut the audio from 1:30 to 3:45"
229
+ - "Speed up this recording by 1.5x"
230
+ - "Reverse this audio clip"
231
+
232
+ **Dialogue Creation:**
233
+ - "Combine these two audio files into a conversation"
234
+ - "Create a dialogue with proper transitions between speakers"
235
+
236
+ **Information & Analysis:**
237
+ - "Generate a transcript with timestamps"
238
+ - "What audio processing tools are available?"
239
+ - "How does audio normalization work?"
240
+ """,
241
+ visible=False
242
+ )
243
 
244
  # Handle user input and bot response
245
+ def handle_submit(message, files, history):
246
+ return user_input(message, files, history)
247
+
248
  msg.submit(
249
+ handle_submit,
250
+ [msg, audio_files, chatbot],
251
+ [msg, audio_files, chatbot],
252
  queue=False
253
  ).then(
254
  bot_response_sync,
 
257
  )
258
 
259
  send_btn.click(
260
+ handle_submit,
261
+ [msg, audio_files, chatbot],
262
+ [msg, audio_files, chatbot],
263
  queue=False
264
  ).then(
265
  bot_response_sync,
 
269
 
270
  # Clear chat
271
  clear_btn.click(
272
+ lambda: ([], []),
273
  None,
274
+ [chatbot, audio_files],
275
  queue=False
276
  )
277
+
278
+ # Toggle examples visibility (a Markdown input passes its text value, so track visibility in gr.State)
279
+ examples_visible = gr.State(False)
280
+ def toggle_examples(visible):
281
+ return gr.update(visible=not visible), not visible
282
+ examples_btn.click(
283
+ toggle_examples,
284
+ examples_visible,
285
+ [examples_section, examples_visible]
286
+ )
287
 
288
  return demo
289
 
290
  if __name__ == "__main__":
291
  demo = create_interface()
292
+ demo.launch(
293
+ share=False,
294
+ server_name="0.0.0.0",
295
+ server_port=7861,
296
+ show_error=True
297
+ )