Commit 399d0c1
Parent(s): 9cce053

migrate to complicated agent

Files changed:
- src/agent.py                    +212 -38
- src/nodes/__init__.py           +21 -0
- src/nodes/audio_processor.py    +169 -0
- src/nodes/chat.py               +132 -0
- src/nodes/final_response.py     +205 -0
- src/nodes/planner.py            +284 -0
- src/nodes/router.py             +97 -0
- src/nodes/script_generator.py   +159 -0
- src/state.py                    +46 -0
- src/ui.py                       +188 -41
src/agent.py    CHANGED

@@ -1,14 +1,75 @@

Old version (removed lines marked "-"; several removed lines were truncated in the page extraction):

import asyncio
from dotenv import load_dotenv

from langchain_mcp_adapters.client import MultiServerMCPClient
- from langgraph.
- from langgraph.graph.

class AudioAgent:
    """
-
-
    """

    def __init__(

@@ -20,71 +81,184 @@ class AudioAgent:

        self.model_name = model_name
        self.server_url = server_url

-       # SSE client for
        self._client = MultiServerMCPClient({
            "audio-tools": {"url": self.server_url, "transport": "sse"}
        })

-       self.

    @property
    def is_initialized(self) -> bool:
-       return self.

    async def initialize(self) -> None:
-       """
        if self.is_initialized:
            return

-       tools
-
            raise RuntimeError("No tools available from MCP server")

-
-
-
-
-
-
        )

-   def
-       """
-
-
-
-
-
        """
-       One-shot chat: returns the full
        """
        if not self.is_initialized:
            await self.initialize()
-

    async def stream_chat(self, prompt: str):
        """
-       Streaming chat:
        """
        if not self.is_initialized:
            await self.initialize()

-
-
-
-       ):
-
-

async def main():
    agent = AudioAgent()
-
-
-   print("

-   # streaming example
-   async for msg, node in agent.stream_chat("Explain how audio normalization works."):
-       print(msg, end="", flush=True)

if __name__ == "__main__":
    asyncio.run(main())

New version (added lines marked "+"):

import asyncio
+ from typing import Dict, Any, TypedDict, Annotated, List
from dotenv import load_dotenv

+ from langchain_core.messages import BaseMessage
from langchain_mcp_adapters.client import MultiServerMCPClient
+ from langgraph.graph import StateGraph, END
+ from langgraph.graph.message import add_messages
+ from langgraph.checkpoint.memory import MemorySaver
+
+ from .nodes import (
+     router_node,
+     chat_node,
+     script_generator_node,
+     planner_node,
+     audio_processor_node,
+     validator_node,
+     final_response_node
+ )
+
+
+ class AudioProcessingState(TypedDict):
+     """State schema for the audio processing graph."""
+
+     # Chat history
+     messages: Annotated[List[BaseMessage], add_messages]
+
+     # Audio files provided by user
+     audio_files: List[str]
+
+     # User's processing request
+     user_request: str
+
+     # Processing type determined by router
+     processing_type: str
+
+     # Generated scripts with timestamps
+     scripts: Dict[str, Any]
+
+     # Execution plan created by planner
+     execution_plan: List[Dict[str, Any]]
+
+     # Processing results
+     processed_files: Dict[str, str]
+
+     # Processing steps completed
+     completed_steps: List[str]
+
+     # Final output
+     final_audio_url: str
+     final_response: str
+
+     # Error handling
+     errors: List[str]
+     needs_reprocessing: bool
+
+     # Metadata
+     processing_metadata: Dict[str, Any]
+

class AudioAgent:
    """
+     Advanced LangGraph-based audio processing agent with custom nodes.
+
+     Handles audio file processing through a sophisticated workflow:
+     1. Router - Determines processing type
+     2. Chat or Audio Processing Pipeline
+     3. Script Generation - Creates timestamped transcripts
+     4. Planning - Creates execution plan
+     5. Processing - Executes audio tools
+     6. Validation - Checks results and determines reprocessing
+     7. Final Response - Formats output for user
    """

    def __init__(

        self.model_name = model_name
        self.server_url = server_url

+         # SSE client for audio tools
        self._client = MultiServerMCPClient({
            "audio-tools": {"url": self.server_url, "transport": "sse"}
        })

+         self._graph = None
+         self._tools = []

    @property
    def is_initialized(self) -> bool:
+         return self._graph is not None

    async def initialize(self) -> None:
+         """Initialize the LangGraph workflow with audio tools."""
        if self.is_initialized:
            return

+         # Get tools from MCP server
+         self._tools = await self._client.get_tools()
+         if not self._tools:
            raise RuntimeError("No tools available from MCP server")

+         # Build the graph
+         self._graph = self._build_graph()
+
+     def _build_graph(self) -> StateGraph:
+         """Build the LangGraph workflow."""
+
+         # Create the state graph
+         workflow = StateGraph(AudioProcessingState)
+
+         # Add nodes
+         workflow.add_node("router", router_node)
+         workflow.add_node("chat", chat_node)
+         workflow.add_node("script_generator", self._script_generator_with_tools)
+         workflow.add_node("planner", planner_node)
+         workflow.add_node("audio_processor", self._audio_processor_with_tools)
+         workflow.add_node("validator", validator_node)
+         workflow.add_node("final_response", final_response_node)
+
+         # Set entry point
+         workflow.set_entry_point("router")
+
+         # Add conditional edges based on processing type
+         workflow.add_conditional_edges(
+             "router",
+             self._route_processing_type,
+             {
+                 "chat": "chat",
+                 "audio_processing": "script_generator",
+                 "dialogue_generation": "script_generator"
+             }
+         )
+
+         # Chat flow
+         workflow.add_edge("chat", "final_response")
+
+         # Audio processing flow
+         workflow.add_edge("script_generator", "planner")
+         workflow.add_edge("planner", "audio_processor")
+         workflow.add_edge("audio_processor", "validator")
+
+         # Validation flow with conditional reprocessing
+         workflow.add_conditional_edges(
+             "validator",
+             self._check_reprocessing_need,
+             {
+                 "reprocess": "planner",  # Go back to planning
+                 "complete": "final_response"
+             }
        )
+
+         # Final response leads to end
+         workflow.add_edge("final_response", END)
+
+         # Compile with memory for conversation history
+         memory = MemorySaver()
+         return workflow.compile(checkpointer=memory)

+     async def _script_generator_with_tools(self, state: Dict[str, Any]) -> Dict[str, Any]:
+         """Script generator node with tools access."""
+         return await script_generator_node(state, self._tools)
+
+     async def _audio_processor_with_tools(self, state: Dict[str, Any]) -> Dict[str, Any]:
+         """Audio processor node with tools access."""
+         return await audio_processor_node(state, self._tools)
+
+     def _route_processing_type(self, state: Dict[str, Any]) -> str:
+         """Route based on processing type."""
+         return state.get("processing_type", "chat")
+
+     def _check_reprocessing_need(self, state: Dict[str, Any]) -> str:
+         """Check if reprocessing is needed."""
+         if state.get("needs_reprocessing", False):
+             return "reprocess"
+         return "complete"
+
+     def process_user_input(self, user_input: str) -> Dict[str, Any]:
+         """Process user input and create initial state."""
+         from langchain_core.messages import HumanMessage
+
+         return {
+             "messages": [HumanMessage(content=user_input)],
+             "audio_files": [],
+             "user_request": "",
+             "processing_type": "",
+             "scripts": {},
+             "execution_plan": [],
+             "processed_files": {},
+             "completed_steps": [],
+             "final_audio_url": "",
+             "final_response": "",
+             "errors": [],
+             "needs_reprocessing": False,
+             "processing_metadata": {}
+         }
+
+     async def chat(self, prompt: str) -> Dict[str, Any]:
        """
+         One-shot chat: returns the full processing result.
        """
        if not self.is_initialized:
            await self.initialize()
+
+         config = {"configurable": {"thread_id": "audio_agent_session"}}
+         initial_state = self.process_user_input(prompt)
+
+         result = await self._graph.ainvoke(initial_state, config)
+         return result

    async def stream_chat(self, prompt: str):
        """
+         Streaming chat: yields intermediate results as processing continues.
        """
        if not self.is_initialized:
            await self.initialize()

+         config = {"configurable": {"thread_id": "audio_agent_session"}}
+         initial_state = self.process_user_input(prompt)
+
+         async for chunk in self._graph.astream(initial_state, config):
+             # Extract the node name and content
+             for node_name, node_output in chunk.items():
+                 if node_name == "__end__":
+                     continue
+
+                 # Get the latest message if available
+                 messages = node_output.get("messages", [])
+                 if messages and hasattr(messages[-1], 'content'):
+                     content = messages[-1].content
+                     if content:
+                         yield content, node_name
+
+                 # Also yield final audio URL if available
+                 final_audio_url = node_output.get("final_audio_url", "")
+                 if final_audio_url:
+                     yield f"\n🎵 **Audio Ready**: [{final_audio_url}]({final_audio_url})", node_name
+

async def main():
+     """Test the agent with various scenarios."""
    agent = AudioAgent()
+
+     # Test 1: Chat about capabilities
+     print("=== Test 1: Chat Query ===")
+     result = await agent.chat("What audio tools are available?")
+     print("Final Response:", result.get("final_response", ""))
+     print()
+
+     # Test 2: Audio processing request
+     print("=== Test 2: Audio Processing ===")
+     audio_request = "Process this audio file https://example.com/audio.mp3 - remove filler words and normalize volume"
+
+     print("Streaming response:")
+     async for content, node in agent.stream_chat(audio_request):
+         print(f"[{node}] {content[:100]}..." if len(content) > 100 else f"[{node}] {content}")
+     print()


if __name__ == "__main__":
    asyncio.run(main())
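A minimal, hypothetical driver for the new agent (not part of this commit), assuming the repository root is on the import path and the MCP audio-tools server behind `server_url` is running; the URL below is a placeholder:

    import asyncio
    from src.agent import AudioAgent

    async def demo():
        agent = AudioAgent()
        await agent.initialize()  # fetch MCP tools, then compile the LangGraph workflow
        # One-shot: returns the final graph state, including final_audio_url
        result = await agent.chat("Normalize https://example.com/demo.mp3 to -3 dB")
        print(result.get("final_audio_url", ""))
        # Streaming: yields (content, node_name) as each node finishes
        async for content, node in agent.stream_chat("What audio tools are available?"):
            print(f"[{node}] {content}")

    asyncio.run(demo())

Note that both chat() and stream_chat() reuse the fixed thread_id "audio_agent_session", so with the MemorySaver checkpointer every call shares one conversation history.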
src/nodes/__init__.py    ADDED

@@ -0,0 +1,21 @@

"""
Audio processing graph nodes.
"""

from .router import router_node
from .chat import chat_node
from .script_generator import script_generator_node
from .planner import planner_node
from .audio_processor import audio_processor_node
from .validator import validator_node
from .final_response import final_response_node

__all__ = [
    "router_node",
    "chat_node",
    "script_generator_node",
    "planner_node",
    "audio_processor_node",
    "validator_node",
    "final_response_node"
]
src/nodes/audio_processor.py    ADDED

@@ -0,0 +1,169 @@

"""
Audio processor node for executing planned audio processing steps.
"""

from typing import Dict, Any, List
from langchain_core.messages import AIMessage

async def audio_processor_node(state: Dict[str, Any], tools: list) -> Dict[str, Any]:
    """
    Execute the planned audio processing steps using available tools.
    """

    execution_plan = state.get("execution_plan", [])
    processed_files = state.get("processed_files", {})
    completed_steps = state.get("completed_steps", [])
    errors = state.get("errors", [])

    if not execution_plan:
        return {
            "processed_files": processed_files,
            "completed_steps": completed_steps,
            "errors": errors + ["No execution plan available"],
            "messages": state.get("messages", [])
        }

    # Create tool lookup
    tool_lookup = {tool.name: tool for tool in tools}

    # Execute each step in the plan
    current_file_urls = {}  # Track current URL for each original file

    for step in execution_plan:
        try:
            step_name = step.get("step", "unknown")
            tool_name = step.get("tool", "")
            params = step.get("params", {})
            description = step.get("description", "")

            if tool_name not in tool_lookup:
                if tool_name == "manual_combine":
                    # Handle manual combine step
                    result = handle_manual_combine(step, current_file_urls)
                    if result:
                        processed_files.update(result)
                        completed_steps.append(f"✅ {description}")
                    else:
                        errors.append(f"❌ {description} - Manual combination needed")
                else:
                    errors.append(f"❌ Tool '{tool_name}' not available for step: {description}")
                continue

            # Get the tool and execute
            tool = tool_lookup[tool_name]

            # Update file URL if this file has been processed before
            original_file = params.get("audio_file", "")
            if original_file in current_file_urls:
                params["audio_file"] = current_file_urls[original_file]

            # Execute the tool
            result = await tool.ainvoke(params)

            # Extract new file URL from result if available
            new_file_url = extract_file_url_from_result(result, original_file)
            if new_file_url and new_file_url != params["audio_file"]:
                current_file_urls[original_file] = new_file_url
                processed_files[original_file] = new_file_url

            completed_steps.append(f"✅ {description}")

        except Exception as e:
            error_msg = f"❌ Failed step '{step.get('description', 'unknown')}': {str(e)}"
            errors.append(error_msg)

    # Create processing summary
    processing_summary = create_processing_summary(completed_steps, errors, processed_files)
    messages = state.get("messages", [])
    messages.append(AIMessage(content=processing_summary))

    # Determine if reprocessing is needed
    needs_reprocessing = len(errors) > 0 and len(completed_steps) > 0

    return {
        "processed_files": processed_files,
        "completed_steps": completed_steps,
        "errors": errors,
        "needs_reprocessing": needs_reprocessing,
        "messages": messages
    }


def extract_file_url_from_result(result, original_file: str) -> str:
    """Extract the new file URL from tool result."""

    if hasattr(result, 'artifact') and result.artifact:
        # If result has artifact with file info
        if hasattr(result.artifact, 'url'):
            return result.artifact.url
        elif hasattr(result.artifact, 'path'):
            return result.artifact.path

    if hasattr(result, 'content'):
        content = result.content
        # Look for URLs in the content
        import re
        # Non-capturing extension group so findall returns the full URL, not just the extension
        url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+\.(?:mp3|wav|m4a|flac|aac|ogg)'
        urls = re.findall(url_pattern, content, re.IGNORECASE)
        if urls:
            return urls[0]

    # If no new URL found, return the original
    return original_file


def handle_manual_combine(step: Dict[str, Any], current_file_urls: Dict[str, str]) -> Dict[str, str]:
    """Handle manual file combination step."""

    files = step.get("params", {}).get("files", [])

    if len(files) < 2:
        return {}

    # For now, just return a placeholder combined file
    # In a real implementation, this would call a combine tool
    combined_url = f"combined_dialogue_{len(files)}_files.mp3"

    return {"combined_dialogue": combined_url}


def create_processing_summary(completed_steps: List[str], errors: List[str], processed_files: Dict[str, str]) -> str:
    """Create a summary of the processing results."""

    summary = "🎧 **Audio Processing Complete**\n\n"

    # Completed steps
    if completed_steps:
        summary += f"**✅ Completed Steps ({len(completed_steps)}):**\n"
        for step in completed_steps[-5:]:  # Show last 5 steps
            summary += f"- {step}\n"
        if len(completed_steps) > 5:
            summary += f"- ... and {len(completed_steps) - 5} more steps\n"
        summary += "\n"

    # Processed files
    if processed_files:
        summary += "**🎵 Processed Audio Files:**\n"
        for original, processed in processed_files.items():
            filename = original.split('/')[-1] if '/' in original else original
            processed_filename = processed.split('/')[-1] if '/' in processed else processed
            summary += f"- {filename} → {processed_filename}\n"
        summary += "\n"

    # Errors
    if errors:
        summary += f"**⚠️ Issues Encountered ({len(errors)}):**\n"
        for error in errors[-3:]:  # Show last 3 errors
            summary += f"- {error}\n"
        if len(errors) > 3:
            summary += f"- ... and {len(errors) - 3} more issues\n"
        summary += "\n"

    if processed_files and not errors:
        summary += "🎉 **All processing completed successfully!**"
    elif processed_files and errors:
        summary += "⚠️ **Processing completed with some issues. Validation recommended.**"
    else:
        summary += "❌ **Processing failed. Please check the issues above.**"

    return summary
src/nodes/chat.py    ADDED

@@ -0,0 +1,132 @@

"""
Chat node for handling general questions and conversations.
"""

from typing import Dict, Any
from langchain_core.messages import AIMessage


def chat_node(state: Dict[str, Any]) -> Dict[str, Any]:
    """
    Handle general chat messages and questions about audio capabilities.
    """

    user_request = state.get("user_request", "")

    # Generate response based on the user's question
    response = generate_chat_response(user_request)

    # Add AI response to messages
    messages = state.get("messages", [])
    messages.append(AIMessage(content=response))

    return {
        "messages": messages,
        "final_response": response
    }


def generate_chat_response(user_request: str) -> str:
    """Generate appropriate chat response."""

    user_lower = user_request.lower()

    # Audio tools information
    if any(keyword in user_lower for keyword in ["tools", "available", "capabilities", "what can"]):
        return """
🎵 **Audio Agent Capabilities**

I can help you process and improve audio files using these tools:

**📊 Analysis & Information:**
- Get audio duration and metadata
- Generate timestamped transcriptions
- Analyze audio properties

**✂️ Audio Editing:**
- Cut and trim audio segments
- Remove silence from recordings
- Apply fade in/out effects
- Reverse audio playback

**🎧 Audio Enhancement:**
- Normalize audio levels (-20dB to 0dB)
- Adjust volume/gain (-20dB to +20dB)
- Change playback speed (0.25x to 4x)

**🚀 Advanced Processing:**
- Remove filler words from speech
- Combine multiple audio files into dialogue
- Create professional audio workflows

To get started, simply provide audio file URLs and describe what you'd like me to do!
"""

    # How to use instructions
    if any(keyword in user_lower for keyword in ["how", "use", "start", "begin"]):
        return """
📖 **How to Use the Audio Agent**

1. **Provide Audio Files**: Share URLs to your audio files (mp3, wav, m4a, etc.)

2. **Describe Your Goal**: Tell me what you want to achieve:
   - "Remove filler words and improve audio quality"
   - "Cut this audio from 30 seconds to 2 minutes"
   - "Combine these files into a dialogue"
   - "Normalize the volume and add fade effects"

3. **Let Me Work**: I'll automatically:
   - Generate timestamped transcripts
   - Create an execution plan
   - Process your audio step by step
   - Provide you with the improved audio file

**Example**:
"Here's my recording: https://example.com/audio.mp3 - please remove filler words and normalize the volume"
"""

    # Filler words explanation
    if any(keyword in user_lower for keyword in ["filler", "um", "uh", "like"]):
        return """
🗣️ **Filler Word Removal**

I can help remove common filler words like "um", "uh", "like", "you know", etc. from your audio.

**Process**:
1. I'll transcribe your audio with timestamps
2. Identify filler words and their locations
3. Remove those segments from the audio
4. Apply smooth transitions to maintain natural flow

**Benefits**:
- More professional-sounding recordings
- Improved clarity and pace
- Better listener engagement

Just provide your audio file and mention "remove filler words" in your request!
"""

    # General greeting or unclear request
    if any(keyword in user_lower for keyword in ["hello", "hi", "help"]) or len(user_request.strip()) < 10:
        return """
👋 **Hello! I'm your Audio Processing Assistant**

I specialize in improving and processing audio files. I can:

- Remove filler words and improve speech clarity
- Cut, trim, and edit audio segments
- Normalize volume and apply professional effects
- Combine multiple files into conversations
- Generate timestamped transcriptions

**Ready to enhance your audio?** Just share your audio file URLs and tell me what you'd like me to do!

Type "what tools are available?" to see all my capabilities.
"""

    # Default response for other questions
    return """
I'm here to help with audio processing! While I can chat about audio-related topics, my specialty is improving audio files.

If you have audio files you'd like me to process, just share the URLs and describe what you need. Otherwise, feel free to ask me about my audio processing capabilities!
"""
src/nodes/final_response.py    ADDED

@@ -0,0 +1,205 @@

"""
Final response node for formatting the final response to the user.
"""

from typing import Dict, Any
from langchain_core.messages import AIMessage


def final_response_node(state: Dict[str, Any]) -> Dict[str, Any]:
    """
    Generate the final response to the user with processing results and audio files.
    """

    processing_type = state.get("processing_type", "")
    processed_files = state.get("processed_files", {})
    scripts = state.get("scripts", {})
    errors = state.get("errors", [])
    processing_metadata = state.get("processing_metadata", {})
    user_request = state.get("user_request", "")

    # Generate final response based on processing type
    if processing_type == "chat":
        # Chat responses are already handled in chat_node
        final_response = state.get("final_response", "")
    else:
        final_response = create_audio_processing_response(
            user_request, processed_files, scripts, errors, processing_metadata
        )

    # Add final response to messages
    messages = state.get("messages", [])
    if not any(msg.content == final_response for msg in messages if hasattr(msg, 'content')):
        messages.append(AIMessage(content=final_response))

    # Set final audio URL if available
    final_audio_url = get_final_audio_url(processed_files, processing_type)

    return {
        "final_response": final_response,
        "final_audio_url": final_audio_url,
        "messages": messages
    }


def create_audio_processing_response(
    user_request: str,
    processed_files: Dict[str, str],
    scripts: Dict[str, Any],
    errors: list,
    processing_metadata: Dict[str, Any]
) -> str:
    """Create comprehensive audio processing response."""

    response = "🎵 **Audio Processing Complete!**\n\n"

    # User request summary
    response += f"**Your Request**: {user_request}\n\n"

    # Processing results
    if processed_files:
        response += "**✅ Successfully Processed Files:**\n"
        for i, (original, processed) in enumerate(processed_files.items(), 1):
            original_name = get_filename_from_url(original)
            processed_name = get_filename_from_url(processed)

            response += f"{i}. **{original_name}**\n"
            response += f"   📥 **Download**: [{processed_name}]({processed})\n\n"

            # Add script info if available
            if original in scripts:
                script_data = scripts[original]
                filler_count = len(script_data.get("filler_words", []))
                if filler_count > 0:
                    response += f"   📝 Removed {filler_count} filler words\n"
                response += f"   📄 Transcript available\n\n"

    # Processing summary
    validation_results = processing_metadata.get("validation_results", {})
    if validation_results:
        completion_rate = validation_results.get("step_completion_rate", 0)
        overall_status = validation_results.get("overall_status", "unknown")

        response += f"**📊 Processing Summary:**\n"
        response += f"- Status: {overall_status.replace('_', ' ').title()}\n"
        response += f"- Completion: {completion_rate:.1%}\n"
        response += f"- Files processed: {len(processed_files)}\n\n"

    # Improvements made
    improvements = extract_improvements_from_processing(processed_files, scripts, processing_metadata)
    if improvements:
        response += "**🔧 Improvements Applied:**\n"
        for improvement in improvements:
            response += f"- {improvement}\n"
        response += "\n"

    # Recommendations
    recommendations = validation_results.get("recommendations", [])
    if recommendations:
        response += "**💡 Recommendations:**\n"
        for rec in recommendations[:3]:  # Show top 3
            response += f"- {rec}\n"
        response += "\n"

    # Errors (if any)
    if errors:
        response += "**⚠️ Issues Encountered:**\n"
        for error in errors[-2:]:  # Show last 2 errors
            response += f"- {error}\n"
        response += "\n"

    # Call to action
    if processed_files:
        response += "🎉 **Your enhanced audio files are ready!** "
        response += "Click the download links above to get your improved audio.\n\n"
        response += "Need further adjustments? Just let me know what else you'd like me to do!"
    else:
        response += "❌ **Processing unsuccessful.** "
        response += "Please check your audio file URLs and try again, or ask for help with a different approach."

    return response


def get_final_audio_url(processed_files: Dict[str, str], processing_type: str) -> str:
    """Get the final audio URL to return to the user."""

    if not processed_files:
        return ""

    # For dialogue generation, look for combined file
    if processing_type == "dialogue_generation":
        for original, processed in processed_files.items():
            if "combined" in processed or "dialogue" in processed:
                return processed

    # For single file processing, return the processed file
    if len(processed_files) == 1:
        return list(processed_files.values())[0]

    # For multiple files, return the first one (or could be user's choice)
    return list(processed_files.values())[0] if processed_files else ""


def get_filename_from_url(url: str) -> str:
    """Extract filename from URL or path."""
    if not url:
        return "unknown_file"

    # Extract filename from URL
    if '/' in url:
        filename = url.split('/')[-1]
    else:
        filename = url

    # Remove query parameters if present
    if '?' in filename:
        filename = filename.split('?')[0]

    return filename or "processed_audio"


def extract_improvements_from_processing(
    processed_files: Dict[str, str],
    scripts: Dict[str, Any],
    processing_metadata: Dict[str, Any]
) -> list:
    """Extract list of improvements made during processing."""

    improvements = []

    # Check for filler word removal
    total_fillers = 0
    for script_data in scripts.values():
        filler_words = script_data.get("filler_words", [])
        total_fillers += len(filler_words)

    if total_fillers > 0:
        improvements.append(f"Removed {total_fillers} filler words for cleaner speech")

    # Check for audio enhancement
    if processed_files:
        improvements.append("Enhanced audio quality and consistency")
        improvements.append("Optimized volume levels and normalization")

    # Check for silence removal
    validation_results = processing_metadata.get("validation_results", {})
    recommendations = validation_results.get("recommendations", [])

    if any("silence" in rec.lower() for rec in recommendations):
        improvements.append("Removed unnecessary silence and gaps")

    if any("fade" in rec.lower() for rec in recommendations):
        improvements.append("Added professional fade effects")

    if any("cut" in rec.lower() for rec in recommendations):
        improvements.append("Precisely cut and trimmed audio segments")

    # Default improvements if files were processed
    if processed_files and not improvements:
        improvements.extend([
            "Applied professional audio processing",
            "Improved overall audio quality",
            "Optimized for better listening experience"
        ])

    return improvements
src/nodes/planner.py    ADDED

@@ -0,0 +1,284 @@

"""
Planner node for creating execution plans for audio processing.
"""

from typing import Dict, Any, List
from langchain_core.messages import AIMessage


def planner_node(state: Dict[str, Any]) -> Dict[str, Any]:
    """
    Create an execution plan for audio processing based on user request and scripts.
    """

    user_request = state.get("user_request", "")
    audio_files = state.get("audio_files", [])
    scripts = state.get("scripts", {})
    processing_type = state.get("processing_type", "")

    # Create execution plan based on processing type and user request
    if processing_type == "dialogue_generation":
        execution_plan = create_dialogue_plan(user_request, audio_files, scripts)
    else:
        execution_plan = create_audio_processing_plan(user_request, audio_files, scripts)

    # Create plan summary message
    plan_summary = create_plan_summary(execution_plan)
    messages = state.get("messages", [])
    messages.append(AIMessage(content=plan_summary))

    return {
        "execution_plan": execution_plan,
        "messages": messages
    }


def create_audio_processing_plan(user_request: str, audio_files: List[str], scripts: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Create execution plan for single file audio processing."""

    plan = []
    user_lower = user_request.lower()

    for audio_file in audio_files:
        file_plan = []

        # Step 1: Update audio info
        file_plan.append({
            "step": f"update_info_{audio_file}",
            "tool": "update_audio_info",
            "params": {"audio_file": audio_file},
            "description": f"Update audio information for {audio_file}"
        })

        # Step 2: Update duration info
        file_plan.append({
            "step": f"update_duration_{audio_file}",
            "tool": "update_duration_info",
            "params": {"audio_file": audio_file},
            "description": f"Update duration information for {audio_file}"
        })

        # Step 3: Process based on user request

        # Filler word removal (via silence trimming and cutting)
        if any(keyword in user_lower for keyword in ["filler", "remove", "clean", "improve"]):
            # First, trim silence
            file_plan.append({
                "step": f"trim_silence_{audio_file}",
                "tool": "apply_silence_trimming",
                "params": {"audio_file": audio_file, "threshold_db": -40},
                "description": f"Remove silence and filler segments from {audio_file}"
            })

            # Apply filler word removal via cutting (using script data)
            if audio_file in scripts and scripts[audio_file].get("filler_words"):
                file_plan.append({
                    "step": f"remove_fillers_{audio_file}",
                    "tool": "process_cut_audio",
                    "params": {"audio_file": audio_file, "_start_time": 0, "_end_time": 100},
                    "description": f"Remove filler words from {audio_file}",
                    "filler_data": scripts[audio_file]["filler_words"]
                })

        # Audio cutting/trimming
        if any(keyword in user_lower for keyword in ["cut", "trim", "segment"]):
            # Extract time ranges if specified
            start_time, end_time = extract_time_range(user_request)
            file_plan.append({
                "step": f"cut_audio_{audio_file}",
                "tool": "process_cut_audio",
                "params": {"audio_file": audio_file, "_start_time": start_time, "_end_time": end_time},
                "description": f"Cut audio from {start_time}s to {end_time}s"
            })

        # Volume/normalization adjustments
        if any(keyword in user_lower for keyword in ["normalize", "volume", "loud", "quiet", "level"]):
            if "normalize" in user_lower:
                target_level = extract_target_level(user_request)
                file_plan.append({
                    "step": f"normalize_{audio_file}",
                    "tool": "apply_normalization",
                    "params": {"audio_file": audio_file, "target_level": target_level},
                    "description": f"Normalize audio to {target_level}dB"
                })
            else:
                gain_db = extract_gain_value(user_request)
                file_plan.append({
                    "step": f"adjust_volume_{audio_file}",
                    "tool": "apply_volume_adjustment",
                    "params": {"audio_file": audio_file, "gain_db": gain_db},
                    "description": f"Adjust volume by {gain_db}dB"
                })

        # Speed adjustments
        if any(keyword in user_lower for keyword in ["speed", "fast", "slow", "tempo"]):
            speed_factor = extract_speed_factor(user_request)
            file_plan.append({
                "step": f"adjust_speed_{audio_file}",
                "tool": "apply_speed_adjustment",
                "params": {"audio_file": audio_file, "speed_factor": speed_factor},
                "description": f"Adjust speed to {speed_factor}x"
            })

        # Fade effects
        if any(keyword in user_lower for keyword in ["fade", "smooth", "transition"]):
            fade_in, fade_out = extract_fade_values(user_request)
            file_plan.append({
                "step": f"apply_fades_{audio_file}",
                "tool": "apply_fades",
                "params": {"audio_file": audio_file, "fade_in_ms": fade_in, "fade_out_ms": fade_out},
                "description": f"Apply fade in ({fade_in}ms) and fade out ({fade_out}ms)"
            })

        # If no specific processing mentioned, apply default enhancement
        if len(file_plan) <= 2:  # Only info updates
            file_plan.extend([
                {
                    "step": f"enhance_{audio_file}",
                    "tool": "apply_silence_trimming",
                    "params": {"audio_file": audio_file, "threshold_db": -40},
                    "description": f"Remove silence from {audio_file}"
                },
                {
                    "step": f"normalize_{audio_file}",
                    "tool": "apply_normalization",
                    "params": {"audio_file": audio_file, "target_level": -3},
                    "description": f"Normalize audio levels"
                }
            ])

        plan.extend(file_plan)

    return plan


def create_dialogue_plan(user_request: str, audio_files: List[str], scripts: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Create execution plan for dialogue generation from multiple files."""

    plan = []

    # Step 1: Process each file individually first
    for audio_file in audio_files:
        # Update info
        plan.append({
            "step": f"update_info_{audio_file}",
            "tool": "update_audio_info",
            "params": {"audio_file": audio_file},
            "description": f"Update audio info for {audio_file}"
        })

        # Clean up the audio
        plan.append({
            "step": f"cleanup_{audio_file}",
            "tool": "apply_silence_trimming",
            "params": {"audio_file": audio_file, "threshold_db": -40},
            "description": f"Clean silence from {audio_file}"
        })

        # Normalize levels
        plan.append({
            "step": f"normalize_{audio_file}",
            "tool": "apply_normalization",
            "params": {"audio_file": audio_file, "target_level": -6},
            "description": f"Normalize {audio_file} for dialogue"
        })

        # Add fades for smooth transitions
        plan.append({
            "step": f"fade_{audio_file}",
            "tool": "apply_fades",
            "params": {"audio_file": audio_file, "fade_in_ms": 200, "fade_out_ms": 200},
            "description": f"Add fades to {audio_file}"
        })

    # Step 2: Combine files (this would need a combine tool, but we'll note it)
    plan.append({
        "step": "combine_dialogue",
        "tool": "manual_combine",  # This would need to be implemented
        "params": {"files": audio_files},
        "description": "Combine processed files into dialogue",
        "note": "This step requires manual combination or a dedicated combine tool"
    })

    return plan


def extract_time_range(user_request: str) -> tuple:
    """Extract start and end times from user request."""
    import re

    # Look for time patterns like "30 seconds to 2 minutes" or "1:30 to 3:45"
    time_pattern = r'(\d+):?(\d*)\s*(?:seconds?|minutes?|s|m)?\s*to\s*(\d+):?(\d*)\s*(?:seconds?|minutes?|s|m)?'
    match = re.search(time_pattern, user_request.lower())

    if match:
        start_min, start_sec, end_min, end_sec = match.groups()
        start_time = int(start_min) + (int(start_sec) if start_sec else 0) / 60
        end_time = int(end_min) + (int(end_sec) if end_sec else 0) / 60
        return start_time, end_time

    # Default range
    return 0, 30


def extract_target_level(user_request: str) -> float:
    """Extract target normalization level."""
    import re
    match = re.search(r'-?(\d+(?:\.\d+)?)\s*db', user_request.lower())
    if match:
        return -abs(float(match.group(1)))  # Ensure negative
    return -3  # Default


def extract_gain_value(user_request: str) -> float:
    """Extract gain adjustment value."""
    import re
    match = re.search(r'([+-]?\d+(?:\.\d+)?)\s*db', user_request.lower())
    if match:
        return float(match.group(1))
    return 0  # Default


def extract_speed_factor(user_request: str) -> float:
    """Extract speed factor."""
    import re
    match = re.search(r'(\d+(?:\.\d+)?)\s*x', user_request.lower())
    if match:
        return float(match.group(1))

    if any(word in user_request.lower() for word in ["fast", "faster", "quick"]):
        return 1.5
    elif any(word in user_request.lower() for word in ["slow", "slower"]):
        return 0.75

    return 1.0  # Default


def extract_fade_values(user_request: str) -> tuple:
    """Extract fade in/out values."""
    import re
    match = re.search(r'(\d+)\s*ms', user_request.lower())
    if match:
        value = int(match.group(1))
        return value, value
    return 100, 100  # Default


def create_plan_summary(execution_plan: List[Dict[str, Any]]) -> str:
    """Create a summary of the execution plan."""

    if not execution_plan:
        return "❌ **No execution plan could be created**"

    summary = "📋 **Execution Plan Created**\n\n"
    summary += f"**Total Steps**: {len(execution_plan)}\n\n"

    for i, step in enumerate(execution_plan, 1):
        tool_name = step.get("tool", "unknown")
        description = step.get("description", "No description")

        summary += f"**{i}.** `{tool_name}`\n"
        summary += f"   {description}\n\n"

    summary += "✅ **Ready to execute plan...**"
    return summary
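As a quick, hedged illustration (not part of the commit) of how the extraction helpers above read a request, with outputs derived directly from the regexes and defaults shown:

    req = "normalize to -6 dB and speed it up 1.5x with a 250 ms fade"
    extract_target_level(req)   # -> -6.0  (first "<number> db" match, forced negative)
    extract_gain_value(req)     # -> -6.0  (same "db" pattern, sign preserved)
    extract_speed_factor(req)   # -> 1.5   (first "<number>x" match)
    extract_fade_values(req)    # -> (250, 250)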
src/nodes/router.py    ADDED

@@ -0,0 +1,97 @@

"""
Router node to determine processing type based on user input.
"""

import re
from typing import Dict, Any, List
from langchain_core.messages import HumanMessage, AIMessage


def router_node(state: Dict[str, Any]) -> Dict[str, Any]:
    """
    Route the conversation based on user input.

    Determines if this is:
    - A general chat question
    - Audio processing request
    - Dialogue generation request
    """

    # Get the latest user message
    latest_message = None
    for msg in reversed(state.get("messages", [])):
        if isinstance(msg, HumanMessage):
            latest_message = msg
            break

    if not latest_message:
        return {
            "processing_type": "chat",
            "user_request": "",
            "audio_files": []
        }

    user_content = latest_message.content.lower()

    # Extract audio file URLs/paths from the message
    audio_files = extract_audio_files(latest_message.content)

    # Determine processing type
    processing_type = determine_processing_type(user_content, audio_files)

    return {
        "processing_type": processing_type,
        "user_request": latest_message.content,
        "audio_files": audio_files,
        "errors": [],
        "needs_reprocessing": False,
        "completed_steps": [],
        "scripts": {},
        "processed_files": {},
        "processing_metadata": {}
    }


def extract_audio_files(content: str) -> List[str]:
    """Extract audio file URLs or paths from user message."""

    # Look for URLs (http/https); non-capturing extension group so findall returns full matches
    url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+\.(?:mp3|wav|m4a|flac|aac|ogg)'
    urls = re.findall(url_pattern, content, re.IGNORECASE)

    # Look for file paths
    path_pattern = r'[^\s<>"{}|\\^`\[\]]+\.(?:mp3|wav|m4a|flac|aac|ogg)'
    paths = re.findall(path_pattern, content, re.IGNORECASE)

    # Combine and deduplicate
    audio_files = list(set(urls + [path for path in paths if not path.startswith('http')]))

    return audio_files


def determine_processing_type(content: str, audio_files: List[str]) -> str:
    """Determine the type of processing needed."""

    # If no audio files, it's a chat
    if not audio_files:
        # Check if user is asking about audio tools or capabilities
        audio_keywords = [
            'audio', 'sound', 'music', 'voice', 'recording', 'transcript',
            'cut', 'trim', 'normalize', 'volume', 'fade', 'speed', 'reverse'
        ]

        if any(keyword in content for keyword in audio_keywords):
            return "chat"  # User asking about audio capabilities

        return "chat"

    # If audio files are present, determine processing type
    dialogue_keywords = [
        'dialogue', 'conversation', 'combine', 'merge', 'mix',
        'conversation', 'discussion', 'interview'
    ]

    if any(keyword in content for keyword in dialogue_keywords):
        return "dialogue_generation"

    return "audio_processing"
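A short, hedged illustration (not part of the commit) of the routing helpers above; the URLs are placeholders:

    msg = "Combine https://example.com/a.mp3 and https://example.com/b.mp3 into a dialogue"
    extract_audio_files(msg)
    # -> ["https://example.com/a.mp3", "https://example.com/b.mp3"]  (order may vary; set() dedup)
    determine_processing_type(msg.lower(), extract_audio_files(msg))
    # -> "dialogue_generation"  (audio files present plus "dialogue"/"combine" keywords)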
src/nodes/script_generator.py    ADDED

@@ -0,0 +1,159 @@

"""
Script generator node for creating timestamped transcripts.
"""

from typing import Dict, Any
from langchain_core.messages import AIMessage


async def script_generator_node(state: Dict[str, Any], tools: list) -> Dict[str, Any]:
    """
    Generate timestamped scripts for all audio files using transcription tools.
    """

    audio_files = state.get("audio_files", [])

    if not audio_files:
        return {
            "scripts": {},
            "errors": ["No audio files provided for transcription"]
        }

    scripts = {}
    errors = []
    completed_steps = state.get("completed_steps", [])

    # Get transcription tools
    transcribe_tool = None
    update_transcription_tool = None

    for tool in tools:
        if tool.name == "transcribe_audio_sync":
            transcribe_tool = tool
        elif tool.name == "update_transcription_info":
            update_transcription_tool = tool

    if not transcribe_tool:
        return {
            "scripts": {},
            "errors": ["Transcription tool not available"]
        }

    # Process each audio file
    for audio_file in audio_files:
        try:
            # Update transcription info first if tool is available
            if update_transcription_tool:
                await update_transcription_tool.ainvoke({"audio_file": audio_file})

            # Generate transcript with timestamps
            transcript_result = await transcribe_tool.ainvoke({"audio_file": audio_file})

            # Parse the transcript result
            if hasattr(transcript_result, 'content'):
                transcript_content = transcript_result.content
            else:
                transcript_content = str(transcript_result)

            scripts[audio_file] = {
                "transcript": transcript_content,
                "timestamps": extract_timestamps(transcript_content),
                "filler_words": identify_filler_words(transcript_content)
            }

            completed_steps.append(f"Transcribed: {audio_file}")

        except Exception as e:
            errors.append(f"Failed to transcribe {audio_file}: {str(e)}")

    # Create response message
    script_summary = create_script_summary(scripts)
    messages = state.get("messages", [])
    messages.append(AIMessage(content=script_summary))

    return {
        "scripts": scripts,
        "completed_steps": completed_steps,
        "errors": errors,
        "messages": messages
    }


def extract_timestamps(transcript_content: str) -> list:
    """Extract timestamp information from transcript."""
    # This is a simplified implementation
    # In a real scenario, the transcription tool would provide proper timestamps

    timestamps = []
    lines = transcript_content.split('\n')

    for i, line in enumerate(lines):
        if line.strip():
            # Estimate timestamps based on line position
            start_time = i * 3.0  # Rough estimate of 3 seconds per line
            end_time = start_time + 3.0

            timestamps.append({
                "start": start_time,
                "end": end_time,
                "text": line.strip()
            })

    return timestamps


def identify_filler_words(transcript_content: str) -> list:
    """Identify filler words and their approximate positions."""

    filler_words = [
        "um", "uh", "like", "you know", "so", "well", "actually",
        "basically", "literally", "I mean", "sort of", "kind of"
    ]

    found_fillers = []
    words = transcript_content.lower().split()

    for i, word in enumerate(words):
        # Clean the word (remove punctuation)
        clean_word = word.strip('.,!?;:"()[]{}')

        if clean_word in filler_words:
            found_fillers.append({
                "word": clean_word,
                "position": i,
                "context": " ".join(words[max(0, i-2):min(len(words), i+3)])
            })

    return found_fillers


def create_script_summary(scripts: Dict[str, Any]) -> str:
    """Create a summary of the generated scripts."""

    if not scripts:
        return "❌ **Script Generation Failed**\n\nNo transcripts could be generated."

    summary = "📝 **Transcripts Generated Successfully**\n\n"

    for file_url, script_data in scripts.items():
        filename = file_url.split('/')[-1] if '/' in file_url else file_url
        transcript = script_data.get("transcript", "")
        filler_count = len(script_data.get("filler_words", []))
        timestamp_count = len(script_data.get("timestamps", []))

        summary += f"**🎵 {filename}**\n"
        summary += f"- Transcript length: {len(transcript)} characters\n"
        summary += f"- Timestamps: {timestamp_count} segments\n"
        summary += f"- Filler words detected: {filler_count}\n\n"

        # Show first few lines of transcript
        lines = transcript.split('\n')[:3]
        if lines:
            summary += "**Preview:**\n"
            for line in lines:
                if line.strip():
                    summary += f"> {line.strip()}\n"
            summary += "\n"

    summary += "✅ **Ready for execution planning...**"
    return summary
src/nodes/validator.py
ADDED
@@ -0,0 +1,241 @@
+"""
+Validator node for checking processing results and determining if reprocessing is needed.
+"""
+
+from typing import Dict, Any, List
+from langchain_core.messages import AIMessage
+
+
+def validator_node(state: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Validate the processing results and determine if reprocessing is needed.
+    """
+
+    processed_files = state.get("processed_files", {})
+    errors = state.get("errors", [])
+    completed_steps = state.get("completed_steps", [])
+    execution_plan = state.get("execution_plan", [])
+    needs_reprocessing = state.get("needs_reprocessing", False)
+
+    # Perform validation checks
+    validation_results = perform_validation_checks(
+        processed_files, errors, completed_steps, execution_plan
+    )
+
+    # Determine if reprocessing is needed
+    should_reprocess = determine_reprocessing_need(validation_results, needs_reprocessing)
+
+    # Create validation summary
+    validation_summary = create_validation_summary(validation_results, should_reprocess)
+    messages = state.get("messages", [])
+    messages.append(AIMessage(content=validation_summary))
+
+    return {
+        "needs_reprocessing": should_reprocess,
+        "processing_metadata": {
+            "validation_results": validation_results,
+            "validation_timestamp": get_current_timestamp()
+        },
+        "messages": messages
+    }
+
+
+def perform_validation_checks(
+    processed_files: Dict[str, str],
+    errors: List[str],
+    completed_steps: List[str],
+    execution_plan: List[Dict[str, Any]]
+) -> Dict[str, Any]:
+    """Perform comprehensive validation of processing results."""
+
+    validation_results = {
+        "overall_status": "unknown",
+        "file_processing_success": {},
+        "step_completion_rate": 0,
+        "critical_errors": [],
+        "warnings": [],
+        "recommendations": []
+    }
+
+    # Check file processing success
+    for original_file in processed_files.keys():
+        processed_url = processed_files[original_file]
+
+        if processed_url and processed_url != original_file:
+            validation_results["file_processing_success"][original_file] = "success"
+        else:
+            validation_results["file_processing_success"][original_file] = "failed"
+
+    # Calculate step completion rate
+    total_steps = len(execution_plan)
+    if total_steps > 0:
+        successful_steps = len([step for step in completed_steps if step.startswith("✅")])
+        validation_results["step_completion_rate"] = successful_steps / total_steps
+
+    # Analyze errors for critical issues
+    critical_keywords = ["tool not available", "failed to transcribe", "connection", "timeout"]
+    for error in errors:
+        error_lower = error.lower()
+        if any(keyword in error_lower for keyword in critical_keywords):
+            validation_results["critical_errors"].append(error)
+        else:
+            validation_results["warnings"].append(error)
+
+    # Generate recommendations
+    validation_results["recommendations"] = generate_recommendations(
+        processed_files, errors, completed_steps, validation_results["step_completion_rate"]
+    )
+
+    # Determine overall status
+    if validation_results["step_completion_rate"] >= 0.8 and not validation_results["critical_errors"]:
+        validation_results["overall_status"] = "success"
+    elif validation_results["step_completion_rate"] >= 0.5:
+        validation_results["overall_status"] = "partial_success"
+    else:
+        validation_results["overall_status"] = "failed"
+
+    return validation_results
+
+
+def determine_reprocessing_need(validation_results: Dict[str, Any], current_needs_reprocessing: bool) -> bool:
+    """Determine if reprocessing is needed based on validation results."""
+
+    overall_status = validation_results.get("overall_status", "unknown")
+    step_completion_rate = validation_results.get("step_completion_rate", 0)
+    critical_errors = validation_results.get("critical_errors", [])
+
+    # Don't reprocess if we're already in a reprocessing cycle to avoid loops
+    if current_needs_reprocessing:
+        return False
+
+    # Reprocess if there are critical errors and some steps succeeded
+    if critical_errors and step_completion_rate > 0.2:
+        return True
+
+    # Reprocess if completion rate is low but not zero
+    if 0.1 < step_completion_rate < 0.7:
+        return True
+
+    # Don't reprocess if everything failed (likely a fundamental issue)
+    if step_completion_rate <= 0.1:
+        return False
+
+    # Don't reprocess if mostly successful
+    if step_completion_rate >= 0.8:
+        return False
+
+    return False
+
+
+def generate_recommendations(
+    processed_files: Dict[str, str],
+    errors: List[str],
+    completed_steps: List[str],
+    completion_rate: float
+) -> List[str]:
+    """Generate recommendations based on processing results."""
+
+    recommendations = []
+
+    # File-specific recommendations
+    if not processed_files:
+        recommendations.append("No audio files were successfully processed. Check file URLs and format compatibility.")
+    elif len(processed_files) == 1:
+        recommendations.append("Single file processed. Consider adding fade effects or normalization for better quality.")
+    else:
+        recommendations.append(f"Multiple files processed ({len(processed_files)}). Consider combining them for dialogue if needed.")
+
+    # Error-based recommendations
+    if any("transcribe" in error.lower() for error in errors):
+        recommendations.append("Transcription issues detected. Verify audio quality and format.")
+
+    if any("tool not available" in error.lower() for error in errors):
+        recommendations.append("Some tools were unavailable. Check MCP server connection.")
+
+    if any("normalize" in step for step in completed_steps):
+        recommendations.append("Audio levels normalized. Consider adjusting volume manually if needed.")
+
+    # Completion rate recommendations
+    if completion_rate < 0.5:
+        recommendations.append("Low completion rate. Consider simplifying the processing request.")
+    elif completion_rate > 0.9:
+        recommendations.append("Processing highly successful! Audio should be significantly improved.")
+
+    # Quality recommendations
+    filler_steps = [step for step in completed_steps if "filler" in step.lower()]
+    if filler_steps:
+        recommendations.append("Filler words processed. Review the audio for natural flow.")
+
+    cut_steps = [step for step in completed_steps if "cut" in step.lower()]
+    if cut_steps:
+        recommendations.append("Audio segments cut. Verify timing and transitions.")
+
+    return recommendations
+
+
+def get_current_timestamp() -> str:
+    """Get current timestamp for metadata."""
+    import datetime
+    return datetime.datetime.now().isoformat()
+
+
+def create_validation_summary(validation_results: Dict[str, Any], should_reprocess: bool) -> str:
+    """Create a summary of validation results."""
+
+    overall_status = validation_results.get("overall_status", "unknown")
+    completion_rate = validation_results.get("step_completion_rate", 0)
+    critical_errors = validation_results.get("critical_errors", [])
+    warnings = validation_results.get("warnings", [])
+    recommendations = validation_results.get("recommendations", [])
+
+    # Status emoji and header
+    status_emoji = {
+        "success": "✅",
+        "partial_success": "⚠️",
+        "failed": "❌",
+        "unknown": "❓"
+    }.get(overall_status, "❓")
+
+    summary = f"{status_emoji} **Validation Results**\n\n"
+
+    # Overall status
+    summary += f"**Overall Status**: {overall_status.replace('_', ' ').title()}\n"
+    summary += f"**Completion Rate**: {completion_rate:.1%}\n\n"
+
+    # Critical errors
+    if critical_errors:
+        summary += f"**🚨 Critical Issues ({len(critical_errors)}):**\n"
+        for error in critical_errors[:3]:  # Show first 3
+            summary += f"- {error}\n"
+        if len(critical_errors) > 3:
+            summary += f"- ... and {len(critical_errors) - 3} more\n"
+        summary += "\n"
+
+    # Warnings
+    if warnings:
+        summary += f"**⚠️ Warnings ({len(warnings)}):**\n"
+        for warning in warnings[:2]:  # Show first 2
+            summary += f"- {warning}\n"
+        if len(warnings) > 2:
+            summary += f"- ... and {len(warnings) - 2} more\n"
+        summary += "\n"
+
+    # Recommendations
+    if recommendations:
+        summary += "**💡 Recommendations:**\n"
+        for rec in recommendations[:3]:  # Show first 3
+            summary += f"- {rec}\n"
+        if len(recommendations) > 3:
+            summary += f"- ... and {len(recommendations) - 3} more\n"
+        summary += "\n"
+
+    # Reprocessing decision
+    if should_reprocess:
+        summary += "🔄 **Reprocessing recommended** to address issues and improve results."
+    else:
+        if overall_status == "success":
+            summary += "🎉 **Processing complete!** No reprocessing needed."
+        else:
+            summary += "ℹ️ **Processing complete.** Reprocessing not recommended."
+
+    return summary
src/state.py
ADDED
@@ -0,0 +1,46 @@
+"""
+Graph state definition for the audio processing agent.
+"""
+
+from typing import List, Dict, Any, Optional, Annotated
+from langchain_core.messages import BaseMessage
+from langgraph.graph.message import add_messages
+
+
+class AudioProcessingState:
+    """State schema for the audio processing graph."""
+
+    # Chat history
+    messages: Annotated[List[BaseMessage], add_messages]
+
+    # Audio files provided by user
+    audio_files: List[str]  # URLs or paths to audio files
+
+    # User's processing request
+    user_request: str
+
+    # Processing type determined by router
+    processing_type: str  # "chat", "audio_processing", "dialogue_generation"
+
+    # Generated scripts with timestamps
+    scripts: Dict[str, Any]  # {file_url: {transcript: str, timestamps: List}}
+
+    # Execution plan created by planner
+    execution_plan: List[Dict[str, Any]]  # List of tool calls with parameters
+
+    # Processing results
+    processed_files: Dict[str, str]  # {original_url: processed_url}
+
+    # Processing steps completed
+    completed_steps: List[str]
+
+    # Final output
+    final_audio_url: Optional[str]
+    final_response: str
+
+    # Error handling
+    errors: List[str]
+    needs_reprocessing: bool
+
+    # Metadata
+    processing_metadata: Dict[str, Any]
src/ui.py
CHANGED
@@ -1,25 +1,41 @@
 import asyncio
 import gradio as gr
+from typing import List, Tuple
 from .agent import AudioAgent
 
 # Global agent instance
 agent = AudioAgent()
 
-def user_input(user_message, history):
+def user_input(user_message, audio_files, history):
     """
-    Handle user input
+    Handle user input with text and audio files
     """
-    if not user_message.strip():
-        return "", history
+    if not user_message.strip() and not audio_files:
+        return "", [], history
+
+    # Process audio files into URLs/paths
+    audio_file_paths = []
+    if audio_files:
+        for audio_file in audio_files:
+            if hasattr(audio_file, 'name'):
+                audio_file_paths.append(audio_file.name)
+            else:
+                audio_file_paths.append(str(audio_file))
+
+    # Create combined message with audio files
+    if audio_file_paths:
+        audio_list = "\n".join([f"Audio file: {path}" for path in audio_file_paths])
+        combined_message = f"{user_message}\n\n{audio_list}" if user_message.strip() else audio_list
+    else:
+        combined_message = user_message
 
     # Add user message to history
-    return "", history
+    history.append({"role": "user", "content": combined_message})
+    return "", [], history
 
 async def bot_response(history):
     """
-    Generate bot response with streaming,
+    Generate bot response with streaming, organized by graph nodes
     """
     if not history or history[-1]["role"] != "user":
         return
@@ -36,27 +52,37 @@ async def bot_response(history):
         yield history
 
         # Track current node and organize content by nodes
-        current_content = ""
-        current_node = None
         nodes_content = {}
+        processed_audio_urls = []
 
         # Stream the response
         async for chunk, node_name in agent.stream_chat(user_message):
+            # Check if this chunk contains an audio URL
+            if "Audio Ready" in chunk and "http" in chunk:
+                processed_audio_urls.append(chunk)
+                continue
 
             # Add chunk to the current node's content
+            if node_name not in nodes_content:
+                nodes_content[node_name] = ""
+
            if chunk:
                 nodes_content[node_name] += chunk
 
             # Build the formatted content with node headers
             formatted_content = ""
+
            for node, content in nodes_content.items():
                 if content.strip():  # Only show nodes that have content
+                    node_emoji = get_node_emoji(node)
+                    formatted_content += f"**{node_emoji} {format_node_name(node)}**\n\n{content}\n\n"
+
+            # Add processed audio URLs at the end
+            if processed_audio_urls:
+                formatted_content += "**🎵 Processed Audio Files:**\n"
+                for audio_url in processed_audio_urls:
+                    formatted_content += f"{audio_url}\n"
+                formatted_content += "\n"
 
             # Update the chat history
             history[-1]["content"] = formatted_content.rstrip()
@@ -70,6 +96,32 @@ async def bot_response(history):
         history.append({"role": "assistant", "content": f"❌ **Error**: {str(e)}"})
         yield history
 
+def get_node_emoji(node_name: str) -> str:
+    """Get emoji for different node types."""
+    node_emojis = {
+        "router": "🔍",
+        "chat": "💬",
+        "script_generator": "📝",
+        "planner": "📋",
+        "audio_processor": "🎧",
+        "validator": "✅",
+        "final_response": "🎯"
+    }
+    return node_emojis.get(node_name, "⚙️")
+
+def format_node_name(node_name: str) -> str:
+    """Format node name for display."""
+    name_mapping = {
+        "router": "Routing Request",
+        "chat": "Chat Response",
+        "script_generator": "Generating Transcripts",
+        "planner": "Creating Execution Plan",
+        "audio_processor": "Processing Audio",
+        "validator": "Validating Results",
+        "final_response": "Final Results"
+    }
+    return name_mapping.get(node_name, node_name.replace("_", " ").title())
+
 def bot_response_sync(history):
     """
     Synchronous wrapper for the async bot response
@@ -88,35 +140,115 @@ def bot_response_sync(history):
 
 def create_interface():
     """
-    Create and return the Gradio interface
+    Create and return the enhanced Gradio interface
     """
+    with gr.Blocks(
+        title="Audio Agent - Professional Audio Processing",
+        theme=gr.themes.Soft(),
+        css="""
+        .audio-upload-area {
+            border: 2px dashed #ccc;
+            border-radius: 10px;
+            padding: 20px;
+            text-align: center;
+            margin: 10px 0;
+        }
+        .processed-audio {
+            background: #f0f9ff;
+            border: 1px solid #0891b2;
+            border-radius: 8px;
+            padding: 15px;
+            margin: 10px 0;
+        }
+        """
+    ) as demo:
+
+        gr.Markdown("""
+        # 🎵 Audio Agent - Professional Audio Processing
+
+        Upload audio files and describe what you want to achieve. I can remove filler words,
+        normalize volume, cut segments, combine files, and much more!
+
+        **Supported formats**: MP3, WAV, M4A, FLAC, AAC, OGG
+        """)
+
+        with gr.Row():
+            with gr.Column(scale=2):
+                chatbot = gr.Chatbot(
+                    type="messages",
+                    height=400,
+                    show_copy_button=True,
+                    show_share_button=False,
+                    avatar_images=(None, "🎵"),
+                    bubble_full_width=False
+                )
+
+            with gr.Column(scale=1):
+                gr.Markdown("### 🎵 Upload Audio Files")
+
+                audio_files = gr.File(
+                    file_count="multiple",
+                    file_types=["audio"],
+                    label="Select Audio Files",
+                    height=150
+                )
+
+                gr.Markdown("""
+                **Quick Examples:**
+                - "Remove filler words and normalize volume"
+                - "Cut this audio from 30 seconds to 2 minutes"
+                - "Combine these files into a dialogue"
+                - "Apply fade effects and enhance quality"
+                """)
 
         with gr.Row():
             msg = gr.Textbox(
+                label="Describe what you want to do",
+                placeholder="e.g., 'Remove filler words and improve audio quality' or 'What tools are available?'",
+                lines=3,
                 scale=4
             )
+            send_btn = gr.Button("🚀 Process Audio", variant="primary", scale=1, size="lg")
 
+        with gr.Row():
+            clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
+            examples_btn = gr.Button("💡 Show Examples", variant="secondary")
+
+        # Examples section (initially hidden)
+        examples_section = gr.Markdown(
+            """
+            ### 📋 Example Requests
+
+            **Audio Enhancement:**
+            - "Clean up this recording - remove filler words and background noise"
+            - "Normalize the volume and add fade effects"
+            - "Make this audio sound more professional"
+
+            **Audio Editing:**
+            - "Cut the audio from 1:30 to 3:45"
+            - "Speed up this recording by 1.5x"
+            - "Reverse this audio clip"
+
+            **Dialogue Creation:**
+            - "Combine these two audio files into a conversation"
+            - "Create a dialogue with proper transitions between speakers"
+
+            **Information & Analysis:**
+            - "Generate a transcript with timestamps"
+            - "What audio processing tools are available?"
+            - "How does audio normalization work?"
+            """,
+            visible=False
+        )
 
         # Handle user input and bot response
+        def handle_submit(message, files, history):
+            return user_input(message, files, history)
+
         msg.submit(
-            [msg, chatbot],
-            [msg, chatbot],
+            handle_submit,
+            [msg, audio_files, chatbot],
+            [msg, audio_files, chatbot],
             queue=False
         ).then(
             bot_response_sync,
@@ -125,9 +257,9 @@ def create_interface():
         )
 
         send_btn.click(
-            [msg, chatbot],
-            [msg, chatbot],
+            handle_submit,
+            [msg, audio_files, chatbot],
+            [msg, audio_files, chatbot],
             queue=False
         ).then(
             bot_response_sync,
@@ -137,14 +269,29 @@ def create_interface():
 
         # Clear chat
         clear_btn.click(
-            lambda: [],
+            lambda: ([], []),
             None,
-            chatbot,
+            [chatbot, audio_files],
             queue=False
         )
+
+        # Toggle examples
+        def toggle_examples(current_visibility):
+            return not current_visibility
+
+        examples_btn.click(
+            toggle_examples,
+            examples_section,
+            examples_section
+        )
 
     return demo
 
 if __name__ == "__main__":
     demo = create_interface()
+    demo.launch(
+        share=False,
+        server_name="0.0.0.0",
+        server_port=7861,
+        show_error=True
+    )