YigitSekerci committed on
Commit
399d0c1
·
1 Parent(s): 9cce053

migrate to complicated agent

src/agent.py CHANGED
@@ -1,14 +1,75 @@
1
  import asyncio
 
2
  from dotenv import load_dotenv
3
 
 
4
  from langchain_mcp_adapters.client import MultiServerMCPClient
5
- from langgraph.prebuilt import create_react_agent
6
- from langgraph.graph.graph import CompiledGraph
7
 
8
  class AudioAgent:
9
  """
10
- Wraps a LangGraph REACT agent over your MCP audio-tools,
11
- exposing both one-shot and streaming chat methods.
12
  """
13
 
14
  def __init__(
@@ -20,71 +81,184 @@ class AudioAgent:
20
  self.model_name = model_name
21
  self.server_url = server_url
22
 
23
- # SSE client for your audio tools
24
  self._client = MultiServerMCPClient({
25
  "audio-tools": {"url": self.server_url, "transport": "sse"}
26
  })
27
 
28
- self._agent = None
 
29
 
30
  @property
31
  def is_initialized(self) -> bool:
32
- return self._agent is not None
33
 
34
  async def initialize(self) -> None:
35
- """Fetch tools from MCP and build a streaming-capable LangGraph REACT agent."""
36
  if self.is_initialized:
37
  return
38
 
39
- tools = await self._client.get_tools()
40
- if not tools:
 
41
  raise RuntimeError("No tools available from MCP server")
42
 
43
- self._agent: CompiledGraph = create_react_agent(
44
- model=self.model_name,
45
- tools=tools,
46
- prompt="""
47
- You are a helpful assistant that can use the following tools to help the user.
48
- """
49
  )
50
 
51
- def process_user_input(self, user_input: str):
52
- """
53
- Process user input and return a prompt for the agent.
54
- """
55
- return {"messages": [{"role": "user", "content": user_input}]}
56
-
57
- async def chat(self, prompt: str) -> str:
58
  """
59
- One-shot chat: returns the full LLM + tool-augmented reply.
60
  """
61
  if not self.is_initialized:
62
  await self.initialize()
63
- return await self._agent.ainvoke(self.process_user_input(prompt))
64
 
65
  async def stream_chat(self, prompt: str):
66
  """
67
- Streaming chat: prints tokens live and returns the full reply at the end.
68
  """
69
  if not self.is_initialized:
70
  await self.initialize()
71
 
72
- async for msg, metadata in self._agent.astream(
73
- self.process_user_input(prompt),
74
- stream_mode="messages"
75
- ):
76
- if msg.content:
77
- yield msg.content, metadata["langgraph_node"]
78
 
79
  async def main():
 
80
  agent = AudioAgent()
81
- # one-shot example
82
- reply = await agent.chat("Hi! What audio tools are available?")
83
- print("β†’", reply)
84
 
85
- # streaming example
86
- async for msg, node in agent.stream_chat("Explain how audio normalization works."):
87
- print(msg, end="", flush=True)
88
 
89
  if __name__ == "__main__":
90
  asyncio.run(main())
 
1
  import asyncio
2
+ from typing import Dict, Any, TypedDict, Annotated, List
3
  from dotenv import load_dotenv
4
 
5
+ from langchain_core.messages import BaseMessage
6
  from langchain_mcp_adapters.client import MultiServerMCPClient
7
+ from langgraph.graph import StateGraph, END
8
+ from langgraph.graph.message import add_messages
9
+ from langgraph.checkpoint.memory import MemorySaver
10
+
11
+ from .nodes import (
12
+ router_node,
13
+ chat_node,
14
+ script_generator_node,
15
+ planner_node,
16
+ audio_processor_node,
17
+ validator_node,
18
+ final_response_node
19
+ )
20
+
21
+
22
+ class AudioProcessingState(TypedDict):
23
+ """State schema for the audio processing graph."""
24
+
25
+ # Chat history
26
+ messages: Annotated[List[BaseMessage], add_messages]
27
+
28
+ # Audio files provided by user
29
+ audio_files: List[str]
30
+
31
+ # User's processing request
32
+ user_request: str
33
+
34
+ # Processing type determined by router
35
+ processing_type: str
36
+
37
+ # Generated scripts with timestamps
38
+ scripts: Dict[str, Any]
39
+
40
+ # Execution plan created by planner
41
+ execution_plan: List[Dict[str, Any]]
42
+
43
+ # Processing results
44
+ processed_files: Dict[str, str]
45
+
46
+ # Processing steps completed
47
+ completed_steps: List[str]
48
+
49
+ # Final output
50
+ final_audio_url: str
51
+ final_response: str
52
+
53
+ # Error handling
54
+ errors: List[str]
55
+ needs_reprocessing: bool
56
+
57
+ # Metadata
58
+ processing_metadata: Dict[str, Any]
59
+
60
 
61
  class AudioAgent:
62
  """
63
+ Advanced LangGraph-based audio processing agent with custom nodes.
64
+
65
+ Handles audio file processing through a sophisticated workflow:
66
+ 1. Router - Determines processing type
67
+ 2. Chat or Audio Processing Pipeline
68
+ 3. Script Generation - Creates timestamped transcripts
69
+ 4. Planning - Creates execution plan
70
+ 5. Processing - Executes audio tools
71
+ 6. Validation - Checks results and determines reprocessing
72
+ 7. Final Response - Formats output for user
73
  """
74
 
75
  def __init__(
 
81
  self.model_name = model_name
82
  self.server_url = server_url
83
 
84
+ # SSE client for audio tools
85
  self._client = MultiServerMCPClient({
86
  "audio-tools": {"url": self.server_url, "transport": "sse"}
87
  })
88
 
89
+ self._graph = None
90
+ self._tools = []
91
 
92
  @property
93
  def is_initialized(self) -> bool:
94
+ return self._graph is not None
95
 
96
  async def initialize(self) -> None:
97
+ """Initialize the LangGraph workflow with audio tools."""
98
  if self.is_initialized:
99
  return
100
 
101
+ # Get tools from MCP server
102
+ self._tools = await self._client.get_tools()
103
+ if not self._tools:
104
  raise RuntimeError("No tools available from MCP server")
105
 
106
+ # Build the graph
107
+ self._graph = self._build_graph()
108
+
109
+ def _build_graph(self) -> StateGraph:
110
+ """Build the LangGraph workflow."""
111
+
112
+ # Create the state graph
113
+ workflow = StateGraph(AudioProcessingState)
114
+
115
+ # Add nodes
116
+ workflow.add_node("router", router_node)
117
+ workflow.add_node("chat", chat_node)
118
+ workflow.add_node("script_generator", self._script_generator_with_tools)
119
+ workflow.add_node("planner", planner_node)
120
+ workflow.add_node("audio_processor", self._audio_processor_with_tools)
121
+ workflow.add_node("validator", validator_node)
122
+ workflow.add_node("final_response", final_response_node)
123
+
124
+ # Set entry point
125
+ workflow.set_entry_point("router")
126
+
127
+ # Add conditional edges based on processing type
128
+ workflow.add_conditional_edges(
129
+ "router",
130
+ self._route_processing_type,
131
+ {
132
+ "chat": "chat",
133
+ "audio_processing": "script_generator",
134
+ "dialogue_generation": "script_generator"
135
+ }
136
+ )
137
+
138
+ # Chat flow
139
+ workflow.add_edge("chat", "final_response")
140
+
141
+ # Audio processing flow
142
+ workflow.add_edge("script_generator", "planner")
143
+ workflow.add_edge("planner", "audio_processor")
144
+ workflow.add_edge("audio_processor", "validator")
145
+
146
+ # Validation flow with conditional reprocessing
147
+ workflow.add_conditional_edges(
148
+ "validator",
149
+ self._check_reprocessing_need,
150
+ {
151
+ "reprocess": "planner", # Go back to planning
152
+ "complete": "final_response"
153
+ }
154
  )
155
+
156
+ # Final response leads to end
157
+ workflow.add_edge("final_response", END)
158
+
159
+ # Compile with memory for conversation history
160
+ memory = MemorySaver()
161
+ return workflow.compile(checkpointer=memory)
162
 
163
+ async def _script_generator_with_tools(self, state: Dict[str, Any]) -> Dict[str, Any]:
164
+ """Script generator node with tools access."""
165
+ return await script_generator_node(state, self._tools)
166
+
167
+ async def _audio_processor_with_tools(self, state: Dict[str, Any]) -> Dict[str, Any]:
168
+ """Audio processor node with tools access."""
169
+ return await audio_processor_node(state, self._tools)
170
+
171
+ def _route_processing_type(self, state: Dict[str, Any]) -> str:
172
+ """Route based on processing type."""
173
+ return state.get("processing_type", "chat")
174
+
175
+ def _check_reprocessing_need(self, state: Dict[str, Any]) -> str:
176
+ """Check if reprocessing is needed."""
177
+ if state.get("needs_reprocessing", False):
178
+ return "reprocess"
179
+ return "complete"
180
+
181
+ def process_user_input(self, user_input: str) -> Dict[str, Any]:
182
+ """Process user input and create initial state."""
183
+ from langchain_core.messages import HumanMessage
184
+
185
+ return {
186
+ "messages": [HumanMessage(content=user_input)],
187
+ "audio_files": [],
188
+ "user_request": "",
189
+ "processing_type": "",
190
+ "scripts": {},
191
+ "execution_plan": [],
192
+ "processed_files": {},
193
+ "completed_steps": [],
194
+ "final_audio_url": "",
195
+ "final_response": "",
196
+ "errors": [],
197
+ "needs_reprocessing": False,
198
+ "processing_metadata": {}
199
+ }
200
+
201
+ async def chat(self, prompt: str) -> Dict[str, Any]:
202
  """
203
+ One-shot chat: returns the full processing result.
204
  """
205
  if not self.is_initialized:
206
  await self.initialize()
207
+
208
+ config = {"configurable": {"thread_id": "audio_agent_session"}}
209
+ initial_state = self.process_user_input(prompt)
210
+
211
+ result = await self._graph.ainvoke(initial_state, config)
212
+ return result
213
 
214
  async def stream_chat(self, prompt: str):
215
  """
216
+ Streaming chat: yields intermediate results as processing continues.
217
  """
218
  if not self.is_initialized:
219
  await self.initialize()
220
 
221
+ config = {"configurable": {"thread_id": "audio_agent_session"}}
222
+ initial_state = self.process_user_input(prompt)
223
+
224
+ async for chunk in self._graph.astream(initial_state, config):
225
+ # Extract the node name and content
226
+ for node_name, node_output in chunk.items():
227
+ if node_name == "__end__":
228
+ continue
229
+
230
+ # Get the latest message if available
231
+ messages = node_output.get("messages", [])
232
+ if messages and hasattr(messages[-1], 'content'):
233
+ content = messages[-1].content
234
+ if content:
235
+ yield content, node_name
236
+
237
+ # Also yield final audio URL if available
238
+ final_audio_url = node_output.get("final_audio_url", "")
239
+ if final_audio_url:
240
+ yield f"\n🎡 **Audio Ready**: [{final_audio_url}]({final_audio_url})", node_name
241
+
242
 
243
  async def main():
244
+ """Test the agent with various scenarios."""
245
  agent = AudioAgent()
246
+
247
+ # Test 1: Chat about capabilities
248
+ print("=== Test 1: Chat Query ===")
249
+ result = await agent.chat("What audio tools are available?")
250
+ print("Final Response:", result.get("final_response", ""))
251
+ print()
252
+
253
+ # Test 2: Audio processing request
254
+ print("=== Test 2: Audio Processing ===")
255
+ audio_request = "Process this audio file https://example.com/audio.mp3 - remove filler words and normalize volume"
256
+
257
+ print("Streaming response:")
258
+ async for content, node in agent.stream_chat(audio_request):
259
+ print(f"[{node}] {content[:100]}..." if len(content) > 100 else f"[{node}] {content}")
260
+ print()
261
 
 
 
 
262
 
263
  if __name__ == "__main__":
264
  asyncio.run(main())
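Reviewer note: chat() and stream_chat() above hard-code thread_id "audio_agent_session", so every caller shares a single MemorySaver conversation history. A minimal per-session sketch follows; it reuses the methods from this diff, but the session_id argument and the src.agent import path are illustrative assumptions, not part of this commit.

import asyncio
from src.agent import AudioAgent  # assumed import path for the module shown above

async def chat_in_session(agent: AudioAgent, session_id: str, prompt: str):
    """Same flow as AudioAgent.chat, but with a caller-supplied checkpointer thread."""
    if not agent.is_initialized:
        await agent.initialize()
    config = {"configurable": {"thread_id": session_id}}  # one MemorySaver thread per session
    # _graph is the compiled workflow built in _build_graph()
    return await agent._graph.ainvoke(agent.process_user_input(prompt), config)

# Example: asyncio.run(chat_in_session(AudioAgent(), "user-42", "What audio tools are available?"))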
src/nodes/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ """
2
+ Audio processing graph nodes.
3
+ """
4
+
5
+ from .router import router_node
6
+ from .chat import chat_node
7
+ from .script_generator import script_generator_node
8
+ from .planner import planner_node
9
+ from .audio_processor import audio_processor_node
10
+ from .validator import validator_node
11
+ from .final_response import final_response_node
12
+
13
+ __all__ = [
14
+ "router_node",
15
+ "chat_node",
16
+ "script_generator_node",
17
+ "planner_node",
18
+ "audio_processor_node",
19
+ "validator_node",
20
+ "final_response_node"
21
+ ]
src/nodes/audio_processor.py ADDED
@@ -0,0 +1,169 @@
1
+ """
2
+ Audio processor node for executing planned audio processing steps.
3
+ """
4
+
5
+ from typing import Dict, Any, List
6
+ from langchain_core.messages import AIMessage
7
+
8
+ async def audio_processor_node(state: Dict[str, Any], tools: list) -> Dict[str, Any]:
9
+ """
10
+ Execute the planned audio processing steps using available tools.
11
+ """
12
+
13
+ execution_plan = state.get("execution_plan", [])
14
+ processed_files = state.get("processed_files", {})
15
+ completed_steps = state.get("completed_steps", [])
16
+ errors = state.get("errors", [])
17
+
18
+ if not execution_plan:
19
+ return {
20
+ "processed_files": processed_files,
21
+ "completed_steps": completed_steps,
22
+ "errors": errors + ["No execution plan available"],
23
+ "messages": state.get("messages", [])
24
+ }
25
+
26
+ # Create tool lookup
27
+ tool_lookup = {tool.name: tool for tool in tools}
28
+
29
+ # Execute each step in the plan
30
+ current_file_urls = {} # Track current URL for each original file
31
+
32
+ for step in execution_plan:
33
+ try:
34
+ step_name = step.get("step", "unknown")
35
+ tool_name = step.get("tool", "")
36
+ params = step.get("params", {})
37
+ description = step.get("description", "")
38
+
39
+ if tool_name not in tool_lookup:
40
+ if tool_name == "manual_combine":
41
+ # Handle manual combine step
42
+ result = handle_manual_combine(step, current_file_urls)
43
+ if result:
44
+ processed_files.update(result)
45
+ completed_steps.append(f"βœ… {description}")
46
+ else:
47
+ errors.append(f"❌ {description} - Manual combination needed")
48
+ else:
49
+ errors.append(f"❌ Tool '{tool_name}' not available for step: {description}")
50
+ continue
51
+
52
+ # Get the tool and execute
53
+ tool = tool_lookup[tool_name]
54
+
55
+ # Update file URL if this file has been processed before
56
+ original_file = params.get("audio_file", "")
57
+ if original_file in current_file_urls:
58
+ params["audio_file"] = current_file_urls[original_file]
59
+
60
+ # Execute the tool
61
+ result = await tool.ainvoke(params)
62
+
63
+ # Extract new file URL from result if available
64
+ new_file_url = extract_file_url_from_result(result, original_file)
65
+ if new_file_url and new_file_url != params["audio_file"]:
66
+ current_file_urls[original_file] = new_file_url
67
+ processed_files[original_file] = new_file_url
68
+
69
+ completed_steps.append(f"βœ… {description}")
70
+
71
+ except Exception as e:
72
+ error_msg = f"❌ Failed step '{step.get('description', 'unknown')}': {str(e)}"
73
+ errors.append(error_msg)
74
+
75
+ # Create processing summary
76
+ processing_summary = create_processing_summary(completed_steps, errors, processed_files)
77
+ messages = state.get("messages", [])
78
+ messages.append(AIMessage(content=processing_summary))
79
+
80
+ # Determine if reprocessing is needed
81
+ needs_reprocessing = len(errors) > 0 and len(completed_steps) > 0
82
+
83
+ return {
84
+ "processed_files": processed_files,
85
+ "completed_steps": completed_steps,
86
+ "errors": errors,
87
+ "needs_reprocessing": needs_reprocessing,
88
+ "messages": messages
89
+ }
90
+
91
+
92
+ def extract_file_url_from_result(result, original_file: str) -> str:
93
+ """Extract the new file URL from tool result."""
94
+
95
+ if hasattr(result, 'artifact') and result.artifact:
96
+ # If result has artifact with file info
97
+ if hasattr(result.artifact, 'url'):
98
+ return result.artifact.url
99
+ elif hasattr(result.artifact, 'path'):
100
+ return result.artifact.path
101
+
102
+ if hasattr(result, 'content'):
103
+ content = result.content
104
+ # Look for URLs in the content
105
+ import re
106
+ url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+\.(?:mp3|wav|m4a|flac|aac|ogg)'  # non-capturing group so findall returns the full URL
107
+ urls = re.findall(url_pattern, content, re.IGNORECASE)
108
+ if urls:
109
+ return urls[0]
110
+
111
+ # If no new URL found, return the original
112
+ return original_file
113
+
114
+
115
+ def handle_manual_combine(step: Dict[str, Any], current_file_urls: Dict[str, str]) -> Dict[str, str]:
116
+ """Handle manual file combination step."""
117
+
118
+ files = step.get("params", {}).get("files", [])
119
+
120
+ if len(files) < 2:
121
+ return {}
122
+
123
+ # For now, just return a placeholder combined file
124
+ # In a real implementation, this would call a combine tool
125
+ combined_url = f"combined_dialogue_{len(files)}_files.mp3"
126
+
127
+ return {"combined_dialogue": combined_url}
128
+
129
+
130
+ def create_processing_summary(completed_steps: List[str], errors: List[str], processed_files: Dict[str, str]) -> str:
131
+ """Create a summary of the processing results."""
132
+
133
+ summary = "πŸ”§ **Audio Processing Complete**\n\n"
134
+
135
+ # Completed steps
136
+ if completed_steps:
137
+ summary += f"**βœ… Completed Steps ({len(completed_steps)}):**\n"
138
+ for step in completed_steps[-5:]: # Show last 5 steps
139
+ summary += f"- {step}\n"
140
+ if len(completed_steps) > 5:
141
+ summary += f"- ... and {len(completed_steps) - 5} more steps\n"
142
+ summary += "\n"
143
+
144
+ # Processed files
145
+ if processed_files:
146
+ summary += "**🎡 Processed Audio Files:**\n"
147
+ for original, processed in processed_files.items():
148
+ filename = original.split('/')[-1] if '/' in original else original
149
+ processed_filename = processed.split('/')[-1] if '/' in processed else processed
150
+ summary += f"- {filename} β†’ {processed_filename}\n"
151
+ summary += "\n"
152
+
153
+ # Errors
154
+ if errors:
155
+ summary += f"**⚠️ Issues Encountered ({len(errors)}):**\n"
156
+ for error in errors[-3:]: # Show last 3 errors
157
+ summary += f"- {error}\n"
158
+ if len(errors) > 3:
159
+ summary += f"- ... and {len(errors) - 3} more issues\n"
160
+ summary += "\n"
161
+
162
+ if processed_files and not errors:
163
+ summary += "πŸŽ‰ **All processing completed successfully!**"
164
+ elif processed_files and errors:
165
+ summary += "⚠️ **Processing completed with some issues. Validation recommended.**"
166
+ else:
167
+ summary += "❌ **Processing failed. Please check the issues above.**"
168
+
169
+ return summary
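Reviewer note: audio_processor_node consumes the step dictionaries produced by planner_node. A sketch of the expected shape, with illustrative values (the tool name must match one exposed by the MCP server):

example_step = {
    "step": "normalize_https://example.com/audio.mp3",
    "tool": "apply_normalization",  # looked up in tool_lookup by name
    "params": {"audio_file": "https://example.com/audio.mp3", "target_level": -3},
    "description": "Normalize audio levels",
}
# The loop above then effectively runs:
# await tool_lookup[example_step["tool"]].ainvoke(example_step["params"])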
src/nodes/chat.py ADDED
@@ -0,0 +1,132 @@
1
+ """
2
+ Chat node for handling general questions and conversations.
3
+ """
4
+
5
+ from typing import Dict, Any
6
+ from langchain_core.messages import AIMessage
7
+
8
+
9
+ def chat_node(state: Dict[str, Any]) -> Dict[str, Any]:
10
+ """
11
+ Handle general chat messages and questions about audio capabilities.
12
+ """
13
+
14
+ user_request = state.get("user_request", "")
15
+
16
+ # Generate response based on the user's question
17
+ response = generate_chat_response(user_request)
18
+
19
+ # Add AI response to messages
20
+ messages = state.get("messages", [])
21
+ messages.append(AIMessage(content=response))
22
+
23
+ return {
24
+ "messages": messages,
25
+ "final_response": response
26
+ }
27
+
28
+
29
+ def generate_chat_response(user_request: str) -> str:
30
+ """Generate appropriate chat response."""
31
+
32
+ user_lower = user_request.lower()
33
+
34
+ # Audio tools information
35
+ if any(keyword in user_lower for keyword in ["tools", "available", "capabilities", "what can"]):
36
+ return """
37
+ 🎡 **Audio Agent Capabilities**
38
+
39
+ I can help you process and improve audio files using these tools:
40
+
41
+ **📊 Analysis & Information:**
42
+ - Get audio duration and metadata
43
+ - Generate timestamped transcriptions
44
+ - Analyze audio properties
45
+
46
+ **✂️ Audio Editing:**
47
+ - Cut and trim audio segments
48
+ - Remove silence from recordings
49
+ - Apply fade in/out effects
50
+ - Reverse audio playback
51
+
52
+ **🔧 Audio Enhancement:**
53
+ - Normalize audio levels (-20dB to 0dB)
54
+ - Adjust volume/gain (-20dB to +20dB)
55
+ - Change playback speed (0.25x to 4x)
56
+
57
+ **🎭 Advanced Processing:**
58
+ - Remove filler words from speech
59
+ - Combine multiple audio files into dialogue
60
+ - Create professional audio workflows
61
+
62
+ To get started, simply provide audio file URLs and describe what you'd like me to do!
63
+ """
64
+
65
+ # How to use instructions
66
+ if any(keyword in user_lower for keyword in ["how", "use", "start", "begin"]):
67
+ return """
68
+ 🚀 **How to Use the Audio Agent**
69
+
70
+ 1. **Provide Audio Files**: Share URLs to your audio files (mp3, wav, m4a, etc.)
71
+
72
+ 2. **Describe Your Goal**: Tell me what you want to achieve:
73
+ - "Remove filler words and improve audio quality"
74
+ - "Cut this audio from 30 seconds to 2 minutes"
75
+ - "Combine these files into a dialogue"
76
+ - "Normalize the volume and add fade effects"
77
+
78
+ 3. **Let Me Work**: I'll automatically:
79
+ - Generate timestamped transcripts
80
+ - Create an execution plan
81
+ - Process your audio step by step
82
+ - Provide you with the improved audio file
83
+
84
+ **Example**:
85
+ "Here's my recording: https://example.com/audio.mp3 - please remove filler words and normalize the volume"
86
+ """
87
+
88
+ # Filler words explanation
89
+ if any(keyword in user_lower for keyword in ["filler", "um", "uh", "like"]):
90
+ return """
91
+ 🗣️ **Filler Word Removal**
92
+
93
+ I can help remove common filler words like "um", "uh", "like", "you know", etc. from your audio.
94
+
95
+ **Process**:
96
+ 1. I'll transcribe your audio with timestamps
97
+ 2. Identify filler words and their locations
98
+ 3. Remove those segments from the audio
99
+ 4. Apply smooth transitions to maintain natural flow
100
+
101
+ **Benefits**:
102
+ - More professional-sounding recordings
103
+ - Improved clarity and pace
104
+ - Better listener engagement
105
+
106
+ Just provide your audio file and mention "remove filler words" in your request!
107
+ """
108
+
109
+ # General greeting or unclear request
110
+ if any(keyword in user_lower for keyword in ["hello", "hi", "help"]) or len(user_request.strip()) < 10:
111
+ return """
112
+ 👋 **Hello! I'm your Audio Processing Assistant**
113
+
114
+ I specialize in improving and processing audio files. I can:
115
+
116
+ - Remove filler words and improve speech clarity
117
+ - Cut, trim, and edit audio segments
118
+ - Normalize volume and apply professional effects
119
+ - Combine multiple files into conversations
120
+ - Generate timestamped transcriptions
121
+
122
+ **Ready to enhance your audio?** Just share your audio file URLs and tell me what you'd like me to do!
123
+
124
+ Type "what tools are available?" to see all my capabilities.
125
+ """
126
+
127
+ # Default response for other questions
128
+ return """
129
+ I'm here to help with audio processing! While I can chat about audio-related topics, my specialty is improving audio files.
130
+
131
+ If you have audio files you'd like me to process, just share the URLs and describe what you need. Otherwise, feel free to ask me about my audio processing capabilities!
132
+ """
src/nodes/final_response.py ADDED
@@ -0,0 +1,205 @@
1
+ """
2
+ Final response node for formatting the final response to the user.
3
+ """
4
+
5
+ from typing import Dict, Any
6
+ from langchain_core.messages import AIMessage
7
+
8
+
9
+ def final_response_node(state: Dict[str, Any]) -> Dict[str, Any]:
10
+ """
11
+ Generate the final response to the user with processing results and audio files.
12
+ """
13
+
14
+ processing_type = state.get("processing_type", "")
15
+ processed_files = state.get("processed_files", {})
16
+ scripts = state.get("scripts", {})
17
+ errors = state.get("errors", [])
18
+ processing_metadata = state.get("processing_metadata", {})
19
+ user_request = state.get("user_request", "")
20
+
21
+ # Generate final response based on processing type
22
+ if processing_type == "chat":
23
+ # Chat responses are already handled in chat_node
24
+ final_response = state.get("final_response", "")
25
+ else:
26
+ final_response = create_audio_processing_response(
27
+ user_request, processed_files, scripts, errors, processing_metadata
28
+ )
29
+
30
+ # Add final response to messages
31
+ messages = state.get("messages", [])
32
+ if not any(msg.content == final_response for msg in messages if hasattr(msg, 'content')):
33
+ messages.append(AIMessage(content=final_response))
34
+
35
+ # Set final audio URL if available
36
+ final_audio_url = get_final_audio_url(processed_files, processing_type)
37
+
38
+ return {
39
+ "final_response": final_response,
40
+ "final_audio_url": final_audio_url,
41
+ "messages": messages
42
+ }
43
+
44
+
45
+ def create_audio_processing_response(
46
+ user_request: str,
47
+ processed_files: Dict[str, str],
48
+ scripts: Dict[str, Any],
49
+ errors: list,
50
+ processing_metadata: Dict[str, Any]
51
+ ) -> str:
52
+ """Create comprehensive audio processing response."""
53
+
54
+ response = "🎡 **Audio Processing Complete!**\n\n"
55
+
56
+ # User request summary
57
+ response += f"**Your Request**: {user_request}\n\n"
58
+
59
+ # Processing results
60
+ if processed_files:
61
+ response += "**βœ… Successfully Processed Files:**\n"
62
+ for i, (original, processed) in enumerate(processed_files.items(), 1):
63
+ original_name = get_filename_from_url(original)
64
+ processed_name = get_filename_from_url(processed)
65
+
66
+ response += f"{i}. **{original_name}**\n"
67
+ response += f" πŸ”— **Download**: [{processed_name}]({processed})\n\n"
68
+
69
+ # Add script info if available
70
+ if original in scripts:
71
+ script_data = scripts[original]
72
+ filler_count = len(script_data.get("filler_words", []))
73
+ if filler_count > 0:
74
+ response += f" πŸ“ Removed {filler_count} filler words\n"
75
+ response += f" πŸ“„ Transcript available\n\n"
76
+
77
+ # Processing summary
78
+ validation_results = processing_metadata.get("validation_results", {})
79
+ if validation_results:
80
+ completion_rate = validation_results.get("step_completion_rate", 0)
81
+ overall_status = validation_results.get("overall_status", "unknown")
82
+
83
+ response += f"**πŸ“Š Processing Summary:**\n"
84
+ response += f"- Status: {overall_status.replace('_', ' ').title()}\n"
85
+ response += f"- Completion: {completion_rate:.1%}\n"
86
+ response += f"- Files processed: {len(processed_files)}\n\n"
87
+
88
+ # Improvements made
89
+ improvements = extract_improvements_from_processing(processed_files, scripts, processing_metadata)
90
+ if improvements:
91
+ response += "**πŸ”§ Improvements Applied:**\n"
92
+ for improvement in improvements:
93
+ response += f"- {improvement}\n"
94
+ response += "\n"
95
+
96
+ # Recommendations
97
+ recommendations = validation_results.get("recommendations", [])
98
+ if recommendations:
99
+ response += "**πŸ’‘ Recommendations:**\n"
100
+ for rec in recommendations[:3]: # Show top 3
101
+ response += f"- {rec}\n"
102
+ response += "\n"
103
+
104
+ # Errors (if any)
105
+ if errors:
106
+ response += "**⚠️ Issues Encountered:**\n"
107
+ for error in errors[-2:]: # Show last 2 errors
108
+ response += f"- {error}\n"
109
+ response += "\n"
110
+
111
+ # Call to action
112
+ if processed_files:
113
+ response += "πŸŽ‰ **Your enhanced audio files are ready!** "
114
+ response += "Click the download links above to get your improved audio.\n\n"
115
+ response += "Need further adjustments? Just let me know what else you'd like me to do!"
116
+ else:
117
+ response += "❌ **Processing unsuccessful.** "
118
+ response += "Please check your audio file URLs and try again, or ask for help with a different approach."
119
+
120
+ return response
121
+
122
+
123
+ def get_final_audio_url(processed_files: Dict[str, str], processing_type: str) -> str:
124
+ """Get the final audio URL to return to the user."""
125
+
126
+ if not processed_files:
127
+ return ""
128
+
129
+ # For dialogue generation, look for combined file
130
+ if processing_type == "dialogue_generation":
131
+ for original, processed in processed_files.items():
132
+ if "combined" in processed or "dialogue" in processed:
133
+ return processed
134
+
135
+ # For single file processing, return the processed file
136
+ if len(processed_files) == 1:
137
+ return list(processed_files.values())[0]
138
+
139
+ # For multiple files, return the first one (or could be user's choice)
140
+ return list(processed_files.values())[0] if processed_files else ""
141
+
142
+
143
+ def get_filename_from_url(url: str) -> str:
144
+ """Extract filename from URL or path."""
145
+ if not url:
146
+ return "unknown_file"
147
+
148
+ # Extract filename from URL
149
+ if '/' in url:
150
+ filename = url.split('/')[-1]
151
+ else:
152
+ filename = url
153
+
154
+ # Remove query parameters if present
155
+ if '?' in filename:
156
+ filename = filename.split('?')[0]
157
+
158
+ return filename or "processed_audio"
159
+
160
+
161
+ def extract_improvements_from_processing(
162
+ processed_files: Dict[str, str],
163
+ scripts: Dict[str, Any],
164
+ processing_metadata: Dict[str, Any]
165
+ ) -> list:
166
+ """Extract list of improvements made during processing."""
167
+
168
+ improvements = []
169
+
170
+ # Check for filler word removal
171
+ total_fillers = 0
172
+ for script_data in scripts.values():
173
+ filler_words = script_data.get("filler_words", [])
174
+ total_fillers += len(filler_words)
175
+
176
+ if total_fillers > 0:
177
+ improvements.append(f"Removed {total_fillers} filler words for cleaner speech")
178
+
179
+ # Check for audio enhancement
180
+ if processed_files:
181
+ improvements.append("Enhanced audio quality and consistency")
182
+ improvements.append("Optimized volume levels and normalization")
183
+
184
+ # Check for silence removal
185
+ validation_results = processing_metadata.get("validation_results", {})
186
+ recommendations = validation_results.get("recommendations", [])
187
+
188
+ if any("silence" in rec.lower() for rec in recommendations):
189
+ improvements.append("Removed unnecessary silence and gaps")
190
+
191
+ if any("fade" in rec.lower() for rec in recommendations):
192
+ improvements.append("Added professional fade effects")
193
+
194
+ if any("cut" in rec.lower() for rec in recommendations):
195
+ improvements.append("Precisely cut and trimmed audio segments")
196
+
197
+ # Default improvements if files were processed
198
+ if processed_files and not improvements:
199
+ improvements.extend([
200
+ "Applied professional audio processing",
201
+ "Improved overall audio quality",
202
+ "Optimized for better listening experience"
203
+ ])
204
+
205
+ return improvements
src/nodes/planner.py ADDED
@@ -0,0 +1,284 @@
1
+ """
2
+ Planner node for creating execution plans for audio processing.
3
+ """
4
+
5
+ from typing import Dict, Any, List
6
+ from langchain_core.messages import AIMessage
7
+
8
+
9
+ def planner_node(state: Dict[str, Any]) -> Dict[str, Any]:
10
+ """
11
+ Create an execution plan for audio processing based on user request and scripts.
12
+ """
13
+
14
+ user_request = state.get("user_request", "")
15
+ audio_files = state.get("audio_files", [])
16
+ scripts = state.get("scripts", {})
17
+ processing_type = state.get("processing_type", "")
18
+
19
+ # Create execution plan based on processing type and user request
20
+ if processing_type == "dialogue_generation":
21
+ execution_plan = create_dialogue_plan(user_request, audio_files, scripts)
22
+ else:
23
+ execution_plan = create_audio_processing_plan(user_request, audio_files, scripts)
24
+
25
+ # Create plan summary message
26
+ plan_summary = create_plan_summary(execution_plan)
27
+ messages = state.get("messages", [])
28
+ messages.append(AIMessage(content=plan_summary))
29
+
30
+ return {
31
+ "execution_plan": execution_plan,
32
+ "messages": messages
33
+ }
34
+
35
+
36
+ def create_audio_processing_plan(user_request: str, audio_files: List[str], scripts: Dict[str, Any]) -> List[Dict[str, Any]]:
37
+ """Create execution plan for single file audio processing."""
38
+
39
+ plan = []
40
+ user_lower = user_request.lower()
41
+
42
+ for audio_file in audio_files:
43
+ file_plan = []
44
+
45
+ # Step 1: Update audio info
46
+ file_plan.append({
47
+ "step": f"update_info_{audio_file}",
48
+ "tool": "update_audio_info",
49
+ "params": {"audio_file": audio_file},
50
+ "description": f"Update audio information for {audio_file}"
51
+ })
52
+
53
+ # Step 2: Update duration info
54
+ file_plan.append({
55
+ "step": f"update_duration_{audio_file}",
56
+ "tool": "update_duration_info",
57
+ "params": {"audio_file": audio_file},
58
+ "description": f"Update duration information for {audio_file}"
59
+ })
60
+
61
+ # Step 3: Process based on user request
62
+
63
+ # Filler word removal (via silence trimming and cutting)
64
+ if any(keyword in user_lower for keyword in ["filler", "remove", "clean", "improve"]):
65
+ # First, trim silence
66
+ file_plan.append({
67
+ "step": f"trim_silence_{audio_file}",
68
+ "tool": "apply_silence_trimming",
69
+ "params": {"audio_file": audio_file, "threshold_db": -40},
70
+ "description": f"Remove silence and filler segments from {audio_file}"
71
+ })
72
+
73
+ # Apply filler word removal via cutting (using script data)
74
+ if audio_file in scripts and scripts[audio_file].get("filler_words"):
75
+ file_plan.append({
76
+ "step": f"remove_fillers_{audio_file}",
77
+ "tool": "process_cut_audio",
78
+ "params": {"audio_file": audio_file, "_start_time": 0, "_end_time": 100},
79
+ "description": f"Remove filler words from {audio_file}",
80
+ "filler_data": scripts[audio_file]["filler_words"]
81
+ })
82
+
83
+ # Audio cutting/trimming
84
+ if any(keyword in user_lower for keyword in ["cut", "trim", "segment"]):
85
+ # Extract time ranges if specified
86
+ start_time, end_time = extract_time_range(user_request)
87
+ file_plan.append({
88
+ "step": f"cut_audio_{audio_file}",
89
+ "tool": "process_cut_audio",
90
+ "params": {"audio_file": audio_file, "_start_time": start_time, "_end_time": end_time},
91
+ "description": f"Cut audio from {start_time}s to {end_time}s"
92
+ })
93
+
94
+ # Volume/normalization adjustments
95
+ if any(keyword in user_lower for keyword in ["normalize", "volume", "loud", "quiet", "level"]):
96
+ if "normalize" in user_lower:
97
+ target_level = extract_target_level(user_request)
98
+ file_plan.append({
99
+ "step": f"normalize_{audio_file}",
100
+ "tool": "apply_normalization",
101
+ "params": {"audio_file": audio_file, "target_level": target_level},
102
+ "description": f"Normalize audio to {target_level}dB"
103
+ })
104
+ else:
105
+ gain_db = extract_gain_value(user_request)
106
+ file_plan.append({
107
+ "step": f"adjust_volume_{audio_file}",
108
+ "tool": "apply_volume_adjustment",
109
+ "params": {"audio_file": audio_file, "gain_db": gain_db},
110
+ "description": f"Adjust volume by {gain_db}dB"
111
+ })
112
+
113
+ # Speed adjustments
114
+ if any(keyword in user_lower for keyword in ["speed", "fast", "slow", "tempo"]):
115
+ speed_factor = extract_speed_factor(user_request)
116
+ file_plan.append({
117
+ "step": f"adjust_speed_{audio_file}",
118
+ "tool": "apply_speed_adjustment",
119
+ "params": {"audio_file": audio_file, "speed_factor": speed_factor},
120
+ "description": f"Adjust speed to {speed_factor}x"
121
+ })
122
+
123
+ # Fade effects
124
+ if any(keyword in user_lower for keyword in ["fade", "smooth", "transition"]):
125
+ fade_in, fade_out = extract_fade_values(user_request)
126
+ file_plan.append({
127
+ "step": f"apply_fades_{audio_file}",
128
+ "tool": "apply_fades",
129
+ "params": {"audio_file": audio_file, "fade_in_ms": fade_in, "fade_out_ms": fade_out},
130
+ "description": f"Apply fade in ({fade_in}ms) and fade out ({fade_out}ms)"
131
+ })
132
+
133
+ # If no specific processing mentioned, apply default enhancement
134
+ if len(file_plan) <= 2: # Only info updates
135
+ file_plan.extend([
136
+ {
137
+ "step": f"enhance_{audio_file}",
138
+ "tool": "apply_silence_trimming",
139
+ "params": {"audio_file": audio_file, "threshold_db": -40},
140
+ "description": f"Remove silence from {audio_file}"
141
+ },
142
+ {
143
+ "step": f"normalize_{audio_file}",
144
+ "tool": "apply_normalization",
145
+ "params": {"audio_file": audio_file, "target_level": -3},
146
+ "description": f"Normalize audio levels"
147
+ }
148
+ ])
149
+
150
+ plan.extend(file_plan)
151
+
152
+ return plan
153
+
154
+
155
+ def create_dialogue_plan(user_request: str, audio_files: List[str], scripts: Dict[str, Any]) -> List[Dict[str, Any]]:
156
+ """Create execution plan for dialogue generation from multiple files."""
157
+
158
+ plan = []
159
+
160
+ # Step 1: Process each file individually first
161
+ for audio_file in audio_files:
162
+ # Update info
163
+ plan.append({
164
+ "step": f"update_info_{audio_file}",
165
+ "tool": "update_audio_info",
166
+ "params": {"audio_file": audio_file},
167
+ "description": f"Update audio info for {audio_file}"
168
+ })
169
+
170
+ # Clean up the audio
171
+ plan.append({
172
+ "step": f"cleanup_{audio_file}",
173
+ "tool": "apply_silence_trimming",
174
+ "params": {"audio_file": audio_file, "threshold_db": -40},
175
+ "description": f"Clean silence from {audio_file}"
176
+ })
177
+
178
+ # Normalize levels
179
+ plan.append({
180
+ "step": f"normalize_{audio_file}",
181
+ "tool": "apply_normalization",
182
+ "params": {"audio_file": audio_file, "target_level": -6},
183
+ "description": f"Normalize {audio_file} for dialogue"
184
+ })
185
+
186
+ # Add fades for smooth transitions
187
+ plan.append({
188
+ "step": f"fade_{audio_file}",
189
+ "tool": "apply_fades",
190
+ "params": {"audio_file": audio_file, "fade_in_ms": 200, "fade_out_ms": 200},
191
+ "description": f"Add fades to {audio_file}"
192
+ })
193
+
194
+ # Step 2: Combine files (this would need a combine tool, but we'll note it)
195
+ plan.append({
196
+ "step": "combine_dialogue",
197
+ "tool": "manual_combine", # This would need to be implemented
198
+ "params": {"files": audio_files},
199
+ "description": "Combine processed files into dialogue",
200
+ "note": "This step requires manual combination or a dedicated combine tool"
201
+ })
202
+
203
+ return plan
204
+
205
+
206
+ def extract_time_range(user_request: str) -> tuple:
207
+ """Extract start and end times from user request."""
208
+ import re
209
+
210
+     # Look for time patterns like "30 seconds to 2 minutes" or "1:30 to 3:45";
+     # values are returned in seconds, matching how the plan steps describe them.
+     time_pattern = r'(\d+)(?::(\d+))?\s*(seconds?|minutes?|s|m)?\s*to\s*(\d+)(?::(\d+))?\s*(seconds?|minutes?|s|m)?'
+     match = re.search(time_pattern, user_request.lower())
+
+     if match:
+         def to_seconds(value, colon_part, unit):
+             if colon_part:  # "mm:ss" form
+                 return int(value) * 60 + int(colon_part)
+             if unit and unit.startswith("m"):  # e.g. "2 minutes"
+                 return int(value) * 60
+             return int(value)  # bare number or explicit seconds
+
+         s_val, s_colon, s_unit, e_val, e_colon, e_unit = match.groups()
+         return to_seconds(s_val, s_colon, s_unit), to_seconds(e_val, e_colon, e_unit)
+
+     # Default range (seconds)
+     return 0, 30
222
+
223
+
224
+ def extract_target_level(user_request: str) -> float:
225
+ """Extract target normalization level."""
226
+ import re
227
+ match = re.search(r'-?(\d+(?:\.\d+)?)\s*db', user_request.lower())
228
+ if match:
229
+ return -abs(float(match.group(1))) # Ensure negative
230
+ return -3 # Default
231
+
232
+
233
+ def extract_gain_value(user_request: str) -> float:
234
+ """Extract gain adjustment value."""
235
+ import re
236
+ match = re.search(r'([+-]?\d+(?:\.\d+)?)\s*db', user_request.lower())
237
+ if match:
238
+ return float(match.group(1))
239
+ return 0 # Default
240
+
241
+
242
+ def extract_speed_factor(user_request: str) -> float:
243
+ """Extract speed factor."""
244
+ import re
245
+ match = re.search(r'(\d+(?:\.\d+)?)\s*x', user_request.lower())
246
+ if match:
247
+ return float(match.group(1))
248
+
249
+ if any(word in user_request.lower() for word in ["fast", "faster", "quick"]):
250
+ return 1.5
251
+ elif any(word in user_request.lower() for word in ["slow", "slower"]):
252
+ return 0.75
253
+
254
+ return 1.0 # Default
255
+
256
+
257
+ def extract_fade_values(user_request: str) -> tuple:
258
+ """Extract fade in/out values."""
259
+ import re
260
+ match = re.search(r'(\d+)\s*ms', user_request.lower())
261
+ if match:
262
+ value = int(match.group(1))
263
+ return value, value
264
+ return 100, 100 # Default
265
+
266
+
267
+ def create_plan_summary(execution_plan: List[Dict[str, Any]]) -> str:
268
+ """Create a summary of the execution plan."""
269
+
270
+ if not execution_plan:
271
+ return "❌ **No execution plan could be created**"
272
+
273
+ summary = "πŸ“‹ **Execution Plan Created**\n\n"
274
+ summary += f"**Total Steps**: {len(execution_plan)}\n\n"
275
+
276
+ for i, step in enumerate(execution_plan, 1):
277
+ tool_name = step.get("tool", "unknown")
278
+ description = step.get("description", "No description")
279
+
280
+ summary += f"**{i}.** `{tool_name}`\n"
281
+ summary += f" {description}\n\n"
282
+
283
+ summary += "βœ… **Ready to execute plan...**"
284
+ return summary
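Reviewer note: with the seconds-based parsing in extract_time_range above, the planner helpers behave roughly as follows (illustrative inputs and outputs; the import path is assumed):

from src.nodes.planner import extract_time_range, extract_target_level, extract_speed_factor  # assumed path

print(extract_time_range("cut this from 1:30 to 3:45"))        # -> (90, 225)
print(extract_time_range("cut from 30 seconds to 2 minutes"))  # -> (30, 120)
print(extract_time_range("just clean it up"))                  # -> (0, 30), the default window
print(extract_target_level("normalize to -6 dB"))              # -> -6.0
print(extract_speed_factor("make it 1.5x faster"))             # -> 1.5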
src/nodes/router.py ADDED
@@ -0,0 +1,97 @@
1
+ """
2
+ Router node to determine processing type based on user input.
3
+ """
4
+
5
+ import re
6
+ from typing import Dict, Any, List
7
+ from langchain_core.messages import HumanMessage, AIMessage
8
+
9
+
10
+ def router_node(state: Dict[str, Any]) -> Dict[str, Any]:
11
+ """
12
+ Route the conversation based on user input.
13
+
14
+ Determines if this is:
15
+ - A general chat question
16
+ - Audio processing request
17
+ - Dialogue generation request
18
+ """
19
+
20
+ # Get the latest user message
21
+ latest_message = None
22
+ for msg in reversed(state.get("messages", [])):
23
+ if isinstance(msg, HumanMessage):
24
+ latest_message = msg
25
+ break
26
+
27
+ if not latest_message:
28
+ return {
29
+ "processing_type": "chat",
30
+ "user_request": "",
31
+ "audio_files": []
32
+ }
33
+
34
+ user_content = latest_message.content.lower()
35
+
36
+ # Extract audio file URLs/paths from the message
37
+ audio_files = extract_audio_files(latest_message.content)
38
+
39
+ # Determine processing type
40
+ processing_type = determine_processing_type(user_content, audio_files)
41
+
42
+ return {
43
+ "processing_type": processing_type,
44
+ "user_request": latest_message.content,
45
+ "audio_files": audio_files,
46
+ "errors": [],
47
+ "needs_reprocessing": False,
48
+ "completed_steps": [],
49
+ "scripts": {},
50
+ "processed_files": {},
51
+ "processing_metadata": {}
52
+ }
53
+
54
+
55
+ def extract_audio_files(content: str) -> List[str]:
56
+ """Extract audio file URLs or paths from user message."""
57
+
58
+ # Look for URLs (http/https)
59
+ url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+\.(?:mp3|wav|m4a|flac|aac|ogg)'  # non-capturing group so findall returns full URLs
60
+ urls = re.findall(url_pattern, content, re.IGNORECASE)
61
+
62
+ # Look for file paths
63
+ path_pattern = r'[^\s<>"{}|\\^`\[\]]+\.(?:mp3|wav|m4a|flac|aac|ogg)'  # non-capturing group so findall returns full paths
64
+ paths = re.findall(path_pattern, content, re.IGNORECASE)
65
+
66
+ # Combine and deduplicate
67
+ audio_files = list(set(urls + [path for path in paths if not path.startswith('http')]))
68
+
69
+ return audio_files
70
+
71
+
72
+ def determine_processing_type(content: str, audio_files: List[str]) -> str:
73
+ """Determine the type of processing needed."""
74
+
75
+ # If no audio files, it's a chat
76
+ if not audio_files:
77
+ # Check if user is asking about audio tools or capabilities
78
+ audio_keywords = [
79
+ 'audio', 'sound', 'music', 'voice', 'recording', 'transcript',
80
+ 'cut', 'trim', 'normalize', 'volume', 'fade', 'speed', 'reverse'
81
+ ]
82
+
83
+ if any(keyword in content for keyword in audio_keywords):
84
+ return "chat" # User asking about audio capabilities
85
+
86
+ return "chat"
87
+
88
+ # If audio files are present, determine processing type
89
+ dialogue_keywords = [
90
+ 'dialogue', 'conversation', 'combine', 'merge', 'mix',
91
+ 'discussion', 'interview'
92
+ ]
93
+
94
+ if any(keyword in content for keyword in dialogue_keywords):
95
+ return "dialogue_generation"
96
+
97
+ return "audio_processing"
src/nodes/script_generator.py ADDED
@@ -0,0 +1,159 @@
1
+ """
2
+ Script generator node for creating timestamped transcripts.
3
+ """
4
+
5
+ from typing import Dict, Any
6
+ from langchain_core.messages import AIMessage
7
+
8
+
9
+ async def script_generator_node(state: Dict[str, Any], tools: list) -> Dict[str, Any]:
10
+ """
11
+ Generate timestamped scripts for all audio files using transcription tools.
12
+ """
13
+
14
+ audio_files = state.get("audio_files", [])
15
+
16
+ if not audio_files:
17
+ return {
18
+ "scripts": {},
19
+ "errors": ["No audio files provided for transcription"]
20
+ }
21
+
22
+ scripts = {}
23
+ errors = []
24
+ completed_steps = state.get("completed_steps", [])
25
+
26
+ # Get transcription tools
27
+ transcribe_tool = None
28
+ update_transcription_tool = None
29
+
30
+ for tool in tools:
31
+ if tool.name == "transcribe_audio_sync":
32
+ transcribe_tool = tool
33
+ elif tool.name == "update_transcription_info":
34
+ update_transcription_tool = tool
35
+
36
+ if not transcribe_tool:
37
+ return {
38
+ "scripts": {},
39
+ "errors": ["Transcription tool not available"]
40
+ }
41
+
42
+ # Process each audio file
43
+ for audio_file in audio_files:
44
+ try:
45
+ # Update transcription info first if tool is available
46
+ if update_transcription_tool:
47
+ await update_transcription_tool.ainvoke({"audio_file": audio_file})
48
+
49
+ # Generate transcript with timestamps
50
+ transcript_result = await transcribe_tool.ainvoke({"audio_file": audio_file})
51
+
52
+ # Parse the transcript result
53
+ if hasattr(transcript_result, 'content'):
54
+ transcript_content = transcript_result.content
55
+ else:
56
+ transcript_content = str(transcript_result)
57
+
58
+ scripts[audio_file] = {
59
+ "transcript": transcript_content,
60
+ "timestamps": extract_timestamps(transcript_content),
61
+ "filler_words": identify_filler_words(transcript_content)
62
+ }
63
+
64
+ completed_steps.append(f"Transcribed: {audio_file}")
65
+
66
+ except Exception as e:
67
+ errors.append(f"Failed to transcribe {audio_file}: {str(e)}")
68
+
69
+ # Create response message
70
+ script_summary = create_script_summary(scripts)
71
+ messages = state.get("messages", [])
72
+ messages.append(AIMessage(content=script_summary))
73
+
74
+ return {
75
+ "scripts": scripts,
76
+ "completed_steps": completed_steps,
77
+ "errors": errors,
78
+ "messages": messages
79
+ }
80
+
81
+
82
+ def extract_timestamps(transcript_content: str) -> list:
83
+ """Extract timestamp information from transcript."""
84
+ # This is a simplified implementation
85
+ # In a real scenario, the transcription tool would provide proper timestamps
86
+
87
+ timestamps = []
88
+ lines = transcript_content.split('\n')
89
+
90
+ for i, line in enumerate(lines):
91
+ if line.strip():
92
+ # Estimate timestamps based on line position
93
+ start_time = i * 3.0 # Rough estimate of 3 seconds per line
94
+ end_time = start_time + 3.0
95
+
96
+ timestamps.append({
97
+ "start": start_time,
98
+ "end": end_time,
99
+ "text": line.strip()
100
+ })
101
+
102
+ return timestamps
103
+
104
+
105
+ def identify_filler_words(transcript_content: str) -> list:
106
+ """Identify filler words and their approximate positions."""
107
+
108
+ filler_words = [
109
+ "um", "uh", "like", "you know", "so", "well", "actually",
110
+ "basically", "literally", "I mean", "sort of", "kind of"
111
+ ]
112
+
113
+ found_fillers = []
114
+ words = transcript_content.lower().split()
115
+
116
+ for i, word in enumerate(words):
117
+ # Clean the word (remove punctuation) and build a two-word phrase for multi-word fillers
+ clean_word = word.strip('.,!?;:"()[]{}')
+ next_word = words[i + 1].strip('.,!?;:"()[]{}') if i + 1 < len(words) else ""
+
+ # Match single-word fillers ("um") and two-word fillers ("you know", "kind of")
+ if clean_word in filler_words or f"{clean_word} {next_word}" in filler_words:
121
+ found_fillers.append({
122
+ "word": clean_word,
123
+ "position": i,
124
+ "context": " ".join(words[max(0, i-2):min(len(words), i+3)])
125
+ })
126
+
127
+ return found_fillers
128
+
129
+
130
+ def create_script_summary(scripts: Dict[str, Any]) -> str:
131
+ """Create a summary of the generated scripts."""
132
+
133
+ if not scripts:
134
+ return "❌ **Script Generation Failed**\n\nNo transcripts could be generated."
135
+
136
+ summary = "πŸ“ **Transcripts Generated Successfully**\n\n"
137
+
138
+ for file_url, script_data in scripts.items():
139
+ filename = file_url.split('/')[-1] if '/' in file_url else file_url
140
+ transcript = script_data.get("transcript", "")
141
+ filler_count = len(script_data.get("filler_words", []))
142
+ timestamp_count = len(script_data.get("timestamps", []))
143
+
144
+ summary += f"**🎡 {filename}**\n"
145
+ summary += f"- Transcript length: {len(transcript)} characters\n"
146
+ summary += f"- Timestamps: {timestamp_count} segments\n"
147
+ summary += f"- Filler words detected: {filler_count}\n\n"
148
+
149
+ # Show first few lines of transcript
150
+ lines = transcript.split('\n')[:3]
151
+ if lines:
152
+ summary += "**Preview:**\n"
153
+ for line in lines:
154
+ if line.strip():
155
+ summary += f"> {line.strip()}\n"
156
+ summary += "\n"
157
+
158
+ summary += "βœ… **Ready for execution planning...**"
159
+ return summary
src/nodes/validator.py ADDED
@@ -0,0 +1,241 @@
1
+ """
2
+ Validator node for checking processing results and determining if reprocessing is needed.
3
+ """
4
+
5
+ from typing import Dict, Any, List
6
+ from langchain_core.messages import AIMessage
7
+
8
+
9
+ def validator_node(state: Dict[str, Any]) -> Dict[str, Any]:
10
+ """
11
+ Validate the processing results and determine if reprocessing is needed.
12
+ """
13
+
14
+ processed_files = state.get("processed_files", {})
15
+ errors = state.get("errors", [])
16
+ completed_steps = state.get("completed_steps", [])
17
+ execution_plan = state.get("execution_plan", [])
18
+ needs_reprocessing = state.get("needs_reprocessing", False)
19
+
20
+ # Perform validation checks
21
+ validation_results = perform_validation_checks(
22
+ processed_files, errors, completed_steps, execution_plan
23
+ )
24
+
25
+ # Determine if reprocessing is needed
26
+ should_reprocess = determine_reprocessing_need(validation_results, needs_reprocessing)
27
+
28
+ # Create validation summary
29
+ validation_summary = create_validation_summary(validation_results, should_reprocess)
30
+ messages = state.get("messages", [])
31
+ messages.append(AIMessage(content=validation_summary))
32
+
33
+ return {
34
+ "needs_reprocessing": should_reprocess,
35
+ "processing_metadata": {
36
+ "validation_results": validation_results,
37
+ "validation_timestamp": get_current_timestamp()
38
+ },
39
+ "messages": messages
40
+ }
41
+
42
+
43
+ def perform_validation_checks(
44
+ processed_files: Dict[str, str],
45
+ errors: List[str],
46
+ completed_steps: List[str],
47
+ execution_plan: List[Dict[str, Any]]
48
+ ) -> Dict[str, Any]:
49
+ """Perform comprehensive validation of processing results."""
50
+
51
+ validation_results = {
52
+ "overall_status": "unknown",
53
+ "file_processing_success": {},
54
+ "step_completion_rate": 0,
55
+ "critical_errors": [],
56
+ "warnings": [],
57
+ "recommendations": []
58
+ }
59
+
60
+ # Check file processing success
61
+ for original_file in processed_files.keys():
62
+ processed_url = processed_files[original_file]
63
+
64
+ if processed_url and processed_url != original_file:
65
+ validation_results["file_processing_success"][original_file] = "success"
66
+ else:
67
+ validation_results["file_processing_success"][original_file] = "failed"
68
+
69
+ # Calculate step completion rate
70
+ total_steps = len(execution_plan)
71
+ if total_steps > 0:
72
+ successful_steps = len([step for step in completed_steps if step.startswith("✅")])
73
+ validation_results["step_completion_rate"] = successful_steps / total_steps
74
+
75
+ # Analyze errors for critical issues
76
+ critical_keywords = ["tool not available", "failed to transcribe", "connection", "timeout"]
77
+ for error in errors:
78
+ error_lower = error.lower()
79
+ if any(keyword in error_lower for keyword in critical_keywords):
80
+ validation_results["critical_errors"].append(error)
81
+ else:
82
+ validation_results["warnings"].append(error)
83
+
84
+ # Generate recommendations
85
+ validation_results["recommendations"] = generate_recommendations(
86
+ processed_files, errors, completed_steps, validation_results["step_completion_rate"]
87
+ )
88
+
89
+ # Determine overall status
90
+ if validation_results["step_completion_rate"] >= 0.8 and not validation_results["critical_errors"]:
91
+ validation_results["overall_status"] = "success"
92
+ elif validation_results["step_completion_rate"] >= 0.5:
93
+ validation_results["overall_status"] = "partial_success"
94
+ else:
95
+ validation_results["overall_status"] = "failed"
96
+
97
+ return validation_results
98
+
99
+
100
+ def determine_reprocessing_need(validation_results: Dict[str, Any], current_needs_reprocessing: bool) -> bool:
101
+ """Determine if reprocessing is needed based on validation results."""
102
+
103
+ overall_status = validation_results.get("overall_status", "unknown")
104
+ step_completion_rate = validation_results.get("step_completion_rate", 0)
105
+ critical_errors = validation_results.get("critical_errors", [])
106
+
107
+ # Don't reprocess if we're already in a reprocessing cycle to avoid loops
108
+ if current_needs_reprocessing:
109
+ return False
110
+
111
+ # Reprocess if there are critical errors and some steps succeeded
112
+ if critical_errors and step_completion_rate > 0.2:
113
+ return True
114
+
115
+ # Reprocess if completion rate is low but not zero
116
+ if 0.1 < step_completion_rate < 0.7:
117
+ return True
118
+
119
+ # Don't reprocess if everything failed (likely a fundamental issue)
120
+ if step_completion_rate <= 0.1:
121
+ return False
122
+
123
+ # Don't reprocess if mostly successful
124
+ if step_completion_rate >= 0.8:
125
+ return False
126
+
127
+ return False
128
+
129
+
130
+ def generate_recommendations(
131
+ processed_files: Dict[str, str],
132
+ errors: List[str],
133
+ completed_steps: List[str],
134
+ completion_rate: float
135
+ ) -> List[str]:
136
+ """Generate recommendations based on processing results."""
137
+
138
+ recommendations = []
139
+
140
+ # File-specific recommendations
141
+ if not processed_files:
142
+ recommendations.append("No audio files were successfully processed. Check file URLs and format compatibility.")
143
+ elif len(processed_files) == 1:
144
+ recommendations.append("Single file processed. Consider adding fade effects or normalization for better quality.")
145
+ else:
146
+ recommendations.append(f"Multiple files processed ({len(processed_files)}). Consider combining them for dialogue if needed.")
147
+
148
+ # Error-based recommendations
149
+ if any("transcribe" in error.lower() for error in errors):
150
+ recommendations.append("Transcription issues detected. Verify audio quality and format.")
151
+
152
+ if any("tool not available" in error.lower() for error in errors):
153
+ recommendations.append("Some tools were unavailable. Check MCP server connection.")
154
+
155
+ if any("normalize" in step for step in completed_steps):
156
+ recommendations.append("Audio levels normalized. Consider adjusting volume manually if needed.")
157
+
158
+ # Completion rate recommendations
159
+ if completion_rate < 0.5:
160
+ recommendations.append("Low completion rate. Consider simplifying the processing request.")
161
+ elif completion_rate > 0.9:
162
+ recommendations.append("Processing highly successful! Audio should be significantly improved.")
163
+
164
+ # Quality recommendations
165
+ filler_steps = [step for step in completed_steps if "filler" in step.lower()]
166
+ if filler_steps:
167
+ recommendations.append("Filler words processed. Review the audio for natural flow.")
168
+
169
+ cut_steps = [step for step in completed_steps if "cut" in step.lower()]
170
+ if cut_steps:
171
+ recommendations.append("Audio segments cut. Verify timing and transitions.")
172
+
173
+ return recommendations
174
+
175
+
176
+ def get_current_timestamp() -> str:
177
+ """Get current timestamp for metadata."""
178
+ import datetime
179
+ return datetime.datetime.now().isoformat()
180
+
181
+
182
+ def create_validation_summary(validation_results: Dict[str, Any], should_reprocess: bool) -> str:
183
+ """Create a summary of validation results."""
184
+
185
+ overall_status = validation_results.get("overall_status", "unknown")
186
+ completion_rate = validation_results.get("step_completion_rate", 0)
187
+ critical_errors = validation_results.get("critical_errors", [])
188
+ warnings = validation_results.get("warnings", [])
189
+ recommendations = validation_results.get("recommendations", [])
190
+
191
+ # Status emoji and header
192
+ status_emoji = {
193
+ "success": "βœ…",
194
+ "partial_success": "⚠️",
195
+ "failed": "❌",
196
+ "unknown": "❓"
197
+ }.get(overall_status, "❓")
198
+
199
+ summary = f"{status_emoji} **Validation Results**\n\n"
200
+
201
+ # Overall status
202
+ summary += f"**Overall Status**: {overall_status.replace('_', ' ').title()}\n"
203
+ summary += f"**Completion Rate**: {completion_rate:.1%}\n\n"
204
+
205
+ # Critical errors
206
+ if critical_errors:
207
+ summary += f"**🚨 Critical Issues ({len(critical_errors)}):**\n"
208
+ for error in critical_errors[:3]: # Show first 3
209
+ summary += f"- {error}\n"
210
+ if len(critical_errors) > 3:
211
+ summary += f"- ... and {len(critical_errors) - 3} more\n"
212
+ summary += "\n"
213
+
214
+ # Warnings
215
+ if warnings:
216
+ summary += f"**⚠️ Warnings ({len(warnings)}):**\n"
217
+ for warning in warnings[:2]: # Show first 2
218
+ summary += f"- {warning}\n"
219
+ if len(warnings) > 2:
220
+ summary += f"- ... and {len(warnings) - 2} more\n"
221
+ summary += "\n"
222
+
223
+ # Recommendations
224
+ if recommendations:
225
+ summary += "**πŸ’‘ Recommendations:**\n"
226
+ for rec in recommendations[:3]: # Show first 3
227
+ summary += f"- {rec}\n"
228
+ if len(recommendations) > 3:
229
+ summary += f"- ... and {len(recommendations) - 3} more\n"
230
+ summary += "\n"
231
+
232
+ # Reprocessing decision
233
+ if should_reprocess:
234
+ summary += "πŸ”„ **Reprocessing recommended** to address issues and improve results."
235
+ else:
236
+ if overall_status == "success":
237
+ summary += "πŸŽ‰ **Processing complete!** No reprocessing needed."
238
+ else:
239
+ summary += "⏹️ **Processing complete.** Reprocessing not recommended."
240
+
241
+ return summary
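Taken together, the helpers above could be exercised like this; the dictionary is sample data assumed for illustration, not the output of an actual validation pass:

```python
# Sample validation output (illustrative values, not from a real run).
validation_results = {
    "overall_status": "success",
    "step_completion_rate": 1.0,
    "critical_errors": [],
    "warnings": ["Minor clipping detected near the end of file2.wav"],
    "recommendations": ["Processing highly successful! Audio should be significantly improved."],
}

should_reprocess = determine_reprocessing_need(validation_results, current_needs_reprocessing=False)
print(create_validation_summary(validation_results, should_reprocess))
# Renders a "βœ… Validation Results" block ending with
# "πŸŽ‰ **Processing complete!** No reprocessing needed."
```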
src/state.py ADDED
@@ -0,0 +1,46 @@
1
+ """
2
+ Graph state definition for the audio processing agent.
3
+ """
4
+
5
+ from typing import List, Dict, Any, Optional, Annotated, TypedDict
6
+ from langchain_core.messages import BaseMessage
7
+ from langgraph.graph.message import add_messages
8
+
9
+
10
+ class AudioProcessingState(TypedDict):
11
+ """State schema for the audio processing graph."""
12
+
13
+ # Chat history
14
+ messages: Annotated[List[BaseMessage], add_messages]
15
+
16
+ # Audio files provided by user
17
+ audio_files: List[str] # URLs or paths to audio files
18
+
19
+ # User's processing request
20
+ user_request: str
21
+
22
+ # Processing type determined by router
23
+ processing_type: str # "chat", "audio_processing", "dialogue_generation"
24
+
25
+ # Generated scripts with timestamps
26
+ scripts: Dict[str, Any] # {file_url: {transcript: str, timestamps: List}}
27
+
28
+ # Execution plan created by planner
29
+ execution_plan: List[Dict[str, Any]] # List of tool calls with parameters
30
+
31
+ # Processing results
32
+ processed_files: Dict[str, str] # {original_url: processed_url}
33
+
34
+ # Processing steps completed
35
+ completed_steps: List[str]
36
+
37
+ # Final output
38
+ final_audio_url: Optional[str]
39
+ final_response: str
40
+
41
+ # Error handling
42
+ errors: List[str]
43
+ needs_reprocessing: bool
44
+
45
+ # Metadata
46
+ processing_metadata: Dict[str, Any]
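A rough sketch of how a schema like this is usually handed to a LangGraph `StateGraph`; the stub node, edge layout, and import path below are assumptions for illustration, not the graph wired up in this commit:

```python
from langgraph.graph import StateGraph, END

from src.state import AudioProcessingState  # import path assumed from this repo layout


def router_stub(state: AudioProcessingState) -> dict:
    # Nodes return partial state updates; the add_messages reducer merges the messages key.
    return {"processing_type": "chat"}


builder = StateGraph(AudioProcessingState)
builder.add_node("router", router_stub)
builder.set_entry_point("router")
builder.add_edge("router", END)
graph = builder.compile()
```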
src/ui.py CHANGED
@@ -1,25 +1,41 @@
1
  import asyncio
2
  import gradio as gr
3
- from gradio import ChatMessage
4
  from .agent import AudioAgent
5
 
6
  # Global agent instance
7
  agent = AudioAgent()
8
 
9
- def user_input(user_message, history):
10
  """
11
- Handle user input and add to chat history
12
  """
13
- if not user_message.strip():
14
- return "", history
15
 
16
  # Add user message to history
17
- history.append({"role": "user", "content": user_message})
18
- return "", history
19
 
20
  async def bot_response(history):
21
  """
22
- Generate bot response with streaming, organizing content by graph nodes
23
  """
24
  if not history or history[-1]["role"] != "user":
25
  return
@@ -36,27 +52,37 @@ async def bot_response(history):
36
  yield history
37
 
38
  # Track current node and organize content by nodes
39
- current_content = ""
40
- current_node = None
41
  nodes_content = {}
 
42
 
43
  # Stream the response
44
  async for chunk, node_name in agent.stream_chat(user_message):
45
- # If we encounter a new node, update the display structure
46
- if node_name != current_node:
47
- current_node = node_name
48
- if node_name not in nodes_content:
49
- nodes_content[node_name] = ""
50
 
51
  # Add chunk to the current node's content
 
 
 
52
  if chunk:
53
  nodes_content[node_name] += chunk
54
 
55
  # Build the formatted content with node headers
56
  formatted_content = ""
 
57
  for node, content in nodes_content.items():
58
  if content.strip(): # Only show nodes that have content
59
- formatted_content += f"**πŸ”§ {node}**\n\n{content}\n\n"
60
 
61
  # Update the chat history
62
  history[-1]["content"] = formatted_content.rstrip()
@@ -70,6 +96,32 @@ async def bot_response(history):
70
  history.append({"role": "assistant", "content": f"❌ **Error**: {str(e)}"})
71
  yield history
72
 
73
  def bot_response_sync(history):
74
  """
75
  Synchronous wrapper for the async bot response
@@ -88,35 +140,115 @@ def bot_response_sync(history):
88
 
89
  def create_interface():
90
  """
91
- Create and return the Gradio interface
92
  """
93
- with gr.Blocks(title="Audio Agent Chatbot") as demo:
94
- gr.Markdown("# 🎡 Audio Agent Chatbot")
95
- gr.Markdown("Chat with your audio agent! Ask about available tools or audio processing.")
96
-
97
- chatbot = gr.Chatbot(
98
- type="messages",
99
- height=500,
100
- show_copy_button=True,
101
- show_share_button=False
102
- )
103
 
104
  with gr.Row():
105
  msg = gr.Textbox(
106
- label="Your Message",
107
- placeholder="Ask about audio tools or processing...",
108
- lines=2,
109
  scale=4
110
  )
111
- send_btn = gr.Button("Send", variant="primary", scale=1)
112
 
113
- clear_btn = gr.Button("Clear Chat", variant="secondary")
114
 
115
  # Handle user input and bot response
 
 
 
116
  msg.submit(
117
- user_input,
118
- [msg, chatbot],
119
- [msg, chatbot],
120
  queue=False
121
  ).then(
122
  bot_response_sync,
@@ -125,9 +257,9 @@ def create_interface():
125
  )
126
 
127
  send_btn.click(
128
- user_input,
129
- [msg, chatbot],
130
- [msg, chatbot],
131
  queue=False
132
  ).then(
133
  bot_response_sync,
@@ -137,14 +269,29 @@ def create_interface():
137
 
138
  # Clear chat
139
  clear_btn.click(
140
- lambda: [],
141
  None,
142
- chatbot,
143
  queue=False
144
  )
145
 
146
  return demo
147
 
148
  if __name__ == "__main__":
149
  demo = create_interface()
150
- demo.launch(share=False, server_name="0.0.0.0", server_port=7861)
 
 
 
 
 
 
1
  import asyncio
2
  import gradio as gr
3
+ from typing import List, Tuple
4
  from .agent import AudioAgent
5
 
6
  # Global agent instance
7
  agent = AudioAgent()
8
 
9
+ def user_input(user_message, audio_files, history):
10
  """
11
+ Handle user input with text and audio files
12
  """
13
+ if not user_message.strip() and not audio_files:
14
+ return "", [], history
15
+
16
+ # Process audio files into URLs/paths
17
+ audio_file_paths = []
18
+ if audio_files:
19
+ for audio_file in audio_files:
20
+ if hasattr(audio_file, 'name'):
21
+ audio_file_paths.append(audio_file.name)
22
+ else:
23
+ audio_file_paths.append(str(audio_file))
24
+
25
+ # Create combined message with audio files
26
+ if audio_file_paths:
27
+ audio_list = "\n".join([f"Audio file: {path}" for path in audio_file_paths])
28
+ combined_message = f"{user_message}\n\n{audio_list}" if user_message.strip() else audio_list
29
+ else:
30
+ combined_message = user_message
31
 
32
  # Add user message to history
33
+ history.append({"role": "user", "content": combined_message})
34
+ return "", [], history
35
 
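For reference, with one uploaded file and a text prompt, `user_input` appends a combined message along these lines; the `FakeUpload` class and temp path are hypothetical stand-ins for what Gradio passes in:

```python
# Hypothetical upload object whose .name points at a Gradio temp file.
class FakeUpload:
    name = "/tmp/gradio/abc123/interview.wav"

history = []
user_input("Remove filler words", [FakeUpload()], history)

# history[-1] is now:
# {"role": "user",
#  "content": "Remove filler words\n\nAudio file: /tmp/gradio/abc123/interview.wav"}
```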
36
  async def bot_response(history):
37
  """
38
+ Generate bot response with streaming, organized by graph nodes
39
  """
40
  if not history or history[-1]["role"] != "user":
41
  return
 
52
  yield history
53
 
54
  # Track current node and organize content by nodes
 
 
55
  nodes_content = {}
56
+ processed_audio_urls = []
57
 
58
  # Stream the response
59
  async for chunk, node_name in agent.stream_chat(user_message):
60
+ # Check if this chunk contains an audio URL
61
+ if "Audio Ready" in chunk and "http" in chunk:
62
+ processed_audio_urls.append(chunk)
63
+ continue
 
64
 
65
  # Add chunk to the current node's content
66
+ if node_name not in nodes_content:
67
+ nodes_content[node_name] = ""
68
+
69
  if chunk:
70
  nodes_content[node_name] += chunk
71
 
72
  # Build the formatted content with node headers
73
  formatted_content = ""
74
+
75
  for node, content in nodes_content.items():
76
  if content.strip(): # Only show nodes that have content
77
+ node_emoji = get_node_emoji(node)
78
+ formatted_content += f"**{node_emoji} {format_node_name(node)}**\n\n{content}\n\n"
79
+
80
+ # Add processed audio URLs at the end
81
+ if processed_audio_urls:
82
+ formatted_content += "**🎡 Processed Audio Files:**\n"
83
+ for audio_url in processed_audio_urls:
84
+ formatted_content += f"{audio_url}\n"
85
+ formatted_content += "\n"
86
 
87
  # Update the chat history
88
  history[-1]["content"] = formatted_content.rstrip()
 
96
  history.append({"role": "assistant", "content": f"❌ **Error**: {str(e)}"})
97
  yield history
98
 
99
+ def get_node_emoji(node_name: str) -> str:
100
+ """Get emoji for different node types."""
101
+ node_emojis = {
102
+ "router": "πŸ”€",
103
+ "chat": "πŸ’¬",
104
+ "script_generator": "πŸ“",
105
+ "planner": "πŸ“‹",
106
+ "audio_processor": "πŸ”§",
107
+ "validator": "βœ…",
108
+ "final_response": "🎯"
109
+ }
110
+ return node_emojis.get(node_name, "βš™οΈ")
111
+
112
+ def format_node_name(node_name: str) -> str:
113
+ """Format node name for display."""
114
+ name_mapping = {
115
+ "router": "Routing Request",
116
+ "chat": "Chat Response",
117
+ "script_generator": "Generating Transcripts",
118
+ "planner": "Creating Execution Plan",
119
+ "audio_processor": "Processing Audio",
120
+ "validator": "Validating Results",
121
+ "final_response": "Final Results"
122
+ }
123
+ return name_mapping.get(node_name, node_name.replace("_", " ").title())
124
+
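As a small usage sketch, a chunk streamed from the `script_generator` node ends up under a "πŸ“ Generating Transcripts" header, while an unmapped name (the hypothetical `noise_removal` below) falls back to the defaults:

```python
print(get_node_emoji("script_generator"), format_node_name("script_generator"))
# πŸ“ Generating Transcripts

print(get_node_emoji("noise_removal"), format_node_name("noise_removal"))
# βš™οΈ Noise Removal  (unmapped names get a gear icon and a title-cased label)
```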
125
  def bot_response_sync(history):
126
  """
127
  Synchronous wrapper for the async bot response
 
140
 
141
  def create_interface():
142
  """
143
+ Create and return the enhanced Gradio interface
144
  """
145
+ with gr.Blocks(
146
+ title="Audio Agent - Professional Audio Processing",
147
+ theme=gr.themes.Soft(),
148
+ css="""
149
+ .audio-upload-area {
150
+ border: 2px dashed #ccc;
151
+ border-radius: 10px;
152
+ padding: 20px;
153
+ text-align: center;
154
+ margin: 10px 0;
155
+ }
156
+ .processed-audio {
157
+ background: #f0f9ff;
158
+ border: 1px solid #0891b2;
159
+ border-radius: 8px;
160
+ padding: 15px;
161
+ margin: 10px 0;
162
+ }
163
+ """
164
+ ) as demo:
165
+
166
+ gr.Markdown("""
167
+ # 🎡 Audio Agent - Professional Audio Processing
168
+
169
+ Upload audio files and describe what you want to achieve. I can remove filler words,
170
+ normalize volume, cut segments, combine files, and much more!
171
+
172
+ **Supported formats**: MP3, WAV, M4A, FLAC, AAC, OGG
173
+ """)
174
+
175
+ with gr.Row():
176
+ with gr.Column(scale=2):
177
+ chatbot = gr.Chatbot(
178
+ type="messages",
179
+ height=400,
180
+ show_copy_button=True,
181
+ show_share_button=False,
182
+ avatar_images=(None, "🎡"),
183
+ bubble_full_width=False
184
+ )
185
+
186
+ with gr.Column(scale=1):
187
+ gr.Markdown("### 🎡 Upload Audio Files")
188
+
189
+ audio_files = gr.File(
190
+ file_count="multiple",
191
+ file_types=["audio"],
192
+ label="Select Audio Files",
193
+ height=150
194
+ )
195
+
196
+ gr.Markdown("""
197
+ **Quick Examples:**
198
+ - "Remove filler words and normalize volume"
199
+ - "Cut this audio from 30 seconds to 2 minutes"
200
+ - "Combine these files into a dialogue"
201
+ - "Apply fade effects and enhance quality"
202
+ """)
203
 
204
  with gr.Row():
205
  msg = gr.Textbox(
206
+ label="Describe what you want to do",
207
+ placeholder="e.g., 'Remove filler words and improve audio quality' or 'What tools are available?'",
208
+ lines=3,
209
  scale=4
210
  )
211
+ send_btn = gr.Button("πŸš€ Process Audio", variant="primary", scale=1, size="lg")
212
 
213
+ with gr.Row():
214
+ clear_btn = gr.Button("πŸ—‘οΈ Clear Chat", variant="secondary")
215
+ examples_btn = gr.Button("πŸ’‘ Show Examples", variant="secondary")
216
+
217
+ # Examples section (initially hidden)
218
+ examples_section = gr.Markdown(
219
+ """
220
+ ### πŸ“š Example Requests
221
+
222
+ **Audio Enhancement:**
223
+ - "Clean up this recording - remove filler words and background noise"
224
+ - "Normalize the volume and add fade effects"
225
+ - "Make this audio sound more professional"
226
+
227
+ **Audio Editing:**
228
+ - "Cut the audio from 1:30 to 3:45"
229
+ - "Speed up this recording by 1.5x"
230
+ - "Reverse this audio clip"
231
+
232
+ **Dialogue Creation:**
233
+ - "Combine these two audio files into a conversation"
234
+ - "Create a dialogue with proper transitions between speakers"
235
+
236
+ **Information & Analysis:**
237
+ - "Generate a transcript with timestamps"
238
+ - "What audio processing tools are available?"
239
+ - "How does audio normalization work?"
240
+ """,
241
+ visible=False
242
+ )
243
 
244
  # Handle user input and bot response
245
+ def handle_submit(message, files, history):
246
+ return user_input(message, files, history)
247
+
248
  msg.submit(
249
+ handle_submit,
250
+ [msg, audio_files, chatbot],
251
+ [msg, audio_files, chatbot],
252
  queue=False
253
  ).then(
254
  bot_response_sync,
 
257
  )
258
 
259
  send_btn.click(
260
+ handle_submit,
261
+ [msg, audio_files, chatbot],
262
+ [msg, audio_files, chatbot],
263
  queue=False
264
  ).then(
265
  bot_response_sync,
 
269
 
270
  # Clear chat
271
  clear_btn.click(
272
+ lambda: ([], []),
273
  None,
274
+ [chatbot, audio_files],
275
  queue=False
276
  )
277
+
278
+ # Toggle examples visibility (a Markdown input passes its text value, so track visibility in gr.State)
279
+ examples_visible = gr.State(False)
280
+ def toggle_examples(visible):
281
+ return gr.update(visible=not visible), not visible
282
+ examples_btn.click(
283
+ toggle_examples,
284
+ examples_visible,
285
+ [examples_section, examples_visible]
286
+ )
287
 
288
  return demo
289
 
290
  if __name__ == "__main__":
291
  demo = create_interface()
292
+ demo.launch(
293
+ share=False,
294
+ server_name="0.0.0.0",
295
+ server_port=7861,
296
+ show_error=True
297
+ )