Unknown504 committed on
Commit 19ea30a · verified · 1 Parent(s): 7a5fe7a

Upload folder using huggingface_hub

.dockerignore CHANGED
@@ -1,2 +1,5 @@
 data
-tmp
+tmp
+results
+
+.env
.env.example CHANGED
@@ -30,20 +30,32 @@ UNBOUND_API_KEY=
 SiliconFLOW_ENDPOINT=https://api.siliconflow.cn/v1/
 SiliconFLOW_API_KEY=
 
+IBM_ENDPOINT=https://us-south.ml.cloud.ibm.com
+IBM_API_KEY=
+IBM_PROJECT_ID=
+
+GROK_ENDPOINT="https://api.x.ai/v1"
+GROK_API_KEY=
+
+# Set default LLM
+DEFAULT_LLM=openai
+
+
 # Set to false to disable anonymized telemetry
 ANONYMIZED_TELEMETRY=false
 
 # LogLevel: Set to debug to enable verbose logging, set to result to get results only. Available: result | debug | info
 BROWSER_USE_LOGGING_LEVEL=info
 
-# Chrome settings
-CHROME_PATH=
-CHROME_USER_DATA=
-CHROME_DEBUGGING_PORT=9222
-CHROME_DEBUGGING_HOST=localhost
+# Browser settings
+BROWSER_PATH=
+BROWSER_USER_DATA=
+BROWSER_DEBUGGING_PORT=9222
+BROWSER_DEBUGGING_HOST=localhost
 # Set to true to keep browser open between AI tasks
-CHROME_PERSISTENT_SESSION=false
-CHROME_CDP=
+KEEP_BROWSER_OPEN=true
+USE_OWN_BROWSER=false
+BROWSER_CDP=
 # Display settings
 # Format: WIDTHxHEIGHTxDEPTH
 RESOLUTION=1920x1080x24
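
The commit renames the `CHROME_*` settings to browser-agnostic `BROWSER_*` names and adds IBM watsonx and Grok endpoints. A minimal sketch of how the renamed variables might be consumed at runtime, assuming `python-dotenv`; `get_browser_settings` is an illustrative helper, not part of the repository:

```python
# Sketch only: reads the BROWSER_* settings introduced above.
import os

from dotenv import load_dotenv


def get_browser_settings() -> dict:
    load_dotenv()  # pull values from .env into the process environment
    return {
        "browser_path": os.getenv("BROWSER_PATH") or None,
        "user_data_dir": os.getenv("BROWSER_USER_DATA") or None,
        "debugging_port": int(os.getenv("BROWSER_DEBUGGING_PORT", "9222")),
        "debugging_host": os.getenv("BROWSER_DEBUGGING_HOST", "localhost"),
        "keep_browser_open": os.getenv("KEEP_BROWSER_OPEN", "true").lower() == "true",
        "use_own_browser": os.getenv("USE_OWN_BROWSER", "false").lower() == "true",
        "cdp_url": os.getenv("BROWSER_CDP") or None,
    }
```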
.github/workflows/build.yml ADDED
@@ -0,0 +1,124 @@
+name: Build Docker Image
+
+on:
+  release:
+    types: [published]
+  push:
+    branches: [main]
+
+env:
+  GITHUB_CR_REPO: ghcr.io/${{ github.repository }}
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        platform:
+          - linux/amd64
+          - linux/arm64
+    steps:
+      - name: Prepare
+        run: |
+          platform=${{ matrix.platform }}
+          echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV
+
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            ${{ env.GITHUB_CR_REPO }}
+
+      - name: Login to GHCR
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Build and push by digest
+        id: build
+        uses: docker/build-push-action@v6
+        with:
+          platforms: ${{ matrix.platform }}
+          labels: ${{ steps.meta.outputs.labels }}
+          tags: |
+            ${{ env.GITHUB_CR_REPO }}
+          build-args: |
+            TARGETPLATFORM=${{ matrix.platform }}
+          outputs: type=image,push-by-digest=true,name-canonical=true,push=true
+
+      - name: Export digest
+        run: |
+          mkdir -p ${{ runner.temp }}/digests
+          digest="${{ steps.build.outputs.digest }}"
+          touch "${{ runner.temp }}/digests/${digest#sha256:}"
+
+      - name: Upload digest
+        uses: actions/upload-artifact@v4
+        with:
+          name: digests-${{ env.PLATFORM_PAIR }}
+          path: ${{ runner.temp }}/digests/*
+          if-no-files-found: error
+          retention-days: 1
+
+  merge:
+    runs-on: ubuntu-latest
+    needs:
+      - build
+    steps:
+      - name: Download digests
+        uses: actions/download-artifact@v4
+        with:
+          path: ${{ runner.temp }}/digests
+          pattern: digests-*
+          merge-multiple: true
+
+      - name: Login to GHCR
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            ${{ env.GITHUB_CR_REPO }}
+          tags: |
+            type=ref,event=branch
+            type=ref,event=pr
+            type=semver,pattern={{version}}
+            type=semver,pattern={{major}}
+
+      - name: Docker tags
+        run: |
+          tags=$(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON")
+          if [ -z "$tags" ]; then
+            echo "DOCKER_METADATA_OUTPUT_VERSION=${{ github.ref_name }}" >> $GITHUB_ENV
+            tags="-t ${{ env.GITHUB_CR_REPO }}:${{ github.ref_name }}"
+          fi
+          echo "DOCKER_METADATA_TAGS=$tags" >> $GITHUB_ENV
+
+      - name: Create manifest list and push
+        working-directory: ${{ runner.temp }}/digests
+        run: |
+          docker buildx imagetools create ${{ env.DOCKER_METADATA_TAGS }} \
+            $(printf '${{ env.GITHUB_CR_REPO }}@sha256:%s ' *)
+
+      - name: Inspect image
+        run: |
+          docker buildx imagetools inspect ${{ env.GITHUB_CR_REPO }}:${{ env.DOCKER_METADATA_OUTPUT_VERSION }}
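
The merge job's "Docker tags" step turns the JSON emitted by `docker/metadata-action` into a string of `-t` flags for `imagetools create`. For reference, the same transformation as the `jq` one-liner, sketched in Python with a made-up input:

```python
# Equivalent of: jq -cr '.tags | map("-t " + .) | join(" ")'
import json

metadata_json = '{"tags": ["ghcr.io/owner/repo:main", "ghcr.io/owner/repo:1.2.0"]}'  # example input
tags = " ".join("-t " + tag for tag in json.loads(metadata_json)["tags"])
print(tags)  # -t ghcr.io/owner/repo:main -t ghcr.io/owner/repo:1.2.0
```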
.gitignore CHANGED
@@ -187,3 +187,6 @@ data/
 
 # For Config Files (Current Settings)
 .config.pkl
+*.pdf
+
+workflow
Dockerfile CHANGED
@@ -1,5 +1,9 @@
 FROM python:3.11-slim
 
+# Set platform for multi-arch builds (Docker Buildx will set this)
+ARG TARGETPLATFORM
+ARG NODE_MAJOR=20
+
 # Install system dependencies
 RUN apt-get update && apt-get install -y \
     wget \
@@ -28,7 +32,6 @@ RUN apt-get update && apt-get install -y \
     fonts-liberation \
     dbus \
     xauth \
-    xvfb \
     x11vnc \
     tigervnc-tools \
    supervisor \
@@ -40,6 +43,7 @@ RUN apt-get update && apt-get install -y \
     fonts-dejavu \
     fonts-dejavu-core \
     fonts-dejavu-extra \
+    vim \
     && rm -rf /var/lib/apt/lists/*
 
 # Install noVNC
@@ -47,40 +51,49 @@ RUN git clone https://github.com/novnc/noVNC.git /opt/novnc \
     && git clone https://github.com/novnc/websockify /opt/novnc/utils/websockify \
     && ln -s /opt/novnc/vnc.html /opt/novnc/index.html
 
-# Set platform for ARM64 compatibility
-ARG TARGETPLATFORM=linux/amd64
+# Install Node.js using NodeSource PPA
+RUN mkdir -p /etc/apt/keyrings \
+    && curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
+    && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_$NODE_MAJOR.x nodistro main" | tee /etc/apt/sources.list.d/nodesource.list \
+    && apt-get update \
+    && apt-get install nodejs -y \
+    && rm -rf /var/lib/apt/lists/*
+
+# Verify Node.js and npm installation (optional, but good for debugging)
+RUN node -v && npm -v && npx -v
 
 # Set up working directory
 WORKDIR /app
 
 # Copy requirements and install Python dependencies
 COPY requirements.txt .
+
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Install Playwright and browsers with system dependencies
-ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
-RUN playwright install --with-deps chromium
-RUN playwright install-deps
+# Install Playwright browsers and dependencies.
+# Playwright downloads browsers into PLAYWRIGHT_BROWSERS_PATH when the
+# variable is set at install time; the same path must be visible at runtime
+# so the application can find the installed browser.
+ENV PLAYWRIGHT_BROWSERS_PATH=/ms-browsers
+RUN mkdir -p $PLAYWRIGHT_BROWSERS_PATH
+
+# Recommended: install Google Chrome (instead of just Chromium) for better undetectability.
+# The 'playwright install chrome' command downloads and places it, and
+# '--with-deps' pulls in the required system packages as well.
+# RUN playwright install chrome --with-deps
+
+# Alternative: install Chromium if Google Chrome is problematic in certain environments
+RUN playwright install chromium --with-deps
+
 
 # Copy the application code
 COPY . .
 
-# Set environment variables
-ENV PYTHONUNBUFFERED=1
-ENV BROWSER_USE_LOGGING_LEVEL=info
-ENV CHROME_PATH=/ms-playwright/chromium-*/chrome-linux/chrome
-ENV ANONYMIZED_TELEMETRY=false
-ENV DISPLAY=:99
-ENV RESOLUTION=1920x1080x24
-ENV VNC_PASSWORD=vncpassword
-ENV CHROME_PERSISTENT_SESSION=true
-ENV RESOLUTION_WIDTH=1920
-ENV RESOLUTION_HEIGHT=1080
-
 # Set up supervisor configuration
 RUN mkdir -p /var/log/supervisor
 COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
 
-EXPOSE 7788 6080 5901
+EXPOSE 7788 6080 5901 9222
 
 CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
+#CMD ["/bin/bash"]
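
The Dockerfile now pins `PLAYWRIGHT_BROWSERS_PATH=/ms-browsers` so the browsers installed at build time can be found at runtime. A quick sanity check of that assumption, sketched with Playwright's sync API (illustrative, not part of the repository):

```python
# Sketch only: verify the container's Chromium launches from /ms-browsers.
import os

from playwright.sync_api import sync_playwright

# Must match the ENV baked into the image; Playwright looks here for browsers.
os.environ.setdefault("PLAYWRIGHT_BROWSERS_PATH", "/ms-browsers")

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto("https://example.com")
    print(page.title())  # "Example Domain" if the install is healthy
    browser.close()
```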
README.md CHANGED
@@ -2,7 +2,7 @@
 title: web-ui
 app_file: webui.py
 sdk: gradio
-sdk_version: 5.23.1
+sdk_version: 5.27.0
 ---
 <img src="./assets/web-ui.png" alt="Browser Use Web UI" width="full"/>
 
@@ -29,10 +29,6 @@ We would like to officially thank [WarmShao](https://github.com/warmshao) for hi
 
 ## Installation Guide
 
-### Prerequisites
-- Python 3.11 or higher
-- Git (for cloning the repository)
-
 ### Option 1: Local Installation
 
 Read the [quickstart guide](https://docs.browser-use.com/quickstart#prepare-the-environment) or follow the steps below to get started.
@@ -71,15 +67,13 @@ Install Python packages:
 uv pip install -r requirements.txt
 ```
 
-Install Browsers in Playwright:
-You can install specific browsers by running:
+Install browsers in Playwright:
 ```bash
-playwright install --with-deps chromium
+playwright install --with-deps
 ```
-
-To install all browsers:
+Or you can install specific browsers by running:
 ```bash
-playwright install
+playwright install chromium --with-deps
 ```
 
 #### Step 4: Configure Environment
@@ -94,6 +88,29 @@ cp .env.example .env
 ```
 2. Open `.env` in your preferred text editor and add your API keys and other settings
 
+#### Step 5: Enjoy the web-ui
+1. **Run the WebUI:**
+    ```bash
+    python webui.py --ip 127.0.0.1 --port 7788
+    ```
+2. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`.
+3. **Using Your Own Browser (Optional):**
+    - Set `BROWSER_PATH` to the executable path of your browser and `BROWSER_USER_DATA` to the user data directory of your browser. Leave `BROWSER_USER_DATA` empty if you want to use local user data.
+      - Windows
+        ```env
+        BROWSER_PATH="C:\Program Files\Google\Chrome\Application\chrome.exe"
+        BROWSER_USER_DATA="C:\Users\YourUsername\AppData\Local\Google\Chrome\User Data"
+        ```
+        > Note: Replace `YourUsername` with your actual Windows username for Windows systems.
+      - Mac
+        ```env
+        BROWSER_PATH="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
+        BROWSER_USER_DATA="/Users/YourUsername/Library/Application Support/Google/Chrome"
+        ```
+    - Close all Chrome windows
+    - Open the WebUI in a non-Chrome browser, such as Firefox or Edge. This is important because the persistent browser context will use the Chrome data when running the agent.
+    - Check the "Use Own Browser" option within the Browser Settings.
+
 ### Option 2: Docker Installation
 
 #### Prerequisites
@@ -101,14 +118,14 @@ cp .env.example .env
 - [Docker Desktop](https://www.docker.com/products/docker-desktop/) (For Windows/macOS)
 - [Docker Engine](https://docs.docker.com/engine/install/) and [Docker Compose](https://docs.docker.com/compose/install/) (For Linux)
 
-#### Installation Steps
-1. Clone the repository:
+#### Step 1: Clone the Repository
 ```bash
 git clone https://github.com/browser-use/web-ui.git
 cd web-ui
 ```
 
-2. Create and configure environment file:
+#### Step 2: Configure Environment
+1. Create a copy of the example environment file:
 - Windows (Command Prompt):
 ```bash
 copy .env.example .env
@@ -117,122 +134,23 @@ copy .env.example .env
 ```bash
 cp .env.example .env
 ```
-Edit `.env` with your preferred text editor and add your API keys
+2. Open `.env` in your preferred text editor and add your API keys and other settings
 
-3. Run with Docker:
+#### Step 3: Docker Build and Run
 ```bash
-# Build and start the container with default settings (browser closes after AI tasks)
 docker compose up --build
 ```
+For ARM64 systems (e.g., Apple Silicon Macs), please run the following command:
 ```bash
-# Or run with persistent browser (browser stays open between AI tasks)
-CHROME_PERSISTENT_SESSION=true docker compose up --build
+TARGETPLATFORM=linux/arm64 docker compose up --build
 ```
 
-
-4. Access the Application:
-- Web Interface: Open `http://localhost:7788` in your browser
+#### Step 4: Enjoy the web-ui and vnc
+- Web-UI: Open `http://localhost:7788` in your browser
 - VNC Viewer (for watching browser interactions): Open `http://localhost:6080/vnc.html`
   - Default VNC password: "youvncpassword"
  - Can be changed by setting `VNC_PASSWORD` in your `.env` file
 
-## Usage
-
-### Local Setup
-1. **Run the WebUI:**
-    After completing the installation steps above, start the application:
-    ```bash
-    python webui.py --ip 127.0.0.1 --port 7788
-    ```
-2. WebUI options:
-   - `--ip`: The IP address to bind the WebUI to. Default is `127.0.0.1`.
-   - `--port`: The port to bind the WebUI to. Default is `7788`.
-   - `--theme`: The theme for the user interface. Default is `Ocean`.
-     - **Default**: The standard theme with a balanced design.
-     - **Soft**: A gentle, muted color scheme for a relaxed viewing experience.
-     - **Monochrome**: A grayscale theme with minimal color for simplicity and focus.
-     - **Glass**: A sleek, semi-transparent design for a modern appearance.
-     - **Origin**: A classic, retro-inspired theme for a nostalgic feel.
-     - **Citrus**: A vibrant, citrus-inspired palette with bright and fresh colors.
-     - **Ocean** (default): A blue, ocean-inspired theme providing a calming effect.
-   - `--dark-mode`: Enables dark mode for the user interface.
-3. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`.
-4. **Using Your Own Browser (Optional):**
-    - Set `CHROME_PATH` to the executable path of your browser and `CHROME_USER_DATA` to the user data directory of your browser. Leave `CHROME_USER_DATA` empty if you want to use local user data.
-      - Windows
-        ```env
-        CHROME_PATH="C:\Program Files\Google\Chrome\Application\chrome.exe"
-        CHROME_USER_DATA="C:\Users\YourUsername\AppData\Local\Google\Chrome\User Data"
-        ```
-        > Note: Replace `YourUsername` with your actual Windows username for Windows systems.
-      - Mac
-        ```env
-        CHROME_PATH="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
-        CHROME_USER_DATA="/Users/YourUsername/Library/Application Support/Google/Chrome"
-        ```
-    - Close all Chrome windows
-    - Open the WebUI in a non-Chrome browser, such as Firefox or Edge. This is important because the persistent browser context will use the Chrome data when running the agent.
-    - Check the "Use Own Browser" option within the Browser Settings.
-5. **Keep Browser Open (Optional):**
-    - Set `CHROME_PERSISTENT_SESSION=true` in the `.env` file.
-
-### Docker Setup
-1. **Environment Variables:**
-   - All configuration is done through the `.env` file
-   - Available environment variables:
-     ```
-     # LLM API Keys
-     OPENAI_API_KEY=your_key_here
-     ANTHROPIC_API_KEY=your_key_here
-     GOOGLE_API_KEY=your_key_here
-
-     # Browser Settings
-     CHROME_PERSISTENT_SESSION=true   # Set to true to keep browser open between AI tasks
-     RESOLUTION=1920x1080x24          # Custom resolution format: WIDTHxHEIGHTxDEPTH
-     RESOLUTION_WIDTH=1920            # Custom width in pixels
-     RESOLUTION_HEIGHT=1080           # Custom height in pixels
-
-     # VNC Settings
-     VNC_PASSWORD=your_vnc_password   # Optional, defaults to "vncpassword"
-     ```
-
-2. **Platform Support:**
-   - Supports both AMD64 and ARM64 architectures
-   - For ARM64 systems (e.g., Apple Silicon Macs), the container will automatically use the appropriate image
-
-3. **Browser Persistence Modes:**
-   - **Default Mode (CHROME_PERSISTENT_SESSION=false):**
-     - Browser opens and closes with each AI task
-     - Clean state for each interaction
-     - Lower resource usage
-
-   - **Persistent Mode (CHROME_PERSISTENT_SESSION=true):**
-     - Browser stays open between AI tasks
-     - Maintains history and state
-     - Allows viewing previous AI interactions
-     - Set in `.env` file or via environment variable when starting container
-
-4. **Viewing Browser Interactions:**
-   - Access the noVNC viewer at `http://localhost:6080/vnc.html`
-   - Enter the VNC password (default: "vncpassword" or what you set in VNC_PASSWORD)
-   - Direct VNC access available on port 5900 (mapped to container port 5901)
-   - You can now see all browser interactions in real-time
-
-5. **Container Management:**
-   ```bash
-   # Start with persistent browser
-   CHROME_PERSISTENT_SESSION=true docker compose up -d
-
-   # Start with default mode (browser closes after tasks)
-   docker compose up -d
-
-   # View logs
-   docker compose logs -f
-
-   # Stop the container
-   docker compose down
-   ```
-
 ## Changelog
 - [x] **2025/01/26:** Thanks to @vvincent1234. Now browser-use-webui can combine with DeepSeek-r1 to engage in deep thinking!
 - [x] **2025/01/10:** Thanks to @casistack. Now we have Docker Setup option and also Support keep browser open between tasks. [Video tutorial demo](https://github.com/browser-use/web-ui/issues/1#issuecomment-2582511750).
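
For reference, the "Use Own Browser" option in the README hunk above amounts, roughly, to launching the user's own Chrome with its profile directory. A sketch of that idea using Playwright's `launch_persistent_context`; the paths mirror the README's Mac example, and this is not the repository's actual code path:

```python
# Sketch only: drive the user's own Chrome profile, as "Use Own Browser" implies.
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    context = p.chromium.launch_persistent_context(
        user_data_dir="/Users/YourUsername/Library/Application Support/Google/Chrome",
        executable_path="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
        headless=False,
    )
    page = context.new_page()
    page.goto("https://example.com")
    context.close()
```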
docker-compose.yml CHANGED
@@ -1,59 +1,80 @@
 services:
+  # debug: docker compose run --rm -it browser-use-webui bash
   browser-use-webui:
-    platform: linux/amd64
+    # image: ghcr.io/browser-use/web-ui  # Using precompiled image
     build:
       context: .
-      dockerfile: ${DOCKERFILE:-Dockerfile}
+      dockerfile: Dockerfile
       args:
         TARGETPLATFORM: ${TARGETPLATFORM:-linux/amd64}
     ports:
-      - "7788:7788"  # Gradio default port
-      - "6080:6080"  # noVNC web interface
-      - "5901:5901"  # VNC port
-      - "9222:9222"  # Chrome remote debugging port
+      - "7788:7788"
+      - "6080:6080"
+      - "5901:5901"
+      - "9222:9222"
     environment:
+      # LLM API Keys & Endpoints
       - OPENAI_ENDPOINT=${OPENAI_ENDPOINT:-https://api.openai.com/v1}
       - OPENAI_API_KEY=${OPENAI_API_KEY:-}
-      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
       - ANTHROPIC_ENDPOINT=${ANTHROPIC_ENDPOINT:-https://api.anthropic.com}
+      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
       - GOOGLE_API_KEY=${GOOGLE_API_KEY:-}
       - AZURE_OPENAI_ENDPOINT=${AZURE_OPENAI_ENDPOINT:-}
       - AZURE_OPENAI_API_KEY=${AZURE_OPENAI_API_KEY:-}
+      - AZURE_OPENAI_API_VERSION=${AZURE_OPENAI_API_VERSION:-2025-01-01-preview}
      - DEEPSEEK_ENDPOINT=${DEEPSEEK_ENDPOINT:-https://api.deepseek.com}
       - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
       - OLLAMA_ENDPOINT=${OLLAMA_ENDPOINT:-http://localhost:11434}
-      - MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
       - MISTRAL_ENDPOINT=${MISTRAL_ENDPOINT:-https://api.mistral.ai/v1}
+      - MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
       - ALIBABA_ENDPOINT=${ALIBABA_ENDPOINT:-https://dashscope.aliyuncs.com/compatible-mode/v1}
       - ALIBABA_API_KEY=${ALIBABA_API_KEY:-}
       - MOONSHOT_ENDPOINT=${MOONSHOT_ENDPOINT:-https://api.moonshot.cn/v1}
       - MOONSHOT_API_KEY=${MOONSHOT_API_KEY:-}
-      - BROWSER_USE_LOGGING_LEVEL=${BROWSER_USE_LOGGING_LEVEL:-info}
+      - UNBOUND_ENDPOINT=${UNBOUND_ENDPOINT:-https://api.getunbound.ai}
+      - UNBOUND_API_KEY=${UNBOUND_API_KEY:-}
+      - SiliconFLOW_ENDPOINT=${SiliconFLOW_ENDPOINT:-https://api.siliconflow.cn/v1/}
+      - SiliconFLOW_API_KEY=${SiliconFLOW_API_KEY:-}
+      - IBM_ENDPOINT=${IBM_ENDPOINT:-https://us-south.ml.cloud.ibm.com}
+      - IBM_API_KEY=${IBM_API_KEY:-}
+      - IBM_PROJECT_ID=${IBM_PROJECT_ID:-}
+
+      # Application Settings
       - ANONYMIZED_TELEMETRY=${ANONYMIZED_TELEMETRY:-false}
-      - CHROME_PATH=/usr/bin/google-chrome
-      - CHROME_USER_DATA=/app/data/chrome_data
-      - CHROME_PERSISTENT_SESSION=${CHROME_PERSISTENT_SESSION:-false}
-      - CHROME_CDP=${CHROME_CDP:-http://localhost:9222}
+      - BROWSER_USE_LOGGING_LEVEL=${BROWSER_USE_LOGGING_LEVEL:-info}
+
+      # Browser Settings
+      - BROWSER_PATH=
+      - BROWSER_USER_DATA=
+      - BROWSER_DEBUGGING_PORT=${BROWSER_DEBUGGING_PORT:-9222}
+      - BROWSER_DEBUGGING_HOST=localhost
+      - USE_OWN_BROWSER=false
+      - KEEP_BROWSER_OPEN=true
+      - BROWSER_CDP=${BROWSER_CDP:-}  # e.g., http://localhost:9222
+
+      # Display Settings
       - DISPLAY=:99
-      - PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
+      # This ENV is used by the Dockerfile during build time if playwright respects it.
+      # It's not strictly needed at runtime by docker-compose unless your app or scripts also read it.
+      - PLAYWRIGHT_BROWSERS_PATH=/ms-browsers  # Matches Dockerfile ENV
       - RESOLUTION=${RESOLUTION:-1920x1080x24}
       - RESOLUTION_WIDTH=${RESOLUTION_WIDTH:-1920}
       - RESOLUTION_HEIGHT=${RESOLUTION_HEIGHT:-1080}
-      - VNC_PASSWORD=${VNC_PASSWORD:-vncpassword}
-      - CHROME_DEBUGGING_PORT=9222
-      - CHROME_DEBUGGING_HOST=localhost
+
+      # VNC Settings
+      - VNC_PASSWORD=${VNC_PASSWORD:-youvncpassword}
+
     volumes:
       - /tmp/.X11-unix:/tmp/.X11-unix
+      # - ./my_chrome_data:/app/data/chrome_data  # Optional: persist browser data
     restart: unless-stopped
     shm_size: '2gb'
     cap_add:
       - SYS_ADMIN
-    security_opt:
-      - seccomp=unconfined
     tmpfs:
       - /tmp
     healthcheck:
-      test: ["CMD", "nc", "-z", "localhost", "5901"]
+      test: ["CMD", "nc", "-z", "localhost", "5901"]  # VNC port
       interval: 10s
       timeout: 5s
-      retries: 3
+      retries: 3
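
With port 9222 published, an external client can attach to the containerized browser over CDP, which is what `BROWSER_CDP` points at. A minimal sketch, assuming the stack above is running and a Chromium instance inside it is listening on 9222:

```python
# Sketch only: attach to the container's browser over the published CDP port.
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.connect_over_cdp("http://localhost:9222")
    context = browser.contexts[0] if browser.contexts else browser.new_context()
    page = context.new_page()
    page.goto("https://example.com")
    print(page.title())
```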
requirements.txt CHANGED
@@ -1,7 +1,10 @@
-browser-use==0.1.40
+browser-use==0.1.48
 pyperclip==1.9.0
-gradio==5.23.1
+gradio==5.27.0
 json-repair
 langchain-mistralai==0.2.4
-langchain-google-genai==2.0.8
 MainContentExtractor==0.0.4
+langchain-ibm==0.3.10
+langchain_mcp_adapters==0.0.9
+langgraph==0.3.34
+langchain-community
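
`langgraph` joins the requirements to support the new deep-research agent below, which builds a `StateGraph` over a `TypedDict` state. A minimal, self-contained example of that pattern (node and field names are illustrative, not the agent's real graph):

```python
# Smallest useful langgraph example: one node that updates a typed state.
from typing import TypedDict

from langgraph.graph import END, StateGraph


class State(TypedDict):
    count: int


def bump(state: State) -> dict:
    # Nodes return partial state updates, merged into the graph state.
    return {"count": state["count"] + 1}


graph = StateGraph(State)
graph.add_node("bump", bump)
graph.set_entry_point("bump")
graph.add_edge("bump", END)
app = graph.compile()
print(app.invoke({"count": 0}))  # {'count': 1}
```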
src/agent/browser_use/browser_use_agent.py ADDED
@@ -0,0 +1,169 @@
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+
+# from lmnr.sdk.decorators import observe
+from browser_use.agent.gif import create_history_gif
+from browser_use.agent.service import Agent, AgentHookFunc
+from browser_use.agent.views import (
+    ActionResult,
+    AgentHistory,
+    AgentHistoryList,
+    AgentStepInfo,
+    ToolCallingMethod,
+)
+from browser_use.browser.views import BrowserStateHistory
+from browser_use.utils import time_execution_async
+from dotenv import load_dotenv
+from browser_use.agent.message_manager.utils import is_model_without_tool_support
+
+load_dotenv()
+logger = logging.getLogger(__name__)
+
+SKIP_LLM_API_KEY_VERIFICATION = (
+    os.environ.get("SKIP_LLM_API_KEY_VERIFICATION", "false").lower()[0] in "ty1"
+)
+
+
+class BrowserUseAgent(Agent):
+    def _set_tool_calling_method(self) -> ToolCallingMethod | None:
+        tool_calling_method = self.settings.tool_calling_method
+        if tool_calling_method == 'auto':
+            if is_model_without_tool_support(self.model_name):
+                return 'raw'
+            elif self.chat_model_library == 'ChatGoogleGenerativeAI':
+                return None
+            elif self.chat_model_library == 'ChatOpenAI':
+                return 'function_calling'
+            elif self.chat_model_library == 'AzureChatOpenAI':
+                return 'function_calling'
+            else:
+                return None
+        else:
+            return tool_calling_method
+
+    @time_execution_async("--run (agent)")
+    async def run(
+            self, max_steps: int = 100, on_step_start: AgentHookFunc | None = None,
+            on_step_end: AgentHookFunc | None = None
+    ) -> AgentHistoryList:
+        """Execute the task with maximum number of steps"""
+
+        loop = asyncio.get_event_loop()
+
+        # Set up the Ctrl+C signal handler with callbacks specific to this agent
+        from browser_use.utils import SignalHandler
+
+        signal_handler = SignalHandler(
+            loop=loop,
+            pause_callback=self.pause,
+            resume_callback=self.resume,
+            custom_exit_callback=None,  # No special cleanup needed on forced exit
+            exit_on_second_int=True,
+        )
+        signal_handler.register()
+
+        try:
+            self._log_agent_run()
+
+            # Execute initial actions if provided
+            if self.initial_actions:
+                result = await self.multi_act(self.initial_actions, check_for_new_elements=False)
+                self.state.last_result = result
+
+            for step in range(max_steps):
+                # Check if waiting for user input after Ctrl+C
+                if self.state.paused:
+                    signal_handler.wait_for_resume()
+                    signal_handler.reset()
+
+                # Check if we should stop due to too many failures
+                if self.state.consecutive_failures >= self.settings.max_failures:
+                    logger.error(f'❌ Stopping due to {self.settings.max_failures} consecutive failures')
+                    break
+
+                # Check control flags before each step
+                if self.state.stopped:
+                    logger.info('Agent stopped')
+                    break
+
+                while self.state.paused:
+                    await asyncio.sleep(0.2)  # Small delay to prevent CPU spinning
+                    if self.state.stopped:  # Allow stopping while paused
+                        break
+
+                if on_step_start is not None:
+                    await on_step_start(self)
+
+                step_info = AgentStepInfo(step_number=step, max_steps=max_steps)
+                await self.step(step_info)
+
+                if on_step_end is not None:
+                    await on_step_end(self)
+
+                if self.state.history.is_done():
+                    if self.settings.validate_output and step < max_steps - 1:
+                        if not await self._validate_output():
+                            continue
+
+                    await self.log_completion()
+                    break
+            else:
+                error_message = 'Failed to complete task in maximum steps'
+
+                self.state.history.history.append(
+                    AgentHistory(
+                        model_output=None,
+                        result=[ActionResult(error=error_message, include_in_memory=True)],
+                        state=BrowserStateHistory(
+                            url='',
+                            title='',
+                            tabs=[],
+                            interacted_element=[],
+                            screenshot=None,
+                        ),
+                        metadata=None,
+                    )
+                )
+
+                logger.info(f'❌ {error_message}')
+
+            return self.state.history
+
+        except KeyboardInterrupt:
+            # Already handled by our signal handler, but catch any direct KeyboardInterrupt as well
+            logger.info('Got KeyboardInterrupt during execution, returning current history')
+            return self.state.history
+
+        finally:
+            # Unregister signal handlers before cleanup
+            signal_handler.unregister()
+
+            if self.settings.save_playwright_script_path:
+                logger.info(
+                    f'Agent run finished. Attempting to save Playwright script to: {self.settings.save_playwright_script_path}'
+                )
+                try:
+                    # Extract sensitive data keys if sensitive_data is provided
+                    keys = list(self.sensitive_data.keys()) if self.sensitive_data else None
+                    # Pass browser and context config to the saving method
+                    self.state.history.save_as_playwright_script(
+                        self.settings.save_playwright_script_path,
+                        sensitive_data_keys=keys,
+                        browser_config=self.browser.config,
+                        context_config=self.browser_context.config,
+                    )
+                except Exception as script_gen_err:
+                    # Log any error during script generation/saving
+                    logger.error(f'Failed to save Playwright script: {script_gen_err}', exc_info=True)
+
+            await self.close()
+
+            if self.settings.generate_gif:
+                output_path: str = 'agent_history.gif'
+                if isinstance(self.settings.generate_gif, str):
+                    output_path = self.settings.generate_gif
+
+                create_history_gif(task=self.task, history=self.state.history, output_path=output_path)
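
`BrowserUseAgent.run` accepts optional `on_step_start`/`on_step_end` hooks, awaited around each step. A usage sketch; the task is made up, and the LLM choice assumes `langchain-openai` is installed (any LangChain chat model supported by browser-use should work):

```python
# Sketch only: run the agent with per-step hooks.
import asyncio

from langchain_openai import ChatOpenAI

from src.agent.browser_use.browser_use_agent import BrowserUseAgent


async def on_step(agent: BrowserUseAgent) -> None:
    # Hooks receive the agent; agent.state.history is the running AgentHistoryList.
    print(f"steps recorded so far: {len(agent.state.history.history)}")


async def main() -> None:
    agent = BrowserUseAgent(
        task="Find the latest browser-use release notes.",  # illustrative task
        llm=ChatOpenAI(model="gpt-4o"),
    )
    history = await agent.run(max_steps=25, on_step_start=on_step, on_step_end=on_step)
    print(history.final_result())


asyncio.run(main())
```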
src/agent/deep_research/deep_research_agent.py ADDED
@@ -0,0 +1,1261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ import logging
4
+ import os
5
+ import threading
6
+ import uuid
7
+ from pathlib import Path
8
+ from typing import Any, Dict, List, Optional, TypedDict
9
+
10
+ from browser_use.browser.browser import BrowserConfig
11
+ from langchain_community.tools.file_management import (
12
+ ListDirectoryTool,
13
+ ReadFileTool,
14
+ WriteFileTool,
15
+ )
16
+
17
+ # Langchain imports
18
+ from langchain_core.messages import (
19
+ AIMessage,
20
+ BaseMessage,
21
+ HumanMessage,
22
+ SystemMessage,
23
+ ToolMessage,
24
+ )
25
+ from langchain_core.prompts import ChatPromptTemplate
26
+ from langchain_core.tools import StructuredTool, Tool
27
+
28
+ # Langgraph imports
29
+ from langgraph.graph import StateGraph
30
+ from pydantic import BaseModel, Field
31
+
32
+ from browser_use.browser.context import BrowserContextConfig
33
+
34
+ from src.agent.browser_use.browser_use_agent import BrowserUseAgent
35
+ from src.browser.custom_browser import CustomBrowser
36
+ from src.controller.custom_controller import CustomController
37
+ from src.utils.mcp_client import setup_mcp_client_and_tools
38
+
39
+ logger = logging.getLogger(__name__)
40
+
41
+ # Constants
42
+ REPORT_FILENAME = "report.md"
43
+ PLAN_FILENAME = "research_plan.md"
44
+ SEARCH_INFO_FILENAME = "search_info.json"
45
+
46
+ _AGENT_STOP_FLAGS = {}
47
+ _BROWSER_AGENT_INSTANCES = {}
48
+
49
+
50
+ async def run_single_browser_task(
51
+ task_query: str,
52
+ task_id: str,
53
+ llm: Any, # Pass the main LLM
54
+ browser_config: Dict[str, Any],
55
+ stop_event: threading.Event,
56
+ use_vision: bool = False,
57
+ ) -> Dict[str, Any]:
58
+ """
59
+ Runs a single BrowserUseAgent task.
60
+ Manages browser creation and closing for this specific task.
61
+ """
62
+ if not BrowserUseAgent:
63
+ return {
64
+ "query": task_query,
65
+ "error": "BrowserUseAgent components not available.",
66
+ }
67
+
68
+ # --- Browser Setup ---
69
+ # These should ideally come from the main agent's config
70
+ headless = browser_config.get("headless", False)
71
+ window_w = browser_config.get("window_width", 1280)
72
+ window_h = browser_config.get("window_height", 1100)
73
+ browser_user_data_dir = browser_config.get("user_data_dir", None)
74
+ use_own_browser = browser_config.get("use_own_browser", False)
75
+ browser_binary_path = browser_config.get("browser_binary_path", None)
76
+ wss_url = browser_config.get("wss_url", None)
77
+ cdp_url = browser_config.get("cdp_url", None)
78
+ disable_security = browser_config.get("disable_security", False)
79
+
80
+ bu_browser = None
81
+ bu_browser_context = None
82
+ try:
83
+ logger.info(f"Starting browser task for query: {task_query}")
84
+ extra_args = []
85
+ if use_own_browser:
86
+ browser_binary_path = os.getenv("BROWSER_PATH", None) or browser_binary_path
87
+ if browser_binary_path == "":
88
+ browser_binary_path = None
89
+ browser_user_data = browser_user_data_dir or os.getenv("BROWSER_USER_DATA", None)
90
+ if browser_user_data:
91
+ extra_args += [f"--user-data-dir={browser_user_data}"]
92
+ else:
93
+ browser_binary_path = None
94
+
95
+ bu_browser = CustomBrowser(
96
+ config=BrowserConfig(
97
+ headless=headless,
98
+ browser_binary_path=browser_binary_path,
99
+ extra_browser_args=extra_args,
100
+ wss_url=wss_url,
101
+ cdp_url=cdp_url,
102
+ new_context_config=BrowserContextConfig(
103
+ window_width=window_w,
104
+ window_height=window_h,
105
+ )
106
+ )
107
+ )
108
+
109
+ context_config = BrowserContextConfig(
110
+ save_downloads_path="./tmp/downloads",
111
+ window_height=window_h,
112
+ window_width=window_w,
113
+ force_new_context=True,
114
+ )
115
+ bu_browser_context = await bu_browser.new_context(config=context_config)
116
+
117
+ # Simple controller example, replace with your actual implementation if needed
118
+ bu_controller = CustomController()
119
+
120
+ # Construct the task prompt for BrowserUseAgent
121
+ # Instruct it to find specific info and return title/URL
122
+ bu_task_prompt = f"""
123
+ Research Task: {task_query}
124
+ Objective: Find relevant information answering the query.
125
+ Output Requirements: For each relevant piece of information found, please provide:
126
+ 1. A concise summary of the information.
127
+ 2. The title of the source page or document.
128
+ 3. The URL of the source.
129
+ Focus on accuracy and relevance. Avoid irrelevant details.
130
+ PDF cannot directly extract _content, please try to download first, then using read_file, if you can't save or read, please try other methods.
131
+ """
132
+
133
+ bu_agent_instance = BrowserUseAgent(
134
+ task=bu_task_prompt,
135
+ llm=llm, # Use the passed LLM
136
+ browser=bu_browser,
137
+ browser_context=bu_browser_context,
138
+ controller=bu_controller,
139
+ use_vision=use_vision,
140
+ source="webui",
141
+ )
142
+
143
+ # Store instance for potential stop() call
144
+ task_key = f"{task_id}_{uuid.uuid4()}"
145
+ _BROWSER_AGENT_INSTANCES[task_key] = bu_agent_instance
146
+
147
+ # --- Run with Stop Check ---
148
+ # BrowserUseAgent needs to internally check a stop signal or have a stop method.
149
+ # We simulate checking before starting and assume `run` might be interruptible
150
+ # or have its own stop mechanism we can trigger via bu_agent_instance.stop().
151
+ if stop_event.is_set():
152
+ logger.info(f"Browser task for '{task_query}' cancelled before start.")
153
+ return {"query": task_query, "result": None, "status": "cancelled"}
154
+
155
+ # The run needs to be awaitable and ideally accept a stop signal or have a .stop() method
156
+ # result = await bu_agent_instance.run(max_steps=max_steps) # Add max_steps if applicable
157
+ # Let's assume a simplified run for now
158
+ logger.info(f"Running BrowserUseAgent for: {task_query}")
159
+ result = await bu_agent_instance.run() # Assuming run is the main method
160
+ logger.info(f"BrowserUseAgent finished for: {task_query}")
161
+
162
+ final_data = result.final_result()
163
+
164
+ if stop_event.is_set():
165
+ logger.info(f"Browser task for '{task_query}' stopped during execution.")
166
+ return {"query": task_query, "result": final_data, "status": "stopped"}
167
+ else:
168
+ logger.info(f"Browser result for '{task_query}': {final_data}")
169
+ return {"query": task_query, "result": final_data, "status": "completed"}
170
+
171
+ except Exception as e:
172
+ logger.error(
173
+ f"Error during browser task for query '{task_query}': {e}", exc_info=True
174
+ )
175
+ return {"query": task_query, "error": str(e), "status": "failed"}
176
+ finally:
177
+ if bu_browser_context:
178
+ try:
179
+ await bu_browser_context.close()
180
+ bu_browser_context = None
181
+ logger.info("Closed browser context.")
182
+ except Exception as e:
183
+ logger.error(f"Error closing browser context: {e}")
184
+ if bu_browser:
185
+ try:
186
+ await bu_browser.close()
187
+ bu_browser = None
188
+ logger.info("Closed browser.")
189
+ except Exception as e:
190
+ logger.error(f"Error closing browser: {e}")
191
+
192
+ if task_key in _BROWSER_AGENT_INSTANCES:
193
+ del _BROWSER_AGENT_INSTANCES[task_key]
194
+
195
+
196
+ class BrowserSearchInput(BaseModel):
197
+ queries: List[str] = Field(
198
+ description="List of distinct search queries to find information relevant to the research task."
199
+ )
200
+
201
+
202
+ async def _run_browser_search_tool(
203
+ queries: List[str],
204
+ task_id: str, # Injected dependency
205
+ llm: Any, # Injected dependency
206
+ browser_config: Dict[str, Any],
207
+ stop_event: threading.Event,
208
+ max_parallel_browsers: int = 1,
209
+ ) -> List[Dict[str, Any]]:
210
+ """
211
+ Internal function to execute parallel browser searches based on LLM-provided queries.
212
+ Handles concurrency and stop signals.
213
+ """
214
+
215
+ # Limit queries just in case LLM ignores the description
216
+ queries = queries[:max_parallel_browsers]
217
+ logger.info(
218
+ f"[Browser Tool {task_id}] Running search for {len(queries)} queries: {queries}"
219
+ )
220
+
221
+ results = []
222
+ semaphore = asyncio.Semaphore(max_parallel_browsers)
223
+
224
+ async def task_wrapper(query):
225
+ async with semaphore:
226
+ if stop_event.is_set():
227
+ logger.info(
228
+ f"[Browser Tool {task_id}] Skipping task due to stop signal: {query}"
229
+ )
230
+ return {"query": query, "result": None, "status": "cancelled"}
231
+ # Pass necessary injected configs and the stop event
232
+ return await run_single_browser_task(
233
+ query,
234
+ task_id,
235
+ llm, # Pass the main LLM (or a dedicated one if needed)
236
+ browser_config,
237
+ stop_event,
238
+ # use_vision could be added here if needed
239
+ )
240
+
241
+ tasks = [task_wrapper(query) for query in queries]
242
+ search_results = await asyncio.gather(*tasks, return_exceptions=True)
243
+
244
+ processed_results = []
245
+ for i, res in enumerate(search_results):
246
+ query = queries[i] # Get corresponding query
247
+ if isinstance(res, Exception):
248
+ logger.error(
249
+ f"[Browser Tool {task_id}] Gather caught exception for query '{query}': {res}",
250
+ exc_info=True,
251
+ )
252
+ processed_results.append(
253
+ {"query": query, "error": str(res), "status": "failed"}
254
+ )
255
+ elif isinstance(res, dict):
256
+ processed_results.append(res)
257
+ else:
258
+ logger.error(
259
+ f"[Browser Tool {task_id}] Unexpected result type for query '{query}': {type(res)}"
260
+ )
261
+ processed_results.append(
262
+ {"query": query, "error": "Unexpected result type", "status": "failed"}
263
+ )
264
+
265
+ logger.info(
266
+ f"[Browser Tool {task_id}] Finished search. Results count: {len(processed_results)}"
267
+ )
268
+ return processed_results
269
+
270
+
271
+ def create_browser_search_tool(
272
+ llm: Any,
273
+ browser_config: Dict[str, Any],
274
+ task_id: str,
275
+ stop_event: threading.Event,
276
+ max_parallel_browsers: int = 1,
277
+ ) -> StructuredTool:
278
+ """Factory function to create the browser search tool with necessary dependencies."""
279
+ # Use partial to bind the dependencies that aren't part of the LLM call arguments
280
+ from functools import partial
281
+
282
+ bound_tool_func = partial(
283
+ _run_browser_search_tool,
284
+ task_id=task_id,
285
+ llm=llm,
286
+ browser_config=browser_config,
287
+ stop_event=stop_event,
288
+ max_parallel_browsers=max_parallel_browsers,
289
+ )
290
+
291
+ return StructuredTool.from_function(
292
+ coroutine=bound_tool_func,
293
+ name="parallel_browser_search",
294
+ description=f"""Use this tool to actively search the web for information related to a specific research task or question.
295
+ It runs up to {max_parallel_browsers} searches in parallel using a browser agent for better results than simple scraping.
296
+ Provide a list of distinct search queries(up to {max_parallel_browsers}) that are likely to yield relevant information.""",
297
+ args_schema=BrowserSearchInput,
298
+ )
299
+
300
+
301
+ # --- Langgraph State Definition ---
302
+
303
+
304
+ class ResearchTaskItem(TypedDict):
305
+ # step: int # Maybe step within category, or just implicit by order
306
+ task_description: str
307
+ status: str # "pending", "completed", "failed"
308
+ queries: Optional[List[str]]
309
+ result_summary: Optional[str]
310
+
311
+
312
+ class ResearchCategoryItem(TypedDict):
313
+ category_name: str
314
+ tasks: List[ResearchTaskItem]
315
+ # Optional: category_status: str # Could be "pending", "in_progress", "completed"
316
+
317
+
318
+ class DeepResearchState(TypedDict):
319
+ task_id: str
320
+ topic: str
321
+ research_plan: List[ResearchCategoryItem] # CHANGED
322
+ search_results: List[Dict[str, Any]]
323
+ llm: Any
324
+ tools: List[Tool]
325
+ output_dir: Path
326
+ browser_config: Dict[str, Any]
327
+ final_report: Optional[str]
328
+ current_category_index: int
329
+ current_task_index_in_category: int
330
+ stop_requested: bool
331
+ error_message: Optional[str]
332
+ messages: List[BaseMessage]
333
+
334
+
335
+ # --- Langgraph Nodes ---
336
+
337
+
338
+ def _load_previous_state(task_id: str, output_dir: str) -> Dict[str, Any]:
339
+ state_updates = {}
340
+ plan_file = os.path.join(output_dir, PLAN_FILENAME)
341
+ search_file = os.path.join(output_dir, SEARCH_INFO_FILENAME)
342
+
343
+ loaded_plan: List[ResearchCategoryItem] = []
344
+ next_cat_idx, next_task_idx = 0, 0
345
+ found_pending = False
346
+
347
+ if os.path.exists(plan_file):
348
+ try:
349
+ with open(plan_file, "r", encoding="utf-8") as f:
350
+ current_category: Optional[ResearchCategoryItem] = None
351
+ lines = f.readlines()
352
+ cat_counter = 0
353
+ task_counter_in_cat = 0
354
+
355
+ for line_num, line_content in enumerate(lines):
356
+ line = line_content.strip()
357
+ if line.startswith("## "): # Category
358
+ if current_category: # Save previous category
359
+ loaded_plan.append(current_category)
360
+ if not found_pending: # If previous category was all done, advance cat counter
361
+ cat_counter += 1
362
+ task_counter_in_cat = 0
363
+ category_name = line[line.find(" "):].strip() # Get text after "## X. "
364
+ current_category = ResearchCategoryItem(category_name=category_name, tasks=[])
365
+ elif (line.startswith("- [ ]") or line.startswith("- [x]") or line.startswith(
366
+ "- [-]")) and current_category: # Task
367
+ status = "pending"
368
+ if line.startswith("- [x]"):
369
+ status = "completed"
370
+ elif line.startswith("- [-]"):
371
+ status = "failed"
372
+
373
+ task_desc = line[5:].strip()
374
+ current_category["tasks"].append(
375
+ ResearchTaskItem(task_description=task_desc, status=status, queries=None,
376
+ result_summary=None)
377
+ )
378
+ if status == "pending" and not found_pending:
379
+ next_cat_idx = cat_counter
380
+ next_task_idx = task_counter_in_cat
381
+ found_pending = True
382
+ if not found_pending: # only increment if previous tasks were completed/failed
383
+ task_counter_in_cat += 1
384
+
385
+ if current_category: # Append last category
386
+ loaded_plan.append(current_category)
387
+
388
+ if loaded_plan:
389
+ state_updates["research_plan"] = loaded_plan
390
+ if not found_pending and loaded_plan: # All tasks were completed or failed
391
+ next_cat_idx = len(loaded_plan) # Points beyond the last category
392
+ next_task_idx = 0
393
+ state_updates["current_category_index"] = next_cat_idx
394
+ state_updates["current_task_index_in_category"] = next_task_idx
395
+ logger.info(
396
+ f"Loaded hierarchical research plan from {plan_file}. "
397
+ f"Next task: Category {next_cat_idx}, Task {next_task_idx} in category."
398
+ )
399
+ else:
400
+ logger.warning(f"Plan file {plan_file} was empty or malformed.")
401
+
402
+ except Exception as e:
403
+ logger.error(f"Failed to load or parse research plan {plan_file}: {e}", exc_info=True)
404
+ state_updates["error_message"] = f"Failed to load research plan: {e}"
405
+ else:
406
+ logger.info(f"Plan file {plan_file} not found. Will start fresh.")
407
+
408
+ if os.path.exists(search_file):
409
+ try:
410
+ with open(search_file, "r", encoding="utf-8") as f:
411
+ state_updates["search_results"] = json.load(f)
412
+ logger.info(f"Loaded search results from {search_file}")
413
+ except Exception as e:
414
+ logger.error(f"Failed to load search results {search_file}: {e}")
415
+ state_updates["error_message"] = (
416
+ state_updates.get("error_message", "") + f" Failed to load search results: {e}").strip()
417
+
418
+ return state_updates
419
+
420
+
421
+ def _save_plan_to_md(plan: List[ResearchCategoryItem], output_dir: str):
422
+ plan_file = os.path.join(output_dir, PLAN_FILENAME)
423
+ try:
424
+ with open(plan_file, "w", encoding="utf-8") as f:
425
+ f.write(f"# Research Plan\n\n")
426
+ for cat_idx, category in enumerate(plan):
427
+ f.write(f"## {cat_idx + 1}. {category['category_name']}\n\n")
428
+ for task_idx, task in enumerate(category['tasks']):
429
+ marker = "- [x]" if task["status"] == "completed" else "- [ ]" if task[
430
+ "status"] == "pending" else "- [-]" # [-] for failed
431
+ f.write(f" {marker} {task['task_description']}\n")
432
+ f.write("\n")
433
+ logger.info(f"Hierarchical research plan saved to {plan_file}")
434
+ except Exception as e:
435
+ logger.error(f"Failed to save research plan to {plan_file}: {e}")
436
+
437
+
438
+ def _save_search_results_to_json(results: List[Dict[str, Any]], output_dir: str):
439
+ """Appends or overwrites search results to a JSON file."""
440
+ search_file = os.path.join(output_dir, SEARCH_INFO_FILENAME)
441
+ try:
442
+ # Simple overwrite for now, could be append
443
+ with open(search_file, "w", encoding="utf-8") as f:
444
+ json.dump(results, f, indent=2, ensure_ascii=False)
445
+ logger.info(f"Search results saved to {search_file}")
446
+ except Exception as e:
447
+ logger.error(f"Failed to save search results to {search_file}: {e}")
448
+
449
+
450
+ def _save_report_to_md(report: str, output_dir: Path):
451
+ """Saves the final report to a markdown file."""
452
+ report_file = os.path.join(output_dir, REPORT_FILENAME)
453
+ try:
454
+ with open(report_file, "w", encoding="utf-8") as f:
455
+ f.write(report)
456
+ logger.info(f"Final report saved to {report_file}")
457
+ except Exception as e:
458
+ logger.error(f"Failed to save final report to {report_file}: {e}")
459
+
460
+
461
+ async def planning_node(state: DeepResearchState) -> Dict[str, Any]:
462
+ logger.info("--- Entering Planning Node ---")
463
+ if state.get("stop_requested"):
464
+ logger.info("Stop requested, skipping planning.")
465
+ return {"stop_requested": True}
466
+
467
+ llm = state["llm"]
468
+ topic = state["topic"]
469
+ existing_plan = state.get("research_plan")
470
+ output_dir = state["output_dir"]
471
+
472
+ if existing_plan and (
473
+ state.get("current_category_index", 0) > 0 or state.get("current_task_index_in_category", 0) > 0):
474
+ logger.info("Resuming with existing plan.")
475
+ _save_plan_to_md(existing_plan, output_dir) # Ensure it's saved initially
476
+ # current_category_index and current_task_index_in_category should be set by _load_previous_state
477
+ return {"research_plan": existing_plan}
478
+
479
+ logger.info(f"Generating new research plan for topic: {topic}")
480
+
481
+ prompt_text = f"""You are a meticulous research assistant. Your goal is to create a hierarchical research plan to thoroughly investigate the topic: "{topic}".
482
+ The plan should be structured into several main research categories. Each category should contain a list of specific, actionable research tasks or questions.
483
+ Format the output as a JSON list of objects. Each object represents a research category and should have:
484
+ 1. "category_name": A string for the name of the research category.
485
+ 2. "tasks": A list of strings, where each string is a specific research task for that category.
486
+
487
+ Example JSON Output:
488
+ [
489
+ {{
490
+ "category_name": "Understanding Core Concepts and Definitions",
491
+ "tasks": [
492
+ "Define the primary terminology associated with '{topic}'.",
493
+ "Identify the fundamental principles and theories underpinning '{topic}'."
494
+ ]
495
+ }},
496
+ {{
497
+ "category_name": "Historical Development and Key Milestones",
498
+ "tasks": [
499
+ "Trace the historical evolution of '{topic}'.",
500
+ "Identify key figures, events, or breakthroughs in the development of '{topic}'."
501
+ ]
502
+ }},
503
+ {{
504
+ "category_name": "Current State-of-the-Art and Applications",
505
+ "tasks": [
506
+ "Analyze the current advancements and prominent applications of '{topic}'.",
507
+ "Investigate ongoing research and active areas of development related to '{topic}'."
508
+ ]
509
+ }},
510
+ {{
511
+ "category_name": "Challenges, Limitations, and Future Outlook",
512
+ "tasks": [
513
+ "Identify the major challenges and limitations currently facing '{topic}'.",
514
+ "Explore potential future trends, ethical considerations, and societal impacts of '{topic}'."
515
+ ]
516
+ }}
517
+ ]
518
+
519
+ Generate a plan with 3-10 categories, and 2-6 tasks per category for the topic: "{topic}" according to the complexity of the topic.
520
+ Ensure the output is a valid JSON array.
521
+ """
522
+ messages = [
523
+ SystemMessage(content="You are a research planning assistant outputting JSON."),
524
+ HumanMessage(content=prompt_text)
525
+ ]
526
+
527
+ try:
528
+ response = await llm.ainvoke(messages)
529
+ raw_content = response.content
530
+ # The LLM might wrap the JSON in backticks
531
+ if raw_content.strip().startswith("```json"):
532
+ raw_content = raw_content.strip()[7:-3].strip()
533
+ elif raw_content.strip().startswith("```"):
534
+ raw_content = raw_content.strip()[3:-3].strip()
535
+
536
+ logger.debug(f"LLM response for plan: {raw_content}")
537
+ parsed_plan_from_llm = json.loads(raw_content)
538
+
539
+ new_plan: List[ResearchCategoryItem] = []
540
+ for cat_idx, category_data in enumerate(parsed_plan_from_llm):
541
+ if not isinstance(category_data,
542
+ dict) or "category_name" not in category_data or "tasks" not in category_data:
543
+ logger.warning(f"Skipping invalid category data: {category_data}")
544
+ continue
545
+
546
+ tasks: List[ResearchTaskItem] = []
547
+ for task_idx, task_desc in enumerate(category_data["tasks"]):
548
+ if isinstance(task_desc, str):
549
+ tasks.append(
550
+ ResearchTaskItem(
551
+ task_description=task_desc,
552
+ status="pending",
553
+ queries=None,
554
+ result_summary=None,
555
+ )
556
+ )
557
+ else: # Sometimes LLM puts tasks as {"task": "description"}
558
+ if isinstance(task_desc, dict) and "task_description" in task_desc:
559
+ tasks.append(
560
+ ResearchTaskItem(
561
+ task_description=task_desc["task_description"],
562
+ status="pending",
563
+ queries=None,
564
+ result_summary=None,
565
+ )
566
+ )
567
+ elif isinstance(task_desc, dict) and "task" in task_desc: # common LLM mistake
568
+ tasks.append(
569
+ ResearchTaskItem(
570
+ task_description=task_desc["task"],
571
+ status="pending",
572
+ queries=None,
573
+ result_summary=None,
574
+ )
575
+ )
576
+ else:
577
+ logger.warning(
578
+ f"Skipping invalid task data: {task_desc} in category {category_data['category_name']}")
579
+
580
+ new_plan.append(
581
+ ResearchCategoryItem(
582
+ category_name=category_data["category_name"],
583
+ tasks=tasks,
584
+ )
585
+ )
586
+
587
+ if not new_plan:
588
+ logger.error("LLM failed to generate a valid plan structure from JSON.")
589
+ return {"error_message": "Failed to generate research plan structure."}
590
+
591
+ logger.info(f"Generated research plan with {len(new_plan)} categories.")
592
+ _save_plan_to_md(new_plan, output_dir) # Save the hierarchical plan
593
+
594
+ return {
595
+ "research_plan": new_plan,
596
+ "current_category_index": 0,
597
+ "current_task_index_in_category": 0,
598
+ "search_results": [],
599
+ }
600
+
601
+ except json.JSONDecodeError as e:
602
+ logger.error(f"Failed to parse JSON from LLM for plan: {e}. Response was: {raw_content}", exc_info=True)
603
+ return {"error_message": f"LLM generated invalid JSON for research plan: {e}"}
604
+ except Exception as e:
605
+ logger.error(f"Error during planning: {e}", exc_info=True)
606
+ return {"error_message": f"LLM Error during planning: {e}"}
607
+
608
+
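+ # Shape of the plan this node produces (illustrative sketch, using the
+ # ResearchCategoryItem / ResearchTaskItem structures referenced above):
+ # [
+ #   {"category_name": "Understanding Core Concepts and Definitions",
+ #    "tasks": [{"task_description": "Define the primary terminology.",
+ #               "status": "pending", "queries": None, "result_summary": None}]},
+ #   ...
+ # ]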
609
+ async def research_execution_node(state: DeepResearchState) -> Dict[str, Any]:
610
+ logger.info("--- Entering Research Execution Node ---")
611
+ if state.get("stop_requested"):
612
+ logger.info("Stop requested, skipping research execution.")
613
+ return {
614
+ "stop_requested": True,
615
+ "current_category_index": state["current_category_index"],
616
+ "current_task_index_in_category": state["current_task_index_in_category"],
617
+ }
618
+
619
+ plan = state["research_plan"]
620
+ cat_idx = state["current_category_index"]
621
+ task_idx = state["current_task_index_in_category"]
622
+ llm = state["llm"]
623
+ tools = state["tools"]
624
+ output_dir = str(state["output_dir"])
625
+ task_id = state["task_id"] # For _AGENT_STOP_FLAGS
626
+
627
+ # This check should ideally be handled by `should_continue`
628
+ if not plan or cat_idx >= len(plan):
629
+ logger.info("Research plan complete or categories exhausted.")
630
+ return {} # should route to synthesis
631
+
632
+ current_category = plan[cat_idx]
633
+ if task_idx >= len(current_category["tasks"]):
634
+ logger.info(f"All tasks in category '{current_category['category_name']}' completed. Moving to next category.")
635
+ # This logic is now effectively handled by should_continue and the index updates below
636
+ # The next iteration will be caught by should_continue or this node with updated indices
637
+ return {
638
+ "current_category_index": cat_idx + 1,
639
+ "current_task_index_in_category": 0,
640
+ "messages": state["messages"] # Pass messages along
641
+ }
642
+
643
+ current_task = current_category["tasks"][task_idx]
644
+
645
+ if current_task["status"] == "completed":
646
+ logger.info(
647
+ f"Task '{current_task['task_description']}' in category '{current_category['category_name']}' already completed. Skipping.")
648
+ # Logic to find next task
649
+ next_task_idx = task_idx + 1
650
+ next_cat_idx = cat_idx
651
+ if next_task_idx >= len(current_category["tasks"]):
652
+ next_cat_idx += 1
653
+ next_task_idx = 0
654
+ return {
655
+ "current_category_index": next_cat_idx,
656
+ "current_task_index_in_category": next_task_idx,
657
+ "messages": state["messages"] # Pass messages along
658
+ }
659
+
660
+ logger.info(
661
+ f"Executing research task: '{current_task['task_description']}' (Category: '{current_category['category_name']}')"
662
+ )
663
+
664
+ llm_with_tools = llm.bind_tools(tools)
665
+
666
+ # Construct messages for LLM invocation
667
+ task_prompt_content = (
668
+ f"Current Research Category: {current_category['category_name']}\n"
669
+ f"Specific Task: {current_task['task_description']}\n\n"
670
+ "Please use the available tools, especially 'parallel_browser_search', to gather information for this specific task. "
671
+ "Provide focused search queries relevant ONLY to this task. "
672
+ "If you believe you have sufficient information from previous steps for this specific task, you can indicate that you are ready to summarize or that no further search is needed."
673
+ )
674
+ current_task_message_history = [
675
+ HumanMessage(content=task_prompt_content)
676
+ ]
677
+ if not state["messages"]: # First actual execution message
678
+ invocation_messages = [
679
+ SystemMessage(
680
+ content="You are a research assistant executing one task of a research plan. Focus on the current task only."),
681
+ ] + current_task_message_history
682
+ else:
683
+ invocation_messages = state["messages"] + current_task_message_history
684
+
685
+ try:
686
+ logger.info(f"Invoking LLM with tools for task: {current_task['task_description']}")
687
+ ai_response: BaseMessage = await llm_with_tools.ainvoke(invocation_messages)
688
+ logger.info("LLM invocation complete.")
689
+
690
+ tool_results = []
691
+ executed_tool_names = []
692
+ current_search_results = state.get("search_results", []) # Get existing search results
693
+
694
+ if not isinstance(ai_response, AIMessage) or not ai_response.tool_calls:
695
+ logger.warning(
696
+ f"LLM did not call any tool for task '{current_task['task_description']}'. Response: {ai_response.content[:100]}..."
697
+ )
698
+ current_task["status"] = "pending" # Or "completed_no_tool" if LLM explains it's done
699
+ current_task["result_summary"] = f"LLM did not use a tool. Response: {ai_response.content}"
700
+ current_task["current_category_index"] = cat_idx
701
+ current_task["current_task_index_in_category"] = task_idx
702
+ return current_task
703
+ # We still save the plan and advance.
704
+ else:
705
+ # Process tool calls
706
+ for tool_call in ai_response.tool_calls:
707
+ tool_name = tool_call.get("name")
708
+ tool_args = tool_call.get("args", {})
709
+ tool_call_id = tool_call.get("id")
710
+
711
+ logger.info(f"LLM requested tool call: {tool_name} with args: {tool_args}")
712
+ executed_tool_names.append(tool_name)
713
+ selected_tool = next((t for t in tools if t.name == tool_name), None)
714
+
715
+ if not selected_tool:
716
+ logger.error(f"LLM called tool '{tool_name}' which is not available.")
717
+ tool_results.append(
718
+ ToolMessage(content=f"Error: Tool '{tool_name}' not found.", tool_call_id=tool_call_id))
719
+ continue
720
+
721
+ try:
722
+ stop_event = _AGENT_STOP_FLAGS.get(task_id)
723
+ if stop_event and stop_event.is_set():
724
+ logger.info(f"Stop requested before executing tool: {tool_name}")
725
+ current_task["status"] = "pending" # Or a new "stopped" status
726
+ _save_plan_to_md(plan, output_dir)
727
+ return {"stop_requested": True, "research_plan": plan, "current_category_index": cat_idx,
728
+ "current_task_index_in_category": task_idx}
729
+
730
+ logger.info(f"Executing tool: {tool_name}")
731
+ tool_output = await selected_tool.ainvoke(tool_args)
732
+ logger.info(f"Tool '{tool_name}' executed successfully.")
733
+
734
+ if tool_name == "parallel_browser_search":
735
+ current_search_results.extend(tool_output) # tool_output is List[Dict]
736
+ else: # For other tools, we might need specific handling or just log
737
+ logger.info(f"Result from tool '{tool_name}': {str(tool_output)[:200]}...")
738
+ # Storing non-browser results might need a different structure or key in search_results
739
+ current_search_results.append(
740
+ {"tool_name": tool_name, "args": tool_args, "output": str(tool_output),
741
+ "status": "completed"})
742
+
743
+ tool_results.append(ToolMessage(content=json.dumps(tool_output), tool_call_id=tool_call_id))
744
+
745
+ except Exception as e:
746
+ logger.error(f"Error executing tool '{tool_name}': {e}", exc_info=True)
747
+ tool_results.append(
748
+ ToolMessage(content=f"Error executing tool {tool_name}: {e}", tool_call_id=tool_call_id))
749
+ current_search_results.append(
750
+ {"tool_name": tool_name, "args": tool_args, "status": "failed", "error": str(e)})
751
+
752
+ # After processing all tool calls for this task
753
+ step_failed_tool_execution = any("Error" in str(tr.content) for tr in tool_results)  # matches both "Error:" and "Error executing tool ..." messages
754
+ # Consider a task successful if a browser search was attempted and didn't immediately error out during call
755
+ # The browser search itself returns status for each query.
756
+ browser_tool_attempted_successfully = "parallel_browser_search" in executed_tool_names and not step_failed_tool_execution
757
+
758
+ if step_failed_tool_execution:
759
+ current_task["status"] = "failed"
760
+ current_task[
761
+ "result_summary"] = f"Tool execution failed. Errors: {[tr.content for tr in tool_results if 'Error' in str(tr.content)]}"
762
+ elif executed_tool_names: # If any tool was called
763
+ current_task["status"] = "completed"
764
+ current_task["result_summary"] = f"Executed tool(s): {', '.join(executed_tool_names)}."
765
+ # TODO: Could ask LLM to summarize the tool_results for this task if needed, rather than just listing tools.
766
+ else: # No tool calls but AI response had .tool_calls structure (empty)
767
+ current_task["status"] = "failed" # Or a more specific status
768
+ current_task["result_summary"] = "LLM prepared for tool call but provided no tools."
769
+
770
+ # Save progress
771
+ _save_plan_to_md(plan, output_dir)
772
+ _save_search_results_to_json(current_search_results, output_dir)
773
+
774
+ # Determine next indices
775
+ next_task_idx = task_idx + 1
776
+ next_cat_idx = cat_idx
777
+ if next_task_idx >= len(current_category["tasks"]):
778
+ next_cat_idx += 1
779
+ next_task_idx = 0
780
+
781
+ updated_messages = state["messages"] + current_task_message_history + [ai_response] + tool_results
782
+
783
+ return {
784
+ "research_plan": plan,
785
+ "search_results": current_search_results,
786
+ "current_category_index": next_cat_idx,
787
+ "current_task_index_in_category": next_task_idx,
788
+ "messages": updated_messages,
789
+ }
790
+
791
+ except Exception as e:
792
+ logger.error(f"Unhandled error during research execution for task '{current_task['task_description']}': {e}",
793
+ exc_info=True)
794
+ current_task["status"] = "failed"
795
+ _save_plan_to_md(plan, output_dir)
796
+ # Determine next indices even on error to attempt to move on
797
+ next_task_idx = task_idx + 1
798
+ next_cat_idx = cat_idx
799
+ if next_task_idx >= len(current_category["tasks"]):
800
+ next_cat_idx += 1
801
+ next_task_idx = 0
802
+ return {
803
+ "research_plan": plan,
804
+ "current_category_index": next_cat_idx,
805
+ "current_task_index_in_category": next_task_idx,
806
+ "error_message": f"Core Execution Error on task '{current_task['task_description']}': {e}",
807
+ "messages": state["messages"] + current_task_message_history # Preserve messages up to error
808
+ }
809
+
810
+
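+ # The advance-to-next-task arithmetic above is repeated in three branches; a
+ # small helper along these lines (hypothetical, not defined in this module)
+ # would capture it:
+ # def _next_indices(plan, cat_idx, task_idx):
+ #     task_idx += 1
+ #     if task_idx >= len(plan[cat_idx]["tasks"]):
+ #         return cat_idx + 1, 0
+ #     return cat_idx, task_idx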
811
+ async def synthesis_node(state: DeepResearchState) -> Dict[str, Any]:
812
+ """Synthesizes the final report from the collected search results."""
813
+ logger.info("--- Entering Synthesis Node ---")
814
+ if state.get("stop_requested"):
815
+ logger.info("Stop requested, skipping synthesis.")
816
+ return {"stop_requested": True}
817
+
818
+ llm = state["llm"]
819
+ topic = state["topic"]
820
+ search_results = state.get("search_results", [])
821
+ output_dir = state["output_dir"]
822
+ plan = state["research_plan"] # Include plan for context
823
+
824
+ if not search_results:
825
+ logger.warning("No search results found to synthesize report.")
826
+ report = f"# Research Report: {topic}\n\nNo information was gathered during the research process."
827
+ _save_report_to_md(report, output_dir)
828
+ return {"final_report": report}
829
+
830
+ logger.info(
831
+ f"Synthesizing report from {len(search_results)} collected search result entries."
832
+ )
833
+
834
+ # Prepare context for the LLM
835
+ # Format search results nicely, maybe group by query or original plan step
836
+ formatted_results = ""
837
+ references = {}  # Reserved for citation support; not populated yet, so no References section is appended below.
838
+ ref_count = 1
839
+ for i, result_entry in enumerate(search_results):
840
+ query = result_entry.get("query", "Unknown Query") # From parallel_browser_search
841
+ tool_name = result_entry.get("tool_name") # From other tools
842
+ status = result_entry.get("status", "unknown")
843
+ result_data = result_entry.get("result") # From BrowserUseAgent's final_result
844
+ tool_output_str = result_entry.get("output") # From other tools
845
+
846
+ if tool_name == "parallel_browser_search" and status == "completed" and result_data:
847
+ # result_data is the summary from BrowserUseAgent
848
+ formatted_results += f'### Finding from Web Search Query: "{query}"\n'
849
+ formatted_results += f"- **Summary:**\n{result_data}\n" # result_data is already a summary string here
850
+ # If result_data contained title/URL, you'd format them here.
851
+ # The current BrowserUseAgent returns a string summary directly as 'final_data' in run_single_browser_task
852
+ formatted_results += "---\n"
853
+ elif tool_name != "parallel_browser_search" and status == "completed" and tool_output_str:
854
+ formatted_results += f'### Finding from Tool: "{tool_name}" (Args: {result_entry.get("args")})\n'
855
+ formatted_results += f"- **Output:**\n{tool_output_str}\n"
856
+ formatted_results += "---\n"
857
+ elif status == "failed":
858
+ error = result_entry.get("error")
859
+ q_or_t = f"Query: \"{query}\"" if query != "Unknown Query" else f"Tool: \"{tool_name}\""
860
+ formatted_results += f'### Failed {q_or_t}\n'
861
+ formatted_results += f"- **Error:** {error}\n"
862
+ formatted_results += "---\n"
863
+
864
+ # Prepare the research plan context
865
+ plan_summary = "\nResearch Plan Followed:\n"
866
+ for cat_idx, category in enumerate(plan):
867
+ plan_summary += f"\n#### Category {cat_idx + 1}: {category['category_name']}\n"
868
+ for task_idx, task in enumerate(category['tasks']):
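+ # Checklist markers: [x] completed, [ ] pending, [-] any other status (e.g. failed).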
869
+ marker = "[x]" if task["status"] == "completed" else "[ ]" if task["status"] == "pending" else "[-]"
870
+ plan_summary += f" - {marker} {task['task_description']}\n"
871
+
872
+ synthesis_prompt = ChatPromptTemplate.from_messages(
873
+ [
874
+ (
875
+ "system",
876
+ """You are a professional researcher tasked with writing a comprehensive and well-structured report based on collected findings.
877
+ The report should address the research topic thoroughly, synthesizing the information gathered from various sources.
878
+ Structure the report logically:
879
+ 1. Briefly introduce the topic and the report's scope (mentioning the research plan followed, including categories and tasks, is good).
880
+ 2. Discuss the key findings, organizing them thematically, possibly aligning with the research categories. Analyze, compare, and contrast information.
881
+ 3. Summarize the main points and offer concluding thoughts.
882
+
883
+ Ensure the tone is objective and professional.
884
+ If findings are contradictory or incomplete, acknowledge this.
885
+ """, # Removed citation part for simplicity for now, as browser agent returns summaries.
886
+ ),
887
+ (
888
+ "human",
889
+ f"""
890
+ **Research Topic:** {topic}
891
+
892
+ {plan_summary}
893
+
894
+ **Collected Findings:**
895
+ ```
896
+ {formatted_results}
897
+ ```
898
+
899
+ Please generate the final research report in Markdown format based **only** on the information above.
900
+ """,
901
+ ),
902
+ ]
903
+ )
904
+
905
+ try:
906
+ response = await llm.ainvoke(
907
+ synthesis_prompt.format_prompt(
908
+ topic=topic,
909
+ plan_summary=plan_summary,
910
+ formatted_results=formatted_results,
911
+ ).to_messages()
912
+ )
913
+ final_report_md = response.content
914
+
915
+ # Append the reference list automatically to the end of the generated markdown
916
+ if references:
917
+ report_references_section = "\n\n## References\n\n"
918
+ # Sort refs by ID for consistent output
919
+ sorted_refs = sorted(references.values(), key=lambda x: x["id"])
920
+ for ref in sorted_refs:
921
+ report_references_section += (
922
+ f"[{ref['id']}] {ref['title']} - {ref['url']}\n"
923
+ )
924
+ final_report_md += report_references_section
925
+
926
+ logger.info("Successfully synthesized the final report.")
927
+ _save_report_to_md(final_report_md, output_dir)
928
+ return {"final_report": final_report_md}
929
+
930
+ except Exception as e:
931
+ logger.error(f"Error during report synthesis: {e}", exc_info=True)
932
+ return {"error_message": f"LLM Error during synthesis: {e}"}
933
+
934
+
935
+ # --- Langgraph Edges and Conditional Logic ---
936
+
937
+
938
+ def should_continue(state: DeepResearchState) -> str:
939
+ logger.info("--- Evaluating Condition: Should Continue? ---")
940
+ if state.get("stop_requested"):
941
+ logger.info("Stop requested, routing to END.")
942
+ return "end_run"
943
+ if state.get("error_message") and "Core Execution Error" in state["error_message"]: # Critical error in node
944
+ logger.warning(f"Critical error detected: {state['error_message']}. Routing to END.")
945
+ return "end_run"
946
+
947
+ plan = state.get("research_plan")
948
+ cat_idx = state.get("current_category_index", 0)
949
+ task_idx = state.get("current_task_index_in_category", 0) # This is the *next* task to check
950
+
951
+ if not plan:
952
+ logger.warning("No research plan found. Routing to END.")
953
+ return "end_run"
954
+
955
+ # Check if the current indices point to a valid pending task
956
+ if cat_idx < len(plan):
957
+ current_category = plan[cat_idx]
958
+ if task_idx < len(current_category["tasks"]):
959
+ # We are trying to execute the task at plan[cat_idx]["tasks"][task_idx]
960
+ # The research_execution_node will handle if it's already completed.
961
+ logger.info(
962
+ f"Plan has potential pending tasks (next up: Category {cat_idx}, Task {task_idx}). Routing to Research Execution."
963
+ )
964
+ return "execute_research"
965
+ else: # task_idx is out of bounds for current category, means we need to check next category
966
+ if cat_idx + 1 < len(plan): # If there is a next category
967
+ logger.info(
968
+ f"Finished tasks in category {cat_idx}. Moving to category {cat_idx + 1}. Routing to Research Execution."
969
+ )
970
+ # research_execution_node will update state to {current_category_index: cat_idx + 1, current_task_index_in_category: 0}
971
+ # Or rather, the previous execution node already set these indices to the start of the next category.
972
+ return "execute_research"
973
+
974
+ # If we've gone through all categories and tasks (cat_idx >= len(plan))
975
+ logger.info("All plan categories and tasks processed or current indices are out of bounds. Routing to Synthesis.")
976
+ return "synthesize_report"
977
+
978
+
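+ # Routing summary for should_continue (sketch):
+ #   stop requested / critical error -> "end_run"
+ #   pending task or next category   -> "execute_research"
+ #   plan exhausted                  -> "synthesize_report"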
979
+ # --- DeepSearchAgent Class ---
980
+
981
+
982
+ class DeepResearchAgent:
983
+ def __init__(
984
+ self,
985
+ llm: Any,
986
+ browser_config: Dict[str, Any],
987
+ mcp_server_config: Optional[Dict[str, Any]] = None,
988
+ ):
989
+ """
990
+ Initializes the DeepSearchAgent.
991
+
992
+ Args:
993
+ llm: The Langchain compatible language model instance.
994
+ browser_config: Configuration dictionary for the BrowserUseAgent tool.
995
+ Example: {"headless": True, "window_width": 1280, ...}
996
+ mcp_server_config: Optional configuration for the MCP client.
997
+ """
998
+ self.llm = llm
999
+ self.browser_config = browser_config
1000
+ self.mcp_server_config = mcp_server_config
1001
+ self.mcp_client = None
1002
+ self.stopped = False
1003
+ self.graph = self._compile_graph()
1004
+ self.current_task_id: Optional[str] = None
1005
+ self.stop_event: Optional[threading.Event] = None
1006
+ self.runner: Optional[asyncio.Task] = None # To hold the asyncio task for run
1007
+
1008
+ async def _setup_tools(
1009
+ self, task_id: str, stop_event: threading.Event, max_parallel_browsers: int = 1
1010
+ ) -> List[Tool]:
1011
+ """Sets up the basic tools (File I/O) and optional MCP tools."""
1012
+ tools = [
1013
+ WriteFileTool(),
1014
+ ReadFileTool(),
1015
+ ListDirectoryTool(),
1016
+ ] # Basic file operations
1017
+ browser_use_tool = create_browser_search_tool(
1018
+ llm=self.llm,
1019
+ browser_config=self.browser_config,
1020
+ task_id=task_id,
1021
+ stop_event=stop_event,
1022
+ max_parallel_browsers=max_parallel_browsers,
1023
+ )
1024
+ tools += [browser_use_tool]
1025
+ # Add MCP tools if config is provided
1026
+ if self.mcp_server_config:
1027
+ try:
1028
+ logger.info("Setting up MCP client and tools...")
1029
+ if not self.mcp_client:
1030
+ self.mcp_client = await setup_mcp_client_and_tools(
1031
+ self.mcp_server_config
1032
+ )
1033
+ mcp_tools = self.mcp_client.get_tools()
1034
+ logger.info(f"Loaded {len(mcp_tools)} MCP tools.")
1035
+ tools.extend(mcp_tools)
1036
+ except Exception as e:
1037
+ logger.error(f"Failed to set up MCP tools: {e}", exc_info=True)
1038
+ # De-duplicate tools by name and return a list, matching the declared List[Tool] return type.
1042
+ tools_map = {tool.name: tool for tool in tools}
1043
+ return list(tools_map.values())
1044
+
1045
+ async def close_mcp_client(self):
1046
+ if self.mcp_client:
1047
+ await self.mcp_client.__aexit__(None, None, None)
1048
+ self.mcp_client = None
1049
+
1050
+ def _compile_graph(self) -> StateGraph:
1051
+ """Compiles the Langgraph state machine."""
1052
+ workflow = StateGraph(DeepResearchState)
1053
+
1054
+ # Add nodes
1055
+ workflow.add_node("plan_research", planning_node)
1056
+ workflow.add_node("execute_research", research_execution_node)
1057
+ workflow.add_node("synthesize_report", synthesis_node)
1058
+ workflow.add_node(
1059
+ "end_run", lambda state: logger.info("--- Reached End Run Node ---") or {}
1060
+ ) # Simple end node
1061
+
1062
+ # Define edges
1063
+ workflow.set_entry_point("plan_research")
1064
+
1065
+ workflow.add_edge(
1066
+ "plan_research", "execute_research"
1067
+ ) # Always execute after planning
1068
+
1069
+ # Conditional edge after execution
1070
+ workflow.add_conditional_edges(
1071
+ "execute_research",
1072
+ should_continue,
1073
+ {
1074
+ "execute_research": "execute_research", # Loop back if more steps
1075
+ "synthesize_report": "synthesize_report", # Move to synthesis if done
1076
+ "end_run": "end_run", # End if stop requested or error
1077
+ },
1078
+ )
1079
+
1080
+ workflow.add_edge("synthesize_report", "end_run") # End after synthesis
1081
+
1082
+ app = workflow.compile()
1083
+ return app
1084
+
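+ # Compiled topology (sketch):
+ #   plan_research -> execute_research --should_continue--> execute_research | synthesize_report | end_run
+ #   synthesize_report -> end_run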
1085
+ async def run(
1086
+ self,
1087
+ topic: str,
1088
+ task_id: Optional[str] = None,
1089
+ save_dir: str = "./tmp/deep_research",
1090
+ max_parallel_browsers: int = 1,
1091
+ ) -> Dict[str, Any]:
1092
+ """
1093
+ Runs the deep research process to completion.
1094
+
1095
+ Args:
1096
+ topic: The research topic.
1097
+ task_id: Optional existing task ID to resume. If None, a new ID is generated.
1098
+ save_dir: Root directory for per-task output (plan, search results, report).
1099
+ max_parallel_browsers: Maximum number of browser agents to run concurrently.
1100
+ Returns:
+ A dict with "status", "message", "task_id", and the final graph state.
1101
+ """
1102
+ if self.runner and not self.runner.done():
1103
+ logger.warning(
1104
+ "Agent is already running. Please stop the current task first."
1105
+ )
1106
+ # Return an error status immediately instead of starting a second run.
1107
+ return {
1108
+ "status": "error",
1109
+ "message": "Agent already running.",
1110
+ "task_id": self.current_task_id,
1111
+ }
1112
+
1113
+ self.current_task_id = task_id if task_id else str(uuid.uuid4())
1114
+ safe_root_dir = "./tmp/deep_research"
1115
+ normalized_save_dir = os.path.abspath(os.path.normpath(save_dir))  # absolute, so the prefix check below is meaningful
1116
+ if not normalized_save_dir.startswith(os.path.abspath(safe_root_dir)):
1117
+ logger.warning(f"Unsafe save_dir detected: {save_dir}. Using default directory.")
1118
+ normalized_save_dir = os.path.abspath(safe_root_dir)
1119
+ output_dir = os.path.join(normalized_save_dir, self.current_task_id)
1120
+ os.makedirs(output_dir, exist_ok=True)
1121
+
1122
+ logger.info(
1123
+ f"[AsyncGen] Starting research task ID: {self.current_task_id} for topic: '{topic}'"
1124
+ )
1125
+ logger.info(f"[AsyncGen] Output directory: {output_dir}")
1126
+
1127
+ self.stop_event = threading.Event()
1128
+ _AGENT_STOP_FLAGS[self.current_task_id] = self.stop_event
1129
+ agent_tools = await self._setup_tools(
1130
+ self.current_task_id, self.stop_event, max_parallel_browsers
1131
+ )
1132
+ initial_state: DeepResearchState = {
1133
+ "task_id": self.current_task_id,
1134
+ "topic": topic,
1135
+ "research_plan": [],
1136
+ "search_results": [],
1137
+ "messages": [],
1138
+ "llm": self.llm,
1139
+ "tools": agent_tools,
1140
+ "output_dir": Path(output_dir),
1141
+ "browser_config": self.browser_config,
1142
+ "final_report": None,
1143
+ "current_category_index": 0,
1144
+ "current_task_index_in_category": 0,
1145
+ "stop_requested": False,
1146
+ "error_message": None,
1147
+ }
1148
+
1149
+ if task_id:
1150
+ logger.info(f"Attempting to resume task {task_id}...")
1151
+ loaded_state = _load_previous_state(task_id, output_dir)
1152
+ initial_state.update(loaded_state)
1153
+ if loaded_state.get("research_plan"):
1154
+ logger.info(
1155
+ f"Resuming with {len(loaded_state['research_plan'])} plan categories "
1156
+ f"and {len(loaded_state.get('search_results', []))} existing results. "
1157
+ f"Next task: Cat {initial_state['current_category_index']}, Task {initial_state['current_task_index_in_category']}"
1158
+ )
1159
+ initial_state["topic"] = (
1160
+ topic  # When resuming, prefer the newly supplied topic over the stored one.
1161
+ )
1162
+ else:
1163
+ logger.warning(
1164
+ f"Resume requested for {task_id}, but no previous plan found. Starting fresh."
1165
+ )
1166
+
1167
+ # --- Execute Graph using ainvoke ---
1168
+ final_state = None
1169
+ status = "unknown"
1170
+ message = None
1171
+ try:
1172
+ logger.info(f"Invoking graph execution for task {self.current_task_id}...")
1173
+ self.runner = asyncio.create_task(self.graph.ainvoke(initial_state))
1174
+ final_state = await self.runner
1175
+ logger.info(f"Graph execution finished for task {self.current_task_id}.")
1176
+
1177
+ # Determine status based on final state
1178
+ if self.stop_event and self.stop_event.is_set():
1179
+ status = "stopped"
1180
+ message = "Research process was stopped by request."
1181
+ logger.info(message)
1182
+ elif final_state and final_state.get("error_message"):
1183
+ status = "error"
1184
+ message = final_state["error_message"]
1185
+ logger.error(f"Graph execution completed with error: {message}")
1186
+ elif final_state and final_state.get("final_report"):
1187
+ status = "completed"
1188
+ message = "Research process completed successfully."
1189
+ logger.info(message)
1190
+ else:
1191
+ # If it ends without error/report (e.g., empty plan, stopped before synthesis)
1192
+ status = "finished_incomplete"
1193
+ message = "Research process finished, but may be incomplete (no final report generated)."
1194
+ logger.warning(message)
1195
+
1196
+ except asyncio.CancelledError:
1197
+ status = "cancelled"
1198
+ message = f"Agent run task cancelled for {self.current_task_id}."
1199
+ logger.info(message)
1200
+ # final_state will remain None or the state before cancellation if checkpointing was used
1201
+ except Exception as e:
1202
+ status = "error"
1203
+ message = f"Unhandled error during graph execution for {self.current_task_id}: {e}"
1204
+ logger.error(message, exc_info=True)
1205
+ # final_state will remain None or the state before the error
1206
+ finally:
1207
+ logger.info(f"Cleaning up resources for task {self.current_task_id}")
1208
+ task_id_to_clean = self.current_task_id
1209
+
1210
+ self.stop_event = None
1211
+ self.current_task_id = None
1212
+ self.runner = None # Mark runner as finished
1213
+ if self.mcp_client:
1214
+ await self.mcp_client.__aexit__(None, None, None)
1215
+
1216
+ # Return a result dictionary including the status and the final state if available
1217
+ return {
1218
+ "status": status,
1219
+ "message": message,
1220
+ "task_id": task_id_to_clean, # Use the stored task_id
1221
+ "final_state": final_state
1222
+ if final_state
1223
+ else {}, # Return the final state dict
1224
+ }
1225
+
1226
+ async def _stop_lingering_browsers(self, task_id):
1227
+ """Attempts to stop any BrowserUseAgent instances associated with the task_id."""
1228
+ keys_to_stop = [
1229
+ key for key in _BROWSER_AGENT_INSTANCES if key.startswith(f"{task_id}_")
1230
+ ]
1231
+ if not keys_to_stop:
1232
+ return
1233
+
1234
+ logger.warning(
1235
+ f"Found {len(keys_to_stop)} potentially lingering browser agents for task {task_id}. Attempting stop..."
1236
+ )
1237
+ for key in keys_to_stop:
1238
+ agent_instance = _BROWSER_AGENT_INSTANCES.get(key)
1239
+ try:
1240
+ if agent_instance:
1241
+ # Assuming BU agent has an async stop method
1242
+ await agent_instance.stop()
1243
+ logger.info(f"Called stop() on browser agent instance {key}")
1244
+ except Exception as e:
1245
+ logger.error(
1246
+ f"Error calling stop() on browser agent instance {key}: {e}"
1247
+ )
1248
+
1249
+ async def stop(self):
1250
+ """Signals the currently running agent task to stop."""
1251
+ if not self.current_task_id or not self.stop_event:
1252
+ logger.info("No agent task is currently running.")
1253
+ return
1254
+
1255
+ logger.info(f"Stop requested for task ID: {self.current_task_id}")
1256
+ self.stop_event.set() # Signal the stop event
1257
+ self.stopped = True
1258
+ await self._stop_lingering_browsers(self.current_task_id)
1259
+
1260
+ def close(self):
1261
+ self.stopped = False
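+
+ # Minimal usage sketch (argument values are illustrative, not prescriptive):
+ #   agent = DeepResearchAgent(llm=my_llm, browser_config={"headless": True})
+ #   result = asyncio.run(agent.run("history of RISC-V", max_parallel_browsers=2))
+ #   print(result["status"], (result["final_state"] or {}).get("final_report", "")[:200])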
src/browser/custom_browser.py CHANGED
@@ -9,11 +9,23 @@ from playwright.async_api import (
9
  Playwright,
10
  async_playwright,
11
  )
12
- from browser_use.browser.browser import Browser
13
  from browser_use.browser.context import BrowserContext, BrowserContextConfig
14
  from playwright.async_api import BrowserContext as PlaywrightBrowserContext
15
  import logging
16
 
17
  from .custom_context import CustomBrowserContext
18
 
19
  logger = logging.getLogger(__name__)
@@ -21,8 +33,77 @@ logger = logging.getLogger(__name__)
21
 
22
  class CustomBrowser(Browser):
23
 
24
- async def new_context(
25
- self,
26
- config: BrowserContextConfig = BrowserContextConfig()
27
- ) -> CustomBrowserContext:
28
- return CustomBrowserContext(config=config, browser=self)
9
  Playwright,
10
  async_playwright,
11
  )
12
+ from browser_use.browser.browser import Browser, IN_DOCKER
13
  from browser_use.browser.context import BrowserContext, BrowserContextConfig
14
  from playwright.async_api import BrowserContext as PlaywrightBrowserContext
15
  import logging
16
 
17
+ from browser_use.browser.chrome import (
18
+ CHROME_ARGS,
19
+ CHROME_DETERMINISTIC_RENDERING_ARGS,
20
+ CHROME_DISABLE_SECURITY_ARGS,
21
+ CHROME_DOCKER_ARGS,
22
+ CHROME_HEADLESS_ARGS,
23
+ )
24
+ from browser_use.browser.context import BrowserContext, BrowserContextConfig
25
+ from browser_use.browser.utils.screen_resolution import get_screen_resolution, get_window_adjustments
26
+ from browser_use.utils import time_execution_async
27
+ import socket
28
+
29
  from .custom_context import CustomBrowserContext
30
 
31
  logger = logging.getLogger(__name__)
 
33
 
34
  class CustomBrowser(Browser):
35
 
36
+ async def new_context(self, config: BrowserContextConfig | None = None) -> CustomBrowserContext:
37
+ """Create a browser context"""
38
+ browser_config = self.config.model_dump() if self.config else {}
39
+ context_config = config.model_dump() if config else {}
40
+ merged_config = {**browser_config, **context_config}
41
+ return CustomBrowserContext(config=BrowserContextConfig(**merged_config), browser=self)
42
+
43
+ async def _setup_builtin_browser(self, playwright: Playwright) -> PlaywrightBrowser:
44
+ """Sets up and returns a Playwright Browser instance with anti-detection measures."""
45
+ assert self.config.browser_binary_path is None, 'browser_binary_path should be None if trying to use the builtin browsers'
46
+
47
+ # Use the configured window size from new_context_config if available
48
+ if (
49
+ not self.config.headless
50
+ and hasattr(self.config, 'new_context_config')
51
+ and hasattr(self.config.new_context_config, 'window_width')
52
+ and hasattr(self.config.new_context_config, 'window_height')
53
+ ):
54
+ screen_size = {
55
+ 'width': self.config.new_context_config.window_width,
56
+ 'height': self.config.new_context_config.window_height,
57
+ }
58
+ offset_x, offset_y = get_window_adjustments()
59
+ elif self.config.headless:
60
+ screen_size = {'width': 1920, 'height': 1080}
61
+ offset_x, offset_y = 0, 0
62
+ else:
63
+ screen_size = get_screen_resolution()
64
+ offset_x, offset_y = get_window_adjustments()
65
+
66
+ chrome_args = {
67
+ f'--remote-debugging-port={self.config.chrome_remote_debugging_port}',
68
+ *CHROME_ARGS,
69
+ *(CHROME_DOCKER_ARGS if IN_DOCKER else []),
70
+ *(CHROME_HEADLESS_ARGS if self.config.headless else []),
71
+ *(CHROME_DISABLE_SECURITY_ARGS if self.config.disable_security else []),
72
+ *(CHROME_DETERMINISTIC_RENDERING_ARGS if self.config.deterministic_rendering else []),
73
+ f'--window-position={offset_x},{offset_y}',
74
+ f'--window-size={screen_size["width"]},{screen_size["height"]}',
75
+ *self.config.extra_browser_args,
76
+ }
77
+
78
+ # check if chrome remote debugging port is already taken,
79
+ # if so remove the remote-debugging-port arg to prevent conflicts
80
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
81
+ if s.connect_ex(('localhost', self.config.chrome_remote_debugging_port)) == 0:
82
+ chrome_args.remove(f'--remote-debugging-port={self.config.chrome_remote_debugging_port}')
83
+
84
+ browser_class = getattr(playwright, self.config.browser_class)
85
+ args = {
86
+ 'chromium': list(chrome_args),
87
+ 'firefox': [
88
+ *{
89
+ '-no-remote',
90
+ *self.config.extra_browser_args,
91
+ }
92
+ ],
93
+ 'webkit': [
94
+ *{
95
+ '--no-startup-window',
96
+ *self.config.extra_browser_args,
97
+ }
98
+ ],
99
+ }
100
+
101
+ browser = await browser_class.launch(
102
+ channel='chromium', # https://github.com/microsoft/playwright/issues/33566
103
+ headless=self.config.headless,
104
+ args=args[self.config.browser_class],
105
+ proxy=self.config.proxy.model_dump() if self.config.proxy else None,
106
+ handle_sigterm=False,
107
+ handle_sigint=False,
108
+ )
109
+ return browser
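+
+ # Usage sketch: context-level settings win because context_config is unpacked
+ # last in new_context(); BrowserConfig/BrowserContextConfig come from browser_use.
+ #   browser = CustomBrowser(config=BrowserConfig(headless=True))
+ #   ctx = await browser.new_context(BrowserContextConfig(window_width=1280, window_height=800))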
src/browser/custom_context.py CHANGED
@@ -2,10 +2,12 @@ import json
2
  import logging
3
  import os
4
 
5
- from browser_use.browser.browser import Browser
6
  from browser_use.browser.context import BrowserContext, BrowserContextConfig
7
  from playwright.async_api import Browser as PlaywrightBrowser
8
  from playwright.async_api import BrowserContext as PlaywrightBrowserContext
9
 
10
  logger = logging.getLogger(__name__)
11
 
@@ -13,7 +15,8 @@ logger = logging.getLogger(__name__)
13
  class CustomBrowserContext(BrowserContext):
14
  def __init__(
15
  self,
16
- browser: "Browser",
17
- config: BrowserContextConfig = BrowserContextConfig()
 
18
  ):
19
- super(CustomBrowserContext, self).__init__(browser=browser, config=config)
 
2
  import logging
3
  import os
4
 
5
+ from browser_use.browser.browser import Browser, IN_DOCKER
6
  from browser_use.browser.context import BrowserContext, BrowserContextConfig
7
  from playwright.async_api import Browser as PlaywrightBrowser
8
  from playwright.async_api import BrowserContext as PlaywrightBrowserContext
9
+ from typing import Optional
10
+ from browser_use.browser.context import BrowserContextState
11
 
12
  logger = logging.getLogger(__name__)
13
 
 
15
  class CustomBrowserContext(BrowserContext):
16
  def __init__(
17
  self,
18
+ browser: 'Browser',
19
+ config: BrowserContextConfig | None = None,
20
+ state: Optional[BrowserContextState] = None,
21
  ):
22
+ super(CustomBrowserContext, self).__init__(browser=browser, config=config, state=state)
src/controller/custom_controller.py CHANGED
@@ -1,11 +1,12 @@
1
  import pdb
2
 
3
  import pyperclip
4
- from typing import Optional, Type
5
  from pydantic import BaseModel
6
  from browser_use.agent.views import ActionResult
7
  from browser_use.browser.context import BrowserContext
8
  from browser_use.controller.service import Controller, DoneAction
 
9
  from main_content_extractor import MainContentExtractor
10
  from browser_use.controller.views import (
11
  ClickElementAction,
@@ -20,30 +21,162 @@ from browser_use.controller.views import (
20
  SwitchTabAction,
21
  )
22
  import logging
23
 
24
  logger = logging.getLogger(__name__)
25
 
26
 
27
  class CustomController(Controller):
28
  def __init__(self, exclude_actions: list[str] = [],
29
- output_model: Optional[Type[BaseModel]] = None
30
  ):
31
  super().__init__(exclude_actions=exclude_actions, output_model=output_model)
32
  self._register_custom_actions()
33
 
34
  def _register_custom_actions(self):
35
  """Register all custom browser actions"""
36
 
37
- @self.registry.action("Copy text to clipboard")
38
- def copy_to_clipboard(text: str):
39
- pyperclip.copy(text)
40
- return ActionResult(extracted_content=text)
41
 
42
- @self.registry.action("Paste text from clipboard")
43
- async def paste_from_clipboard(browser: BrowserContext):
44
- text = pyperclip.paste()
45
- # send text to browser
46
- page = await browser.get_current_page()
47
- await page.keyboard.type(text)
48
 
49
- return ActionResult(extracted_content=text)
1
  import pdb
2
 
3
  import pyperclip
4
+ from typing import Optional, Type, Callable, Dict, Any, Union, Awaitable, TypeVar
5
  from pydantic import BaseModel
6
  from browser_use.agent.views import ActionResult
7
  from browser_use.browser.context import BrowserContext
8
  from browser_use.controller.service import Controller, DoneAction
9
+ from browser_use.controller.registry.service import Registry, RegisteredAction
10
  from main_content_extractor import MainContentExtractor
11
  from browser_use.controller.views import (
12
  ClickElementAction,
 
21
  SwitchTabAction,
22
  )
23
  import logging
24
+ import inspect
25
+ import asyncio
26
+ import os
27
+ from langchain_core.language_models.chat_models import BaseChatModel
28
+ from browser_use.agent.views import ActionModel, ActionResult
29
+
30
+ from src.utils.mcp_client import create_tool_param_model, setup_mcp_client_and_tools
31
+
32
+ from browser_use.utils import time_execution_sync
33
 
34
  logger = logging.getLogger(__name__)
35
 
36
+ Context = TypeVar('Context')
37
+
38
 
39
  class CustomController(Controller):
40
  def __init__(self, exclude_actions: list[str] = [],
41
+ output_model: Optional[Type[BaseModel]] = None,
42
+ ask_assistant_callback: Optional[Union[Callable[[str, BrowserContext], Dict[str, Any]], Callable[
43
+ [str, BrowserContext], Awaitable[Dict[str, Any]]]]] = None,
44
  ):
45
  super().__init__(exclude_actions=exclude_actions, output_model=output_model)
46
  self._register_custom_actions()
47
+ self.ask_assistant_callback = ask_assistant_callback
48
+ self.mcp_client = None
49
+ self.mcp_server_config = None
50
 
51
  def _register_custom_actions(self):
52
  """Register all custom browser actions"""
53
 
54
+ @self.registry.action(
55
+ "When executing tasks, prioritize autonomous completion. However, if you encounter a definitive blocker "
56
+ "that prevents you from proceeding independently – such as needing credentials you don't possess, "
57
+ "requiring subjective human judgment, needing a physical action performed, encountering complex CAPTCHAs, "
58
+ "or facing limitations in your capabilities – you must request human assistance."
59
+ )
60
+ async def ask_for_assistant(query: str, browser: BrowserContext):
61
+ if self.ask_assistant_callback:
62
+ if inspect.iscoroutinefunction(self.ask_assistant_callback):
63
+ user_response = await self.ask_assistant_callback(query, browser)
64
+ else:
65
+ user_response = self.ask_assistant_callback(query, browser)
66
+ msg = f"AI ask: {query}. User response: {user_response['response']}"
67
+ logger.info(msg)
68
+ return ActionResult(extracted_content=msg, include_in_memory=True)
69
+ else:
70
+ return ActionResult(extracted_content="Human cannot help you. Please try another way.",
71
+ include_in_memory=True)
72
+
73
+ @self.registry.action(
74
+ 'Upload file to interactive element with file path',
75
+ )
76
+ async def upload_file(index: int, path: str, browser: BrowserContext, available_file_paths: list[str]):
77
+ if path not in available_file_paths:
78
+ return ActionResult(error=f'File path {path} is not available')
79
+
80
+ if not os.path.exists(path):
81
+ return ActionResult(error=f'File {path} does not exist')
82
+
83
+ dom_el = await browser.get_dom_element_by_index(index)
84
+
85
+ file_upload_dom_el = dom_el.get_file_upload_element()
86
+
87
+ if file_upload_dom_el is None:
88
+ msg = f'No file upload element found at index {index}'
89
+ logger.info(msg)
90
+ return ActionResult(error=msg)
91
+
92
+ file_upload_el = await browser.get_locate_element(file_upload_dom_el)
93
+
94
+ if file_upload_el is None:
95
+ msg = f'No file upload element found at index {index}'
96
+ logger.info(msg)
97
+ return ActionResult(error=msg)
98
+
99
+ try:
100
+ await file_upload_el.set_input_files(path)
101
+ msg = f'Successfully uploaded file to index {index}'
102
+ logger.info(msg)
103
+ return ActionResult(extracted_content=msg, include_in_memory=True)
104
+ except Exception as e:
105
+ msg = f'Failed to upload file to index {index}: {str(e)}'
106
+ logger.info(msg)
107
+ return ActionResult(error=msg)
108
+
109
+ @time_execution_sync('--act')
110
+ async def act(
111
+ self,
112
+ action: ActionModel,
113
+ browser_context: Optional[BrowserContext] = None,
114
+ #
115
+ page_extraction_llm: Optional[BaseChatModel] = None,
116
+ sensitive_data: Optional[Dict[str, str]] = None,
117
+ available_file_paths: Optional[list[str]] = None,
118
+ #
119
+ context: Context | None = None,
120
+ ) -> ActionResult:
121
+ """Execute an action"""
122
+
123
+ try:
124
+ for action_name, params in action.model_dump(exclude_unset=True).items():
125
+ if params is not None:
126
+ if action_name.startswith("mcp"):
127
+ # this is a mcp tool
128
+ logger.debug(f"Invoke MCP tool: {action_name}")
129
+ mcp_tool = self.registry.registry.actions.get(action_name).function
130
+ result = await mcp_tool.ainvoke(params)
131
+ else:
132
+ result = await self.registry.execute_action(
133
+ action_name,
134
+ params,
135
+ browser=browser_context,
136
+ page_extraction_llm=page_extraction_llm,
137
+ sensitive_data=sensitive_data,
138
+ available_file_paths=available_file_paths,
139
+ context=context,
140
+ )
141
+
142
+ if isinstance(result, str):
143
+ return ActionResult(extracted_content=result)
144
+ elif isinstance(result, ActionResult):
145
+ return result
146
+ elif result is None:
147
+ return ActionResult()
148
+ else:
149
+ raise ValueError(f'Invalid action result type: {type(result)} of {result}')
150
+ return ActionResult()
151
+ except Exception as e:
152
+ raise e
153
+
154
+ async def setup_mcp_client(self, mcp_server_config: Optional[Dict[str, Any]] = None):
155
+ self.mcp_server_config = mcp_server_config
156
+ if self.mcp_server_config:
157
+ self.mcp_client = await setup_mcp_client_and_tools(self.mcp_server_config)
158
+ self.register_mcp_tools()
159
 
160
+ def register_mcp_tools(self):
161
+ """
162
+ Register the MCP tools used by this controller.
163
+ """
164
+ if self.mcp_client:
165
+ for server_name in self.mcp_client.server_name_to_tools:
166
+ for tool in self.mcp_client.server_name_to_tools[server_name]:
167
+ tool_name = f"mcp.{server_name}.{tool.name}"
168
+ self.registry.registry.actions[tool_name] = RegisteredAction(
169
+ name=tool_name,
170
+ description=tool.description,
171
+ function=tool,
172
+ param_model=create_tool_param_model(tool),
173
+ )
174
+ logger.info(f"Add mcp tool: {tool_name}")
175
+ logger.debug(
176
+ f"Registered {len(self.mcp_client.server_name_to_tools[server_name])} mcp tools for {server_name}")
177
+ else:
178
+ logger.warning(f"MCP client not started.")
179
 
180
+ async def close_mcp_client(self):
181
+ if self.mcp_client:
182
+ await self.mcp_client.__aexit__(None, None, None)
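+
+ # Usage sketch (the server-config shape is assumed from setup_mcp_client_and_tools):
+ #   controller = CustomController(ask_assistant_callback=my_ask_callback)
+ #   await controller.setup_mcp_client({"my_server": {"command": "...", "args": []}})
+ #   # act() now dispatches actions named "mcp.<server>.<tool>" to the MCP client
+ #   await controller.close_mcp_client()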
src/utils/config.py ADDED
@@ -0,0 +1,100 @@
1
+ PROVIDER_DISPLAY_NAMES = {
2
+ "openai": "OpenAI",
3
+ "azure_openai": "Azure OpenAI",
4
+ "anthropic": "Anthropic",
5
+ "deepseek": "DeepSeek",
6
+ "google": "Google",
7
+ "alibaba": "Alibaba",
8
+ "moonshot": "MoonShot",
9
+ "unbound": "Unbound AI",
10
+ "ibm": "IBM",
11
+ "grok": "Grok",
12
+ }
13
+
14
+ # Predefined model names for common providers
15
+ model_names = {
16
+ "anthropic": ["claude-3-5-sonnet-20241022", "claude-3-5-sonnet-20240620", "claude-3-opus-20240229"],
17
+ "openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo", "o3-mini"],
18
+ "deepseek": ["deepseek-chat", "deepseek-reasoner"],
19
+ "google": ["gemini-2.0-flash", "gemini-2.0-flash-thinking-exp", "gemini-1.5-flash-latest",
20
+ "gemini-1.5-flash-8b-latest", "gemini-2.0-flash-thinking-exp-01-21", "gemini-2.0-pro-exp-02-05",
21
+ "gemini-2.5-pro-preview-03-25", "gemini-2.5-flash-preview-04-17"],
22
+ "ollama": ["qwen2.5:7b", "qwen2.5:14b", "qwen2.5:32b", "qwen2.5-coder:14b", "qwen2.5-coder:32b", "llama2:7b",
23
+ "deepseek-r1:14b", "deepseek-r1:32b"],
24
+ "azure_openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo"],
25
+ "mistral": ["pixtral-large-latest", "mistral-large-latest", "mistral-small-latest", "ministral-8b-latest"],
26
+ "alibaba": ["qwen-plus", "qwen-max", "qwen-vl-max", "qwen-vl-plus", "qwen-turbo", "qwen-long"],
27
+ "moonshot": ["moonshot-v1-32k-vision-preview", "moonshot-v1-8k-vision-preview"],
28
+ "unbound": ["gemini-2.0-flash", "gpt-4o-mini", "gpt-4o", "gpt-4.5-preview"],
29
+ "grok": [
30
+ "grok-3",
31
+ "grok-3-fast",
32
+ "grok-3-mini",
33
+ "grok-3-mini-fast",
34
+ "grok-2-vision",
35
+ "grok-2-image",
36
+ "grok-2",
37
+ ],
38
+ "siliconflow": [
39
+ "deepseek-ai/DeepSeek-R1",
40
+ "deepseek-ai/DeepSeek-V3",
41
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
42
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
43
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
44
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
45
+ "deepseek-ai/DeepSeek-V2.5",
46
+ "deepseek-ai/deepseek-vl2",
47
+ "Qwen/Qwen2.5-72B-Instruct-128K",
48
+ "Qwen/Qwen2.5-72B-Instruct",
49
+ "Qwen/Qwen2.5-32B-Instruct",
50
+ "Qwen/Qwen2.5-14B-Instruct",
51
+ "Qwen/Qwen2.5-7B-Instruct",
52
+ "Qwen/Qwen2.5-Coder-32B-Instruct",
53
+ "Qwen/Qwen2.5-Coder-7B-Instruct",
54
+ "Qwen/Qwen2-7B-Instruct",
55
+ "Qwen/Qwen2-1.5B-Instruct",
56
+ "Qwen/QwQ-32B-Preview",
57
+ "Qwen/Qwen2-VL-72B-Instruct",
58
+ "Qwen/Qwen2.5-VL-32B-Instruct",
59
+ "Qwen/Qwen2.5-VL-72B-Instruct",
60
+ "TeleAI/TeleChat2",
61
+ "THUDM/glm-4-9b-chat",
62
+ "Vendor-A/Qwen/Qwen2.5-72B-Instruct",
63
+ "internlm/internlm2_5-7b-chat",
64
+ "internlm/internlm2_5-20b-chat",
65
+ "Pro/Qwen/Qwen2.5-7B-Instruct",
66
+ "Pro/Qwen/Qwen2-7B-Instruct",
67
+ "Pro/Qwen/Qwen2-1.5B-Instruct",
68
+ "Pro/THUDM/chatglm3-6b",
69
+ "Pro/THUDM/glm-4-9b-chat",
70
+ ],
71
+ "ibm": ["ibm/granite-vision-3.1-2b-preview", "meta-llama/llama-4-maverick-17b-128e-instruct-fp8",
72
+ "meta-llama/llama-3-2-90b-vision-instruct"],
73
+ "modelscope":[
74
+ "Qwen/Qwen2.5-Coder-32B-Instruct",
75
+ "Qwen/Qwen2.5-Coder-14B-Instruct",
76
+ "Qwen/Qwen2.5-Coder-7B-Instruct",
77
+ "Qwen/Qwen2.5-72B-Instruct",
78
+ "Qwen/Qwen2.5-32B-Instruct",
79
+ "Qwen/Qwen2.5-14B-Instruct",
80
+ "Qwen/Qwen2.5-7B-Instruct",
81
+ "Qwen/QwQ-32B-Preview",
82
+ "Qwen/Qwen2.5-VL-3B-Instruct",
83
+ "Qwen/Qwen2.5-VL-7B-Instruct",
84
+ "Qwen/Qwen2.5-VL-32B-Instruct",
85
+ "Qwen/Qwen2.5-VL-72B-Instruct",
86
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
87
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
88
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
89
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
90
+ "deepseek-ai/DeepSeek-R1",
91
+ "deepseek-ai/DeepSeek-V3",
92
+ "Qwen/Qwen3-1.7B",
93
+ "Qwen/Qwen3-4B",
94
+ "Qwen/Qwen3-8B",
95
+ "Qwen/Qwen3-14B",
96
+ "Qwen/Qwen3-30B-A3B",
97
+ "Qwen/Qwen3-32B",
98
+ "Qwen/Qwen3-235B-A22B",
99
+ ],
100
+ }
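+
+ # Example lookup (illustrative): UI code can build a provider dropdown like this.
+ #   label = PROVIDER_DISPLAY_NAMES.get("openai", "openai")  # -> "OpenAI"
+ #   models = model_names.get("openai", [])                  # -> ["gpt-4o", "gpt-4", ...]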
src/utils/llm_provider.py ADDED
@@ -0,0 +1,354 @@
1
+ from openai import OpenAI
2
+ import pdb
3
+ from langchain_openai import ChatOpenAI
4
+ from langchain_core.globals import get_llm_cache
5
+ from langchain_core.language_models.base import (
6
+ BaseLanguageModel,
7
+ LangSmithParams,
8
+ LanguageModelInput,
9
+ )
10
+ import os
11
+ from langchain_core.load import dumpd, dumps
12
+ from langchain_core.messages import (
13
+ AIMessage,
14
+ SystemMessage,
15
+ AnyMessage,
16
+ BaseMessage,
17
+ BaseMessageChunk,
18
+ HumanMessage,
19
+ convert_to_messages,
20
+ message_chunk_to_message,
21
+ )
22
+ from langchain_core.outputs import (
23
+ ChatGeneration,
24
+ ChatGenerationChunk,
25
+ ChatResult,
26
+ LLMResult,
27
+ RunInfo,
28
+ )
29
+ from langchain_ollama import ChatOllama
30
+ from langchain_core.output_parsers.base import OutputParserLike
31
+ from langchain_core.runnables import Runnable, RunnableConfig
32
+ from langchain_core.tools import BaseTool
33
+
34
+ from typing import (
35
+ TYPE_CHECKING,
36
+ Any,
37
+ Callable,
38
+ Literal,
39
+ Optional,
40
+ Union,
41
+ cast, List,
42
+ )
43
+ from langchain_anthropic import ChatAnthropic
44
+ from langchain_mistralai import ChatMistralAI
45
+ from langchain_google_genai import ChatGoogleGenerativeAI
46
+ from langchain_ollama import ChatOllama
47
+ from langchain_openai import AzureChatOpenAI, ChatOpenAI
48
+ from langchain_ibm import ChatWatsonx
49
+ from langchain_aws import ChatBedrock
50
+ from pydantic import SecretStr
51
+
52
+ from src.utils import config
53
+
54
+
55
+ class DeepSeekR1ChatOpenAI(ChatOpenAI):
56
+
57
+ def __init__(self, *args: Any, **kwargs: Any) -> None:
58
+ super().__init__(*args, **kwargs)
59
+ self.client = OpenAI(
60
+ base_url=kwargs.get("base_url"),
61
+ api_key=kwargs.get("api_key")
62
+ )
63
+
64
+ async def ainvoke(
65
+ self,
66
+ input: LanguageModelInput,
67
+ config: Optional[RunnableConfig] = None,
68
+ *,
69
+ stop: Optional[list[str]] = None,
70
+ **kwargs: Any,
71
+ ) -> AIMessage:
72
+ message_history = []
73
+ for input_ in input:
74
+ if isinstance(input_, SystemMessage):
75
+ message_history.append({"role": "system", "content": input_.content})
76
+ elif isinstance(input_, AIMessage):
77
+ message_history.append({"role": "assistant", "content": input_.content})
78
+ else:
79
+ message_history.append({"role": "user", "content": input_.content})
80
+
81
+ response = self.client.chat.completions.create(
82
+ model=self.model_name,
83
+ messages=message_history
84
+ )
85
+
86
+ reasoning_content = response.choices[0].message.reasoning_content
87
+ content = response.choices[0].message.content
88
+ return AIMessage(content=content, reasoning_content=reasoning_content)
89
+
90
+ def invoke(
91
+ self,
92
+ input: LanguageModelInput,
93
+ config: Optional[RunnableConfig] = None,
94
+ *,
95
+ stop: Optional[list[str]] = None,
96
+ **kwargs: Any,
97
+ ) -> AIMessage:
98
+ message_history = []
99
+ for input_ in input:
100
+ if isinstance(input_, SystemMessage):
101
+ message_history.append({"role": "system", "content": input_.content})
102
+ elif isinstance(input_, AIMessage):
103
+ message_history.append({"role": "assistant", "content": input_.content})
104
+ else:
105
+ message_history.append({"role": "user", "content": input_.content})
106
+
107
+ response = self.client.chat.completions.create(
108
+ model=self.model_name,
109
+ messages=message_history
110
+ )
111
+
112
+ reasoning_content = response.choices[0].message.reasoning_content
113
+ content = response.choices[0].message.content
114
+ return AIMessage(content=content, reasoning_content=reasoning_content)
115
+
116
+
117
+ class DeepSeekR1ChatOllama(ChatOllama):
118
+
119
+ async def ainvoke(
120
+ self,
121
+ input: LanguageModelInput,
122
+ config: Optional[RunnableConfig] = None,
123
+ *,
124
+ stop: Optional[list[str]] = None,
125
+ **kwargs: Any,
126
+ ) -> AIMessage:
127
+ org_ai_message = await super().ainvoke(input=input)
128
+ org_content = org_ai_message.content
129
+ # Split out the <think> reasoning block; fall back gracefully if it is absent.
+ reasoning_content, _sep, content = org_content.replace("<think>", "").partition("</think>")
130
+ if not _sep:
+ content, reasoning_content = org_content, ""
131
+ if "**JSON Response:**" in content:
132
+ content = content.split("**JSON Response:**")[-1]
133
+ return AIMessage(content=content, reasoning_content=reasoning_content)
134
+
135
+ def invoke(
136
+ self,
137
+ input: LanguageModelInput,
138
+ config: Optional[RunnableConfig] = None,
139
+ *,
140
+ stop: Optional[list[str]] = None,
141
+ **kwargs: Any,
142
+ ) -> AIMessage:
143
+ org_ai_message = super().invoke(input=input)
144
+ org_content = org_ai_message.content
145
+ # Split out the <think> reasoning block; fall back gracefully if it is absent.
+ reasoning_content, _sep, content = org_content.replace("<think>", "").partition("</think>")
146
+ if not _sep:
+ content, reasoning_content = org_content, ""
147
+ if "**JSON Response:**" in content:
148
+ content = content.split("**JSON Response:**")[-1]
149
+ return AIMessage(content=content, reasoning_content=reasoning_content)
150
+
151
+
152
+ def get_llm_model(provider: str, **kwargs):
153
+ """
154
+ Get LLM model
155
+ :param provider: LLM provider
156
+ :param kwargs:
157
+ :return:
158
+ """
159
+ if provider not in ["ollama", "bedrock"]:
160
+ env_var = f"{provider.upper()}_API_KEY"
161
+ api_key = kwargs.get("api_key", "") or os.getenv(env_var, "")
162
+ if not api_key:
163
+ provider_display = config.PROVIDER_DISPLAY_NAMES.get(provider, provider.upper())
164
+ error_msg = f"💥 {provider_display} API key not found! 🔑 Please set the `{env_var}` environment variable or provide it in the UI."
165
+ raise ValueError(error_msg)
166
+ kwargs["api_key"] = api_key
167
+
168
+ if provider == "anthropic":
169
+ if not kwargs.get("base_url", ""):
170
+ base_url = "https://api.anthropic.com"
171
+ else:
172
+ base_url = kwargs.get("base_url")
173
+
174
+ return ChatAnthropic(
175
+ model=kwargs.get("model_name", "claude-3-5-sonnet-20241022"),
176
+ temperature=kwargs.get("temperature", 0.0),
177
+ base_url=base_url,
178
+ api_key=api_key,
179
+ )
180
+ elif provider == 'mistral':
181
+ if not kwargs.get("base_url", ""):
182
+ base_url = os.getenv("MISTRAL_ENDPOINT", "https://api.mistral.ai/v1")
183
+ else:
184
+ base_url = kwargs.get("base_url")
185
+ if not kwargs.get("api_key", ""):
186
+ api_key = os.getenv("MISTRAL_API_KEY", "")
187
+ else:
188
+ api_key = kwargs.get("api_key")
189
+
190
+ return ChatMistralAI(
191
+ model=kwargs.get("model_name", "mistral-large-latest"),
192
+ temperature=kwargs.get("temperature", 0.0),
193
+ base_url=base_url,
194
+ api_key=api_key,
195
+ )
196
+ elif provider == "openai":
197
+ if not kwargs.get("base_url", ""):
198
+ base_url = os.getenv("OPENAI_ENDPOINT", "https://api.openai.com/v1")
199
+ else:
200
+ base_url = kwargs.get("base_url")
201
+
202
+ return ChatOpenAI(
203
+ model=kwargs.get("model_name", "gpt-4o"),
204
+ temperature=kwargs.get("temperature", 0.0),
205
+ base_url=base_url,
206
+ api_key=api_key,
207
+ )
208
+ elif provider == "grok":
209
+ if not kwargs.get("base_url", ""):
210
+ base_url = os.getenv("GROK_ENDPOINT", "https://api.x.ai/v1")
211
+ else:
212
+ base_url = kwargs.get("base_url")
213
+
214
+ return ChatOpenAI(
215
+ model=kwargs.get("model_name", "grok-3"),
216
+ temperature=kwargs.get("temperature", 0.0),
217
+ base_url=base_url,
218
+ api_key=api_key,
219
+ )
220
+ elif provider == "deepseek":
221
+ if not kwargs.get("base_url", ""):
222
+ base_url = os.getenv("DEEPSEEK_ENDPOINT", "")
223
+ else:
224
+ base_url = kwargs.get("base_url")
225
+
226
+ if kwargs.get("model_name", "deepseek-chat") == "deepseek-reasoner":
227
+ return DeepSeekR1ChatOpenAI(
228
+ model=kwargs.get("model_name", "deepseek-reasoner"),
229
+ temperature=kwargs.get("temperature", 0.0),
230
+ base_url=base_url,
231
+ api_key=api_key,
232
+ )
233
+ else:
234
+ return ChatOpenAI(
235
+ model=kwargs.get("model_name", "deepseek-chat"),
236
+ temperature=kwargs.get("temperature", 0.0),
237
+ base_url=base_url,
238
+ api_key=api_key,
239
+ )
240
+ elif provider == "google":
241
+ return ChatGoogleGenerativeAI(
242
+ model=kwargs.get("model_name", "gemini-2.0-flash-exp"),
243
+ temperature=kwargs.get("temperature", 0.0),
244
+ api_key=api_key,
245
+ )
246
+ elif provider == "ollama":
247
+ if not kwargs.get("base_url", ""):
248
+ base_url = os.getenv("OLLAMA_ENDPOINT", "http://localhost:11434")
249
+ else:
250
+ base_url = kwargs.get("base_url")
251
+
252
+ if "deepseek-r1" in kwargs.get("model_name", "qwen2.5:7b"):
253
+ return DeepSeekR1ChatOllama(
254
+ model=kwargs.get("model_name", "deepseek-r1:14b"),
255
+ temperature=kwargs.get("temperature", 0.0),
256
+ num_ctx=kwargs.get("num_ctx", 32000),
257
+ base_url=base_url,
258
+ )
259
+ else:
260
+ return ChatOllama(
261
+ model=kwargs.get("model_name", "qwen2.5:7b"),
262
+ temperature=kwargs.get("temperature", 0.0),
263
+ num_ctx=kwargs.get("num_ctx", 32000),
264
+ num_predict=kwargs.get("num_predict", 1024),
265
+ base_url=base_url,
266
+ )
267
+ elif provider == "azure_openai":
268
+ if not kwargs.get("base_url", ""):
269
+ base_url = os.getenv("AZURE_OPENAI_ENDPOINT", "")
270
+ else:
271
+ base_url = kwargs.get("base_url")
272
+ api_version = kwargs.get("api_version", "") or os.getenv("AZURE_OPENAI_API_VERSION", "2025-01-01-preview")
273
+ return AzureChatOpenAI(
274
+ model=kwargs.get("model_name", "gpt-4o"),
275
+ temperature=kwargs.get("temperature", 0.0),
276
+ api_version=api_version,
277
+ azure_endpoint=base_url,
278
+ api_key=api_key,
279
+ )
280
+ elif provider == "alibaba":
281
+ if not kwargs.get("base_url", ""):
282
+ base_url = os.getenv("ALIBABA_ENDPOINT", "https://dashscope.aliyuncs.com/compatible-mode/v1")
283
+ else:
284
+ base_url = kwargs.get("base_url")
285
+
286
+ return ChatOpenAI(
287
+ model=kwargs.get("model_name", "qwen-plus"),
288
+ temperature=kwargs.get("temperature", 0.0),
289
+ base_url=base_url,
290
+ api_key=api_key,
291
+ )
292
+ elif provider == "ibm":
293
+ parameters = {
294
+ "temperature": kwargs.get("temperature", 0.0),
295
+ "max_tokens": kwargs.get("num_ctx", 32000)
296
+ }
297
+ if not kwargs.get("base_url", ""):
298
+ base_url = os.getenv("IBM_ENDPOINT", "https://us-south.ml.cloud.ibm.com")
299
+ else:
300
+ base_url = kwargs.get("base_url")
301
+
302
+ return ChatWatsonx(
303
+ model_id=kwargs.get("model_name", "ibm/granite-vision-3.1-2b-preview"),
304
+ url=base_url,
305
+ project_id=os.getenv("IBM_PROJECT_ID"),
306
+ apikey=os.getenv("IBM_API_KEY"),
307
+ params=parameters
308
+ )
309
+ elif provider == "moonshot":
310
+ return ChatOpenAI(
311
+ model=kwargs.get("model_name", "moonshot-v1-32k-vision-preview"),
312
+ temperature=kwargs.get("temperature", 0.0),
313
+ base_url=os.getenv("MOONSHOT_ENDPOINT"),
314
+ api_key=os.getenv("MOONSHOT_API_KEY"),
315
+ )
316
+ elif provider == "unbound":
317
+ return ChatOpenAI(
318
+ model=kwargs.get("model_name", "gpt-4o-mini"),
319
+ temperature=kwargs.get("temperature", 0.0),
320
+ base_url=os.getenv("UNBOUND_ENDPOINT", "https://api.getunbound.ai"),
321
+ api_key=api_key,
322
+ )
323
+ elif provider == "siliconflow":
324
+ if not kwargs.get("api_key", ""):
325
+ api_key = os.getenv("SiliconFLOW_API_KEY", "")
326
+ else:
327
+ api_key = kwargs.get("api_key")
328
+ if not kwargs.get("base_url", ""):
329
+ base_url = os.getenv("SiliconFLOW_ENDPOINT", "")
330
+ else:
331
+ base_url = kwargs.get("base_url")
332
+ return ChatOpenAI(
333
+ api_key=api_key,
334
+ base_url=base_url,
335
+ model_name=kwargs.get("model_name", "Qwen/QwQ-32B"),
336
+ temperature=kwargs.get("temperature", 0.0),
337
+ )
338
+ elif provider == "modelscope":
339
+ if not kwargs.get("api_key", ""):
340
+ api_key = os.getenv("MODELSCOPE_API_KEY", "")
341
+ else:
342
+ api_key = kwargs.get("api_key")
343
+ if not kwargs.get("base_url", ""):
344
+ base_url = os.getenv("MODELSCOPE_ENDPOINT", "")
345
+ else:
346
+ base_url = kwargs.get("base_url")
347
+ return ChatOpenAI(
348
+ api_key=api_key,
349
+ base_url=base_url,
350
+ model_name=kwargs.get("model_name", "Qwen/QwQ-32B"),
351
+ temperature=kwargs.get("temperature", 0.0),
352
+ )
353
+ else:
354
+ raise ValueError(f"Unsupported provider: {provider}")
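
Review note: a minimal usage sketch for the `get_llm_model` factory above. It assumes `OPENAI_API_KEY` is set in the environment; the model name and temperature here are illustrative values, not defaults required by this module:

    from src.utils.llm_provider import get_llm_model

    # The factory resolves the API key from kwargs or the <PROVIDER>_API_KEY
    # environment variable, then returns the matching LangChain chat model.
    llm = get_llm_model(provider="openai", model_name="gpt-4o", temperature=0.0)
    print(llm.invoke("Reply with one word: hello").content)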
src/utils/mcp_client.py ADDED
@@ -0,0 +1,254 @@
+ import inspect
+ import logging
+ import uuid
+ from datetime import date, datetime, time
+ from enum import Enum
+ from typing import Any, Dict, List, Optional, Set, Type, Union, get_type_hints
+ 
+ from browser_use.controller.registry.views import ActionModel
+ from langchain.tools import BaseTool
+ from langchain_mcp_adapters.client import MultiServerMCPClient
+ from pydantic import BaseModel, Field, create_model
+ 
+ logger = logging.getLogger(__name__)
+ 
+ 
+ async def setup_mcp_client_and_tools(mcp_server_config: Dict[str, Any]) -> Optional[MultiServerMCPClient]:
+     """
+     Initializes the MultiServerMCPClient and connects to the configured servers.
+ 
+     Returns:
+         MultiServerMCPClient | None: The initialized and started client instance, or None on failure.
+     """
+ 
+     logger.info("Initializing MultiServerMCPClient...")
+ 
+     if not mcp_server_config:
+         logger.error("No MCP server configuration provided.")
+         return None
+ 
+     try:
+         if "mcpServers" in mcp_server_config:
+             mcp_server_config = mcp_server_config["mcpServers"]
+         client = MultiServerMCPClient(mcp_server_config)
+         await client.__aenter__()
+         return client
+ 
+     except Exception as e:
+         logger.error(f"Failed to setup MCP client or fetch tools: {e}", exc_info=True)
+         return None
+ 
+ 
+ def create_tool_param_model(tool: BaseTool) -> Type[BaseModel]:
+     """Creates a Pydantic model from a LangChain tool's schema"""
+ 
+     # Get tool schema information
+     json_schema = tool.args_schema
+     tool_name = tool.name
+ 
+     # If the tool already has a schema defined, convert it to a new param_model
+     if json_schema is not None:
+ 
+         # Create new parameter model
+         params = {}
+ 
+         # Process properties if they exist
+         if 'properties' in json_schema:
+             # Find required fields
+             required_fields: Set[str] = set(json_schema.get('required', []))
+ 
+             for prop_name, prop_details in json_schema['properties'].items():
+                 field_type = resolve_type(prop_details, f"{tool_name}_{prop_name}")
+ 
+                 # Check if parameter is required
+                 is_required = prop_name in required_fields
+ 
+                 # Get default value and description
+                 default_value = prop_details.get('default', ... if is_required else None)
+                 description = prop_details.get('description', '')
+ 
+                 # Add field constraints
+                 field_kwargs = {'default': default_value}
+                 if description:
+                     field_kwargs['description'] = description
+ 
+                 # Add additional constraints if present
+                 if 'minimum' in prop_details:
+                     field_kwargs['ge'] = prop_details['minimum']
+                 if 'maximum' in prop_details:
+                     field_kwargs['le'] = prop_details['maximum']
+                 if 'minLength' in prop_details:
+                     field_kwargs['min_length'] = prop_details['minLength']
+                 if 'maxLength' in prop_details:
+                     field_kwargs['max_length'] = prop_details['maxLength']
+                 if 'pattern' in prop_details:
+                     field_kwargs['pattern'] = prop_details['pattern']
+ 
+                 # Add to parameters dictionary
+                 params[prop_name] = (field_type, Field(**field_kwargs))
+ 
+         return create_model(
+             f'{tool_name}_parameters',
+             __base__=ActionModel,
+             **params,  # type: ignore
+         )
+ 
+     # If no schema is defined, extract parameters from the _run method
+     run_method = tool._run
+     sig = inspect.signature(run_method)
+ 
+     # Get type hints for better type information
+     try:
+         type_hints = get_type_hints(run_method)
+     except Exception:
+         type_hints = {}
+ 
+     params = {}
+     for name, param in sig.parameters.items():
+         # Skip 'self' parameter and any other parameters you want to exclude
+         if name == 'self':
+             continue
+ 
+         # Get annotation from type hints if available, otherwise from signature
+         annotation = type_hints.get(name, param.annotation)
+         if annotation == inspect.Parameter.empty:
+             annotation = Any
+ 
+         # Use default value if available, otherwise make it required
+         if param.default != param.empty:
+             params[name] = (annotation, param.default)
+         else:
+             params[name] = (annotation, ...)
+ 
+     return create_model(
+         f'{tool_name}_parameters',
+         __base__=ActionModel,
+         **params,  # type: ignore
+     )
+ 
+ 
+ def resolve_type(prop_details: Dict[str, Any], prefix: str = "") -> Any:
+     """Recursively resolves JSON schema type to Python/Pydantic type"""
+ 
+     # Handle reference types
+     if '$ref' in prop_details:
+         # In a real application, reference resolution would be needed
+         return Any
+ 
+     # Basic type mapping
+     type_mapping = {
+         'string': str,
+         'integer': int,
+         'number': float,
+         'boolean': bool,
+         'array': List,
+         'object': Dict,
+         'null': type(None),
+     }
+ 
+     # Handle formatted strings
+     if prop_details.get('type') == 'string' and 'format' in prop_details:
+         format_mapping = {
+             'date-time': datetime,
+             'date': date,
+             'time': time,
+             'email': str,
+             'uri': str,
+             'url': str,
+             'uuid': uuid.UUID,
+             'binary': bytes,
+         }
+         return format_mapping.get(prop_details['format'], str)
+ 
+     # Handle enum types
+     if 'enum' in prop_details:
+         enum_values = prop_details['enum']
+         # Create dynamic enum class with safe names
+         enum_dict = {}
+         for i, v in enumerate(enum_values):
+             # Ensure enum names are valid Python identifiers
+             if isinstance(v, str):
+                 key = v.upper().replace(' ', '_').replace('-', '_')
+                 if not key.isidentifier():
+                     key = f"VALUE_{i}"
+             else:
+                 key = f"VALUE_{i}"
+             enum_dict[key] = v
+ 
+         # Only create enum if we have values
+         if enum_dict:
+             return Enum(f"{prefix}_Enum", enum_dict)
+         return str  # Fallback
+ 
+     # Handle array types
+     if prop_details.get('type') == 'array' and 'items' in prop_details:
+         item_type = resolve_type(prop_details['items'], f"{prefix}_item")
+         return List[item_type]  # type: ignore
+ 
+     # Handle object types with properties
+     if prop_details.get('type') == 'object' and 'properties' in prop_details:
+         nested_params = {}
+         for nested_name, nested_details in prop_details['properties'].items():
+             nested_type = resolve_type(nested_details, f"{prefix}_{nested_name}")
+             # Get required field info
+             required_fields = prop_details.get('required', [])
+             is_required = nested_name in required_fields
+             default_value = nested_details.get('default', ... if is_required else None)
+             description = nested_details.get('description', '')
+ 
+             field_kwargs = {'default': default_value}
+             if description:
+                 field_kwargs['description'] = description
+ 
+             nested_params[nested_name] = (nested_type, Field(**field_kwargs))
+ 
+         # Create nested model
+         nested_model = create_model(f"{prefix}_Model", **nested_params)
+         return nested_model
+ 
+     # Handle union types (oneOf, anyOf)
+     if 'oneOf' in prop_details or 'anyOf' in prop_details:
+         union_schema = prop_details.get('oneOf') or prop_details.get('anyOf')
+         union_types = []
+         for i, t in enumerate(union_schema):
+             union_types.append(resolve_type(t, f"{prefix}_{i}"))
+ 
+         if union_types:
+             return Union.__getitem__(tuple(union_types))  # type: ignore
+         return Any
+ 
+     # Handle allOf (intersection types)
+     if 'allOf' in prop_details:
+         nested_params = {}
+         for i, schema_part in enumerate(prop_details['allOf']):
+             if 'properties' in schema_part:
+                 for nested_name, nested_details in schema_part['properties'].items():
+                     nested_type = resolve_type(nested_details, f"{prefix}_allOf_{i}_{nested_name}")
+                     # Check if required
+                     required_fields = schema_part.get('required', [])
+                     is_required = nested_name in required_fields
+                     nested_params[nested_name] = (nested_type, ... if is_required else None)
+ 
+         # Create composite model
+         if nested_params:
+             composite_model = create_model(f"{prefix}_CompositeModel", **nested_params)
+             return composite_model
+         return Dict
+ 
+     # Default to basic types
+     schema_type = prop_details.get('type', 'string')
+     if isinstance(schema_type, list):
+         # Handle multiple types (e.g., ["string", "null"])
+         non_null_types = [t for t in schema_type if t != 'null']
+         if non_null_types:
+             primary_type = type_mapping.get(non_null_types[0], Any)
+             if 'null' in schema_type:
+                 return Optional[primary_type]  # type: ignore
+             return primary_type
+         return Any
+ 
+     return type_mapping.get(schema_type, Any)
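
Review note: a small sketch of how `resolve_type` above maps JSON-schema fragments onto Python typing constructs (the schema literals are made up for illustration):

    from typing import List, Optional

    from src.utils.mcp_client import resolve_type

    assert resolve_type({"type": "string"}) is str
    # Arrays recurse into their item schema.
    assert resolve_type({"type": "array", "items": {"type": "integer"}}) == List[int]
    # A ["string", "null"] type list collapses to Optional[str].
    assert resolve_type({"type": ["string", "null"]}) == Optional[str]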
src/utils/utils.py CHANGED
@@ -8,262 +8,6 @@ import json
  import gradio as gr
  import uuid
  
- from langchain_anthropic import ChatAnthropic
- from langchain_mistralai import ChatMistralAI
- from langchain_google_genai import ChatGoogleGenerativeAI
- from langchain_ollama import ChatOllama
- from langchain_openai import AzureChatOpenAI, ChatOpenAI
- 
- from .llm import DeepSeekR1ChatOpenAI, DeepSeekR1ChatOllama
- 
- PROVIDER_DISPLAY_NAMES = {
-     "openai": "OpenAI",
-     "azure_openai": "Azure OpenAI",
-     "anthropic": "Anthropic",
-     "deepseek": "DeepSeek",
-     "google": "Google",
-     "alibaba": "Alibaba",
-     "moonshot": "MoonShot",
-     "unbound": "Unbound AI"
- }
- 
- 
- def get_llm_model(provider: str, **kwargs):
-     """
-     Get LLM model
-     :param provider: model type
-     :param kwargs:
-     :return:
-     """
-     if provider not in ["ollama"]:
-         env_var = f"{provider.upper()}_API_KEY"
-         api_key = kwargs.get("api_key", "") or os.getenv(env_var, "")
-         if not api_key:
-             raise MissingAPIKeyError(provider, env_var)
-         kwargs["api_key"] = api_key
- 
-     if provider == "anthropic":
-         if not kwargs.get("base_url", ""):
-             base_url = "https://api.anthropic.com"
-         else:
-             base_url = kwargs.get("base_url")
- 
-         return ChatAnthropic(
-             model=kwargs.get("model_name", "claude-3-5-sonnet-20241022"),
-             temperature=kwargs.get("temperature", 0.0),
-             base_url=base_url,
-             api_key=api_key,
-         )
-     elif provider == 'mistral':
-         if not kwargs.get("base_url", ""):
-             base_url = os.getenv("MISTRAL_ENDPOINT", "https://api.mistral.ai/v1")
-         else:
-             base_url = kwargs.get("base_url")
-         if not kwargs.get("api_key", ""):
-             api_key = os.getenv("MISTRAL_API_KEY", "")
-         else:
-             api_key = kwargs.get("api_key")
- 
-         return ChatMistralAI(
-             model=kwargs.get("model_name", "mistral-large-latest"),
-             temperature=kwargs.get("temperature", 0.0),
-             base_url=base_url,
-             api_key=api_key,
-         )
-     elif provider == "openai":
-         if not kwargs.get("base_url", ""):
-             base_url = os.getenv("OPENAI_ENDPOINT", "https://api.openai.com/v1")
-         else:
-             base_url = kwargs.get("base_url")
- 
-         return ChatOpenAI(
-             model=kwargs.get("model_name", "gpt-4o"),
-             temperature=kwargs.get("temperature", 0.0),
-             base_url=base_url,
-             api_key=api_key,
-         )
-     elif provider == "deepseek":
-         if not kwargs.get("base_url", ""):
-             base_url = os.getenv("DEEPSEEK_ENDPOINT", "")
-         else:
-             base_url = kwargs.get("base_url")
- 
-         if kwargs.get("model_name", "deepseek-chat") == "deepseek-reasoner":
-             return DeepSeekR1ChatOpenAI(
-                 model=kwargs.get("model_name", "deepseek-reasoner"),
-                 temperature=kwargs.get("temperature", 0.0),
-                 base_url=base_url,
-                 api_key=api_key,
-             )
-         else:
-             return ChatOpenAI(
-                 model=kwargs.get("model_name", "deepseek-chat"),
-                 temperature=kwargs.get("temperature", 0.0),
-                 base_url=base_url,
-                 api_key=api_key,
-             )
-     elif provider == "google":
-         return ChatGoogleGenerativeAI(
-             model=kwargs.get("model_name", "gemini-2.0-flash-exp"),
-             temperature=kwargs.get("temperature", 0.0),
-             api_key=api_key,
-         )
-     elif provider == "ollama":
-         if not kwargs.get("base_url", ""):
-             base_url = os.getenv("OLLAMA_ENDPOINT", "http://localhost:11434")
-         else:
-             base_url = kwargs.get("base_url")
- 
-         if "deepseek-r1" in kwargs.get("model_name", "qwen2.5:7b"):
-             return DeepSeekR1ChatOllama(
-                 model=kwargs.get("model_name", "deepseek-r1:14b"),
-                 temperature=kwargs.get("temperature", 0.0),
-                 num_ctx=kwargs.get("num_ctx", 32000),
-                 base_url=base_url,
-             )
-         else:
-             return ChatOllama(
-                 model=kwargs.get("model_name", "qwen2.5:7b"),
-                 temperature=kwargs.get("temperature", 0.0),
-                 num_ctx=kwargs.get("num_ctx", 32000),
-                 num_predict=kwargs.get("num_predict", 1024),
-                 base_url=base_url,
-             )
-     elif provider == "azure_openai":
-         if not kwargs.get("base_url", ""):
-             base_url = os.getenv("AZURE_OPENAI_ENDPOINT", "")
-         else:
-             base_url = kwargs.get("base_url")
-         api_version = kwargs.get("api_version", "") or os.getenv("AZURE_OPENAI_API_VERSION", "2025-01-01-preview")
-         return AzureChatOpenAI(
-             model=kwargs.get("model_name", "gpt-4o"),
-             temperature=kwargs.get("temperature", 0.0),
-             api_version=api_version,
-             azure_endpoint=base_url,
-             api_key=api_key,
-         )
-     elif provider == "alibaba":
-         if not kwargs.get("base_url", ""):
-             base_url = os.getenv("ALIBABA_ENDPOINT", "https://dashscope.aliyuncs.com/compatible-mode/v1")
-         else:
-             base_url = kwargs.get("base_url")
- 
-         return ChatOpenAI(
-             model=kwargs.get("model_name", "qwen-plus"),
-             temperature=kwargs.get("temperature", 0.0),
-             base_url=base_url,
-             api_key=api_key,
-         )
-     elif provider == "moonshot":
-         return ChatOpenAI(
-             model=kwargs.get("model_name", "moonshot-v1-32k-vision-preview"),
-             temperature=kwargs.get("temperature", 0.0),
-             base_url=os.getenv("MOONSHOT_ENDPOINT"),
-             api_key=os.getenv("MOONSHOT_API_KEY"),
-         )
-     elif provider == "unbound":
-         return ChatOpenAI(
-             model=kwargs.get("model_name", "gpt-4o-mini"),
-             temperature=kwargs.get("temperature", 0.0),
-             base_url=os.getenv("UNBOUND_ENDPOINT", "https://api.getunbound.ai"),
-             api_key=api_key,
-         )
-     elif provider == "siliconflow":
-         if not kwargs.get("api_key", ""):
-             api_key = os.getenv("SiliconFLOW_API_KEY", "")
-         else:
-             api_key = kwargs.get("api_key")
-         if not kwargs.get("base_url", ""):
-             base_url = os.getenv("SiliconFLOW_ENDPOINT", "")
-         else:
-             base_url = kwargs.get("base_url")
-         return ChatOpenAI(
-             api_key=api_key,
-             base_url=base_url,
-             model_name=kwargs.get("model_name", "Qwen/QwQ-32B"),
-             temperature=kwargs.get("temperature", 0.0),
-         )
-     else:
-         raise ValueError(f"Unsupported provider: {provider}")
- 
- 
- # Predefined model names for common providers
- model_names = {
-     "anthropic": ["claude-3-5-sonnet-20241022", "claude-3-5-sonnet-20240620", "claude-3-opus-20240229"],
-     "openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo", "o3-mini"],
-     "deepseek": ["deepseek-chat", "deepseek-reasoner"],
-     "google": ["gemini-2.0-flash", "gemini-2.0-flash-thinking-exp", "gemini-1.5-flash-latest",
-                "gemini-1.5-flash-8b-latest", "gemini-2.0-flash-thinking-exp-01-21", "gemini-2.0-pro-exp-02-05"],
-     "ollama": ["qwen2.5:7b", "qwen2.5:14b", "qwen2.5:32b", "qwen2.5-coder:14b", "qwen2.5-coder:32b", "llama2:7b",
-                "deepseek-r1:14b", "deepseek-r1:32b"],
-     "azure_openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo"],
-     "mistral": ["pixtral-large-latest", "mistral-large-latest", "mistral-small-latest", "ministral-8b-latest"],
-     "alibaba": ["qwen-plus", "qwen-max", "qwen-turbo", "qwen-long"],
-     "moonshot": ["moonshot-v1-32k-vision-preview", "moonshot-v1-8k-vision-preview"],
-     "unbound": ["gemini-2.0-flash", "gpt-4o-mini", "gpt-4o", "gpt-4.5-preview"],
-     "siliconflow": [
-         "deepseek-ai/DeepSeek-R1",
-         "deepseek-ai/DeepSeek-V3",
-         "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
-         "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
-         "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
-         "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
-         "deepseek-ai/DeepSeek-V2.5",
-         "deepseek-ai/deepseek-vl2",
-         "Qwen/Qwen2.5-72B-Instruct-128K",
-         "Qwen/Qwen2.5-72B-Instruct",
-         "Qwen/Qwen2.5-32B-Instruct",
-         "Qwen/Qwen2.5-14B-Instruct",
-         "Qwen/Qwen2.5-7B-Instruct",
-         "Qwen/Qwen2.5-Coder-32B-Instruct",
-         "Qwen/Qwen2.5-Coder-7B-Instruct",
-         "Qwen/Qwen2-7B-Instruct",
-         "Qwen/Qwen2-1.5B-Instruct",
-         "Qwen/QwQ-32B-Preview",
-         "Qwen/Qwen2-VL-72B-Instruct",
-         "Qwen/Qwen2.5-VL-32B-Instruct",
-         "Qwen/Qwen2.5-VL-72B-Instruct",
-         "TeleAI/TeleChat2",
-         "THUDM/glm-4-9b-chat",
-         "Vendor-A/Qwen/Qwen2.5-72B-Instruct",
-         "internlm/internlm2_5-7b-chat",
-         "internlm/internlm2_5-20b-chat",
-         "Pro/Qwen/Qwen2.5-7B-Instruct",
-         "Pro/Qwen/Qwen2-7B-Instruct",
-         "Pro/Qwen/Qwen2-1.5B-Instruct",
-         "Pro/THUDM/chatglm3-6b",
-         "Pro/THUDM/glm-4-9b-chat",
-     ],
- }
- 
- 
- # Callback to update the model name dropdown based on the selected provider
- def update_model_dropdown(llm_provider, api_key=None, base_url=None):
-     """
-     Update the model name dropdown with predefined models for the selected provider.
-     """
-     import gradio as gr
-     # Use API keys from .env if not provided
-     if not api_key:
-         api_key = os.getenv(f"{llm_provider.upper()}_API_KEY", "")
-     if not base_url:
-         base_url = os.getenv(f"{llm_provider.upper()}_BASE_URL", "")
- 
-     # Use predefined models for the selected provider
-     if llm_provider in model_names:
-         return gr.Dropdown(choices=model_names[llm_provider], value=model_names[llm_provider][0], interactive=True)
-     else:
-         return gr.Dropdown(choices=[], value="", interactive=True, allow_custom_value=True)
- 
- 
- class MissingAPIKeyError(Exception):
-     """Custom exception for missing API key."""
- 
-     def __init__(self, provider: str, env_var: str):
-         provider_display = PROVIDER_DISPLAY_NAMES.get(provider, provider.upper())
-         super().__init__(f"💥 {provider_display} API key not found! 🔑 Please set the "
-                          f"`{env_var}` environment variable or provide it in the UI.")
- 
  
  def encode_image(img_path):
      if not img_path:
@@ -293,108 +37,3 @@ def get_latest_files(directory: str, file_types: list = ['.webm', '.zip']) -> Di
              print(f"Error getting latest {file_type} file: {e}")
  
      return latest_files
- 
- 
- async def capture_screenshot(browser_context):
-     """Capture and encode a screenshot"""
-     # Extract the Playwright browser instance
-     playwright_browser = browser_context.browser.playwright_browser  # Ensure this is correct.
- 
-     # Check if the browser instance is valid and if an existing context can be reused
-     if playwright_browser and playwright_browser.contexts:
-         playwright_context = playwright_browser.contexts[0]
-     else:
-         return None
- 
-     # Access pages in the context
-     pages = None
-     if playwright_context:
-         pages = playwright_context.pages
- 
-     # Use an existing page or create a new one if none exist
-     if pages:
-         active_page = pages[0]
-         for page in pages:
-             if page.url != "about:blank":
-                 active_page = page
-     else:
-         return None
- 
-     # Take screenshot
-     try:
-         screenshot = await active_page.screenshot(
-             type='jpeg',
-             quality=75,
-             scale="css"
-         )
-         encoded = base64.b64encode(screenshot).decode('utf-8')
-         return encoded
-     except Exception as e:
-         return None
- 
- 
- class ConfigManager:
-     def __init__(self):
-         self.components = {}
-         self.component_order = []
- 
-     def register_component(self, name: str, component):
-         """Register a gradio component for config management."""
-         self.components[name] = component
-         if name not in self.component_order:
-             self.component_order.append(name)
-         return component
- 
-     def save_current_config(self):
-         """Save the current configuration of all registered components."""
-         current_config = {}
-         for name in self.component_order:
-             component = self.components[name]
-             # Get the current value from the component
-             current_config[name] = getattr(component, "value", None)
- 
-         return save_config_to_file(current_config)
- 
-     def update_ui_from_config(self, config_file):
-         """Update UI components from a loaded configuration file."""
-         if config_file is None:
-             return [gr.update() for _ in self.component_order] + ["No file selected."]
- 
-         loaded_config = load_config_from_file(config_file.name)
- 
-         if not isinstance(loaded_config, dict):
-             return [gr.update() for _ in self.component_order] + ["Error: Invalid configuration file."]
- 
-         # Prepare updates for all components
-         updates = []
-         for name in self.component_order:
-             if name in loaded_config:
-                 updates.append(gr.update(value=loaded_config[name]))
-             else:
-                 updates.append(gr.update())
- 
-         updates.append("Configuration loaded successfully.")
-         return updates
- 
-     def get_all_components(self):
-         """Return all registered components in the order they were registered."""
-         return [self.components[name] for name in self.component_order]
- 
- 
- def load_config_from_file(config_file):
-     """Load settings from a config file (JSON format)."""
-     try:
-         with open(config_file, 'r') as f:
-             settings = json.load(f)
-         return settings
-     except Exception as e:
-         return f"Error loading configuration: {str(e)}"
- 
- 
- def save_config_to_file(settings, save_dir="./tmp/webui_settings"):
-     """Save the current settings to a JSON file with a UUID name."""
-     os.makedirs(save_dir, exist_ok=True)
-     config_file = os.path.join(save_dir, f"{uuid.uuid4()}.json")
-     with open(config_file, 'w') as f:
-         json.dump(settings, f, indent=2)
-     return f"Configuration saved to {config_file}"
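
Review note: the removed ConfigManager/save/load helpers round-tripped UI state through UUID-named JSON files under ./tmp/webui_settings; their behavior was equivalent to this small sketch (the payload is illustrative):

    import json
    import os
    import uuid

    settings = {"llm_provider": "openai"}  # illustrative payload
    save_dir = "./tmp/webui_settings"
    os.makedirs(save_dir, exist_ok=True)
    path = os.path.join(save_dir, f"{uuid.uuid4()}.json")
    with open(path, "w") as f:
        json.dump(settings, f, indent=2)
    with open(path) as f:
        assert json.load(f) == settings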
src/webui/__init__.py ADDED
File without changes
src/webui/components/__init__.py ADDED
File without changes
src/webui/components/agent_settings_tab.py ADDED
@@ -0,0 +1,269 @@
+ import json
+ import os
+ 
+ import gradio as gr
+ from gradio.components import Component
+ from typing import Any, Dict, Optional
+ from src.webui.webui_manager import WebuiManager
+ from src.utils import config
+ import logging
+ from functools import partial
+ 
+ logger = logging.getLogger(__name__)
+ 
+ 
+ def update_model_dropdown(llm_provider):
+     """
+     Update the model name dropdown with predefined models for the selected provider.
+     """
+     # Use predefined models for the selected provider
+     if llm_provider in config.model_names:
+         return gr.Dropdown(choices=config.model_names[llm_provider], value=config.model_names[llm_provider][0],
+                            interactive=True)
+     else:
+         return gr.Dropdown(choices=[], value="", interactive=True, allow_custom_value=True)
+ 
+ 
+ async def update_mcp_server(mcp_file: str, webui_manager: WebuiManager):
+     """
+     Update the MCP server.
+     """
+     if hasattr(webui_manager, "bu_controller") and webui_manager.bu_controller:
+         logger.warning("⚠️ Close controller because mcp file has changed!")
+         await webui_manager.bu_controller.close_mcp_client()
+         webui_manager.bu_controller = None
+ 
+     if not mcp_file or not os.path.exists(mcp_file) or not mcp_file.endswith('.json'):
+         logger.warning(f"{mcp_file} is not a valid MCP file.")
+         return None, gr.update(visible=False)
+ 
+     with open(mcp_file, 'r') as f:
+         mcp_server = json.load(f)
+ 
+     return json.dumps(mcp_server, indent=2), gr.update(visible=True)
+ 
+ 
+ def create_agent_settings_tab(webui_manager: WebuiManager):
+     """
+     Creates an agent settings tab.
+     """
+     input_components = set(webui_manager.get_components())
+     tab_components = {}
+ 
+     with gr.Group():
+         with gr.Column():
+             override_system_prompt = gr.Textbox(label="Override system prompt", lines=4, interactive=True)
+             extend_system_prompt = gr.Textbox(label="Extend system prompt", lines=4, interactive=True)
+ 
+     with gr.Group():
+         mcp_json_file = gr.File(label="MCP server json", interactive=True, file_types=[".json"])
+         mcp_server_config = gr.Textbox(label="MCP server", lines=6, interactive=True, visible=False)
+ 
+     with gr.Group():
+         with gr.Row():
+             llm_provider = gr.Dropdown(
+                 choices=[provider for provider, model in config.model_names.items()],
+                 label="LLM Provider",
+                 value=os.getenv("DEFAULT_LLM", "openai"),
+                 info="Select LLM provider for LLM",
+                 interactive=True
+             )
+             llm_model_name = gr.Dropdown(
+                 label="LLM Model Name",
+                 choices=config.model_names[os.getenv("DEFAULT_LLM", "openai")],
+                 value=config.model_names[os.getenv("DEFAULT_LLM", "openai")][0],
+                 interactive=True,
+                 allow_custom_value=True,
+                 info="Select a model in the dropdown options or directly type a custom model name"
+             )
+         with gr.Row():
+             llm_temperature = gr.Slider(
+                 minimum=0.0,
+                 maximum=2.0,
+                 value=0.6,
+                 step=0.1,
+                 label="LLM Temperature",
+                 info="Controls randomness in model outputs",
+                 interactive=True
+             )
+ 
+             use_vision = gr.Checkbox(
+                 label="Use Vision",
+                 value=True,
+                 info="Enable Vision(Input highlighted screenshot into LLM)",
+                 interactive=True
+             )
+ 
+             ollama_num_ctx = gr.Slider(
+                 minimum=2 ** 8,
+                 maximum=2 ** 16,
+                 value=16000,
+                 step=1,
+                 label="Ollama Context Length",
+                 info="Controls max context length model needs to handle (less = faster)",
+                 visible=False,
+                 interactive=True
+             )
+ 
+         with gr.Row():
+             llm_base_url = gr.Textbox(
+                 label="Base URL",
+                 value="",
+                 info="API endpoint URL (if required)"
+             )
+             llm_api_key = gr.Textbox(
+                 label="API Key",
+                 type="password",
+                 value="",
+                 info="Your API key (leave blank to use .env)"
+             )
+ 
+     with gr.Group():
+         with gr.Row():
+             planner_llm_provider = gr.Dropdown(
+                 choices=[provider for provider, model in config.model_names.items()],
+                 label="Planner LLM Provider",
+                 info="Select LLM provider for LLM",
+                 value=None,
+                 interactive=True
+             )
+             planner_llm_model_name = gr.Dropdown(
+                 label="Planner LLM Model Name",
+                 interactive=True,
+                 allow_custom_value=True,
+                 info="Select a model in the dropdown options or directly type a custom model name"
+             )
+         with gr.Row():
+             planner_llm_temperature = gr.Slider(
+                 minimum=0.0,
+                 maximum=2.0,
+                 value=0.6,
+                 step=0.1,
+                 label="Planner LLM Temperature",
+                 info="Controls randomness in model outputs",
+                 interactive=True
+             )
+ 
+             planner_use_vision = gr.Checkbox(
+                 label="Use Vision(Planner LLM)",
+                 value=False,
+                 info="Enable Vision(Input highlighted screenshot into LLM)",
+                 interactive=True
+             )
+ 
+             planner_ollama_num_ctx = gr.Slider(
+                 minimum=2 ** 8,
+                 maximum=2 ** 16,
+                 value=16000,
+                 step=1,
+                 label="Ollama Context Length",
+                 info="Controls max context length model needs to handle (less = faster)",
+                 visible=False,
+                 interactive=True
+             )
+ 
+         with gr.Row():
+             planner_llm_base_url = gr.Textbox(
+                 label="Base URL",
+                 value="",
+                 info="API endpoint URL (if required)"
+             )
+             planner_llm_api_key = gr.Textbox(
+                 label="API Key",
+                 type="password",
+                 value="",
+                 info="Your API key (leave blank to use .env)"
+             )
+ 
+     with gr.Row():
+         max_steps = gr.Slider(
+             minimum=1,
+             maximum=1000,
+             value=100,
+             step=1,
+             label="Max Run Steps",
+             info="Maximum number of steps the agent will take",
+             interactive=True
+         )
+         max_actions = gr.Slider(
+             minimum=1,
+             maximum=100,
+             value=10,
+             step=1,
+             label="Max Number of Actions",
+             info="Maximum number of actions the agent will take per step",
+             interactive=True
+         )
+ 
+     with gr.Row():
+         max_input_tokens = gr.Number(
+             label="Max Input Tokens",
+             value=128000,
+             precision=0,
+             interactive=True
+         )
+         tool_calling_method = gr.Dropdown(
+             label="Tool Calling Method",
+             value="auto",
+             interactive=True,
+             allow_custom_value=True,
+             choices=['function_calling', 'json_mode', 'raw', 'auto', 'tools', "None"],
+             visible=True
+         )
+     tab_components.update(dict(
+         override_system_prompt=override_system_prompt,
+         extend_system_prompt=extend_system_prompt,
+         llm_provider=llm_provider,
+         llm_model_name=llm_model_name,
+         llm_temperature=llm_temperature,
+         use_vision=use_vision,
+         ollama_num_ctx=ollama_num_ctx,
+         llm_base_url=llm_base_url,
+         llm_api_key=llm_api_key,
+         planner_llm_provider=planner_llm_provider,
+         planner_llm_model_name=planner_llm_model_name,
+         planner_llm_temperature=planner_llm_temperature,
+         planner_use_vision=planner_use_vision,
+         planner_ollama_num_ctx=planner_ollama_num_ctx,
+         planner_llm_base_url=planner_llm_base_url,
+         planner_llm_api_key=planner_llm_api_key,
+         max_steps=max_steps,
+         max_actions=max_actions,
+         max_input_tokens=max_input_tokens,
+         tool_calling_method=tool_calling_method,
+         mcp_json_file=mcp_json_file,
+         mcp_server_config=mcp_server_config,
+     ))
+     webui_manager.add_components("agent_settings", tab_components)
+ 
+     llm_provider.change(
+         fn=lambda x: gr.update(visible=x == "ollama"),
+         inputs=llm_provider,
+         outputs=ollama_num_ctx
+     )
+     llm_provider.change(
+         lambda provider: update_model_dropdown(provider),
+         inputs=[llm_provider],
+         outputs=[llm_model_name]
+     )
+     planner_llm_provider.change(
+         fn=lambda x: gr.update(visible=x == "ollama"),
+         inputs=[planner_llm_provider],
+         outputs=[planner_ollama_num_ctx]
+     )
+     planner_llm_provider.change(
+         lambda provider: update_model_dropdown(provider),
+         inputs=[planner_llm_provider],
+         outputs=[planner_llm_model_name]
+     )
+ 
+     async def update_wrapper(mcp_file):
+         """Wrapper for update_mcp_server."""
+         update_dict = await update_mcp_server(mcp_file, webui_manager)
+         yield update_dict
+ 
+     mcp_json_file.change(
+         update_wrapper,
+         inputs=[mcp_json_file],
+         outputs=[mcp_server_config, mcp_server_config]
+     )
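
Review note: the provider/model wiring above uses the standard Gradio pattern of returning an updated component from a `.change` handler; a stripped-down sketch of the same pattern (labels and model lists are illustrative):

    import gradio as gr

    MODELS = {"openai": ["gpt-4o", "o3-mini"], "ollama": ["qwen2.5:7b"]}

    with gr.Blocks() as demo:
        provider = gr.Dropdown(choices=list(MODELS), value="openai", label="Provider")
        model = gr.Dropdown(choices=MODELS["openai"], label="Model")
        # Repopulate the model dropdown whenever the provider changes.
        provider.change(
            lambda p: gr.Dropdown(choices=MODELS[p], value=MODELS[p][0]),
            inputs=provider,
            outputs=model,
        )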
src/webui/components/browser_settings_tab.py ADDED
@@ -0,0 +1,161 @@
+ import os
+ from distutils.util import strtobool
+ import gradio as gr
+ import logging
+ from gradio.components import Component
+ 
+ from src.webui.webui_manager import WebuiManager
+ from src.utils import config
+ 
+ logger = logging.getLogger(__name__)
+ 
+ async def close_browser(webui_manager: WebuiManager):
+     """
+     Close browser
+     """
+     if webui_manager.bu_current_task and not webui_manager.bu_current_task.done():
+         webui_manager.bu_current_task.cancel()
+         webui_manager.bu_current_task = None
+ 
+     if webui_manager.bu_browser_context:
+         logger.info("⚠️ Closing browser context when changing browser config.")
+         await webui_manager.bu_browser_context.close()
+         webui_manager.bu_browser_context = None
+ 
+     if webui_manager.bu_browser:
+         logger.info("⚠️ Closing browser when changing browser config.")
+         await webui_manager.bu_browser.close()
+         webui_manager.bu_browser = None
+ 
+ def create_browser_settings_tab(webui_manager: WebuiManager):
+     """
+     Creates a browser settings tab.
+     """
+     input_components = set(webui_manager.get_components())
+     tab_components = {}
+ 
+     with gr.Group():
+         with gr.Row():
+             browser_binary_path = gr.Textbox(
+                 label="Browser Binary Path",
+                 lines=1,
+                 interactive=True,
+                 placeholder="e.g. '/Applications/Google\\ Chrome.app/Contents/MacOS/Google\\ Chrome'"
+             )
+             browser_user_data_dir = gr.Textbox(
+                 label="Browser User Data Dir",
+                 lines=1,
+                 interactive=True,
+                 placeholder="Leave it empty if you use your default user data",
+             )
+     with gr.Group():
+         with gr.Row():
+             use_own_browser = gr.Checkbox(
+                 label="Use Own Browser",
+                 value=bool(strtobool(os.getenv("USE_OWN_BROWSER", "false"))),
+                 info="Use your existing browser instance",
+                 interactive=True
+             )
+             keep_browser_open = gr.Checkbox(
+                 label="Keep Browser Open",
+                 value=bool(strtobool(os.getenv("KEEP_BROWSER_OPEN", "true"))),
+                 info="Keep Browser Open between Tasks",
+                 interactive=True
+             )
+             headless = gr.Checkbox(
+                 label="Headless Mode",
+                 value=False,
+                 info="Run browser without GUI",
+                 interactive=True
+             )
+             disable_security = gr.Checkbox(
+                 label="Disable Security",
+                 value=False,
+                 info="Disable browser security",
+                 interactive=True
+             )
+ 
+     with gr.Group():
+         with gr.Row():
+             window_w = gr.Number(
+                 label="Window Width",
+                 value=1280,
+                 info="Browser window width",
+                 interactive=True
+             )
+             window_h = gr.Number(
+                 label="Window Height",
+                 value=1100,
+                 info="Browser window height",
+                 interactive=True
+             )
+     with gr.Group():
+         with gr.Row():
+             cdp_url = gr.Textbox(
+                 label="CDP URL",
+                 value=os.getenv("BROWSER_CDP", None),
+                 info="CDP URL for browser remote debugging",
+                 interactive=True,
+             )
+             wss_url = gr.Textbox(
+                 label="WSS URL",
+                 info="WSS URL for browser remote debugging",
+                 interactive=True,
+             )
+     with gr.Group():
+         with gr.Row():
+             save_recording_path = gr.Textbox(
+                 label="Recording Path",
+                 placeholder="e.g. ./tmp/record_videos",
+                 info="Path to save browser recordings",
+                 interactive=True,
+             )
+ 
+             save_trace_path = gr.Textbox(
+                 label="Trace Path",
+                 placeholder="e.g. ./tmp/traces",
+                 info="Path to save Agent traces",
+                 interactive=True,
+             )
+ 
+         with gr.Row():
+             save_agent_history_path = gr.Textbox(
+                 label="Agent History Save Path",
+                 value="./tmp/agent_history",
+                 info="Specify the directory where agent history should be saved.",
+                 interactive=True,
+             )
+             save_download_path = gr.Textbox(
+                 label="Save Directory for browser downloads",
+                 value="./tmp/downloads",
+                 info="Specify the directory where downloaded files should be saved.",
+                 interactive=True,
+             )
+     tab_components.update(
+         dict(
+             browser_binary_path=browser_binary_path,
+             browser_user_data_dir=browser_user_data_dir,
+             use_own_browser=use_own_browser,
+             keep_browser_open=keep_browser_open,
+             headless=headless,
+             disable_security=disable_security,
+             save_recording_path=save_recording_path,
+             save_trace_path=save_trace_path,
+             save_agent_history_path=save_agent_history_path,
+             save_download_path=save_download_path,
+             cdp_url=cdp_url,
+             wss_url=wss_url,
+             window_h=window_h,
+             window_w=window_w,
+         )
+     )
+     webui_manager.add_components("browser_settings", tab_components)
+ 
+     async def close_wrapper():
+         """Wrapper for close_browser."""
+         await close_browser(webui_manager)
+ 
+     headless.change(close_wrapper)
+     keep_browser_open.change(close_wrapper)
+     disable_security.change(close_wrapper)
+     use_own_browser.change(close_wrapper)
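
Review note: `distutils.util.strtobool`, imported above, is deprecated and distutils was removed from the standard library in Python 3.12, so this tab may fail to import on newer interpreters; a drop-in replacement would look like this (not part of this diff):

    def strtobool(val: str) -> int:
        """Map truthy/falsy strings to 1/0, mirroring distutils.util.strtobool."""
        val = val.lower()
        if val in ("y", "yes", "t", "true", "on", "1"):
            return 1
        if val in ("n", "no", "f", "false", "off", "0"):
            return 0
        raise ValueError(f"invalid truth value {val!r}")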
src/webui/components/browser_use_agent_tab.py ADDED
@@ -0,0 +1,1083 @@
1
+ import asyncio
2
+ import json
3
+ import logging
4
+ import os
5
+ import uuid
6
+ from typing import Any, AsyncGenerator, Dict, Optional
7
+
8
+ import gradio as gr
9
+
10
+ # from browser_use.agent.service import Agent
11
+ from browser_use.agent.views import (
12
+ AgentHistoryList,
13
+ AgentOutput,
14
+ )
15
+ from browser_use.browser.browser import BrowserConfig
16
+ from browser_use.browser.context import BrowserContext, BrowserContextConfig
17
+ from browser_use.browser.views import BrowserState
18
+ from gradio.components import Component
19
+ from langchain_core.language_models.chat_models import BaseChatModel
20
+
21
+ from src.agent.browser_use.browser_use_agent import BrowserUseAgent
22
+ from src.browser.custom_browser import CustomBrowser
23
+ from src.controller.custom_controller import CustomController
24
+ from src.utils import llm_provider
25
+ from src.webui.webui_manager import WebuiManager
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
+ # --- Helper Functions --- (Defined at module level)
31
+
32
+
33
+ async def _initialize_llm(
34
+ provider: Optional[str],
35
+ model_name: Optional[str],
36
+ temperature: float,
37
+ base_url: Optional[str],
38
+ api_key: Optional[str],
39
+ num_ctx: Optional[int] = None,
40
+ ) -> Optional[BaseChatModel]:
41
+ """Initializes the LLM based on settings. Returns None if provider/model is missing."""
42
+ if not provider or not model_name:
43
+ logger.info("LLM Provider or Model Name not specified, LLM will be None.")
44
+ return None
45
+ try:
46
+ # Use your actual LLM provider logic here
47
+ logger.info(
48
+ f"Initializing LLM: Provider={provider}, Model={model_name}, Temp={temperature}"
49
+ )
50
+ # Example using a placeholder function
51
+ llm = llm_provider.get_llm_model(
52
+ provider=provider,
53
+ model_name=model_name,
54
+ temperature=temperature,
55
+ base_url=base_url or None,
56
+ api_key=api_key or None,
57
+ # Add other relevant params like num_ctx for ollama
58
+ num_ctx=num_ctx if provider == "ollama" else None,
59
+ )
60
+ return llm
61
+ except Exception as e:
62
+ logger.error(f"Failed to initialize LLM: {e}", exc_info=True)
63
+ gr.Warning(
64
+ f"Failed to initialize LLM '{model_name}' for provider '{provider}'. Please check settings. Error: {e}"
65
+ )
66
+ return None
67
+
68
+
69
+ def _get_config_value(
70
+ webui_manager: WebuiManager,
71
+ comp_dict: Dict[gr.components.Component, Any],
72
+ comp_id_suffix: str,
73
+ default: Any = None,
74
+ ) -> Any:
75
+ """Safely get value from component dictionary using its ID suffix relative to the tab."""
76
+ # Assumes component ID format is "tab_name.comp_name"
77
+ tab_name = "browser_use_agent" # Hardcode or derive if needed
78
+ comp_id = f"{tab_name}.{comp_id_suffix}"
79
+ # Need to find the component object first using the ID from the manager
80
+ try:
81
+ comp = webui_manager.get_component_by_id(comp_id)
82
+ return comp_dict.get(comp, default)
83
+ except KeyError:
84
+ # Try accessing settings tabs as well
85
+ for prefix in ["agent_settings", "browser_settings"]:
86
+ try:
87
+ comp_id = f"{prefix}.{comp_id_suffix}"
88
+ comp = webui_manager.get_component_by_id(comp_id)
89
+ return comp_dict.get(comp, default)
90
+ except KeyError:
91
+ continue
92
+ logger.warning(
93
+ f"Component with suffix '{comp_id_suffix}' not found in manager for value lookup."
94
+ )
95
+ return default
96
+
97
+
98
+ def _format_agent_output(model_output: AgentOutput) -> str:
99
+ """Formats AgentOutput for display in the chatbot using JSON."""
100
+ content = ""
101
+ if model_output:
102
+ try:
103
+ # Directly use model_dump if actions and current_state are Pydantic models
104
+ action_dump = [
105
+ action.model_dump(exclude_none=True) for action in model_output.action
106
+ ]
107
+
108
+ state_dump = model_output.current_state.model_dump(exclude_none=True)
109
+ model_output_dump = {
110
+ "current_state": state_dump,
111
+ "action": action_dump,
112
+ }
113
+ # Dump to JSON string with indentation
114
+ json_string = json.dumps(model_output_dump, indent=4, ensure_ascii=False)
115
+ # Wrap in <pre><code> for proper display in HTML
116
+ content = f"<pre><code class='language-json'>{json_string}</code></pre>"
117
+
118
+ except AttributeError as ae:
119
+ logger.error(
120
+ f"AttributeError during model dump: {ae}. Check if 'action' or 'current_state' or their items support 'model_dump'."
121
+ )
122
+ content = f"<pre><code>Error: Could not format agent output (AttributeError: {ae}).\nRaw output: {str(model_output)}</code></pre>"
123
+ except Exception as e:
124
+ logger.error(f"Error formatting agent output: {e}", exc_info=True)
125
+ # Fallback to simple string representation on error
126
+ content = f"<pre><code>Error formatting agent output.\nRaw output:\n{str(model_output)}</code></pre>"
127
+
128
+ return content.strip()
129
+
130
+
131
+ # --- Updated Callback Implementation ---
132
+
133
+
134
+ async def _handle_new_step(
135
+ webui_manager: WebuiManager, state: BrowserState, output: AgentOutput, step_num: int
136
+ ):
137
+ """Callback for each step taken by the agent, including screenshot display."""
138
+
139
+ # Use the correct chat history attribute name from the user's code
140
+ if not hasattr(webui_manager, "bu_chat_history"):
141
+ logger.error(
142
+ "Attribute 'bu_chat_history' not found in webui_manager! Cannot add chat message."
143
+ )
144
+ # Initialize it maybe? Or raise an error? For now, log and potentially skip chat update.
145
+ webui_manager.bu_chat_history = [] # Initialize if missing (consider if this is the right place)
146
+ # return # Or stop if this is critical
147
+ step_num -= 1
148
+ logger.info(f"Step {step_num} completed.")
149
+
150
+ # --- Screenshot Handling ---
151
+ screenshot_html = ""
152
+ # Ensure state.screenshot exists and is not empty before proceeding
153
+ # Use getattr for safer access
154
+ screenshot_data = getattr(state, "screenshot", None)
155
+ if screenshot_data:
156
+ try:
157
+ # Basic validation: check if it looks like base64
158
+ if (
159
+ isinstance(screenshot_data, str) and len(screenshot_data) > 100
160
+ ): # Arbitrary length check
161
+ # *** UPDATED STYLE: Removed centering, adjusted width ***
162
+ img_tag = f'<img src="data:image/jpeg;base64,{screenshot_data}" alt="Step {step_num} Screenshot" style="max-width: 800px; max-height: 600px; object-fit:contain;" />'
163
+ screenshot_html = (
164
+ img_tag + "<br/>"
165
+ ) # Use <br/> for line break after inline-block image
166
+ else:
167
+ logger.warning(
168
+ f"Screenshot for step {step_num} seems invalid (type: {type(screenshot_data)}, len: {len(screenshot_data) if isinstance(screenshot_data, str) else 'N/A'})."
169
+ )
170
+ screenshot_html = "**[Invalid screenshot data]**<br/>"
171
+
172
+ except Exception as e:
173
+ logger.error(
174
+ f"Error processing or formatting screenshot for step {step_num}: {e}",
175
+ exc_info=True,
176
+ )
177
+ screenshot_html = "**[Error displaying screenshot]**<br/>"
178
+ else:
179
+ logger.debug(f"No screenshot available for step {step_num}.")
180
+
181
+ # --- Format Agent Output ---
182
+ formatted_output = _format_agent_output(output) # Use the updated function
183
+
184
+ # --- Combine and Append to Chat ---
185
+ step_header = f"--- **Step {step_num}** ---"
186
+ # Combine header, image (with line break), and JSON block
187
+ final_content = step_header + "<br/>" + screenshot_html + formatted_output
188
+
189
+ chat_message = {
190
+ "role": "assistant",
191
+ "content": final_content.strip(), # Remove leading/trailing whitespace
192
+ }
193
+
194
+ # Append to the correct chat history list
195
+ webui_manager.bu_chat_history.append(chat_message)
196
+
197
+ await asyncio.sleep(0.05)
198
+
199
+
200
+ def _handle_done(webui_manager: WebuiManager, history: AgentHistoryList):
201
+ """Callback when the agent finishes the task (success or failure)."""
202
+ logger.info(
203
+ f"Agent task finished. Duration: {history.total_duration_seconds():.2f}s, Tokens: {history.total_input_tokens()}"
204
+ )
205
+ final_summary = "**Task Completed**\n"
206
+ final_summary += f"- Duration: {history.total_duration_seconds():.2f} seconds\n"
207
+ final_summary += f"- Total Input Tokens: {history.total_input_tokens()}\n" # Or total tokens if available
208
+
209
+ final_result = history.final_result()
210
+ if final_result:
211
+ final_summary += f"- Final Result: {final_result}\n"
212
+
213
+ errors = history.errors()
214
+ if errors and any(errors):
215
+ final_summary += f"- **Errors:**\n```\n{errors}\n```\n"
216
+ else:
217
+ final_summary += "- Status: Success\n"
218
+
219
+ webui_manager.bu_chat_history.append(
220
+ {"role": "assistant", "content": final_summary}
221
+ )
222
+
223
+
224
+ async def _ask_assistant_callback(
+     webui_manager: WebuiManager, query: str, browser_context: BrowserContext
+ ) -> Dict[str, Any]:
+     """Callback triggered by the agent's ask_for_assistant action."""
+     logger.info("Agent requires assistance. Waiting for user input.")
+
+     if not hasattr(webui_manager, "bu_chat_history"):
+         logger.error("Chat history not found in webui_manager during ask_assistant!")
+         return {"response": "Internal Error: Cannot display help request."}
+
+     webui_manager.bu_chat_history.append(
+         {
+             "role": "assistant",
+             "content": f"**Need Help:** {query}\nPlease provide information or perform the required action in the browser, then type your response/confirmation below and click 'Submit Response'.",
+         }
+     )
+
+     # Use state stored in webui_manager
+     webui_manager.bu_response_event = asyncio.Event()
+     webui_manager.bu_user_help_response = None  # Reset previous response
+
+     try:
+         logger.info("Waiting for user response event...")
+         await asyncio.wait_for(
+             webui_manager.bu_response_event.wait(), timeout=3600.0
+         )  # Long timeout
+         logger.info("User response event received.")
+     except asyncio.TimeoutError:
+         logger.warning("Timeout waiting for user assistance.")
+         webui_manager.bu_chat_history.append(
+             {
+                 "role": "assistant",
+                 "content": "**Timeout:** No response received. Trying to proceed.",
+             }
+         )
+         webui_manager.bu_response_event = None  # Clear the event
+         return {"response": "Timeout: User did not respond."}  # Inform the agent
+
+     response = webui_manager.bu_user_help_response
+     webui_manager.bu_chat_history.append(
+         {"role": "user", "content": response}
+     )  # Show user response in chat
+     webui_manager.bu_response_event = (
+         None  # Clear the event for the next potential request
+     )
+     return {"response": response}
+
+
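+ # The help-request handshake above hinges on two fields of WebuiManager:
+ # bu_response_event (an asyncio.Event created per request) and
+ # bu_user_help_response (filled in by handle_submit() before the event is set).
+ # handle_submit() further below implements the producer side of this handshake.
+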
+ # --- Core Agent Execution Logic --- (Needs access to webui_manager)
+
+
+ async def run_agent_task(
+     webui_manager: WebuiManager, components: Dict[gr.components.Component, Any]
+ ) -> AsyncGenerator[Dict[gr.components.Component, Any], None]:
+     """Handles the entire lifecycle of initializing and running the agent."""
+
+     # --- Get Components ---
+     # Need handles to specific UI components to update them
+     user_input_comp = webui_manager.get_component_by_id("browser_use_agent.user_input")
+     run_button_comp = webui_manager.get_component_by_id("browser_use_agent.run_button")
+     stop_button_comp = webui_manager.get_component_by_id(
+         "browser_use_agent.stop_button"
+     )
+     pause_resume_button_comp = webui_manager.get_component_by_id(
+         "browser_use_agent.pause_resume_button"
+     )
+     clear_button_comp = webui_manager.get_component_by_id(
+         "browser_use_agent.clear_button"
+     )
+     chatbot_comp = webui_manager.get_component_by_id("browser_use_agent.chatbot")
+     history_file_comp = webui_manager.get_component_by_id(
+         "browser_use_agent.agent_history_file"
+     )
+     gif_comp = webui_manager.get_component_by_id("browser_use_agent.recording_gif")
+     browser_view_comp = webui_manager.get_component_by_id(
+         "browser_use_agent.browser_view"
+     )
+
+     # --- 1. Get Task and Initial UI Update ---
+     task = components.get(user_input_comp, "").strip()
+     if not task:
+         gr.Warning("Please enter a task.")
+         yield {run_button_comp: gr.update(interactive=True)}
+         return
+
+     # Set running state indirectly via _current_task
+     webui_manager.bu_chat_history.append({"role": "user", "content": task})
+
+     yield {
+         user_input_comp: gr.Textbox(
+             value="", interactive=False, placeholder="Agent is running..."
+         ),
+         run_button_comp: gr.Button(value="⏳ Running...", interactive=False),
+         stop_button_comp: gr.Button(interactive=True),
+         pause_resume_button_comp: gr.Button(value="⏸️ Pause", interactive=True),
+         clear_button_comp: gr.Button(interactive=False),
+         chatbot_comp: gr.update(value=webui_manager.bu_chat_history),
+         history_file_comp: gr.update(value=None),
+         gif_comp: gr.update(value=None),
+     }
+
+     # --- Agent Settings ---
+     # Access settings values via components dict, getting IDs from webui_manager
+     def get_setting(key, default=None):
+         comp = webui_manager.id_to_component.get(f"agent_settings.{key}")
+         return components.get(comp, default) if comp else default
+
+     override_system_prompt = get_setting("override_system_prompt") or None
+     extend_system_prompt = get_setting("extend_system_prompt") or None
+     llm_provider_name = get_setting(
+         "llm_provider", None
+     )  # Default to None if not found
+     llm_model_name = get_setting("llm_model_name", None)
+     llm_temperature = get_setting("llm_temperature", 0.6)
+     use_vision = get_setting("use_vision", True)
+     ollama_num_ctx = get_setting("ollama_num_ctx", 16000)
+     llm_base_url = get_setting("llm_base_url") or None
+     llm_api_key = get_setting("llm_api_key") or None
+     max_steps = get_setting("max_steps", 100)
+     max_actions = get_setting("max_actions", 10)
+     max_input_tokens = get_setting("max_input_tokens", 128000)
+     tool_calling_str = get_setting("tool_calling_method", "auto")
+     tool_calling_method = tool_calling_str if tool_calling_str != "None" else None
+     mcp_server_config_comp = webui_manager.id_to_component.get(
+         "agent_settings.mcp_server_config"
+     )
+     mcp_server_config_str = (
+         components.get(mcp_server_config_comp) if mcp_server_config_comp else None
+     )
+     mcp_server_config = (
+         json.loads(mcp_server_config_str) if mcp_server_config_str else None
+     )
+
+     # Planner LLM Settings (Optional)
+     planner_llm_provider_name = get_setting("planner_llm_provider") or None
+     planner_llm = None
+     planner_use_vision = False
+     if planner_llm_provider_name:
+         planner_llm_model_name = get_setting("planner_llm_model_name")
+         planner_llm_temperature = get_setting("planner_llm_temperature", 0.6)
+         planner_ollama_num_ctx = get_setting("planner_ollama_num_ctx", 16000)
+         planner_llm_base_url = get_setting("planner_llm_base_url") or None
+         planner_llm_api_key = get_setting("planner_llm_api_key") or None
+         planner_use_vision = get_setting("planner_use_vision", False)
+
+         planner_llm = await _initialize_llm(
+             planner_llm_provider_name,
+             planner_llm_model_name,
+             planner_llm_temperature,
+             planner_llm_base_url,
+             planner_llm_api_key,
+             planner_ollama_num_ctx if planner_llm_provider_name == "ollama" else None,
+         )
+
+     # --- Browser Settings ---
+     def get_browser_setting(key, default=None):
+         comp = webui_manager.id_to_component.get(f"browser_settings.{key}")
+         return components.get(comp, default) if comp else default
+
+     browser_binary_path = get_browser_setting("browser_binary_path") or None
+     browser_user_data_dir = get_browser_setting("browser_user_data_dir") or None
+     use_own_browser = get_browser_setting(
+         "use_own_browser", False
+     )  # Logic handled by CDP/WSS presence
+     keep_browser_open = get_browser_setting("keep_browser_open", False)
+     headless = get_browser_setting("headless", False)
+     disable_security = get_browser_setting("disable_security", False)
+     window_w = int(get_browser_setting("window_w", 1280))
+     window_h = int(get_browser_setting("window_h", 1100))
+     cdp_url = get_browser_setting("cdp_url") or None
+     wss_url = get_browser_setting("wss_url") or None
+     save_recording_path = get_browser_setting("save_recording_path") or None
+     save_trace_path = get_browser_setting("save_trace_path") or None
+     save_agent_history_path = get_browser_setting(
+         "save_agent_history_path", "./tmp/agent_history"
+     )
+     save_download_path = get_browser_setting("save_download_path", "./tmp/downloads")
+
+     stream_vw = 70
+     stream_vh = int(70 * window_h // window_w)
+
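+     # Example: with the default 1280x1100 window, the embedded live view is
+     # sized 70vw x int(70 * 1100 // 1280) = 60vh, keeping the page aspect ratio.
+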
+     os.makedirs(save_agent_history_path, exist_ok=True)
+     if save_recording_path:
+         os.makedirs(save_recording_path, exist_ok=True)
+     if save_trace_path:
+         os.makedirs(save_trace_path, exist_ok=True)
+     if save_download_path:
+         os.makedirs(save_download_path, exist_ok=True)
+
+     # --- 2. Initialize LLM ---
+     main_llm = await _initialize_llm(
+         llm_provider_name,
+         llm_model_name,
+         llm_temperature,
+         llm_base_url,
+         llm_api_key,
+         ollama_num_ctx if llm_provider_name == "ollama" else None,
+     )
+
+     # Pass the webui_manager instance to the callback when wrapping it
+     async def ask_callback_wrapper(
+         query: str, browser_context: BrowserContext
+     ) -> Dict[str, Any]:
+         return await _ask_assistant_callback(webui_manager, query, browser_context)
+
+     # --- 3. Initialize Controller ---
+     if not webui_manager.bu_controller:
+         webui_manager.bu_controller = CustomController(
+             ask_assistant_callback=ask_callback_wrapper
+         )
+         await webui_manager.bu_controller.setup_mcp_client(mcp_server_config)
+
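+     # The controller (and its MCP client) is created once and then reused across
+     # runs; handle_clear() below closes it so a changed MCP config can take
+     # effect on the next task.
+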
+     # --- 4. Initialize Browser and Context ---
+     should_close_browser_on_finish = not keep_browser_open
+
+     try:
+         # Close existing resources if not keeping open
+         if not keep_browser_open:
+             if webui_manager.bu_browser_context:
+                 logger.info("Closing previous browser context.")
+                 await webui_manager.bu_browser_context.close()
+                 webui_manager.bu_browser_context = None
+             if webui_manager.bu_browser:
+                 logger.info("Closing previous browser.")
+                 await webui_manager.bu_browser.close()
+                 webui_manager.bu_browser = None
+
+         # Create Browser if needed
+         if not webui_manager.bu_browser:
+             logger.info("Launching new browser instance.")
+             extra_args = []
+             if use_own_browser:
+                 browser_binary_path = os.getenv("BROWSER_PATH", None) or browser_binary_path
+                 if browser_binary_path == "":
+                     browser_binary_path = None
+                 browser_user_data = browser_user_data_dir or os.getenv("BROWSER_USER_DATA", None)
+                 if browser_user_data:
+                     extra_args += [f"--user-data-dir={browser_user_data}"]
+             else:
+                 browser_binary_path = None
+
+             webui_manager.bu_browser = CustomBrowser(
+                 config=BrowserConfig(
+                     headless=headless,
+                     disable_security=disable_security,
+                     browser_binary_path=browser_binary_path,
+                     extra_browser_args=extra_args,
+                     wss_url=wss_url,
+                     cdp_url=cdp_url,
+                     new_context_config=BrowserContextConfig(
+                         window_width=window_w,
+                         window_height=window_h,
+                     )
+                 )
+             )
+
+         # Create Context if needed
+         if not webui_manager.bu_browser_context:
+             logger.info("Creating new browser context.")
+             context_config = BrowserContextConfig(
+                 trace_path=save_trace_path if save_trace_path else None,
+                 save_recording_path=save_recording_path
+                 if save_recording_path
+                 else None,
+                 save_downloads_path=save_download_path if save_download_path else None,
+                 window_height=window_h,
+                 window_width=window_w,
+             )
+             if not webui_manager.bu_browser:
+                 raise ValueError("Browser not initialized, cannot create context.")
+             webui_manager.bu_browser_context = (
+                 await webui_manager.bu_browser.new_context(config=context_config)
+             )
+
+         # --- 5. Initialize or Update Agent ---
+         webui_manager.bu_agent_task_id = str(uuid.uuid4())  # New ID for this task run
+         os.makedirs(
+             os.path.join(save_agent_history_path, webui_manager.bu_agent_task_id),
+             exist_ok=True,
+         )
+         history_file = os.path.join(
+             save_agent_history_path,
+             webui_manager.bu_agent_task_id,
+             f"{webui_manager.bu_agent_task_id}.json",
+         )
+         gif_path = os.path.join(
+             save_agent_history_path,
+             webui_manager.bu_agent_task_id,
+             f"{webui_manager.bu_agent_task_id}.gif",
+         )
+
+         # Pass the webui_manager to callbacks when wrapping them
+         async def step_callback_wrapper(
+             state: BrowserState, output: AgentOutput, step_num: int
+         ):
+             await _handle_new_step(webui_manager, state, output, step_num)
+
+         def done_callback_wrapper(history: AgentHistoryList):
+             _handle_done(webui_manager, history)
+
+         if not webui_manager.bu_agent:
+             logger.info(f"Initializing new agent for task: {task}")
+             if not webui_manager.bu_browser or not webui_manager.bu_browser_context:
+                 raise ValueError(
+                     "Browser or Context not initialized, cannot create agent."
+                 )
+             webui_manager.bu_agent = BrowserUseAgent(
+                 task=task,
+                 llm=main_llm,
+                 browser=webui_manager.bu_browser,
+                 browser_context=webui_manager.bu_browser_context,
+                 controller=webui_manager.bu_controller,
+                 register_new_step_callback=step_callback_wrapper,
+                 register_done_callback=done_callback_wrapper,
+                 use_vision=use_vision,
+                 override_system_message=override_system_prompt,
+                 extend_system_message=extend_system_prompt,
+                 max_input_tokens=max_input_tokens,
+                 max_actions_per_step=max_actions,
+                 tool_calling_method=tool_calling_method,
+                 planner_llm=planner_llm,
+                 use_vision_for_planner=planner_use_vision if planner_llm else False,
+                 source="webui",
+             )
+             webui_manager.bu_agent.state.agent_id = webui_manager.bu_agent_task_id
+             webui_manager.bu_agent.settings.generate_gif = gif_path
+         else:
+             webui_manager.bu_agent.state.agent_id = webui_manager.bu_agent_task_id
+             webui_manager.bu_agent.add_new_task(task)
+             webui_manager.bu_agent.settings.generate_gif = gif_path
+             webui_manager.bu_agent.browser = webui_manager.bu_browser
+             webui_manager.bu_agent.browser_context = webui_manager.bu_browser_context
+             webui_manager.bu_agent.controller = webui_manager.bu_controller
+
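+         # On repeat runs the existing agent is reused via add_new_task() rather
+         # than re-created; only its browser, context, and controller handles are
+         # refreshed above.
+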
+         # --- 6. Run Agent Task and Stream Updates ---
+         agent_run_coro = webui_manager.bu_agent.run(max_steps=max_steps)
+         agent_task = asyncio.create_task(agent_run_coro)
+         webui_manager.bu_current_task = agent_task  # Store the task
+
+         last_chat_len = len(webui_manager.bu_chat_history)
+         while not agent_task.done():
+             is_paused = webui_manager.bu_agent.state.paused
+             is_stopped = webui_manager.bu_agent.state.stopped
+
+             # Check for pause state
+             if is_paused:
+                 yield {
+                     pause_resume_button_comp: gr.update(
+                         value="▶️ Resume", interactive=True
+                     ),
+                     stop_button_comp: gr.update(interactive=True),
+                 }
+                 # Wait until pause is released or task is stopped/done
+                 while is_paused and not agent_task.done():
+                     # Re-check agent state in loop
+                     is_paused = webui_manager.bu_agent.state.paused
+                     is_stopped = webui_manager.bu_agent.state.stopped
+                     if is_stopped:  # Stop signal received while paused
+                         break
+                     await asyncio.sleep(0.2)
+
+                 if (
+                     agent_task.done() or is_stopped
+                 ):  # If stopped or task finished while paused
+                     break
+
+                 # If resumed, yield UI update
+                 yield {
+                     pause_resume_button_comp: gr.update(
+                         value="⏸️ Pause", interactive=True
+                     ),
+                     run_button_comp: gr.update(
+                         value="⏳ Running...", interactive=False
+                     ),
+                 }
+
+             # Check if agent stopped itself or stop button was pressed (which sets agent.state.stopped)
+             if is_stopped:
+                 logger.info("Agent has stopped (internally or via stop button).")
+                 if not agent_task.done():
+                     # Ensure the task coroutine finishes if agent just set flag
+                     try:
+                         await asyncio.wait_for(
+                             agent_task, timeout=1.0
+                         )  # Give it a moment to exit run()
+                     except asyncio.TimeoutError:
+                         logger.warning(
+                             "Agent task did not finish quickly after stop signal, cancelling."
+                         )
+                         agent_task.cancel()
+                     except Exception:  # Catch task exceptions if it errors on stop
+                         pass
+                 break  # Exit the streaming loop
+
+             # Check if agent is asking for help (via response_event)
+             update_dict = {}
+             if webui_manager.bu_response_event is not None:
+                 update_dict = {
+                     user_input_comp: gr.update(
+                         placeholder="Agent needs help. Enter response and submit.",
+                         interactive=True,
+                     ),
+                     run_button_comp: gr.update(
+                         value="✔️ Submit Response", interactive=True
+                     ),
+                     pause_resume_button_comp: gr.update(interactive=False),
+                     stop_button_comp: gr.update(interactive=False),
+                     chatbot_comp: gr.update(value=webui_manager.bu_chat_history),
+                 }
+                 last_chat_len = len(webui_manager.bu_chat_history)
+                 yield update_dict
+                 # Wait until response is submitted or task finishes
+                 while (
+                     webui_manager.bu_response_event is not None
+                     and not agent_task.done()
+                 ):
+                     await asyncio.sleep(0.2)
+                 # Restore UI after response submitted or if task ended unexpectedly
+                 if not agent_task.done():
+                     yield {
+                         user_input_comp: gr.update(
+                             placeholder="Agent is running...", interactive=False
+                         ),
+                         run_button_comp: gr.update(
+                             value="⏳ Running...", interactive=False
+                         ),
+                         pause_resume_button_comp: gr.update(interactive=True),
+                         stop_button_comp: gr.update(interactive=True),
+                     }
+                 else:
+                     break  # Task finished while waiting for response
+
+             # Update Chatbot if new messages arrived via callbacks
+             if len(webui_manager.bu_chat_history) > last_chat_len:
+                 update_dict[chatbot_comp] = gr.update(
+                     value=webui_manager.bu_chat_history
+                 )
+                 last_chat_len = len(webui_manager.bu_chat_history)
+
+             # Update Browser View
+             if headless and webui_manager.bu_browser_context:
+                 try:
+                     screenshot_b64 = (
+                         await webui_manager.bu_browser_context.take_screenshot()
+                     )
+                     if screenshot_b64:
+                         html_content = f'<img src="data:image/jpeg;base64,{screenshot_b64}" style="width:{stream_vw}vw; height:{stream_vh}vh ; border:1px solid #ccc;">'
+                         update_dict[browser_view_comp] = gr.update(
+                             value=html_content, visible=True
+                         )
+                     else:
+                         html_content = f"<h1 style='width:{stream_vw}vw; height:{stream_vh}vh'>Waiting for browser session...</h1>"
+                         update_dict[browser_view_comp] = gr.update(
+                             value=html_content, visible=True
+                         )
+                 except Exception as e:
+                     logger.debug(f"Failed to capture screenshot: {e}")
+                     update_dict[browser_view_comp] = gr.update(
+                         value="<div style='...'>Error loading view...</div>",
+                         visible=True,
+                     )
+             else:
+                 update_dict[browser_view_comp] = gr.update(visible=False)
+
+             # Yield accumulated updates
+             if update_dict:
+                 yield update_dict
+
+             await asyncio.sleep(0.1)  # Polling interval
+
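+         # The loop above polls agent state every 0.1s in addition to the step
+         # callbacks, so pause/stop/help transitions show up in the UI even
+         # between agent steps.
+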
+         # --- 7. Task Finalization ---
+         webui_manager.bu_agent.state.paused = False
+         webui_manager.bu_agent.state.stopped = False
+         final_update = {}
+         try:
+             logger.info("Agent task completing...")
+             # Await the task to ensure completion and catch exceptions if not already caught
+             if not agent_task.done():
+                 await agent_task  # Retrieve result/exception
+             elif agent_task.exception():  # Check if task finished with exception
+                 agent_task.result()  # Raise the exception to be caught below
+             logger.info("Agent task completed processing.")
+
+             logger.info(f"Explicitly saving agent history to: {history_file}")
+             webui_manager.bu_agent.save_history(history_file)
+
+             if os.path.exists(history_file):
+                 final_update[history_file_comp] = gr.File(value=history_file)
+
+             if gif_path and os.path.exists(gif_path):
+                 logger.info(f"GIF found at: {gif_path}")
+                 final_update[gif_comp] = gr.Image(value=gif_path)
+
+         except asyncio.CancelledError:
+             logger.info("Agent task was cancelled.")
+             if not any(
+                 "Cancelled" in msg.get("content", "")
+                 for msg in webui_manager.bu_chat_history
+                 if msg.get("role") == "assistant"
+             ):
+                 webui_manager.bu_chat_history.append(
+                     {"role": "assistant", "content": "**Task Cancelled**."}
+                 )
+             final_update[chatbot_comp] = gr.update(value=webui_manager.bu_chat_history)
+         except Exception as e:
+             logger.error(f"Error during agent execution: {e}", exc_info=True)
+             error_message = (
+                 f"**Agent Execution Error:**\n```\n{type(e).__name__}: {e}\n```"
+             )
+             if not any(
+                 error_message in msg.get("content", "")
+                 for msg in webui_manager.bu_chat_history
+                 if msg.get("role") == "assistant"
+             ):
+                 webui_manager.bu_chat_history.append(
+                     {"role": "assistant", "content": error_message}
+                 )
+             final_update[chatbot_comp] = gr.update(value=webui_manager.bu_chat_history)
+             gr.Error(f"Agent execution failed: {e}")
+
+         finally:
+             webui_manager.bu_current_task = None  # Clear the task reference
+
+             # Close browser/context if requested
+             if should_close_browser_on_finish:
+                 if webui_manager.bu_browser_context:
+                     logger.info("Closing browser context after task.")
+                     await webui_manager.bu_browser_context.close()
+                     webui_manager.bu_browser_context = None
+                 if webui_manager.bu_browser:
+                     logger.info("Closing browser after task.")
+                     await webui_manager.bu_browser.close()
+                     webui_manager.bu_browser = None
+
+             # --- 8. Final UI Update ---
+             final_update.update(
+                 {
+                     user_input_comp: gr.update(
+                         value="",
+                         interactive=True,
+                         placeholder="Enter your next task...",
+                     ),
+                     run_button_comp: gr.update(value="▶️ Submit Task", interactive=True),
+                     stop_button_comp: gr.update(value="⏹️ Stop", interactive=False),
+                     pause_resume_button_comp: gr.update(
+                         value="⏸️ Pause", interactive=False
+                     ),
+                     clear_button_comp: gr.update(interactive=True),
+                     # Ensure final chat history is shown
+                     chatbot_comp: gr.update(value=webui_manager.bu_chat_history),
+                 }
+             )
+             yield final_update
+
+     except Exception as e:
+         # Catch errors during setup (before agent run starts)
+         logger.error(f"Error setting up agent task: {e}", exc_info=True)
+         webui_manager.bu_current_task = None  # Ensure state is reset
+         yield {
+             user_input_comp: gr.update(
+                 interactive=True, placeholder="Error during setup. Enter task..."
+             ),
+             run_button_comp: gr.update(value="▶️ Submit Task", interactive=True),
+             stop_button_comp: gr.update(value="⏹️ Stop", interactive=False),
+             pause_resume_button_comp: gr.update(value="⏸️ Pause", interactive=False),
+             clear_button_comp: gr.update(interactive=True),
+             chatbot_comp: gr.update(
+                 value=webui_manager.bu_chat_history
+                 + [{"role": "assistant", "content": f"**Setup Error:** {e}"}]
+             ),
+         }
+
+
+ # --- Button Click Handlers --- (Need access to webui_manager)
+
+
+ async def handle_submit(
+     webui_manager: WebuiManager, components: Dict[gr.components.Component, Any]
+ ):
+     """Handles clicks on the main 'Submit' button."""
+     user_input_comp = webui_manager.get_component_by_id("browser_use_agent.user_input")
+     user_input_value = components.get(user_input_comp, "").strip()
+
+     # Check if waiting for user assistance
+     if webui_manager.bu_response_event and not webui_manager.bu_response_event.is_set():
+         logger.info(f"User submitted assistance: {user_input_value}")
+         webui_manager.bu_user_help_response = (
+             user_input_value if user_input_value else "User provided no text response."
+         )
+         webui_manager.bu_response_event.set()
+         # UI updates handled by the main loop reacting to the event being set
+         yield {
+             user_input_comp: gr.update(
+                 value="",
+                 interactive=False,
+                 placeholder="Waiting for agent to continue...",
+             ),
+             webui_manager.get_component_by_id(
+                 "browser_use_agent.run_button"
+             ): gr.update(value="⏳ Running...", interactive=False),
+         }
+     # Check if a task is currently running (using _current_task)
+     elif webui_manager.bu_current_task and not webui_manager.bu_current_task.done():
+         logger.warning(
+             "Submit button clicked while agent is already running and not asking for help."
+         )
+         gr.Info("Agent is currently running. Please wait or use Stop/Pause.")
+         yield {}  # No change
+     else:
+         # Handle submission for a new task
+         logger.info("Submit button clicked for new task.")
+         # Use async generator to stream updates from run_agent_task
+         async for update in run_agent_task(webui_manager, components):
+             yield update
+
+
+ async def handle_stop(webui_manager: WebuiManager):
+     """Handles clicks on the 'Stop' button."""
+     logger.info("Stop button clicked.")
+     agent = webui_manager.bu_agent
+     task = webui_manager.bu_current_task
+
+     if agent and task and not task.done():
+         # Signal the agent to stop by setting its internal flag
+         agent.state.stopped = True
+         agent.state.paused = False  # Ensure not paused if stopped
+         return {
+             webui_manager.get_component_by_id(
+                 "browser_use_agent.stop_button"
+             ): gr.update(interactive=False, value="⏹️ Stopping..."),
+             webui_manager.get_component_by_id(
+                 "browser_use_agent.pause_resume_button"
+             ): gr.update(interactive=False),
+             webui_manager.get_component_by_id(
+                 "browser_use_agent.run_button"
+             ): gr.update(interactive=False),
+         }
+     else:
+         logger.warning("Stop clicked but agent is not running or task is already done.")
+         # Reset UI just in case it's stuck
+         return {
+             webui_manager.get_component_by_id(
+                 "browser_use_agent.run_button"
+             ): gr.update(interactive=True),
+             webui_manager.get_component_by_id(
+                 "browser_use_agent.stop_button"
+             ): gr.update(interactive=False),
+             webui_manager.get_component_by_id(
+                 "browser_use_agent.pause_resume_button"
+             ): gr.update(interactive=False),
+             webui_manager.get_component_by_id(
+                 "browser_use_agent.clear_button"
+             ): gr.update(interactive=True),
+         }
+
+
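+ # Stopping is cooperative: handle_stop() only sets agent.state.stopped, and the
+ # polling loop inside run_agent_task() performs the actual shutdown and the
+ # final UI reset.
+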
+ async def handle_pause_resume(webui_manager: WebuiManager):
+     """Handles clicks on the 'Pause/Resume' button."""
+     agent = webui_manager.bu_agent
+     task = webui_manager.bu_current_task
+
+     if agent and task and not task.done():
+         if agent.state.paused:
+             logger.info("Resume button clicked.")
+             agent.resume()
+             # UI update happens in main loop
+             return {
+                 webui_manager.get_component_by_id(
+                     "browser_use_agent.pause_resume_button"
+                 ): gr.update(value="⏸️ Pause", interactive=True)
+             }  # Optimistic update
+         else:
+             logger.info("Pause button clicked.")
+             agent.pause()
+             return {
+                 webui_manager.get_component_by_id(
+                     "browser_use_agent.pause_resume_button"
+                 ): gr.update(value="▶️ Resume", interactive=True)
+             }  # Optimistic update
+     else:
+         logger.warning(
+             "Pause/Resume clicked but agent is not running or doesn't support state."
+         )
+         return {}  # No change
+
+
+ async def handle_clear(webui_manager: WebuiManager):
+     """Handles clicks on the 'Clear' button."""
+     logger.info("Clear button clicked.")
+
+     # Stop any running task first
+     task = webui_manager.bu_current_task
+     if task and not task.done():
+         logger.info("Clearing requires stopping the current task.")
+         webui_manager.bu_agent.stop()
+         task.cancel()
+         try:
+             await asyncio.wait_for(task, timeout=2.0)  # Wait briefly
+         except (asyncio.CancelledError, asyncio.TimeoutError):
+             pass
+         except Exception as e:
+             logger.warning(f"Error stopping task on clear: {e}")
+     webui_manager.bu_current_task = None
+
+     if webui_manager.bu_controller:
+         await webui_manager.bu_controller.close_mcp_client()
+         webui_manager.bu_controller = None
+     webui_manager.bu_agent = None
+
+     # Reset state stored in manager
+     webui_manager.bu_chat_history = []
+     webui_manager.bu_response_event = None
+     webui_manager.bu_user_help_response = None
+     webui_manager.bu_agent_task_id = None
+
+     logger.info("Agent state and browser resources cleared.")
+
+     # Reset UI components
+     return {
+         webui_manager.get_component_by_id("browser_use_agent.chatbot"): gr.update(
+             value=[]
+         ),
+         webui_manager.get_component_by_id("browser_use_agent.user_input"): gr.update(
+             value="", placeholder="Enter your task here..."
+         ),
+         webui_manager.get_component_by_id(
+             "browser_use_agent.agent_history_file"
+         ): gr.update(value=None),
+         webui_manager.get_component_by_id("browser_use_agent.recording_gif"): gr.update(
+             value=None
+         ),
+         webui_manager.get_component_by_id("browser_use_agent.browser_view"): gr.update(
+             value="<div style='...'>Browser Cleared</div>"
+         ),
+         webui_manager.get_component_by_id("browser_use_agent.run_button"): gr.update(
+             value="▶️ Submit Task", interactive=True
+         ),
+         webui_manager.get_component_by_id("browser_use_agent.stop_button"): gr.update(
+             interactive=False
+         ),
+         webui_manager.get_component_by_id(
+             "browser_use_agent.pause_resume_button"
+         ): gr.update(value="⏸️ Pause", interactive=False),
+         webui_manager.get_component_by_id("browser_use_agent.clear_button"): gr.update(
+             interactive=True
+         ),
+     }
+
+
+ # --- Tab Creation Function ---
+
+
+ def create_browser_use_agent_tab(webui_manager: WebuiManager):
+     """
+     Create the run agent tab, defining UI, state, and handlers.
+     """
+     webui_manager.init_browser_use_agent()
+
+     # --- Define UI Components ---
+     tab_components = {}
+     with gr.Column():
+         chatbot = gr.Chatbot(
+             lambda: webui_manager.bu_chat_history,  # Load history dynamically
+             elem_id="browser_use_chatbot",
+             label="Agent Interaction",
+             type="messages",
+             height=600,
+             show_copy_button=True,
+         )
+         user_input = gr.Textbox(
+             label="Your Task or Response",
+             placeholder="Enter your task here or provide assistance when asked.",
+             lines=3,
+             interactive=True,
+             elem_id="user_input",
+         )
+         with gr.Row():
+             stop_button = gr.Button(
+                 "⏹️ Stop", interactive=False, variant="stop", scale=2
+             )
+             pause_resume_button = gr.Button(
+                 "⏸️ Pause", interactive=False, variant="secondary", scale=2, visible=True
+             )
+             clear_button = gr.Button(
+                 "🗑️ Clear", interactive=True, variant="secondary", scale=2
+             )
+             run_button = gr.Button("▶️ Submit Task", variant="primary", scale=3)
+
+     browser_view = gr.HTML(
+         value="<div style='width:100%; height:50vh; display:flex; justify-content:center; align-items:center; border:1px solid #ccc; background-color:#f0f0f0;'><p>Browser View (Requires Headless=True)</p></div>",
+         label="Browser Live View",
+         elem_id="browser_view",
+         visible=False,
+     )
+     with gr.Column():
+         gr.Markdown("### Task Outputs")
+         agent_history_file = gr.File(label="Agent History JSON", interactive=False)
+         recording_gif = gr.Image(
+             label="Task Recording GIF",
+             format="gif",
+             interactive=False,
+             type="filepath",
+         )
+
+     # --- Store Components in Manager ---
+     tab_components.update(
+         dict(
+             chatbot=chatbot,
+             user_input=user_input,
+             clear_button=clear_button,
+             run_button=run_button,
+             stop_button=stop_button,
+             pause_resume_button=pause_resume_button,
+             agent_history_file=agent_history_file,
+             recording_gif=recording_gif,
+             browser_view=browser_view,
+         )
+     )
+     webui_manager.add_components(
+         "browser_use_agent", tab_components
+     )  # Use "browser_use_agent" as tab_name prefix
+
+     all_managed_components = set(
+         webui_manager.get_components()
+     )  # Get all components known to manager
+     run_tab_outputs = list(tab_components.values())
+
+     async def submit_wrapper(
+         components_dict: Dict[Component, Any],
+     ) -> AsyncGenerator[Dict[Component, Any], None]:
+         """Wrapper for handle_submit that yields its results."""
+         async for update in handle_submit(webui_manager, components_dict):
+             yield update
+
+     async def stop_wrapper() -> AsyncGenerator[Dict[Component, Any], None]:
+         """Wrapper for handle_stop."""
+         update_dict = await handle_stop(webui_manager)
+         yield update_dict
+
+     async def pause_resume_wrapper() -> AsyncGenerator[Dict[Component, Any], None]:
+         """Wrapper for handle_pause_resume."""
+         update_dict = await handle_pause_resume(webui_manager)
+         yield update_dict
+
+     async def clear_wrapper() -> AsyncGenerator[Dict[Component, Any], None]:
+         """Wrapper for handle_clear."""
+         update_dict = await handle_clear(webui_manager)
+         yield update_dict
+
+     # --- Connect Event Handlers using the Wrappers ---
+     run_button.click(
+         fn=submit_wrapper, inputs=all_managed_components, outputs=run_tab_outputs
+     )
+     user_input.submit(
+         fn=submit_wrapper, inputs=all_managed_components, outputs=run_tab_outputs
+     )
+     stop_button.click(fn=stop_wrapper, inputs=None, outputs=run_tab_outputs)
+     pause_resume_button.click(
+         fn=pause_resume_wrapper, inputs=None, outputs=run_tab_outputs
+     )
+     clear_button.click(fn=clear_wrapper, inputs=None, outputs=run_tab_outputs)
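+
+     # All handlers are wired through async-generator wrappers because Gradio
+     # streams each yielded {component: update} dict to the UI as it arrives.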
src/webui/components/deep_research_agent_tab.py ADDED
@@ -0,0 +1,457 @@
+ import gradio as gr
+ from gradio.components import Component
+ from functools import partial
+
+ from src.webui.webui_manager import WebuiManager
+ from src.utils import config
+ import logging
+ import os
+ from typing import Any, Dict, AsyncGenerator, Optional, Tuple, Union
+ import asyncio
+ import json
+ from src.agent.deep_research.deep_research_agent import DeepResearchAgent
+ from src.utils import llm_provider
+
+ logger = logging.getLogger(__name__)
+
+
+ async def _initialize_llm(provider: Optional[str], model_name: Optional[str], temperature: float,
+                           base_url: Optional[str], api_key: Optional[str], num_ctx: Optional[int] = None):
+     """Initializes the LLM based on settings. Returns None if provider/model is missing."""
+     if not provider or not model_name:
+         logger.info("LLM Provider or Model Name not specified, LLM will be None.")
+         return None
+     try:
+         logger.info(f"Initializing LLM: Provider={provider}, Model={model_name}, Temp={temperature}")
+         # Use your actual LLM provider logic here
+         llm = llm_provider.get_llm_model(
+             provider=provider,
+             model_name=model_name,
+             temperature=temperature,
+             base_url=base_url or None,
+             api_key=api_key or None,
+             num_ctx=num_ctx if provider == "ollama" else None
+         )
+         return llm
+     except Exception as e:
+         logger.error(f"Failed to initialize LLM: {e}", exc_info=True)
+         gr.Warning(
+             f"Failed to initialize LLM '{model_name}' for provider '{provider}'. Please check settings. Error: {e}")
+         return None
+
+
+ def _read_file_safe(file_path: str) -> Optional[str]:
+     """Safely read a file, returning None if it doesn't exist or on error."""
+     if not os.path.exists(file_path):
+         return None
+     try:
+         with open(file_path, 'r', encoding='utf-8') as f:
+             return f.read()
+     except Exception as e:
+         logger.error(f"Error reading file {file_path}: {e}")
+         return None
+
+
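+ # _read_file_safe() is polled by run_deep_research() below to re-read
+ # research_plan.md and report.md without raising if a file is missing or only
+ # partially written.
+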
+ # --- Deep Research Agent Specific Logic ---
+
+ async def run_deep_research(webui_manager: WebuiManager, components: Dict[Component, Any]) -> AsyncGenerator[
+     Dict[Component, Any], None]:
+     """Handles initializing and running the DeepResearchAgent."""
+
+     # --- Get Components ---
+     research_task_comp = webui_manager.get_component_by_id("deep_research_agent.research_task")
+     resume_task_id_comp = webui_manager.get_component_by_id("deep_research_agent.resume_task_id")
+     parallel_num_comp = webui_manager.get_component_by_id("deep_research_agent.parallel_num")
+     save_dir_comp = webui_manager.get_component_by_id(
+         "deep_research_agent.max_query")  # Note: component ID seems misnamed in original code
+     start_button_comp = webui_manager.get_component_by_id("deep_research_agent.start_button")
+     stop_button_comp = webui_manager.get_component_by_id("deep_research_agent.stop_button")
+     markdown_display_comp = webui_manager.get_component_by_id("deep_research_agent.markdown_display")
+     markdown_download_comp = webui_manager.get_component_by_id("deep_research_agent.markdown_download")
+     mcp_server_config_comp = webui_manager.get_component_by_id("deep_research_agent.mcp_server_config")
+
+     # --- 1. Get Task and Settings ---
+     task_topic = components.get(research_task_comp, "").strip()
+     task_id_to_resume = components.get(resume_task_id_comp, "").strip() or None
+     max_parallel_agents = int(components.get(parallel_num_comp, 1))
+     base_save_dir = components.get(save_dir_comp, "./tmp/deep_research").strip()
+     safe_root_dir = "./tmp/deep_research"
+     normalized_base_save_dir = os.path.abspath(os.path.normpath(base_save_dir))
+     if os.path.commonpath([normalized_base_save_dir, os.path.abspath(safe_root_dir)]) != os.path.abspath(safe_root_dir):
+         logger.warning(f"Unsafe base_save_dir detected: {base_save_dir}. Using default directory.")
+         normalized_base_save_dir = os.path.abspath(safe_root_dir)
+     base_save_dir = normalized_base_save_dir
+     mcp_server_config_str = components.get(mcp_server_config_comp)
+     mcp_config = json.loads(mcp_server_config_str) if mcp_server_config_str else None
+
+     if not task_topic:
+         gr.Warning("Please enter a research task.")
+         yield {start_button_comp: gr.update(interactive=True)}  # Re-enable start button
+         return
+
+     # Store base save dir for stop handler
+     webui_manager.dr_save_dir = base_save_dir
+     os.makedirs(base_save_dir, exist_ok=True)
+
+     # --- 2. Initial UI Update ---
+     yield {
+         start_button_comp: gr.update(value="⏳ Running...", interactive=False),
+         stop_button_comp: gr.update(interactive=True),
+         research_task_comp: gr.update(interactive=False),
+         resume_task_id_comp: gr.update(interactive=False),
+         parallel_num_comp: gr.update(interactive=False),
+         save_dir_comp: gr.update(interactive=False),
+         markdown_display_comp: gr.update(value="Starting research..."),
+         markdown_download_comp: gr.update(value=None, interactive=False)
+     }
+
+     agent_task = None
+     running_task_id = None
+     plan_file_path = None
+     report_file_path = None
+     last_plan_content = None
+     last_plan_mtime = 0
+
+     try:
+         # --- 3. Get LLM and Browser Config from other tabs ---
+         # Access settings values via components dict, getting IDs from webui_manager
+         def get_setting(tab: str, key: str, default: Any = None):
+             comp = webui_manager.id_to_component.get(f"{tab}.{key}")
+             return components.get(comp, default) if comp else default
+
+         # LLM Config (from agent_settings tab)
+         llm_provider_name = get_setting("agent_settings", "llm_provider")
+         llm_model_name = get_setting("agent_settings", "llm_model_name")
+         llm_temperature = max(get_setting("agent_settings", "llm_temperature", 0.5), 0.5)
+         llm_base_url = get_setting("agent_settings", "llm_base_url")
+         llm_api_key = get_setting("agent_settings", "llm_api_key")
+         ollama_num_ctx = get_setting("agent_settings", "ollama_num_ctx")
+
+         llm = await _initialize_llm(
+             llm_provider_name, llm_model_name, llm_temperature, llm_base_url, llm_api_key,
+             ollama_num_ctx if llm_provider_name == "ollama" else None
+         )
+         if not llm:
+             raise ValueError("LLM Initialization failed. Please check Agent Settings.")
+
+         # Browser Config (from browser_settings tab)
+         # Note: DeepResearchAgent constructor takes a dict, not full Browser/Context objects
+         browser_config_dict = {
+             "headless": get_setting("browser_settings", "headless", False),
+             "disable_security": get_setting("browser_settings", "disable_security", False),
+             "browser_binary_path": get_setting("browser_settings", "browser_binary_path"),
+             "user_data_dir": get_setting("browser_settings", "browser_user_data_dir"),
+             "window_width": int(get_setting("browser_settings", "window_w", 1280)),
+             "window_height": int(get_setting("browser_settings", "window_h", 1100)),
+             # Add other relevant fields if DeepResearchAgent accepts them
+         }
+
+         # --- 4. Initialize or Get Agent ---
+         if not webui_manager.dr_agent:
+             webui_manager.dr_agent = DeepResearchAgent(
+                 llm=llm,
+                 browser_config=browser_config_dict,
+                 mcp_server_config=mcp_config
+             )
+             logger.info("DeepResearchAgent initialized.")
+
+         # --- 5. Start Agent Run ---
+         agent_run_coro = webui_manager.dr_agent.run(
+             topic=task_topic,
+             task_id=task_id_to_resume,
+             save_dir=base_save_dir,
+             max_parallel_browsers=max_parallel_agents
+         )
+         agent_task = asyncio.create_task(agent_run_coro)
+         webui_manager.dr_current_task = agent_task
+
+         # Wait briefly for the agent to start and potentially create the task ID/folder
+         await asyncio.sleep(1.0)
+
+         # Determine the actual task ID being used (agent sets this)
+         running_task_id = webui_manager.dr_agent.current_task_id
+         if not running_task_id:
+             # Agent might not have set it yet, try to get from result later? Risky.
+             # Or derive from resume_task_id if provided?
+             running_task_id = task_id_to_resume
+             if not running_task_id:
+                 logger.warning("Could not determine running task ID immediately.")
+                 # We can still monitor, but might miss initial plan if ID needed for path
+             else:
+                 logger.info(f"Assuming task ID based on resume ID: {running_task_id}")
+         else:
+             logger.info(f"Agent started with Task ID: {running_task_id}")
+
+         webui_manager.dr_task_id = running_task_id  # Store for stop handler
+
+         # --- 6. Monitor Progress via research_plan.md ---
+         if running_task_id:
+             task_specific_dir = os.path.join(base_save_dir, str(running_task_id))
+             plan_file_path = os.path.join(task_specific_dir, "research_plan.md")
+             report_file_path = os.path.join(task_specific_dir, "report.md")
+             logger.info(f"Monitoring plan file: {plan_file_path}")
+         else:
+             logger.warning("Cannot monitor plan file: Task ID unknown.")
+             plan_file_path = None
+         last_plan_content = None
+         while not agent_task.done():
+             update_dict = {}
+             update_dict[resume_task_id_comp] = gr.update(value=running_task_id)
+             agent_stopped = getattr(webui_manager.dr_agent, 'stopped', False)
+             if agent_stopped:
+                 logger.info("Stop signal detected from agent state.")
+                 break  # Exit monitoring loop
+
+             # Check and update research plan display
+             if plan_file_path:
+                 try:
+                     current_mtime = os.path.getmtime(plan_file_path) if os.path.exists(plan_file_path) else 0
+                     if current_mtime > last_plan_mtime:
+                         logger.info(f"Detected change in {plan_file_path}")
+                         plan_content = _read_file_safe(plan_file_path)
+                         if last_plan_content is None or (
+                                 plan_content is not None and plan_content != last_plan_content):
+                             update_dict[markdown_display_comp] = gr.update(value=plan_content)
+                             last_plan_content = plan_content
+                             last_plan_mtime = current_mtime
+                         elif plan_content is None:
+                             # File might have been deleted or became unreadable
+                             last_plan_mtime = 0  # Reset to force re-read attempt later
+                 except Exception as e:
+                     logger.warning(f"Error checking/reading plan file {plan_file_path}: {e}")
+                     # Avoid continuous logging for the same error
+                     await asyncio.sleep(2.0)
+
+             # Yield updates if any
+             if update_dict:
+                 yield update_dict
+
+             await asyncio.sleep(1.0)  # Check file changes every second
+
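+         # Plan changes are detected by mtime comparison above; content is pushed
+         # to the Markdown view only when it differs from the last rendered
+         # version, which keeps the one-second polling cheap.
+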
+         # --- 7. Task Finalization ---
+         logger.info("Agent task processing finished. Awaiting final result...")
+         final_result_dict = await agent_task  # Get result or raise exception
+         logger.info(f"Agent run completed. Result keys: {final_result_dict.keys() if final_result_dict else 'None'}")
+
+         # Try to get task ID from result if not known before
+         if not running_task_id and final_result_dict and 'task_id' in final_result_dict:
+             running_task_id = final_result_dict['task_id']
+             webui_manager.dr_task_id = running_task_id
+             task_specific_dir = os.path.join(base_save_dir, str(running_task_id))
+             report_file_path = os.path.join(task_specific_dir, "report.md")
+             logger.info(f"Task ID confirmed from result: {running_task_id}")
+
+         final_ui_update = {}
+         if report_file_path and os.path.exists(report_file_path):
+             logger.info(f"Loading final report from: {report_file_path}")
+             report_content = _read_file_safe(report_file_path)
+             if report_content:
+                 final_ui_update[markdown_display_comp] = gr.update(value=report_content)
+                 final_ui_update[markdown_download_comp] = gr.File(value=report_file_path,
+                                                                   label=f"Report ({running_task_id}.md)",
+                                                                   interactive=True)
+             else:
+                 final_ui_update[markdown_display_comp] = gr.update(
+                     value="# Research Complete\n\n*Error reading final report file.*")
+         elif final_result_dict and 'report' in final_result_dict:
+             logger.info("Using report content directly from agent result.")
+             # If agent directly returns report content
+             final_ui_update[markdown_display_comp] = gr.update(value=final_result_dict['report'])
+             # Cannot offer download if only content is available
+             final_ui_update[markdown_download_comp] = gr.update(value=None, label="Download Research Report",
+                                                                 interactive=False)
+         else:
+             logger.warning("Final report file not found and not in result dict.")
+             final_ui_update[markdown_display_comp] = gr.update(value="# Research Complete\n\n*Final report not found.*")
+
+         yield final_ui_update
+
+     except Exception as e:
+         logger.error(f"Error during Deep Research Agent execution: {e}", exc_info=True)
+         gr.Error(f"Research failed: {e}")
+         yield {markdown_display_comp: gr.update(value=f"# Research Failed\n\n**Error:**\n```\n{e}\n```")}
+
+     finally:
+         # --- 8. Final UI Reset ---
+         webui_manager.dr_current_task = None  # Clear task reference
+         webui_manager.dr_task_id = None  # Clear running task ID
+
+         yield {
+             start_button_comp: gr.update(value="▶️ Run", interactive=True),
+             stop_button_comp: gr.update(interactive=False),
+             research_task_comp: gr.update(interactive=True),
+             resume_task_id_comp: gr.update(value="", interactive=True),
+             parallel_num_comp: gr.update(interactive=True),
+             save_dir_comp: gr.update(interactive=True),
+             # Keep download button enabled if file exists
+             markdown_download_comp: gr.update() if report_file_path and os.path.exists(report_file_path) else gr.update(
+                 interactive=False)
+         }
+
+
+ async def stop_deep_research(webui_manager: WebuiManager) -> Dict[Component, Any]:
+     """Handles the Stop button click."""
+     logger.info("Stop button clicked for Deep Research.")
+     agent = webui_manager.dr_agent
+     task = webui_manager.dr_current_task
+     task_id = webui_manager.dr_task_id
+     base_save_dir = webui_manager.dr_save_dir
+
+     stop_button_comp = webui_manager.get_component_by_id("deep_research_agent.stop_button")
+     start_button_comp = webui_manager.get_component_by_id("deep_research_agent.start_button")
+     markdown_display_comp = webui_manager.get_component_by_id("deep_research_agent.markdown_display")
+     markdown_download_comp = webui_manager.get_component_by_id("deep_research_agent.markdown_download")
+
+     final_update = {
+         stop_button_comp: gr.update(interactive=False, value="⏹️ Stopping...")
+     }
+
+     if agent and task and not task.done():
+         logger.info("Signalling DeepResearchAgent to stop.")
+         try:
+             # Assuming stop is synchronous or sets a flag quickly
+             await agent.stop()
+         except Exception as e:
+             logger.error(f"Error calling agent.stop(): {e}")
+
+         # The run_deep_research loop should detect the stop and exit.
+         # We yield an intermediate "Stopping..." state. The final reset is done by run_deep_research.
+
+         # Try to show the final report if available after stopping
+         await asyncio.sleep(1.5)  # Give agent a moment to write final files potentially
+         report_file_path = None
+         if task_id and base_save_dir:
+             report_file_path = os.path.join(base_save_dir, str(task_id), "report.md")
+
+         if report_file_path and os.path.exists(report_file_path):
+             report_content = _read_file_safe(report_file_path)
+             if report_content:
+                 final_update[markdown_display_comp] = gr.update(
+                     value=report_content + "\n\n---\n*Research stopped by user.*")
+                 final_update[markdown_download_comp] = gr.File(value=report_file_path, label=f"Report ({task_id}.md)",
+                                                                interactive=True)
+             else:
+                 final_update[markdown_display_comp] = gr.update(
+                     value="# Research Stopped\n\n*Error reading final report file after stop.*")
+         else:
+             final_update[markdown_display_comp] = gr.update(value="# Research Stopped by User")
+
+         # Keep start button disabled, run_deep_research finally block will re-enable it.
+         final_update[start_button_comp] = gr.update(interactive=False)
+
+     else:
+         logger.warning("Stop clicked but no active research task found.")
+         # Reset UI state just in case
+         final_update = {
+             start_button_comp: gr.update(interactive=True),
+             stop_button_comp: gr.update(interactive=False),
+             webui_manager.get_component_by_id("deep_research_agent.research_task"): gr.update(interactive=True),
+             webui_manager.get_component_by_id("deep_research_agent.resume_task_id"): gr.update(interactive=True),
+             webui_manager.get_component_by_id("deep_research_agent.parallel_num"): gr.update(interactive=True),
+             webui_manager.get_component_by_id("deep_research_agent.max_query"): gr.update(interactive=True),
+         }
+
+     return final_update
+
+
+ async def update_mcp_server(mcp_file: str, webui_manager: WebuiManager):
+     """
+     Update the MCP server.
+     """
+     if hasattr(webui_manager, "dr_agent") and webui_manager.dr_agent:
+         logger.warning("⚠️ Closing MCP client because the MCP config file has changed!")
+         await webui_manager.dr_agent.close_mcp_client()
+
+     if not mcp_file or not os.path.exists(mcp_file) or not mcp_file.endswith('.json'):
+         logger.warning(f"{mcp_file} is not a valid MCP file.")
+         return None, gr.update(visible=False)
+
+     with open(mcp_file, 'r') as f:
+         mcp_server = json.load(f)
+
+     return json.dumps(mcp_server, indent=2), gr.update(visible=True)
+
+
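+ # A minimal sketch of the JSON update_mcp_server() accepts (shape assumed from
+ # the common MCP client-config convention, not dictated by this function):
+ # {"mcpServers": {"filesystem": {"command": "npx", "args": ["-y", "@modelcontextprotocol/server-filesystem", "/tmp"]}}}
+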
+ def create_deep_research_agent_tab(webui_manager: WebuiManager):
+     """
+     Creates the deep research agent tab.
+     """
+     input_components = set(webui_manager.get_components())
+     tab_components = {}
+
+     with gr.Group():
+         with gr.Row():
+             mcp_json_file = gr.File(label="MCP server json", interactive=True, file_types=[".json"])
+             mcp_server_config = gr.Textbox(label="MCP server", lines=6, interactive=True, visible=False)
+
+     with gr.Group():
+         research_task = gr.Textbox(label="Research Task", lines=5,
+                                    value="Give me a detailed travel plan to Switzerland from June 1st to 10th.",
+                                    interactive=True)
+         with gr.Row():
+             resume_task_id = gr.Textbox(label="Resume Task ID", value="",
+                                         interactive=True)
+             parallel_num = gr.Number(label="Parallel Agent Num", value=1,
+                                      precision=0,
+                                      interactive=True)
+             max_query = gr.Textbox(label="Research Save Dir", value="./tmp/deep_research",
+                                    interactive=True)
+     with gr.Row():
+         stop_button = gr.Button("⏹️ Stop", variant="stop", scale=2)
+         start_button = gr.Button("▶️ Run", variant="primary", scale=3)
+     with gr.Group():
+         markdown_display = gr.Markdown(label="Research Report")
+         markdown_download = gr.File(label="Download Research Report", interactive=False)
+     tab_components.update(
+         dict(
+             research_task=research_task,
+             parallel_num=parallel_num,
+             max_query=max_query,
+             start_button=start_button,
+             stop_button=stop_button,
+             markdown_display=markdown_display,
+             markdown_download=markdown_download,
+             resume_task_id=resume_task_id,
+             mcp_json_file=mcp_json_file,
+             mcp_server_config=mcp_server_config,
+         )
+     )
+     webui_manager.add_components("deep_research_agent", tab_components)
+     webui_manager.init_deep_research_agent()
+
+     async def update_wrapper(mcp_file):
+         """Wrapper for update_mcp_server."""
+         update_dict = await update_mcp_server(mcp_file, webui_manager)
+         yield update_dict
+
+     mcp_json_file.change(
+         update_wrapper,
+         inputs=[mcp_json_file],
+         outputs=[mcp_server_config, mcp_server_config]
+     )
+
+     dr_tab_outputs = list(tab_components.values())
+     all_managed_inputs = set(webui_manager.get_components())
+
+     # --- Define Event Handler Wrappers ---
+     async def start_wrapper(comps: Dict[Component, Any]) -> AsyncGenerator[Dict[Component, Any], None]:
+         async for update in run_deep_research(webui_manager, comps):
+             yield update
+
+     async def stop_wrapper() -> AsyncGenerator[Dict[Component, Any], None]:
+         update_dict = await stop_deep_research(webui_manager)
+         yield update_dict
+
+     # --- Connect Handlers ---
+     start_button.click(
+         fn=start_wrapper,
+         inputs=all_managed_inputs,
+         outputs=dr_tab_outputs
+     )
+
+     stop_button.click(
+         fn=stop_wrapper,
+         inputs=None,
+         outputs=dr_tab_outputs
+     )
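+
+     # Unlike the browser-use tab, inputs=all_managed_inputs lets start_wrapper
+     # read LLM and browser settings straight from the other tabs' components.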
src/webui/components/load_save_config_tab.py ADDED
@@ -0,0 +1,50 @@
+ import gradio as gr
+ from gradio.components import Component
+
+ from src.webui.webui_manager import WebuiManager
+ from src.utils import config
+
+
+ def create_load_save_config_tab(webui_manager: WebuiManager):
+     """
+     Creates a load and save config tab.
+     """
+     input_components = set(webui_manager.get_components())
+     tab_components = {}
+
+     config_file = gr.File(
+         label="Load UI Settings from json",
+         file_types=[".json"],
+         interactive=True
+     )
+     with gr.Row():
+         load_config_button = gr.Button("Load Config", variant="primary")
+         save_config_button = gr.Button("Save UI Settings", variant="primary")
+
+     config_status = gr.Textbox(
+         label="Status",
+         lines=2,
+         interactive=False
+     )
+
+     tab_components.update(dict(
+         load_config_button=load_config_button,
+         save_config_button=save_config_button,
+         config_status=config_status,
+         config_file=config_file,
+     ))
+
+     webui_manager.add_components("load_save_config", tab_components)
+
+     save_config_button.click(
+         fn=webui_manager.save_config,
+         inputs=set(webui_manager.get_components()),
+         outputs=[config_status]
+     )
+
+     load_config_button.click(
+         fn=webui_manager.load_config,
+         inputs=[config_file],
+         outputs=webui_manager.get_components(),
+     )
+
src/webui/interface.py ADDED
@@ -0,0 +1,95 @@
+ import gradio as gr
+
+ from src.webui.webui_manager import WebuiManager
+ from src.webui.components.agent_settings_tab import create_agent_settings_tab
+ from src.webui.components.browser_settings_tab import create_browser_settings_tab
+ from src.webui.components.browser_use_agent_tab import create_browser_use_agent_tab
+ from src.webui.components.deep_research_agent_tab import create_deep_research_agent_tab
+ from src.webui.components.load_save_config_tab import create_load_save_config_tab
+
+ theme_map = {
+     "Default": gr.themes.Default(),
+     "Soft": gr.themes.Soft(),
+     "Monochrome": gr.themes.Monochrome(),
+     "Glass": gr.themes.Glass(),
+     "Origin": gr.themes.Origin(),
+     "Citrus": gr.themes.Citrus(),
+     "Ocean": gr.themes.Ocean(),
+     "Base": gr.themes.Base()
+ }
+
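+ # For example, create_ui(theme_name="Base") selects gr.themes.Base() from the
+ # map above; a name missing from theme_map raises a KeyError at startup.
+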
21
+
22
+ def create_ui(theme_name="Ocean"):
23
+ css = """
24
+ .gradio-container {
25
+ width: 70vw !important;
26
+ max-width: 70% !important;
27
+ margin-left: auto !important;
28
+ margin-right: auto !important;
29
+ padding-top: 10px !important;
30
+ }
31
+ .header-text {
32
+ text-align: center;
33
+ margin-bottom: 20px;
34
+ }
35
+ .tab-header-text {
36
+ text-align: center;
37
+ }
38
+ .theme-section {
39
+ margin-bottom: 10px;
40
+ padding: 15px;
41
+ border-radius: 10px;
42
+ }
43
+ """
44
+
45
+ # dark mode in default
46
+ js_func = """
47
+ function refresh() {
48
+ const url = new URL(window.location);
49
+
50
+ if (url.searchParams.get('__theme') !== 'dark') {
51
+ url.searchParams.set('__theme', 'dark');
52
+ window.location.href = url.href;
53
+ }
54
+ }
55
+ """
56
+
57
+ ui_manager = WebuiManager()
58
+
59
+ with gr.Blocks(
60
+ title="Browser Use WebUI", theme=theme_map[theme_name], css=css, js=js_func,
61
+ ) as demo:
62
+ with gr.Row():
63
+ gr.Markdown(
64
+ """
65
+ # 🌐 Browser Use WebUI
66
+ ### Control your browser with AI assistance
67
+ """,
68
+ elem_classes=["header-text"],
69
+ )
70
+
71
+ with gr.Tabs() as tabs:
72
+ with gr.TabItem("⚙️ Agent Settings"):
73
+ create_agent_settings_tab(ui_manager)
74
+
75
+ with gr.TabItem("🌐 Browser Settings"):
76
+ create_browser_settings_tab(ui_manager)
77
+
78
+ with gr.TabItem("🤖 Run Agent"):
79
+ create_browser_use_agent_tab(ui_manager)
80
+
81
+ with gr.TabItem("🎁 Agent Marketplace"):
82
+ gr.Markdown(
83
+ """
84
+ ### Agents built on Browser-Use
85
+ """,
86
+ elem_classes=["tab-header-text"],
87
+ )
88
+ with gr.Tabs():
89
+ with gr.TabItem("Deep Research"):
90
+ create_deep_research_agent_tab(ui_manager)
91
+
92
+ with gr.TabItem("📁 Load & Save Config"):
93
+ create_load_save_config_tab(ui_manager)
94
+
95
+ return demo
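`create_ui` only builds the Blocks app; serving it is left to the entry point. Below is a sketch of a launcher consistent with the supervisord command further down (`python webui.py --ip 0.0.0.0 --port 7788`); the `--theme` flag is an assumption, not something the diff confirms:

```python
import argparse

from src.webui.interface import create_ui, theme_map


def main():
    parser = argparse.ArgumentParser(description="Browser Use WebUI")
    parser.add_argument("--ip", default="127.0.0.1", help="bind address")
    parser.add_argument("--port", type=int, default=7788, help="bind port")
    parser.add_argument("--theme", default="Ocean", choices=list(theme_map))
    args = parser.parse_args()

    demo = create_ui(theme_name=args.theme)
    demo.launch(server_name=args.ip, server_port=args.port)


if __name__ == "__main__":
    main()
```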
src/webui/webui_manager.py ADDED
@@ -0,0 +1,122 @@
+import json
+from collections.abc import Generator
+from typing import TYPE_CHECKING
+import os
+import gradio as gr
+from datetime import datetime
+from typing import Optional, Dict, List
+import uuid
+import asyncio
+import time
+
+from gradio.components import Component
+from browser_use.browser.browser import Browser
+from browser_use.browser.context import BrowserContext
+from browser_use.agent.service import Agent
+from src.browser.custom_browser import CustomBrowser
+from src.browser.custom_context import CustomBrowserContext
+from src.controller.custom_controller import CustomController
+from src.agent.deep_research.deep_research_agent import DeepResearchAgent
+
+
+class WebuiManager:
+    def __init__(self, settings_save_dir: str = "./tmp/webui_settings"):
+        self.id_to_component: dict[str, Component] = {}
+        self.component_to_id: dict[Component, str] = {}
+
+        self.settings_save_dir = settings_save_dir
+        os.makedirs(self.settings_save_dir, exist_ok=True)
+
+    def init_browser_use_agent(self) -> None:
+        """
+        Initialize browser-use agent state.
+        """
+        self.bu_agent: Optional[Agent] = None
+        self.bu_browser: Optional[CustomBrowser] = None
+        self.bu_browser_context: Optional[CustomBrowserContext] = None
+        self.bu_controller: Optional[CustomController] = None
+        self.bu_chat_history: List[Dict[str, Optional[str]]] = []
+        self.bu_response_event: Optional[asyncio.Event] = None
+        self.bu_user_help_response: Optional[str] = None
+        self.bu_current_task: Optional[asyncio.Task] = None
+        self.bu_agent_task_id: Optional[str] = None
+
+    def init_deep_research_agent(self) -> None:
+        """
+        Initialize deep research agent state.
+        """
+        self.dr_agent: Optional[DeepResearchAgent] = None
+        self.dr_current_task = None
+        self.dr_agent_task_id: Optional[str] = None
+        self.dr_save_dir: Optional[str] = None
+
+    def add_components(self, tab_name: str, components_dict: dict[str, "Component"]) -> None:
+        """
+        Add tab components
+        """
+        for comp_name, component in components_dict.items():
+            comp_id = f"{tab_name}.{comp_name}"
+            self.id_to_component[comp_id] = component
+            self.component_to_id[component] = comp_id
+
+    def get_components(self) -> list["Component"]:
+        """
+        Get all components
+        """
+        return list(self.id_to_component.values())
+
+    def get_component_by_id(self, comp_id: str) -> "Component":
+        """
+        Get component by id
+        """
+        return self.id_to_component[comp_id]
+
+    def get_id_by_component(self, comp: "Component") -> str:
+        """
+        Get id by component
+        """
+        return self.component_to_id[comp]
+
+    def save_config(self, components: Dict["Component", str]) -> str:
+        """
+        Save config
+        """
+        cur_settings = {}
+        for comp in components:
+            if not isinstance(comp, gr.Button) and not isinstance(comp, gr.File) and str(
+                    getattr(comp, "interactive", True)).lower() != "false":
+                comp_id = self.get_id_by_component(comp)
+                cur_settings[comp_id] = components[comp]
+
+        config_name = datetime.now().strftime("%Y%m%d-%H%M%S")
+        with open(os.path.join(self.settings_save_dir, f"{config_name}.json"), "w") as fw:
+            json.dump(cur_settings, fw, indent=4)
+
+        return os.path.join(self.settings_save_dir, f"{config_name}.json")
+
+    def load_config(self, config_path: str):
+        """
+        Load config
+        """
+        with open(config_path, "r") as fr:
+            ui_settings = json.load(fr)
+
+        update_components = {}
+        for comp_id, comp_val in ui_settings.items():
+            if comp_id in self.id_to_component:
+                comp = self.id_to_component[comp_id]
+                if comp.__class__.__name__ == "Chatbot":
+                    update_components[comp] = comp.__class__(value=comp_val, type="messages")
+                else:
+                    update_components[comp] = comp.__class__(value=comp_val)
+                if comp_id == "agent_settings.planner_llm_provider":
+                    yield update_components  # yield provider, let callback run
+                    time.sleep(0.1)  # wait for Gradio UI callback
+
+        config_status = self.id_to_component["load_save_config.config_status"]
+        update_components.update(
+            {
+                config_status: config_status.__class__(value=f"Successfully loaded config: {config_path}")
+            }
+        )
+        yield update_components
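`WebuiManager` is essentially a bidirectional registry between stable string ids (`"<tab>.<component>"`) and live Gradio components, plus per-agent state. A sketch of the round trip, runnable from the repo root; the tab and component names are invented for illustration:

```python
import gradio as gr

from src.webui.webui_manager import WebuiManager

manager = WebuiManager(settings_save_dir="./tmp/webui_settings")

with gr.Blocks():
    task_box = gr.Textbox(label="Task")
    max_steps = gr.Slider(1, 200, label="Max Steps")

manager.add_components("agent_settings", {"task": task_box, "max_steps": max_steps})

# Registered components resolve in both directions.
assert manager.get_id_by_component(task_box) == "agent_settings.task"
assert manager.get_component_by_id("agent_settings.max_steps") is max_steps

# save_config takes the {component: value} dict that Gradio supplies when an
# event handler is wired with a *set* of input components.
path = manager.save_config({task_box: "open example.com", max_steps: 50})
print(f"settings written to {path}")
```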
supervisord.conf CHANGED
@@ -3,7 +3,7 @@ user=root
 nodaemon=true
 logfile=/dev/stdout
 logfile_maxbytes=0
-loglevel=debug
+loglevel=error
 
 [program:xvfb]
 command=Xvfb :99 -screen 0 %(ENV_RESOLUTION)s -ac +extension GLX +render -noreset
@@ -65,21 +65,6 @@ startretries=5
 startsecs=3
 depends_on=x11vnc
 
-[program:persistent_browser]
-environment=START_URL="data:text/html,<html><body><h1>Browser Ready</h1></body></html>"
-command=bash -c "mkdir -p /app/data/chrome_data && sleep 8 && $(find /ms-playwright/chromium-*/chrome-linux -name chrome) --user-data-dir=/app/data/chrome_data --window-position=0,0 --window-size=%(ENV_RESOLUTION_WIDTH)s,%(ENV_RESOLUTION_HEIGHT)s --start-maximized --no-sandbox --disable-dev-shm-usage --disable-gpu --disable-software-rasterizer --disable-setuid-sandbox --no-first-run --no-default-browser-check --no-experiments --ignore-certificate-errors --remote-debugging-port=9222 --remote-debugging-address=0.0.0.0 \"$START_URL\""
-autorestart=true
-stdout_logfile=/dev/stdout
-stdout_logfile_maxbytes=0
-stderr_logfile=/dev/stderr
-stderr_logfile_maxbytes=0
-priority=350
-startretries=5
-startsecs=10
-stopsignal=TERM
-stopwaitsecs=15
-depends_on=novnc
-
 [program:webui]
 command=python webui.py --ip 0.0.0.0 --port 7788
 directory=/app
@@ -92,5 +77,4 @@ priority=400
 startretries=3
 startsecs=3
 stopsignal=TERM
-stopwaitsecs=10
-depends_on=persistent_browser
+stopwaitsecs=10
tests/test_agents.py ADDED
@@ -0,0 +1,400 @@
1
+ import pdb
2
+
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+ import sys
7
+
8
+ sys.path.append(".")
9
+ import asyncio
10
+ import os
11
+ import sys
12
+ from pprint import pprint
13
+
14
+ from browser_use import Agent
15
+ from browser_use.agent.views import AgentHistoryList
16
+
17
+ from src.utils import utils
18
+
19
+
20
+ async def test_browser_use_agent():
21
+ from browser_use.browser.browser import Browser, BrowserConfig
22
+ from browser_use.browser.context import (
23
+ BrowserContextConfig
24
+ )
25
+ from browser_use.agent.service import Agent
26
+
27
+ from src.browser.custom_browser import CustomBrowser
28
+ from src.controller.custom_controller import CustomController
29
+ from src.utils import llm_provider
30
+ from src.agent.browser_use.browser_use_agent import BrowserUseAgent
31
+
32
+ llm = llm_provider.get_llm_model(
33
+ provider="openai",
34
+ model_name="gpt-4o",
35
+ temperature=0.8,
36
+ )
37
+
38
+ # llm = llm_provider.get_llm_model(
39
+ # provider="google",
40
+ # model_name="gemini-2.0-flash",
41
+ # temperature=0.6,
42
+ # api_key=os.getenv("GOOGLE_API_KEY", "")
43
+ # )
44
+
45
+ # llm = utils.get_llm_model(
46
+ # provider="deepseek",
47
+ # model_name="deepseek-reasoner",
48
+ # temperature=0.8
49
+ # )
50
+
51
+ # llm = utils.get_llm_model(
52
+ # provider="deepseek",
53
+ # model_name="deepseek-chat",
54
+ # temperature=0.8
55
+ # )
56
+
57
+ # llm = utils.get_llm_model(
58
+ # provider="ollama", model_name="qwen2.5:7b", temperature=0.5
59
+ # )
60
+
61
+ # llm = utils.get_llm_model(
62
+ # provider="ollama", model_name="deepseek-r1:14b", temperature=0.5
63
+ # )
64
+
65
+ window_w, window_h = 1280, 1100
66
+
67
+ # llm = llm_provider.get_llm_model(
68
+ # provider="azure_openai",
69
+ # model_name="gpt-4o",
70
+ # temperature=0.5,
71
+ # base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
72
+ # api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
73
+ # )
74
+
75
+ mcp_server_config = {
76
+ "mcpServers": {
77
+ # "markitdown": {
78
+ # "command": "docker",
79
+ # "args": [
80
+ # "run",
81
+ # "--rm",
82
+ # "-i",
83
+ # "markitdown-mcp:latest"
84
+ # ]
85
+ # },
86
+ "desktop-commander": {
87
+ "command": "npx",
88
+ "args": [
89
+ "-y",
90
+ "@wonderwhy-er/desktop-commander"
91
+ ]
92
+ },
93
+ }
94
+ }
95
+ controller = CustomController()
96
+ await controller.setup_mcp_client(mcp_server_config)
97
+ use_own_browser = True
98
+ use_vision = True # Set to False when using DeepSeek
99
+
100
+ max_actions_per_step = 10
101
+ browser = None
102
+ browser_context = None
103
+
104
+ try:
105
+ extra_browser_args = []
106
+ if use_own_browser:
107
+ browser_binary_path = os.getenv("BROWSER_PATH", None)
108
+ if browser_binary_path == "":
109
+ browser_binary_path = None
110
+ browser_user_data = os.getenv("BROWSER_USER_DATA", None)
111
+ if browser_user_data:
112
+ extra_browser_args += [f"--user-data-dir={browser_user_data}"]
113
+ else:
114
+ browser_binary_path = None
115
+ browser = CustomBrowser(
116
+ config=BrowserConfig(
117
+ headless=False,
118
+ browser_binary_path=browser_binary_path,
119
+ extra_browser_args=extra_browser_args,
120
+ new_context_config=BrowserContextConfig(
121
+ window_width=window_w,
122
+ window_height=window_h,
123
+ )
124
+ )
125
+ )
126
+ browser_context = await browser.new_context(
127
+ config=BrowserContextConfig(
128
+ trace_path=None,
129
+ save_recording_path=None,
130
+ save_downloads_path="./tmp/downloads",
131
+ window_height=window_h,
132
+ window_width=window_w,
133
+ )
134
+ )
135
+ agent = BrowserUseAgent(
136
+ # task="download pdf from https://arxiv.org/pdf/2311.16498 and rename this pdf to 'mcp-test.pdf'",
137
+ task="give me nvidia stock price",
138
+ llm=llm,
139
+ browser=browser,
140
+ browser_context=browser_context,
141
+ controller=controller,
142
+ use_vision=use_vision,
143
+ max_actions_per_step=max_actions_per_step,
144
+ generate_gif=True
145
+ )
146
+ history: AgentHistoryList = await agent.run(max_steps=100)
147
+
148
+ print("Final Result:")
149
+ pprint(history.final_result(), indent=4)
150
+
151
+ print("\nErrors:")
152
+ pprint(history.errors(), indent=4)
153
+
154
+ except Exception:
155
+ import traceback
156
+ traceback.print_exc()
157
+ finally:
158
+ if browser_context:
159
+ await browser_context.close()
160
+ if browser:
161
+ await browser.close()
162
+ if controller:
163
+ await controller.close_mcp_client()
164
+
165
+
166
+ async def test_browser_use_parallel():
167
+ from browser_use.browser.browser import Browser, BrowserConfig
168
+ from browser_use.browser.context import (
169
+ BrowserContextConfig,
170
+ )
171
+ from browser_use.agent.service import Agent
172
+
173
+ from src.browser.custom_browser import CustomBrowser
174
+ from src.controller.custom_controller import CustomController
175
+ from src.utils import llm_provider
176
+ from src.agent.browser_use.browser_use_agent import BrowserUseAgent
177
+
178
+ # llm = utils.get_llm_model(
179
+ # provider="openai",
180
+ # model_name="gpt-4o",
181
+ # temperature=0.8,
182
+ # base_url=os.getenv("OPENAI_ENDPOINT", ""),
183
+ # api_key=os.getenv("OPENAI_API_KEY", ""),
184
+ # )
185
+
186
+ # llm = utils.get_llm_model(
187
+ # provider="google",
188
+ # model_name="gemini-2.0-flash",
189
+ # temperature=0.6,
190
+ # api_key=os.getenv("GOOGLE_API_KEY", "")
191
+ # )
192
+
193
+ # llm = utils.get_llm_model(
194
+ # provider="deepseek",
195
+ # model_name="deepseek-reasoner",
196
+ # temperature=0.8
197
+ # )
198
+
199
+ # llm = utils.get_llm_model(
200
+ # provider="deepseek",
201
+ # model_name="deepseek-chat",
202
+ # temperature=0.8
203
+ # )
204
+
205
+ # llm = utils.get_llm_model(
206
+ # provider="ollama", model_name="qwen2.5:7b", temperature=0.5
207
+ # )
208
+
209
+ # llm = utils.get_llm_model(
210
+ # provider="ollama", model_name="deepseek-r1:14b", temperature=0.5
211
+ # )
212
+
213
+ window_w, window_h = 1280, 1100
214
+
215
+ llm = llm_provider.get_llm_model(
216
+ provider="azure_openai",
217
+ model_name="gpt-4o",
218
+ temperature=0.5,
219
+ base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
220
+ api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
221
+ )
222
+
223
+ mcp_server_config = {
224
+ "mcpServers": {
225
+ # "markitdown": {
226
+ # "command": "docker",
227
+ # "args": [
228
+ # "run",
229
+ # "--rm",
230
+ # "-i",
231
+ # "markitdown-mcp:latest"
232
+ # ]
233
+ # },
234
+ "desktop-commander": {
235
+ "command": "npx",
236
+ "args": [
237
+ "-y",
238
+ "@wonderwhy-er/desktop-commander"
239
+ ]
240
+ },
241
+ # "filesystem": {
242
+ # "command": "npx",
243
+ # "args": [
244
+ # "-y",
245
+ # "@modelcontextprotocol/server-filesystem",
246
+ # "/Users/xxx/ai_workspace",
247
+ # ]
248
+ # },
249
+ }
250
+ }
251
+ controller = CustomController()
252
+ await controller.setup_mcp_client(mcp_server_config)
253
+ use_own_browser = True
254
+ use_vision = True # Set to False when using DeepSeek
255
+
256
+ max_actions_per_step = 10
257
+ browser = None
258
+ browser_context = None
259
+
260
+ try:
261
+ extra_browser_args = []
262
+ if use_own_browser:
263
+ browser_binary_path = os.getenv("BROWSER_PATH", None)
264
+ if browser_binary_path == "":
265
+ browser_binary_path = None
266
+ browser_user_data = os.getenv("BROWSER_USER_DATA", None)
267
+ if browser_user_data:
268
+ extra_browser_args += [f"--user-data-dir={browser_user_data}"]
269
+ else:
270
+ browser_binary_path = None
271
+ browser = CustomBrowser(
272
+ config=BrowserConfig(
273
+ headless=False,
274
+ browser_binary_path=browser_binary_path,
275
+ extra_browser_args=extra_browser_args,
276
+ new_context_config=BrowserContextConfig(
277
+ window_width=window_w,
278
+ window_height=window_h,
279
+ )
280
+ )
281
+ )
282
+ browser_context = await browser.new_context(
283
+ config=BrowserContextConfig(
284
+ trace_path=None,
285
+ save_recording_path=None,
286
+ save_downloads_path="./tmp/downloads",
287
+ window_height=window_h,
288
+ window_width=window_w,
289
+ force_new_context=True
290
+ )
291
+ )
292
+ agents = [
293
+ BrowserUseAgent(task=task, llm=llm, browser=browser, controller=controller)
294
+ for task in [
295
+ 'Search Google for weather in Tokyo',
296
+ # 'Check Reddit front page title',
297
+ # 'Find NASA image of the day',
298
+ # 'Check top story on CNN',
299
+ # 'Search latest SpaceX launch date',
300
+ # 'Look up population of Paris',
301
+ 'Find current time in Sydney',
302
+ 'Check who won last Super Bowl',
303
+ # 'Search trending topics on Twitter',
304
+ ]
305
+ ]
306
+
307
+ histories = await asyncio.gather(*[agent.run() for agent in agents])
+ # asyncio.gather returns one AgentHistoryList per agent
+ for history in histories:
+     print("Final Result:")
+     pprint(history.final_result(), indent=4)
+     print("\nErrors:")
+     pprint(history.errors(), indent=4)
313
+
314
+ pdb.set_trace()
315
+
316
+ except Exception:
317
+ import traceback
318
+
319
+ traceback.print_exc()
320
+ finally:
321
+ if browser_context:
322
+ await browser_context.close()
323
+ if browser:
324
+ await browser.close()
325
+ if controller:
326
+ await controller.close_mcp_client()
327
+
328
+
329
+ async def test_deep_research_agent():
330
+ from src.agent.deep_research.deep_research_agent import DeepResearchAgent, PLAN_FILENAME, REPORT_FILENAME
331
+ from src.utils import llm_provider
332
+
333
+ llm = llm_provider.get_llm_model(
334
+ provider="openai",
335
+ model_name="gpt-4o",
336
+ temperature=0.5
337
+ )
338
+
339
+ # llm = llm_provider.get_llm_model(
340
+ # provider="bedrock",
341
+ # )
342
+
343
+ mcp_server_config = {
344
+ "mcpServers": {
345
+ "desktop-commander": {
346
+ "command": "npx",
347
+ "args": [
348
+ "-y",
349
+ "@wonderwhy-er/desktop-commander"
350
+ ]
351
+ },
352
+ }
353
+ }
354
+
355
+ browser_config = {"headless": False, "window_width": 1280, "window_height": 1100, "use_own_browser": False}
356
+ agent = DeepResearchAgent(llm=llm, browser_config=browser_config, mcp_server_config=mcp_server_config)
357
+ research_topic = "Give me investment advice on Nvidia and Tesla."
358
+ task_id_to_resume = "" # Set this to resume a previous task ID
359
+
360
+ print(f"Starting research on: {research_topic}")
361
+
362
+ try:
363
+ # Call run and wait for the final result dictionary
364
+ result = await agent.run(research_topic,
365
+ task_id=task_id_to_resume,
366
+ save_dir="./tmp/deep_research",
367
+ max_parallel_browsers=1,
368
+ )
369
+
370
+ print("\n--- Research Process Ended ---")
371
+ print(f"Status: {result.get('status')}")
372
+ print(f"Message: {result.get('message')}")
373
+ print(f"Task ID: {result.get('task_id')}")
374
+
375
+ # Check the final state for the report
376
+ final_state = result.get('final_state', {})
377
+ if final_state:
378
+ print("\n--- Final State Summary ---")
379
+ print(
380
+ f" Plan Steps Completed: {sum(1 for item in final_state.get('research_plan', []) if item.get('status') == 'completed')}")
381
+ print(f" Total Search Results Logged: {len(final_state.get('search_results', []))}")
382
+ if final_state.get("final_report"):
383
+ print(" Final Report: Generated (content omitted). You can find it in the output directory.")
384
+ # print("\n--- Final Report ---") # Optionally print report
385
+ # print(final_state["final_report"])
386
+ else:
387
+ print(" Final Report: Not generated.")
388
+ else:
389
+ print("Final state information not available.")
390
+
391
+
392
+ except Exception as e:
393
+ print(f"\n--- An unhandled error occurred outside the agent run ---")
394
+ print(e)
395
+
396
+
397
+ if __name__ == "__main__":
398
+ asyncio.run(test_browser_use_agent())
399
+ # asyncio.run(test_browser_use_parallel())
400
+ # asyncio.run(test_deep_research_agent())
tests/test_controller.py ADDED
@@ -0,0 +1,131 @@
1
+ import asyncio
2
+ import pdb
3
+ import sys
4
+ import time
5
+
6
+ sys.path.append(".")
7
+
8
+ from dotenv import load_dotenv
9
+
10
+ load_dotenv()
11
+
12
+
13
+ async def test_mcp_client():
14
+ from src.utils.mcp_client import setup_mcp_client_and_tools, create_tool_param_model
15
+
16
+ test_server_config = {
17
+ "mcpServers": {
18
+ # "markitdown": {
19
+ # "command": "docker",
20
+ # "args": [
21
+ # "run",
22
+ # "--rm",
23
+ # "-i",
24
+ # "markitdown-mcp:latest"
25
+ # ]
26
+ # },
27
+ "desktop-commander": {
28
+ "command": "npx",
29
+ "args": [
30
+ "-y",
31
+ "@wonderwhy-er/desktop-commander"
32
+ ]
33
+ },
34
+ # "filesystem": {
35
+ # "command": "npx",
36
+ # "args": [
37
+ # "-y",
38
+ # "@modelcontextprotocol/server-filesystem",
39
+ # "/Users/xxx/ai_workspace",
40
+ # ]
41
+ # },
42
+ }
43
+ }
44
+
45
+ mcp_tools, mcp_client = await setup_mcp_client_and_tools(test_server_config)
46
+
47
+ for tool in mcp_tools:
48
+ tool_param_model = create_tool_param_model(tool)
49
+ print(tool.name)
50
+ print(tool.description)
51
+ print(tool_param_model.model_json_schema())
52
+ pdb.set_trace()
53
+
54
+
55
+ async def test_controller_with_mcp():
56
+ import os
57
+ from src.controller.custom_controller import CustomController
58
+ from browser_use.controller.registry.views import ActionModel
59
+
60
+ mcp_server_config = {
61
+ "mcpServers": {
62
+ # "markitdown": {
63
+ # "command": "docker",
64
+ # "args": [
65
+ # "run",
66
+ # "--rm",
67
+ # "-i",
68
+ # "markitdown-mcp:latest"
69
+ # ]
70
+ # },
71
+ "desktop-commander": {
72
+ "command": "npx",
73
+ "args": [
74
+ "-y",
75
+ "@wonderwhy-er/desktop-commander"
76
+ ]
77
+ },
78
+ # "filesystem": {
79
+ # "command": "npx",
80
+ # "args": [
81
+ # "-y",
82
+ # "@modelcontextprotocol/server-filesystem",
83
+ # "/Users/xxx/ai_workspace",
84
+ # ]
85
+ # },
86
+ }
87
+ }
88
+
89
+ controller = CustomController()
90
+ await controller.setup_mcp_client(mcp_server_config)
91
+ action_name = "mcp.desktop-commander.execute_command"
92
+ action_info = controller.registry.registry.actions[action_name]
93
+ param_model = action_info.param_model
94
+ print(param_model.model_json_schema())
95
+ params = {"command": f"python ./tmp/test.py"
96
+ }
97
+ validated_params = param_model(**params)
98
+ ActionModel_ = controller.registry.create_action_model()
99
+ # Create ActionModel instance with the validated parameters
100
+ action_model = ActionModel_(**{action_name: validated_params})
101
+ result = await controller.act(action_model)
102
+ result = result.extracted_content
103
+ print(result)
104
+ if result and "Command is still running. Use read_output to get more output." in result and "PID" in \
105
+ result.split("\n")[0]:
106
+ pid = int(result.split("\n")[0].split("PID")[-1].strip())
107
+ action_name = "mcp.desktop-commander.read_output"
108
+ action_info = controller.registry.registry.actions[action_name]
109
+ param_model = action_info.param_model
110
+ print(param_model.model_json_schema())
111
+ params = {"pid": pid}
112
+ validated_params = param_model(**params)
113
+ action_model = ActionModel_(**{action_name: validated_params})
114
+ output_result = ""
115
+ while True:
116
+ time.sleep(1)
117
+ result = await controller.act(action_model)
118
+ result = result.extracted_content
119
+ if result:
120
+ pdb.set_trace()
121
+ output_result = result
122
+ break
123
+ print(output_result)
124
+ pdb.set_trace()
125
+ await controller.close_mcp_client()
126
+ pdb.set_trace()
127
+
128
+
129
+ if __name__ == '__main__':
130
+ # asyncio.run(test_mcp_client())
131
+ asyncio.run(test_controller_with_mcp())
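The pdb breakpoints make the flow in `test_controller_with_mcp()` hard to follow; distilled, the calling convention is: register the MCP servers, look up the auto-generated action (`mcp.<server>.<tool>`) and its parameter model, then dispatch through the controller. A condensed sketch using only names that appear in the test; the `echo hello` command is a placeholder:

```python
import asyncio

from src.controller.custom_controller import CustomController


async def run_one_mcp_action():
    controller = CustomController()
    await controller.setup_mcp_client({
        "mcpServers": {
            "desktop-commander": {
                "command": "npx",
                "args": ["-y", "@wonderwhy-er/desktop-commander"],
            }
        }
    })
    try:
        # MCP tools are registered as actions named "mcp.<server>.<tool>".
        action_name = "mcp.desktop-commander.execute_command"
        param_model = controller.registry.registry.actions[action_name].param_model
        ActionModel_ = controller.registry.create_action_model()
        action = ActionModel_(**{action_name: param_model(command="echo hello")})
        result = await controller.act(action)
        print(result.extracted_content)
    finally:
        await controller.close_mcp_client()


if __name__ == "__main__":
    asyncio.run(run_one_mcp_action())
```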
tests/test_llm_api.py CHANGED
@@ -12,6 +12,7 @@ import sys
 
 sys.path.append(".")
 
+
 @dataclass
 class LLMConfig:
     provider: str
@@ -20,6 +21,7 @@ class LLMConfig:
     base_url: str = None
     api_key: str = None
 
+
 def create_message_content(text, image_path=None):
     content = [{"type": "text", "text": text}]
     image_format = "png" if image_path and image_path.endswith(".png") else "jpeg"
@@ -32,6 +34,7 @@ def create_message_content(text, image_path=None):
         })
     return content
 
+
 def get_env_value(key, provider):
     env_mappings = {
         "openai": {"api_key": "OPENAI_API_KEY", "base_url": "OPENAI_ENDPOINT"},
@@ -40,20 +43,22 @@ def get_env_value(key, provider):
         "deepseek": {"api_key": "DEEPSEEK_API_KEY", "base_url": "DEEPSEEK_ENDPOINT"},
         "mistral": {"api_key": "MISTRAL_API_KEY", "base_url": "MISTRAL_ENDPOINT"},
         "alibaba": {"api_key": "ALIBABA_API_KEY", "base_url": "ALIBABA_ENDPOINT"},
-        "moonshot":{"api_key": "MOONSHOT_API_KEY", "base_url": "MOONSHOT_ENDPOINT"},
+        "moonshot": {"api_key": "MOONSHOT_API_KEY", "base_url": "MOONSHOT_ENDPOINT"},
+        "ibm": {"api_key": "IBM_API_KEY", "base_url": "IBM_ENDPOINT"}
     }
 
     if provider in env_mappings and key in env_mappings[provider]:
        return os.getenv(env_mappings[provider][key], "")
     return ""
 
+
 def test_llm(config, query, image_path=None, system_message=None):
-    from src.utils import utils
+    from src.utils import utils, llm_provider
 
     # Special handling for Ollama-based models
     if config.provider == "ollama":
         if "deepseek-r1" in config.model_name:
-            from src.utils.llm import DeepSeekR1ChatOllama
+            from src.utils.llm_provider import DeepSeekR1ChatOllama
             llm = DeepSeekR1ChatOllama(model=config.model_name)
         else:
             llm = ChatOllama(model=config.model_name)
@@ -65,7 +70,7 @@ def test_llm(config, query, image_path=None, system_message=None):
         return
 
     # For other providers, use the standard configuration
-    llm = utils.get_llm_model(
+    llm = llm_provider.get_llm_model(
         provider=config.provider,
         model_name=config.model_name,
         temperature=config.temperature,
@@ -85,53 +90,70 @@ def test_llm(config, query, image_path=None, system_message=None):
         print(ai_msg.reasoning_content)
     print(ai_msg.content)
 
-    if config.provider == "deepseek" and "deepseek-reasoner" in config.model_name:
-        print(llm.model_name)
-        pdb.set_trace()
-
 def test_openai_model():
     config = LLMConfig(provider="openai", model_name="gpt-4o")
     test_llm(config, "Describe this image", "assets/examples/test.png")
 
+
 def test_google_model():
     # Enable your API key first if you haven't: https://ai.google.dev/palm_docs/oauth_quickstart
     config = LLMConfig(provider="google", model_name="gemini-2.0-flash-exp")
     test_llm(config, "Describe this image", "assets/examples/test.png")
 
+
 def test_azure_openai_model():
     config = LLMConfig(provider="azure_openai", model_name="gpt-4o")
     test_llm(config, "Describe this image", "assets/examples/test.png")
 
+
 def test_deepseek_model():
     config = LLMConfig(provider="deepseek", model_name="deepseek-chat")
     test_llm(config, "Who are you?")
 
+
 def test_deepseek_r1_model():
     config = LLMConfig(provider="deepseek", model_name="deepseek-reasoner")
     test_llm(config, "Which is greater, 9.11 or 9.8?", system_message="You are a helpful AI assistant.")
 
+
 def test_ollama_model():
     config = LLMConfig(provider="ollama", model_name="qwen2.5:7b")
     test_llm(config, "Sing a ballad of LangChain.")
 
+
 def test_deepseek_r1_ollama_model():
     config = LLMConfig(provider="ollama", model_name="deepseek-r1:14b")
     test_llm(config, "How many 'r's are in the word 'strawberry'?")
 
+
 def test_mistral_model():
     config = LLMConfig(provider="mistral", model_name="pixtral-large-latest")
     test_llm(config, "Describe this image", "assets/examples/test.png")
 
+
 def test_moonshot_model():
     config = LLMConfig(provider="moonshot", model_name="moonshot-v1-32k-vision-preview")
     test_llm(config, "Describe this image", "assets/examples/test.png")
 
+
+def test_ibm_model():
+    config = LLMConfig(provider="ibm", model_name="meta-llama/llama-4-maverick-17b-128e-instruct-fp8")
+    test_llm(config, "Describe this image", "assets/examples/test.png")
+
+
+def test_qwen_model():
+    config = LLMConfig(provider="alibaba", model_name="qwen-vl-max")
+    test_llm(config, "How many 'r's are in the word 'strawberry'?")
+
+
 if __name__ == "__main__":
     # test_openai_model()
     # test_google_model()
-    # test_azure_openai_model()
-    #test_deepseek_model()
+    test_azure_openai_model()
+    # test_deepseek_model()
     # test_ollama_model()
-    test_deepseek_r1_model()
+    # test_deepseek_r1_model()
     # test_deepseek_r1_ollama_model()
     # test_mistral_model()
+    # test_ibm_model()
+    # test_qwen_model()
webui.py CHANGED
@@ -1,1201 +1,18 @@
1
- import pdb
2
- import logging
3
-
4
  from dotenv import load_dotenv
5
-
6
  load_dotenv()
7
- import os
8
- import glob
9
- import asyncio
10
  import argparse
11
- import os
12
-
13
- logger = logging.getLogger(__name__)
14
-
15
- import gradio as gr
16
- import inspect
17
- from functools import wraps
18
-
19
- from browser_use.agent.service import Agent
20
- from playwright.async_api import async_playwright
21
- from browser_use.browser.browser import Browser, BrowserConfig
22
- from browser_use.browser.context import (
23
- BrowserContextConfig,
24
- BrowserContextWindowSize,
25
- )
26
- from langchain_ollama import ChatOllama
27
- from playwright.async_api import async_playwright
28
- from src.utils.agent_state import AgentState
29
-
30
- from src.utils import utils
31
- from src.agent.custom_agent import CustomAgent
32
- from src.browser.custom_browser import CustomBrowser
33
- from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt
34
- from src.browser.custom_context import BrowserContextConfig, CustomBrowserContext
35
- from src.controller.custom_controller import CustomController
36
- from gradio.themes import Citrus, Default, Glass, Monochrome, Ocean, Origin, Soft, Base
37
- from src.utils.utils import update_model_dropdown, get_latest_files, capture_screenshot, MissingAPIKeyError
38
- from src.utils import utils
39
-
40
- # Global variables for persistence
41
- _global_browser = None
42
- _global_browser_context = None
43
- _global_agent = None
44
-
45
- # Create the global agent state instance
46
- _global_agent_state = AgentState()
47
-
48
- # webui config
49
- webui_config_manager = utils.ConfigManager()
50
-
51
-
52
- def scan_and_register_components(blocks):
53
- """扫描一个 Blocks 对象并注册其中的所有交互式组件,但不包括按钮"""
54
- global webui_config_manager
55
-
56
- def traverse_blocks(block, prefix=""):
57
- registered = 0
58
-
59
- # Handle the Blocks object's own components
60
- if hasattr(block, "children"):
61
- for i, child in enumerate(block.children):
62
- if isinstance(child, gr.components.Component):
63
- # Exclude Button components
64
- if getattr(child, "interactive", False) and not isinstance(child, gr.Button):
65
- name = f"{prefix}component_{i}"
66
- if hasattr(child, "label") and child.label:
67
- # Use the label as part of the name
68
- label = child.label
69
- name = f"{prefix}{label}"
70
- logger.debug(f"Registering component: {name}")
71
- webui_config_manager.register_component(name, child)
72
- registered += 1
73
- elif hasattr(child, "children"):
74
- # Recursively process nested Blocks
75
- new_prefix = f"{prefix}block_{i}_"
76
- registered += traverse_blocks(child, new_prefix)
77
-
78
- return registered
79
-
80
- total = traverse_blocks(blocks)
81
- logger.info(f"Total registered components: {total}")
82
-
83
-
84
- def save_current_config():
85
- return webui_config_manager.save_current_config()
86
-
87
-
88
- def update_ui_from_config(config_file):
89
- return webui_config_manager.update_ui_from_config(config_file)
90
-
91
-
92
- def resolve_sensitive_env_variables(text):
93
- """
94
- Replace environment variable placeholders ($SENSITIVE_*) with their values.
95
- Only replaces variables that start with SENSITIVE_.
96
- """
97
- if not text:
98
- return text
99
-
100
- import re
101
-
102
- # Find all $SENSITIVE_* patterns
103
- env_vars = re.findall(r'\$SENSITIVE_[A-Za-z0-9_]*', text)
104
-
105
- result = text
106
- for var in env_vars:
107
- # Remove the $ prefix to get the actual environment variable name
108
- env_name = var[1:] # removes the $
109
- env_value = os.getenv(env_name)
110
- if env_value is not None:
111
- # Replace $SENSITIVE_VAR_NAME with its value
112
- result = result.replace(var, env_value)
113
-
114
- return result
115
-
116
-
117
- async def stop_agent():
118
- """Request the agent to stop and update UI with enhanced feedback"""
119
- global _global_agent
120
-
121
- try:
122
- if _global_agent is not None:
123
- # Request stop
124
- _global_agent.stop()
125
- # Update UI immediately
126
- message = "Stop requested - the agent will halt at the next safe point"
127
- logger.info(f"🛑 {message}")
128
-
129
- # Return UI updates
130
- return (
131
- gr.update(value="Stopping...", interactive=False), # stop_button
132
- gr.update(interactive=False), # run_button
133
- )
134
- except Exception as e:
135
- error_msg = f"Error during stop: {str(e)}"
136
- logger.error(error_msg)
137
- return (
138
- gr.update(value="Stop", interactive=True),
139
- gr.update(interactive=True)
140
- )
141
-
142
-
143
- async def stop_research_agent():
144
- """Request the agent to stop and update UI with enhanced feedback"""
145
- global _global_agent_state
146
-
147
- try:
148
- # Request stop
149
- _global_agent_state.request_stop()
150
-
151
- # Update UI immediately
152
- message = "Stop requested - the agent will halt at the next safe point"
153
- logger.info(f"🛑 {message}")
154
-
155
- # Return UI updates
156
- return ( # errors_output
157
- gr.update(value="Stopping...", interactive=False), # stop_button
158
- gr.update(interactive=False), # run_button
159
- )
160
- except Exception as e:
161
- error_msg = f"Error during stop: {str(e)}"
162
- logger.error(error_msg)
163
- return (
164
- gr.update(value="Stop", interactive=True),
165
- gr.update(interactive=True)
166
- )
167
-
168
-
169
- async def run_browser_agent(
170
- agent_type,
171
- llm_provider,
172
- llm_model_name,
173
- llm_num_ctx,
174
- llm_temperature,
175
- llm_base_url,
176
- llm_api_key,
177
- use_own_browser,
178
- keep_browser_open,
179
- headless,
180
- disable_security,
181
- window_w,
182
- window_h,
183
- save_recording_path,
184
- save_agent_history_path,
185
- save_trace_path,
186
- enable_recording,
187
- task,
188
- add_infos,
189
- max_steps,
190
- use_vision,
191
- max_actions_per_step,
192
- tool_calling_method,
193
- chrome_cdp,
194
- max_input_tokens
195
- ):
196
- try:
197
- # Disable recording if the checkbox is unchecked
198
- if not enable_recording:
199
- save_recording_path = None
200
-
201
- # Ensure the recording directory exists if recording is enabled
202
- if save_recording_path:
203
- os.makedirs(save_recording_path, exist_ok=True)
204
-
205
- # Get the list of existing videos before the agent runs
206
- existing_videos = set()
207
- if save_recording_path:
208
- existing_videos = set(
209
- glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4"))
210
- + glob.glob(os.path.join(save_recording_path, "*.[wW][eE][bB][mM]"))
211
- )
212
-
213
- task = resolve_sensitive_env_variables(task)
214
-
215
- # Run the agent
216
- llm = utils.get_llm_model(
217
- provider=llm_provider,
218
- model_name=llm_model_name,
219
- num_ctx=llm_num_ctx,
220
- temperature=llm_temperature,
221
- base_url=llm_base_url,
222
- api_key=llm_api_key,
223
- )
224
- if agent_type == "org":
225
- final_result, errors, model_actions, model_thoughts, trace_file, history_file = await run_org_agent(
226
- llm=llm,
227
- use_own_browser=use_own_browser,
228
- keep_browser_open=keep_browser_open,
229
- headless=headless,
230
- disable_security=disable_security,
231
- window_w=window_w,
232
- window_h=window_h,
233
- save_recording_path=save_recording_path,
234
- save_agent_history_path=save_agent_history_path,
235
- save_trace_path=save_trace_path,
236
- task=task,
237
- max_steps=max_steps,
238
- use_vision=use_vision,
239
- max_actions_per_step=max_actions_per_step,
240
- tool_calling_method=tool_calling_method,
241
- chrome_cdp=chrome_cdp,
242
- max_input_tokens=max_input_tokens
243
- )
244
- elif agent_type == "custom":
245
- final_result, errors, model_actions, model_thoughts, trace_file, history_file = await run_custom_agent(
246
- llm=llm,
247
- use_own_browser=use_own_browser,
248
- keep_browser_open=keep_browser_open,
249
- headless=headless,
250
- disable_security=disable_security,
251
- window_w=window_w,
252
- window_h=window_h,
253
- save_recording_path=save_recording_path,
254
- save_agent_history_path=save_agent_history_path,
255
- save_trace_path=save_trace_path,
256
- task=task,
257
- add_infos=add_infos,
258
- max_steps=max_steps,
259
- use_vision=use_vision,
260
- max_actions_per_step=max_actions_per_step,
261
- tool_calling_method=tool_calling_method,
262
- chrome_cdp=chrome_cdp,
263
- max_input_tokens=max_input_tokens
264
- )
265
- else:
266
- raise ValueError(f"Invalid agent type: {agent_type}")
267
-
268
- # Get the list of videos after the agent runs (if recording is enabled)
269
- # latest_video = None
270
- # if save_recording_path:
271
- # new_videos = set(
272
- # glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4"))
273
- # + glob.glob(os.path.join(save_recording_path, "*.[wW][eE][bB][mM]"))
274
- # )
275
- # if new_videos - existing_videos:
276
- # latest_video = list(new_videos - existing_videos)[0] # Get the first new video
277
-
278
- gif_path = os.path.join(os.path.dirname(__file__), "agent_history.gif")
279
-
280
- return (
281
- final_result,
282
- errors,
283
- model_actions,
284
- model_thoughts,
285
- gif_path,
286
- trace_file,
287
- history_file,
288
- gr.update(value="Stop", interactive=True), # Re-enable stop button
289
- gr.update(interactive=True) # Re-enable run button
290
- )
291
-
292
- except MissingAPIKeyError as e:
293
- logger.error(str(e))
294
- raise gr.Error(str(e), print_exception=False)
295
-
296
- except Exception as e:
297
- import traceback
298
- traceback.print_exc()
299
- errors = str(e) + "\n" + traceback.format_exc()
300
- return (
301
- '', # final_result
302
- errors, # errors
303
- '', # model_actions
304
- '', # model_thoughts
305
- None, # latest_video
306
- None, # history_file
307
- None, # trace_file
308
- gr.update(value="Stop", interactive=True), # Re-enable stop button
309
- gr.update(interactive=True) # Re-enable run button
310
- )
311
-
312
-
313
- async def run_org_agent(
314
- llm,
315
- use_own_browser,
316
- keep_browser_open,
317
- headless,
318
- disable_security,
319
- window_w,
320
- window_h,
321
- save_recording_path,
322
- save_agent_history_path,
323
- save_trace_path,
324
- task,
325
- max_steps,
326
- use_vision,
327
- max_actions_per_step,
328
- tool_calling_method,
329
- chrome_cdp,
330
- max_input_tokens
331
- ):
332
- try:
333
- global _global_browser, _global_browser_context, _global_agent
334
-
335
- extra_chromium_args = ["--accept_downloads=True", f"--window-size={window_w},{window_h}"]
336
- cdp_url = chrome_cdp
337
-
338
- if use_own_browser:
339
- cdp_url = os.getenv("CHROME_CDP", chrome_cdp)
340
- chrome_path = os.getenv("CHROME_PATH", None)
341
- if chrome_path == "":
342
- chrome_path = None
343
- chrome_user_data = os.getenv("CHROME_USER_DATA", None)
344
- if chrome_user_data:
345
- extra_chromium_args += [f"--user-data-dir={chrome_user_data}"]
346
- else:
347
- chrome_path = None
348
-
349
- if _global_browser is None:
350
- _global_browser = Browser(
351
- config=BrowserConfig(
352
- headless=headless,
353
- cdp_url=cdp_url,
354
- disable_security=disable_security,
355
- chrome_instance_path=chrome_path,
356
- extra_chromium_args=extra_chromium_args,
357
- )
358
- )
359
-
360
- if _global_browser_context is None:
361
- _global_browser_context = await _global_browser.new_context(
362
- config=BrowserContextConfig(
363
- trace_path=save_trace_path if save_trace_path else None,
364
- save_recording_path=save_recording_path if save_recording_path else None,
365
- save_downloads_path="./tmp/downloads",
366
- no_viewport=False,
367
- browser_window_size=BrowserContextWindowSize(
368
- width=window_w, height=window_h
369
- ),
370
- )
371
- )
372
-
373
- if _global_agent is None:
374
- _global_agent = Agent(
375
- task=task,
376
- llm=llm,
377
- use_vision=use_vision,
378
- browser=_global_browser,
379
- browser_context=_global_browser_context,
380
- max_actions_per_step=max_actions_per_step,
381
- tool_calling_method=tool_calling_method,
382
- max_input_tokens=max_input_tokens,
383
- generate_gif=True
384
- )
385
- history = await _global_agent.run(max_steps=max_steps)
386
-
387
- history_file = os.path.join(save_agent_history_path, f"{_global_agent.state.agent_id}.json")
388
- _global_agent.save_history(history_file)
389
-
390
- final_result = history.final_result()
391
- errors = history.errors()
392
- model_actions = history.model_actions()
393
- model_thoughts = history.model_thoughts()
394
-
395
- trace_file = get_latest_files(save_trace_path)
396
-
397
- return final_result, errors, model_actions, model_thoughts, trace_file.get('.zip'), history_file
398
- except Exception as e:
399
- import traceback
400
- traceback.print_exc()
401
- errors = str(e) + "\n" + traceback.format_exc()
402
- return '', errors, '', '', None, None
403
- finally:
404
- _global_agent = None
405
- # Handle cleanup based on persistence configuration
406
- if not keep_browser_open:
407
- if _global_browser_context:
408
- await _global_browser_context.close()
409
- _global_browser_context = None
410
-
411
- if _global_browser:
412
- await _global_browser.close()
413
- _global_browser = None
414
-
415
-
416
- async def run_custom_agent(
417
- llm,
418
- use_own_browser,
419
- keep_browser_open,
420
- headless,
421
- disable_security,
422
- window_w,
423
- window_h,
424
- save_recording_path,
425
- save_agent_history_path,
426
- save_trace_path,
427
- task,
428
- add_infos,
429
- max_steps,
430
- use_vision,
431
- max_actions_per_step,
432
- tool_calling_method,
433
- chrome_cdp,
434
- max_input_tokens
435
- ):
436
- try:
437
- global _global_browser, _global_browser_context, _global_agent
438
-
439
- extra_chromium_args = ["--accept_downloads=True", f"--window-size={window_w},{window_h}"]
440
- cdp_url = chrome_cdp
441
- if use_own_browser:
442
- cdp_url = os.getenv("CHROME_CDP", chrome_cdp)
443
-
444
- chrome_path = os.getenv("CHROME_PATH", None)
445
- if chrome_path == "":
446
- chrome_path = None
447
- chrome_user_data = os.getenv("CHROME_USER_DATA", None)
448
- if chrome_user_data:
449
- extra_chromium_args += [f"--user-data-dir={chrome_user_data}"]
450
- else:
451
- chrome_path = None
452
-
453
- controller = CustomController()
454
-
455
- # Initialize global browser if needed
456
- # if chrome_cdp not empty string nor None
457
- if (_global_browser is None) or (cdp_url and cdp_url != "" and cdp_url != None):
458
- _global_browser = CustomBrowser(
459
- config=BrowserConfig(
460
- headless=headless,
461
- disable_security=disable_security,
462
- cdp_url=cdp_url,
463
- chrome_instance_path=chrome_path,
464
- extra_chromium_args=extra_chromium_args,
465
- )
466
- )
467
-
468
- if _global_browser_context is None or (chrome_cdp and cdp_url != "" and cdp_url != None):
469
- _global_browser_context = await _global_browser.new_context(
470
- config=BrowserContextConfig(
471
- trace_path=save_trace_path if save_trace_path else None,
472
- save_recording_path=save_recording_path if save_recording_path else None,
473
- no_viewport=False,
474
- save_downloads_path="./tmp/downloads",
475
- browser_window_size=BrowserContextWindowSize(
476
- width=window_w, height=window_h
477
- ),
478
- )
479
- )
480
-
481
- # Create and run agent
482
- if _global_agent is None:
483
- _global_agent = CustomAgent(
484
- task=task,
485
- add_infos=add_infos,
486
- use_vision=use_vision,
487
- llm=llm,
488
- browser=_global_browser,
489
- browser_context=_global_browser_context,
490
- controller=controller,
491
- system_prompt_class=CustomSystemPrompt,
492
- agent_prompt_class=CustomAgentMessagePrompt,
493
- max_actions_per_step=max_actions_per_step,
494
- tool_calling_method=tool_calling_method,
495
- max_input_tokens=max_input_tokens,
496
- generate_gif=True
497
- )
498
- history = await _global_agent.run(max_steps=max_steps)
499
-
500
- history_file = os.path.join(save_agent_history_path, f"{_global_agent.state.agent_id}.json")
501
- _global_agent.save_history(history_file)
502
-
503
- final_result = history.final_result()
504
- errors = history.errors()
505
- model_actions = history.model_actions()
506
- model_thoughts = history.model_thoughts()
507
-
508
- trace_file = get_latest_files(save_trace_path)
509
-
510
- return final_result, errors, model_actions, model_thoughts, trace_file.get('.zip'), history_file
511
- except Exception as e:
512
- import traceback
513
- traceback.print_exc()
514
- errors = str(e) + "\n" + traceback.format_exc()
515
- return '', errors, '', '', None, None
516
- finally:
517
- _global_agent = None
518
- # Handle cleanup based on persistence configuration
519
- if not keep_browser_open:
520
- if _global_browser_context:
521
- await _global_browser_context.close()
522
- _global_browser_context = None
523
-
524
- if _global_browser:
525
- await _global_browser.close()
526
- _global_browser = None
527
-
528
-
529
- async def run_with_stream(
530
- agent_type,
531
- llm_provider,
532
- llm_model_name,
533
- llm_num_ctx,
534
- llm_temperature,
535
- llm_base_url,
536
- llm_api_key,
537
- use_own_browser,
538
- keep_browser_open,
539
- headless,
540
- disable_security,
541
- window_w,
542
- window_h,
543
- save_recording_path,
544
- save_agent_history_path,
545
- save_trace_path,
546
- enable_recording,
547
- task,
548
- add_infos,
549
- max_steps,
550
- use_vision,
551
- max_actions_per_step,
552
- tool_calling_method,
553
- chrome_cdp,
554
- max_input_tokens
555
- ):
556
- global _global_agent
557
-
558
- stream_vw = 80
559
- stream_vh = int(80 * window_h // window_w)
560
- if not headless:
561
- result = await run_browser_agent(
562
- agent_type=agent_type,
563
- llm_provider=llm_provider,
564
- llm_model_name=llm_model_name,
565
- llm_num_ctx=llm_num_ctx,
566
- llm_temperature=llm_temperature,
567
- llm_base_url=llm_base_url,
568
- llm_api_key=llm_api_key,
569
- use_own_browser=use_own_browser,
570
- keep_browser_open=keep_browser_open,
571
- headless=headless,
572
- disable_security=disable_security,
573
- window_w=window_w,
574
- window_h=window_h,
575
- save_recording_path=save_recording_path,
576
- save_agent_history_path=save_agent_history_path,
577
- save_trace_path=save_trace_path,
578
- enable_recording=enable_recording,
579
- task=task,
580
- add_infos=add_infos,
581
- max_steps=max_steps,
582
- use_vision=use_vision,
583
- max_actions_per_step=max_actions_per_step,
584
- tool_calling_method=tool_calling_method,
585
- chrome_cdp=chrome_cdp,
586
- max_input_tokens=max_input_tokens
587
- )
588
- # Add HTML content at the start of the result array
589
- yield [gr.update(visible=False)] + list(result)
590
- else:
591
- try:
592
- # Run the browser agent in the background
593
- agent_task = asyncio.create_task(
594
- run_browser_agent(
595
- agent_type=agent_type,
596
- llm_provider=llm_provider,
597
- llm_model_name=llm_model_name,
598
- llm_num_ctx=llm_num_ctx,
599
- llm_temperature=llm_temperature,
600
- llm_base_url=llm_base_url,
601
- llm_api_key=llm_api_key,
602
- use_own_browser=use_own_browser,
603
- keep_browser_open=keep_browser_open,
604
- headless=headless,
605
- disable_security=disable_security,
606
- window_w=window_w,
607
- window_h=window_h,
608
- save_recording_path=save_recording_path,
609
- save_agent_history_path=save_agent_history_path,
610
- save_trace_path=save_trace_path,
611
- enable_recording=enable_recording,
612
- task=task,
613
- add_infos=add_infos,
614
- max_steps=max_steps,
615
- use_vision=use_vision,
616
- max_actions_per_step=max_actions_per_step,
617
- tool_calling_method=tool_calling_method,
618
- chrome_cdp=chrome_cdp,
619
- max_input_tokens=max_input_tokens
620
- )
621
- )
622
-
623
- # Initialize values for streaming
624
- html_content = f"<h1 style='width:{stream_vw}vw; height:{stream_vh}vh'>Using browser...</h1>"
625
- final_result = errors = model_actions = model_thoughts = ""
626
- recording_gif = trace = history_file = None
627
-
628
- # Periodically update the stream while the agent task is running
629
- while not agent_task.done():
630
- try:
631
- encoded_screenshot = await capture_screenshot(_global_browser_context)
632
- if encoded_screenshot is not None:
633
- html_content = f'<img src="data:image/jpeg;base64,{encoded_screenshot}" style="width:{stream_vw}vw; height:{stream_vh}vh ; border:1px solid #ccc;">'
634
- else:
635
- html_content = f"<h1 style='width:{stream_vw}vw; height:{stream_vh}vh'>Waiting for browser session...</h1>"
636
- except Exception as e:
637
- html_content = f"<h1 style='width:{stream_vw}vw; height:{stream_vh}vh'>Waiting for browser session...</h1>"
638
-
639
- if _global_agent and _global_agent.state.stopped:
640
- yield [
641
- gr.HTML(value=html_content, visible=True),
642
- final_result,
643
- errors,
644
- model_actions,
645
- model_thoughts,
646
- recording_gif,
647
- trace,
648
- history_file,
649
- gr.update(value="Stopping...", interactive=False), # stop_button
650
- gr.update(interactive=False), # run_button
651
- ]
652
- break
653
- else:
654
- yield [
655
- gr.HTML(value=html_content, visible=True),
656
- final_result,
657
- errors,
658
- model_actions,
659
- model_thoughts,
660
- recording_gif,
661
- trace,
662
- history_file,
663
- gr.update(), # Re-enable stop button
664
- gr.update() # Re-enable run button
665
- ]
666
- await asyncio.sleep(0.1)
667
-
668
- # Once the agent task completes, get the results
669
- try:
670
- result = await agent_task
671
- final_result, errors, model_actions, model_thoughts, recording_gif, trace, history_file, stop_button, run_button = result
672
- except gr.Error:
673
- final_result = ""
674
- model_actions = ""
675
- model_thoughts = ""
676
- recording_gif = trace = history_file = None
677
-
678
- except Exception as e:
679
- errors = f"Agent error: {str(e)}"
680
-
681
- yield [
682
- gr.HTML(value=html_content, visible=True),
683
- final_result,
684
- errors,
685
- model_actions,
686
- model_thoughts,
687
- recording_gif,
688
- trace,
689
- history_file,
690
- stop_button,
691
- run_button
692
- ]
693
-
694
- except Exception as e:
695
- import traceback
696
- yield [
697
- gr.HTML(
698
- value=f"<h1 style='width:{stream_vw}vw; height:{stream_vh}vh'>Waiting for browser session...</h1>",
699
- visible=True),
700
- "",
701
- f"Error: {str(e)}\n{traceback.format_exc()}",
702
- "",
703
- "",
704
- None,
705
- None,
706
- None,
707
- gr.update(value="Stop", interactive=True), # Re-enable stop button
708
- gr.update(interactive=True) # Re-enable run button
709
- ]
710
-
711
-
712
- # Define the theme map globally
713
- theme_map = {
714
- "Default": Default(),
715
- "Soft": Soft(),
716
- "Monochrome": Monochrome(),
717
- "Glass": Glass(),
718
- "Origin": Origin(),
719
- "Citrus": Citrus(),
720
- "Ocean": Ocean(),
721
- "Base": Base()
722
- }
723
-
724
-
725
- async def close_global_browser():
726
- global _global_browser, _global_browser_context
727
-
728
- if _global_browser_context:
729
- await _global_browser_context.close()
730
- _global_browser_context = None
731
-
732
- if _global_browser:
733
- await _global_browser.close()
734
- _global_browser = None
735
-
-
- async def run_deep_search(research_task, max_search_iteration_input, max_query_per_iter_input, llm_provider,
- llm_model_name, llm_num_ctx, llm_temperature, llm_base_url, llm_api_key, use_vision,
- use_own_browser, headless, chrome_cdp):
- from src.utils.deep_research import deep_research
- global _global_agent_state
-
- # Clear any previous stop request
- _global_agent_state.clear_stop()
-
- llm = utils.get_llm_model(
- provider=llm_provider,
- model_name=llm_model_name,
- num_ctx=llm_num_ctx,
- temperature=llm_temperature,
- base_url=llm_base_url,
- api_key=llm_api_key,
- )
- markdown_content, file_path = await deep_research(research_task, llm, _global_agent_state,
- max_search_iterations=max_search_iteration_input,
- max_query_num=max_query_per_iter_input,
- use_vision=use_vision,
- headless=headless,
- use_own_browser=use_own_browser,
- chrome_cdp=chrome_cdp
- )
-
- return markdown_content, file_path, gr.update(value="Stop", interactive=True), gr.update(interactive=True)
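# A sketch of driving run_deep_search directly; the keyword names follow the
# signature above, and every value here is purely illustrative:
#
#     report_md, report_path, _, _ = asyncio.run(run_deep_search(
#         research_task="Survey RL fine-tuning methods for LLMs",
#         max_search_iteration_input=3, max_query_per_iter_input=1,
#         llm_provider="openai", llm_model_name="gpt-4o", llm_num_ctx=16000,
#         llm_temperature=0.6, llm_base_url="", llm_api_key="",
#         use_vision=True, use_own_browser=False, headless=True, chrome_cdp="",
#     ))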
-
-
- def create_ui(theme_name="Ocean"):
- css = """
- .gradio-container {
- width: 60vw !important;
- max-width: 60% !important;
- margin-left: auto !important;
- margin-right: auto !important;
- padding-top: 20px !important;
- }
- .header-text {
- text-align: center;
- margin-bottom: 30px;
- }
- .theme-section {
- margin-bottom: 20px;
- padding: 15px;
- border-radius: 10px;
- }
- """
-
- with gr.Blocks(
- title="Browser Use WebUI", theme=theme_map[theme_name], css=css
- ) as demo:
- with gr.Row():
- gr.Markdown(
- """
- # 🌐 Browser Use WebUI
- ### Control your browser with AI assistance
- """,
- elem_classes=["header-text"],
- )
-
- with gr.Tabs() as tabs:
- with gr.TabItem("⚙️ Agent Settings", id=1):
- with gr.Group():
- agent_type = gr.Radio(
- ["org", "custom"],
- label="Agent Type",
- value="custom",
- info="Select the type of agent to use",
- interactive=True
- )
- with gr.Column():
- max_steps = gr.Slider(
- minimum=1,
- maximum=200,
- value=100,
- step=1,
- label="Max Run Steps",
- info="Maximum number of steps the agent will take",
- interactive=True
- )
- max_actions_per_step = gr.Slider(
- minimum=1,
- maximum=100,
- value=10,
- step=1,
- label="Max Actions per Step",
- info="Maximum number of actions the agent will take per step",
- interactive=True
- )
- with gr.Column():
- use_vision = gr.Checkbox(
- label="Use Vision",
- value=True,
- info="Enable visual processing capabilities",
- interactive=True
- )
- max_input_tokens = gr.Number(
- label="Max Input Tokens",
- value=128000,
- precision=0,
- interactive=True
- )
- tool_calling_method = gr.Dropdown(
- label="Tool Calling Method",
- value="auto",
- interactive=True,
- allow_custom_value=True, # Allow users to input a custom method name
- choices=["auto", "json_schema", "function_calling"],
- info="Method used for tool calls",
- visible=False
- )
-
- with gr.TabItem("🔧 LLM Settings", id=2):
851
- with gr.Group():
852
- llm_provider = gr.Dropdown(
853
- choices=[provider for provider, model in utils.model_names.items()],
854
- label="LLM Provider",
855
- value="openai",
856
- info="Select your preferred language model provider",
857
- interactive=True
858
- )
859
- llm_model_name = gr.Dropdown(
860
- label="Model Name",
861
- choices=utils.model_names['openai'],
862
- value="gpt-4o",
863
- interactive=True,
864
- allow_custom_value=True, # Allow users to input custom model names
865
- info="Select a model in the dropdown options or directly type a custom model name"
866
- )
867
- ollama_num_ctx = gr.Slider(
868
- minimum=2 ** 8,
869
- maximum=2 ** 16,
870
- value=16000,
871
- step=1,
872
- label="Ollama Context Length",
873
- info="Controls max context length model needs to handle (less = faster)",
874
- visible=False,
875
- interactive=True
876
- )
877
- llm_temperature = gr.Slider(
878
- minimum=0.0,
879
- maximum=2.0,
880
- value=0.6,
881
- step=0.1,
882
- label="Temperature",
883
- info="Controls randomness in model outputs",
884
- interactive=True
885
- )
886
- with gr.Row():
887
- llm_base_url = gr.Textbox(
888
- label="Base URL",
889
- value="",
890
- info="API endpoint URL (if required)"
891
- )
892
- llm_api_key = gr.Textbox(
893
- label="API Key",
894
- type="password",
895
- value="",
896
- info="Your API key (leave blank to use .env)"
897
- )
898
-
899
- # Change event to update context length slider
900
- def update_llm_num_ctx_visibility(llm_provider):
901
- return gr.update(visible=llm_provider == "ollama")
902
-
903
- # Bind the change event of llm_provider to update the visibility of context length slider
904
- llm_provider.change(
905
- fn=update_llm_num_ctx_visibility,
906
- inputs=llm_provider,
907
- outputs=ollama_num_ctx
908
- )
909
-
910
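# Returning gr.update(visible=...) from a change handler is the stock Gradio
# idiom for provider-specific controls. An equivalent inline form, shown for
# illustration only:
#
#     llm_provider.change(lambda p: gr.update(visible=p == "ollama"),
#                         inputs=llm_provider, outputs=ollama_num_ctx)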
- with gr.TabItem("🌐 Browser Settings", id=3):
911
- with gr.Group():
912
- with gr.Row():
913
- use_own_browser = gr.Checkbox(
914
- label="Use Own Browser",
915
- value=False,
916
- info="Use your existing browser instance",
917
- interactive=True
918
- )
919
- keep_browser_open = gr.Checkbox(
920
- label="Keep Browser Open",
921
- value=False,
922
- info="Keep Browser Open between Tasks",
923
- interactive=True
924
- )
925
- headless = gr.Checkbox(
926
- label="Headless Mode",
927
- value=False,
928
- info="Run browser without GUI",
929
- interactive=True
930
- )
931
- disable_security = gr.Checkbox(
932
- label="Disable Security",
933
- value=True,
934
- info="Disable browser security features",
935
- interactive=True
936
- )
937
- enable_recording = gr.Checkbox(
938
- label="Enable Recording",
939
- value=True,
940
- info="Enable saving browser recordings",
941
- interactive=True
942
- )
943
-
944
- with gr.Row():
945
- window_w = gr.Number(
946
- label="Window Width",
947
- value=1280,
948
- info="Browser window width",
949
- interactive=True
950
- )
951
- window_h = gr.Number(
952
- label="Window Height",
953
- value=1100,
954
- info="Browser window height",
955
- interactive=True
956
- )
957
-
958
- chrome_cdp = gr.Textbox(
959
- label="CDP URL",
960
- placeholder="http://localhost:9222",
961
- value="",
962
- info="CDP for google remote debugging",
963
- interactive=True, # Allow editing only if recording is enabled
964
- )
965
-
966
- save_recording_path = gr.Textbox(
967
- label="Recording Path",
968
- placeholder="e.g. ./tmp/record_videos",
969
- value="./tmp/record_videos",
970
- info="Path to save browser recordings",
971
- interactive=True, # Allow editing only if recording is enabled
972
- )
973
-
974
- save_trace_path = gr.Textbox(
975
- label="Trace Path",
976
- placeholder="e.g. ./tmp/traces",
977
- value="./tmp/traces",
978
- info="Path to save Agent traces",
979
- interactive=True,
980
- )
981
-
982
- save_agent_history_path = gr.Textbox(
983
- label="Agent History Save Path",
984
- placeholder="e.g., ./tmp/agent_history",
985
- value="./tmp/agent_history",
986
- info="Specify the directory where agent history should be saved.",
987
- interactive=True,
988
- )
989
-
990
- with gr.TabItem("🤖 Run Agent", id=4):
991
- task = gr.Textbox(
992
- label="Task Description",
993
- lines=4,
994
- placeholder="Enter your task here...",
995
- value="go to google.com and type 'OpenAI' click search and give me the first url",
996
- info="Describe what you want the agent to do",
997
- interactive=True
998
- )
999
- add_infos = gr.Textbox(
1000
- label="Additional Information",
1001
- lines=3,
1002
- placeholder="Add any helpful context or instructions...",
1003
- info="Optional hints to help the LLM complete the task",
1004
- value="",
1005
- interactive=True
1006
- )
1007
-
1008
- with gr.Row():
1009
- run_button = gr.Button("▶️ Run Agent", variant="primary", scale=2)
1010
- stop_button = gr.Button("⏹️ Stop", variant="stop", scale=1)
1011
-
1012
- with gr.Row():
1013
- browser_view = gr.HTML(
1014
- value="<h1 style='width:80vw; height:50vh'>Waiting for browser session...</h1>",
1015
- label="Live Browser View",
1016
- visible=False
1017
- )
1018
-
1019
- gr.Markdown("### Results")
1020
- with gr.Row():
1021
- with gr.Column():
1022
- final_result_output = gr.Textbox(
1023
- label="Final Result", lines=3, show_label=True
1024
- )
1025
- with gr.Column():
1026
- errors_output = gr.Textbox(
1027
- label="Errors", lines=3, show_label=True
1028
- )
1029
- with gr.Row():
1030
- with gr.Column():
1031
- model_actions_output = gr.Textbox(
1032
- label="Model Actions", lines=3, show_label=True, visible=False
1033
- )
1034
- with gr.Column():
1035
- model_thoughts_output = gr.Textbox(
1036
- label="Model Thoughts", lines=3, show_label=True, visible=False
1037
- )
1038
- recording_gif = gr.Image(label="Result GIF", format="gif")
1039
- trace_file = gr.File(label="Trace File")
1040
- agent_history_file = gr.File(label="Agent History")
1041
-
1042
- with gr.TabItem("🧐 Deep Research", id=5):
1043
- research_task_input = gr.Textbox(label="Research Task", lines=5,
1044
- value="Compose a report on the use of Reinforcement Learning for training Large Language Models, encompassing its origins, current advancements, and future prospects, substantiated with examples of relevant models and techniques. The report should reflect original insights and analysis, moving beyond mere summarization of existing literature.",
1045
- interactive=True)
1046
- with gr.Row():
1047
- max_search_iteration_input = gr.Number(label="Max Search Iteration", value=3,
1048
- precision=0,
1049
- interactive=True) # precision=0 确保是整数
1050
- max_query_per_iter_input = gr.Number(label="Max Query per Iteration", value=1,
1051
- precision=0,
1052
- interactive=True) # precision=0 确保是整数
1053
- with gr.Row():
1054
- research_button = gr.Button("▶️ Run Deep Research", variant="primary", scale=2)
1055
- stop_research_button = gr.Button("⏹ Stop", variant="stop", scale=1)
1056
- markdown_output_display = gr.Markdown(label="Research Report")
1057
- markdown_download = gr.File(label="Download Research Report")
1058
-
1059
- # Bind the stop button click event after errors_output is defined
1060
- stop_button.click(
1061
- fn=stop_agent,
1062
- inputs=[],
1063
- outputs=[stop_button, run_button],
1064
- )
1065
-
1066
- # Run button click handler
1067
- run_button.click(
1068
- fn=run_with_stream,
1069
- inputs=[
1070
- agent_type, llm_provider, llm_model_name, ollama_num_ctx, llm_temperature, llm_base_url,
1071
- llm_api_key,
1072
- use_own_browser, keep_browser_open, headless, disable_security, window_w, window_h,
1073
- save_recording_path, save_agent_history_path, save_trace_path, # Include the new path
1074
- enable_recording, task, add_infos, max_steps, use_vision, max_actions_per_step,
1075
- tool_calling_method, chrome_cdp, max_input_tokens
1076
- ],
1077
- outputs=[
1078
- browser_view, # Browser view
1079
- final_result_output, # Final result
1080
- errors_output, # Errors
1081
- model_actions_output, # Model actions
1082
- model_thoughts_output, # Model thoughts
1083
- recording_gif, # Latest recording
1084
- trace_file, # Trace file
1085
- agent_history_file, # Agent history file
1086
- stop_button, # Stop button
1087
- run_button # Run button
1088
- ],
1089
- )
1090
-
1091
- # Run Deep Research
1092
- research_button.click(
1093
- fn=run_deep_search,
1094
- inputs=[research_task_input, max_search_iteration_input, max_query_per_iter_input, llm_provider,
1095
- llm_model_name, ollama_num_ctx, llm_temperature, llm_base_url, llm_api_key, use_vision,
1096
- use_own_browser, headless, chrome_cdp],
1097
- outputs=[markdown_output_display, markdown_download, stop_research_button, research_button]
1098
- )
1099
- # Bind the stop button click event after errors_output is defined
1100
- stop_research_button.click(
1101
- fn=stop_research_agent,
1102
- inputs=[],
1103
- outputs=[stop_research_button, research_button],
1104
- )
1105
-
1106
- with gr.TabItem("🎥 Recordings", id=7, visible=True):
1107
- def list_recordings(save_recording_path):
1108
- if not os.path.exists(save_recording_path):
1109
- return []
1110
-
1111
- # Get all video files
1112
- recordings = glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4")) + glob.glob(
1113
- os.path.join(save_recording_path, "*.[wW][eE][bB][mM]"))
1114
-
1115
- # Sort recordings by creation time (oldest first)
1116
- recordings.sort(key=os.path.getctime)
1117
-
1118
- # Add numbering to the recordings
1119
- numbered_recordings = []
1120
- for idx, recording in enumerate(recordings, start=1):
1121
- filename = os.path.basename(recording)
1122
- numbered_recordings.append((recording, f"{idx}. {filename}"))
1123
-
1124
- return numbered_recordings
1125
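# For reference, list_recordings returns (path, caption) pairs sorted oldest
# first, which is the shape gr.Gallery accepts, e.g. (filenames illustrative):
#
#     [("./tmp/record_videos/run1.mp4", "1. run1.mp4"),
#      ("./tmp/record_videos/run2.webm", "2. run2.webm")]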
-
- recordings_gallery = gr.Gallery(
- label="Recordings",
- columns=3,
- height="auto",
- object_fit="contain"
- )
-
- refresh_button = gr.Button("🔄 Refresh Recordings", variant="secondary")
- refresh_button.click(
- fn=list_recordings,
- inputs=save_recording_path,
- outputs=recordings_gallery
- )
-
- with gr.TabItem("📁 UI Configuration", id=8):
- config_file_input = gr.File(
- label="Load UI Settings from Config File",
- file_types=[".json"],
- interactive=True
- )
- with gr.Row():
- load_config_button = gr.Button("Load Config", variant="primary")
- save_config_button = gr.Button("Save UI Settings", variant="primary")
-
- config_status = gr.Textbox(
- label="Status",
- lines=2,
- interactive=False
- )
- save_config_button.click(
- fn=save_current_config,
- inputs=[], # no inputs required
- outputs=[config_status]
- )
-
- # Attach the callback to the LLM provider dropdown
- llm_provider.change(
- lambda provider, api_key, base_url: update_model_dropdown(provider, api_key, base_url),
- inputs=[llm_provider, llm_api_key, llm_base_url],
- outputs=llm_model_name
- )
-
- # Recording path is editable only while recording is enabled
- enable_recording.change(
- lambda enabled: gr.update(interactive=enabled),
- inputs=enable_recording,
- outputs=save_recording_path
- )
-
- use_own_browser.change(fn=close_global_browser)
- keep_browser_open.change(fn=close_global_browser)
-
- scan_and_register_components(demo)
- global webui_config_manager
- all_components = webui_config_manager.get_all_components()
-
- load_config_button.click(
- fn=update_ui_from_config,
- inputs=[config_file_input],
- outputs=all_components + [config_status]
- )
- return demo
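# create_ui only builds and returns the Blocks graph; launching is left to
# main(), which keeps the UI embeddable. A sketch (mounting into an existing
# FastAPI app via gr.mount_gradio_app is an assumption, not original code):
#
#     demo = create_ui(theme_name="Base")
#     demo.queue().launch()                                 # standalone
#     # app = gr.mount_gradio_app(app, demo, path="/ui")    # or embedded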
 
  def main():
- parser = argparse.ArgumentParser(description="Gradio UI for Browser Agent")
  parser.add_argument("--ip", type=str, default="127.0.0.1", help="IP address to bind to")
  parser.add_argument("--port", type=int, default=7788, help="Port to listen on")
  parser.add_argument("--theme", type=str, default="Ocean", choices=theme_map.keys(), help="Theme to use for the UI")
  args = parser.parse_args()

  demo = create_ui(theme_name=args.theme)
- demo.launch(server_name=args.ip, server_port=args.port, share=True)


  if __name__ == '__main__':
 
 
 
 
  from dotenv import load_dotenv

  load_dotenv()

  import argparse
+ from src.webui.interface import theme_map, create_ui
  def main():
+ parser = argparse.ArgumentParser(description="Gradio WebUI for Browser Agent")
  parser.add_argument("--ip", type=str, default="127.0.0.1", help="IP address to bind to")
  parser.add_argument("--port", type=int, default=7788, help="Port to listen on")
  parser.add_argument("--theme", type=str, default="Ocean", choices=theme_map.keys(), help="Theme to use for the UI")
  args = parser.parse_args()

  demo = create_ui(theme_name=args.theme)
+ demo.queue().launch(server_name=args.ip, server_port=args.port, share=True)


  if __name__ == '__main__':
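# The refactored entry point stays drop-in compatible; typical usage, assuming
# the script keeps its webui.py name (flags map onto the argparse options above):
#
#     python webui.py --ip 127.0.0.1 --port 7788 --theme Ocean
#
# demo.queue() is what lets the streaming handlers now living in
# src.webui.interface yield incremental updates to connected clients.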