asifHuggingFace committed on
Commit
405ca38
·
1 Parent(s): a4ca470

Add Git LFS support and migrate binary files

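A minimal sketch of the Git LFS migration this commit message describes, assuming the stock `git-lfs` CLI; the tracked pattern is an assumption inferred from the PNG assets added below:

```bash
# One-time: install the LFS hooks into this repository
git lfs install

# Track the binary types being migrated (inferred from the PNG assets in this commit)
git lfs track "*.png"

# The tracking rules live in .gitattributes and must be committed too
git add .gitattributes assets/
git commit -m "Add Git LFS support and migrate binary files"
```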
.dockerignore ADDED
@@ -0,0 +1,5 @@
+ data
+ tmp
+ results
+
+ .env
.env.example ADDED
@@ -0,0 +1,68 @@
+ OPENAI_ENDPOINT=https://api.openai.com/v1
+ OPENAI_API_KEY=
+
+ ANTHROPIC_API_KEY=
+ ANTHROPIC_ENDPOINT=https://api.anthropic.com
+
+ GOOGLE_API_KEY=
+
+ AZURE_OPENAI_ENDPOINT=
+ AZURE_OPENAI_API_KEY=
+ AZURE_OPENAI_API_VERSION=2025-01-01-preview
+
+ DEEPSEEK_ENDPOINT=https://api.deepseek.com
+ DEEPSEEK_API_KEY=
+
+ MISTRAL_API_KEY=
+ MISTRAL_ENDPOINT=https://api.mistral.ai/v1
+
+ OLLAMA_ENDPOINT=http://localhost:11434
+
+ ALIBABA_ENDPOINT=https://dashscope.aliyuncs.com/compatible-mode/v1
+ ALIBABA_API_KEY=
+
+ MOONSHOT_ENDPOINT=https://api.moonshot.cn/v1
+ MOONSHOT_API_KEY=
+
+ UNBOUND_ENDPOINT=https://api.getunbound.ai
+ UNBOUND_API_KEY=
+
+ SiliconFLOW_ENDPOINT=https://api.siliconflow.cn/v1/
+ SiliconFLOW_API_KEY=
+
+ IBM_ENDPOINT=https://us-south.ml.cloud.ibm.com
+ IBM_API_KEY=
+ IBM_PROJECT_ID=
+
+ GROK_ENDPOINT="https://api.x.ai/v1"
+ GROK_API_KEY=
+
+ # Set the default LLM
+ DEFAULT_LLM=openai
+
+
+ # Set to false to disable anonymized telemetry
+ ANONYMIZED_TELEMETRY=false
+
+ # LogLevel: Set to debug to enable verbose logging, set to result to get results only. Available: result | debug | info
+ BROWSER_USE_LOGGING_LEVEL=info
+
+ # Browser settings
+ BROWSER_PATH=
+ BROWSER_USER_DATA=
+ BROWSER_DEBUGGING_PORT=9222
+ BROWSER_DEBUGGING_HOST=localhost
+ # Set to true to keep the browser open between AI tasks
+ KEEP_BROWSER_OPEN=true
+ USE_OWN_BROWSER=false
+ BROWSER_CDP=
+ # Display settings
+ # Format: WIDTHxHEIGHTxDEPTH
+ RESOLUTION=1920x1080x24
+ # Width in pixels
+ RESOLUTION_WIDTH=1920
+ # Height in pixels
+ RESOLUTION_HEIGHT=1080
+
+ # VNC settings
+ VNC_PASSWORD=yourvncpassword
.github/workflows/build.yml ADDED
@@ -0,0 +1,124 @@
+ name: Build Docker Image
+
+ on:
+   release:
+     types: [published]
+   push:
+     branches: [main]
+
+ env:
+   GITHUB_CR_REPO: ghcr.io/${{ github.repository }}
+
+ jobs:
+   build:
+     runs-on: ubuntu-latest
+     strategy:
+       fail-fast: false
+       matrix:
+         platform:
+           - linux/amd64
+           - linux/arm64
+     steps:
+       - name: Prepare
+         run: |
+           platform=${{ matrix.platform }}
+           echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV
+
+       - name: Docker meta
+         id: meta
+         uses: docker/metadata-action@v5
+         with:
+           images: |
+             ${{ env.GITHUB_CR_REPO }}
+
+       - name: Login to GHCR
+         uses: docker/login-action@v3
+         with:
+           registry: ghcr.io
+           username: ${{ github.repository_owner }}
+           password: ${{ secrets.GITHUB_TOKEN }}
+
+       - name: Set up QEMU
+         uses: docker/setup-qemu-action@v3
+
+       - name: Set up Docker Buildx
+         uses: docker/setup-buildx-action@v3
+
+       - name: Build and push by digest
+         id: build
+         uses: docker/build-push-action@v6
+         with:
+           platforms: ${{ matrix.platform }}
+           labels: ${{ steps.meta.outputs.labels }}
+           tags: |
+             ${{ env.GITHUB_CR_REPO }}
+           build-args: |
+             TARGETPLATFORM=${{ matrix.platform }}
+           outputs: type=image,push-by-digest=true,name-canonical=true,push=true
+
+       - name: Export digest
+         run: |
+           mkdir -p ${{ runner.temp }}/digests
+           digest="${{ steps.build.outputs.digest }}"
+           touch "${{ runner.temp }}/digests/${digest#sha256:}"
+
+       - name: Upload digest
+         uses: actions/upload-artifact@v4
+         with:
+           name: digests-${{ env.PLATFORM_PAIR }}
+           path: ${{ runner.temp }}/digests/*
+           if-no-files-found: error
+           retention-days: 1
+
+   merge:
+     runs-on: ubuntu-latest
+     needs:
+       - build
+     steps:
+       - name: Download digests
+         uses: actions/download-artifact@v4
+         with:
+           path: ${{ runner.temp }}/digests
+           pattern: digests-*
+           merge-multiple: true
+
+       - name: Login to GHCR
+         uses: docker/login-action@v3
+         with:
+           registry: ghcr.io
+           username: ${{ github.repository_owner }}
+           password: ${{ secrets.GITHUB_TOKEN }}
+
+       - name: Set up Docker Buildx
+         uses: docker/setup-buildx-action@v3
+
+       - name: Docker meta
+         id: meta
+         uses: docker/metadata-action@v5
+         with:
+           images: |
+             ${{ env.GITHUB_CR_REPO }}
+           tags: |
+             type=ref,event=branch
+             type=ref,event=pr
+             type=semver,pattern={{version}}
+             type=semver,pattern={{major}}
+
+       - name: Docker tags
+         run: |
+           tags=$(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON")
+           if [ -z "$tags" ]; then
+             echo "DOCKER_METADATA_OUTPUT_VERSION=${{ github.ref_name }}" >> $GITHUB_ENV
+             tags="-t ${{ env.GITHUB_CR_REPO }}:${{ github.ref_name }}"
+           fi
+           echo "DOCKER_METADATA_TAGS=$tags" >> $GITHUB_ENV
+
+       - name: Create manifest list and push
+         working-directory: ${{ runner.temp }}/digests
+         run: |
+           docker buildx imagetools create ${{ env.DOCKER_METADATA_TAGS }} \
+             $(printf '${{ env.GITHUB_CR_REPO }}@sha256:%s ' *)
+
+       - name: Inspect image
+         run: |
+           docker buildx imagetools inspect ${{ env.GITHUB_CR_REPO }}:${{ env.DOCKER_METADATA_OUTPUT_VERSION }}
.gitignore ADDED
@@ -0,0 +1,192 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+ test_env/
+ myenv
+
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ .idea/
+ temp
+ tmp
+
+
+ .DS_Store
+
+ private_example.py
+ private_example
+
+ browser_cookies.json
+ cookies.json
+ AgentHistory.json
+ cv_04_24.pdf
+ AgentHistoryList.json
+ *.gif
+
+ # For Sharing (.pem files)
+ .gradio/
+
+ # For Docker
+ data/
+
+ # For Config Files (Current Settings)
+ .config.pkl
+ *.pdf
+
+ workflow
.vscode/settings.json ADDED
@@ -0,0 +1,11 @@
+ {
+   "python.analysis.typeCheckingMode": "basic",
+   "[python]": {
+     "editor.defaultFormatter": "charliermarsh.ruff",
+     "editor.formatOnSave": true,
+     "editor.codeActionsOnSave": {
+       "source.fixAll.ruff": "explicit",
+       "source.organizeImports.ruff": "explicit"
+     }
+   }
+ }
Dockerfile ADDED
@@ -0,0 +1,99 @@
+ FROM python:3.11-slim
+
+ # Set platform for multi-arch builds (Docker Buildx will set this)
+ ARG TARGETPLATFORM
+ ARG NODE_MAJOR=20
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     wget \
+     netcat-traditional \
+     gnupg \
+     curl \
+     unzip \
+     xvfb \
+     libgconf-2-4 \
+     libxss1 \
+     libnss3 \
+     libnspr4 \
+     libasound2 \
+     libatk1.0-0 \
+     libatk-bridge2.0-0 \
+     libcups2 \
+     libdbus-1-3 \
+     libdrm2 \
+     libgbm1 \
+     libgtk-3-0 \
+     libxcomposite1 \
+     libxdamage1 \
+     libxfixes3 \
+     libxrandr2 \
+     xdg-utils \
+     fonts-liberation \
+     dbus \
+     xauth \
+     x11vnc \
+     tigervnc-tools \
+     supervisor \
+     net-tools \
+     procps \
+     git \
+     python3-numpy \
+     fontconfig \
+     fonts-dejavu \
+     fonts-dejavu-core \
+     fonts-dejavu-extra \
+     vim \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Install noVNC
+ RUN git clone https://github.com/novnc/noVNC.git /opt/novnc \
+     && git clone https://github.com/novnc/websockify /opt/novnc/utils/websockify \
+     && ln -s /opt/novnc/vnc.html /opt/novnc/index.html
+
+ # Install Node.js using the NodeSource repository
+ RUN mkdir -p /etc/apt/keyrings \
+     && curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
+     && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_$NODE_MAJOR.x nodistro main" | tee /etc/apt/sources.list.d/nodesource.list \
+     && apt-get update \
+     && apt-get install nodejs -y \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Verify Node.js and npm installation (optional, but good for debugging)
+ RUN node -v && npm -v && npx -v
+
+ # Set up working directory
+ WORKDIR /app
+
+ # Copy requirements and install Python dependencies
+ COPY requirements.txt .
+
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Install Playwright browsers and dependencies.
+ # Playwright respects PLAYWRIGHT_BROWSERS_PATH as its browser install location,
+ # so set it explicitly and create the directory up front.
+ ENV PLAYWRIGHT_BROWSERS_PATH=/ms-browsers
+ RUN mkdir -p $PLAYWRIGHT_BROWSERS_PATH
+
+ # Recommended: install Google Chrome instead of plain Chromium for better undetectability.
+ # 'playwright install chrome' downloads and places it; the '--with-deps' equivalent
+ # is to run 'playwright install-deps chrome' afterwards.
+ # RUN playwright install chrome --with-deps
+
+ # Alternative: install Chromium if Google Chrome is problematic in certain environments
+ RUN playwright install chromium --with-deps
+
+
+ # Copy the application code
+ COPY . .
+
+ # Set up supervisor configuration
+ RUN mkdir -p /var/log/supervisor
+ COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
+
+ EXPOSE 7788 6080 5901 9222
+
+ CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
+ #CMD ["/bin/bash"]
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Browser Use Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README_DEPLOYMENT.md ADDED
@@ -0,0 +1,70 @@
+ # Browser Automation WebUI - Deployment Guide
+
+ ## Deploying to Hugging Face Spaces
+
+ ### Prerequisites
+ - A Hugging Face account
+ - Your code pushed to a Git repository
+
+ ### Steps to Deploy
+
+ 1. **Create a new Space on Hugging Face**
+    - Go to https://huggingface.co/spaces
+    - Click "Create new Space"
+    - Choose "Gradio" as the SDK
+    - Select your repository or create a new one
+
+ 2. **File Structure for Deployment**
+    ```
+    web-ui/
+    ├── app.py            # Main entry point (created)
+    ├── requirements.txt  # Dependencies
+    ├── src/              # Source code
+    └── README.md         # Documentation
+    ```
+
+ 3. **Key Files for Deployment**
+    - `app.py`: Main entry point that Gradio will use
+    - `requirements.txt`: All necessary dependencies
+    - `src/`: Your source code directory
+
+ ### Troubleshooting the "Failed to canonicalize script path" Error
+
+ This error typically occurs when:
+ - Gradio can't find the main entry point
+ - Import paths are not properly configured
+ - The file structure doesn't match deployment expectations
+
+ **Solution**: The `app.py` file has been created to serve as the proper entry point for Gradio deployment.
+
+ ### Environment Variables
+
+ If your app requires environment variables, you can set them in the Hugging Face Space settings:
+ - Go to your Space settings
+ - Navigate to "Repository secrets"
+ - Add any required environment variables; the app reads them at runtime, as sketched below
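A minimal sketch of the runtime side, assuming the `python-dotenv` package already used by `app.py`; `OPENAI_API_KEY` is taken from `.env.example` above:

```python
import os

from dotenv import load_dotenv

# Locally this reads .env; on Hugging Face Spaces it is effectively a no-op,
# because Repository secrets are injected directly as environment variables.
load_dotenv()

api_key = os.getenv("OPENAI_API_KEY", "")
if not api_key:
    raise RuntimeError("OPENAI_API_KEY is not set; add it under Repository secrets.")
```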
+
+ ### Local Testing
+
+ To test the deployment locally before pushing:
+
+ ```bash
+ cd web-ui
+ python app.py
+ ```
+
+ This should start the Gradio interface without the canonicalization error.
+
+ ### Common Issues and Solutions
+
+ 1. **Import Errors**: Make sure all imports use relative paths from the project root
+ 2. **Missing Dependencies**: Ensure all packages are listed in `requirements.txt`
+ 3. **Path Issues**: The `app.py` file includes proper path configuration
+
+ ### Deployment Checklist
+
+ - [ ] `app.py` exists and is properly configured
+ - [ ] All dependencies are in `requirements.txt`
+ - [ ] All import paths are correct
+ - [ ] Environment variables are configured (if needed)
+ - [ ] Local testing works without errors
SECURITY.md ADDED
@@ -0,0 +1,19 @@
+ ## Reporting Security Issues
+
+ If you believe you have found a security vulnerability in browser-use, please report it through coordinated disclosure.
+
+ **Please do not report security vulnerabilities through the repository issues, discussions, or pull requests.**
+
+ Instead, please open a new [GitHub security advisory](https://github.com/browser-use/web-ui/security/advisories/new).
+
+ Please include as much of the information listed below as you can to help me better understand and resolve the issue:
+
+ * The type of issue (e.g., buffer overflow, SQL injection, or cross-site scripting)
+ * Full paths of source file(s) related to the manifestation of the issue
+ * The location of the affected source code (tag/branch/commit or direct URL)
+ * Any special configuration required to reproduce the issue
+ * Step-by-step instructions to reproduce the issue
+ * Proof-of-concept or exploit code (if possible)
+ * Impact of the issue, including how an attacker might exploit it
+
+ This information will help me triage your report more quickly.
app.py ADDED
@@ -0,0 +1,17 @@
+ import os
+ import sys
+
+ # Add the current directory to the Python path to ensure imports work
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ from src.webui.interface import create_ui
+
+ # Create the Gradio app
+ demo = create_ui(theme_name="Ocean")
+
+ # For deployment, we need to expose the app directly
+ if __name__ == "__main__":
+     demo.launch()
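As a side note on the launch call above: if a container needs the interface bound on the port the Dockerfile exposes (7788), Gradio's standard `server_name`/`server_port` arguments can be passed. A hedged sketch, not necessarily what this deployment uses:

```python
# Sketch only: bind to all interfaces on the port the Dockerfile exposes.
# server_name and server_port are standard gradio launch() arguments; whether
# this deployment needs them depends on the hosting environment.
demo.launch(server_name="0.0.0.0", server_port=7788)
```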
assets/examples/test.png ADDED

Git LFS Details

  • SHA256: 23e4fe8c9836cd35393315a3cca074dbd55a8645289ea337e3300269dda06900
  • Pointer size: 131 Bytes
  • Size of remote file: 423 kB
assets/web-ui.png ADDED

Git LFS Details

  • SHA256: ea3c23160272116985f1d24a8140f0746e92a820bbd6e4988b6aa4ec0dfbb491
  • Pointer size: 130 Bytes
  • Size of remote file: 24.5 kB
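For reference, a Git LFS pointer of this kind is just a small text stub committed in place of the binary. A sketch of the pointer that would stand in for `assets/examples/test.png`, using the SHA256 above; the byte count is approximated from the reported 423 kB:

```
version https://git-lfs.github.com/spec/v1
oid sha256:23e4fe8c9836cd35393315a3cca074dbd55a8645289ea337e3300269dda06900
size 423000
```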
docker-compose.yml ADDED
@@ -0,0 +1,80 @@
+ services:
+   # debug: docker compose run --rm -it browser-use-webui bash
+   browser-use-webui:
+     # image: ghcr.io/browser-use/web-ui  # Using the precompiled image
+     build:
+       context: .
+       dockerfile: Dockerfile
+       args:
+         TARGETPLATFORM: ${TARGETPLATFORM:-linux/amd64}
+     ports:
+       - "7788:7788"
+       - "6080:6080"
+       - "5901:5901"
+       - "9222:9222"
+     environment:
+       # LLM API Keys & Endpoints
+       - OPENAI_ENDPOINT=${OPENAI_ENDPOINT:-https://api.openai.com/v1}
+       - OPENAI_API_KEY=${OPENAI_API_KEY:-}
+       - ANTHROPIC_ENDPOINT=${ANTHROPIC_ENDPOINT:-https://api.anthropic.com}
+       - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
+       - GOOGLE_API_KEY=${GOOGLE_API_KEY:-}
+       - AZURE_OPENAI_ENDPOINT=${AZURE_OPENAI_ENDPOINT:-}
+       - AZURE_OPENAI_API_KEY=${AZURE_OPENAI_API_KEY:-}
+       - AZURE_OPENAI_API_VERSION=${AZURE_OPENAI_API_VERSION:-2025-01-01-preview}
+       - DEEPSEEK_ENDPOINT=${DEEPSEEK_ENDPOINT:-https://api.deepseek.com}
+       - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
+       - OLLAMA_ENDPOINT=${OLLAMA_ENDPOINT:-http://localhost:11434}
+       - MISTRAL_ENDPOINT=${MISTRAL_ENDPOINT:-https://api.mistral.ai/v1}
+       - MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
+       - ALIBABA_ENDPOINT=${ALIBABA_ENDPOINT:-https://dashscope.aliyuncs.com/compatible-mode/v1}
+       - ALIBABA_API_KEY=${ALIBABA_API_KEY:-}
+       - MOONSHOT_ENDPOINT=${MOONSHOT_ENDPOINT:-https://api.moonshot.cn/v1}
+       - MOONSHOT_API_KEY=${MOONSHOT_API_KEY:-}
+       - UNBOUND_ENDPOINT=${UNBOUND_ENDPOINT:-https://api.getunbound.ai}
+       - UNBOUND_API_KEY=${UNBOUND_API_KEY:-}
+       - SiliconFLOW_ENDPOINT=${SiliconFLOW_ENDPOINT:-https://api.siliconflow.cn/v1/}
+       - SiliconFLOW_API_KEY=${SiliconFLOW_API_KEY:-}
+       - IBM_ENDPOINT=${IBM_ENDPOINT:-https://us-south.ml.cloud.ibm.com}
+       - IBM_API_KEY=${IBM_API_KEY:-}
+       - IBM_PROJECT_ID=${IBM_PROJECT_ID:-}
+
+       # Application Settings
+       - ANONYMIZED_TELEMETRY=${ANONYMIZED_TELEMETRY:-false}
+       - BROWSER_USE_LOGGING_LEVEL=${BROWSER_USE_LOGGING_LEVEL:-info}
+
+       # Browser Settings
+       - BROWSER_PATH=
+       - BROWSER_USER_DATA=
+       - BROWSER_DEBUGGING_PORT=${BROWSER_DEBUGGING_PORT:-9222}
+       - BROWSER_DEBUGGING_HOST=localhost
+       - USE_OWN_BROWSER=false
+       - KEEP_BROWSER_OPEN=true
+       - BROWSER_CDP=${BROWSER_CDP:-} # e.g., http://localhost:9222
+
+       # Display Settings
+       - DISPLAY=:99
+       # This ENV is used by the Dockerfile at build time, where Playwright respects it.
+       # It's not strictly needed at runtime by docker-compose unless your app or scripts also read it.
+       - PLAYWRIGHT_BROWSERS_PATH=/ms-browsers # Matches Dockerfile ENV
+       - RESOLUTION=${RESOLUTION:-1920x1080x24}
+       - RESOLUTION_WIDTH=${RESOLUTION_WIDTH:-1920}
+       - RESOLUTION_HEIGHT=${RESOLUTION_HEIGHT:-1080}
+
+       # VNC Settings
+       - VNC_PASSWORD=${VNC_PASSWORD:-yourvncpassword}
+
+     volumes:
+       - /tmp/.X11-unix:/tmp/.X11-unix
+       # - ./my_chrome_data:/app/data/chrome_data # Optional: persist browser data
+     restart: unless-stopped
+     shm_size: '2gb'
+     cap_add:
+       - SYS_ADMIN
+     tmpfs:
+       - /tmp
+     healthcheck:
+       test: ["CMD", "nc", "-z", "localhost", "5901"] # VNC port
+       interval: 10s
+       timeout: 5s
+       retries: 3
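For completeness, typical usage of the compose file above, assuming Docker Compose v2 (the service name matches the file; the debug line mirrors the comment at the top of the file):

```bash
# Build the image and start the stack in the background
docker compose up -d --build

# Follow the WebUI logs
docker compose logs -f browser-use-webui

# Open an interactive shell for debugging
docker compose run --rm -it browser-use-webui bash
```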
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ browser-use==0.1.48
+ pyperclip==1.9.0
+ gradio==5.27.0
+ json-repair
+ langchain-mistralai==0.2.4
+ MainContentExtractor==0.0.4
+ langchain-ibm==0.3.10
+ langchain_mcp_adapters==0.0.9
+ langgraph==0.3.34
+ langchain-community
src/__init__.py ADDED
File without changes
src/agent/__init__.py ADDED
File without changes
src/agent/browser_use/browser_use_agent.py ADDED
@@ -0,0 +1,169 @@
+ from __future__ import annotations
+
+ import asyncio
+ import logging
+ import os
+
+ # from lmnr.sdk.decorators import observe
+ from browser_use.agent.gif import create_history_gif
+ from browser_use.agent.service import Agent, AgentHookFunc
+ from browser_use.agent.views import (
+     ActionResult,
+     AgentHistory,
+     AgentHistoryList,
+     AgentStepInfo,
+     ToolCallingMethod,
+ )
+ from browser_use.browser.views import BrowserStateHistory
+ from browser_use.utils import time_execution_async
+ from dotenv import load_dotenv
+ from browser_use.agent.message_manager.utils import is_model_without_tool_support
+
+ load_dotenv()
+ logger = logging.getLogger(__name__)
+
+ # Treat any value starting with "t", "y", or "1" (true/yes/1) as truthy.
+ SKIP_LLM_API_KEY_VERIFICATION = (
+     os.environ.get("SKIP_LLM_API_KEY_VERIFICATION", "false").lower()[0] in "ty1"
+ )
+
+
+ class BrowserUseAgent(Agent):
+     def _set_tool_calling_method(self) -> ToolCallingMethod | None:
+         tool_calling_method = self.settings.tool_calling_method
+         if tool_calling_method == 'auto':
+             if is_model_without_tool_support(self.model_name):
+                 return 'raw'
+             elif self.chat_model_library == 'ChatGoogleGenerativeAI':
+                 return None
+             elif self.chat_model_library == 'ChatOpenAI':
+                 return 'function_calling'
+             elif self.chat_model_library == 'AzureChatOpenAI':
+                 return 'function_calling'
+             else:
+                 return None
+         else:
+             return tool_calling_method
+
+     @time_execution_async("--run (agent)")
+     async def run(
+             self, max_steps: int = 100, on_step_start: AgentHookFunc | None = None,
+             on_step_end: AgentHookFunc | None = None
+     ) -> AgentHistoryList:
+         """Execute the task with maximum number of steps"""
+
+         loop = asyncio.get_event_loop()
+
+         # Set up the Ctrl+C signal handler with callbacks specific to this agent
+         from browser_use.utils import SignalHandler
+
+         signal_handler = SignalHandler(
+             loop=loop,
+             pause_callback=self.pause,
+             resume_callback=self.resume,
+             custom_exit_callback=None,  # No special cleanup needed on forced exit
+             exit_on_second_int=True,
+         )
+         signal_handler.register()
+
+         try:
+             self._log_agent_run()
+
+             # Execute initial actions if provided
+             if self.initial_actions:
+                 result = await self.multi_act(self.initial_actions, check_for_new_elements=False)
+                 self.state.last_result = result
+
+             for step in range(max_steps):
+                 # Check if waiting for user input after Ctrl+C
+                 if self.state.paused:
+                     signal_handler.wait_for_resume()
+                     signal_handler.reset()
+
+                 # Check if we should stop due to too many failures
+                 if self.state.consecutive_failures >= self.settings.max_failures:
+                     logger.error(f'❌ Stopping due to {self.settings.max_failures} consecutive failures')
+                     break
+
+                 # Check control flags before each step
+                 if self.state.stopped:
+                     logger.info('Agent stopped')
+                     break
+
+                 while self.state.paused:
+                     await asyncio.sleep(0.2)  # Small delay to prevent CPU spinning
+                     if self.state.stopped:  # Allow stopping while paused
+                         break
+
+                 if on_step_start is not None:
+                     await on_step_start(self)
+
+                 step_info = AgentStepInfo(step_number=step, max_steps=max_steps)
+                 await self.step(step_info)
+
+                 if on_step_end is not None:
+                     await on_step_end(self)
+
+                 if self.state.history.is_done():
+                     if self.settings.validate_output and step < max_steps - 1:
+                         if not await self._validate_output():
+                             continue
+
+                     await self.log_completion()
+                     break
+             else:
+                 # for/else: runs only when the loop was never broken out of
+                 error_message = 'Failed to complete task in maximum steps'
+
+                 self.state.history.history.append(
+                     AgentHistory(
+                         model_output=None,
+                         result=[ActionResult(error=error_message, include_in_memory=True)],
+                         state=BrowserStateHistory(
+                             url='',
+                             title='',
+                             tabs=[],
+                             interacted_element=[],
+                             screenshot=None,
+                         ),
+                         metadata=None,
+                     )
+                 )
+
+                 logger.info(f'❌ {error_message}')
+
+             return self.state.history
+
+         except KeyboardInterrupt:
+             # Already handled by our signal handler, but catch any direct KeyboardInterrupt as well
+             logger.info('Got KeyboardInterrupt during execution, returning current history')
+             return self.state.history
+
+         finally:
+             # Unregister signal handlers before cleanup
+             signal_handler.unregister()
+
+             if self.settings.save_playwright_script_path:
+                 logger.info(
+                     f'Agent run finished. Attempting to save Playwright script to: {self.settings.save_playwright_script_path}'
+                 )
+                 try:
+                     # Extract sensitive data keys if sensitive_data is provided
+                     keys = list(self.sensitive_data.keys()) if self.sensitive_data else None
+                     # Pass browser and context config to the saving method
+                     self.state.history.save_as_playwright_script(
+                         self.settings.save_playwright_script_path,
+                         sensitive_data_keys=keys,
+                         browser_config=self.browser.config,
+                         context_config=self.browser_context.config,
+                     )
+                 except Exception as script_gen_err:
+                     # Log any error during script generation/saving
+                     logger.error(f'Failed to save Playwright script: {script_gen_err}', exc_info=True)
+
+             await self.close()
+
+             if self.settings.generate_gif:
+                 output_path: str = 'agent_history.gif'
+                 if isinstance(self.settings.generate_gif, str):
+                     output_path = self.settings.generate_gif
+
+                 create_history_gif(task=self.task, history=self.state.history, output_path=output_path)
src/agent/deep_research/deep_research_agent.py ADDED
@@ -0,0 +1,1261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ import logging
4
+ import os
5
+ import threading
6
+ import uuid
7
+ from pathlib import Path
8
+ from typing import Any, Dict, List, Optional, TypedDict
9
+
10
+ from browser_use.browser.browser import BrowserConfig
11
+ from langchain_community.tools.file_management import (
12
+ ListDirectoryTool,
13
+ ReadFileTool,
14
+ WriteFileTool,
15
+ )
16
+
17
+ # Langchain imports
18
+ from langchain_core.messages import (
19
+ AIMessage,
20
+ BaseMessage,
21
+ HumanMessage,
22
+ SystemMessage,
23
+ ToolMessage,
24
+ )
25
+ from langchain_core.prompts import ChatPromptTemplate
26
+ from langchain_core.tools import StructuredTool, Tool
27
+
28
+ # Langgraph imports
29
+ from langgraph.graph import StateGraph
30
+ from pydantic import BaseModel, Field
31
+
32
+ from browser_use.browser.context import BrowserContextConfig
33
+
34
+ from src.agent.browser_use.browser_use_agent import BrowserUseAgent
35
+ from src.browser.custom_browser import CustomBrowser
36
+ from src.controller.custom_controller import CustomController
37
+ from src.utils.mcp_client import setup_mcp_client_and_tools
38
+
39
+ logger = logging.getLogger(__name__)
40
+
41
+ # Constants
42
+ REPORT_FILENAME = "report.md"
43
+ PLAN_FILENAME = "research_plan.md"
44
+ SEARCH_INFO_FILENAME = "search_info.json"
45
+
46
+ _AGENT_STOP_FLAGS = {}
47
+ _BROWSER_AGENT_INSTANCES = {}
48
+
49
+
50
+ async def run_single_browser_task(
51
+ task_query: str,
52
+ task_id: str,
53
+ llm: Any, # Pass the main LLM
54
+ browser_config: Dict[str, Any],
55
+ stop_event: threading.Event,
56
+ use_vision: bool = False,
57
+ ) -> Dict[str, Any]:
58
+ """
59
+ Runs a single BrowserUseAgent task.
60
+ Manages browser creation and closing for this specific task.
61
+ """
62
+ if not BrowserUseAgent:
63
+ return {
64
+ "query": task_query,
65
+ "error": "BrowserUseAgent components not available.",
66
+ }
67
+
68
+ # --- Browser Setup ---
69
+ # These should ideally come from the main agent's config
70
+ headless = browser_config.get("headless", False)
71
+ window_w = browser_config.get("window_width", 1280)
72
+ window_h = browser_config.get("window_height", 1100)
73
+ browser_user_data_dir = browser_config.get("user_data_dir", None)
74
+ use_own_browser = browser_config.get("use_own_browser", False)
75
+ browser_binary_path = browser_config.get("browser_binary_path", None)
76
+ wss_url = browser_config.get("wss_url", None)
77
+ cdp_url = browser_config.get("cdp_url", None)
78
+ disable_security = browser_config.get("disable_security", False)
79
+
80
+ bu_browser = None
81
+ bu_browser_context = None
82
+ try:
83
+ logger.info(f"Starting browser task for query: {task_query}")
84
+ extra_args = []
85
+ if use_own_browser:
86
+ browser_binary_path = os.getenv("BROWSER_PATH", None) or browser_binary_path
87
+ if browser_binary_path == "":
88
+ browser_binary_path = None
89
+ browser_user_data = browser_user_data_dir or os.getenv("BROWSER_USER_DATA", None)
90
+ if browser_user_data:
91
+ extra_args += [f"--user-data-dir={browser_user_data}"]
92
+ else:
93
+ browser_binary_path = None
94
+
95
+ bu_browser = CustomBrowser(
96
+ config=BrowserConfig(
97
+ headless=headless,
98
+ browser_binary_path=browser_binary_path,
99
+ extra_browser_args=extra_args,
100
+ wss_url=wss_url,
101
+ cdp_url=cdp_url,
102
+ new_context_config=BrowserContextConfig(
103
+ window_width=window_w,
104
+ window_height=window_h,
105
+ )
106
+ )
107
+ )
108
+
109
+ context_config = BrowserContextConfig(
110
+ save_downloads_path="./tmp/downloads",
111
+ window_height=window_h,
112
+ window_width=window_w,
113
+ force_new_context=True,
114
+ )
115
+ bu_browser_context = await bu_browser.new_context(config=context_config)
116
+
117
+ # Simple controller example, replace with your actual implementation if needed
118
+ bu_controller = CustomController()
119
+
120
+ # Construct the task prompt for BrowserUseAgent
121
+ # Instruct it to find specific info and return title/URL
122
+ bu_task_prompt = f"""
123
+ Research Task: {task_query}
124
+ Objective: Find relevant information answering the query.
125
+ Output Requirements: For each relevant piece of information found, please provide:
126
+ 1. A concise summary of the information.
127
+ 2. The title of the source page or document.
128
+ 3. The URL of the source.
129
+ Focus on accuracy and relevance. Avoid irrelevant details.
130
+ PDF cannot directly extract _content, please try to download first, then using read_file, if you can't save or read, please try other methods.
131
+ """
132
+
133
+ bu_agent_instance = BrowserUseAgent(
134
+ task=bu_task_prompt,
135
+ llm=llm, # Use the passed LLM
136
+ browser=bu_browser,
137
+ browser_context=bu_browser_context,
138
+ controller=bu_controller,
139
+ use_vision=use_vision,
140
+ source="webui",
141
+ )
142
+
143
+ # Store instance for potential stop() call
144
+ task_key = f"{task_id}_{uuid.uuid4()}"
145
+ _BROWSER_AGENT_INSTANCES[task_key] = bu_agent_instance
146
+
147
+ # --- Run with Stop Check ---
148
+ # BrowserUseAgent needs to internally check a stop signal or have a stop method.
149
+ # We simulate checking before starting and assume `run` might be interruptible
150
+ # or have its own stop mechanism we can trigger via bu_agent_instance.stop().
151
+ if stop_event.is_set():
152
+ logger.info(f"Browser task for '{task_query}' cancelled before start.")
153
+ return {"query": task_query, "result": None, "status": "cancelled"}
154
+
155
+ # The run needs to be awaitable and ideally accept a stop signal or have a .stop() method
156
+ # result = await bu_agent_instance.run(max_steps=max_steps) # Add max_steps if applicable
157
+ # Let's assume a simplified run for now
158
+ logger.info(f"Running BrowserUseAgent for: {task_query}")
159
+ result = await bu_agent_instance.run() # Assuming run is the main method
160
+ logger.info(f"BrowserUseAgent finished for: {task_query}")
161
+
162
+ final_data = result.final_result()
163
+
164
+ if stop_event.is_set():
165
+ logger.info(f"Browser task for '{task_query}' stopped during execution.")
166
+ return {"query": task_query, "result": final_data, "status": "stopped"}
167
+ else:
168
+ logger.info(f"Browser result for '{task_query}': {final_data}")
169
+ return {"query": task_query, "result": final_data, "status": "completed"}
170
+
171
+ except Exception as e:
172
+ logger.error(
173
+ f"Error during browser task for query '{task_query}': {e}", exc_info=True
174
+ )
175
+ return {"query": task_query, "error": str(e), "status": "failed"}
176
+ finally:
177
+ if bu_browser_context:
178
+ try:
179
+ await bu_browser_context.close()
180
+ bu_browser_context = None
181
+ logger.info("Closed browser context.")
182
+ except Exception as e:
183
+ logger.error(f"Error closing browser context: {e}")
184
+ if bu_browser:
185
+ try:
186
+ await bu_browser.close()
187
+ bu_browser = None
188
+ logger.info("Closed browser.")
189
+ except Exception as e:
190
+ logger.error(f"Error closing browser: {e}")
191
+
192
+ if task_key in _BROWSER_AGENT_INSTANCES:
193
+ del _BROWSER_AGENT_INSTANCES[task_key]
194
+
195
+
196
+ class BrowserSearchInput(BaseModel):
197
+ queries: List[str] = Field(
198
+ description="List of distinct search queries to find information relevant to the research task."
199
+ )
200
+
201
+
202
+ async def _run_browser_search_tool(
203
+ queries: List[str],
204
+ task_id: str, # Injected dependency
205
+ llm: Any, # Injected dependency
206
+ browser_config: Dict[str, Any],
207
+ stop_event: threading.Event,
208
+ max_parallel_browsers: int = 1,
209
+ ) -> List[Dict[str, Any]]:
210
+ """
211
+ Internal function to execute parallel browser searches based on LLM-provided queries.
212
+ Handles concurrency and stop signals.
213
+ """
214
+
215
+ # Limit queries just in case LLM ignores the description
216
+ queries = queries[:max_parallel_browsers]
217
+ logger.info(
218
+ f"[Browser Tool {task_id}] Running search for {len(queries)} queries: {queries}"
219
+ )
220
+
221
+ results = []
222
+ semaphore = asyncio.Semaphore(max_parallel_browsers)
223
+
224
+ async def task_wrapper(query):
225
+ async with semaphore:
226
+ if stop_event.is_set():
227
+ logger.info(
228
+ f"[Browser Tool {task_id}] Skipping task due to stop signal: {query}"
229
+ )
230
+ return {"query": query, "result": None, "status": "cancelled"}
231
+ # Pass necessary injected configs and the stop event
232
+ return await run_single_browser_task(
233
+ query,
234
+ task_id,
235
+ llm, # Pass the main LLM (or a dedicated one if needed)
236
+ browser_config,
237
+ stop_event,
238
+ # use_vision could be added here if needed
239
+ )
240
+
241
+ tasks = [task_wrapper(query) for query in queries]
242
+ search_results = await asyncio.gather(*tasks, return_exceptions=True)
243
+
244
+ processed_results = []
245
+ for i, res in enumerate(search_results):
246
+ query = queries[i] # Get corresponding query
247
+ if isinstance(res, Exception):
248
+ logger.error(
249
+ f"[Browser Tool {task_id}] Gather caught exception for query '{query}': {res}",
250
+ exc_info=True,
251
+ )
252
+ processed_results.append(
253
+ {"query": query, "error": str(res), "status": "failed"}
254
+ )
255
+ elif isinstance(res, dict):
256
+ processed_results.append(res)
257
+ else:
258
+ logger.error(
259
+ f"[Browser Tool {task_id}] Unexpected result type for query '{query}': {type(res)}"
260
+ )
261
+ processed_results.append(
262
+ {"query": query, "error": "Unexpected result type", "status": "failed"}
263
+ )
264
+
265
+ logger.info(
266
+ f"[Browser Tool {task_id}] Finished search. Results count: {len(processed_results)}"
267
+ )
268
+ return processed_results
269
+
270
+
271
+ def create_browser_search_tool(
272
+ llm: Any,
273
+ browser_config: Dict[str, Any],
274
+ task_id: str,
275
+ stop_event: threading.Event,
276
+ max_parallel_browsers: int = 1,
277
+ ) -> StructuredTool:
278
+ """Factory function to create the browser search tool with necessary dependencies."""
279
+ # Use partial to bind the dependencies that aren't part of the LLM call arguments
280
+ from functools import partial
281
+
282
+ bound_tool_func = partial(
283
+ _run_browser_search_tool,
284
+ task_id=task_id,
285
+ llm=llm,
286
+ browser_config=browser_config,
287
+ stop_event=stop_event,
288
+ max_parallel_browsers=max_parallel_browsers,
289
+ )
290
+
291
+ return StructuredTool.from_function(
292
+ coroutine=bound_tool_func,
293
+ name="parallel_browser_search",
294
+ description=f"""Use this tool to actively search the web for information related to a specific research task or question.
295
+ It runs up to {max_parallel_browsers} searches in parallel using a browser agent for better results than simple scraping.
296
+ Provide a list of distinct search queries(up to {max_parallel_browsers}) that are likely to yield relevant information.""",
297
+ args_schema=BrowserSearchInput,
298
+ )
299
+
300
+
301
+ # --- Langgraph State Definition ---
302
+
303
+
304
+ class ResearchTaskItem(TypedDict):
305
+ # step: int # Maybe step within category, or just implicit by order
306
+ task_description: str
307
+ status: str # "pending", "completed", "failed"
308
+ queries: Optional[List[str]]
309
+ result_summary: Optional[str]
310
+
311
+
312
+ class ResearchCategoryItem(TypedDict):
313
+ category_name: str
314
+ tasks: List[ResearchTaskItem]
315
+ # Optional: category_status: str # Could be "pending", "in_progress", "completed"
316
+
317
+
318
+ class DeepResearchState(TypedDict):
319
+ task_id: str
320
+ topic: str
321
+ research_plan: List[ResearchCategoryItem] # CHANGED
322
+ search_results: List[Dict[str, Any]]
323
+ llm: Any
324
+ tools: List[Tool]
325
+ output_dir: Path
326
+ browser_config: Dict[str, Any]
327
+ final_report: Optional[str]
328
+ current_category_index: int
329
+ current_task_index_in_category: int
330
+ stop_requested: bool
331
+ error_message: Optional[str]
332
+ messages: List[BaseMessage]
333
+
334
+
335
+ # --- Langgraph Nodes ---
336
+
337
+
338
+ def _load_previous_state(task_id: str, output_dir: str) -> Dict[str, Any]:
339
+ state_updates = {}
340
+ plan_file = os.path.join(output_dir, PLAN_FILENAME)
341
+ search_file = os.path.join(output_dir, SEARCH_INFO_FILENAME)
342
+
343
+ loaded_plan: List[ResearchCategoryItem] = []
344
+ next_cat_idx, next_task_idx = 0, 0
345
+ found_pending = False
346
+
347
+ if os.path.exists(plan_file):
348
+ try:
349
+ with open(plan_file, "r", encoding="utf-8") as f:
350
+ current_category: Optional[ResearchCategoryItem] = None
351
+ lines = f.readlines()
352
+ cat_counter = 0
353
+ task_counter_in_cat = 0
354
+
355
+ for line_num, line_content in enumerate(lines):
356
+ line = line_content.strip()
357
+ if line.startswith("## "): # Category
358
+ if current_category: # Save previous category
359
+ loaded_plan.append(current_category)
360
+ if not found_pending: # If previous category was all done, advance cat counter
361
+ cat_counter += 1
362
+ task_counter_in_cat = 0
363
+ category_name = line[line.find(" "):].strip() # Get text after "## X. "
364
+ current_category = ResearchCategoryItem(category_name=category_name, tasks=[])
365
+ elif (line.startswith("- [ ]") or line.startswith("- [x]") or line.startswith(
366
+ "- [-]")) and current_category: # Task
367
+ status = "pending"
368
+ if line.startswith("- [x]"):
369
+ status = "completed"
370
+ elif line.startswith("- [-]"):
371
+ status = "failed"
372
+
373
+ task_desc = line[5:].strip()
374
+ current_category["tasks"].append(
375
+ ResearchTaskItem(task_description=task_desc, status=status, queries=None,
376
+ result_summary=None)
377
+ )
378
+ if status == "pending" and not found_pending:
379
+ next_cat_idx = cat_counter
380
+ next_task_idx = task_counter_in_cat
381
+ found_pending = True
382
+ if not found_pending: # only increment if previous tasks were completed/failed
383
+ task_counter_in_cat += 1
384
+
385
+ if current_category: # Append last category
386
+ loaded_plan.append(current_category)
387
+
388
+ if loaded_plan:
389
+ state_updates["research_plan"] = loaded_plan
390
+ if not found_pending and loaded_plan: # All tasks were completed or failed
391
+ next_cat_idx = len(loaded_plan) # Points beyond the last category
392
+ next_task_idx = 0
393
+ state_updates["current_category_index"] = next_cat_idx
394
+ state_updates["current_task_index_in_category"] = next_task_idx
395
+ logger.info(
396
+ f"Loaded hierarchical research plan from {plan_file}. "
397
+ f"Next task: Category {next_cat_idx}, Task {next_task_idx} in category."
398
+ )
399
+ else:
400
+ logger.warning(f"Plan file {plan_file} was empty or malformed.")
401
+
402
+ except Exception as e:
403
+ logger.error(f"Failed to load or parse research plan {plan_file}: {e}", exc_info=True)
404
+ state_updates["error_message"] = f"Failed to load research plan: {e}"
405
+ else:
406
+ logger.info(f"Plan file {plan_file} not found. Will start fresh.")
407
+
408
+ if os.path.exists(search_file):
409
+ try:
410
+ with open(search_file, "r", encoding="utf-8") as f:
411
+ state_updates["search_results"] = json.load(f)
412
+ logger.info(f"Loaded search results from {search_file}")
413
+ except Exception as e:
414
+ logger.error(f"Failed to load search results {search_file}: {e}")
415
+ state_updates["error_message"] = (
416
+ state_updates.get("error_message", "") + f" Failed to load search results: {e}").strip()
417
+
418
+ return state_updates
419
+
420
+
421
+ def _save_plan_to_md(plan: List[ResearchCategoryItem], output_dir: str):
422
+ plan_file = os.path.join(output_dir, PLAN_FILENAME)
423
+ try:
424
+ with open(plan_file, "w", encoding="utf-8") as f:
425
+ f.write(f"# Research Plan\n\n")
426
+ for cat_idx, category in enumerate(plan):
427
+ f.write(f"## {cat_idx + 1}. {category['category_name']}\n\n")
428
+ for task_idx, task in enumerate(category['tasks']):
429
+ marker = "- [x]" if task["status"] == "completed" else "- [ ]" if task[
430
+ "status"] == "pending" else "- [-]" # [-] for failed
431
+ f.write(f" {marker} {task['task_description']}\n")
432
+ f.write("\n")
433
+ logger.info(f"Hierarchical research plan saved to {plan_file}")
434
+ except Exception as e:
435
+ logger.error(f"Failed to save research plan to {plan_file}: {e}")
436
+
437
+
438
+ def _save_search_results_to_json(results: List[Dict[str, Any]], output_dir: str):
439
+ """Appends or overwrites search results to a JSON file."""
440
+ search_file = os.path.join(output_dir, SEARCH_INFO_FILENAME)
441
+ try:
442
+ # Simple overwrite for now, could be append
443
+ with open(search_file, "w", encoding="utf-8") as f:
444
+ json.dump(results, f, indent=2, ensure_ascii=False)
445
+ logger.info(f"Search results saved to {search_file}")
446
+ except Exception as e:
447
+ logger.error(f"Failed to save search results to {search_file}: {e}")
448
+
449
+
450
+ def _save_report_to_md(report: str, output_dir: Path):
451
+ """Saves the final report to a markdown file."""
452
+ report_file = os.path.join(output_dir, REPORT_FILENAME)
453
+ try:
454
+ with open(report_file, "w", encoding="utf-8") as f:
455
+ f.write(report)
456
+ logger.info(f"Final report saved to {report_file}")
457
+ except Exception as e:
458
+ logger.error(f"Failed to save final report to {report_file}: {e}")
459
+
460
+
461
+ async def planning_node(state: DeepResearchState) -> Dict[str, Any]:
462
+ logger.info("--- Entering Planning Node ---")
463
+ if state.get("stop_requested"):
464
+ logger.info("Stop requested, skipping planning.")
465
+ return {"stop_requested": True}
466
+
467
+ llm = state["llm"]
468
+ topic = state["topic"]
469
+ existing_plan = state.get("research_plan")
470
+ output_dir = state["output_dir"]
471
+
472
+ if existing_plan and (
473
+ state.get("current_category_index", 0) > 0 or state.get("current_task_index_in_category", 0) > 0):
474
+ logger.info("Resuming with existing plan.")
475
+ _save_plan_to_md(existing_plan, output_dir) # Ensure it's saved initially
476
+ # current_category_index and current_task_index_in_category should be set by _load_previous_state
477
+ return {"research_plan": existing_plan}
478
+
479
+ logger.info(f"Generating new research plan for topic: {topic}")
480
+
481
+ prompt_text = f"""You are a meticulous research assistant. Your goal is to create a hierarchical research plan to thoroughly investigate the topic: "{topic}".
482
+ The plan should be structured into several main research categories. Each category should contain a list of specific, actionable research tasks or questions.
483
+ Format the output as a JSON list of objects. Each object represents a research category and should have:
484
+ 1. "category_name": A string for the name of the research category.
485
+ 2. "tasks": A list of strings, where each string is a specific research task for that category.
486
+
487
+ Example JSON Output:
488
+ [
489
+ {{
490
+ "category_name": "Understanding Core Concepts and Definitions",
491
+ "tasks": [
492
+ "Define the primary terminology associated with '{topic}'.",
493
+ "Identify the fundamental principles and theories underpinning '{topic}'."
494
+ ]
495
+ }},
496
+ {{
497
+ "category_name": "Historical Development and Key Milestones",
498
+ "tasks": [
499
+ "Trace the historical evolution of '{topic}'.",
500
+ "Identify key figures, events, or breakthroughs in the development of '{topic}'."
501
+ ]
502
+ }},
503
+ {{
504
+ "category_name": "Current State-of-the-Art and Applications",
505
+ "tasks": [
506
+ "Analyze the current advancements and prominent applications of '{topic}'.",
507
+ "Investigate ongoing research and active areas of development related to '{topic}'."
508
+ ]
509
+ }},
510
+ {{
511
+ "category_name": "Challenges, Limitations, and Future Outlook",
512
+ "tasks": [
513
+ "Identify the major challenges and limitations currently facing '{topic}'.",
514
+ "Explore potential future trends, ethical considerations, and societal impacts of '{topic}'."
515
+ ]
516
+ }}
517
+ ]
518
+
519
+ Generate a plan with 3-10 categories, and 2-6 tasks per category for the topic: "{topic}" according to the complexity of the topic.
520
+ Ensure the output is a valid JSON array.
521
+ """
522
+ messages = [
523
+ SystemMessage(content="You are a research planning assistant outputting JSON."),
524
+ HumanMessage(content=prompt_text)
525
+ ]
526
+
527
+ try:
528
+ response = await llm.ainvoke(messages)
529
+ raw_content = response.content
530
+ # The LLM might wrap the JSON in backticks
531
+ if raw_content.strip().startswith("```json"):
532
+ raw_content = raw_content.strip()[7:-3].strip()
533
+ elif raw_content.strip().startswith("```"):
534
+ raw_content = raw_content.strip()[3:-3].strip()
535
+
536
+ logger.debug(f"LLM response for plan: {raw_content}")
537
+ parsed_plan_from_llm = json.loads(raw_content)
538
+
539
+ new_plan: List[ResearchCategoryItem] = []
540
+ for cat_idx, category_data in enumerate(parsed_plan_from_llm):
541
+ if not isinstance(category_data,
542
+ dict) or "category_name" not in category_data or "tasks" not in category_data:
543
+ logger.warning(f"Skipping invalid category data: {category_data}")
544
+ continue
545
+
546
+ tasks: List[ResearchTaskItem] = []
547
+ for task_idx, task_desc in enumerate(category_data["tasks"]):
548
+ if isinstance(task_desc, str):
549
+ tasks.append(
550
+ ResearchTaskItem(
551
+ task_description=task_desc,
552
+ status="pending",
553
+ queries=None,
554
+ result_summary=None,
555
+ )
556
+ )
557
+ else: # Sometimes LLM puts tasks as {"task": "description"}
558
+ if isinstance(task_desc, dict) and "task_description" in task_desc:
559
+ tasks.append(
560
+ ResearchTaskItem(
561
+ task_description=task_desc["task_description"],
562
+ status="pending",
563
+ queries=None,
564
+ result_summary=None,
565
+ )
566
+ )
567
+ elif isinstance(task_desc, dict) and "task" in task_desc: # common LLM mistake
568
+ tasks.append(
569
+ ResearchTaskItem(
570
+ task_description=task_desc["task"],
571
+ status="pending",
572
+ queries=None,
573
+ result_summary=None,
574
+ )
575
+ )
576
+ else:
577
+ logger.warning(
578
+ f"Skipping invalid task data: {task_desc} in category {category_data['category_name']}")
579
+
580
+ new_plan.append(
581
+ ResearchCategoryItem(
582
+ category_name=category_data["category_name"],
583
+ tasks=tasks,
584
+ )
585
+ )
586
+
587
+ if not new_plan:
588
+ logger.error("LLM failed to generate a valid plan structure from JSON.")
589
+ return {"error_message": "Failed to generate research plan structure."}
590
+
591
+ logger.info(f"Generated research plan with {len(new_plan)} categories.")
592
+ _save_plan_to_md(new_plan, output_dir) # Save the hierarchical plan
593
+
594
+ return {
595
+ "research_plan": new_plan,
596
+ "current_category_index": 0,
597
+ "current_task_index_in_category": 0,
598
+ "search_results": [],
599
+ }
600
+
601
+ except json.JSONDecodeError as e:
602
+ logger.error(f"Failed to parse JSON from LLM for plan: {e}. Response was: {raw_content}", exc_info=True)
603
+ return {"error_message": f"LLM generated invalid JSON for research plan: {e}"}
604
+ except Exception as e:
605
+ logger.error(f"Error during planning: {e}", exc_info=True)
606
+ return {"error_message": f"LLM Error during planning: {e}"}
607
+
608
+
609
+ async def research_execution_node(state: DeepResearchState) -> Dict[str, Any]:
610
+ logger.info("--- Entering Research Execution Node ---")
611
+ if state.get("stop_requested"):
612
+ logger.info("Stop requested, skipping research execution.")
613
+ return {
614
+ "stop_requested": True,
615
+ "current_category_index": state["current_category_index"],
616
+ "current_task_index_in_category": state["current_task_index_in_category"],
617
+ }
618
+
619
+ plan = state["research_plan"]
620
+ cat_idx = state["current_category_index"]
621
+ task_idx = state["current_task_index_in_category"]
622
+ llm = state["llm"]
623
+ tools = state["tools"]
624
+ output_dir = str(state["output_dir"])
625
+ task_id = state["task_id"] # For _AGENT_STOP_FLAGS
626
+
627
+ # This check should ideally be handled by `should_continue`
628
+ if not plan or cat_idx >= len(plan):
629
+ logger.info("Research plan complete or categories exhausted.")
630
+ return {} # should route to synthesis
631
+
632
+ current_category = plan[cat_idx]
633
+ if task_idx >= len(current_category["tasks"]):
634
+ logger.info(f"All tasks in category '{current_category['category_name']}' completed. Moving to next category.")
635
+ # This logic is now effectively handled by should_continue and the index updates below
636
+ # The next iteration will be caught by should_continue or this node with updated indices
637
+ return {
638
+ "current_category_index": cat_idx + 1,
639
+ "current_task_index_in_category": 0,
640
+ "messages": state["messages"] # Pass messages along
641
+ }
642
+
643
+ current_task = current_category["tasks"][task_idx]
644
+
645
+ if current_task["status"] == "completed":
646
+ logger.info(
647
+ f"Task '{current_task['task_description']}' in category '{current_category['category_name']}' already completed. Skipping.")
648
+ # Logic to find next task
649
+ next_task_idx = task_idx + 1
650
+ next_cat_idx = cat_idx
651
+ if next_task_idx >= len(current_category["tasks"]):
652
+ next_cat_idx += 1
653
+ next_task_idx = 0
654
+ return {
655
+ "current_category_index": next_cat_idx,
656
+ "current_task_index_in_category": next_task_idx,
657
+ "messages": state["messages"] # Pass messages along
658
+ }
659
+
660
+ logger.info(
661
+ f"Executing research task: '{current_task['task_description']}' (Category: '{current_category['category_name']}')"
662
+ )
663
+
664
+ llm_with_tools = llm.bind_tools(tools)
665
+
666
+ # Construct messages for LLM invocation
667
+ task_prompt_content = (
668
+ f"Current Research Category: {current_category['category_name']}\n"
669
+ f"Specific Task: {current_task['task_description']}\n\n"
670
+ "Please use the available tools, especially 'parallel_browser_search', to gather information for this specific task. "
671
+ "Provide focused search queries relevant ONLY to this task. "
672
+ "If you believe you have sufficient information from previous steps for this specific task, you can indicate that you are ready to summarize or that no further search is needed."
673
+ )
674
+ current_task_message_history = [
675
+ HumanMessage(content=task_prompt_content)
676
+ ]
677
+ if not state["messages"]: # First actual execution message
678
+ invocation_messages = [
679
+ SystemMessage(
680
+ content="You are a research assistant executing one task of a research plan. Focus on the current task only."),
681
+ ] + current_task_message_history
682
+ else:
683
+ invocation_messages = state["messages"] + current_task_message_history
684
+
685
+ try:
686
+ logger.info(f"Invoking LLM with tools for task: {current_task['task_description']}")
687
+ ai_response: BaseMessage = await llm_with_tools.ainvoke(invocation_messages)
688
+ logger.info("LLM invocation complete.")
689
+
690
+ tool_results = []
691
+ executed_tool_names = []
692
+ current_search_results = state.get("search_results", []) # Get existing search results
693
+
694
+ if not isinstance(ai_response, AIMessage) or not ai_response.tool_calls:
695
+ logger.warning(
696
+ f"LLM did not call any tool for task '{current_task['task_description']}'. Response: {ai_response.content[:100]}..."
697
+ )
698
+ current_task["status"] = "pending"  # Or "completed_no_tool" if LLM explains it's done
+ current_task["result_summary"] = f"LLM did not use a tool. Response: {ai_response.content}"
+ # We still save the plan and advance to the next task so the graph does not
+ # loop on this task, and return a proper state-update dict (returning the raw
+ # task dict would push unknown keys into the graph state).
+ _save_plan_to_md(plan, output_dir)
+ next_task_idx, next_cat_idx = task_idx + 1, cat_idx
+ if next_task_idx >= len(current_category["tasks"]):
+ next_cat_idx, next_task_idx = next_cat_idx + 1, 0
+ return {"research_plan": plan, "current_category_index": next_cat_idx,
+ "current_task_index_in_category": next_task_idx, "messages": state["messages"]}
704
+ else:
705
+ # Process tool calls
706
+ for tool_call in ai_response.tool_calls:
707
+ tool_name = tool_call.get("name")
708
+ tool_args = tool_call.get("args", {})
709
+ tool_call_id = tool_call.get("id")
710
+
711
+ logger.info(f"LLM requested tool call: {tool_name} with args: {tool_args}")
712
+ executed_tool_names.append(tool_name)
713
+ selected_tool = next((t for t in tools if t.name == tool_name), None)
714
+
715
+ if not selected_tool:
716
+ logger.error(f"LLM called tool '{tool_name}' which is not available.")
717
+ tool_results.append(
718
+ ToolMessage(content=f"Error: Tool '{tool_name}' not found.", tool_call_id=tool_call_id))
719
+ continue
720
+
721
+ try:
722
+ stop_event = _AGENT_STOP_FLAGS.get(task_id)
723
+ if stop_event and stop_event.is_set():
724
+ logger.info(f"Stop requested before executing tool: {tool_name}")
725
+ current_task["status"] = "pending" # Or a new "stopped" status
726
+ _save_plan_to_md(plan, output_dir)
727
+ return {"stop_requested": True, "research_plan": plan, "current_category_index": cat_idx,
728
+ "current_task_index_in_category": task_idx}
729
+
730
+ logger.info(f"Executing tool: {tool_name}")
731
+ tool_output = await selected_tool.ainvoke(tool_args)
732
+ logger.info(f"Tool '{tool_name}' executed successfully.")
733
+
734
+ if tool_name == "parallel_browser_search":
735
+ current_search_results.extend(tool_output) # tool_output is List[Dict]
736
+ else: # For other tools, we might need specific handling or just log
737
+ logger.info(f"Result from tool '{tool_name}': {str(tool_output)[:200]}...")
738
+ # Storing non-browser results might need a different structure or key in search_results
739
+ current_search_results.append(
740
+ {"tool_name": tool_name, "args": tool_args, "output": str(tool_output),
741
+ "status": "completed"})
742
+
743
+ tool_results.append(ToolMessage(content=json.dumps(tool_output), tool_call_id=tool_call_id))
744
+
745
+ except Exception as e:
746
+ logger.error(f"Error executing tool '{tool_name}': {e}", exc_info=True)
747
+ tool_results.append(
748
+ ToolMessage(content=f"Error executing tool {tool_name}: {e}", tool_call_id=tool_call_id))
749
+ current_search_results.append(
750
+ {"tool_name": tool_name, "args": tool_args, "status": "failed", "error": str(e)})
751
+
752
+ # After processing all tool calls for this task
753
+ step_failed_tool_execution = any("Error:" in str(tr.content) for tr in tool_results)
754
+ # A browser task counts as attempted-successfully if a parallel search ran and
+ # did not error out immediately; the search itself reports per-query status.
+ # (Informational only: the status logic below keys off step_failed_tool_execution
+ # and executed_tool_names.)
+ browser_tool_attempted_successfully = "parallel_browser_search" in executed_tool_names and not step_failed_tool_execution
757
+
758
+ if step_failed_tool_execution:
759
+ current_task["status"] = "failed"
760
+ current_task["result_summary"] = f"Tool execution failed. Errors: {[tr.content for tr in tool_results if 'Error' in str(tr.content)]}"
762
+ elif executed_tool_names: # If any tool was called
763
+ current_task["status"] = "completed"
764
+ current_task["result_summary"] = f"Executed tool(s): {', '.join(executed_tool_names)}."
765
+ # TODO: Could ask LLM to summarize the tool_results for this task if needed, rather than just listing tools.
766
+ else: # No tool calls but AI response had .tool_calls structure (empty)
767
+ current_task["status"] = "failed" # Or a more specific status
768
+ current_task["result_summary"] = "LLM prepared for tool call but provided no tools."
769
+
770
+ # Save progress
771
+ _save_plan_to_md(plan, output_dir)
772
+ _save_search_results_to_json(current_search_results, output_dir)
773
+
774
+ # Determine next indices
775
+ next_task_idx = task_idx + 1
776
+ next_cat_idx = cat_idx
777
+ if next_task_idx >= len(current_category["tasks"]):
778
+ next_cat_idx += 1
779
+ next_task_idx = 0
780
+
781
+ updated_messages = state["messages"] + current_task_message_history + [ai_response] + tool_results
782
+
783
+ return {
784
+ "research_plan": plan,
785
+ "search_results": current_search_results,
786
+ "current_category_index": next_cat_idx,
787
+ "current_task_index_in_category": next_task_idx,
788
+ "messages": updated_messages,
789
+ }
790
+
791
+ except Exception as e:
792
+ logger.error(f"Unhandled error during research execution for task '{current_task['task_description']}': {e}",
793
+ exc_info=True)
794
+ current_task["status"] = "failed"
795
+ _save_plan_to_md(plan, output_dir)
796
+ # Determine next indices even on error to attempt to move on
797
+ next_task_idx = task_idx + 1
798
+ next_cat_idx = cat_idx
799
+ if next_task_idx >= len(current_category["tasks"]):
800
+ next_cat_idx += 1
801
+ next_task_idx = 0
802
+ return {
803
+ "research_plan": plan,
804
+ "current_category_index": next_cat_idx,
805
+ "current_task_index_in_category": next_task_idx,
806
+ "error_message": f"Core Execution Error on task '{current_task['task_description']}': {e}",
807
+ "messages": state["messages"] + current_task_message_history # Preserve messages up to error
808
+ }
809
+
810
+
811
+ async def synthesis_node(state: DeepResearchState) -> Dict[str, Any]:
812
+ """Synthesizes the final report from the collected search results."""
813
+ logger.info("--- Entering Synthesis Node ---")
814
+ if state.get("stop_requested"):
815
+ logger.info("Stop requested, skipping synthesis.")
816
+ return {"stop_requested": True}
817
+
818
+ llm = state["llm"]
819
+ topic = state["topic"]
820
+ search_results = state.get("search_results", [])
821
+ output_dir = state["output_dir"]
822
+ plan = state["research_plan"] # Include plan for context
823
+
824
+ if not search_results:
825
+ logger.warning("No search results found to synthesize report.")
826
+ report = f"# Research Report: {topic}\n\nNo information was gathered during the research process."
827
+ _save_report_to_md(report, output_dir)
828
+ return {"final_report": report}
829
+
830
+ logger.info(
831
+ f"Synthesizing report from {len(search_results)} collected search result entries."
832
+ )
833
+
834
+ # Prepare context for the LLM
835
+ # Format search results nicely, maybe group by query or original plan step
836
+ formatted_results = ""
837
+ references = {}
838
+ ref_count = 1
839
+ for i, result_entry in enumerate(search_results):
840
+ query = result_entry.get("query", "Unknown Query") # From parallel_browser_search
841
+ tool_name = result_entry.get("tool_name") # From other tools
842
+ status = result_entry.get("status", "unknown")
843
+ result_data = result_entry.get("result") # From BrowserUseAgent's final_result
844
+ tool_output_str = result_entry.get("output") # From other tools
845
+
846
+ if tool_name == "parallel_browser_search" and status == "completed" and result_data:
847
+ # result_data is the summary from BrowserUseAgent
848
+ formatted_results += f'### Finding from Web Search Query: "{query}"\n'
849
+ formatted_results += f"- **Summary:**\n{result_data}\n" # result_data is already a summary string here
850
+ # If result_data contained title/URL, you'd format them here.
851
+ # The current BrowserUseAgent returns a string summary directly as 'final_data' in run_single_browser_task
852
+ formatted_results += "---\n"
853
+ elif tool_name != "parallel_browser_search" and status == "completed" and tool_output_str:
854
+ formatted_results += f'### Finding from Tool: "{tool_name}" (Args: {result_entry.get("args")})\n'
855
+ formatted_results += f"- **Output:**\n{tool_output_str}\n"
856
+ formatted_results += "---\n"
857
+ elif status == "failed":
858
+ error = result_entry.get("error")
859
+ q_or_t = f"Query: \"{query}\"" if query != "Unknown Query" else f"Tool: \"{tool_name}\""
860
+ formatted_results += f'### Failed {q_or_t}\n'
861
+ formatted_results += f"- **Error:** {error}\n"
862
+ formatted_results += "---\n"
863
+
864
+ # Prepare the research plan context
865
+ plan_summary = "\nResearch Plan Followed:\n"
866
+ for cat_idx, category in enumerate(plan):
867
+ plan_summary += f"\n#### Category {cat_idx + 1}: {category['category_name']}\n"
868
+ for task_idx, task in enumerate(category['tasks']):
869
+ marker = "[x]" if task["status"] == "completed" else "[ ]" if task["status"] == "pending" else "[-]"
870
+ plan_summary += f" - {marker} {task['task_description']}\n"
871
+
872
+ synthesis_prompt = ChatPromptTemplate.from_messages(
873
+ [
874
+ (
875
+ "system",
876
+ """You are a professional researcher tasked with writing a comprehensive and well-structured report based on collected findings.
877
+ The report should address the research topic thoroughly, synthesizing the information gathered from various sources.
878
+ Structure the report logically:
879
+ 1. Briefly introduce the topic and the report's scope (mentioning the research plan followed, including categories and tasks, is good).
880
+ 2. Discuss the key findings, organizing them thematically, possibly aligning with the research categories. Analyze, compare, and contrast information.
881
+ 3. Summarize the main points and offer concluding thoughts.
882
+
883
+ Ensure the tone is objective and professional.
884
+ If findings are contradictory or incomplete, acknowledge this.
885
+ """, # Removed citation part for simplicity for now, as browser agent returns summaries.
886
+ ),
887
+ (
+ "human",
+ # Plain (non f-string) template: {topic}, {plan_summary} and {formatted_results}
+ # are filled by format_prompt() below. An f-string here would pre-interpolate
+ # them and let literal braces in the findings break template formatting.
+ """
890
+ **Research Topic:** {topic}
891
+
892
+ {plan_summary}
893
+
894
+ **Collected Findings:**
895
+ ```
896
+ {formatted_results}
897
+ ```
898
+
899
+ Please generate the final research report in Markdown format based **only** on the information above.
900
+ """,
901
+ ),
902
+ ]
903
+ )
904
+
905
+ try:
906
+ response = await llm.ainvoke(
907
+ synthesis_prompt.format_prompt(
908
+ topic=topic,
909
+ plan_summary=plan_summary,
910
+ formatted_results=formatted_results,
911
+ ).to_messages()
912
+ )
913
+ final_report_md = response.content
914
+
915
+ # Append the reference list to the end of the generated markdown. Note that
+ # `references` is never populated above (the browser agent returns plain
+ # summaries), so this block is currently inert; kept for future citation support.
+ if references:
917
+ report_references_section = "\n\n## References\n\n"
918
+ # Sort refs by ID for consistent output
919
+ sorted_refs = sorted(references.values(), key=lambda x: x["id"])
920
+ for ref in sorted_refs:
921
+ report_references_section += (
922
+ f"[{ref['id']}] {ref['title']} - {ref['url']}\n"
923
+ )
924
+ final_report_md += report_references_section
925
+
926
+ logger.info("Successfully synthesized the final report.")
927
+ _save_report_to_md(final_report_md, output_dir)
928
+ return {"final_report": final_report_md}
929
+
930
+ except Exception as e:
931
+ logger.error(f"Error during report synthesis: {e}", exc_info=True)
932
+ return {"error_message": f"LLM Error during synthesis: {e}"}
933
+
934
+
935
+ # --- Langgraph Edges and Conditional Logic ---
936
+
937
+
938
+ def should_continue(state: DeepResearchState) -> str:
939
+ logger.info("--- Evaluating Condition: Should Continue? ---")
940
+ if state.get("stop_requested"):
941
+ logger.info("Stop requested, routing to END.")
942
+ return "end_run"
943
+ if state.get("error_message") and "Core Execution Error" in state["error_message"]: # Critical error in node
944
+ logger.warning(f"Critical error detected: {state['error_message']}. Routing to END.")
945
+ return "end_run"
946
+
947
+ plan = state.get("research_plan")
948
+ cat_idx = state.get("current_category_index", 0)
949
+ task_idx = state.get("current_task_index_in_category", 0) # This is the *next* task to check
950
+
951
+ if not plan:
952
+ logger.warning("No research plan found. Routing to END.")
953
+ return "end_run"
954
+
955
+ # Check if the current indices point to a valid pending task
956
+ if cat_idx < len(plan):
957
+ current_category = plan[cat_idx]
958
+ if task_idx < len(current_category["tasks"]):
959
+ # We are trying to execute the task at plan[cat_idx]["tasks"][task_idx]
960
+ # The research_execution_node will handle if it's already completed.
961
+ logger.info(
962
+ f"Plan has potential pending tasks (next up: Category {cat_idx}, Task {task_idx}). Routing to Research Execution."
963
+ )
964
+ return "execute_research"
965
+ else: # task_idx is out of bounds for current category, means we need to check next category
966
+ if cat_idx + 1 < len(plan): # If there is a next category
967
+ logger.info(
968
+ f"Finished tasks in category {cat_idx}. Moving to category {cat_idx + 1}. Routing to Research Execution."
969
+ )
970
+ # research_execution_node will update state to {current_category_index: cat_idx + 1, current_task_index_in_category: 0}
971
+ # Or rather, the previous execution node already set these indices to the start of the next category.
972
+ return "execute_research"
973
+
974
+ # If we've gone through all categories and tasks (cat_idx >= len(plan))
975
+ logger.info("All plan categories and tasks processed or current indices are out of bounds. Routing to Synthesis.")
976
+ return "synthesize_report"
977
+
978
+
979
+ # --- DeepSearchAgent Class ---
980
+
981
+
982
+ class DeepResearchAgent:
983
+ def __init__(
984
+ self,
985
+ llm: Any,
986
+ browser_config: Dict[str, Any],
987
+ mcp_server_config: Optional[Dict[str, Any]] = None,
988
+ ):
989
+ """
990
+ Initializes the DeepSearchAgent.
991
+
992
+ Args:
993
+ llm: The Langchain compatible language model instance.
994
+ browser_config: Configuration dictionary for the BrowserUseAgent tool.
995
+ Example: {"headless": True, "window_width": 1280, ...}
996
+ mcp_server_config: Optional configuration for the MCP client.
997
+ """
998
+ self.llm = llm
999
+ self.browser_config = browser_config
1000
+ self.mcp_server_config = mcp_server_config
1001
+ self.mcp_client = None
1002
+ self.stopped = False
1003
+ self.graph = self._compile_graph()
1004
+ self.current_task_id: Optional[str] = None
1005
+ self.stop_event: Optional[threading.Event] = None
1006
+ self.runner: Optional[asyncio.Task] = None # To hold the asyncio task for run
1007
+
1008
+ async def _setup_tools(
1009
+ self, task_id: str, stop_event: threading.Event, max_parallel_browsers: int = 1
1010
+ ) -> List[Tool]:
1011
+ """Sets up the basic tools (File I/O) and optional MCP tools."""
1012
+ tools = [
1013
+ WriteFileTool(),
1014
+ ReadFileTool(),
1015
+ ListDirectoryTool(),
1016
+ ] # Basic file operations
1017
+ browser_use_tool = create_browser_search_tool(
1018
+ llm=self.llm,
1019
+ browser_config=self.browser_config,
1020
+ task_id=task_id,
1021
+ stop_event=stop_event,
1022
+ max_parallel_browsers=max_parallel_browsers,
1023
+ )
1024
+ tools += [browser_use_tool]
1025
+ # Add MCP tools if config is provided
1026
+ if self.mcp_server_config:
1027
+ try:
1028
+ logger.info("Setting up MCP client and tools...")
1029
+ if not self.mcp_client:
1030
+ self.mcp_client = await setup_mcp_client_and_tools(
1031
+ self.mcp_server_config
1032
+ )
1033
+ mcp_tools = self.mcp_client.get_tools()
1034
+ logger.info(f"Loaded {len(mcp_tools)} MCP tools.")
1035
+ tools.extend(mcp_tools)
1036
+ except Exception as e:
1037
+ logger.error(f"Failed to set up MCP tools: {e}", exc_info=True)
1038
+ tools_map = {tool.name: tool for tool in tools}
+ # De-duplicate by name and return a concrete list to match the List[Tool]
+ # signature (the unreachable duplicate `elif self.mcp_server_config:` branch
+ # has been removed).
+ return list(tools_map.values())
1044
+
1045
+ async def close_mcp_client(self):
1046
+ if self.mcp_client:
1047
+ await self.mcp_client.__aexit__(None, None, None)
1048
+ self.mcp_client = None
1049
+
1050
+ def _compile_graph(self):  # returns the compiled LangGraph app, not the raw StateGraph
1051
+ """Compiles the Langgraph state machine."""
1052
+ workflow = StateGraph(DeepResearchState)
1053
+
1054
+ # Add nodes
1055
+ workflow.add_node("plan_research", planning_node)
1056
+ workflow.add_node("execute_research", research_execution_node)
1057
+ workflow.add_node("synthesize_report", synthesis_node)
1058
+ workflow.add_node(
+ "end_run", lambda state: logger.info("--- Reached End Run Node ---") or {}
+ ) # Simple end node: logger.info() returns None, so `or {}` yields an empty state update
1061
+
1062
+ # Define edges
1063
+ workflow.set_entry_point("plan_research")
1064
+
1065
+ workflow.add_edge(
1066
+ "plan_research", "execute_research"
1067
+ ) # Always execute after planning
1068
+
1069
+ # Conditional edge after execution
1070
+ workflow.add_conditional_edges(
1071
+ "execute_research",
1072
+ should_continue,
1073
+ {
1074
+ "execute_research": "execute_research", # Loop back if more steps
1075
+ "synthesize_report": "synthesize_report", # Move to synthesis if done
1076
+ "end_run": "end_run", # End if stop requested or error
1077
+ },
1078
+ )
1079
+
1080
+ workflow.add_edge("synthesize_report", "end_run") # End after synthesis
1081
+
1082
+ app = workflow.compile()
1083
+ return app
1084
+
1085
+ async def run(
1086
+ self,
1087
+ topic: str,
1088
+ task_id: Optional[str] = None,
1089
+ save_dir: str = "./tmp/deep_research",
1090
+ max_parallel_browsers: int = 1,
1091
+ ) -> Dict[str, Any]:
+ """
+ Starts the deep research process and runs it to completion.
+
+ Args:
+ topic: The research topic.
+ task_id: Optional existing task ID to resume. If None, a new ID is generated.
+ save_dir: Root directory for plans, search results, and the final report.
+ max_parallel_browsers: Maximum number of concurrent browser agents.
+
+ Returns:
+ A dict with "status", "message", "task_id" and the final graph state.
+ """
1102
+ if self.runner and not self.runner.done():
1103
+ logger.warning(
1104
+ "Agent is already running. Please stop the current task first."
1105
+ )
1106
+ # Return an error status immediately
1107
+ return {
1108
+ "status": "error",
1109
+ "message": "Agent already running.",
1110
+ "task_id": self.current_task_id,
1111
+ }
1112
+
1113
+ self.current_task_id = task_id if task_id else str(uuid.uuid4())
1114
+ safe_root_dir = "./tmp/deep_research"
1115
+ # Compare absolute paths: a bare normpath() can stay relative and would never
+ # match the absolute safe root, silently redirecting every run to the default.
+ normalized_save_dir = os.path.abspath(os.path.normpath(save_dir))
+ if not normalized_save_dir.startswith(os.path.abspath(safe_root_dir)):
1117
+ logger.warning(f"Unsafe save_dir detected: {save_dir}. Using default directory.")
1118
+ normalized_save_dir = os.path.abspath(safe_root_dir)
1119
+ output_dir = os.path.join(normalized_save_dir, self.current_task_id)
1120
+ os.makedirs(output_dir, exist_ok=True)
1121
+
1122
+ logger.info(
+ f"Starting research task ID: {self.current_task_id} for topic: '{topic}'"
+ )
+ logger.info(f"Output directory: {output_dir}")
1126
+
1127
+ self.stop_event = threading.Event()
1128
+ _AGENT_STOP_FLAGS[self.current_task_id] = self.stop_event
1129
+ agent_tools = await self._setup_tools(
1130
+ self.current_task_id, self.stop_event, max_parallel_browsers
1131
+ )
1132
+ initial_state: DeepResearchState = {
1133
+ "task_id": self.current_task_id,
1134
+ "topic": topic,
1135
+ "research_plan": [],
1136
+ "search_results": [],
1137
+ "messages": [],
1138
+ "llm": self.llm,
1139
+ "tools": agent_tools,
1140
+ "output_dir": Path(output_dir),
1141
+ "browser_config": self.browser_config,
1142
+ "final_report": None,
1143
+ "current_category_index": 0,
1144
+ "current_task_index_in_category": 0,
1145
+ "stop_requested": False,
1146
+ "error_message": None,
1147
+ }
1148
+
1149
+ if task_id:
1150
+ logger.info(f"Attempting to resume task {task_id}...")
1151
+ loaded_state = _load_previous_state(task_id, output_dir)
1152
+ initial_state.update(loaded_state)
1153
+ if loaded_state.get("research_plan"):
1154
+ logger.info(
1155
+ f"Resuming with {len(loaded_state['research_plan'])} plan categories "
1156
+ f"and {len(loaded_state.get('search_results', []))} existing results. "
1157
+ f"Next task: Cat {initial_state['current_category_index']}, Task {initial_state['current_task_index_in_category']}"
1158
+ )
1159
+ # Allow the caller's topic to override the stored one when resuming.
+ initial_state["topic"] = topic
1162
+ else:
1163
+ logger.warning(
1164
+ f"Resume requested for {task_id}, but no previous plan found. Starting fresh."
1165
+ )
1166
+
1167
+ # --- Execute Graph using ainvoke ---
1168
+ final_state = None
1169
+ status = "unknown"
1170
+ message = None
1171
+ try:
1172
+ logger.info(f"Invoking graph execution for task {self.current_task_id}...")
1173
+ self.runner = asyncio.create_task(self.graph.ainvoke(initial_state))
1174
+ final_state = await self.runner
1175
+ logger.info(f"Graph execution finished for task {self.current_task_id}.")
1176
+
1177
+ # Determine status based on final state
1178
+ if self.stop_event and self.stop_event.is_set():
1179
+ status = "stopped"
1180
+ message = "Research process was stopped by request."
1181
+ logger.info(message)
1182
+ elif final_state and final_state.get("error_message"):
1183
+ status = "error"
1184
+ message = final_state["error_message"]
1185
+ logger.error(f"Graph execution completed with error: {message}")
1186
+ elif final_state and final_state.get("final_report"):
1187
+ status = "completed"
1188
+ message = "Research process completed successfully."
1189
+ logger.info(message)
1190
+ else:
1191
+ # If it ends without error/report (e.g., empty plan, stopped before synthesis)
1192
+ status = "finished_incomplete"
1193
+ message = "Research process finished, but may be incomplete (no final report generated)."
1194
+ logger.warning(message)
1195
+
1196
+ except asyncio.CancelledError:
1197
+ status = "cancelled"
1198
+ message = f"Agent run task cancelled for {self.current_task_id}."
1199
+ logger.info(message)
1200
+ # final_state will remain None or the state before cancellation if checkpointing was used
1201
+ except Exception as e:
1202
+ status = "error"
1203
+ message = f"Unhandled error during graph execution for {self.current_task_id}: {e}"
1204
+ logger.error(message, exc_info=True)
1205
+ # final_state will remain None or the state before the error
1206
+ finally:
1207
+ logger.info(f"Cleaning up resources for task {self.current_task_id}")
1208
+ task_id_to_clean = self.current_task_id
1209
+
1210
+ self.stop_event = None
1211
+ self.current_task_id = None
1212
+ self.runner = None # Mark runner as finished
1213
+ await self.close_mcp_client()  # also resets self.mcp_client to None
1215
+
1216
+ # Return a result dictionary including the status and the final state if available
1217
+ return {
1218
+ "status": status,
1219
+ "message": message,
1220
+ "task_id": task_id_to_clean, # Use the stored task_id
1221
+ "final_state": final_state
1222
+ if final_state
1223
+ else {}, # Return the final state dict
1224
+ }
1225
+
1226
+ async def _stop_lingering_browsers(self, task_id):
1227
+ """Attempts to stop any BrowserUseAgent instances associated with the task_id."""
1228
+ keys_to_stop = [
1229
+ key for key in _BROWSER_AGENT_INSTANCES if key.startswith(f"{task_id}_")
1230
+ ]
1231
+ if not keys_to_stop:
1232
+ return
1233
+
1234
+ logger.warning(
1235
+ f"Found {len(keys_to_stop)} potentially lingering browser agents for task {task_id}. Attempting stop..."
1236
+ )
1237
+ for key in keys_to_stop:
1238
+ agent_instance = _BROWSER_AGENT_INSTANCES.get(key)
1239
+ try:
1240
+ if agent_instance:
1241
+ # Assuming BU agent has an async stop method
1242
+ await agent_instance.stop()
1243
+ logger.info(f"Called stop() on browser agent instance {key}")
1244
+ except Exception as e:
1245
+ logger.error(
1246
+ f"Error calling stop() on browser agent instance {key}: {e}"
1247
+ )
1248
+
1249
+ async def stop(self):
1250
+ """Signals the currently running agent task to stop."""
1251
+ if not self.current_task_id or not self.stop_event:
1252
+ logger.info("No agent task is currently running.")
1253
+ return
1254
+
1255
+ logger.info(f"Stop requested for task ID: {self.current_task_id}")
1256
+ self.stop_event.set() # Signal the stop event
1257
+ self.stopped = True
1258
+ await self._stop_lingering_browsers(self.current_task_id)
1259
+
1260
+ def close(self):
+ # Reset the stop flag so this agent instance can be reused for a new run.
+ self.stopped = False
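A minimal usage sketch of the class above, assuming an OPENAI_API_KEY in the environment; get_llm_model is the factory from src/utils/llm_provider.py later in this commit, and the DeepResearchAgent import path is an assumption, since this hunk does not show the module's location:

```python
import asyncio

from src.utils.llm_provider import get_llm_model
from src.agent.deep_research.deep_research_agent import DeepResearchAgent  # assumed path

async def main():
    llm = get_llm_model("openai", model_name="gpt-4o", temperature=0.0)
    agent = DeepResearchAgent(llm=llm, browser_config={"headless": True})
    result = await agent.run("History of autonomous web agents",
                             max_parallel_browsers=2)
    print(result["status"], result["task_id"])

asyncio.run(main())
```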
src/browser/__init__.py ADDED
File without changes
src/browser/custom_browser.py ADDED
@@ -0,0 +1,109 @@
1
+ import asyncio
3
+
4
+ from playwright.async_api import Browser as PlaywrightBrowser
5
+ from playwright.async_api import (
6
+ BrowserContext as PlaywrightBrowserContext,
7
+ )
8
+ from playwright.async_api import (
9
+ Playwright,
10
+ async_playwright,
11
+ )
12
+ from browser_use.browser.browser import Browser, IN_DOCKER
+ from browser_use.browser.context import BrowserContext, BrowserContextConfig
+ import logging
16
+
17
+ from browser_use.browser.chrome import (
18
+ CHROME_ARGS,
19
+ CHROME_DETERMINISTIC_RENDERING_ARGS,
20
+ CHROME_DISABLE_SECURITY_ARGS,
21
+ CHROME_DOCKER_ARGS,
22
+ CHROME_HEADLESS_ARGS,
23
+ )
25
+ from browser_use.browser.utils.screen_resolution import get_screen_resolution, get_window_adjustments
26
+ from browser_use.utils import time_execution_async
27
+ import socket
28
+
29
+ from .custom_context import CustomBrowserContext
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
+ class CustomBrowser(Browser):
35
+
36
+ async def new_context(self, config: BrowserContextConfig | None = None) -> CustomBrowserContext:
37
+ """Create a browser context"""
38
+ browser_config = self.config.model_dump() if self.config else {}
39
+ context_config = config.model_dump() if config else {}
40
+ merged_config = {**browser_config, **context_config}
41
+ return CustomBrowserContext(config=BrowserContextConfig(**merged_config), browser=self)
42
+
43
+ async def _setup_builtin_browser(self, playwright: Playwright) -> PlaywrightBrowser:
44
+ """Sets up and returns a Playwright Browser instance with anti-detection measures."""
45
+ assert self.config.browser_binary_path is None, 'browser_binary_path should be None if trying to use the builtin browsers'
46
+
47
+ # Use the configured window size from new_context_config if available
48
+ if (
49
+ not self.config.headless
50
+ and hasattr(self.config, 'new_context_config')
51
+ and hasattr(self.config.new_context_config, 'window_width')
52
+ and hasattr(self.config.new_context_config, 'window_height')
53
+ ):
54
+ screen_size = {
55
+ 'width': self.config.new_context_config.window_width,
56
+ 'height': self.config.new_context_config.window_height,
57
+ }
58
+ offset_x, offset_y = get_window_adjustments()
59
+ elif self.config.headless:
60
+ screen_size = {'width': 1920, 'height': 1080}
61
+ offset_x, offset_y = 0, 0
62
+ else:
63
+ screen_size = get_screen_resolution()
64
+ offset_x, offset_y = get_window_adjustments()
65
+
66
+ chrome_args = {
67
+ f'--remote-debugging-port={self.config.chrome_remote_debugging_port}',
68
+ *CHROME_ARGS,
69
+ *(CHROME_DOCKER_ARGS if IN_DOCKER else []),
70
+ *(CHROME_HEADLESS_ARGS if self.config.headless else []),
71
+ *(CHROME_DISABLE_SECURITY_ARGS if self.config.disable_security else []),
72
+ *(CHROME_DETERMINISTIC_RENDERING_ARGS if self.config.deterministic_rendering else []),
73
+ f'--window-position={offset_x},{offset_y}',
74
+ f'--window-size={screen_size["width"]},{screen_size["height"]}',
75
+ *self.config.extra_browser_args,
76
+ }
77
+
78
+ # check if chrome remote debugging port is already taken,
79
+ # if so remove the remote-debugging-port arg to prevent conflicts
80
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
81
+ if s.connect_ex(('localhost', self.config.chrome_remote_debugging_port)) == 0:
82
+ chrome_args.remove(f'--remote-debugging-port={self.config.chrome_remote_debugging_port}')
83
+
84
+ browser_class = getattr(playwright, self.config.browser_class)
85
+ args = {
86
+ 'chromium': list(chrome_args),
87
+ 'firefox': [
88
+ *{
89
+ '-no-remote',
90
+ *self.config.extra_browser_args,
91
+ }
92
+ ],
93
+ 'webkit': [
94
+ *{
95
+ '--no-startup-window',
96
+ *self.config.extra_browser_args,
97
+ }
98
+ ],
99
+ }
100
+
101
+ browser = await browser_class.launch(
102
+ channel='chromium', # https://github.com/microsoft/playwright/issues/33566
103
+ headless=self.config.headless,
104
+ args=args[self.config.browser_class],
105
+ proxy=self.config.proxy.model_dump() if self.config.proxy else None,
106
+ handle_sigterm=False,
107
+ handle_sigint=False,
108
+ )
109
+ return browser
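A hedged sketch of driving the subclass above. BrowserConfig is the browser_use config model whose attributes this method reads (headless, disable_security, extra_browser_args, and so on); headless launches get the fixed 1920x1080 size from the branch shown:

```python
import asyncio

from browser_use.browser.browser import BrowserConfig
from src.browser.custom_browser import CustomBrowser

async def main():
    browser = CustomBrowser(config=BrowserConfig(headless=True))
    context = await browser.new_context()  # returns a CustomBrowserContext
    await browser.close()

asyncio.run(main())
```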
src/browser/custom_context.py ADDED
@@ -0,0 +1,22 @@
1
+ import json
2
+ import logging
3
+ import os
4
+
5
+ from browser_use.browser.browser import Browser, IN_DOCKER
6
+ from browser_use.browser.context import BrowserContext, BrowserContextConfig
7
+ from playwright.async_api import Browser as PlaywrightBrowser
8
+ from playwright.async_api import BrowserContext as PlaywrightBrowserContext
9
+ from typing import Optional
10
+ from browser_use.browser.context import BrowserContextState
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ class CustomBrowserContext(BrowserContext):
16
+ def __init__(
17
+ self,
18
+ browser: 'Browser',
19
+ config: BrowserContextConfig | None = None,
20
+ state: Optional[BrowserContextState] = None,
21
+ ):
22
+ super().__init__(browser=browser, config=config, state=state)
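CustomBrowserContext adds no behaviour of its own; the interesting piece is the dict merge in CustomBrowser.new_context above, where context-level settings win because they are unpacked last:

```python
# Context-level values override browser-level ones in {**a, **b} merges.
browser_config = {"window_width": 1280, "window_height": 720}
context_config = {"window_width": 1920}

merged = {**browser_config, **context_config}
assert merged == {"window_width": 1920, "window_height": 720}
```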
src/controller/__init__.py ADDED
File without changes
src/controller/custom_controller.py ADDED
@@ -0,0 +1,182 @@
2
+
4
+ from typing import Optional, Type, Callable, Dict, Any, Union, Awaitable, TypeVar
5
+ from pydantic import BaseModel
6
+ from browser_use.agent.views import ActionResult
7
+ from browser_use.browser.context import BrowserContext
8
+ from browser_use.controller.service import Controller, DoneAction
9
+ from browser_use.controller.registry.service import Registry, RegisteredAction
10
+ from main_content_extractor import MainContentExtractor
11
+ from browser_use.controller.views import (
12
+ ClickElementAction,
13
+ DoneAction,
14
+ ExtractPageContentAction,
15
+ GoToUrlAction,
16
+ InputTextAction,
17
+ OpenTabAction,
18
+ ScrollAction,
19
+ SearchGoogleAction,
20
+ SendKeysAction,
21
+ SwitchTabAction,
22
+ )
23
+ import logging
24
+ import inspect
25
+ import asyncio
26
+ import os
27
+ from langchain_core.language_models.chat_models import BaseChatModel
28
+ from browser_use.agent.views import ActionModel, ActionResult
29
+
30
+ from src.utils.mcp_client import create_tool_param_model, setup_mcp_client_and_tools
31
+
32
+ from browser_use.utils import time_execution_sync
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
+ Context = TypeVar('Context')
37
+
38
+
39
+ class CustomController(Controller):
40
+ def __init__(self, exclude_actions: list[str] = [],
41
+ output_model: Optional[Type[BaseModel]] = None,
42
+ ask_assistant_callback: Optional[Union[Callable[[str, BrowserContext], Dict[str, Any]], Callable[
43
+ [str, BrowserContext], Awaitable[Dict[str, Any]]]]] = None,
44
+ ):
45
+ super().__init__(exclude_actions=exclude_actions, output_model=output_model)
46
+ self._register_custom_actions()
47
+ self.ask_assistant_callback = ask_assistant_callback
48
+ self.mcp_client = None
49
+ self.mcp_server_config = None
50
+
51
+ def _register_custom_actions(self):
52
+ """Register all custom browser actions"""
53
+
54
+ @self.registry.action(
55
+ "When executing tasks, prioritize autonomous completion. However, if you encounter a definitive blocker "
56
+ "that prevents you from proceeding independently – such as needing credentials you don't possess, "
57
+ "requiring subjective human judgment, needing a physical action performed, encountering complex CAPTCHAs, "
58
+ "or facing limitations in your capabilities – you must request human assistance."
59
+ )
60
+ async def ask_for_assistant(query: str, browser: BrowserContext):
61
+ if self.ask_assistant_callback:
62
+ if inspect.iscoroutinefunction(self.ask_assistant_callback):
63
+ user_response = await self.ask_assistant_callback(query, browser)
64
+ else:
65
+ user_response = self.ask_assistant_callback(query, browser)
66
+ msg = f"AI ask: {query}. User response: {user_response['response']}"
67
+ logger.info(msg)
68
+ return ActionResult(extracted_content=msg, include_in_memory=True)
69
+ else:
70
+ return ActionResult(extracted_content="Human cannot help you. Please try another way.",
71
+ include_in_memory=True)
72
+
73
+ @self.registry.action(
74
+ 'Upload file to interactive element with file path',
75
+ )
76
+ async def upload_file(index: int, path: str, browser: BrowserContext, available_file_paths: list[str]):
77
+ if path not in available_file_paths:
78
+ return ActionResult(error=f'File path {path} is not available')
79
+
80
+ if not os.path.exists(path):
81
+ return ActionResult(error=f'File {path} does not exist')
82
+
83
+ dom_el = await browser.get_dom_element_by_index(index)
84
+
85
+ file_upload_dom_el = dom_el.get_file_upload_element()
86
+
87
+ if file_upload_dom_el is None:
88
+ msg = f'No file upload element found at index {index}'
89
+ logger.info(msg)
90
+ return ActionResult(error=msg)
91
+
92
+ file_upload_el = await browser.get_locate_element(file_upload_dom_el)
93
+
94
+ if file_upload_el is None:
95
+ msg = f'No file upload element found at index {index}'
96
+ logger.info(msg)
97
+ return ActionResult(error=msg)
98
+
99
+ try:
100
+ await file_upload_el.set_input_files(path)
101
+ msg = f'Successfully uploaded file to index {index}'
102
+ logger.info(msg)
103
+ return ActionResult(extracted_content=msg, include_in_memory=True)
104
+ except Exception as e:
105
+ msg = f'Failed to upload file to index {index}: {str(e)}'
106
+ logger.info(msg)
107
+ return ActionResult(error=msg)
108
+
109
+ @time_execution_sync('--act')
110
+ async def act(
111
+ self,
112
+ action: ActionModel,
113
+ browser_context: Optional[BrowserContext] = None,
114
+ #
115
+ page_extraction_llm: Optional[BaseChatModel] = None,
116
+ sensitive_data: Optional[Dict[str, str]] = None,
117
+ available_file_paths: Optional[list[str]] = None,
118
+ #
119
+ context: Context | None = None,
120
+ ) -> ActionResult:
121
+ """Execute an action"""
122
+
123
+ try:
124
+ for action_name, params in action.model_dump(exclude_unset=True).items():
125
+ if params is not None:
126
+ if action_name.startswith("mcp"):
127
+ # this is an MCP tool
128
+ logger.debug(f"Invoke MCP tool: {action_name}")
129
+ mcp_tool = self.registry.registry.actions.get(action_name).function
130
+ result = await mcp_tool.ainvoke(params)
131
+ else:
132
+ result = await self.registry.execute_action(
133
+ action_name,
134
+ params,
135
+ browser=browser_context,
136
+ page_extraction_llm=page_extraction_llm,
137
+ sensitive_data=sensitive_data,
138
+ available_file_paths=available_file_paths,
139
+ context=context,
140
+ )
141
+
142
+ if isinstance(result, str):
143
+ return ActionResult(extracted_content=result)
144
+ elif isinstance(result, ActionResult):
145
+ return result
146
+ elif result is None:
147
+ return ActionResult()
148
+ else:
149
+ raise ValueError(f'Invalid action result type: {type(result)} of {result}')
150
+ return ActionResult()
151
+ except Exception:
+ raise  # re-raise unchanged; kept as an explicit hook for future error handling
153
+
154
+ async def setup_mcp_client(self, mcp_server_config: Optional[Dict[str, Any]] = None):
155
+ self.mcp_server_config = mcp_server_config
156
+ if self.mcp_server_config:
157
+ self.mcp_client = await setup_mcp_client_and_tools(self.mcp_server_config)
158
+ self.register_mcp_tools()
159
+
160
+ def register_mcp_tools(self):
161
+ """
162
+ Register the MCP tools used by this controller.
163
+ """
164
+ if self.mcp_client:
165
+ for server_name in self.mcp_client.server_name_to_tools:
166
+ for tool in self.mcp_client.server_name_to_tools[server_name]:
167
+ tool_name = f"mcp.{server_name}.{tool.name}"
168
+ self.registry.registry.actions[tool_name] = RegisteredAction(
169
+ name=tool_name,
170
+ description=tool.description,
171
+ function=tool,
172
+ param_model=create_tool_param_model(tool),
173
+ )
174
+ logger.info(f"Added MCP tool: {tool_name}")
175
+ logger.debug(
176
+ f"Registered {len(self.mcp_client.server_name_to_tools[server_name])} mcp tools for {server_name}")
177
+ else:
178
+ logger.warning("MCP client not started.")
179
+
180
+ async def close_mcp_client(self):
181
+ if self.mcp_client:
182
+ await self.mcp_client.__aexit__(None, None, None)
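A usage sketch for the controller above: wiring a synchronous human-in-the-loop callback (an async callback also works, per the inspect.iscoroutinefunction check), with the MCP config shape implied by setup_mcp_client_and_tools. The callback contents are illustrative:

```python
from src.controller.custom_controller import CustomController

def ask_human(query: str, browser) -> dict:
    # A real UI would block on user input; ask_for_assistant above reads
    # the "response" key from the returned dict.
    return {"response": f"(auto-reply to: {query})"}

controller = CustomController(ask_assistant_callback=ask_human)

# MCP tools are registered as actions named "mcp.<server>.<tool>" after:
# await controller.setup_mcp_client({"mcpServers": {...}})
```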
src/utils/__init__.py ADDED
File without changes
src/utils/config.py ADDED
@@ -0,0 +1,100 @@
1
+ PROVIDER_DISPLAY_NAMES = {
2
+ "openai": "OpenAI",
3
+ "azure_openai": "Azure OpenAI",
4
+ "anthropic": "Anthropic",
5
+ "deepseek": "DeepSeek",
6
+ "google": "Google",
7
+ "alibaba": "Alibaba",
8
+ "moonshot": "MoonShot",
9
+ "unbound": "Unbound AI",
10
+ "ibm": "IBM",
11
+ "grok": "Grok",
12
+ }
13
+
14
+ # Predefined model names for common providers
15
+ model_names = {
16
+ "anthropic": ["claude-3-5-sonnet-20241022", "claude-3-5-sonnet-20240620", "claude-3-opus-20240229"],
17
+ "openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo", "o3-mini"],
18
+ "deepseek": ["deepseek-chat", "deepseek-reasoner"],
19
+ "google": ["gemini-2.0-flash", "gemini-2.0-flash-thinking-exp", "gemini-1.5-flash-latest",
20
+ "gemini-1.5-flash-8b-latest", "gemini-2.0-flash-thinking-exp-01-21", "gemini-2.0-pro-exp-02-05",
21
+ "gemini-2.5-pro-preview-03-25", "gemini-2.5-flash-preview-04-17"],
22
+ "ollama": ["qwen2.5:7b", "qwen2.5:14b", "qwen2.5:32b", "qwen2.5-coder:14b", "qwen2.5-coder:32b", "llama2:7b",
23
+ "deepseek-r1:14b", "deepseek-r1:32b"],
24
+ "azure_openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo"],
25
+ "mistral": ["pixtral-large-latest", "mistral-large-latest", "mistral-small-latest", "ministral-8b-latest"],
26
+ "alibaba": ["qwen-plus", "qwen-max", "qwen-vl-max", "qwen-vl-plus", "qwen-turbo", "qwen-long"],
27
+ "moonshot": ["moonshot-v1-32k-vision-preview", "moonshot-v1-8k-vision-preview"],
28
+ "unbound": ["gemini-2.0-flash", "gpt-4o-mini", "gpt-4o", "gpt-4.5-preview"],
29
+ "grok": [
30
+ "grok-3",
31
+ "grok-3-fast",
32
+ "grok-3-mini",
33
+ "grok-3-mini-fast",
34
+ "grok-2-vision",
35
+ "grok-2-image",
36
+ "grok-2",
37
+ ],
38
+ "siliconflow": [
39
+ "deepseek-ai/DeepSeek-R1",
40
+ "deepseek-ai/DeepSeek-V3",
41
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
42
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
43
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
44
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
45
+ "deepseek-ai/DeepSeek-V2.5",
46
+ "deepseek-ai/deepseek-vl2",
47
+ "Qwen/Qwen2.5-72B-Instruct-128K",
48
+ "Qwen/Qwen2.5-72B-Instruct",
49
+ "Qwen/Qwen2.5-32B-Instruct",
50
+ "Qwen/Qwen2.5-14B-Instruct",
51
+ "Qwen/Qwen2.5-7B-Instruct",
52
+ "Qwen/Qwen2.5-Coder-32B-Instruct",
53
+ "Qwen/Qwen2.5-Coder-7B-Instruct",
54
+ "Qwen/Qwen2-7B-Instruct",
55
+ "Qwen/Qwen2-1.5B-Instruct",
56
+ "Qwen/QwQ-32B-Preview",
57
+ "Qwen/Qwen2-VL-72B-Instruct",
58
+ "Qwen/Qwen2.5-VL-32B-Instruct",
59
+ "Qwen/Qwen2.5-VL-72B-Instruct",
60
+ "TeleAI/TeleChat2",
61
+ "THUDM/glm-4-9b-chat",
62
+ "Vendor-A/Qwen/Qwen2.5-72B-Instruct",
63
+ "internlm/internlm2_5-7b-chat",
64
+ "internlm/internlm2_5-20b-chat",
65
+ "Pro/Qwen/Qwen2.5-7B-Instruct",
66
+ "Pro/Qwen/Qwen2-7B-Instruct",
67
+ "Pro/Qwen/Qwen2-1.5B-Instruct",
68
+ "Pro/THUDM/chatglm3-6b",
69
+ "Pro/THUDM/glm-4-9b-chat",
70
+ ],
71
+ "ibm": ["ibm/granite-vision-3.1-2b-preview", "meta-llama/llama-4-maverick-17b-128e-instruct-fp8",
72
+ "meta-llama/llama-3-2-90b-vision-instruct"],
73
+ "modelscope":[
74
+ "Qwen/Qwen2.5-Coder-32B-Instruct",
75
+ "Qwen/Qwen2.5-Coder-14B-Instruct",
76
+ "Qwen/Qwen2.5-Coder-7B-Instruct",
77
+ "Qwen/Qwen2.5-72B-Instruct",
78
+ "Qwen/Qwen2.5-32B-Instruct",
79
+ "Qwen/Qwen2.5-14B-Instruct",
80
+ "Qwen/Qwen2.5-7B-Instruct",
81
+ "Qwen/QwQ-32B-Preview",
82
+ "Qwen/Qwen2.5-VL-3B-Instruct",
83
+ "Qwen/Qwen2.5-VL-7B-Instruct",
84
+ "Qwen/Qwen2.5-VL-32B-Instruct",
85
+ "Qwen/Qwen2.5-VL-72B-Instruct",
86
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
87
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
88
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
89
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
90
+ "deepseek-ai/DeepSeek-R1",
91
+ "deepseek-ai/DeepSeek-V3",
92
+ "Qwen/Qwen3-1.7B",
93
+ "Qwen/Qwen3-4B",
94
+ "Qwen/Qwen3-8B",
95
+ "Qwen/Qwen3-14B",
96
+ "Qwen/Qwen3-30B-A3B",
97
+ "Qwen/Qwen3-32B",
98
+ "Qwen/Qwen3-235B-A22B",
99
+ ],
100
+ }
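An example of consuming the tables above, for instance to populate a provider/model dropdown. Note that some providers (e.g. "ollama", "siliconflow", "modelscope") appear in model_names but not in PROVIDER_DISPLAY_NAMES, hence the fallback:

```python
from src.utils.config import PROVIDER_DISPLAY_NAMES, model_names

for provider, models in model_names.items():
    display = PROVIDER_DISPLAY_NAMES.get(provider, provider)
    print(f"{display}: {len(models)} predefined models, e.g. {models[0]}")
```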
src/utils/llm_provider.py ADDED
@@ -0,0 +1,354 @@
1
+ from openai import OpenAI
3
+ from langchain_openai import ChatOpenAI
4
+ from langchain_core.globals import get_llm_cache
5
+ from langchain_core.language_models.base import (
6
+ BaseLanguageModel,
7
+ LangSmithParams,
8
+ LanguageModelInput,
9
+ )
10
+ import os
11
+ from langchain_core.load import dumpd, dumps
12
+ from langchain_core.messages import (
13
+ AIMessage,
14
+ SystemMessage,
15
+ AnyMessage,
16
+ BaseMessage,
17
+ BaseMessageChunk,
18
+ HumanMessage,
19
+ convert_to_messages,
20
+ message_chunk_to_message,
21
+ )
22
+ from langchain_core.outputs import (
23
+ ChatGeneration,
24
+ ChatGenerationChunk,
25
+ ChatResult,
26
+ LLMResult,
27
+ RunInfo,
28
+ )
29
+ from langchain_ollama import ChatOllama
30
+ from langchain_core.output_parsers.base import OutputParserLike
31
+ from langchain_core.runnables import Runnable, RunnableConfig
32
+ from langchain_core.tools import BaseTool
33
+
34
+ from typing import (
35
+ TYPE_CHECKING,
36
+ Any,
37
+ Callable,
38
+ Literal,
39
+ Optional,
40
+ Union,
41
+ cast, List,
42
+ )
43
+ from langchain_anthropic import ChatAnthropic
44
+ from langchain_mistralai import ChatMistralAI
45
+ from langchain_google_genai import ChatGoogleGenerativeAI
47
+ from langchain_openai import AzureChatOpenAI, ChatOpenAI
48
+ from langchain_ibm import ChatWatsonx
49
+ from langchain_aws import ChatBedrock
50
+ from pydantic import SecretStr
51
+
52
+ from src.utils import config
53
+
54
+
55
+ class DeepSeekR1ChatOpenAI(ChatOpenAI):
56
+
57
+ def __init__(self, *args: Any, **kwargs: Any) -> None:
58
+ super().__init__(*args, **kwargs)
59
+ self.client = OpenAI(
60
+ base_url=kwargs.get("base_url"),
61
+ api_key=kwargs.get("api_key")
62
+ )
63
+
64
+ async def ainvoke(
65
+ self,
66
+ input: LanguageModelInput,
67
+ config: Optional[RunnableConfig] = None,
68
+ *,
69
+ stop: Optional[list[str]] = None,
70
+ **kwargs: Any,
71
+ ) -> AIMessage:
72
+ message_history = []
73
+ for input_ in input:
74
+ if isinstance(input_, SystemMessage):
75
+ message_history.append({"role": "system", "content": input_.content})
76
+ elif isinstance(input_, AIMessage):
77
+ message_history.append({"role": "assistant", "content": input_.content})
78
+ else:
79
+ message_history.append({"role": "user", "content": input_.content})
80
+
81
+ response = self.client.chat.completions.create(
82
+ model=self.model_name,
83
+ messages=message_history
84
+ )
85
+
86
+ reasoning_content = response.choices[0].message.reasoning_content
87
+ content = response.choices[0].message.content
88
+ return AIMessage(content=content, reasoning_content=reasoning_content)
89
+
90
+ def invoke(
91
+ self,
92
+ input: LanguageModelInput,
93
+ config: Optional[RunnableConfig] = None,
94
+ *,
95
+ stop: Optional[list[str]] = None,
96
+ **kwargs: Any,
97
+ ) -> AIMessage:
98
+ message_history = []
99
+ for input_ in input:
100
+ if isinstance(input_, SystemMessage):
101
+ message_history.append({"role": "system", "content": input_.content})
102
+ elif isinstance(input_, AIMessage):
103
+ message_history.append({"role": "assistant", "content": input_.content})
104
+ else:
105
+ message_history.append({"role": "user", "content": input_.content})
106
+
107
+ response = self.client.chat.completions.create(
108
+ model=self.model_name,
109
+ messages=message_history
110
+ )
111
+
112
+ reasoning_content = response.choices[0].message.reasoning_content
113
+ content = response.choices[0].message.content
114
+ return AIMessage(content=content, reasoning_content=reasoning_content)
115
+
116
+
117
+ class DeepSeekR1ChatOllama(ChatOllama):
118
+
119
+ async def ainvoke(
120
+ self,
121
+ input: LanguageModelInput,
122
+ config: Optional[RunnableConfig] = None,
123
+ *,
124
+ stop: Optional[list[str]] = None,
125
+ **kwargs: Any,
126
+ ) -> AIMessage:
127
+ org_ai_message = await super().ainvoke(input=input)
128
+ org_content = org_ai_message.content
129
+ # Guard against outputs that lack a </think> block (the bare [1] index would raise IndexError).
+ parts = org_content.split("</think>")
+ reasoning_content = parts[0].replace("<think>", "") if len(parts) > 1 else ""
+ content = parts[-1]
131
+ if "**JSON Response:**" in content:
132
+ content = content.split("**JSON Response:**")[-1]
133
+ return AIMessage(content=content, reasoning_content=reasoning_content)
134
+
135
+ def invoke(
136
+ self,
137
+ input: LanguageModelInput,
138
+ config: Optional[RunnableConfig] = None,
139
+ *,
140
+ stop: Optional[list[str]] = None,
141
+ **kwargs: Any,
142
+ ) -> AIMessage:
143
+ org_ai_message = super().invoke(input=input)
144
+ org_content = org_ai_message.content
145
+ # Guard against outputs that lack a </think> block (the bare [1] index would raise IndexError).
+ parts = org_content.split("</think>")
+ reasoning_content = parts[0].replace("<think>", "") if len(parts) > 1 else ""
+ content = parts[-1]
147
+ if "**JSON Response:**" in content:
148
+ content = content.split("**JSON Response:**")[-1]
149
+ return AIMessage(content=content, reasoning_content=reasoning_content)
150
+
151
+
152
+ def get_llm_model(provider: str, **kwargs):
+ """
+ Construct a LangChain chat model for the given provider.
+ :param provider: provider key, e.g. "openai", "anthropic", "ollama".
+ :param kwargs: model_name, temperature, base_url, api_key, num_ctx, etc.
+ :return: a configured chat model instance.
+ """
159
+ if provider not in ["ollama", "bedrock"]:
160
+ env_var = f"{provider.upper()}_API_KEY"
161
+ api_key = kwargs.get("api_key", "") or os.getenv(env_var, "")
162
+ if not api_key:
163
+ provider_display = config.PROVIDER_DISPLAY_NAMES.get(provider, provider.upper())
164
+ error_msg = f"💥 {provider_display} API key not found! 🔑 Please set the `{env_var}` environment variable or provide it in the UI."
165
+ raise ValueError(error_msg)
166
+ kwargs["api_key"] = api_key
167
+
168
+ if provider == "anthropic":
169
+ if not kwargs.get("base_url", ""):
170
+ base_url = "https://api.anthropic.com"
171
+ else:
172
+ base_url = kwargs.get("base_url")
173
+
174
+ return ChatAnthropic(
175
+ model=kwargs.get("model_name", "claude-3-5-sonnet-20241022"),
176
+ temperature=kwargs.get("temperature", 0.0),
177
+ base_url=base_url,
178
+ api_key=api_key,
179
+ )
180
+ elif provider == 'mistral':
181
+ if not kwargs.get("base_url", ""):
182
+ base_url = os.getenv("MISTRAL_ENDPOINT", "https://api.mistral.ai/v1")
183
+ else:
184
+ base_url = kwargs.get("base_url")
185
+ if not kwargs.get("api_key", ""):
186
+ api_key = os.getenv("MISTRAL_API_KEY", "")
187
+ else:
188
+ api_key = kwargs.get("api_key")
189
+
190
+ return ChatMistralAI(
191
+ model=kwargs.get("model_name", "mistral-large-latest"),
192
+ temperature=kwargs.get("temperature", 0.0),
193
+ base_url=base_url,
194
+ api_key=api_key,
195
+ )
196
+ elif provider == "openai":
197
+ if not kwargs.get("base_url", ""):
198
+ base_url = os.getenv("OPENAI_ENDPOINT", "https://api.openai.com/v1")
199
+ else:
200
+ base_url = kwargs.get("base_url")
201
+
202
+ return ChatOpenAI(
203
+ model=kwargs.get("model_name", "gpt-4o"),
204
+ temperature=kwargs.get("temperature", 0.0),
205
+ base_url=base_url,
206
+ api_key=api_key,
207
+ )
208
+ elif provider == "grok":
209
+ if not kwargs.get("base_url", ""):
210
+ base_url = os.getenv("GROK_ENDPOINT", "https://api.x.ai/v1")
211
+ else:
212
+ base_url = kwargs.get("base_url")
213
+
214
+ return ChatOpenAI(
215
+ model=kwargs.get("model_name", "grok-3"),
216
+ temperature=kwargs.get("temperature", 0.0),
217
+ base_url=base_url,
218
+ api_key=api_key,
219
+ )
220
+ elif provider == "deepseek":
221
+ if not kwargs.get("base_url", ""):
222
+ base_url = os.getenv("DEEPSEEK_ENDPOINT", "")
223
+ else:
224
+ base_url = kwargs.get("base_url")
225
+
226
+ if kwargs.get("model_name", "deepseek-chat") == "deepseek-reasoner":
227
+ return DeepSeekR1ChatOpenAI(
228
+ model=kwargs.get("model_name", "deepseek-reasoner"),
229
+ temperature=kwargs.get("temperature", 0.0),
230
+ base_url=base_url,
231
+ api_key=api_key,
232
+ )
233
+ else:
234
+ return ChatOpenAI(
235
+ model=kwargs.get("model_name", "deepseek-chat"),
236
+ temperature=kwargs.get("temperature", 0.0),
237
+ base_url=base_url,
238
+ api_key=api_key,
239
+ )
240
+ elif provider == "google":
241
+ return ChatGoogleGenerativeAI(
242
+ model=kwargs.get("model_name", "gemini-2.0-flash-exp"),
243
+ temperature=kwargs.get("temperature", 0.0),
244
+ api_key=api_key,
245
+ )
246
+ elif provider == "ollama":
247
+ if not kwargs.get("base_url", ""):
248
+ base_url = os.getenv("OLLAMA_ENDPOINT", "http://localhost:11434")
249
+ else:
250
+ base_url = kwargs.get("base_url")
251
+
252
+ if "deepseek-r1" in kwargs.get("model_name", "qwen2.5:7b"):
253
+ return DeepSeekR1ChatOllama(
254
+ model=kwargs.get("model_name", "deepseek-r1:14b"),
255
+ temperature=kwargs.get("temperature", 0.0),
256
+ num_ctx=kwargs.get("num_ctx", 32000),
257
+ base_url=base_url,
258
+ )
259
+ else:
260
+ return ChatOllama(
261
+ model=kwargs.get("model_name", "qwen2.5:7b"),
262
+ temperature=kwargs.get("temperature", 0.0),
263
+ num_ctx=kwargs.get("num_ctx", 32000),
264
+ num_predict=kwargs.get("num_predict", 1024),
265
+ base_url=base_url,
266
+ )
267
+ elif provider == "azure_openai":
268
+ if not kwargs.get("base_url", ""):
269
+ base_url = os.getenv("AZURE_OPENAI_ENDPOINT", "")
270
+ else:
271
+ base_url = kwargs.get("base_url")
272
+ api_version = kwargs.get("api_version", "") or os.getenv("AZURE_OPENAI_API_VERSION", "2025-01-01-preview")
273
+ return AzureChatOpenAI(
274
+ model=kwargs.get("model_name", "gpt-4o"),
275
+ temperature=kwargs.get("temperature", 0.0),
276
+ api_version=api_version,
277
+ azure_endpoint=base_url,
278
+ api_key=api_key,
279
+ )
280
+ elif provider == "alibaba":
281
+ if not kwargs.get("base_url", ""):
282
+ base_url = os.getenv("ALIBABA_ENDPOINT", "https://dashscope.aliyuncs.com/compatible-mode/v1")
283
+ else:
284
+ base_url = kwargs.get("base_url")
285
+
286
+ return ChatOpenAI(
287
+ model=kwargs.get("model_name", "qwen-plus"),
288
+ temperature=kwargs.get("temperature", 0.0),
289
+ base_url=base_url,
290
+ api_key=api_key,
291
+ )
292
+ elif provider == "ibm":
293
+ parameters = {
294
+ "temperature": kwargs.get("temperature", 0.0),
295
+ "max_tokens": kwargs.get("num_ctx", 32000)
296
+ }
297
+ if not kwargs.get("base_url", ""):
298
+ base_url = os.getenv("IBM_ENDPOINT", "https://us-south.ml.cloud.ibm.com")
299
+ else:
300
+ base_url = kwargs.get("base_url")
301
+
302
+ return ChatWatsonx(
303
+ model_id=kwargs.get("model_name", "ibm/granite-vision-3.1-2b-preview"),
304
+ url=base_url,
305
+ project_id=os.getenv("IBM_PROJECT_ID"),
306
+ apikey=os.getenv("IBM_API_KEY"),
307
+ params=parameters
308
+ )
309
+ elif provider == "moonshot":
310
+ return ChatOpenAI(
311
+ model=kwargs.get("model_name", "moonshot-v1-32k-vision-preview"),
312
+ temperature=kwargs.get("temperature", 0.0),
313
+ base_url=os.getenv("MOONSHOT_ENDPOINT"),
314
+ api_key=os.getenv("MOONSHOT_API_KEY"),
315
+ )
316
+ elif provider == "unbound":
317
+ return ChatOpenAI(
318
+ model=kwargs.get("model_name", "gpt-4o-mini"),
319
+ temperature=kwargs.get("temperature", 0.0),
320
+ base_url=os.getenv("UNBOUND_ENDPOINT", "https://api.getunbound.ai"),
321
+ api_key=api_key,
322
+ )
323
+ elif provider == "siliconflow":
324
+ if not kwargs.get("api_key", ""):
325
+ api_key = os.getenv("SiliconFLOW_API_KEY", "")
326
+ else:
327
+ api_key = kwargs.get("api_key")
328
+ if not kwargs.get("base_url", ""):
329
+ base_url = os.getenv("SiliconFLOW_ENDPOINT", "")
330
+ else:
331
+ base_url = kwargs.get("base_url")
332
+ return ChatOpenAI(
333
+ api_key=api_key,
334
+ base_url=base_url,
335
+ model_name=kwargs.get("model_name", "Qwen/QwQ-32B"),
336
+ temperature=kwargs.get("temperature", 0.0),
337
+ )
338
+ elif provider == "modelscope":
339
+ if not kwargs.get("api_key", ""):
340
+ api_key = os.getenv("MODELSCOPE_API_KEY", "")
341
+ else:
342
+ api_key = kwargs.get("api_key")
343
+ if not kwargs.get("base_url", ""):
344
+ base_url = os.getenv("MODELSCOPE_ENDPOINT", "")
345
+ else:
346
+ base_url = kwargs.get("base_url")
347
+ return ChatOpenAI(
348
+ api_key=api_key,
349
+ base_url=base_url,
350
+ model_name=kwargs.get("model_name", "Qwen/QwQ-32B"),
351
+ temperature=kwargs.get("temperature", 0.0),
352
+ )
353
+ else:
354
+ raise ValueError(f"Unsupported provider: {provider}")
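
For reference, a minimal usage sketch of the factory above (the `get_llm_model` name matches the call site in browser_use_agent_tab.py below; the specific model name and a locally running Ollama server are assumptions):

from src.utils import llm_provider

# Sketch only: pick any branch above; "ollama" is assumed to be running locally.
llm = llm_provider.get_llm_model(
    provider="ollama",
    model_name="qwen2.5:7b",   # per-provider default is used if omitted
    temperature=0.0,
    num_ctx=32000,             # only the ollama branches read num_ctx
)
print(llm.invoke("Say hi in one word.").content)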
src/utils/mcp_client.py ADDED
@@ -0,0 +1,254 @@
1
+ import inspect
2
+ import logging
3
+ import uuid
4
+ from datetime import date, datetime, time
5
+ from enum import Enum
6
+ from typing import Any, Dict, List, Optional, Set, Type, Union, get_type_hints
7
+
8
+ from browser_use.controller.registry.views import ActionModel
9
+ from langchain.tools import BaseTool
10
+ from langchain_mcp_adapters.client import MultiServerMCPClient
11
+ from pydantic import BaseModel, Field, create_model
12
+ from pydantic.v1 import BaseModel as BaseModelV1, Field as FieldV1  # aliased so they do not shadow the v2 imports above
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ async def setup_mcp_client_and_tools(mcp_server_config: Dict[str, Any]) -> Optional[MultiServerMCPClient]:
18
+ """
19
+ Initializes the MultiServerMCPClient and connects to the configured servers.
20
+
21
+ Returns:
22
+ MultiServerMCPClient | None: The initialized and started client
23
+ instance, or None on failure.
26
+ """
27
+
28
+ logger.info("Initializing MultiServerMCPClient...")
29
+
30
+ if not mcp_server_config:
31
+ logger.error("No MCP server configuration provided.")
32
+ return None
33
+
34
+ try:
35
+ if "mcpServers" in mcp_server_config:
36
+ mcp_server_config = mcp_server_config["mcpServers"]
37
+ client = MultiServerMCPClient(mcp_server_config)
38
+ await client.__aenter__()
39
+ return client
40
+
41
+ except Exception as e:
42
+ logger.error(f"Failed to setup MCP client or fetch tools: {e}", exc_info=True)
43
+ return None
44
+
45
+
46
+ def create_tool_param_model(tool: BaseTool) -> Type[BaseModel]:
47
+ """Creates a Pydantic model from a LangChain tool's schema"""
48
+
49
+ # Get tool schema information
50
+ json_schema = tool.args_schema
51
+ tool_name = tool.name
52
+
53
+ # If the tool already has a schema defined, convert it to a new param_model
54
+ if json_schema is not None:
55
+
56
+ # Create new parameter model
57
+ params = {}
58
+
59
+ # Process properties if they exist
60
+ if 'properties' in json_schema:
61
+ # Find required fields
62
+ required_fields: Set[str] = set(json_schema.get('required', []))
63
+
64
+ for prop_name, prop_details in json_schema['properties'].items():
65
+ field_type = resolve_type(prop_details, f"{tool_name}_{prop_name}")
66
+
67
+ # Check if parameter is required
68
+ is_required = prop_name in required_fields
69
+
70
+ # Get default value and description
71
+ default_value = prop_details.get('default', ... if is_required else None)
72
+ description = prop_details.get('description', '')
73
+
74
+ # Add field constraints
75
+ field_kwargs = {'default': default_value}
76
+ if description:
77
+ field_kwargs['description'] = description
78
+
79
+ # Add additional constraints if present
80
+ if 'minimum' in prop_details:
81
+ field_kwargs['ge'] = prop_details['minimum']
82
+ if 'maximum' in prop_details:
83
+ field_kwargs['le'] = prop_details['maximum']
84
+ if 'minLength' in prop_details:
85
+ field_kwargs['min_length'] = prop_details['minLength']
86
+ if 'maxLength' in prop_details:
87
+ field_kwargs['max_length'] = prop_details['maxLength']
88
+ if 'pattern' in prop_details:
89
+ field_kwargs['pattern'] = prop_details['pattern']
90
+
91
+ # Add to parameters dictionary
92
+ params[prop_name] = (field_type, Field(**field_kwargs))
93
+
94
+ return create_model(
95
+ f'{tool_name}_parameters',
96
+ __base__=ActionModel,
97
+ **params, # type: ignore
98
+ )
99
+
100
+ # If no schema is defined, extract parameters from the _run method
101
+ run_method = tool._run
102
+ sig = inspect.signature(run_method)
103
+
104
+ # Get type hints for better type information
105
+ try:
106
+ type_hints = get_type_hints(run_method)
107
+ except Exception:
108
+ type_hints = {}
109
+
110
+ params = {}
111
+ for name, param in sig.parameters.items():
112
+ # Skip 'self' parameter and any other parameters you want to exclude
113
+ if name == 'self':
114
+ continue
115
+
116
+ # Get annotation from type hints if available, otherwise from signature
117
+ annotation = type_hints.get(name, param.annotation)
118
+ if annotation == inspect.Parameter.empty:
119
+ annotation = Any
120
+
121
+ # Use default value if available, otherwise make it required
122
+ if param.default != param.empty:
123
+ params[name] = (annotation, param.default)
124
+ else:
125
+ params[name] = (annotation, ...)
126
+
127
+ return create_model(
128
+ f'{tool_name}_parameters',
129
+ __base__=ActionModel,
130
+ **params, # type: ignore
131
+ )
132
+
133
+
134
+ def resolve_type(prop_details: Dict[str, Any], prefix: str = "") -> Any:
135
+ """Recursively resolves JSON schema type to Python/Pydantic type"""
136
+
137
+ # Handle reference types
138
+ if '$ref' in prop_details:
139
+ # In a real application, reference resolution would be needed
140
+ return Any
141
+
142
+ # Basic type mapping
143
+ type_mapping = {
144
+ 'string': str,
145
+ 'integer': int,
146
+ 'number': float,
147
+ 'boolean': bool,
148
+ 'array': List,
149
+ 'object': Dict,
150
+ 'null': type(None),
151
+ }
152
+
153
+ # Handle formatted strings
154
+ if prop_details.get('type') == 'string' and 'format' in prop_details:
155
+ format_mapping = {
156
+ 'date-time': datetime,
157
+ 'date': date,
158
+ 'time': time,
159
+ 'email': str,
160
+ 'uri': str,
161
+ 'url': str,
162
+ 'uuid': uuid.UUID,
163
+ 'binary': bytes,
164
+ }
165
+ return format_mapping.get(prop_details['format'], str)
166
+
167
+ # Handle enum types
168
+ if 'enum' in prop_details:
169
+ enum_values = prop_details['enum']
170
+ # Create dynamic enum class with safe names
171
+ enum_dict = {}
172
+ for i, v in enumerate(enum_values):
173
+ # Ensure enum names are valid Python identifiers
174
+ if isinstance(v, str):
175
+ key = v.upper().replace(' ', '_').replace('-', '_')
176
+ if not key.isidentifier():
177
+ key = f"VALUE_{i}"
178
+ else:
179
+ key = f"VALUE_{i}"
180
+ enum_dict[key] = v
181
+
182
+ # Only create enum if we have values
183
+ if enum_dict:
184
+ return Enum(f"{prefix}_Enum", enum_dict)
185
+ return str # Fallback
186
+
187
+ # Handle array types
188
+ if prop_details.get('type') == 'array' and 'items' in prop_details:
189
+ item_type = resolve_type(prop_details['items'], f"{prefix}_item")
190
+ return List[item_type] # type: ignore
191
+
192
+ # Handle object types with properties
193
+ if prop_details.get('type') == 'object' and 'properties' in prop_details:
194
+ nested_params = {}
195
+ for nested_name, nested_details in prop_details['properties'].items():
196
+ nested_type = resolve_type(nested_details, f"{prefix}_{nested_name}")
197
+ # Get required field info
198
+ required_fields = prop_details.get('required', [])
199
+ is_required = nested_name in required_fields
200
+ default_value = nested_details.get('default', ... if is_required else None)
201
+ description = nested_details.get('description', '')
202
+
203
+ field_kwargs = {'default': default_value}
204
+ if description:
205
+ field_kwargs['description'] = description
206
+
207
+ nested_params[nested_name] = (nested_type, Field(**field_kwargs))
208
+
209
+ # Create nested model
210
+ nested_model = create_model(f"{prefix}_Model", **nested_params)
211
+ return nested_model
212
+
213
+ # Handle union types (oneOf, anyOf)
214
+ if 'oneOf' in prop_details or 'anyOf' in prop_details:
215
+ union_schema = prop_details.get('oneOf') or prop_details.get('anyOf')
216
+ union_types = []
217
+ for i, t in enumerate(union_schema):
218
+ union_types.append(resolve_type(t, f"{prefix}_{i}"))
219
+
220
+ if union_types:
221
+ return Union.__getitem__(tuple(union_types)) # type: ignore
222
+ return Any
223
+
224
+ # Handle allOf (intersection types)
225
+ if 'allOf' in prop_details:
226
+ nested_params = {}
227
+ for i, schema_part in enumerate(prop_details['allOf']):
228
+ if 'properties' in schema_part:
229
+ for nested_name, nested_details in schema_part['properties'].items():
230
+ nested_type = resolve_type(nested_details, f"{prefix}_allOf_{i}_{nested_name}")
231
+ # Check if required
232
+ required_fields = schema_part.get('required', [])
233
+ is_required = nested_name in required_fields
234
+ nested_params[nested_name] = (nested_type, ... if is_required else None)
235
+
236
+ # Create composite model
237
+ if nested_params:
238
+ composite_model = create_model(f"{prefix}_CompositeModel", **nested_params)
239
+ return composite_model
240
+ return Dict
241
+
242
+ # Default to basic types
243
+ schema_type = prop_details.get('type', 'string')
244
+ if isinstance(schema_type, list):
245
+ # Handle multiple types (e.g., ["string", "null"])
246
+ non_null_types = [t for t in schema_type if t != 'null']
247
+ if non_null_types:
248
+ primary_type = type_mapping.get(non_null_types[0], Any)
249
+ if 'null' in schema_type:
250
+ return Optional[primary_type] # type: ignore
251
+ return primary_type
252
+ return Any
253
+
254
+ return type_mapping.get(schema_type, Any)
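
A short sketch of `create_tool_param_model` on a dict-style schema (the tool, its fields, and the duck-typed stand-in are all hypothetical, used only to illustrate the mapping above):

from types import SimpleNamespace

from src.utils.mcp_client import create_tool_param_model

# Hypothetical tool whose args_schema is a plain JSON-schema dict, as MCP
# adapters produce; SimpleNamespace duck-types the BaseTool attributes used.
echo_tool = SimpleNamespace(
    name="echo",
    args_schema={
        "properties": {
            "message": {"type": "string", "minLength": 1, "description": "Text to echo"},
            "repeat": {"type": "integer", "minimum": 1, "default": 1},
        },
        "required": ["message"],
    },
)

EchoParams = create_tool_param_model(echo_tool)  # type: ignore[arg-type]
print(EchoParams(message="hi").model_dump())     # {'message': 'hi', 'repeat': 1}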
src/utils/utils.py ADDED
@@ -0,0 +1,39 @@
1
+ import base64
2
+ import os
3
+ import time
4
+ from pathlib import Path
5
+ from typing import Dict, Optional
6
+ import requests
7
+ import json
8
+ import gradio as gr
9
+ import uuid
10
+
11
+
12
+ def encode_image(img_path):
13
+ if not img_path:
14
+ return None
15
+ with open(img_path, "rb") as fin:
16
+ image_data = base64.b64encode(fin.read()).decode("utf-8")
17
+ return image_data
18
+
19
+
20
+ def get_latest_files(directory: str, file_types: tuple = ('.webm', '.zip')) -> Dict[str, Optional[str]]:
21
+ """Get the latest recording and trace files"""
22
+ latest_files: Dict[str, Optional[str]] = {ext: None for ext in file_types}
23
+
24
+ if not os.path.exists(directory):
25
+ os.makedirs(directory, exist_ok=True)
26
+ return latest_files
27
+
28
+ for file_type in file_types:
29
+ try:
30
+ matches = list(Path(directory).rglob(f"*{file_type}"))
31
+ if matches:
32
+ latest = max(matches, key=lambda p: p.stat().st_mtime)
33
+ # Only return files that are complete (not being written)
34
+ if time.time() - latest.stat().st_mtime > 1.0:
35
+ latest_files[file_type] = str(latest)
36
+ except Exception as e:
37
+ print(f"Error getting latest {file_type} file: {e}")
38
+
39
+ return latest_files
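
Usage sketch: the 1.0-second mtime check above is a settling heuristic, so a file still being written is skipped until a later poll.

from src.utils.utils import get_latest_files

latest = get_latest_files("./tmp")        # defaults to ('.webm', '.zip')
if latest.get(".webm"):
    print(f"Newest finished recording: {latest['.webm']}")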
src/webui/__init__.py ADDED
File without changes
src/webui/components/__init__.py ADDED
File without changes
src/webui/components/agent_settings_tab.py ADDED
@@ -0,0 +1,269 @@
1
+ import json
2
+ import os
3
+
4
+ import gradio as gr
5
+ from gradio.components import Component
6
+ from typing import Any, Dict, Optional
7
+ from src.webui.webui_manager import WebuiManager
8
+ from src.utils import config
9
+ import logging
10
+ from functools import partial
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ def update_model_dropdown(llm_provider):
16
+ """
17
+ Update the model name dropdown with predefined models for the selected provider.
18
+ """
19
+ # Use predefined models for the selected provider
20
+ if llm_provider in config.model_names:
21
+ return gr.Dropdown(choices=config.model_names[llm_provider], value=config.model_names[llm_provider][0],
22
+ interactive=True)
23
+ else:
24
+ return gr.Dropdown(choices=[], value="", interactive=True, allow_custom_value=True)
25
+
26
+
27
+ async def update_mcp_server(mcp_file: str, webui_manager: WebuiManager):
28
+ """
29
+ Update the MCP server.
30
+ """
31
+ if hasattr(webui_manager, "bu_controller") and webui_manager.bu_controller:
32
+ logger.warning("⚠️ Close controller because mcp file has changed!")
33
+ await webui_manager.bu_controller.close_mcp_client()
34
+ webui_manager.bu_controller = None
35
+
36
+ if not mcp_file or not os.path.exists(mcp_file) or not mcp_file.endswith('.json'):
37
+ logger.warning(f"{mcp_file} is not a valid MCP file.")
38
+ return None, gr.update(visible=False)
39
+
40
+ with open(mcp_file, 'r') as f:
41
+ mcp_server = json.load(f)
42
+
43
+ return json.dumps(mcp_server, indent=2), gr.update(visible=True)
44
+
45
+
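# The uploaded file is expected to carry the usual "mcpServers" layout that
# setup_mcp_client_and_tools unwraps; an illustrative config (server name,
# command, and args are placeholders, not part of the commit):
example_mcp_config = {
    "mcpServers": {
        "filesystem": {
            "command": "npx",
            "args": ["-y", "@modelcontextprotocol/server-filesystem", "/tmp"],
        }
    }
}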
46
+ def create_agent_settings_tab(webui_manager: WebuiManager):
47
+ """
48
+ Creates an agent settings tab.
49
+ """
50
+ input_components = set(webui_manager.get_components())
51
+ tab_components = {}
52
+
53
+ with gr.Group():
54
+ with gr.Column():
55
+ override_system_prompt = gr.Textbox(label="Override system prompt", lines=4, interactive=True)
56
+ extend_system_prompt = gr.Textbox(label="Extend system prompt", lines=4, interactive=True)
57
+
58
+ with gr.Group():
59
+ mcp_json_file = gr.File(label="MCP server json", interactive=True, file_types=[".json"])
60
+ mcp_server_config = gr.Textbox(label="MCP server", lines=6, interactive=True, visible=False)
61
+
62
+ with gr.Group():
63
+ with gr.Row():
64
+ llm_provider = gr.Dropdown(
65
+ choices=[provider for provider, model in config.model_names.items()],
66
+ label="LLM Provider",
67
+ value=os.getenv("DEFAULT_LLM", "openai"),
68
+ info="Select LLM provider for LLM",
69
+ interactive=True
70
+ )
71
+ llm_model_name = gr.Dropdown(
72
+ label="LLM Model Name",
73
+ choices=config.model_names[os.getenv("DEFAULT_LLM", "openai")],
74
+ value=config.model_names[os.getenv("DEFAULT_LLM", "openai")][0],
75
+ interactive=True,
76
+ allow_custom_value=True,
77
+ info="Select a model in the dropdown options or directly type a custom model name"
78
+ )
79
+ with gr.Row():
80
+ llm_temperature = gr.Slider(
81
+ minimum=0.0,
82
+ maximum=2.0,
83
+ value=0.6,
84
+ step=0.1,
85
+ label="LLM Temperature",
86
+ info="Controls randomness in model outputs",
87
+ interactive=True
88
+ )
89
+
90
+ use_vision = gr.Checkbox(
91
+ label="Use Vision",
92
+ value=True,
93
+ info="Enable Vision(Input highlighted screenshot into LLM)",
94
+ interactive=True
95
+ )
96
+
97
+ ollama_num_ctx = gr.Slider(
98
+ minimum=2 ** 8,
99
+ maximum=2 ** 16,
100
+ value=16000,
101
+ step=1,
102
+ label="Ollama Context Length",
103
+ info="Controls max context length model needs to handle (less = faster)",
104
+ visible=False,
105
+ interactive=True
106
+ )
107
+
108
+ with gr.Row():
109
+ llm_base_url = gr.Textbox(
110
+ label="Base URL",
111
+ value="",
112
+ info="API endpoint URL (if required)"
113
+ )
114
+ llm_api_key = gr.Textbox(
115
+ label="API Key",
116
+ type="password",
117
+ value="",
118
+ info="Your API key (leave blank to use .env)"
119
+ )
120
+
121
+ with gr.Group():
122
+ with gr.Row():
123
+ planner_llm_provider = gr.Dropdown(
124
+ choices=[provider for provider, model in config.model_names.items()],
125
+ label="Planner LLM Provider",
126
+ info="Select LLM provider for LLM",
127
+ value=None,
128
+ interactive=True
129
+ )
130
+ planner_llm_model_name = gr.Dropdown(
131
+ label="Planner LLM Model Name",
132
+ interactive=True,
133
+ allow_custom_value=True,
134
+ info="Select a model in the dropdown options or directly type a custom model name"
135
+ )
136
+ with gr.Row():
137
+ planner_llm_temperature = gr.Slider(
138
+ minimum=0.0,
139
+ maximum=2.0,
140
+ value=0.6,
141
+ step=0.1,
142
+ label="Planner LLM Temperature",
143
+ info="Controls randomness in model outputs",
144
+ interactive=True
145
+ )
146
+
147
+ planner_use_vision = gr.Checkbox(
148
+ label="Use Vision(Planner LLM)",
149
+ value=False,
150
+ info="Enable Vision(Input highlighted screenshot into LLM)",
151
+ interactive=True
152
+ )
153
+
154
+ planner_ollama_num_ctx = gr.Slider(
155
+ minimum=2 ** 8,
156
+ maximum=2 ** 16,
157
+ value=16000,
158
+ step=1,
159
+ label="Ollama Context Length",
160
+ info="Controls max context length model needs to handle (less = faster)",
161
+ visible=False,
162
+ interactive=True
163
+ )
164
+
165
+ with gr.Row():
166
+ planner_llm_base_url = gr.Textbox(
167
+ label="Base URL",
168
+ value="",
169
+ info="API endpoint URL (if required)"
170
+ )
171
+ planner_llm_api_key = gr.Textbox(
172
+ label="API Key",
173
+ type="password",
174
+ value="",
175
+ info="Your API key (leave blank to use .env)"
176
+ )
177
+
178
+ with gr.Row():
179
+ max_steps = gr.Slider(
180
+ minimum=1,
181
+ maximum=1000,
182
+ value=100,
183
+ step=1,
184
+ label="Max Run Steps",
185
+ info="Maximum number of steps the agent will take",
186
+ interactive=True
187
+ )
188
+ max_actions = gr.Slider(
189
+ minimum=1,
190
+ maximum=100,
191
+ value=10,
192
+ step=1,
193
+ label="Max Number of Actions",
194
+ info="Maximum number of actions the agent will take per step",
195
+ interactive=True
196
+ )
197
+
198
+ with gr.Row():
199
+ max_input_tokens = gr.Number(
200
+ label="Max Input Tokens",
201
+ value=128000,
202
+ precision=0,
203
+ interactive=True
204
+ )
205
+ tool_calling_method = gr.Dropdown(
206
+ label="Tool Calling Method",
207
+ value="auto",
208
+ interactive=True,
209
+ allow_custom_value=True,
210
+ choices=['function_calling', 'json_mode', 'raw', 'auto', 'tools', "None"],
211
+ visible=True
212
+ )
213
+ tab_components.update(dict(
214
+ override_system_prompt=override_system_prompt,
215
+ extend_system_prompt=extend_system_prompt,
216
+ llm_provider=llm_provider,
217
+ llm_model_name=llm_model_name,
218
+ llm_temperature=llm_temperature,
219
+ use_vision=use_vision,
220
+ ollama_num_ctx=ollama_num_ctx,
221
+ llm_base_url=llm_base_url,
222
+ llm_api_key=llm_api_key,
223
+ planner_llm_provider=planner_llm_provider,
224
+ planner_llm_model_name=planner_llm_model_name,
225
+ planner_llm_temperature=planner_llm_temperature,
226
+ planner_use_vision=planner_use_vision,
227
+ planner_ollama_num_ctx=planner_ollama_num_ctx,
228
+ planner_llm_base_url=planner_llm_base_url,
229
+ planner_llm_api_key=planner_llm_api_key,
230
+ max_steps=max_steps,
231
+ max_actions=max_actions,
232
+ max_input_tokens=max_input_tokens,
233
+ tool_calling_method=tool_calling_method,
234
+ mcp_json_file=mcp_json_file,
235
+ mcp_server_config=mcp_server_config,
236
+ ))
237
+ webui_manager.add_components("agent_settings", tab_components)
238
+
239
+ llm_provider.change(
240
+ fn=lambda x: gr.update(visible=x == "ollama"),
241
+ inputs=llm_provider,
242
+ outputs=ollama_num_ctx
243
+ )
244
+ llm_provider.change(
245
+ lambda provider: update_model_dropdown(provider),
246
+ inputs=[llm_provider],
247
+ outputs=[llm_model_name]
248
+ )
249
+ planner_llm_provider.change(
250
+ fn=lambda x: gr.update(visible=x == "ollama"),
251
+ inputs=[planner_llm_provider],
252
+ outputs=[planner_ollama_num_ctx]
253
+ )
254
+ planner_llm_provider.change(
255
+ lambda provider: update_model_dropdown(provider),
256
+ inputs=[planner_llm_provider],
257
+ outputs=[planner_llm_model_name]
258
+ )
259
+
260
+ async def update_wrapper(mcp_file):
261
+ """Wrapper for handle_pause_resume."""
262
+ update_dict = await update_mcp_server(mcp_file, webui_manager)
263
+ yield update_dict
264
+
265
+ mcp_json_file.change(
266
+ update_wrapper,
267
+ inputs=[mcp_json_file],
268
+ outputs=[mcp_server_config, mcp_server_config]
269
+ )
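
A sketch of mounting this tab in a standalone Blocks app (assumes WebuiManager can be constructed with no arguments; the actual entry point may differ):

import gradio as gr

from src.webui.webui_manager import WebuiManager
from src.webui.components.agent_settings_tab import create_agent_settings_tab

manager = WebuiManager()  # assumption: no-arg construction
with gr.Blocks() as demo:
    with gr.Tab("Agent Settings"):
        create_agent_settings_tab(manager)
demo.launch()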
src/webui/components/browser_settings_tab.py ADDED
@@ -0,0 +1,161 @@
1
+ import os
2
+ from distutils.util import strtobool
3
+ import gradio as gr
4
+ import logging
5
+ from gradio.components import Component
6
+
7
+ from src.webui.webui_manager import WebuiManager
8
+ from src.utils import config
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+ async def close_browser(webui_manager: WebuiManager):
13
+ """
14
+ Close browser
15
+ """
16
+ if webui_manager.bu_current_task and not webui_manager.bu_current_task.done():
17
+ webui_manager.bu_current_task.cancel()
18
+ webui_manager.bu_current_task = None
19
+
20
+ if webui_manager.bu_browser_context:
21
+ logger.info("⚠️ Closing browser context when changing browser config.")
22
+ await webui_manager.bu_browser_context.close()
23
+ webui_manager.bu_browser_context = None
24
+
25
+ if webui_manager.bu_browser:
26
+ logger.info("⚠️ Closing browser when changing browser config.")
27
+ await webui_manager.bu_browser.close()
28
+ webui_manager.bu_browser = None
29
+
30
+ def create_browser_settings_tab(webui_manager: WebuiManager):
31
+ """
32
+ Creates a browser settings tab.
33
+ """
34
+ input_components = set(webui_manager.get_components())
35
+ tab_components = {}
36
+
37
+ with gr.Group():
38
+ with gr.Row():
39
+ browser_binary_path = gr.Textbox(
40
+ label="Browser Binary Path",
41
+ lines=1,
42
+ interactive=True,
43
+ placeholder="e.g. '/Applications/Google\\ Chrome.app/Contents/MacOS/Google\\ Chrome'"
44
+ )
45
+ browser_user_data_dir = gr.Textbox(
46
+ label="Browser User Data Dir",
47
+ lines=1,
48
+ interactive=True,
49
+ placeholder="Leave it empty if you use your default user data",
50
+ )
51
+ with gr.Group():
52
+ with gr.Row():
53
+ use_own_browser = gr.Checkbox(
54
+ label="Use Own Browser",
55
+ value=bool(strtobool(os.getenv("USE_OWN_BROWSER", "false"))),
56
+ info="Use your existing browser instance",
57
+ interactive=True
58
+ )
59
+ keep_browser_open = gr.Checkbox(
60
+ label="Keep Browser Open",
61
+ value=bool(strtobool(os.getenv("KEEP_BROWSER_OPEN", "true"))),
62
+ info="Keep Browser Open between Tasks",
63
+ interactive=True
64
+ )
65
+ headless = gr.Checkbox(
66
+ label="Headless Mode",
67
+ value=False,
68
+ info="Run browser without GUI",
69
+ interactive=True
70
+ )
71
+ disable_security = gr.Checkbox(
72
+ label="Disable Security",
73
+ value=False,
74
+ info="Disable browser security",
75
+ interactive=True
76
+ )
77
+
78
+ with gr.Group():
79
+ with gr.Row():
80
+ window_w = gr.Number(
81
+ label="Window Width",
82
+ value=1280,
83
+ info="Browser window width",
84
+ interactive=True
85
+ )
86
+ window_h = gr.Number(
87
+ label="Window Height",
88
+ value=1100,
89
+ info="Browser window height",
90
+ interactive=True
91
+ )
92
+ with gr.Group():
93
+ with gr.Row():
94
+ cdp_url = gr.Textbox(
95
+ label="CDP URL",
96
+ value=os.getenv("BROWSER_CDP", None),
97
+ info="CDP URL for browser remote debugging",
98
+ interactive=True,
99
+ )
100
+ wss_url = gr.Textbox(
101
+ label="WSS URL",
102
+ info="WSS URL for browser remote debugging",
103
+ interactive=True,
104
+ )
105
+ with gr.Group():
106
+ with gr.Row():
107
+ save_recording_path = gr.Textbox(
108
+ label="Recording Path",
109
+ placeholder="e.g. ./tmp/record_videos",
110
+ info="Path to save browser recordings",
111
+ interactive=True,
112
+ )
113
+
114
+ save_trace_path = gr.Textbox(
115
+ label="Trace Path",
116
+ placeholder="e.g. ./tmp/traces",
117
+ info="Path to save Agent traces",
118
+ interactive=True,
119
+ )
120
+
121
+ with gr.Row():
122
+ save_agent_history_path = gr.Textbox(
123
+ label="Agent History Save Path",
124
+ value="./tmp/agent_history",
125
+ info="Specify the directory where agent history should be saved.",
126
+ interactive=True,
127
+ )
128
+ save_download_path = gr.Textbox(
129
+ label="Save Directory for browser downloads",
130
+ value="./tmp/downloads",
131
+ info="Specify the directory where downloaded files should be saved.",
132
+ interactive=True,
133
+ )
134
+ tab_components.update(
135
+ dict(
136
+ browser_binary_path=browser_binary_path,
137
+ browser_user_data_dir=browser_user_data_dir,
138
+ use_own_browser=use_own_browser,
139
+ keep_browser_open=keep_browser_open,
140
+ headless=headless,
141
+ disable_security=disable_security,
142
+ save_recording_path=save_recording_path,
143
+ save_trace_path=save_trace_path,
144
+ save_agent_history_path=save_agent_history_path,
145
+ save_download_path=save_download_path,
146
+ cdp_url=cdp_url,
147
+ wss_url=wss_url,
148
+ window_h=window_h,
149
+ window_w=window_w,
150
+ )
151
+ )
152
+ webui_manager.add_components("browser_settings", tab_components)
153
+
154
+ async def close_wrapper():
155
+ """Wrapper for handle_clear."""
156
+ await close_browser(webui_manager)
157
+
158
+ headless.change(close_wrapper)
159
+ keep_browser_open.change(close_wrapper)
160
+ disable_security.change(close_wrapper)
161
+ use_own_browser.change(close_wrapper)
src/webui/components/browser_use_agent_tab.py ADDED
@@ -0,0 +1,1083 @@
1
+ import asyncio
2
+ import json
3
+ import logging
4
+ import os
5
+ import uuid
6
+ from typing import Any, AsyncGenerator, Dict, Optional
7
+
8
+ import gradio as gr
9
+
10
+ # from browser_use.agent.service import Agent
11
+ from browser_use.agent.views import (
12
+ AgentHistoryList,
13
+ AgentOutput,
14
+ )
15
+ from browser_use.browser.browser import BrowserConfig
16
+ from browser_use.browser.context import BrowserContext, BrowserContextConfig
17
+ from browser_use.browser.views import BrowserState
18
+ from gradio.components import Component
19
+ from langchain_core.language_models.chat_models import BaseChatModel
20
+
21
+ from src.agent.browser_use.browser_use_agent import BrowserUseAgent
22
+ from src.browser.custom_browser import CustomBrowser
23
+ from src.controller.custom_controller import CustomController
24
+ from src.utils import llm_provider
25
+ from src.webui.webui_manager import WebuiManager
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
+ # --- Helper Functions --- (Defined at module level)
31
+
32
+
33
+ async def _initialize_llm(
34
+ provider: Optional[str],
35
+ model_name: Optional[str],
36
+ temperature: float,
37
+ base_url: Optional[str],
38
+ api_key: Optional[str],
39
+ num_ctx: Optional[int] = None,
40
+ ) -> Optional[BaseChatModel]:
41
+ """Initializes the LLM based on settings. Returns None if provider/model is missing."""
42
+ if not provider or not model_name:
43
+ logger.info("LLM Provider or Model Name not specified, LLM will be None.")
44
+ return None
45
+ try:
46
+ # Use your actual LLM provider logic here
47
+ logger.info(
48
+ f"Initializing LLM: Provider={provider}, Model={model_name}, Temp={temperature}"
49
+ )
50
+ # Example using a placeholder function
51
+ llm = llm_provider.get_llm_model(
52
+ provider=provider,
53
+ model_name=model_name,
54
+ temperature=temperature,
55
+ base_url=base_url or None,
56
+ api_key=api_key or None,
57
+ # Add other relevant params like num_ctx for ollama
58
+ num_ctx=num_ctx if provider == "ollama" else None,
59
+ )
60
+ return llm
61
+ except Exception as e:
62
+ logger.error(f"Failed to initialize LLM: {e}", exc_info=True)
63
+ gr.Warning(
64
+ f"Failed to initialize LLM '{model_name}' for provider '{provider}'. Please check settings. Error: {e}"
65
+ )
66
+ return None
67
+
68
+
69
+ def _get_config_value(
70
+ webui_manager: WebuiManager,
71
+ comp_dict: Dict[gr.components.Component, Any],
72
+ comp_id_suffix: str,
73
+ default: Any = None,
74
+ ) -> Any:
75
+ """Safely get value from component dictionary using its ID suffix relative to the tab."""
76
+ # Assumes component ID format is "tab_name.comp_name"
77
+ tab_name = "browser_use_agent" # Hardcode or derive if needed
78
+ comp_id = f"{tab_name}.{comp_id_suffix}"
79
+ # Need to find the component object first using the ID from the manager
80
+ try:
81
+ comp = webui_manager.get_component_by_id(comp_id)
82
+ return comp_dict.get(comp, default)
83
+ except KeyError:
84
+ # Try accessing settings tabs as well
85
+ for prefix in ["agent_settings", "browser_settings"]:
86
+ try:
87
+ comp_id = f"{prefix}.{comp_id_suffix}"
88
+ comp = webui_manager.get_component_by_id(comp_id)
89
+ return comp_dict.get(comp, default)
90
+ except KeyError:
91
+ continue
92
+ logger.warning(
93
+ f"Component with suffix '{comp_id_suffix}' not found in manager for value lookup."
94
+ )
95
+ return default
96
+
97
+
98
+ def _format_agent_output(model_output: AgentOutput) -> str:
99
+ """Formats AgentOutput for display in the chatbot using JSON."""
100
+ content = ""
101
+ if model_output:
102
+ try:
103
+ # Directly use model_dump if actions and current_state are Pydantic models
104
+ action_dump = [
105
+ action.model_dump(exclude_none=True) for action in model_output.action
106
+ ]
107
+
108
+ state_dump = model_output.current_state.model_dump(exclude_none=True)
109
+ model_output_dump = {
110
+ "current_state": state_dump,
111
+ "action": action_dump,
112
+ }
113
+ # Dump to JSON string with indentation
114
+ json_string = json.dumps(model_output_dump, indent=4, ensure_ascii=False)
115
+ # Wrap in <pre><code> for proper display in HTML
116
+ content = f"<pre><code class='language-json'>{json_string}</code></pre>"
117
+
118
+ except AttributeError as ae:
119
+ logger.error(
120
+ f"AttributeError during model dump: {ae}. Check if 'action' or 'current_state' or their items support 'model_dump'."
121
+ )
122
+ content = f"<pre><code>Error: Could not format agent output (AttributeError: {ae}).\nRaw output: {str(model_output)}</code></pre>"
123
+ except Exception as e:
124
+ logger.error(f"Error formatting agent output: {e}", exc_info=True)
125
+ # Fallback to simple string representation on error
126
+ content = f"<pre><code>Error formatting agent output.\nRaw output:\n{str(model_output)}</code></pre>"
127
+
128
+ return content.strip()
129
+
130
+
131
+ # --- Updated Callback Implementation ---
132
+
133
+
134
+ async def _handle_new_step(
135
+ webui_manager: WebuiManager, state: BrowserState, output: AgentOutput, step_num: int
136
+ ):
137
+ """Callback for each step taken by the agent, including screenshot display."""
138
+
139
+ # Use the correct chat history attribute name from the user's code
140
+ if not hasattr(webui_manager, "bu_chat_history"):
141
+ logger.error(
142
+ "Attribute 'bu_chat_history' not found in webui_manager! Cannot add chat message."
143
+ )
144
+ # Initialize it maybe? Or raise an error? For now, log and potentially skip chat update.
145
+ webui_manager.bu_chat_history = [] # Initialize if missing (consider if this is the right place)
146
+ # return # Or stop if this is critical
147
+ step_num -= 1
148
+ logger.info(f"Step {step_num} completed.")
149
+
150
+ # --- Screenshot Handling ---
151
+ screenshot_html = ""
152
+ # Ensure state.screenshot exists and is not empty before proceeding
153
+ # Use getattr for safer access
154
+ screenshot_data = getattr(state, "screenshot", None)
155
+ if screenshot_data:
156
+ try:
157
+ # Basic validation: check if it looks like base64
158
+ if (
159
+ isinstance(screenshot_data, str) and len(screenshot_data) > 100
160
+ ): # Arbitrary length check
161
+ # *** UPDATED STYLE: Removed centering, adjusted width ***
162
+ img_tag = f'<img src="data:image/jpeg;base64,{screenshot_data}" alt="Step {step_num} Screenshot" style="max-width: 800px; max-height: 600px; object-fit:contain;" />'
163
+ screenshot_html = (
164
+ img_tag + "<br/>"
165
+ ) # Use <br/> for line break after inline-block image
166
+ else:
167
+ logger.warning(
168
+ f"Screenshot for step {step_num} seems invalid (type: {type(screenshot_data)}, len: {len(screenshot_data) if isinstance(screenshot_data, str) else 'N/A'})."
169
+ )
170
+ screenshot_html = "**[Invalid screenshot data]**<br/>"
171
+
172
+ except Exception as e:
173
+ logger.error(
174
+ f"Error processing or formatting screenshot for step {step_num}: {e}",
175
+ exc_info=True,
176
+ )
177
+ screenshot_html = "**[Error displaying screenshot]**<br/>"
178
+ else:
179
+ logger.debug(f"No screenshot available for step {step_num}.")
180
+
181
+ # --- Format Agent Output ---
182
+ formatted_output = _format_agent_output(output) # Use the updated function
183
+
184
+ # --- Combine and Append to Chat ---
185
+ step_header = f"--- **Step {step_num}** ---"
186
+ # Combine header, image (with line break), and JSON block
187
+ final_content = step_header + "<br/>" + screenshot_html + formatted_output
188
+
189
+ chat_message = {
190
+ "role": "assistant",
191
+ "content": final_content.strip(), # Remove leading/trailing whitespace
192
+ }
193
+
194
+ # Append to the correct chat history list
195
+ webui_manager.bu_chat_history.append(chat_message)
196
+
197
+ await asyncio.sleep(0.05)
198
+
199
+
200
+ def _handle_done(webui_manager: WebuiManager, history: AgentHistoryList):
201
+ """Callback when the agent finishes the task (success or failure)."""
202
+ logger.info(
203
+ f"Agent task finished. Duration: {history.total_duration_seconds():.2f}s, Tokens: {history.total_input_tokens()}"
204
+ )
205
+ final_summary = "**Task Completed**\n"
206
+ final_summary += f"- Duration: {history.total_duration_seconds():.2f} seconds\n"
207
+ final_summary += f"- Total Input Tokens: {history.total_input_tokens()}\n" # Or total tokens if available
208
+
209
+ final_result = history.final_result()
210
+ if final_result:
211
+ final_summary += f"- Final Result: {final_result}\n"
212
+
213
+ errors = history.errors()
214
+ if errors and any(errors):
215
+ final_summary += f"- **Errors:**\n```\n{errors}\n```\n"
216
+ else:
217
+ final_summary += "- Status: Success\n"
218
+
219
+ webui_manager.bu_chat_history.append(
220
+ {"role": "assistant", "content": final_summary}
221
+ )
222
+
223
+
224
+ async def _ask_assistant_callback(
225
+ webui_manager: WebuiManager, query: str, browser_context: BrowserContext
226
+ ) -> Dict[str, Any]:
227
+ """Callback triggered by the agent's ask_for_assistant action."""
228
+ logger.info("Agent requires assistance. Waiting for user input.")
229
+
230
+ if not hasattr(webui_manager, "_chat_history"):
231
+ logger.error("Chat history not found in webui_manager during ask_assistant!")
232
+ return {"response": "Internal Error: Cannot display help request."}
233
+
234
+ webui_manager.bu_chat_history.append(
235
+ {
236
+ "role": "assistant",
237
+ "content": f"**Need Help:** {query}\nPlease provide information or perform the required action in the browser, then type your response/confirmation below and click 'Submit Response'.",
238
+ }
239
+ )
240
+
241
+ # Use state stored in webui_manager
242
+ webui_manager.bu_response_event = asyncio.Event()
243
+ webui_manager.bu_user_help_response = None # Reset previous response
244
+
245
+ try:
246
+ logger.info("Waiting for user response event...")
247
+ await asyncio.wait_for(
248
+ webui_manager.bu_response_event.wait(), timeout=3600.0
249
+ ) # Long timeout
250
+ logger.info("User response event received.")
251
+ except asyncio.TimeoutError:
252
+ logger.warning("Timeout waiting for user assistance.")
253
+ webui_manager.bu_chat_history.append(
254
+ {
255
+ "role": "assistant",
256
+ "content": "**Timeout:** No response received. Trying to proceed.",
257
+ }
258
+ )
259
+ webui_manager.bu_response_event = None # Clear the event
260
+ return {"response": "Timeout: User did not respond."} # Inform the agent
261
+
262
+ response = webui_manager.bu_user_help_response
263
+ webui_manager.bu_chat_history.append(
264
+ {"role": "user", "content": response}
265
+ ) # Show user response in chat
266
+ webui_manager.bu_response_event = (
267
+ None # Clear the event for the next potential request
268
+ )
269
+ return {"response": response}
270
+
271
+
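
Stripped of the UI plumbing, the ask-for-assistance handoff above is a plain asyncio.Event rendezvous; a self-contained sketch of the same pattern (names are illustrative):

import asyncio

async def demo_handoff() -> None:
    event = asyncio.Event()
    state = {"response": None}

    async def user_replies_later() -> None:
        await asyncio.sleep(0.1)           # stand-in for the user typing a reply
        state["response"] = "done, continue"
        event.set()                        # mirrors handle_submit setting bu_response_event

    asyncio.create_task(user_replies_later())
    await asyncio.wait_for(event.wait(), timeout=3600.0)
    print(state["response"])

asyncio.run(demo_handoff())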
272
+ # --- Core Agent Execution Logic --- (Needs access to webui_manager)
273
+
274
+
275
+ async def run_agent_task(
276
+ webui_manager: WebuiManager, components: Dict[gr.components.Component, Any]
277
+ ) -> AsyncGenerator[Dict[gr.components.Component, Any], None]:
278
+ """Handles the entire lifecycle of initializing and running the agent."""
279
+
280
+ # --- Get Components ---
281
+ # Need handles to specific UI components to update them
282
+ user_input_comp = webui_manager.get_component_by_id("browser_use_agent.user_input")
283
+ run_button_comp = webui_manager.get_component_by_id("browser_use_agent.run_button")
284
+ stop_button_comp = webui_manager.get_component_by_id(
285
+ "browser_use_agent.stop_button"
286
+ )
287
+ pause_resume_button_comp = webui_manager.get_component_by_id(
288
+ "browser_use_agent.pause_resume_button"
289
+ )
290
+ clear_button_comp = webui_manager.get_component_by_id(
291
+ "browser_use_agent.clear_button"
292
+ )
293
+ chatbot_comp = webui_manager.get_component_by_id("browser_use_agent.chatbot")
294
+ history_file_comp = webui_manager.get_component_by_id(
295
+ "browser_use_agent.agent_history_file"
296
+ )
297
+ gif_comp = webui_manager.get_component_by_id("browser_use_agent.recording_gif")
298
+ browser_view_comp = webui_manager.get_component_by_id(
299
+ "browser_use_agent.browser_view"
300
+ )
301
+
302
+ # --- 1. Get Task and Initial UI Update ---
303
+ task = components.get(user_input_comp, "").strip()
304
+ if not task:
305
+ gr.Warning("Please enter a task.")
306
+ yield {run_button_comp: gr.update(interactive=True)}
307
+ return
308
+
309
+ # Set running state indirectly via _current_task
310
+ webui_manager.bu_chat_history.append({"role": "user", "content": task})
311
+
312
+ yield {
313
+ user_input_comp: gr.Textbox(
314
+ value="", interactive=False, placeholder="Agent is running..."
315
+ ),
316
+ run_button_comp: gr.Button(value="⏳ Running...", interactive=False),
317
+ stop_button_comp: gr.Button(interactive=True),
318
+ pause_resume_button_comp: gr.Button(value="⏸️ Pause", interactive=True),
319
+ clear_button_comp: gr.Button(interactive=False),
320
+ chatbot_comp: gr.update(value=webui_manager.bu_chat_history),
321
+ history_file_comp: gr.update(value=None),
322
+ gif_comp: gr.update(value=None),
323
+ }
324
+
325
+ # --- Agent Settings ---
326
+ # Access settings values via components dict, getting IDs from webui_manager
327
+ def get_setting(key, default=None):
328
+ comp = webui_manager.id_to_component.get(f"agent_settings.{key}")
329
+ return components.get(comp, default) if comp else default
330
+
331
+ override_system_prompt = get_setting("override_system_prompt") or None
332
+ extend_system_prompt = get_setting("extend_system_prompt") or None
333
+ llm_provider_name = get_setting(
334
+ "llm_provider", None
335
+ ) # Default to None if not found
336
+ llm_model_name = get_setting("llm_model_name", None)
337
+ llm_temperature = get_setting("llm_temperature", 0.6)
338
+ use_vision = get_setting("use_vision", True)
339
+ ollama_num_ctx = get_setting("ollama_num_ctx", 16000)
340
+ llm_base_url = get_setting("llm_base_url") or None
341
+ llm_api_key = get_setting("llm_api_key") or None
342
+ max_steps = get_setting("max_steps", 100)
343
+ max_actions = get_setting("max_actions", 10)
344
+ max_input_tokens = get_setting("max_input_tokens", 128000)
345
+ tool_calling_str = get_setting("tool_calling_method", "auto")
346
+ tool_calling_method = tool_calling_str if tool_calling_str != "None" else None
347
+ mcp_server_config_comp = webui_manager.id_to_component.get(
348
+ "agent_settings.mcp_server_config"
349
+ )
350
+ mcp_server_config_str = (
351
+ components.get(mcp_server_config_comp) if mcp_server_config_comp else None
352
+ )
353
+ mcp_server_config = (
354
+ json.loads(mcp_server_config_str) if mcp_server_config_str else None
355
+ )
356
+
357
+ # Planner LLM Settings (Optional)
358
+ planner_llm_provider_name = get_setting("planner_llm_provider") or None
359
+ planner_llm = None
360
+ planner_use_vision = False
361
+ if planner_llm_provider_name:
362
+ planner_llm_model_name = get_setting("planner_llm_model_name")
363
+ planner_llm_temperature = get_setting("planner_llm_temperature", 0.6)
364
+ planner_ollama_num_ctx = get_setting("planner_ollama_num_ctx", 16000)
365
+ planner_llm_base_url = get_setting("planner_llm_base_url") or None
366
+ planner_llm_api_key = get_setting("planner_llm_api_key") or None
367
+ planner_use_vision = get_setting("planner_use_vision", False)
368
+
369
+ planner_llm = await _initialize_llm(
370
+ planner_llm_provider_name,
371
+ planner_llm_model_name,
372
+ planner_llm_temperature,
373
+ planner_llm_base_url,
374
+ planner_llm_api_key,
375
+ planner_ollama_num_ctx if planner_llm_provider_name == "ollama" else None,
376
+ )
377
+
378
+ # --- Browser Settings ---
379
+ def get_browser_setting(key, default=None):
380
+ comp = webui_manager.id_to_component.get(f"browser_settings.{key}")
381
+ return components.get(comp, default) if comp else default
382
+
383
+ browser_binary_path = get_browser_setting("browser_binary_path") or None
384
+ browser_user_data_dir = get_browser_setting("browser_user_data_dir") or None
385
+ use_own_browser = get_browser_setting(
386
+ "use_own_browser", False
387
+ ) # Logic handled by CDP/WSS presence
388
+ keep_browser_open = get_browser_setting("keep_browser_open", False)
389
+ headless = get_browser_setting("headless", False)
390
+ disable_security = get_browser_setting("disable_security", False)
391
+ window_w = int(get_browser_setting("window_w", 1280))
392
+ window_h = int(get_browser_setting("window_h", 1100))
393
+ cdp_url = get_browser_setting("cdp_url") or None
394
+ wss_url = get_browser_setting("wss_url") or None
395
+ save_recording_path = get_browser_setting("save_recording_path") or None
396
+ save_trace_path = get_browser_setting("save_trace_path") or None
397
+ save_agent_history_path = get_browser_setting(
398
+ "save_agent_history_path", "./tmp/agent_history"
399
+ )
400
+ save_download_path = get_browser_setting("save_download_path", "./tmp/downloads")
401
+
402
+ stream_vw = 70
403
+ stream_vh = int(70 * window_h // window_w)
404
+
405
+ os.makedirs(save_agent_history_path, exist_ok=True)
406
+ if save_recording_path:
407
+ os.makedirs(save_recording_path, exist_ok=True)
408
+ if save_trace_path:
409
+ os.makedirs(save_trace_path, exist_ok=True)
410
+ if save_download_path:
411
+ os.makedirs(save_download_path, exist_ok=True)
412
+
413
+ # --- 2. Initialize LLM ---
414
+ main_llm = await _initialize_llm(
415
+ llm_provider_name,
416
+ llm_model_name,
417
+ llm_temperature,
418
+ llm_base_url,
419
+ llm_api_key,
420
+ ollama_num_ctx if llm_provider_name == "ollama" else None,
421
+ )
422
+
423
+ # Pass the webui_manager instance to the callback when wrapping it
424
+ async def ask_callback_wrapper(
425
+ query: str, browser_context: BrowserContext
426
+ ) -> Dict[str, Any]:
427
+ return await _ask_assistant_callback(webui_manager, query, browser_context)
428
+
429
+ if not webui_manager.bu_controller:
430
+ webui_manager.bu_controller = CustomController(
431
+ ask_assistant_callback=ask_callback_wrapper
432
+ )
433
+ await webui_manager.bu_controller.setup_mcp_client(mcp_server_config)
434
+
435
+ # --- 4. Initialize Browser and Context ---
436
+ should_close_browser_on_finish = not keep_browser_open
437
+
438
+ try:
439
+ # Close existing resources if not keeping open
440
+ if not keep_browser_open:
441
+ if webui_manager.bu_browser_context:
442
+ logger.info("Closing previous browser context.")
443
+ await webui_manager.bu_browser_context.close()
444
+ webui_manager.bu_browser_context = None
445
+ if webui_manager.bu_browser:
446
+ logger.info("Closing previous browser.")
447
+ await webui_manager.bu_browser.close()
448
+ webui_manager.bu_browser = None
449
+
450
+ # Create Browser if needed
451
+ if not webui_manager.bu_browser:
452
+ logger.info("Launching new browser instance.")
453
+ extra_args = []
454
+ if use_own_browser:
455
+ browser_binary_path = os.getenv("BROWSER_PATH", None) or browser_binary_path
456
+ if browser_binary_path == "":
457
+ browser_binary_path = None
458
+ browser_user_data = browser_user_data_dir or os.getenv("BROWSER_USER_DATA", None)
459
+ if browser_user_data:
460
+ extra_args += [f"--user-data-dir={browser_user_data}"]
461
+ else:
462
+ browser_binary_path = None
463
+
464
+ webui_manager.bu_browser = CustomBrowser(
465
+ config=BrowserConfig(
466
+ headless=headless,
467
+ disable_security=disable_security,
468
+ browser_binary_path=browser_binary_path,
469
+ extra_browser_args=extra_args,
470
+ wss_url=wss_url,
471
+ cdp_url=cdp_url,
472
+ new_context_config=BrowserContextConfig(
473
+ window_width=window_w,
474
+ window_height=window_h,
475
+ )
476
+ )
477
+ )
478
+
479
+ # Create Context if needed
480
+ if not webui_manager.bu_browser_context:
481
+ logger.info("Creating new browser context.")
482
+ context_config = BrowserContextConfig(
483
+ trace_path=save_trace_path if save_trace_path else None,
484
+ save_recording_path=save_recording_path
485
+ if save_recording_path
486
+ else None,
487
+ save_downloads_path=save_download_path if save_download_path else None,
488
+ window_height=window_h,
489
+ window_width=window_w,
490
+ )
491
+ if not webui_manager.bu_browser:
492
+ raise ValueError("Browser not initialized, cannot create context.")
493
+ webui_manager.bu_browser_context = (
494
+ await webui_manager.bu_browser.new_context(config=context_config)
495
+ )
496
+
497
+ # --- 5. Initialize or Update Agent ---
498
+ webui_manager.bu_agent_task_id = str(uuid.uuid4()) # New ID for this task run
499
+ os.makedirs(
500
+ os.path.join(save_agent_history_path, webui_manager.bu_agent_task_id),
501
+ exist_ok=True,
502
+ )
503
+ history_file = os.path.join(
504
+ save_agent_history_path,
505
+ webui_manager.bu_agent_task_id,
506
+ f"{webui_manager.bu_agent_task_id}.json",
507
+ )
508
+ gif_path = os.path.join(
509
+ save_agent_history_path,
510
+ webui_manager.bu_agent_task_id,
511
+ f"{webui_manager.bu_agent_task_id}.gif",
512
+ )
513
+
514
+ # Pass the webui_manager to callbacks when wrapping them
515
+ async def step_callback_wrapper(
516
+ state: BrowserState, output: AgentOutput, step_num: int
517
+ ):
518
+ await _handle_new_step(webui_manager, state, output, step_num)
519
+
520
+ def done_callback_wrapper(history: AgentHistoryList):
521
+ _handle_done(webui_manager, history)
522
+
523
+ if not webui_manager.bu_agent:
524
+ logger.info(f"Initializing new agent for task: {task}")
525
+ if not webui_manager.bu_browser or not webui_manager.bu_browser_context:
526
+ raise ValueError(
527
+ "Browser or Context not initialized, cannot create agent."
528
+ )
529
+ webui_manager.bu_agent = BrowserUseAgent(
530
+ task=task,
531
+ llm=main_llm,
532
+ browser=webui_manager.bu_browser,
533
+ browser_context=webui_manager.bu_browser_context,
534
+ controller=webui_manager.bu_controller,
535
+ register_new_step_callback=step_callback_wrapper,
536
+ register_done_callback=done_callback_wrapper,
537
+ use_vision=use_vision,
538
+ override_system_message=override_system_prompt,
539
+ extend_system_message=extend_system_prompt,
540
+ max_input_tokens=max_input_tokens,
541
+ max_actions_per_step=max_actions,
542
+ tool_calling_method=tool_calling_method,
543
+ planner_llm=planner_llm,
544
+ use_vision_for_planner=planner_use_vision if planner_llm else False,
545
+ source="webui",
546
+ )
547
+ webui_manager.bu_agent.state.agent_id = webui_manager.bu_agent_task_id
548
+ webui_manager.bu_agent.settings.generate_gif = gif_path
549
+ else:
550
+ webui_manager.bu_agent.state.agent_id = webui_manager.bu_agent_task_id
551
+ webui_manager.bu_agent.add_new_task(task)
552
+ webui_manager.bu_agent.settings.generate_gif = gif_path
553
+ webui_manager.bu_agent.browser = webui_manager.bu_browser
554
+ webui_manager.bu_agent.browser_context = webui_manager.bu_browser_context
555
+ webui_manager.bu_agent.controller = webui_manager.bu_controller
556
+
557
+ # --- 6. Run Agent Task and Stream Updates ---
558
+ agent_run_coro = webui_manager.bu_agent.run(max_steps=max_steps)
559
+ agent_task = asyncio.create_task(agent_run_coro)
560
+ webui_manager.bu_current_task = agent_task # Store the task
561
+
562
+ last_chat_len = len(webui_manager.bu_chat_history)
563
+ while not agent_task.done():
564
+ is_paused = webui_manager.bu_agent.state.paused
565
+ is_stopped = webui_manager.bu_agent.state.stopped
566
+
567
+ # Check for pause state
568
+ if is_paused:
569
+ yield {
570
+ pause_resume_button_comp: gr.update(
571
+ value="▶️ Resume", interactive=True
572
+ ),
573
+ stop_button_comp: gr.update(interactive=True),
574
+ }
575
+ # Wait until pause is released or task is stopped/done
576
+ while is_paused and not agent_task.done():
577
+ # Re-check agent state in loop
578
+ is_paused = webui_manager.bu_agent.state.paused
579
+ is_stopped = webui_manager.bu_agent.state.stopped
580
+ if is_stopped: # Stop signal received while paused
581
+ break
582
+ await asyncio.sleep(0.2)
583
+
584
+ if (
585
+ agent_task.done() or is_stopped
586
+ ): # If stopped or task finished while paused
587
+ break
588
+
589
+ # If resumed, yield UI update
590
+ yield {
591
+ pause_resume_button_comp: gr.update(
592
+ value="⏸️ Pause", interactive=True
593
+ ),
594
+ run_button_comp: gr.update(
595
+ value="⏳ Running...", interactive=False
596
+ ),
597
+ }
598
+
599
+ # Check if agent stopped itself or stop button was pressed (which sets agent.state.stopped)
600
+ if is_stopped:
601
+ logger.info("Agent has stopped (internally or via stop button).")
602
+ if not agent_task.done():
603
+ # Ensure the task coroutine finishes if agent just set flag
604
+ try:
605
+ await asyncio.wait_for(
606
+ agent_task, timeout=1.0
607
+ ) # Give it a moment to exit run()
608
+ except asyncio.TimeoutError:
609
+ logger.warning(
610
+ "Agent task did not finish quickly after stop signal, cancelling."
611
+ )
612
+ agent_task.cancel()
613
+ except Exception: # Catch task exceptions if it errors on stop
614
+ pass
615
+ break # Exit the streaming loop
616
+
617
+ # Check if agent is asking for help (via response_event)
618
+ update_dict = {}
619
+ if webui_manager.bu_response_event is not None:
620
+ update_dict = {
621
+ user_input_comp: gr.update(
622
+ placeholder="Agent needs help. Enter response and submit.",
623
+ interactive=True,
624
+ ),
625
+ run_button_comp: gr.update(
626
+ value="✔️ Submit Response", interactive=True
627
+ ),
628
+ pause_resume_button_comp: gr.update(interactive=False),
629
+ stop_button_comp: gr.update(interactive=False),
630
+ chatbot_comp: gr.update(value=webui_manager.bu_chat_history),
631
+ }
632
+ last_chat_len = len(webui_manager.bu_chat_history)
633
+ yield update_dict
634
+ # Wait until response is submitted or task finishes
635
+ while (
636
+ webui_manager.bu_response_event is not None
637
+ and not agent_task.done()
638
+ ):
639
+ await asyncio.sleep(0.2)
640
+ # Restore UI after response submitted or if task ended unexpectedly
641
+ if not agent_task.done():
642
+ yield {
643
+ user_input_comp: gr.update(
644
+ placeholder="Agent is running...", interactive=False
645
+ ),
646
+ run_button_comp: gr.update(
647
+ value="⏳ Running...", interactive=False
648
+ ),
649
+ pause_resume_button_comp: gr.update(interactive=True),
650
+ stop_button_comp: gr.update(interactive=True),
651
+ }
652
+ else:
653
+ break # Task finished while waiting for response
654
+
655
+ # Update Chatbot if new messages arrived via callbacks
656
+ if len(webui_manager.bu_chat_history) > last_chat_len:
657
+ update_dict[chatbot_comp] = gr.update(
658
+ value=webui_manager.bu_chat_history
659
+ )
660
+ last_chat_len = len(webui_manager.bu_chat_history)
661
+
662
+ # Update Browser View
663
+ if headless and webui_manager.bu_browser_context:
664
+ try:
665
+ screenshot_b64 = (
666
+ await webui_manager.bu_browser_context.take_screenshot()
667
+ )
668
+ if screenshot_b64:
669
+ html_content = f'<img src="data:image/jpeg;base64,{screenshot_b64}" style="width:{stream_vw}vw; height:{stream_vh}vh ; border:1px solid #ccc;">'
670
+ update_dict[browser_view_comp] = gr.update(
671
+ value=html_content, visible=True
672
+ )
673
+ else:
674
+ html_content = f"<h1 style='width:{stream_vw}vw; height:{stream_vh}vh'>Waiting for browser session...</h1>"
675
+ update_dict[browser_view_comp] = gr.update(
676
+ value=html_content, visible=True
677
+ )
678
+ except Exception as e:
679
+ logger.debug(f"Failed to capture screenshot: {e}")
680
+ update_dict[browser_view_comp] = gr.update(
681
+ value="<div style='...'>Error loading view...</div>",
682
+ visible=True,
683
+ )
684
+ else:
685
+ update_dict[browser_view_comp] = gr.update(visible=False)
686
+
687
+ # Yield accumulated updates
688
+ if update_dict:
689
+ yield update_dict
690
+
691
+ await asyncio.sleep(0.1) # Polling interval
692
+
693
+ # --- 7. Task Finalization ---
694
+ webui_manager.bu_agent.state.paused = False
695
+ webui_manager.bu_agent.state.stopped = False
696
+ final_update = {}
697
+ try:
698
+ logger.info("Agent task completing...")
699
+ # Await the task ensure completion and catch exceptions if not already caught
700
+ if not agent_task.done():
701
+ await agent_task # Retrieve result/exception
702
+ elif agent_task.exception(): # Check if task finished with exception
703
+ agent_task.result() # Raise the exception to be caught below
704
+ logger.info("Agent task completed processing.")
705
+
706
+ logger.info(f"Explicitly saving agent history to: {history_file}")
707
+ webui_manager.bu_agent.save_history(history_file)
708
+
709
+ if os.path.exists(history_file):
710
+ final_update[history_file_comp] = gr.File(value=history_file)
711
+
712
+ if gif_path and os.path.exists(gif_path):
713
+ logger.info(f"GIF found at: {gif_path}")
714
+ final_update[gif_comp] = gr.Image(value=gif_path)
715
+
716
+ except asyncio.CancelledError:
717
+ logger.info("Agent task was cancelled.")
718
+ if not any(
719
+ "Cancelled" in msg.get("content", "")
720
+ for msg in webui_manager.bu_chat_history
721
+ if msg.get("role") == "assistant"
722
+ ):
723
+ webui_manager.bu_chat_history.append(
724
+ {"role": "assistant", "content": "**Task Cancelled**."}
725
+ )
726
+ final_update[chatbot_comp] = gr.update(value=webui_manager.bu_chat_history)
727
+ except Exception as e:
728
+ logger.error(f"Error during agent execution: {e}", exc_info=True)
729
+ error_message = (
730
+ f"**Agent Execution Error:**\n```\n{type(e).__name__}: {e}\n```"
731
+ )
732
+ if not any(
733
+ error_message in msg.get("content", "")
734
+ for msg in webui_manager.bu_chat_history
735
+ if msg.get("role") == "assistant"
736
+ ):
737
+ webui_manager.bu_chat_history.append(
738
+ {"role": "assistant", "content": error_message}
739
+ )
740
+ final_update[chatbot_comp] = gr.update(value=webui_manager.bu_chat_history)
741
+ gr.Error(f"Agent execution failed: {e}")
742
+
743
+ finally:
744
+ webui_manager.bu_current_task = None # Clear the task reference
745
+
746
+ # Close browser/context if requested
747
+ if should_close_browser_on_finish:
748
+ if webui_manager.bu_browser_context:
749
+ logger.info("Closing browser context after task.")
750
+ await webui_manager.bu_browser_context.close()
751
+ webui_manager.bu_browser_context = None
752
+ if webui_manager.bu_browser:
753
+ logger.info("Closing browser after task.")
754
+ await webui_manager.bu_browser.close()
755
+ webui_manager.bu_browser = None
756
+
757
+ # --- 8. Final UI Update ---
758
+ final_update.update(
759
+ {
760
+ user_input_comp: gr.update(
761
+ value="",
762
+ interactive=True,
763
+ placeholder="Enter your next task...",
764
+ ),
765
+ run_button_comp: gr.update(value="▶️ Submit Task", interactive=True),
766
+ stop_button_comp: gr.update(value="⏹️ Stop", interactive=False),
767
+ pause_resume_button_comp: gr.update(
768
+ value="⏸️ Pause", interactive=False
769
+ ),
770
+ clear_button_comp: gr.update(interactive=True),
771
+ # Ensure final chat history is shown
772
+ chatbot_comp: gr.update(value=webui_manager.bu_chat_history),
773
+ }
774
+ )
775
+ yield final_update
776
+
777
+ except Exception as e:
778
+ # Catch errors during setup (before agent run starts)
779
+ logger.error(f"Error setting up agent task: {e}", exc_info=True)
780
+ webui_manager.bu_current_task = None # Ensure state is reset
781
+ yield {
782
+ user_input_comp: gr.update(
783
+ interactive=True, placeholder="Error during setup. Enter task..."
784
+ ),
785
+ run_button_comp: gr.update(value="▶️ Submit Task", interactive=True),
786
+ stop_button_comp: gr.update(value="⏹️ Stop", interactive=False),
787
+ pause_resume_button_comp: gr.update(value="⏸️ Pause", interactive=False),
788
+ clear_button_comp: gr.update(interactive=True),
789
+ chatbot_comp: gr.update(
790
+ value=webui_manager.bu_chat_history
791
+ + [{"role": "assistant", "content": f"**Setup Error:** {e}"}]
792
+ ),
793
+ }
794
+
795
+
796
+ # --- Button Click Handlers --- (Need access to webui_manager)
797
+
798
+
799
+ async def handle_submit(
800
+ webui_manager: WebuiManager, components: Dict[gr.components.Component, Any]
801
+ ):
802
+ """Handles clicks on the main 'Submit' button."""
803
+ user_input_comp = webui_manager.get_component_by_id("browser_use_agent.user_input")
804
+ user_input_value = components.get(user_input_comp, "").strip()
805
+
806
+ # Check if waiting for user assistance
807
+ if webui_manager.bu_response_event and not webui_manager.bu_response_event.is_set():
808
+ logger.info(f"User submitted assistance: {user_input_value}")
809
+ webui_manager.bu_user_help_response = (
810
+ user_input_value if user_input_value else "User provided no text response."
811
+ )
812
+ webui_manager.bu_response_event.set()
813
+ # UI updates handled by the main loop reacting to the event being set
814
+ yield {
815
+ user_input_comp: gr.update(
816
+ value="",
817
+ interactive=False,
818
+ placeholder="Waiting for agent to continue...",
819
+ ),
820
+ webui_manager.get_component_by_id(
821
+ "browser_use_agent.run_button"
822
+ ): gr.update(value="⏳ Running...", interactive=False),
823
+ }
824
+ # Check if a task is currently running (using _current_task)
825
+ elif webui_manager.bu_current_task and not webui_manager.bu_current_task.done():
826
+ logger.warning(
827
+ "Submit button clicked while agent is already running and not asking for help."
828
+ )
829
+ gr.Info("Agent is currently running. Please wait or use Stop/Pause.")
830
+ yield {} # No change
831
+ else:
832
+ # Handle submission for a new task
833
+ logger.info("Submit button clicked for new task.")
834
+ # Use async generator to stream updates from run_agent_task
835
+ async for update in run_agent_task(webui_manager, components):
836
+ yield update
837
+
838
+
839
+ async def handle_stop(webui_manager: WebuiManager):
840
+ """Handles clicks on the 'Stop' button."""
841
+ logger.info("Stop button clicked.")
842
+ agent = webui_manager.bu_agent
843
+ task = webui_manager.bu_current_task
844
+
845
+ if agent and task and not task.done():
846
+ # Signal the agent to stop by setting its internal flag
847
+ agent.state.stopped = True
848
+ agent.state.paused = False # Ensure not paused if stopped
849
+ return {
850
+ webui_manager.get_component_by_id(
851
+ "browser_use_agent.stop_button"
852
+ ): gr.update(interactive=False, value="⏹️ Stopping..."),
853
+ webui_manager.get_component_by_id(
854
+ "browser_use_agent.pause_resume_button"
855
+ ): gr.update(interactive=False),
856
+ webui_manager.get_component_by_id(
857
+ "browser_use_agent.run_button"
858
+ ): gr.update(interactive=False),
859
+ }
860
+ else:
861
+ logger.warning("Stop clicked but agent is not running or task is already done.")
862
+ # Reset UI just in case it's stuck
863
+ return {
864
+ webui_manager.get_component_by_id(
865
+ "browser_use_agent.run_button"
866
+ ): gr.update(interactive=True),
867
+ webui_manager.get_component_by_id(
868
+ "browser_use_agent.stop_button"
869
+ ): gr.update(interactive=False),
870
+ webui_manager.get_component_by_id(
871
+ "browser_use_agent.pause_resume_button"
872
+ ): gr.update(interactive=False),
873
+ webui_manager.get_component_by_id(
874
+ "browser_use_agent.clear_button"
875
+ ): gr.update(interactive=True),
876
+ }
877
+
878
+
879
+ async def handle_pause_resume(webui_manager: WebuiManager):
880
+ """Handles clicks on the 'Pause/Resume' button."""
881
+ agent = webui_manager.bu_agent
882
+ task = webui_manager.bu_current_task
883
+
884
+ if agent and task and not task.done():
885
+ if agent.state.paused:
886
+ logger.info("Resume button clicked.")
887
+ agent.resume()
888
+ # UI update happens in main loop
889
+ return {
890
+ webui_manager.get_component_by_id(
891
+ "browser_use_agent.pause_resume_button"
892
+ ): gr.update(value="⏸️ Pause", interactive=True)
893
+ } # Optimistic update
894
+ else:
895
+ logger.info("Pause button clicked.")
896
+ agent.pause()
897
+ return {
898
+ webui_manager.get_component_by_id(
899
+ "browser_use_agent.pause_resume_button"
900
+ ): gr.update(value="▶️ Resume", interactive=True)
901
+ } # Optimistic update
902
+ else:
903
+ logger.warning(
904
+ "Pause/Resume clicked but agent is not running or doesn't support state."
905
+ )
906
+ return {} # No change
907
+
908
+
909
+ async def handle_clear(webui_manager: WebuiManager):
910
+ """Handles clicks on the 'Clear' button."""
911
+ logger.info("Clear button clicked.")
912
+
913
+ # Stop any running task first
914
+ task = webui_manager.bu_current_task
915
+ if task and not task.done():
916
+ logger.info("Clearing requires stopping the current task.")
917
+ webui_manager.bu_agent.stop()
918
+ task.cancel()
919
+ try:
920
+ await asyncio.wait_for(task, timeout=2.0) # Wait briefly
921
+ except (asyncio.CancelledError, asyncio.TimeoutError):
922
+ pass
923
+ except Exception as e:
924
+ logger.warning(f"Error stopping task on clear: {e}")
925
+ webui_manager.bu_current_task = None
926
+
927
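+ # Release the MCP client before dropping the controller and agent references.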
+ if webui_manager.bu_controller:
928
+ await webui_manager.bu_controller.close_mcp_client()
929
+ webui_manager.bu_controller = None
930
+ webui_manager.bu_agent = None
931
+
932
+ # Reset state stored in manager
933
+ webui_manager.bu_chat_history = []
934
+ webui_manager.bu_response_event = None
935
+ webui_manager.bu_user_help_response = None
936
+ webui_manager.bu_agent_task_id = None
937
+
938
+ logger.info("Agent state and browser resources cleared.")
939
+
940
+ # Reset UI components
941
+ return {
942
+ webui_manager.get_component_by_id("browser_use_agent.chatbot"): gr.update(
943
+ value=[]
944
+ ),
945
+ webui_manager.get_component_by_id("browser_use_agent.user_input"): gr.update(
946
+ value="", placeholder="Enter your task here..."
947
+ ),
948
+ webui_manager.get_component_by_id(
949
+ "browser_use_agent.agent_history_file"
950
+ ): gr.update(value=None),
951
+ webui_manager.get_component_by_id("browser_use_agent.recording_gif"): gr.update(
952
+ value=None
953
+ ),
954
+ webui_manager.get_component_by_id("browser_use_agent.browser_view"): gr.update(
955
+ value="<div style='...'>Browser Cleared</div>"
956
+ ),
957
+ webui_manager.get_component_by_id("browser_use_agent.run_button"): gr.update(
958
+ value="▶️ Submit Task", interactive=True
959
+ ),
960
+ webui_manager.get_component_by_id("browser_use_agent.stop_button"): gr.update(
961
+ interactive=False
962
+ ),
963
+ webui_manager.get_component_by_id(
964
+ "browser_use_agent.pause_resume_button"
965
+ ): gr.update(value="⏸️ Pause", interactive=False),
966
+ webui_manager.get_component_by_id("browser_use_agent.clear_button"): gr.update(
967
+ interactive=True
968
+ ),
969
+ }
970
+
971
+
972
+ # --- Tab Creation Function ---
973
+
974
+
975
+ def create_browser_use_agent_tab(webui_manager: WebuiManager):
976
+ """
977
+ Create the run agent tab, defining UI, state, and handlers.
978
+ """
979
+ webui_manager.init_browser_use_agent()
980
+
981
+ # --- Define UI Components ---
982
+ tab_components = {}
983
+ with gr.Column():
984
+ chatbot = gr.Chatbot(
985
+ lambda: webui_manager.bu_chat_history, # Load history dynamically
986
+ elem_id="browser_use_chatbot",
987
+ label="Agent Interaction",
988
+ type="messages",
989
+ height=600,
990
+ show_copy_button=True,
991
+ )
992
+ user_input = gr.Textbox(
993
+ label="Your Task or Response",
994
+ placeholder="Enter your task here or provide assistance when asked.",
995
+ lines=3,
996
+ interactive=True,
997
+ elem_id="user_input",
998
+ )
999
+ with gr.Row():
1000
+ stop_button = gr.Button(
1001
+ "⏹️ Stop", interactive=False, variant="stop", scale=2
1002
+ )
1003
+ pause_resume_button = gr.Button(
1004
+ "⏸️ Pause", interactive=False, variant="secondary", scale=2, visible=True
1005
+ )
1006
+ clear_button = gr.Button(
1007
+ "🗑️ Clear", interactive=True, variant="secondary", scale=2
1008
+ )
1009
+ run_button = gr.Button("▶️ Submit Task", variant="primary", scale=3)
1010
+
1011
+ browser_view = gr.HTML(
1012
+ value="<div style='width:100%; height:50vh; display:flex; justify-content:center; align-items:center; border:1px solid #ccc; background-color:#f0f0f0;'><p>Browser View (Requires Headless=True)</p></div>",
1013
+ label="Browser Live View",
1014
+ elem_id="browser_view",
1015
+ visible=False,
1016
+ )
1017
+ with gr.Column():
1018
+ gr.Markdown("### Task Outputs")
1019
+ agent_history_file = gr.File(label="Agent History JSON", interactive=False)
1020
+ recording_gif = gr.Image(
1021
+ label="Task Recording GIF",
1022
+ format="gif",
1023
+ interactive=False,
1024
+ type="filepath",
1025
+ )
1026
+
1027
+ # --- Store Components in Manager ---
1028
+ tab_components.update(
1029
+ dict(
1030
+ chatbot=chatbot,
1031
+ user_input=user_input,
1032
+ clear_button=clear_button,
1033
+ run_button=run_button,
1034
+ stop_button=stop_button,
1035
+ pause_resume_button=pause_resume_button,
1036
+ agent_history_file=agent_history_file,
1037
+ recording_gif=recording_gif,
1038
+ browser_view=browser_view,
1039
+ )
1040
+ )
1041
+ webui_manager.add_components(
1042
+ "browser_use_agent", tab_components
1043
+ ) # Use "browser_use_agent" as tab_name prefix
1044
+
1045
+ all_managed_components = set(
1046
+ webui_manager.get_components()
1047
+ ) # Get all components known to manager
1048
+ run_tab_outputs = list(tab_components.values())
1049
+
1050
+ async def submit_wrapper(
1051
+ components_dict: Dict[Component, Any],
1052
+ ) -> AsyncGenerator[Dict[Component, Any], None]:
1053
+ """Wrapper for handle_submit that yields its results."""
1054
+ async for update in handle_submit(webui_manager, components_dict):
1055
+ yield update
1056
+
1057
+ async def stop_wrapper() -> AsyncGenerator[Dict[Component, Any], None]:
1058
+ """Wrapper for handle_stop."""
1059
+ update_dict = await handle_stop(webui_manager)
1060
+ yield update_dict
1061
+
1062
+ async def pause_resume_wrapper() -> AsyncGenerator[Dict[Component, Any], None]:
1063
+ """Wrapper for handle_pause_resume."""
1064
+ update_dict = await handle_pause_resume(webui_manager)
1065
+ yield update_dict
1066
+
1067
+ async def clear_wrapper() -> AsyncGenerator[Dict[Component, Any], None]:
1068
+ """Wrapper for handle_clear."""
1069
+ update_dict = await handle_clear(webui_manager)
1070
+ yield update_dict
1071
+
1072
+ # --- Connect Event Handlers using the Wrappers ---
1073
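+ # inputs=all_managed_components lets the submit handler read settings registered by other tabs.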
+ run_button.click(
1074
+ fn=submit_wrapper, inputs=all_managed_components, outputs=run_tab_outputs
1075
+ )
1076
+ user_input.submit(
1077
+ fn=submit_wrapper, inputs=all_managed_components, outputs=run_tab_outputs
1078
+ )
1079
+ stop_button.click(fn=stop_wrapper, inputs=None, outputs=run_tab_outputs)
1080
+ pause_resume_button.click(
1081
+ fn=pause_resume_wrapper, inputs=None, outputs=run_tab_outputs
1082
+ )
1083
+ clear_button.click(fn=clear_wrapper, inputs=None, outputs=run_tab_outputs)
src/webui/components/deep_research_agent_tab.py ADDED
@@ -0,0 +1,457 @@
1
+ import gradio as gr
2
+ from gradio.components import Component
3
+ from functools import partial
4
+
5
+ from src.webui.webui_manager import WebuiManager
6
+ from src.utils import config
7
+ import logging
8
+ import os
9
+ from typing import Any, Dict, AsyncGenerator, Optional, Tuple, Union
10
+ import asyncio
11
+ import json
12
+ from src.agent.deep_research.deep_research_agent import DeepResearchAgent
13
+ from src.utils import llm_provider
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ async def _initialize_llm(provider: Optional[str], model_name: Optional[str], temperature: float,
19
+ base_url: Optional[str], api_key: Optional[str], num_ctx: Optional[int] = None):
20
+ """Initializes the LLM based on settings. Returns None if provider/model is missing."""
21
+ if not provider or not model_name:
22
+ logger.info("LLM Provider or Model Name not specified, LLM will be None.")
23
+ return None
24
+ try:
25
+ logger.info(f"Initializing LLM: Provider={provider}, Model={model_name}, Temp={temperature}")
26
+ # Use your actual LLM provider logic here
27
+ llm = llm_provider.get_llm_model(
28
+ provider=provider,
29
+ model_name=model_name,
30
+ temperature=temperature,
31
+ base_url=base_url or None,
32
+ api_key=api_key or None,
33
+ num_ctx=num_ctx if provider == "ollama" else None
34
+ )
35
+ return llm
36
+ except Exception as e:
37
+ logger.error(f"Failed to initialize LLM: {e}", exc_info=True)
38
+ gr.Warning(
39
+ f"Failed to initialize LLM '{model_name}' for provider '{provider}'. Please check settings. Error: {e}")
40
+ return None
41
+
42
+
43
+ def _read_file_safe(file_path: str) -> Optional[str]:
44
+ """Safely read a file, returning None if it doesn't exist or on error."""
45
+ if not os.path.exists(file_path):
46
+ return None
47
+ try:
48
+ with open(file_path, 'r', encoding='utf-8') as f:
49
+ return f.read()
50
+ except Exception as e:
51
+ logger.error(f"Error reading file {file_path}: {e}")
52
+ return None
53
+
54
+
55
+ # --- Deep Research Agent Specific Logic ---
56
+
57
+ async def run_deep_research(webui_manager: WebuiManager, components: Dict[Component, Any]) -> AsyncGenerator[
58
+ Dict[Component, Any], None]:
59
+ """Handles initializing and running the DeepResearchAgent."""
60
+
61
+ # --- Get Components ---
62
+ research_task_comp = webui_manager.get_component_by_id("deep_research_agent.research_task")
63
+ resume_task_id_comp = webui_manager.get_component_by_id("deep_research_agent.resume_task_id")
64
+ parallel_num_comp = webui_manager.get_component_by_id("deep_research_agent.parallel_num")
65
+ save_dir_comp = webui_manager.get_component_by_id(
66
+ "deep_research_agent.max_query") # Note: component ID seems misnamed in original code
67
+ start_button_comp = webui_manager.get_component_by_id("deep_research_agent.start_button")
68
+ stop_button_comp = webui_manager.get_component_by_id("deep_research_agent.stop_button")
69
+ markdown_display_comp = webui_manager.get_component_by_id("deep_research_agent.markdown_display")
70
+ markdown_download_comp = webui_manager.get_component_by_id("deep_research_agent.markdown_download")
71
+ mcp_server_config_comp = webui_manager.get_component_by_id("deep_research_agent.mcp_server_config")
72
+
73
+ # --- 1. Get Task and Settings ---
74
+ task_topic = components.get(research_task_comp, "").strip()
75
+ task_id_to_resume = components.get(resume_task_id_comp, "").strip() or None
76
+ max_parallel_agents = int(components.get(parallel_num_comp, 1))
77
+ base_save_dir = components.get(save_dir_comp, "./tmp/deep_research").strip()
78
+ safe_root_dir = "./tmp/deep_research"
79
+ normalized_base_save_dir = os.path.abspath(os.path.normpath(base_save_dir))
80
+ if os.path.commonpath([normalized_base_save_dir, os.path.abspath(safe_root_dir)]) != os.path.abspath(safe_root_dir):
81
+ logger.warning(f"Unsafe base_save_dir detected: {base_save_dir}. Using default directory.")
82
+ normalized_base_save_dir = os.path.abspath(safe_root_dir)
83
+ base_save_dir = normalized_base_save_dir
84
+ mcp_server_config_str = components.get(mcp_server_config_comp)
85
+ mcp_config = json.loads(mcp_server_config_str) if mcp_server_config_str else None
86
+
87
+ if not task_topic:
88
+ gr.Warning("Please enter a research task.")
89
+ yield {start_button_comp: gr.update(interactive=True)} # Re-enable start button
90
+ return
91
+
92
+ # Store base save dir for stop handler
93
+ webui_manager.dr_save_dir = base_save_dir
94
+ os.makedirs(base_save_dir, exist_ok=True)
95
+
96
+ # --- 2. Initial UI Update ---
97
+ yield {
98
+ start_button_comp: gr.update(value="⏳ Running...", interactive=False),
99
+ stop_button_comp: gr.update(interactive=True),
100
+ research_task_comp: gr.update(interactive=False),
101
+ resume_task_id_comp: gr.update(interactive=False),
102
+ parallel_num_comp: gr.update(interactive=False),
103
+ save_dir_comp: gr.update(interactive=False),
104
+ markdown_display_comp: gr.update(value="Starting research..."),
105
+ markdown_download_comp: gr.update(value=None, interactive=False)
106
+ }
107
+
108
+ agent_task = None
109
+ running_task_id = None
110
+ plan_file_path = None
111
+ report_file_path = None
112
+ last_plan_content = None
113
+ last_plan_mtime = 0
114
+
115
+ try:
116
+ # --- 3. Get LLM and Browser Config from other tabs ---
117
+ # Access settings values via components dict, getting IDs from webui_manager
118
+ def get_setting(tab: str, key: str, default: Any = None):
119
+ comp = webui_manager.id_to_component.get(f"{tab}.{key}")
120
+ return components.get(comp, default) if comp else default
121
+
122
+ # LLM Config (from agent_settings tab)
123
+ llm_provider_name = get_setting("agent_settings", "llm_provider")
124
+ llm_model_name = get_setting("agent_settings", "llm_model_name")
125
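+ # Enforce a minimum temperature of 0.5 for deep research runs.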
+ llm_temperature = max(get_setting("agent_settings", "llm_temperature", 0.5), 0.5)
126
+ llm_base_url = get_setting("agent_settings", "llm_base_url")
127
+ llm_api_key = get_setting("agent_settings", "llm_api_key")
128
+ ollama_num_ctx = get_setting("agent_settings", "ollama_num_ctx")
129
+
130
+ llm = await _initialize_llm(
131
+ llm_provider_name, llm_model_name, llm_temperature, llm_base_url, llm_api_key,
132
+ ollama_num_ctx if llm_provider_name == "ollama" else None
133
+ )
134
+ if not llm:
135
+ raise ValueError("LLM Initialization failed. Please check Agent Settings.")
136
+
137
+ # Browser Config (from browser_settings tab)
138
+ # Note: DeepResearchAgent constructor takes a dict, not full Browser/Context objects
139
+ browser_config_dict = {
140
+ "headless": get_setting("browser_settings", "headless", False),
141
+ "disable_security": get_setting("browser_settings", "disable_security", False),
142
+ "browser_binary_path": get_setting("browser_settings", "browser_binary_path"),
143
+ "user_data_dir": get_setting("browser_settings", "browser_user_data_dir"),
144
+ "window_width": int(get_setting("browser_settings", "window_w", 1280)),
145
+ "window_height": int(get_setting("browser_settings", "window_h", 1100)),
146
+ # Add other relevant fields if DeepResearchAgent accepts them
147
+ }
148
+
149
+ # --- 4. Initialize or Get Agent ---
150
+ if not webui_manager.dr_agent:
151
+ webui_manager.dr_agent = DeepResearchAgent(
152
+ llm=llm,
153
+ browser_config=browser_config_dict,
154
+ mcp_server_config=mcp_config
155
+ )
156
+ logger.info("DeepResearchAgent initialized.")
157
+
158
+ # --- 5. Start Agent Run ---
159
+ agent_run_coro = webui_manager.dr_agent.run(
160
+ topic=task_topic,
161
+ task_id=task_id_to_resume,
162
+ save_dir=base_save_dir,
163
+ max_parallel_browsers=max_parallel_agents
164
+ )
165
+ agent_task = asyncio.create_task(agent_run_coro)
166
+ webui_manager.dr_current_task = agent_task
167
+
168
+ # Wait briefly for the agent to start and potentially create the task ID/folder
169
+ await asyncio.sleep(1.0)
170
+
171
+ # Determine the actual task ID being used (agent sets this)
172
+ running_task_id = webui_manager.dr_agent.current_task_id
173
+ if not running_task_id:
174
+ # Agent might not have set it yet, try to get from result later? Risky.
175
+ # Or derive from resume_task_id if provided?
176
+ running_task_id = task_id_to_resume
177
+ if not running_task_id:
178
+ logger.warning("Could not determine running task ID immediately.")
179
+ # We can still monitor, but might miss initial plan if ID needed for path
180
+ else:
181
+ logger.info(f"Assuming task ID based on resume ID: {running_task_id}")
182
+ else:
183
+ logger.info(f"Agent started with Task ID: {running_task_id}")
184
+
185
+ webui_manager.dr_task_id = running_task_id # Store for stop handler
186
+
187
+ # --- 6. Monitor Progress via research_plan.md ---
188
+ if running_task_id:
189
+ task_specific_dir = os.path.join(base_save_dir, str(running_task_id))
190
+ plan_file_path = os.path.join(task_specific_dir, "research_plan.md")
191
+ report_file_path = os.path.join(task_specific_dir, "report.md")
192
+ logger.info(f"Monitoring plan file: {plan_file_path}")
193
+ else:
194
+ logger.warning("Cannot monitor plan file: Task ID unknown.")
195
+ plan_file_path = None
196
+ last_plan_content = None
197
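+ # Poll for plan-file changes while the agent task runs in the background.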
+ while not agent_task.done():
198
+ update_dict = {}
199
+ update_dict[resume_task_id_comp] = gr.update(value=running_task_id)
200
+ agent_stopped = getattr(webui_manager.dr_agent, 'stopped', False)
201
+ if agent_stopped:
202
+ logger.info("Stop signal detected from agent state.")
203
+ break # Exit monitoring loop
204
+
205
+ # Check and update research plan display
206
+ if plan_file_path:
207
+ try:
208
+ current_mtime = os.path.getmtime(plan_file_path) if os.path.exists(plan_file_path) else 0
209
+ if current_mtime > last_plan_mtime:
210
+ logger.info(f"Detected change in {plan_file_path}")
211
+ plan_content = _read_file_safe(plan_file_path)
212
+ if last_plan_content is None or (
213
+ plan_content is not None and plan_content != last_plan_content):
214
+ update_dict[markdown_display_comp] = gr.update(value=plan_content)
215
+ last_plan_content = plan_content
216
+ last_plan_mtime = current_mtime
217
+ elif plan_content is None:
218
+ # File might have been deleted or became unreadable
219
+ last_plan_mtime = 0 # Reset to force re-read attempt later
220
+ except Exception as e:
221
+ logger.warning(f"Error checking/reading plan file {plan_file_path}: {e}")
222
+ # Avoid continuous logging for the same error
223
+ await asyncio.sleep(2.0)
224
+
225
+ # Yield updates if any
226
+ if update_dict:
227
+ yield update_dict
228
+
229
+ await asyncio.sleep(1.0) # Check file changes every second
230
+
231
+ # --- 7. Task Finalization ---
232
+ logger.info("Agent task processing finished. Awaiting final result...")
233
+ final_result_dict = await agent_task # Get result or raise exception
234
+ logger.info(f"Agent run completed. Result keys: {final_result_dict.keys() if final_result_dict else 'None'}")
235
+
236
+ # Try to get task ID from result if not known before
237
+ if not running_task_id and final_result_dict and 'task_id' in final_result_dict:
238
+ running_task_id = final_result_dict['task_id']
239
+ webui_manager.dr_task_id = running_task_id
240
+ task_specific_dir = os.path.join(base_save_dir, str(running_task_id))
241
+ report_file_path = os.path.join(task_specific_dir, "report.md")
242
+ logger.info(f"Task ID confirmed from result: {running_task_id}")
243
+
244
+ final_ui_update = {}
245
+ if report_file_path and os.path.exists(report_file_path):
246
+ logger.info(f"Loading final report from: {report_file_path}")
247
+ report_content = _read_file_safe(report_file_path)
248
+ if report_content:
249
+ final_ui_update[markdown_display_comp] = gr.update(value=report_content)
250
+ final_ui_update[markdown_download_comp] = gr.File(value=report_file_path,
251
+ label=f"Report ({running_task_id}.md)",
252
+ interactive=True)
253
+ else:
254
+ final_ui_update[markdown_display_comp] = gr.update(
255
+ value="# Research Complete\n\n*Error reading final report file.*")
256
+ elif final_result_dict and 'report' in final_result_dict:
257
+ logger.info("Using report content directly from agent result.")
258
+ # If agent directly returns report content
259
+ final_ui_update[markdown_display_comp] = gr.update(value=final_result_dict['report'])
260
+ # Cannot offer download if only content is available
261
+ final_ui_update[markdown_download_comp] = gr.update(value=None, label="Download Research Report",
262
+ interactive=False)
263
+ else:
264
+ logger.warning("Final report file not found and not in result dict.")
265
+ final_ui_update[markdown_display_comp] = gr.update(value="# Research Complete\n\n*Final report not found.*")
266
+
267
+ yield final_ui_update
268
+
269
+
270
+ except Exception as e:
271
+ logger.error(f"Error during Deep Research Agent execution: {e}", exc_info=True)
272
+ gr.Error(f"Research failed: {e}")
273
+ yield {markdown_display_comp: gr.update(value=f"# Research Failed\n\n**Error:**\n```\n{e}\n```")}
274
+
275
+ finally:
276
+ # --- 8. Final UI Reset ---
277
+ webui_manager.dr_current_task = None # Clear task reference
278
+ webui_manager.dr_task_id = None # Clear running task ID
279
+
280
+ yield {
281
+ start_button_comp: gr.update(value="▶️ Run", interactive=True),
282
+ stop_button_comp: gr.update(interactive=False),
283
+ research_task_comp: gr.update(interactive=True),
284
+ resume_task_id_comp: gr.update(value="", interactive=True),
285
+ parallel_num_comp: gr.update(interactive=True),
286
+ save_dir_comp: gr.update(interactive=True),
287
+ # Keep download button enabled if file exists
288
+ markdown_download_comp: gr.update() if report_file_path and os.path.exists(report_file_path) else gr.update(
289
+ interactive=False)
290
+ }
291
+
292
+
293
+ async def stop_deep_research(webui_manager: WebuiManager) -> Dict[Component, Any]:
294
+ """Handles the Stop button click."""
295
+ logger.info("Stop button clicked for Deep Research.")
296
+ agent = webui_manager.dr_agent
297
+ task = webui_manager.dr_current_task
298
+ task_id = webui_manager.dr_task_id
299
+ base_save_dir = webui_manager.dr_save_dir
300
+
301
+ stop_button_comp = webui_manager.get_component_by_id("deep_research_agent.stop_button")
302
+ start_button_comp = webui_manager.get_component_by_id("deep_research_agent.start_button")
303
+ markdown_display_comp = webui_manager.get_component_by_id("deep_research_agent.markdown_display")
304
+ markdown_download_comp = webui_manager.get_component_by_id("deep_research_agent.markdown_download")
305
+
306
+ final_update = {
307
+ stop_button_comp: gr.update(interactive=False, value="⏹️ Stopping...")
308
+ }
309
+
310
+ if agent and task and not task.done():
311
+ logger.info("Signalling DeepResearchAgent to stop.")
312
+ try:
313
+ # Assuming stop is synchronous or sets a flag quickly
314
+ await agent.stop()
315
+ except Exception as e:
316
+ logger.error(f"Error calling agent.stop(): {e}")
317
+
318
+ # The run_deep_research loop should detect the stop and exit.
319
+ # We return an intermediate "Stopping..." state; the final reset is done by run_deep_research.
320
+
321
+ # Try to show the final report if available after stopping
322
+ await asyncio.sleep(1.5) # Give agent a moment to write final files potentially
323
+ report_file_path = None
324
+ if task_id and base_save_dir:
325
+ report_file_path = os.path.join(base_save_dir, str(task_id), "report.md")
326
+
327
+ if report_file_path and os.path.exists(report_file_path):
328
+ report_content = _read_file_safe(report_file_path)
329
+ if report_content:
330
+ final_update[markdown_display_comp] = gr.update(
331
+ value=report_content + "\n\n---\n*Research stopped by user.*")
332
+ final_update[markdown_download_comp] = gr.File(value=report_file_path, label=f"Report ({task_id}.md)",
333
+ interactive=True)
334
+ else:
335
+ final_update[markdown_display_comp] = gr.update(
336
+ value="# Research Stopped\n\n*Error reading final report file after stop.*")
337
+ else:
338
+ final_update[markdown_display_comp] = gr.update(value="# Research Stopped by User")
339
+
340
+ # Keep start button disabled, run_deep_research finally block will re-enable it.
341
+ final_update[start_button_comp] = gr.update(interactive=False)
342
+
343
+ else:
344
+ logger.warning("Stop clicked but no active research task found.")
345
+ # Reset UI state just in case
346
+ final_update = {
347
+ start_button_comp: gr.update(interactive=True),
348
+ stop_button_comp: gr.update(interactive=False),
349
+ webui_manager.get_component_by_id("deep_research_agent.research_task"): gr.update(interactive=True),
350
+ webui_manager.get_component_by_id("deep_research_agent.resume_task_id"): gr.update(interactive=True),
351
+ webui_manager.get_component_by_id("deep_research_agent.max_iteration"): gr.update(interactive=True),
352
+ webui_manager.get_component_by_id("deep_research_agent.max_query"): gr.update(interactive=True),
353
+ }
354
+
355
+ return final_update
356
+
357
+
358
+ async def update_mcp_server(mcp_file: str, webui_manager: WebuiManager):
359
+ """
360
+ Update the MCP server.
361
+ """
362
+ if hasattr(webui_manager, "dr_agent") and webui_manager.dr_agent:
363
+ logger.warning("⚠️ Close controller because mcp file has changed!")
364
+ await webui_manager.dr_agent.close_mcp_client()
365
+
366
+ if not mcp_file or not os.path.exists(mcp_file) or not mcp_file.endswith('.json'):
367
+ logger.warning(f"{mcp_file} is not a valid MCP file.")
368
+ return None, gr.update(visible=False)
369
+
370
+ with open(mcp_file, 'r') as f:
371
+ mcp_server = json.load(f)
372
+
373
+ return json.dumps(mcp_server, indent=2), gr.update(visible=True)
374
+
375
+
376
+ def create_deep_research_agent_tab(webui_manager: WebuiManager):
377
+ """
378
+ Creates a deep research agent tab
379
+ """
380
+ input_components = set(webui_manager.get_components())
381
+ tab_components = {}
382
+
383
+ with gr.Group():
384
+ with gr.Row():
385
+ mcp_json_file = gr.File(label="MCP server json", interactive=True, file_types=[".json"])
386
+ mcp_server_config = gr.Textbox(label="MCP server", lines=6, interactive=True, visible=False)
387
+
388
+ with gr.Group():
389
+ research_task = gr.Textbox(label="Research Task", lines=5,
390
+ value="Give me a detailed travel plan to Switzerland from June 1st to 10th.",
391
+ interactive=True)
392
+ with gr.Row():
393
+ resume_task_id = gr.Textbox(label="Resume Task ID", value="",
394
+ interactive=True)
395
+ parallel_num = gr.Number(label="Parallel Agent Num", value=1,
396
+ precision=0,
397
+ interactive=True)
398
+ max_query = gr.Textbox(label="Research Save Dir", value="./tmp/deep_research",
399
+ interactive=True)
400
+ with gr.Row():
401
+ stop_button = gr.Button("⏹️ Stop", variant="stop", scale=2)
402
+ start_button = gr.Button("▶️ Run", variant="primary", scale=3)
403
+ with gr.Group():
404
+ markdown_display = gr.Markdown(label="Research Report")
405
+ markdown_download = gr.File(label="Download Research Report", interactive=False)
406
+ tab_components.update(
407
+ dict(
408
+ research_task=research_task,
409
+ parallel_num=parallel_num,
410
+ max_query=max_query,
411
+ start_button=start_button,
412
+ stop_button=stop_button,
413
+ markdown_display=markdown_display,
414
+ markdown_download=markdown_download,
415
+ resume_task_id=resume_task_id,
416
+ mcp_json_file=mcp_json_file,
417
+ mcp_server_config=mcp_server_config,
418
+ )
419
+ )
420
+ webui_manager.add_components("deep_research_agent", tab_components)
421
+ webui_manager.init_deep_research_agent()
422
+
423
+ async def update_wrapper(mcp_file):
424
+ """Wrapper for handle_pause_resume."""
425
+ update_dict = await update_mcp_server(mcp_file, webui_manager)
426
+ yield update_dict
427
+
428
+ mcp_json_file.change(
429
+ update_wrapper,
430
+ inputs=[mcp_json_file],
431
+ outputs=[mcp_server_config]
432
+ )
433
+
434
+ dr_tab_outputs = list(tab_components.values())
435
+ all_managed_inputs = set(webui_manager.get_components())
436
+
437
+ # --- Define Event Handler Wrappers ---
438
+ async def start_wrapper(comps: Dict[Component, Any]) -> AsyncGenerator[Dict[Component, Any], None]:
439
+ async for update in run_deep_research(webui_manager, comps):
440
+ yield update
441
+
442
+ async def stop_wrapper() -> AsyncGenerator[Dict[Component, Any], None]:
443
+ update_dict = await stop_deep_research(webui_manager)
444
+ yield update_dict
445
+
446
+ # --- Connect Handlers ---
447
+ start_button.click(
448
+ fn=start_wrapper,
449
+ inputs=all_managed_inputs,
450
+ outputs=dr_tab_outputs
451
+ )
452
+
453
+ stop_button.click(
454
+ fn=stop_wrapper,
455
+ inputs=None,
456
+ outputs=dr_tab_outputs
457
+ )
src/webui/components/load_save_config_tab.py ADDED
@@ -0,0 +1,50 @@
1
+ import gradio as gr
2
+ from gradio.components import Component
3
+
4
+ from src.webui.webui_manager import WebuiManager
5
+ from src.utils import config
6
+
7
+
8
+ def create_load_save_config_tab(webui_manager: WebuiManager):
9
+ """
10
+ Creates a load and save config tab.
11
+ """
12
+ input_components = set(webui_manager.get_components())
13
+ tab_components = {}
14
+
15
+ config_file = gr.File(
16
+ label="Load UI Settings from json",
17
+ file_types=[".json"],
18
+ interactive=True
19
+ )
20
+ with gr.Row():
21
+ load_config_button = gr.Button("Load Config", variant="primary")
22
+ save_config_button = gr.Button("Save UI Settings", variant="primary")
23
+
24
+ config_status = gr.Textbox(
25
+ label="Status",
26
+ lines=2,
27
+ interactive=False
28
+ )
29
+
30
+ tab_components.update(dict(
31
+ load_config_button=load_config_button,
32
+ save_config_button=save_config_button,
33
+ config_status=config_status,
34
+ config_file=config_file,
35
+ ))
36
+
37
+ webui_manager.add_components("load_save_config", tab_components)
38
+
39
+ save_config_button.click(
40
+ fn=webui_manager.save_config,
41
+ inputs=set(webui_manager.get_components()),
42
+ outputs=[config_status]
43
+ )
44
+
45
+ load_config_button.click(
46
+ fn=webui_manager.load_config,
47
+ inputs=[config_file],
48
+ outputs=webui_manager.get_components(),
49
+ )
50
+
src/webui/interface.py ADDED
@@ -0,0 +1,95 @@
1
+ import gradio as gr
2
+
3
+ from src.webui.webui_manager import WebuiManager
4
+ from src.webui.components.agent_settings_tab import create_agent_settings_tab
5
+ from src.webui.components.browser_settings_tab import create_browser_settings_tab
6
+ from src.webui.components.browser_use_agent_tab import create_browser_use_agent_tab
7
+ from src.webui.components.deep_research_agent_tab import create_deep_research_agent_tab
8
+ from src.webui.components.load_save_config_tab import create_load_save_config_tab
9
+
10
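+ # Selectable Gradio themes; create_ui defaults to "Ocean".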
+ theme_map = {
11
+ "Default": gr.themes.Default(),
12
+ "Soft": gr.themes.Soft(),
13
+ "Monochrome": gr.themes.Monochrome(),
14
+ "Glass": gr.themes.Glass(),
15
+ "Origin": gr.themes.Origin(),
16
+ "Citrus": gr.themes.Citrus(),
17
+ "Ocean": gr.themes.Ocean(),
18
+ "Base": gr.themes.Base()
19
+ }
20
+
21
+
22
+ def create_ui(theme_name="Ocean"):
23
+ css = """
24
+ .gradio-container {
25
+ width: 70vw !important;
26
+ max-width: 70% !important;
27
+ margin-left: auto !important;
28
+ margin-right: auto !important;
29
+ padding-top: 10px !important;
30
+ }
31
+ .header-text {
32
+ text-align: center;
33
+ margin-bottom: 20px;
34
+ }
35
+ .tab-header-text {
36
+ text-align: center;
37
+ }
38
+ .theme-section {
39
+ margin-bottom: 10px;
40
+ padding: 15px;
41
+ border-radius: 10px;
42
+ }
43
+ """
44
+
45
+ # force dark mode by default
46
+ js_func = """
47
+ function refresh() {
48
+ const url = new URL(window.location);
49
+
50
+ if (url.searchParams.get('__theme') !== 'dark') {
51
+ url.searchParams.set('__theme', 'dark');
52
+ window.location.href = url.href;
53
+ }
54
+ }
55
+ """
56
+
57
+ ui_manager = WebuiManager()
58
+
59
+ with gr.Blocks(
60
+ title="Browser Use WebUI", theme=theme_map[theme_name], css=css, js=js_func,
61
+ ) as demo:
62
+ with gr.Row():
63
+ gr.Markdown(
64
+ """
65
+ # 🌐 Browser Use WebUI
66
+ ### Control your browser with AI assistance
67
+ """,
68
+ elem_classes=["header-text"],
69
+ )
70
+
71
+ with gr.Tabs() as tabs:
72
+ with gr.TabItem("⚙️ Agent Settings"):
73
+ create_agent_settings_tab(ui_manager)
74
+
75
+ with gr.TabItem("🌐 Browser Settings"):
76
+ create_browser_settings_tab(ui_manager)
77
+
78
+ with gr.TabItem("🤖 Run Agent"):
79
+ create_browser_use_agent_tab(ui_manager)
80
+
81
+ with gr.TabItem("🎁 Agent Marketplace"):
82
+ gr.Markdown(
83
+ """
84
+ ### Agents built on Browser-Use
85
+ """,
86
+ elem_classes=["tab-header-text"],
87
+ )
88
+ with gr.Tabs():
89
+ with gr.TabItem("Deep Research"):
90
+ create_deep_research_agent_tab(ui_manager)
91
+
92
+ with gr.TabItem("📁 Load & Save Config"):
93
+ create_load_save_config_tab(ui_manager)
94
+
95
+ return demo
src/webui/webui_manager.py ADDED
@@ -0,0 +1,122 @@
1
+ import json
2
+ from collections.abc import Generator
3
+ from typing import TYPE_CHECKING
4
+ import os
5
+ import gradio as gr
6
+ from datetime import datetime
7
+ from typing import Optional, Dict, List
8
+ import uuid
9
+ import asyncio
10
+ import time
11
+
12
+ from gradio.components import Component
13
+ from browser_use.browser.browser import Browser
14
+ from browser_use.browser.context import BrowserContext
15
+ from browser_use.agent.service import Agent
16
+ from src.browser.custom_browser import CustomBrowser
17
+ from src.browser.custom_context import CustomBrowserContext
18
+ from src.controller.custom_controller import CustomController
19
+ from src.agent.deep_research.deep_research_agent import DeepResearchAgent
20
+
21
+
22
+ class WebuiManager:
23
+ def __init__(self, settings_save_dir: str = "./tmp/webui_settings"):
24
+ self.id_to_component: dict[str, Component] = {}
25
+ self.component_to_id: dict[Component, str] = {}
26
+
27
+ self.settings_save_dir = settings_save_dir
28
+ os.makedirs(self.settings_save_dir, exist_ok=True)
29
+
30
+ def init_browser_use_agent(self) -> None:
31
+ """
32
+ init browser use agent
33
+ """
34
+ self.bu_agent: Optional[Agent] = None
35
+ self.bu_browser: Optional[CustomBrowser] = None
36
+ self.bu_browser_context: Optional[CustomBrowserContext] = None
37
+ self.bu_controller: Optional[CustomController] = None
38
+ self.bu_chat_history: List[Dict[str, Optional[str]]] = []
39
+ self.bu_response_event: Optional[asyncio.Event] = None
40
+ self.bu_user_help_response: Optional[str] = None
41
+ self.bu_current_task: Optional[asyncio.Task] = None
42
+ self.bu_agent_task_id: Optional[str] = None
43
+
44
+ def init_deep_research_agent(self) -> None:
45
+ """
46
+ init deep research agent
47
+ """
48
+ self.dr_agent: Optional[DeepResearchAgent] = None
49
+ self.dr_current_task = None
50
+ self.dr_agent_task_id: Optional[str] = None
51
+ self.dr_save_dir: Optional[str] = None
52
+
53
+ def add_components(self, tab_name: str, components_dict: dict[str, "Component"]) -> None:
54
+ """
55
+ Add tab components
56
+ """
57
+ for comp_name, component in components_dict.items():
58
+ comp_id = f"{tab_name}.{comp_name}"
59
+ self.id_to_component[comp_id] = component
60
+ self.component_to_id[component] = comp_id
61
+
62
+ def get_components(self) -> list["Component"]:
63
+ """
64
+ Get all components
65
+ """
66
+ return list(self.id_to_component.values())
67
+
68
+ def get_component_by_id(self, comp_id: str) -> "Component":
69
+ """
70
+ Get component by id
71
+ """
72
+ return self.id_to_component[comp_id]
73
+
74
+ def get_id_by_component(self, comp: "Component") -> str:
75
+ """
76
+ Get id by component
77
+ """
78
+ return self.component_to_id[comp]
79
+
80
+ def save_config(self, components: Dict["Component", str]) -> str:
81
+ """
82
+ Save config
83
+ """
84
+ cur_settings = {}
85
+ for comp in components:
86
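+ # Persist only user-editable values: skip buttons, file pickers, and non-interactive components.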
+ if not isinstance(comp, gr.Button) and not isinstance(comp, gr.File) and str(
87
+ getattr(comp, "interactive", True)).lower() != "false":
88
+ comp_id = self.get_id_by_component(comp)
89
+ cur_settings[comp_id] = components[comp]
90
+
91
+ config_name = datetime.now().strftime("%Y%m%d-%H%M%S")
92
+ with open(os.path.join(self.settings_save_dir, f"{config_name}.json"), "w") as fw:
93
+ json.dump(cur_settings, fw, indent=4)
94
+
95
+ return os.path.join(self.settings_save_dir, f"{config_name}.json")
96
+
97
+ def load_config(self, config_path: str):
98
+ """
99
+ Load config
100
+ """
101
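+ # load_config is a generator: it yields partial updates so dependent dropdowns can refresh first.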
+ with open(config_path, "r") as fr:
102
+ ui_settings = json.load(fr)
103
+
104
+ update_components = {}
105
+ for comp_id, comp_val in ui_settings.items():
106
+ if comp_id in self.id_to_component:
107
+ comp = self.id_to_component[comp_id]
108
+ if comp.__class__.__name__ == "Chatbot":
109
+ update_components[comp] = comp.__class__(value=comp_val, type="messages")
110
+ else:
111
+ update_components[comp] = comp.__class__(value=comp_val)
112
+ if comp_id == "agent_settings.planner_llm_provider":
113
+ yield update_components # yield provider, let callback run
114
+ time.sleep(0.1) # wait for Gradio UI callback
115
+
116
+ config_status = self.id_to_component["load_save_config.config_status"]
117
+ update_components.update(
118
+ {
119
+ config_status: config_status.__class__(value=f"Successfully loaded config: {config_path}")
120
+ }
121
+ )
122
+ yield update_components
supervisord.conf ADDED
@@ -0,0 +1,80 @@
1
+ [supervisord]
2
+ user=root
3
+ nodaemon=true
4
+ logfile=/dev/stdout
5
+ logfile_maxbytes=0
6
+ loglevel=error
7
+
8
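+ ; Programs start in priority order: xvfb (100), vnc_setup (150), x11vnc (200), x11vnc_log (250), novnc (300), webui (400).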
+ [program:xvfb]
9
+ command=Xvfb :99 -screen 0 %(ENV_RESOLUTION)s -ac +extension GLX +render -noreset
10
+ autorestart=true
11
+ stdout_logfile=/dev/stdout
12
+ stdout_logfile_maxbytes=0
13
+ stderr_logfile=/dev/stderr
14
+ stderr_logfile_maxbytes=0
15
+ priority=100
16
+ startsecs=3
17
+ stopsignal=TERM
18
+ stopwaitsecs=10
19
+
20
+ [program:vnc_setup]
21
+ command=bash -c "mkdir -p ~/.vnc && echo '%(ENV_VNC_PASSWORD)s' | vncpasswd -f > ~/.vnc/passwd && chmod 600 ~/.vnc/passwd && ls -la ~/.vnc/passwd"
22
+ autorestart=false
23
+ startsecs=0
24
+ priority=150
25
+ stdout_logfile=/dev/stdout
26
+ stdout_logfile_maxbytes=0
27
+ stderr_logfile=/dev/stderr
28
+ stderr_logfile_maxbytes=0
29
+
30
+ [program:x11vnc]
31
+ command=bash -c "mkdir -p /var/log && touch /var/log/x11vnc.log && chmod 666 /var/log/x11vnc.log && sleep 5 && DISPLAY=:99 x11vnc -display :99 -forever -shared -rfbauth /root/.vnc/passwd -rfbport 5901 -o /var/log/x11vnc.log"
32
+ autorestart=true
33
+ stdout_logfile=/dev/stdout
34
+ stdout_logfile_maxbytes=0
35
+ stderr_logfile=/dev/stderr
36
+ stderr_logfile_maxbytes=0
37
+ priority=200
38
+ startretries=10
39
+ startsecs=10
40
+ stopsignal=TERM
41
+ stopwaitsecs=10
42
+ depends_on=vnc_setup,xvfb
43
+
44
+ [program:x11vnc_log]
45
+ command=bash -c "mkdir -p /var/log && touch /var/log/x11vnc.log && tail -f /var/log/x11vnc.log"
46
+ autorestart=true
47
+ stdout_logfile=/dev/stdout
48
+ stdout_logfile_maxbytes=0
49
+ stderr_logfile=/dev/stderr
50
+ stderr_logfile_maxbytes=0
51
+ priority=250
52
+ stopsignal=TERM
53
+ stopwaitsecs=5
54
+ depends_on=x11vnc
55
+
56
+ [program:novnc]
57
+ command=bash -c "sleep 5 && cd /opt/novnc && ./utils/novnc_proxy --vnc localhost:5901 --listen 0.0.0.0:6080 --web /opt/novnc"
58
+ autorestart=true
59
+ stdout_logfile=/dev/stdout
60
+ stdout_logfile_maxbytes=0
61
+ stderr_logfile=/dev/stderr
62
+ stderr_logfile_maxbytes=0
63
+ priority=300
64
+ startretries=5
65
+ startsecs=3
66
+ depends_on=x11vnc
67
+
68
+ [program:webui]
69
+ command=python webui.py --ip 0.0.0.0 --port 7788
70
+ directory=/app
71
+ autorestart=true
72
+ stdout_logfile=/dev/stdout
73
+ stdout_logfile_maxbytes=0
74
+ stderr_logfile=/dev/stderr
75
+ stderr_logfile_maxbytes=0
76
+ priority=400
77
+ startretries=3
78
+ startsecs=3
79
+ stopsignal=TERM
80
+ stopwaitsecs=10
tests/test_agents.py ADDED
@@ -0,0 +1,400 @@
1
+ import pdb
2
+
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+ import sys
7
+
8
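+ # Make the repo root importable when running this test file directly.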
+ sys.path.append(".")
9
+ import asyncio
10
+ import os
11
+ import sys
12
+ from pprint import pprint
13
+
14
+ from browser_use import Agent
15
+ from browser_use.agent.views import AgentHistoryList
16
+
17
+ from src.utils import utils
18
+
19
+
20
+ async def test_browser_use_agent():
21
+ from browser_use.browser.browser import Browser, BrowserConfig
22
+ from browser_use.browser.context import (
23
+ BrowserContextConfig
24
+ )
25
+ from browser_use.agent.service import Agent
26
+
27
+ from src.browser.custom_browser import CustomBrowser
28
+ from src.controller.custom_controller import CustomController
29
+ from src.utils import llm_provider
30
+ from src.agent.browser_use.browser_use_agent import BrowserUseAgent
31
+
32
+ llm = llm_provider.get_llm_model(
33
+ provider="openai",
34
+ model_name="gpt-4o",
35
+ temperature=0.8,
36
+ )
37
+
38
+ # llm = llm_provider.get_llm_model(
39
+ # provider="google",
40
+ # model_name="gemini-2.0-flash",
41
+ # temperature=0.6,
42
+ # api_key=os.getenv("GOOGLE_API_KEY", "")
43
+ # )
44
+
45
+ # llm = utils.get_llm_model(
46
+ # provider="deepseek",
47
+ # model_name="deepseek-reasoner",
48
+ # temperature=0.8
49
+ # )
50
+
51
+ # llm = utils.get_llm_model(
52
+ # provider="deepseek",
53
+ # model_name="deepseek-chat",
54
+ # temperature=0.8
55
+ # )
56
+
57
+ # llm = utils.get_llm_model(
58
+ # provider="ollama", model_name="qwen2.5:7b", temperature=0.5
59
+ # )
60
+
61
+ # llm = utils.get_llm_model(
62
+ # provider="ollama", model_name="deepseek-r1:14b", temperature=0.5
63
+ # )
64
+
65
+ window_w, window_h = 1280, 1100
66
+
67
+ # llm = llm_provider.get_llm_model(
68
+ # provider="azure_openai",
69
+ # model_name="gpt-4o",
70
+ # temperature=0.5,
71
+ # base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
72
+ # api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
73
+ # )
74
+
75
+ mcp_server_config = {
76
+ "mcpServers": {
77
+ # "markitdown": {
78
+ # "command": "docker",
79
+ # "args": [
80
+ # "run",
81
+ # "--rm",
82
+ # "-i",
83
+ # "markitdown-mcp:latest"
84
+ # ]
85
+ # },
86
+ "desktop-commander": {
87
+ "command": "npx",
88
+ "args": [
89
+ "-y",
90
+ "@wonderwhy-er/desktop-commander"
91
+ ]
92
+ },
93
+ }
94
+ }
95
+ controller = CustomController()
96
+ await controller.setup_mcp_client(mcp_server_config)
97
+ use_own_browser = True
98
+ use_vision = True # Set to False when using DeepSeek
99
+
100
+ max_actions_per_step = 10
101
+ browser = None
102
+ browser_context = None
103
+
104
+ try:
105
+ extra_browser_args = []
106
+ if use_own_browser:
107
+ browser_binary_path = os.getenv("BROWSER_PATH", None)
108
+ if browser_binary_path == "":
109
+ browser_binary_path = None
110
+ browser_user_data = os.getenv("BROWSER_USER_DATA", None)
111
+ if browser_user_data:
112
+ extra_browser_args += [f"--user-data-dir={browser_user_data}"]
113
+ else:
114
+ browser_binary_path = None
115
+ browser = CustomBrowser(
116
+ config=BrowserConfig(
117
+ headless=False,
118
+ browser_binary_path=browser_binary_path,
119
+ extra_browser_args=extra_browser_args,
120
+ new_context_config=BrowserContextConfig(
121
+ window_width=window_w,
122
+ window_height=window_h,
123
+ )
124
+ )
125
+ )
126
+ browser_context = await browser.new_context(
127
+ config=BrowserContextConfig(
128
+ trace_path=None,
129
+ save_recording_path=None,
130
+ save_downloads_path="./tmp/downloads",
131
+ window_height=window_h,
132
+ window_width=window_w,
133
+ )
134
+ )
135
+ agent = BrowserUseAgent(
136
+ # task="download pdf from https://arxiv.org/pdf/2311.16498 and rename this pdf to 'mcp-test.pdf'",
137
+ task="give me nvidia stock price",
138
+ llm=llm,
139
+ browser=browser,
140
+ browser_context=browser_context,
141
+ controller=controller,
142
+ use_vision=use_vision,
143
+ max_actions_per_step=max_actions_per_step,
144
+ generate_gif=True
145
+ )
146
+ history: AgentHistoryList = await agent.run(max_steps=100)
147
+
148
+ print("Final Result:")
149
+ pprint(history.final_result(), indent=4)
150
+
151
+ print("\nErrors:")
152
+ pprint(history.errors(), indent=4)
153
+
154
+ except Exception:
155
+ import traceback
156
+ traceback.print_exc()
157
+ finally:
158
+ if browser_context:
159
+ await browser_context.close()
160
+ if browser:
161
+ await browser.close()
162
+ if controller:
163
+ await controller.close_mcp_client()
164
+
165
+
166
+ async def test_browser_use_parallel():
167
+ from browser_use.browser.browser import Browser, BrowserConfig
168
+ from browser_use.browser.context import (
169
+ BrowserContextConfig,
170
+ )
171
+ from browser_use.agent.service import Agent
172
+
173
+ from src.browser.custom_browser import CustomBrowser
174
+ from src.controller.custom_controller import CustomController
175
+ from src.utils import llm_provider
176
+ from src.agent.browser_use.browser_use_agent import BrowserUseAgent
177
+
178
+ # llm = utils.get_llm_model(
179
+ # provider="openai",
180
+ # model_name="gpt-4o",
181
+ # temperature=0.8,
182
+ # base_url=os.getenv("OPENAI_ENDPOINT", ""),
183
+ # api_key=os.getenv("OPENAI_API_KEY", ""),
184
+ # )
185
+
186
+ # llm = utils.get_llm_model(
187
+ # provider="google",
188
+ # model_name="gemini-2.0-flash",
189
+ # temperature=0.6,
190
+ # api_key=os.getenv("GOOGLE_API_KEY", "")
191
+ # )
192
+
193
+ # llm = utils.get_llm_model(
194
+ # provider="deepseek",
195
+ # model_name="deepseek-reasoner",
196
+ # temperature=0.8
197
+ # )
198
+
199
+ # llm = utils.get_llm_model(
200
+ # provider="deepseek",
201
+ # model_name="deepseek-chat",
202
+ # temperature=0.8
203
+ # )
204
+
205
+ # llm = utils.get_llm_model(
206
+ # provider="ollama", model_name="qwen2.5:7b", temperature=0.5
207
+ # )
208
+
209
+ # llm = utils.get_llm_model(
210
+ # provider="ollama", model_name="deepseek-r1:14b", temperature=0.5
211
+ # )
212
+
213
+ window_w, window_h = 1280, 1100
214
+
215
+ llm = llm_provider.get_llm_model(
216
+ provider="azure_openai",
217
+ model_name="gpt-4o",
218
+ temperature=0.5,
219
+ base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
220
+ api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
221
+ )
222
+
223
+ mcp_server_config = {
224
+ "mcpServers": {
225
+ # "markitdown": {
226
+ # "command": "docker",
227
+ # "args": [
228
+ # "run",
229
+ # "--rm",
230
+ # "-i",
231
+ # "markitdown-mcp:latest"
232
+ # ]
233
+ # },
234
+ "desktop-commander": {
235
+ "command": "npx",
236
+ "args": [
237
+ "-y",
238
+ "@wonderwhy-er/desktop-commander"
239
+ ]
240
+ },
241
+ # "filesystem": {
242
+ # "command": "npx",
243
+ # "args": [
244
+ # "-y",
245
+ # "@modelcontextprotocol/server-filesystem",
246
+ # "/Users/xxx/ai_workspace",
247
+ # ]
248
+ # },
249
+ }
250
+ }
251
+ controller = CustomController()
252
+ await controller.setup_mcp_client(mcp_server_config)
253
+ use_own_browser = True
254
+ use_vision = True # Set to False when using DeepSeek
255
+
256
+ max_actions_per_step = 10
257
+ browser = None
258
+ browser_context = None
259
+
260
+ try:
261
+ extra_browser_args = []
262
+ if use_own_browser:
263
+ browser_binary_path = os.getenv("BROWSER_PATH", None)
264
+ if browser_binary_path == "":
265
+ browser_binary_path = None
266
+ browser_user_data = os.getenv("BROWSER_USER_DATA", None)
267
+ if browser_user_data:
268
+ extra_browser_args += [f"--user-data-dir={browser_user_data}"]
269
+ else:
270
+ browser_binary_path = None
271
+ browser = CustomBrowser(
272
+ config=BrowserConfig(
273
+ headless=False,
274
+ browser_binary_path=browser_binary_path,
275
+ extra_browser_args=extra_browser_args,
276
+ new_context_config=BrowserContextConfig(
277
+ window_width=window_w,
278
+ window_height=window_h,
279
+ )
280
+ )
281
+ )
282
+ browser_context = await browser.new_context(
283
+ config=BrowserContextConfig(
284
+ trace_path=None,
285
+ save_recording_path=None,
286
+ save_downloads_path="./tmp/downloads",
287
+ window_height=window_h,
288
+ window_width=window_w,
289
+ force_new_context=True
290
+ )
291
+ )
292
+ agents = [
293
+ BrowserUseAgent(task=task, llm=llm, browser=browser, controller=controller)
294
+ for task in [
295
+ 'Search Google for weather in Tokyo',
296
+ # 'Check Reddit front page title',
297
+ # 'Find NASA image of the day',
298
+ # 'Check top story on CNN',
299
+ # 'Search latest SpaceX launch date',
300
+ # 'Look up population of Paris',
301
+ 'Find current time in Sydney',
302
+ 'Check who won last Super Bowl',
303
+ # 'Search trending topics on Twitter',
304
+ ]
305
+ ]
306
+
307
+ history = await asyncio.gather(*[agent.run() for agent in agents])
308
+ print("Final Result:")
309
+ pprint(history.final_result(), indent=4)
310
+
311
+ print("\nErrors:")
312
+ pprint(history.errors(), indent=4)
313
+
314
+ pdb.set_trace()
315
+
316
+ except Exception:
317
+ import traceback
318
+
319
+ traceback.print_exc()
320
+ finally:
321
+ if browser_context:
322
+ await browser_context.close()
323
+ if browser:
324
+ await browser.close()
325
+ if controller:
326
+ await controller.close_mcp_client()
327
+
328
+
329
+ async def test_deep_research_agent():
330
+ from src.agent.deep_research.deep_research_agent import DeepResearchAgent, PLAN_FILENAME, REPORT_FILENAME
331
+ from src.utils import llm_provider
332
+
333
+ llm = llm_provider.get_llm_model(
334
+ provider="openai",
335
+ model_name="gpt-4o",
336
+ temperature=0.5
337
+ )
338
+
339
+ # llm = llm_provider.get_llm_model(
340
+ # provider="bedrock",
341
+ # )
342
+
343
+     mcp_server_config = {
+         "mcpServers": {
+             "desktop-commander": {
+                 "command": "npx",
+                 "args": [
+                     "-y",
+                     "@wonderwhy-er/desktop-commander"
+                 ]
+             },
+         }
+     }
+
+     browser_config = {"headless": False, "window_width": 1280, "window_height": 1100, "use_own_browser": False}
+     agent = DeepResearchAgent(llm=llm, browser_config=browser_config, mcp_server_config=mcp_server_config)
+     research_topic = "Give me investment advice on Nvidia and Tesla."
+     task_id_to_resume = ""  # Set this to resume a previous task ID
+
+     print(f"Starting research on: {research_topic}")
+
+     try:
+         # Call run and wait for the final result dictionary
+         result = await agent.run(
+             research_topic,
+             task_id=task_id_to_resume,
+             save_dir="./tmp/deep_research",
+             max_parallel_browsers=1,
+         )
+
+         print("\n--- Research Process Ended ---")
+         print(f"Status: {result.get('status')}")
+         print(f"Message: {result.get('message')}")
+         print(f"Task ID: {result.get('task_id')}")
+
+         # Check the final state for the report
+         final_state = result.get('final_state', {})
+         if final_state:
+             print("\n--- Final State Summary ---")
+             completed_steps = sum(
+                 1 for item in final_state.get('research_plan', [])
+                 if item.get('status') == 'completed'
+             )
+             print(f"  Plan Steps Completed: {completed_steps}")
+             print(f"  Total Search Results Logged: {len(final_state.get('search_results', []))}")
+             if final_state.get("final_report"):
+                 print("  Final Report: Generated (content omitted). You can find it in the output directory.")
+                 # print("\n--- Final Report ---")  # Optionally print report
+                 # print(final_state["final_report"])
+             else:
+                 print("  Final Report: Not generated.")
+         else:
+             print("Final state information not available.")
+
+     except Exception as e:
+         print("\n--- An unhandled error occurred outside the agent run ---")
+         print(e)
+
+
+ if __name__ == "__main__":
+     asyncio.run(test_browser_use_agent())
+     # asyncio.run(test_browser_use_parallel())
+     # asyncio.run(test_deep_research_agent())
tests/test_controller.py ADDED
@@ -0,0 +1,131 @@
+ import asyncio
+ import pdb
+ import sys
+
+ sys.path.append(".")
+
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+
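+ # Starts the configured MCP servers and prints each discovered tool's name,
+ # description, and generated parameter schema.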
+ async def test_mcp_client():
+     from src.utils.mcp_client import setup_mcp_client_and_tools, create_tool_param_model
+
+     test_server_config = {
+         "mcpServers": {
+             # "markitdown": {
+             #     "command": "docker",
+             #     "args": [
+             #         "run",
+             #         "--rm",
+             #         "-i",
+             #         "markitdown-mcp:latest"
+             #     ]
+             # },
+             "desktop-commander": {
+                 "command": "npx",
+                 "args": [
+                     "-y",
+                     "@wonderwhy-er/desktop-commander"
+                 ]
+             },
+             # "filesystem": {
+             #     "command": "npx",
+             #     "args": [
+             #         "-y",
+             #         "@modelcontextprotocol/server-filesystem",
+             #         "/Users/xxx/ai_workspace",
+             #     ]
+             # },
+         }
+     }
+
+     mcp_tools, mcp_client = await setup_mcp_client_and_tools(test_server_config)
+
+     for tool in mcp_tools:
+         tool_param_model = create_tool_param_model(tool)
+         print(tool.name)
+         print(tool.description)
+         print(tool_param_model.model_json_schema())
+     pdb.set_trace()
+
+
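+ # Drives an MCP tool through CustomController: executes a shell command via
+ # desktop-commander, then polls read_output until the command finishes.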
+ async def test_controller_with_mcp():
+     from src.controller.custom_controller import CustomController
+
+     mcp_server_config = {
+         "mcpServers": {
+             # "markitdown": {
+             #     "command": "docker",
+             #     "args": [
+             #         "run",
+             #         "--rm",
+             #         "-i",
+             #         "markitdown-mcp:latest"
+             #     ]
+             # },
+             "desktop-commander": {
+                 "command": "npx",
+                 "args": [
+                     "-y",
+                     "@wonderwhy-er/desktop-commander"
+                 ]
+             },
+             # "filesystem": {
+             #     "command": "npx",
+             #     "args": [
+             #         "-y",
+             #         "@modelcontextprotocol/server-filesystem",
+             #         "/Users/xxx/ai_workspace",
+             #     ]
+             # },
+         }
+     }
+
+     controller = CustomController()
+     await controller.setup_mcp_client(mcp_server_config)
+     action_name = "mcp.desktop-commander.execute_command"
+     action_info = controller.registry.registry.actions[action_name]
+     param_model = action_info.param_model
+     print(param_model.model_json_schema())
+     params = {"command": "python ./tmp/test.py"}
+     validated_params = param_model(**params)
+     ActionModel_ = controller.registry.create_action_model()
+     # Create ActionModel instance with the validated parameters
+     action_model = ActionModel_(**{action_name: validated_params})
+     result = await controller.act(action_model)
+     result = result.extracted_content
+     print(result)
+     if result and "Command is still running. Use read_output to get more output." in result \
+             and "PID" in result.split("\n")[0]:
+         pid = int(result.split("\n")[0].split("PID")[-1].strip())
+         action_name = "mcp.desktop-commander.read_output"
+         action_info = controller.registry.registry.actions[action_name]
+         param_model = action_info.param_model
+         print(param_model.model_json_schema())
+         params = {"pid": pid}
+         validated_params = param_model(**params)
+         action_model = ActionModel_(**{action_name: validated_params})
+         output_result = ""
+         while True:
+             # Non-blocking sleep so the event loop keeps servicing the MCP client.
+             await asyncio.sleep(1)
+             result = await controller.act(action_model)
+             result = result.extracted_content
+             if result:
+                 pdb.set_trace()
+                 output_result = result
+                 break
+         print(output_result)
+         pdb.set_trace()
+     await controller.close_mcp_client()
+     pdb.set_trace()
+
+
+ if __name__ == '__main__':
+     # asyncio.run(test_mcp_client())
+     asyncio.run(test_controller_with_mcp())
tests/test_llm_api.py ADDED
@@ -0,0 +1,159 @@
+ import os
+ import pdb
+ from dataclasses import dataclass
+ from typing import Optional
+
+ from dotenv import load_dotenv
+ from langchain_core.messages import HumanMessage, SystemMessage
+ from langchain_ollama import ChatOllama
+
+ load_dotenv()
+
+ import sys
+
+ sys.path.append(".")
+
+
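+ # Provider-agnostic test config; base_url and api_key fall back to the
+ # provider's environment variables (see get_env_value) when left as None.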
+ @dataclass
+ class LLMConfig:
+     provider: str
+     model_name: str
+     temperature: float = 0.8
+     base_url: Optional[str] = None
+     api_key: Optional[str] = None
+
+
+ def create_message_content(text, image_path=None):
+     content = [{"type": "text", "text": text}]
+     image_format = "png" if image_path and image_path.endswith(".png") else "jpeg"
+     if image_path:
+         from src.utils import utils
+         image_data = utils.encode_image(image_path)
+         content.append({
+             "type": "image_url",
+             "image_url": {"url": f"data:image/{image_format};base64,{image_data}"}
+         })
+     return content
+
+
+ def get_env_value(key, provider):
+     env_mappings = {
+         "openai": {"api_key": "OPENAI_API_KEY", "base_url": "OPENAI_ENDPOINT"},
+         "azure_openai": {"api_key": "AZURE_OPENAI_API_KEY", "base_url": "AZURE_OPENAI_ENDPOINT"},
+         "google": {"api_key": "GOOGLE_API_KEY"},
+         "deepseek": {"api_key": "DEEPSEEK_API_KEY", "base_url": "DEEPSEEK_ENDPOINT"},
+         "mistral": {"api_key": "MISTRAL_API_KEY", "base_url": "MISTRAL_ENDPOINT"},
+         "alibaba": {"api_key": "ALIBABA_API_KEY", "base_url": "ALIBABA_ENDPOINT"},
+         "moonshot": {"api_key": "MOONSHOT_API_KEY", "base_url": "MOONSHOT_ENDPOINT"},
+         "ibm": {"api_key": "IBM_API_KEY", "base_url": "IBM_ENDPOINT"}
+     }
+
+     if provider in env_mappings and key in env_mappings[provider]:
+         return os.getenv(env_mappings[provider][key], "")
+     return ""
+
+
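+ # Sends a single (optionally multimodal) query to the configured provider and
+ # prints the reply; Ollama models take a plain-string shortcut path.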
+ def test_llm(config, query, image_path=None, system_message=None):
+     from src.utils import llm_provider
+
+     # Special handling for Ollama-based models
+     if config.provider == "ollama":
+         if "deepseek-r1" in config.model_name:
+             from src.utils.llm_provider import DeepSeekR1ChatOllama
+             llm = DeepSeekR1ChatOllama(model=config.model_name)
+         else:
+             llm = ChatOllama(model=config.model_name)
+
+         ai_msg = llm.invoke(query)
+         print(ai_msg.content)
+         if "deepseek-r1" in config.model_name:
+             pdb.set_trace()
+         return
+
+     # For other providers, use the standard configuration
+     llm = llm_provider.get_llm_model(
+         provider=config.provider,
+         model_name=config.model_name,
+         temperature=config.temperature,
+         base_url=config.base_url or get_env_value("base_url", config.provider),
+         api_key=config.api_key or get_env_value("api_key", config.provider)
+     )
+
+     # Prepare messages for non-Ollama models
+     messages = []
+     if system_message:
+         messages.append(SystemMessage(content=create_message_content(system_message)))
+     messages.append(HumanMessage(content=create_message_content(query, image_path)))
+     ai_msg = llm.invoke(messages)
+
+     # Handle different response types
+     if hasattr(ai_msg, "reasoning_content"):
+         print(ai_msg.reasoning_content)
+     print(ai_msg.content)
+
+
+ def test_openai_model():
+     config = LLMConfig(provider="openai", model_name="gpt-4o")
+     test_llm(config, "Describe this image", "assets/examples/test.png")
+
+
+ def test_google_model():
+     # Enable your API key first if you haven't: https://ai.google.dev/palm_docs/oauth_quickstart
+     config = LLMConfig(provider="google", model_name="gemini-2.0-flash-exp")
+     test_llm(config, "Describe this image", "assets/examples/test.png")
+
+
+ def test_azure_openai_model():
+     config = LLMConfig(provider="azure_openai", model_name="gpt-4o")
+     test_llm(config, "Describe this image", "assets/examples/test.png")
+
+
+ def test_deepseek_model():
+     config = LLMConfig(provider="deepseek", model_name="deepseek-chat")
+     test_llm(config, "Who are you?")
+
+
+ def test_deepseek_r1_model():
+     config = LLMConfig(provider="deepseek", model_name="deepseek-reasoner")
+     test_llm(config, "Which is greater, 9.11 or 9.8?", system_message="You are a helpful AI assistant.")
+
+
+ def test_ollama_model():
+     config = LLMConfig(provider="ollama", model_name="qwen2.5:7b")
+     test_llm(config, "Sing a ballad of LangChain.")
+
+
+ def test_deepseek_r1_ollama_model():
+     config = LLMConfig(provider="ollama", model_name="deepseek-r1:14b")
+     test_llm(config, "How many 'r's are in the word 'strawberry'?")
+
+
+ def test_mistral_model():
+     config = LLMConfig(provider="mistral", model_name="pixtral-large-latest")
+     test_llm(config, "Describe this image", "assets/examples/test.png")
+
+
+ def test_moonshot_model():
+     config = LLMConfig(provider="moonshot", model_name="moonshot-v1-32k-vision-preview")
+     test_llm(config, "Describe this image", "assets/examples/test.png")
+
+
+ def test_ibm_model():
+     config = LLMConfig(provider="ibm", model_name="meta-llama/llama-4-maverick-17b-128e-instruct-fp8")
+     test_llm(config, "Describe this image", "assets/examples/test.png")
+
+
+ def test_qwen_model():
+     config = LLMConfig(provider="alibaba", model_name="qwen-vl-max")
+     test_llm(config, "How many 'r's are in the word 'strawberry'?")
+
+
+ if __name__ == "__main__":
+     # test_openai_model()
+     # test_google_model()
+     test_azure_openai_model()
+     # test_deepseek_model()
+     # test_ollama_model()
+     # test_deepseek_r1_model()
+     # test_deepseek_r1_ollama_model()
+     # test_mistral_model()
+     # test_ibm_model()
+     # test_qwen_model()
tests/test_playwright.py ADDED
@@ -0,0 +1,31 @@
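+ # Manual check that Playwright can drive a locally installed browser with an
+ # existing user profile via a persistent context.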
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+
+ def test_connect_browser():
+     import os
+     from playwright.sync_api import sync_playwright
+
+     # Use the same env vars the rest of the project reads (see .env.example).
+     chrome_exe = os.getenv("BROWSER_PATH", "")
+     chrome_user_data = os.getenv("BROWSER_USER_DATA", "")
+
+     with sync_playwright() as p:
+         browser = p.chromium.launch_persistent_context(
+             user_data_dir=chrome_user_data,
+             executable_path=chrome_exe,
+             headless=False  # Keep browser window visible
+         )
+
+         page = browser.new_page()
+         page.goto("https://mail.google.com/mail/u/0/#inbox")
+         page.wait_for_load_state()
+
+         input("Press the Enter key to close the browser...")
+
+         browser.close()
+
+
+ if __name__ == '__main__':
+     test_connect_browser()
webui.py ADDED
@@ -0,0 +1,19 @@
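+ # load_dotenv() runs before importing src.webui so provider keys from .env
+ # are available when the interface module is imported.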
+ from dotenv import load_dotenv
+ load_dotenv()
+ import argparse
+ from src.webui.interface import theme_map, create_ui
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="Gradio WebUI for Browser Agent")
+     parser.add_argument("--ip", type=str, default="127.0.0.1", help="IP address to bind to")
+     parser.add_argument("--port", type=int, default=7788, help="Port to listen on")
+     parser.add_argument("--theme", type=str, default="Ocean", choices=theme_map.keys(), help="Theme to use for the UI")
+     args = parser.parse_args()
+
+     demo = create_ui(theme_name=args.theme)
+     demo.queue().launch(server_name=args.ip, server_port=args.port)
+
+
+ if __name__ == '__main__':
+     main()