aditya9128 commited on
Commit ·
4e3cee0
0
Parent(s):
Initial commit: VDoc-RAG - Intelligent Document Q&A with RAG
Browse files- .env.example +2 -0
- .gitignore +231 -0
- Dockerfile +43 -0
- HR_TESTING_GUIDE.md +63 -0
- README.md +111 -0
- README_HF.md +35 -0
- app/cache_manager.py +46 -0
- app/calibrate.py +121 -0
- app/chart_detect.py +125 -0
- app/chart_reasoner.py +232 -0
- app/debug_chunks.json +182 -0
- app/embeddings.py +30 -0
- app/feedback.json +30 -0
- app/feedback_manager.py +50 -0
- app/highlight_calibration.json +6 -0
- app/indexer.py +131 -0
- app/ingest.py +214 -0
- app/main.py +451 -0
- app/reader.py +145 -0
- app/tables.py +58 -0
- app/templates/benchmark_dashboard.html +81 -0
- app/templates/feedback_dashboard.html +64 -0
- app/templates/index.html +133 -0
- app/utils.py +3 -0
- app/visual_highlight.py +138 -0
- highlight_calibration.json +6 -0
- notebooks/evaluate_embeddings.ipynb +264 -0
- requirements.txt +22 -0
- samples/vdoc_rag_test.pdf +0 -0
- test.py +123 -0
- train_feedback_embeddings.py +104 -0
.env.example
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copy this to .env and fill in your values
|
| 2 |
+
GEMINI_API_KEY=your_gemini_api_key_here
|
.gitignore
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[codz]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
share/python-wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
MANIFEST
|
| 28 |
+
|
| 29 |
+
# PyInstaller
|
| 30 |
+
# Usually these files are written by a python script from a template
|
| 31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 32 |
+
*.manifest
|
| 33 |
+
*.spec
|
| 34 |
+
|
| 35 |
+
# Installer logs
|
| 36 |
+
pip-log.txt
|
| 37 |
+
pip-delete-this-directory.txt
|
| 38 |
+
|
| 39 |
+
# Unit test / coverage reports
|
| 40 |
+
htmlcov/
|
| 41 |
+
.tox/
|
| 42 |
+
.nox/
|
| 43 |
+
.coverage
|
| 44 |
+
.coverage.*
|
| 45 |
+
.cache
|
| 46 |
+
nosetests.xml
|
| 47 |
+
coverage.xml
|
| 48 |
+
*.cover
|
| 49 |
+
*.py.cover
|
| 50 |
+
.hypothesis/
|
| 51 |
+
.pytest_cache/
|
| 52 |
+
cover/
|
| 53 |
+
|
| 54 |
+
# Translations
|
| 55 |
+
*.mo
|
| 56 |
+
*.pot
|
| 57 |
+
|
| 58 |
+
# Django stuff:
|
| 59 |
+
*.log
|
| 60 |
+
local_settings.py
|
| 61 |
+
db.sqlite3
|
| 62 |
+
db.sqlite3-journal
|
| 63 |
+
|
| 64 |
+
# Flask stuff:
|
| 65 |
+
instance/
|
| 66 |
+
.webassets-cache
|
| 67 |
+
|
| 68 |
+
# Scrapy stuff:
|
| 69 |
+
.scrapy
|
| 70 |
+
|
| 71 |
+
# Sphinx documentation
|
| 72 |
+
docs/_build/
|
| 73 |
+
|
| 74 |
+
# PyBuilder
|
| 75 |
+
.pybuilder/
|
| 76 |
+
target/
|
| 77 |
+
|
| 78 |
+
# Jupyter Notebook
|
| 79 |
+
.ipynb_checkpoints
|
| 80 |
+
|
| 81 |
+
# IPython
|
| 82 |
+
profile_default/
|
| 83 |
+
ipython_config.py
|
| 84 |
+
|
| 85 |
+
# pyenv
|
| 86 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 88 |
+
# .python-version
|
| 89 |
+
|
| 90 |
+
# pipenv
|
| 91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 94 |
+
# install all needed dependencies.
|
| 95 |
+
#Pipfile.lock
|
| 96 |
+
|
| 97 |
+
# UV
|
| 98 |
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
| 99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 100 |
+
# commonly ignored for libraries.
|
| 101 |
+
#uv.lock
|
| 102 |
+
|
| 103 |
+
# poetry
|
| 104 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 105 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 106 |
+
# commonly ignored for libraries.
|
| 107 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 108 |
+
#poetry.lock
|
| 109 |
+
#poetry.toml
|
| 110 |
+
|
| 111 |
+
# pdm
|
| 112 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 113 |
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
| 114 |
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
| 115 |
+
#pdm.lock
|
| 116 |
+
#pdm.toml
|
| 117 |
+
.pdm-python
|
| 118 |
+
.pdm-build/
|
| 119 |
+
|
| 120 |
+
# pixi
|
| 121 |
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
| 122 |
+
#pixi.lock
|
| 123 |
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
| 124 |
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
| 125 |
+
.pixi
|
| 126 |
+
|
| 127 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 128 |
+
__pypackages__/
|
| 129 |
+
|
| 130 |
+
# Celery stuff
|
| 131 |
+
celerybeat-schedule
|
| 132 |
+
celerybeat.pid
|
| 133 |
+
|
| 134 |
+
# SageMath parsed files
|
| 135 |
+
*.sage.py
|
| 136 |
+
|
| 137 |
+
# Environments
|
| 138 |
+
.env
|
| 139 |
+
.envrc
|
| 140 |
+
.venv
|
| 141 |
+
env/
|
| 142 |
+
venv/
|
| 143 |
+
ENV/
|
| 144 |
+
env.bak/
|
| 145 |
+
venv.bak/
|
| 146 |
+
|
| 147 |
+
# Spyder project settings
|
| 148 |
+
.spyderproject
|
| 149 |
+
.spyproject
|
| 150 |
+
|
| 151 |
+
# Rope project settings
|
| 152 |
+
.ropeproject
|
| 153 |
+
|
| 154 |
+
# mkdocs documentation
|
| 155 |
+
/site
|
| 156 |
+
|
| 157 |
+
# mypy
|
| 158 |
+
.mypy_cache/
|
| 159 |
+
.dmypy.json
|
| 160 |
+
dmypy.json
|
| 161 |
+
|
| 162 |
+
# Pyre type checker
|
| 163 |
+
.pyre/
|
| 164 |
+
|
| 165 |
+
# pytype static type analyzer
|
| 166 |
+
.pytype/
|
| 167 |
+
|
| 168 |
+
# Cython debug symbols
|
| 169 |
+
cython_debug/
|
| 170 |
+
|
| 171 |
+
# PyCharm
|
| 172 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 173 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 174 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 175 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 176 |
+
#.idea/
|
| 177 |
+
|
| 178 |
+
# Abstra
|
| 179 |
+
# Abstra is an AI-powered process automation framework.
|
| 180 |
+
# Ignore directories containing user credentials, local state, and settings.
|
| 181 |
+
# Learn more at https://abstra.io/docs
|
| 182 |
+
.abstra/
|
| 183 |
+
|
| 184 |
+
# Visual Studio Code
|
| 185 |
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
| 186 |
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
| 187 |
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
| 188 |
+
# you could uncomment the following to ignore the entire vscode folder
|
| 189 |
+
# .vscode/
|
| 190 |
+
|
| 191 |
+
# Ruff stuff:
|
| 192 |
+
.ruff_cache/
|
| 193 |
+
|
| 194 |
+
# PyPI configuration file
|
| 195 |
+
.pypirc
|
| 196 |
+
|
| 197 |
+
# Cursor
|
| 198 |
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
| 199 |
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
| 200 |
+
# refer to https://docs.cursor.com/context/ignore-files
|
| 201 |
+
.cursorignore
|
| 202 |
+
.cursorindexingignore
|
| 203 |
+
|
| 204 |
+
# Marimo
|
| 205 |
+
marimo/_static/
|
| 206 |
+
marimo/_lsp/
|
| 207 |
+
__marimo__/
|
| 208 |
+
|
| 209 |
+
# =========================
|
| 210 |
+
# VDoc-RAG Project Specific
|
| 211 |
+
# =========================
|
| 212 |
+
# Runtime/Generated directories
# Ignore the *contents* of each directory (trailing /*) rather than the
# directory itself: git cannot re-include a file whose parent directory is
# excluded, so the .gitkeep negations below would otherwise have no effect.
app/cache/*
app/uploads/*
app/storage/*
app/charts/*
app/highlighted/*
app/tmp/*
app/tables/*
storage/

# Keep directory structure with .gitkeep
!app/cache/.gitkeep
!app/uploads/.gitkeep
!app/storage/.gitkeep
!app/charts/.gitkeep
!app/highlighted/.gitkeep
!app/tmp/.gitkeep
!app/tables/.gitkeep
|
| 230 |
+
|
| 231 |
+
*.png
|
Dockerfile
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim

# Install system dependencies.
# libgl1 is used instead of libgl1-mesa-glx: the latter is only a transitional
# package and is not available on newer Debian bases, which makes
# `apt-get install` fail. libgl1 provides the libGL.so.1 that OpenCV needs.
RUN apt-get update && apt-get install -y \
    tesseract-ocr \
    poppler-utils \
    libgl1 \
    libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Copy requirements first for better caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application
COPY . .

# Create necessary directories with proper permissions for HF Spaces
RUN mkdir -p /app/app/storage/chroma_db \
    /app/app/uploads \
    /app/app/tmp \
    /app/app/highlighted \
    /app/app/charts \
    /app/app/tables \
    /app/app/cache \
    && chmod -R 777 /app/app/storage \
    && chmod -R 777 /app/app/uploads \
    && chmod -R 777 /app/app/tmp \
    && chmod -R 777 /app/app/highlighted \
    && chmod -R 777 /app/app/charts \
    && chmod -R 777 /app/app/tables \
    && chmod -R 777 /app/app/cache

# Expose port (HF Spaces uses 7860)
EXPOSE 7860

# Run the application
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
HR_TESTING_GUIDE.md
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# VDoc-RAG Demo Testing Guide
|
| 2 |
+
|
| 3 |
+
## What is this?
|
| 4 |
+
VDoc-RAG is an AI-powered document Q&A system that can:
|
| 5 |
+
- Extract text from PDFs (including charts and tables)
|
| 6 |
+
- Answer questions about uploaded documents
|
| 7 |
+
- Show confidence scores and source attribution
|
| 8 |
+
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
## How to Test
|
| 12 |
+
|
| 13 |
+
### 1. Open the App
|
| 14 |
+
Click the link provided: `[YOUR_NGROK_URL]`
|
| 15 |
+
|
| 16 |
+
### 2. Upload a Document
|
| 17 |
+
- Click **"Choose File"** and select any PDF
|
| 18 |
+
- Click **"Upload & Index"**
|
| 19 |
+
- Wait for: `✅ Uploaded and indexed [filename] (X chunks)`
|
| 20 |
+
|
| 21 |
+
### 3. Ask Questions
|
| 22 |
+
Try these example questions:
|
| 23 |
+
- "What is this document about?"
|
| 24 |
+
- "Summarize the main points"
|
| 25 |
+
- "What are the key dates mentioned?"
|
| 26 |
+
- "Describe any charts or graphs"
|
| 27 |
+
|
| 28 |
+
### 4. Review the Response
|
| 29 |
+
You'll see:
|
| 30 |
+
- **Answer**: AI-generated response
|
| 31 |
+
- **Sources**: Which parts of the document were used
|
| 32 |
+
- **Confidence Score**: How relevant the retrieved content is
|
| 33 |
+
|
| 34 |
+
### 5. Provide Feedback
|
| 35 |
+
Click **✅ Correct** or **❌ Incorrect** to rate the answer
|
| 36 |
+
|
| 37 |
+
---
|
| 38 |
+
|
| 39 |
+
## Additional Features to Explore
|
| 40 |
+
|
| 41 |
+
| Page | What it Shows |
|
| 42 |
+
|------|---------------|
|
| 43 |
+
| `/feedback_dashboard` | Feedback statistics and model fine-tuning |
|
| 44 |
+
| `/benchmark_dashboard` | Embedding model evaluation metrics |
|
| 45 |
+
|
| 46 |
+
---
|
| 47 |
+
|
| 48 |
+
## Technical Highlights
|
| 49 |
+
|
| 50 |
+
- **RAG Pipeline**: Retrieval-Augmented Generation with ChromaDB
|
| 51 |
+
- **OCR**: Tesseract for text extraction from images/PDFs
|
| 52 |
+
- **Embeddings**: Sentence-transformers (all-MiniLM-L6-v2)
|
| 53 |
+
- **LLM**: Google Gemini for answer generation
|
| 54 |
+
- **Chart Detection**: CLIP + OpenCV for visual understanding
|
| 55 |
+
|
| 56 |
+
---
|
| 57 |
+
|
| 58 |
+
## Sample Test PDF
|
| 59 |
+
A sample document is pre-loaded. Upload your own PDF to test with real documents!
|
| 60 |
+
|
| 61 |
+
---
|
| 62 |
+
|
| 63 |
+
*Built with FastAPI, ChromaDB, Sentence-Transformers, and Google Gemini*
|
README.md
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 📄 VDoc-RAG (Visually-Rich Document Retrieval-Augmented Generation)
|
| 2 |
+
|
| 3 |
+
VDoc-RAG is an advanced multimodal system that answers questions from visually-rich documents (PDFs, reports, flyers) by combining OCR, table and chart reasoning, semantic embeddings, and LLMs.
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## 🚀 Features
|
| 8 |
+
|
| 9 |
+
- 🧠 **RAG Pipeline** with persistent ChromaDB
|
| 10 |
+
- 🪄 **OCR + Table + Chart understanding**
|
| 11 |
+
- 📊 **Chart Reasoning** (Pix2Struct + OCR-based)
|
| 12 |
+
- 🔐 **Environment-based API key handling**
|
| 13 |
+
- 🧮 **Confidence Scoring** via cosine similarity
|
| 14 |
+
- 🧾 **Feedback Loop** for self-improving embeddings
|
| 15 |
+
- 📈 **Benchmark Dashboard** for evaluating embedding models
|
| 16 |
+
- 💾 **Persistent Storage** (DuckDB + Parquet backend)
|
| 17 |
+
|
| 18 |
+
---
|
| 19 |
+
|
| 20 |
+
## ⚙️ Quickstart (Windows)
|
| 21 |
+
|
| 22 |
+
### 1️⃣ Install Dependencies
|
| 23 |
+
|
| 24 |
+
Install:
|
| 25 |
+
- **Tesseract OCR** → [Tesseract Wiki](https://github.com/UB-Mannheim/tesseract/wiki)
|
| 26 |
+
- **Poppler for Windows** → [Poppler Releases](https://github.com/oschwartz10612/poppler-windows/releases)
|
| 27 |
+
|
| 28 |
+
Add both to your system PATH.
|
| 29 |
+
|
| 30 |
+
### 2️⃣ Create Virtual Environment
|
| 31 |
+
```bash
|
| 32 |
+
python -m venv venv
|
| 33 |
+
venv\Scripts\activate
|
| 34 |
+
pip install -r requirements.txt
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
### 3️⃣ Run the App
|
| 38 |
+
```bash
|
| 39 |
+
uvicorn app.main:app --reload --port 8000
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
Open → [http://127.0.0.1:8000](http://127.0.0.1:8000)
|
| 43 |
+
|
| 44 |
+
---
|
| 45 |
+
|
| 46 |
+
## 🖥️ Web Interfaces
|
| 47 |
+
|
| 48 |
+
| Route | Page | Description |
|
| 49 |
+
|------|-------|--------------|
|
| 50 |
+
| `/` | Main Interface | Upload, query, visualize highlights |
|
| 51 |
+
| `/feedback_dashboard` | Feedback Loop | View stats, fine-tune model |
|
| 52 |
+
| `/benchmark_dashboard` | Benchmarking | Evaluate embeddings (Precision/Recall/MRR) |
|
| 53 |
+
|
| 54 |
+
---
|
| 55 |
+
|
| 56 |
+
## 📁 Project Structure
|
| 57 |
+
|
| 58 |
+
```
|
| 59 |
+
vdoc-rag-mvp/
|
| 60 |
+
├─ app/
|
| 61 |
+
│ ├─ ingest.py # OCR, table & chart extraction
|
| 62 |
+
│ ├─ chart_reasoner.py # Chart summarization and trend detection
|
| 63 |
+
│ ├─ indexer.py # Persistent ChromaDB retrieval
|
| 64 |
+
│ ├─ reader.py # LLM question answering
|
| 65 |
+
│ ├─ feedback_manager.py # Feedback collection system
|
| 66 |
+
│ ├─ main.py # FastAPI server + dashboards
|
| 67 |
+
│ └─ visual_highlight.py # Highlight relevant regions
|
| 68 |
+
│
|
| 69 |
+
├─ models/vdoc_feedback_tuned/ # Fine-tuned embedding model
|
| 70 |
+
├─ storage/chroma_db/ # Persistent vector store
|
| 71 |
+
├─ notebooks/evaluate_embeddings.ipynb # Benchmarking notebook
|
| 72 |
+
└─ app/templates/ # HTML UIs (main, feedback, benchmark)
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
---
|
| 76 |
+
|
| 77 |
+
## 🧠 Models Used
|
| 78 |
+
|
| 79 |
+
| Type | Model | Purpose |
|
| 80 |
+
|------|--------|----------|
|
| 81 |
+
| Embedding | `all-MiniLM-L6-v2` (base), `multi-qa-MiniLM`, feedback-tuned variant | Semantic encoding |
|
| 82 |
+
| LLM Reader | Gemini / DistilGPT2 | Context-based answering |
|
| 83 |
+
| Chart Reasoning | Pix2Struct / OCR fallback | Visual trend analysis |
|
| 84 |
+
| Vector Store | ChromaDB (DuckDB + Parquet) | Persistent retrieval |
|
| 85 |
+
| Fine-tuning | SentenceTransformer + CosineLoss | Feedback-based learning |
|
| 86 |
+
|
| 87 |
+
---
|
| 88 |
+
|
| 89 |
+
## 🧩 Evaluation
|
| 90 |
+
|
| 91 |
+
- **Confidence Scoring**: cosine similarity between query & chunks
|
| 92 |
+
- **Precision / Recall / MRR**: benchmark dashboards & notebook
|
| 93 |
+
- **Feedback-driven fine-tuning**: iterative model improvement
|
| 94 |
+
|
| 95 |
+
---
|
| 96 |
+
|
| 97 |
+
## 🧠 Author’s Note
|
| 98 |
+
|
| 99 |
+
VDoc-RAG demonstrates how retrieval-augmented generation can evolve from plain text retrieval into **visually grounded document reasoning**, enabling future systems that can read, reason, and learn continuously.
|
| 100 |
+
|
| 101 |
+
---
|
| 102 |
+
|
| 103 |
+
**Developed as a full multimodal RAG research framework** — suitable for academic reports, enterprise document intelligence, and AI reasoning pipelines.
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
Tesseract
|
| 107 |
+
pix2struct
|
| 108 |
+
|
| 109 |
+
sentence transformer
|
| 110 |
+
MiniLM-L6-v2.
|
| 111 |
+
Gemini API
|
README_HF.md
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: VDoc-RAG
|
| 3 |
+
emoji: 📄
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
license: mit
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# 📄 VDoc-RAG (Visually-Rich Document RAG)
|
| 12 |
+
|
| 13 |
+
An AI-powered document Q&A system that answers questions from PDFs with charts, tables, and images.
|
| 14 |
+
|
| 15 |
+
## Features
|
| 16 |
+
|
| 17 |
+
- 🧠 **RAG Pipeline** with ChromaDB vector store
|
| 18 |
+
- 📊 **Chart & Table Understanding** via OCR
|
| 19 |
+
- 🔐 **Gemini LLM** for answer generation
|
| 20 |
+
- 🧮 **Confidence Scoring** via cosine similarity
|
| 21 |
+
- 🧾 **Feedback Loop** for improvement
|
| 22 |
+
|
| 23 |
+
## How to Use
|
| 24 |
+
|
| 25 |
+
1. Upload a PDF document
|
| 26 |
+
2. Ask questions about the content
|
| 27 |
+
3. Get AI-generated answers with sources
|
| 28 |
+
|
| 29 |
+
## Tech Stack
|
| 30 |
+
|
| 31 |
+
- FastAPI + Uvicorn
|
| 32 |
+
- Sentence-Transformers (all-MiniLM-L6-v2)
|
| 33 |
+
- ChromaDB
|
| 34 |
+
- Google Gemini
|
| 35 |
+
- Tesseract OCR
|
app/cache_manager.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import hashlib
|
| 3 |
+
import json
|
| 4 |
+
import shutil
|
| 5 |
+
|
| 6 |
+
CACHE_DIR = os.path.join(os.path.dirname(__file__), "cache")
|
| 7 |
+
os.makedirs(CACHE_DIR, exist_ok=True)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def _hash_file(path: str) -> str:
    """Return the hex-encoded SHA-256 digest of the file at ``path``.

    The file is opened in binary mode and fed to the hash in 8192-byte
    reads, so the whole file is never held in memory at once.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for piece in iter(lambda: stream.read(8192), b""):
            digest.update(piece)
    return digest.hexdigest()
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def get_cache_path(pdf_path: str) -> str:
    """Return the cache file path for the PDF at ``pdf_path``.

    The file name is the SHA-256 digest of the PDF's contents (as computed
    by ``_hash_file``) with a ``.json`` suffix, placed inside ``CACHE_DIR``.
    """
    cache_name = f"{_hash_file(pdf_path)}.json"
    return os.path.join(CACHE_DIR, cache_name)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def save_chunks_to_cache(pdf_path: str, chunks) -> str:
    """Write ``chunks`` as JSON to the cache entry for ``pdf_path``.

    Args:
        pdf_path: Path of the source PDF; its content hash names the entry.
        chunks: JSON-serializable data to store.

    Returns:
        The path of the cache file that was written.

    Raises:
        TypeError: If ``chunks`` is not JSON-serializable. Nothing is written
            in that case.
    """
    path = get_cache_path(pdf_path)
    # Serialize before opening the file: opening with "w" truncates it, so a
    # serialization failure mid-dump would otherwise leave a truncated or
    # partial cache entry behind.
    payload = json.dumps(chunks, indent=2, ensure_ascii=False)
    with open(path, "w", encoding="utf-8") as f:
        f.write(payload)
    return path
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def load_chunks_from_cache(pdf_path: str):
    """Return the cached chunks for ``pdf_path``, or ``None`` on a cache miss.

    A missing cache file and a cache file that does not contain valid JSON
    are both treated as a miss.

    Args:
        pdf_path: Path of the source PDF; its content hash names the entry.

    Returns:
        The deserialized JSON content of the cache entry, or ``None``.
    """
    # Computed outside the try block so that an unreadable *PDF* still raises
    # instead of being mistaken for a cache miss.
    path = get_cache_path(pdf_path)
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        # Open-and-catch avoids the gap between an exists() check and open()
        # during which the entry could be removed.
        return None
    except json.JSONDecodeError:
        # Corrupt or partially written entry: report a miss rather than crash.
        return None
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def clear_cache() -> bool:
    """Remove the whole cache directory tree, then recreate it empty.

    Always returns ``True``.
    """
    # Best effort: errors raised while deleting are ignored.
    shutil.rmtree(CACHE_DIR, ignore_errors=True)
    # Recreate the directory so later cache writes have somewhere to go.
    os.makedirs(CACHE_DIR, exist_ok=True)
    return True
|
app/calibrate.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz
|
| 2 |
+
import matplotlib.pyplot as plt
|
| 3 |
+
from matplotlib.widgets import Slider, Button
|
| 4 |
+
from PIL import Image
|
| 5 |
+
import io
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
# PDF used for calibration, and the JSON file the chosen values are saved to.
pdf_path = "samples/vdoc_rag_test.pdf"
config_path = "highlight_calibration.json"

# Example hits: one text bounding box on each of the first two pages.
hits = [
    {"metadata": {"page": 1, "bbox": [87, 1926, 775, 1957], "type": "text"}},
    {"metadata": {"page": 2, "bbox": [87, 222, 592, 250], "type": "text"}},
]

# Open the PDF and rasterize its first two pages at 150 DPI.
doc = fitz.open(pdf_path)
pix1, pix2 = (doc[page_no].get_pixmap(dpi=150) for page_no in (0, 1))

# Convert each pixmap to a PIL image via an in-memory PNG.
img1, img2 = (Image.open(io.BytesIO(pix.tobytes("png"))) for pix in (pix1, pix2))

# One figure with the two pages side by side; the bottom quarter is left
# free for the sliders.
fig, axes = plt.subplots(1, 2, figsize=(16, 10))
plt.subplots_adjust(bottom=0.25)
for ax, page_img, title in zip(axes, (img1, img2), ("Page 1", "Page 2")):
    ax.imshow(page_img)
    ax.set_title(title, fontsize=12)
    ax.axis("off")

# Keep reference sizes
img1_w, img1_h = img1.size
img2_w, img2_h = img2.size

# Draw one translucent red rectangle per hit on the axes of its page.
# Hits on any page other than 1 or 2 are not drawn.
rects_page1, rects_page2 = [], []
page_slots = {0: (rects_page1, axes[0]), 1: (rects_page2, axes[1])}
for h in hits:
    meta = h["metadata"]
    page_idx = meta["page"] - 1  # hit pages are 1-based
    x0, y0, x1, y1 = map(float, meta["bbox"])
    rect = plt.Rectangle(
        (x0, y0),
        x1 - x0,
        y1 - y0,
        linewidth=2,
        edgecolor='r',
        facecolor='r',
        alpha=0.4,
    )
    slot = page_slots.get(page_idx)
    if slot is not None:
        slot[0].append(rect)
        slot[1].add_patch(rect)

# Shared sliders (applied to both pages) and the save button.
axcolor = 'lightgoldenrodyellow'
ax_x_offset = plt.axes([0.25, 0.12, 0.65, 0.03], facecolor=axcolor)
ax_x_scale = plt.axes([0.25, 0.09, 0.65, 0.03], facecolor=axcolor)
ax_y_offset = plt.axes([0.25, 0.06, 0.65, 0.03], facecolor=axcolor)
ax_y_scale = plt.axes([0.25, 0.03, 0.65, 0.03], facecolor=axcolor)
ax_save = plt.axes([0.85, 0.17, 0.10, 0.04])

slider_x_offset = Slider(ax_x_offset, 'X Offset', -500, 500, valinit=0, valstep=0.5)
slider_x_scale = Slider(ax_x_scale, 'X Scale', 0.3, 2.0, valinit=1.0, valstep=0.002)
slider_y_offset = Slider(ax_y_offset, 'Y Offset', -1500, 1500, valinit=0, valstep=0.5)
slider_y_scale = Slider(ax_y_scale, 'Y Scale', 0.3, 2.0, valinit=1.0, valstep=0.002)
btn_save = Button(ax_save, '💾 Save', color=axcolor, hovercolor='0.9')
|
| 70 |
+
|
| 71 |
+
def update(val):
    """Re-position every highlight rectangle from the current slider values.

    Used as the ``on_changed`` callback of all four sliders. ``val`` (the
    value passed by the slider that changed) is ignored; the four slider
    values are re-read so the rectangles always reflect the full state.

    Each bbox coordinate is transformed as ``coord * scale + offset``, and
    the current values are shown in the figure title.
    """
    xo, xs = slider_x_offset.val, slider_x_scale.val
    yo, ys = slider_y_offset.val, slider_y_scale.val

    for page_no, rects in ((1, rects_page1), (2, rects_page2)):
        # The rectangle lists were filled in `hits` order, so pairing them
        # with the hits of the same page gives each rectangle its own bbox
        # (instead of always reusing the bbox of hits[0] / hits[1]).
        page_hits = [hit for hit in hits if hit["metadata"]["page"] == page_no]
        for rect, hit in zip(rects, page_hits):
            x0, y0, x1, y1 = [float(v) for v in hit["metadata"]["bbox"]]
            x0 = x0 * xs + xo
            x1 = x1 * xs + xo
            y0 = y0 * ys + yo
            y1 = y1 * ys + yo
            # Anchored at (x0, y1) with a negative height: this covers the
            # same region as anchoring at (x0, y0) with a positive height.
            rect.set_xy((x0, y1))
            rect.set_width(x1 - x0)
            rect.set_height(y0 - y1)

    fig.suptitle(
        f"Xo={xo:.1f}, Xs={xs:.3f} | Yo={yo:.1f}, Ys={ys:.3f}",
        fontsize=11, color='darkred'
    )
    fig.canvas.draw_idle()


# Any slider change triggers a full redraw of the rectangles.
for s in [slider_x_offset, slider_x_scale, slider_y_offset, slider_y_scale]:
    s.on_changed(update)
|
| 107 |
+
|
| 108 |
+
def save_values(event):
    """Write the current slider values to ``config_path`` as JSON.

    Used as the click callback of the save button; ``event`` is ignored.
    """
    calib = {
        "x_offset": slider_x_offset.val,
        "x_scale": slider_x_scale.val,
        "y_offset": slider_y_offset.val,
        "y_scale": slider_y_scale.val,
    }
    with open(config_path, "w") as out:
        json.dump(calib, out, indent=2)
    print(f"✅ Saved combined calibration: {calib}")


btn_save.on_clicked(save_values)

# Blocks until the calibration window is closed.
plt.show()
|
app/chart_detect.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app/chart_detect.py
|
| 2 |
+
import cv2
|
| 3 |
+
import os
|
| 4 |
+
import uuid
|
| 5 |
+
import numpy as np
|
| 6 |
+
from PIL import Image
|
| 7 |
+
import matplotlib.pyplot as plt
|
| 8 |
+
|
| 9 |
+
# 🗂️ Ensure charts dir exists inside project
|
| 10 |
+
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 11 |
+
CHARTS_DIR = os.path.join(BASE_DIR, "charts")
|
| 12 |
+
os.makedirs(CHARTS_DIR, exist_ok=True)
|
| 13 |
+
|
| 14 |
+
def _ensure_bgr(img_or_path):
    """
    Accept file path, PIL.Image, or ndarray → return OpenCV BGR ndarray.

    Args:
        img_or_path: A ``str`` path readable by ``cv2.imread``, a
            ``PIL.Image.Image``, or a ``numpy.ndarray``.

    Returns:
        The image as an ndarray. An ndarray input is returned unchanged
        (it is assumed to already be in BGR order; this is not verified).

    Raises:
        ValueError: If the path cannot be read by ``cv2.imread`` or the
            argument is of an unsupported type.
    """
    if isinstance(img_or_path, np.ndarray):
        # Return the argument itself; the previous code returned an
        # undefined local here and raised UnboundLocalError.
        return img_or_path
    if isinstance(img_or_path, str):
        img = cv2.imread(img_or_path)
        if img is None:
            raise ValueError(f"[chart_detect] cv2.imread failed: {img_or_path}")
        return img
    if isinstance(img_or_path, Image.Image):
        # Normalize to 3-channel RGB first so grayscale / RGBA / palette
        # images do not break the RGB→BGR conversion.
        rgb = np.array(img_or_path.convert("RGB"))
        return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
    raise ValueError("[chart_detect] Unsupported image type.")
|
| 28 |
+
|
| 29 |
+
def detect_charts(image_or_path, min_area=15000, debug=False, visualize=False):
    """
    Detect chart-like rectangular regions in a page image.

    Pipeline: grayscale -> histogram equalization -> Gaussian blur -> Canny
    edges -> external contours. Each contour's bounding box is filtered by
    area, page coverage and aspect ratio; boxes whose top-left corners lie
    within 50 px of each other are merged; every surviving box is then
    padded, cropped and written as a PNG into CHARTS_DIR.

    Args:
        image_or_path: Path, PIL.Image or ndarray (anything _ensure_bgr accepts).
        min_area: Nominal minimum box area in pixels; boxes smaller than
            half of this value are discarded.
        debug: When True, print contour / detection counts.
        visualize: When True, show the detections in a matplotlib figure.

    Returns:
        list[dict]: items of the form
        {"bbox": (x0, y0, x1, y1), "image_path": "<abs path>"}, sorted by
        bbox area (largest first). An empty list is returned when the image
        cannot be loaded. Boxes whose crop could not be written are omitted.
    """
    try:
        img = _ensure_bgr(image_or_path)
    except Exception as e:
        print("[chart_detect] load error:", e)
        return []

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    gray = cv2.equalizeHist(gray)
    blur = cv2.GaussianBlur(gray, (5, 5), 0)

    # Canny edge detection — lowered thresholds for faint edges
    edges = cv2.Canny(blur, 30, 100)
    contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    h_img, w_img = img.shape[:2]

    if debug:
        print(f"[chart_detect] Found {len(contours)} raw contours")

    # Pass 1: filter candidates and merge neighbours BEFORE anything is
    # cropped. Previously a crop was saved as soon as a box was accepted and
    # later merges only widened the stored bbox, so the saved image no longer
    # matched the bbox that was returned.
    boxes = []  # unpadded (x0, y0, x1, y1)
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        area = w * h
        aspect = w / (h + 1e-8)

        # Deliberately forgiving filters
        if area < min_area * 0.5:
            continue
        # Skip boxes that span (almost) the whole page — likely the page border.
        if w > 0.98 * w_img or h > 0.98 * h_img:
            continue
        if not (0.1 < aspect < 10.0):
            continue

        for i, (bx0, by0, bx1, by1) in enumerate(boxes):
            # Top-left corners close enough: treat as the same region.
            if abs(x - bx0) < 50 and abs(y - by0) < 50:
                boxes[i] = (min(bx0, x), min(by0, y), max(bx1, x + w), max(by1, y + h))
                break
        else:
            boxes.append((x, y, x + w, y + h))

    # Pass 2: pad, crop and save each final box.
    charts = []
    for bx0, by0, bx1, by1 in boxes:
        # Slight padding: 10% of the box size, capped at 40 px, clamped to the page.
        pad_x = int(min(0.1 * (bx1 - bx0), 40))
        pad_y = int(min(0.1 * (by1 - by0), 40))
        x0 = max(0, bx0 - pad_x)
        y0 = max(0, by0 - pad_y)
        x1 = min(w_img, bx1 + pad_x)
        y1 = min(h_img, by1 + pad_y)

        crop = img[y0:y1, x0:x1]
        crop_name = f"chart_{uuid.uuid4().hex}.png"
        crop_path = os.path.join(CHARTS_DIR, crop_name)

        try:
            saved = cv2.imwrite(crop_path, crop)
        except Exception as e:
            print(f"[chart_detect] Failed saving {crop_path}: {e}")
            continue
        # cv2.imwrite signals most failures by returning False, not by raising.
        if not saved:
            print(f"[chart_detect] Failed saving {crop_path}: cv2.imwrite returned False")
            continue
        charts.append({"bbox": (x0, y0, x1, y1), "image_path": crop_path})

    # Sort by size (largest first)
    charts.sort(key=lambda c: (c["bbox"][2] - c["bbox"][0]) * (c["bbox"][3] - c["bbox"][1]), reverse=True)

    if debug:
        print(f"[chart_detect] ✅ Detected {len(charts)} likely chart(s). Saved to {CHARTS_DIR}")

    # Optional: visualize results
    if visualize:
        vis = img.copy()
        for c in charts:
            x0, y0, x1, y1 = c["bbox"]
            cv2.rectangle(vis, (x0, y0), (x1, y1), (0, 255, 0), 3)
        plt.figure(figsize=(12, 10))
        plt.imshow(cv2.cvtColor(vis, cv2.COLOR_BGR2RGB))
        plt.title(f"Detected {len(charts)} chart(s)")
        plt.axis("off")
        plt.show()

    return charts
|
| 119 |
+
|
| 120 |
+
# Manual debug run
|
| 121 |
+
if __name__ == "__main__":
    # Ad-hoc smoke run: detect charts on a sample page, show the overlay,
    # and print each detection record.
    sample_page = "samples/vdoc_rag_test_page1.png"  # example path
    for detection in detect_charts(sample_page, debug=True, visualize=True):
        print(detection)
|
app/chart_reasoner.py
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
import os
import re
from typing import Any, Dict, List, Optional

import cv2
import numpy as np
import pytesseract
from PIL import Image

# Optional Pix2Struct captioning.
# NOTE: this setup used to appear twice back to back, which loaded the
# processor and model twice at import time; it is now performed once.
USE_PIX2STRUCT = False
try:
    from transformers import AutoProcessor, AutoModelForVision2Seq

    _pix2_processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
    _pix2_model = AutoModelForVision2Seq.from_pretrained("google/pix2struct-textcaps-base")
    USE_PIX2STRUCT = True
    print("[chart_reasoner] Pix2Struct/TextCaps available for chart captioning.")
except Exception:
    # Best effort: any import/download/load failure just disables captioning.
    USE_PIX2STRUCT = False
    print("[chart_reasoner] Pix2Struct/TextCaps not available — will use OCR/geometric fallback.")

# Optional CLIP embeddings via sentence-transformers
USE_CLIP = False
try:
    from sentence_transformers import SentenceTransformer

    _clip_model = SentenceTransformer("clip-ViT-B-32")
    USE_CLIP = True
    print("[chart_reasoner] CLIP (sentence-transformers) available for chart embeddings.")
except Exception:
    # Best effort: without CLIP no "clip_vector" is attached to chart results.
    USE_CLIP = False
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def preprocess_for_ocr(image_path: str) -> Image.Image:
    """Load an image from disk and binarize it so text inside colored charts OCRs better.

    The image is converted to grayscale, histogram-equalized, and then
    passed through a mean adaptive threshold (block size 21, constant 10).

    Raises:
        ValueError: if cv2.imread cannot read ``image_path``.
    """
    bgr = cv2.imread(image_path)
    if bgr is None:
        raise ValueError(f"Could not read image: {image_path}")

    equalized = cv2.equalizeHist(cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY))
    # Adaptive (local) thresholding copes with uneven chart backgrounds.
    binary = cv2.adaptiveThreshold(
        equalized,
        255,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        21,
        10,
    )
    return Image.fromarray(binary)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def _extract_numbers_from_text(text: str) -> List[float]:
|
| 70 |
+
matches = re.findall(r"\(?-?\d[\d,\.\)\(]*%?", text)
|
| 71 |
+
nums: List[float] = []
|
| 72 |
+
for m in matches:
|
| 73 |
+
s = m.strip()
|
| 74 |
+
negative = False
|
| 75 |
+
if s.startswith("(") and s.endswith(")"):
|
| 76 |
+
negative = True
|
| 77 |
+
s = s[1:-1]
|
| 78 |
+
s = s.replace("%", "").replace(",", "")
|
| 79 |
+
try:
|
| 80 |
+
val = float(s)
|
| 81 |
+
if negative:
|
| 82 |
+
val = -val
|
| 83 |
+
nums.append(val)
|
| 84 |
+
except Exception:
|
| 85 |
+
continue
|
| 86 |
+
return nums
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def analyze_bar_chart(image_path: str, debug_save: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Find vertical bar-like shapes in a chart image and derive a coarse trend.

    Contours of the Canny edge map are reduced to bounding boxes; a box is
    kept as a bar when it is at least 6 px wide, at least 10 px tall, at
    least 1.2x taller than wide, and no taller than 90% of the image. The
    trend compares the height of the right-most bar with the left-most one.

    Args:
        image_path: Path of the chart image.
        debug_save: Optional path; when given, an overlay image with the
            detected bars outlined is written there (failures are ignored).

    Returns:
        None when the image cannot be read or no bar-like box is found;
        otherwise a dict with "bar_count", "heights", "normalized_heights",
        "trend" and "bars_xywh" (bars ordered left to right).
    """
    image = cv2.imread(image_path)
    if image is None:
        return None

    smoothed = cv2.GaussianBlur(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY), (5, 5), 0)
    edge_map = cv2.Canny(smoothed, 50, 150)
    found, _ = cv2.findContours(edge_map, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    page_height = image.shape[0]

    candidates = []
    for contour in found:
        bx, by, bw, bh = cv2.boundingRect(contour)
        too_small = bw < 6 or bh < 10
        if too_small:
            continue
        # Vertical bar heuristic: clearly taller than wide.
        if bh / (bw + 1e-8) < 1.2:
            continue
        # Boxes spanning nearly the full height are likely a frame/border.
        if bh > 0.9 * page_height:
            continue
        candidates.append((bx, by, bw, bh))

    if not candidates:
        return None

    # Order bars left to right so first/last correspond to the x-axis ends.
    bars = sorted(candidates, key=lambda rect: rect[0])
    heights = [int(rect[3]) for rect in bars]
    tallest = max(heights)
    normalized = [value / tallest for value in heights]

    first, last = heights[0], heights[-1]
    if last > first:
        trend = "increasing"
    elif last < first:
        trend = "decreasing"
    else:
        trend = "flat"

    result = {
        "bar_count": len(bars),
        "heights": heights,
        "normalized_heights": normalized,
        "trend": trend,
        "bars_xywh": bars,
    }

    # Optional debug overlay; best effort, never allowed to fail the analysis.
    try:
        if debug_save:
            overlay = image.copy()
            for (bx, by, bw, bh) in bars:
                cv2.rectangle(overlay, (bx, by), (bx + bw, by + bh), (0, 255, 0), 2)
            cv2.imwrite(debug_save, overlay)
    except Exception:
        pass

    return result
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def process_chart_crop(image_path: str) -> Dict[str, Any]:
    """Build a text summary plus structured data for one cropped chart image.

    Stages, each best effort: optional Pix2Struct caption, geometric bar
    analysis, OCR (preprocessed first, raw image as fallback), number
    extraction from the OCR text, and an optional CLIP embedding of the
    summary.

    Returns:
        {"summary_text": str, "structured": dict}. When the file is missing
        or both OCR attempts fail, ``summary_text`` is an "[Error] ..."
        message and ``structured`` is empty.
    """
    if not os.path.exists(image_path):
        return {"summary_text": f"[Error] Chart image not found: {image_path}", "structured": {}}

    # --- Stage 1: optional model caption --------------------------------
    caption = None
    if USE_PIX2STRUCT:
        try:
            rgb = Image.open(image_path).convert("RGB")
            model_inputs = _pix2_processor(images=rgb, text="Describe this chart.", return_tensors="pt")
            generated = _pix2_model.generate(**model_inputs, max_new_tokens=128)
            try:
                caption = _pix2_processor.decode(generated[0], skip_special_tokens=True)
            except Exception:
                # Fall back to a plain tokenizer if the processor cannot decode.
                from transformers import AutoTokenizer

                fallback_tokenizer = AutoTokenizer.from_pretrained("google/pix2struct-textcaps-base")
                caption = fallback_tokenizer.decode(generated[0], skip_special_tokens=True)
        except Exception as e:
            print("[chart_reasoner] Pix2Struct failed:", e)
            caption = None

    # --- Stage 2: geometric analysis (bars) -----------------------------
    bars = None
    try:
        overlay_path = None
        # Overlay directory: VDOCRAG_CHARTS_DIR if set, else "charts" next to
        # this module; an overlay is only requested when that directory exists.
        overlay_dir = os.environ.get("VDOCRAG_CHARTS_DIR", os.path.join(os.path.dirname(__file__), "charts"))
        if os.path.isdir(overlay_dir):
            overlay_path = os.path.join(overlay_dir, f"debug_{os.path.basename(image_path)}")
        bars = analyze_bar_chart(image_path, debug_save=overlay_path)
    except Exception as e:
        print("[chart_reasoner] analyze_bar_chart error:", e)
        bars = None

    # --- Stage 3: OCR for axis labels / numbers -------------------------
    ocr_text = ""
    try:
        ocr_text = pytesseract.image_to_string(preprocess_for_ocr(image_path), config="--psm 6")
    except Exception as e:
        try:
            # fallback to raw OCR on the unprocessed image
            ocr_text = pytesseract.image_to_string(Image.open(image_path))
        except Exception as e2:
            return {"summary_text": f"[Error] OCR failure: {e} / {e2}", "structured": {}}

    cleaned_ocr = ocr_text.strip()
    numbers = _extract_numbers_from_text(ocr_text)
    structured: Dict[str, Any] = {"ocr_text": cleaned_ocr, "numbers": numbers}

    # --- Stage 4: assemble the human-readable summary -------------------
    parts = []
    if caption:
        parts.append(caption.strip())

    if cleaned_ocr:
        # Collapse whitespace and cap the OCR excerpt at 300 characters.
        parts.append("OCR summary: " + " ".join(cleaned_ocr.split())[:300])

    if bars:
        trend = bars.get("trend")
        structured["bar_count"] = bars.get("bar_count")
        structured["bar_heights"] = bars.get("heights")
        structured["bar_trend"] = trend
        structured["bars_xywh"] = bars.get("bars_xywh")
        parts.append(f"Bar chart trend: {trend} (left→right)")

    # --- Stage 5: optional CLIP embedding for retrieval -----------------
    if USE_CLIP:
        try:
            vector = _clip_model.encode([" ".join(parts) or ocr_text], normalize_embeddings=True)[0]
            structured["clip_vector"] = [float(component) for component in np.asarray(vector).tolist()]
        except Exception as e:
            print("[chart_reasoner] CLIP encode failed:", e)

    if parts:
        final_summary = " | ".join(parts)
    else:
        final_summary = cleaned_ocr or "No description available."

    return {"summary_text": final_summary, "structured": structured}


__all__ = ["process_chart_crop"]
|
app/debug_chunks.json
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"id": "text_9078364bec07451fbe7900a99835907b",
|
| 4 |
+
"text": "VDoc RAG Test Document",
|
| 5 |
+
"metadata": {
|
| 6 |
+
"source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
|
| 7 |
+
"page": 1,
|
| 8 |
+
"bbox": null,
|
| 9 |
+
"type": "text"
|
| 10 |
+
}
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"id": "text_0195b06b4eed4e3a9de12f8f73380390",
|
| 14 |
+
"text": "Contains Charts, Tables, and Flyers",
|
| 15 |
+
"metadata": {
|
| 16 |
+
"source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
|
| 17 |
+
"page": 1,
|
| 18 |
+
"bbox": null,
|
| 19 |
+
"type": "text"
|
| 20 |
+
}
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"id": "text_c6ec891227d44d42ab809baf2469bbc8",
|
| 24 |
+
"text": "Sample Data Table:",
|
| 25 |
+
"metadata": {
|
| 26 |
+
"source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
|
| 27 |
+
"page": 1,
|
| 28 |
+
"bbox": null,
|
| 29 |
+
"type": "text"
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"id": "text_9e2edb5e2e4f42b9a344a594b904a859",
|
| 34 |
+
"text": "ID Name Score Category",
|
| 35 |
+
"metadata": {
|
| 36 |
+
"source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
|
| 37 |
+
"page": 1,
|
| 38 |
+
"bbox": null,
|
| 39 |
+
"type": "text"
|
| 40 |
+
}
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"id": "text_c94b49b4eedd4f4a82a76fad15610f69",
|
| 44 |
+
"text": "1 Alice 85 A",
|
| 45 |
+
"metadata": {
|
| 46 |
+
"source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
|
| 47 |
+
"page": 1,
|
| 48 |
+
"bbox": null,
|
| 49 |
+
"type": "text"
|
| 50 |
+
}
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"id": "text_db03ed6418594c0fb3451cb7ba032342",
|
| 54 |
+
"text": "2 Bob 78 B",
|
| 55 |
+
"metadata": {
|
| 56 |
+
"source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
|
| 57 |
+
"page": 1,
|
| 58 |
+
"bbox": null,
|
| 59 |
+
"type": "text"
|
| 60 |
+
}
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"id": "text_24e15dd936584136ad45cdb48b74f695",
|
| 64 |
+
"text": "3 Charlie 92 A+",
|
| 65 |
+
"metadata": {
|
| 66 |
+
"source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
|
| 67 |
+
"page": 1,
|
| 68 |
+
"bbox": null,
|
| 69 |
+
"type": "text"
|
| 70 |
+
}
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"id": "text_9d77d0b5b759433d899289bb7a1b79d9",
|
| 74 |
+
"text": "4 David 64 C",
|
| 75 |
+
"metadata": {
|
| 76 |
+
"source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
|
| 77 |
+
"page": 1,
|
| 78 |
+
"bbox": null,
|
| 79 |
+
"type": "text"
|
| 80 |
+
}
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"id": "text_3d67b96a270944bf9b35ba358d534830",
|
| 84 |
+
"text": "5 Eva 88 A",
|
| 85 |
+
"metadata": {
|
| 86 |
+
"source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
|
| 87 |
+
"page": 1,
|
| 88 |
+
"bbox": null,
|
| 89 |
+
"type": "text"
|
| 90 |
+
}
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"id": "text_7aabcd11cb544fd99fd75c591885b5e8",
|
| 94 |
+
"text": "Flyer Section: Upcoming AI Workshop",
|
| 95 |
+
"metadata": {
|
| 96 |
+
"source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
|
| 97 |
+
"page": 1,
|
| 98 |
+
"bbox": null,
|
| 99 |
+
"type": "text"
|
| 100 |
+
}
|
| 101 |
+
},
|
| 102 |
+
{
|
| 103 |
+
"id": "text_86c5a632d7de4c258d9bced4e8de84b4",
|
| 104 |
+
"text": "Join us for an engaging AI Workshop covering:",
|
| 105 |
+
"metadata": {
|
| 106 |
+
"source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
|
| 107 |
+
"page": 1,
|
| 108 |
+
"bbox": null,
|
| 109 |
+
"type": "text"
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"id": "text_4dd5041c98f046a499bf076093f0503a",
|
| 114 |
+
"text": "- Machine Learning Basics",
|
| 115 |
+
"metadata": {
|
| 116 |
+
"source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
|
| 117 |
+
"page": 1,
|
| 118 |
+
"bbox": null,
|
| 119 |
+
"type": "text"
|
| 120 |
+
}
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"id": "text_30472772897c453491075424dbdf9927",
|
| 124 |
+
"text": "- LLM Applications",
|
| 125 |
+
"metadata": {
|
| 126 |
+
"source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
|
| 127 |
+
"page": 1,
|
| 128 |
+
"bbox": null,
|
| 129 |
+
"type": "text"
|
| 130 |
+
}
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"id": "text_e7f7640c0daa4146b3a30ec41d79b3e8",
|
| 134 |
+
"text": "- RAG (Retrieval Augmented Generation) Systems",
|
| 135 |
+
"metadata": {
|
| 136 |
+
"source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
|
| 137 |
+
"page": 1,
|
| 138 |
+
"bbox": null,
|
| 139 |
+
"type": "text"
|
| 140 |
+
}
|
| 141 |
+
},
|
| 142 |
+
{
|
| 143 |
+
"id": "table_2791fed25a254479a185f8403f6ca385",
|
| 144 |
+
"text": "[{\"ID\": \"1\", \"Name\": \"Alice\", \"Score\": \"85\", \"Category\": \"A\"}, {\"ID\": \"2\", \"Name\": \"Bob\", \"Score\": \"78\", \"Category\": \"B\"}, {\"ID\": \"3\", \"Name\": \"Charlie\", \"Score\": \"92\", \"Category\": \"A+\"}, {\"ID\": \"4\", \"Name\": \"David\", \"Score\": \"64\", \"Category\": \"C\"}, {\"ID\": \"5\", \"Name\": \"Eva\", \"Score\": \"88\", \"Category\": \"A\"}]",
|
| 145 |
+
"metadata": {
|
| 146 |
+
"source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
|
| 147 |
+
"page": 1,
|
| 148 |
+
"bbox": null,
|
| 149 |
+
"type": "table"
|
| 150 |
+
}
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"id": "text_5d5f395ca35940d7aff54b1224f51940",
|
| 154 |
+
"text": "Date: November 20, 2025",
|
| 155 |
+
"metadata": {
|
| 156 |
+
"source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
|
| 157 |
+
"page": 2,
|
| 158 |
+
"bbox": null,
|
| 159 |
+
"type": "text"
|
| 160 |
+
}
|
| 161 |
+
},
|
| 162 |
+
{
|
| 163 |
+
"id": "text_49338ecabed84edb8c71fa11dda41ff2",
|
| 164 |
+
"text": "Venue: Innovation Hall, Tech Park",
|
| 165 |
+
"metadata": {
|
| 166 |
+
"source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
|
| 167 |
+
"page": 2,
|
| 168 |
+
"bbox": null,
|
| 169 |
+
"type": "text"
|
| 170 |
+
}
|
| 171 |
+
},
|
| 172 |
+
{
|
| 173 |
+
"id": "text_dd6a5828c8e64a628e0cec709e40172e",
|
| 174 |
+
"text": "Register now at: www.aiworkshop2025.com",
|
| 175 |
+
"metadata": {
|
| 176 |
+
"source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
|
| 177 |
+
"page": 2,
|
| 178 |
+
"bbox": null,
|
| 179 |
+
"type": "text"
|
| 180 |
+
}
|
| 181 |
+
}
|
| 182 |
+
]
|
app/embeddings.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sentence_transformers import SentenceTransformer
|
| 2 |
+
import os
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class TextImageEmbedder:
    """Thin wrapper around a SentenceTransformer text-embedding model."""

    def __init__(self, text_model_name=None):
        """Choose and load the embedding model.

        Precedence: an explicit ``text_model_name``; otherwise the fine-tuned
        checkpoint at ``../models/vdoc_feedback_tuned/latest`` (relative to
        this file) when that path exists; otherwise "all-MiniLM-L6-v2".
        """
        fallback_name = "all-MiniLM-L6-v2"
        tuned_dir = os.path.abspath(
            os.path.join(os.path.dirname(__file__), "..", "models", "vdoc_feedback_tuned", "latest")
        )

        if text_model_name:
            chosen = text_model_name
        elif os.path.exists(tuned_dir):
            print(f"🧠 Using fine-tuned embedding model: {tuned_dir}")
            chosen = tuned_dir
        else:
            print(f"📦 Using base embedding model: {fallback_name}")
            chosen = fallback_name

        self.text_model = SentenceTransformer(chosen)

    def embed_text(self, texts):
        """Encode one string or a list of strings; embeddings are requested normalized."""
        batch = [texts] if isinstance(texts, str) else texts
        return self.text_model.encode(batch, show_progress_bar=False, normalize_embeddings=True)

    def embed_text_sync(self, text):
        """Encode a single text and return its (first and only) embedding."""
        return self.embed_text([text])[0]
|
app/feedback.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"timestamp": "2025-11-10T14:19:13",
|
| 4 |
+
"question": "what trend does the bar graph show",
|
| 5 |
+
"answer": "increasing (left→right)",
|
| 6 |
+
"correctness": "correct",
|
| 7 |
+
"sources": []
|
| 8 |
+
},
|
| 9 |
+
{
|
| 10 |
+
"timestamp": "2026-03-11T02:25:53",
|
| 11 |
+
"question": "what is this document about?",
|
| 12 |
+
"answer": "This document is about an upcoming AI Workshop covering Machine Learning Basics, scheduled for November 20, 2025. It also contains a section describing a chart or graph with data from 2018 to 2024.",
|
| 13 |
+
"correctness": "correct",
|
| 14 |
+
"sources": []
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"timestamp": "2026-03-11T02:29:29",
|
| 18 |
+
"question": "what is this document about?",
|
| 19 |
+
"answer": "This document is about an upcoming AI Workshop covering Machine Learning Basics, scheduled for November 20, 2025. It also contains a section describing a chart or graph with data from 2018 to 2024.",
|
| 20 |
+
"correctness": "correct",
|
| 21 |
+
"sources": []
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"timestamp": "2026-03-11T02:29:40",
|
| 25 |
+
"question": "what is this document about?",
|
| 26 |
+
"answer": "This document is about an upcoming AI Workshop covering Machine Learning Basics, scheduled for November 20, 2025. It also contains a section describing a chart or graph with data from 2018 to 2024.",
|
| 27 |
+
"correctness": "correct",
|
| 28 |
+
"sources": []
|
| 29 |
+
}
|
| 30 |
+
]
|
app/feedback_manager.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
|
| 5 |
+
# Absolute path of the directory that contains this module.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# JSON file (next to this module) in which feedback entries are stored.
FEEDBACK_FILE = os.path.join(BASE_DIR, "feedback.json")
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def _load_feedback():
    """Load all stored feedback entries from FEEDBACK_FILE.

    Returns:
        list: the parsed entries, or an empty list when the file does not
        exist, cannot be decoded as JSON, or its top-level value is not a
        list (callers append to the result, so it must be a list).
    """
    try:
        with open(FEEDBACK_FILE, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        # Nothing recorded yet.
        return []
    except ValueError as e:
        # Covers json.JSONDecodeError and UnicodeDecodeError. Keep the
        # best-effort fallback, but make the corruption visible.
        print(f"[feedback_manager] Could not parse {FEEDBACK_FILE}: {e}")
        return []

    if not isinstance(data, list):
        print(f"[feedback_manager] Ignoring {FEEDBACK_FILE}: expected a JSON list, got {type(data).__name__}")
        return []
    return data
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def _save_feedback(data):
    """Write ``data`` to FEEDBACK_FILE as indented UTF-8 JSON.

    The payload is written to a sibling temporary file first and then moved
    over the target with os.replace, so a crash or serialization error
    mid-write can no longer truncate the existing feedback file.

    Raises:
        Whatever open / json.dump / os.replace raise; the temporary file is
        removed before the exception propagates.
    """
    tmp_path = f"{FEEDBACK_FILE}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        os.replace(tmp_path, FEEDBACK_FILE)
    except Exception:
        # Do not leave a half-written temp file behind.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def record_feedback(question, answer, correctness, sources=None):
    """
    Append one user-feedback entry about a RAG answer and persist it.

    correctness: 'correct' | 'incorrect' | 'partial'
    sources: optional list stored with the entry (an empty list when falsy).

    Returns the stored entry dict, which also carries a local timestamp
    with second resolution.
    """
    stamp = datetime.now().isoformat(timespec="seconds")
    entry = dict(
        timestamp=stamp,
        question=question,
        answer=answer,
        correctness=correctness,
        sources=sources or [],
    )

    # Read-modify-write of the whole feedback list.
    history = _load_feedback()
    history.append(entry)
    _save_feedback(history)

    print(f"📝 Feedback recorded ({correctness}) for: {question[:60]}...")
    return entry
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def get_feedback_summary():
    """Return a one-line summary of stored feedback.

    Counts entries marked 'correct' and 'incorrect'; entries with any other
    correctness value are only reflected in the total.
    """
    entries = _load_feedback()
    total = len(entries)
    if total == 0:
        return "No feedback yet."

    correct = 0
    incorrect = 0
    for item in entries:
        verdict = item.get("correctness")
        if verdict == "correct":
            correct += 1
        elif verdict == "incorrect":
            incorrect += 1

    return f"Feedback Stats — ✅ {correct} correct, ❌ {incorrect} incorrect, total {total}"
|
app/highlight_calibration.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"x_offset": -33.0,
|
| 3 |
+
"x_scale": 1.0,
|
| 4 |
+
"y_offset": -65.0,
|
| 5 |
+
"y_scale": 1.02
|
| 6 |
+
}
|
app/indexer.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app/indexer.py
|
| 2 |
+
import chromadb
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class ChromaIndexer:
|
| 9 |
+
def __init__(self, embedding_function=None, persist_directory="./storage/chroma_db"):
    """
    Open (or create) the "vdoc" Chroma collection.

    The storage directory is created if needed and a PersistentClient
    rooted there is used; if constructing it fails for any reason, an
    in-memory chromadb.Client() is used instead.
    """
    os.makedirs(persist_directory, exist_ok=True)
    self.embedding_function = embedding_function
    self.persist_directory = persist_directory
    self.active_doc_id = None  # Track currently active document

    try:
        self.client = chromadb.PersistentClient(path=persist_directory)
    except Exception:
        # Fall back to a non-persistent client rather than failing startup.
        print("[indexer] PersistentClient not available, falling back to in-memory client.")
        self.client = chromadb.Client()

    collection_meta = {"description": "VDoc-RAG persistent storage"}
    self.collection = self.client.get_or_create_collection("vdoc", metadata=collection_meta)

    print(f"✅ Chroma index loaded from: {persist_directory}")
|
| 33 |
+
|
| 34 |
+
def clear(self):
    """
    Drop every chunk by deleting and re-creating the "vdoc" collection.

    Also resets the active document. Intended for document isolation:
    call before indexing a new document. Any failure is reported as a
    warning and otherwise ignored.
    """
    try:
        client = self.client
        client.delete_collection("vdoc")
        collection_meta = {"description": "VDoc-RAG persistent storage"}
        self.collection = client.get_or_create_collection("vdoc", metadata=collection_meta)
        self.active_doc_id = None
        print("🗑️ Cleared all chunks from index (document isolation)")
    except Exception as e:
        print(f"[WARN] Failed to clear collection: {e}")
|
| 50 |
+
|
| 51 |
+
def set_active_document(self, doc_id: str):
    """Remember ``doc_id`` as the active document and log the change.

    ``query`` uses this id as its default document filter when no explicit
    ``doc_id`` is passed.
    """
    self.active_doc_id = doc_id
    print(f"📄 Active document set to: {doc_id}")
|
| 55 |
+
|
| 56 |
+
def _sanitize_metadata(self, metadata):
|
| 57 |
+
clean_meta = {}
|
| 58 |
+
for k, v in metadata.items():
|
| 59 |
+
if isinstance(v, (str, int, float, bool)) or v is None:
|
| 60 |
+
clean_meta[k] = v
|
| 61 |
+
else:
|
| 62 |
+
try:
|
| 63 |
+
clean_meta[k] = json.dumps(v)
|
| 64 |
+
except Exception:
|
| 65 |
+
clean_meta[k] = str(v)
|
| 66 |
+
return clean_meta
|
| 67 |
+
|
| 68 |
+
def upsert(self, items):
    """
    Insert or update chunks in the collection.

    Each element of `items` is indexed positionally as
    (id, embedding, metadata, document text). Metadata is passed through
    `_sanitize_metadata` before it is handed to the collection.
    """
    chunk_ids, vectors, metas, texts = [], [], [], []
    for entry in items:
        chunk_ids.append(entry[0])
        vectors.append(entry[1])
        metas.append(self._sanitize_metadata(entry[2]))
        texts.append(entry[3])

    self.collection.upsert(
        ids=chunk_ids,
        embeddings=vectors,
        metadatas=metas,
        documents=texts,
    )
    print(f"💾 Upserted {len(items)} chunks into persistent Chroma collection.")
|
| 81 |
+
|
| 82 |
+
def query(self, qvec, top_k=5, doc_id=None):
    """
    Retrieve the `top_k` nearest chunks for a query embedding.

    qvec: numpy vector or list (query embedding)
    doc_id: optional document ID to restrict results to; when omitted the
        currently active document (if any) is used instead
    Returns a list of {id, text, metadata, score}, best score first. The
    score is the cosine similarity between `qvec` and the stored chunk
    embedding, clamped to the range 0-1 and rounded to 4 decimals.
    """
    request = {
        "query_embeddings": [qvec],
        "n_results": top_k,
        "include": ["embeddings", "metadatas", "documents", "distances"],
    }

    # An explicit doc_id wins; otherwise fall back to the active document
    target_doc = doc_id or self.active_doc_id
    if target_doc:
        request["where"] = {"doc_id": target_doc}

    res = self.collection.query(**request)
    if not res or "ids" not in res or len(res["ids"]) == 0:
        return []

    query_arr = np.array(qvec, dtype=np.float32)
    hits = []

    for pos, chunk_id in enumerate(res["ids"][0]):
        try:
            stored = np.array(res["embeddings"][0][pos], dtype=np.float32)
            # The small epsilon keeps the division defined for zero vectors
            denom = np.linalg.norm(query_arr) * np.linalg.norm(stored) + 1e-8
            similarity = float(np.dot(query_arr, stored) / denom)
            similarity = max(0.0, min(1.0, similarity))
        except Exception:
            # Embedding unusable: derive a score from the reported distance
            try:
                distance = res.get("distances", [[0]])[0][pos]
                similarity = max(0.0, min(1.0, 1.0 - float(distance)))
            except Exception:
                similarity = 0.0

        hits.append({
            "id": chunk_id,
            "text": res.get("documents", [[None]])[0][pos],
            "metadata": res.get("metadatas", [[None]])[0][pos],
            "score": round(similarity, 4),
        })

    hits.sort(key=lambda hit: hit["score"], reverse=True)
    return hits
|
app/ingest.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app/ingest.py
|
| 2 |
+
"""
|
| 3 |
+
PDF → Images → OCR text → Table extraction → Chart detection & reasoning
|
| 4 |
+
Generates chunks (text/table/chart) with metadata for embedding and indexing.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import uuid
|
| 9 |
+
from pdf2image import convert_from_path
|
| 10 |
+
from PIL import Image
|
| 11 |
+
import pytesseract
|
| 12 |
+
import pdfplumber
|
| 13 |
+
from app.tables import extract_tables_from_pdf
|
| 14 |
+
from app.chart_detect import detect_charts
|
| 15 |
+
from app.chart_reasoner import process_chart_crop
|
| 16 |
+
from app.cache_manager import load_chunks_from_cache, save_chunks_to_cache
|
| 17 |
+
|
| 18 |
+
# Working directories live next to this module instead of the system temp folder
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
TMP_DIR = os.path.join(BASE_DIR, "tmp")
TABLES_DIR = os.path.join(BASE_DIR, "tables")
CHARTS_DIR = os.path.join(BASE_DIR, "charts")

# Create all of them up front so later writes never hit a missing directory
for _storage_dir in (TMP_DIR, TABLES_DIR, CHARTS_DIR):
    os.makedirs(_storage_dir, exist_ok=True)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def pdf_to_images(pdf_path, dpi=200):
    """
    Render every page of a PDF to a PNG file and return the file paths.

    Pages are rendered at `dpi` and written into the project-local TMP_DIR
    under unique names; the returned list is in page order.
    """
    rendered_pages = convert_from_path(pdf_path, dpi=dpi)
    saved_paths = []
    # The directory may have been removed since import time
    os.makedirs(TMP_DIR, exist_ok=True)
    page_number = 0
    for page in rendered_pages:
        page_number += 1
        filename = f"page_{uuid.uuid4().hex}_{page_number}.png"
        target = os.path.join(TMP_DIR, filename)
        page.save(target, "PNG")
        saved_paths.append(target)
    return saved_paths
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def ocr_image_to_blocks(image_path, min_words_per_line=3):
    """
    Run OCR on an image and merge words into line-level text blocks.

    This preserves full sentences like 'Venue: Delhi Convention Hall'.

    Words are grouped by (block_num, par_num, line_num). Tesseract restarts
    `line_num` inside every block/paragraph, so grouping on `line_num`
    alone would merge unrelated lines that merely share a number.

    image_path: path of the image to OCR
    min_words_per_line: lines with fewer words than this are dropped
    Returns a list of {"text", "bbox"} in first-seen order, where bbox is
    (left, top, right, bottom) in image pixels.
    """
    # Close the file handle as soon as the pixel data has been copied
    with Image.open(image_path) as src:
        img = src.convert("RGB")
    data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT, config="--oem 3 --psm 6")

    lines = {}
    for i, raw in enumerate(data["text"]):
        txt = raw.strip()
        if not txt:
            continue
        line_key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
        entry = lines.setdefault(
            line_key,
            {"words": [], "lefts": [], "tops": [], "rights": [], "bottoms": []},
        )
        left, top = data["left"][i], data["top"][i]
        entry["words"].append(txt)
        entry["lefts"].append(left)
        entry["tops"].append(top)
        entry["rights"].append(left + data["width"][i])
        entry["bottoms"].append(top + data["height"][i])

    blocks = []
    for entry in lines.values():
        if len(entry["words"]) < min_words_per_line:
            continue
        bbox = (
            min(entry["lefts"]),
            min(entry["tops"]),
            max(entry["rights"]),
            max(entry["bottoms"]),
        )
        blocks.append({"text": " ".join(entry["words"]).strip(), "bbox": bbox})
    return blocks
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
# Inputs with these suffixes are already page images and skip PDF rendering
_IMAGE_SUFFIXES = (".png", ".jpg", ".jpeg")


def _ocr_line_chunks(path, pno, imgpath):
    """Return one text chunk per OCR line found on the page image `imgpath`."""
    # Close the file handle as soon as the pixel data has been copied
    with Image.open(imgpath) as src:
        img = src.convert("RGB")
    data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT, config="--oem 3 --psm 6")

    # Group words per (block, paragraph, line): Tesseract restarts line_num
    # in every block/paragraph, so line_num alone does not identify a line.
    lines = {}
    for i, raw in enumerate(data["text"]):
        word = raw.strip()
        if not word:
            continue
        line_key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
        entry = lines.setdefault(line_key, {"words": [], "boxes": []})
        left, top = data["left"][i], data["top"][i]
        entry["words"].append(word)
        entry["boxes"].append((left, top, left + data["width"][i], top + data["height"][i]))

    chunks = []
    for entry in lines.values():
        lefts, tops, rights, bottoms = zip(*entry["boxes"])
        chunks.append({
            "id": f"{uuid.uuid4().hex}",
            "text": " ".join(entry["words"]),
            "metadata": {
                "source": path,
                "page": pno,
                "bbox": (min(lefts), min(tops), max(rights), max(bottoms)),
                "type": "text",
            },
        })
    return chunks


def _table_chunk(path, table):
    """Build the chunk describing one table returned by extract_tables_from_pdf."""
    return {
        "id": f"{uuid.uuid4().hex}",
        "text": table["summary_text"],
        "metadata": {
            "source": path,
            "page": table["page"],
            "type": "table",
            "csv_path": table["csv_path"],
            "rows": table["rows"],
            "bbox": table.get("bbox"),
        },
    }


def _chart_chunk(path, pno, crop):
    """Run chart reasoning on one detected crop and build its chunk."""
    crop_path = crop["image_path"]
    chart_res = process_chart_crop(crop_path)
    return {
        "id": f"chart_{uuid.uuid4().hex}",
        "text": chart_res.get("summary_text", "Chart region detected."),
        "metadata": {
            "source": path,
            "page": pno,
            "type": "chart",
            "bbox": crop["bbox"],
            "image_path": crop_path,
            "structured": chart_res.get("structured", {}),
        },
    }


def process_pdf(path):
    """
    Process a PDF or image file:
    - Extract text chunks (OCR), one per recognised line
    - Extract tables (PDF inputs only)
    - Detect charts and run chart reasoning on each crop
    Returns: list of document chunks {id, text, metadata}

    Chunks are ordered text first, then tables, then charts. Results are
    read from and written to the chunk cache keyed by `path`. Failures in
    table extraction, chart processing or cache writing are printed as
    warnings and do not abort processing.
    """
    # Check cache first
    cached = load_chunks_from_cache(path)
    if cached:
        print(f"✅ Using cached chunks for {os.path.basename(path)}")
        return cached

    # An image input is its own single page; only PDFs need rendering
    is_image = str(path).lower().endswith(_IMAGE_SUFFIXES)
    images = [path] if is_image else pdf_to_images(path)

    items = []

    # 1. OCR text extraction (page images)
    for pno, imgpath in enumerate(images, start=1):
        items.extend(_ocr_line_chunks(path, pno, imgpath))

    # 2. Table extraction (structured CSVs); needs a real PDF
    if not is_image:
        try:
            for table in extract_tables_from_pdf(path):
                items.append(_table_chunk(path, table))
        except Exception as e:
            print("[WARN] Table extraction failed:", e)

    # 3. Chart detection + reasoning
    for pno, imgpath in enumerate(images, start=1):
        try:
            for crop in detect_charts(imgpath, debug=True):
                items.append(_chart_chunk(path, pno, crop))
        except Exception as e:
            print(f"[WARN] Chart detection/reasoning failed on page {pno}:", e)

    # Save to cache for future reuse
    try:
        save_chunks_to_cache(path, items)
        print(f"💾 Cached {len(items)} chunks for {os.path.basename(path)}")
    except Exception as e:
        print("[WARN] Failed to save cache:", e)

    return items
|
app/main.py
ADDED
|
@@ -0,0 +1,451 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import uvicorn
|
| 3 |
+
from fastapi import FastAPI, UploadFile, File, Form, Request, HTTPException
|
| 4 |
+
from fastapi.responses import HTMLResponse
|
| 5 |
+
from fastapi.staticfiles import StaticFiles
|
| 6 |
+
from fastapi.templating import Jinja2Templates
|
| 7 |
+
|
| 8 |
+
from app.ingest import process_pdf
|
| 9 |
+
from app.indexer import ChromaIndexer
|
| 10 |
+
from app.embeddings import TextImageEmbedder
|
| 11 |
+
from app.reader import LLMReader
|
| 12 |
+
from app.visual_highlight import render_highlighted_pages
|
| 13 |
+
from app.cache_manager import clear_cache
|
| 14 |
+
from app.feedback_manager import record_feedback, get_feedback_summary, _load_feedback
|
| 15 |
+
import shutil
|
| 16 |
+
import subprocess
|
| 17 |
+
import pandas as pd
|
| 18 |
+
import numpy as np
|
| 19 |
+
import matplotlib.pyplot as plt
|
| 20 |
+
from io import BytesIO
|
| 21 |
+
import base64
|
| 22 |
+
from sentence_transformers import SentenceTransformer
|
| 23 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 24 |
+
|
| 25 |
+
# ---------------------------------------------------------
# Initialization
# ---------------------------------------------------------
app = FastAPI(title="VDoc RAG - Web UI")

# ---------------------------------------------------------
# Directories
# ---------------------------------------------------------
# Absolute path of the directory that contains this file
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Template and static directories are resolved relative to BASE_DIR
TEMPLATE_DIR = os.path.join(BASE_DIR, "templates")
STATIC_DIR = os.path.join(BASE_DIR, "static")

# Create both directories before they are handed to StaticFiles / Jinja2Templates
os.makedirs(TEMPLATE_DIR, exist_ok=True)
os.makedirs(STATIC_DIR, exist_ok=True)

# Serve static assets under /static
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
# Serve highlighted page images under /highlighted
HIGHLIGHTED_DIR = os.path.join(BASE_DIR, "highlighted")
os.makedirs(HIGHLIGHTED_DIR, exist_ok=True)
app.mount("/highlighted", StaticFiles(directory=HIGHLIGHTED_DIR), name="highlighted")

# Jinja2 environment used by every route that renders HTML
templates = Jinja2Templates(directory=TEMPLATE_DIR)

# ---------------------------------------------------------
# Core Components
# ---------------------------------------------------------
embedder = TextImageEmbedder()
# Chroma persists its data in a project-local directory
STORAGE_DIR = os.path.join(BASE_DIR, "storage", "chroma_db")
indexer = ChromaIndexer(embedding_function=embedder.embed_text, persist_directory=STORAGE_DIR)
# The reader provider is configurable through VDOCRAG_READER_PROVIDER (default "gemini")
reader_provider = os.environ.get("VDOCRAG_READER_PROVIDER", "gemini")
reader = LLMReader(provider=reader_provider)

uploaded_files = []  # names of uploaded documents, shown in the UI
|
| 65 |
+
|
| 66 |
+
# ---------------------------------------------------------
|
| 67 |
+
# Routes
|
| 68 |
+
# ---------------------------------------------------------
|
| 69 |
+
@app.get("/", response_class=HTMLResponse)
async def home(request: Request):
    """Render the landing page containing the upload and question forms."""
    print(f"✅ Using templates from: {TEMPLATE_DIR}")
    index_template = os.path.join(TEMPLATE_DIR, "index.html")
    # Diagnostic only: rendering is attempted either way
    if os.path.exists(index_template):
        print("✅ index.html found!")
    else:
        print("❌ index.html not found in:", TEMPLATE_DIR)

    page_context = {"request": request, "uploaded": uploaded_files, "answer": None}
    return templates.TemplateResponse("index.html", page_context)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
@app.post("/upload")
async def upload_file(request: Request, file: UploadFile = File(...)):
    """
    Handle PDF/image upload and indexing.

    The upload is stored under the local "uploads" directory, the existing
    index is cleared (document isolation), and the chunks extracted by
    process_pdf are embedded and indexed under a doc_id equal to the
    sanitised filename.

    Raises HTTPException(400) when the filename is missing or does not end
    in .pdf, .png, .jpg or .jpeg.
    """
    # The filename is client-controlled: keep only its final path component
    # so values like "../../x.pdf" cannot escape the uploads directory.
    raw_name = (file.filename or "").replace("\\", "/")
    filename = os.path.basename(raw_name)
    if not filename or not filename.lower().endswith((".pdf", ".png", ".jpg", ".jpeg")):
        raise HTTPException(status_code=400, detail="Unsupported file type")

    # Save uploaded file temporarily
    temp_dir = os.path.join(BASE_DIR, "uploads")
    os.makedirs(temp_dir, exist_ok=True)
    path = os.path.join(temp_dir, filename)

    with open(path, "wb") as f:
        f.write(await file.read())

    # 🔒 Document Isolation: Clear old chunks before indexing new document
    indexer.clear()
    uploaded_files.clear()  # Reset uploaded files list

    # Extract and process text chunks
    docs = process_pdf(path)
    if len(docs) == 0:
        return templates.TemplateResponse(
            "index.html",
            {
                "request": request,
                "uploaded": uploaded_files,
                "answer": "⚠️ No content extracted from file.",
            },
        )

    # Document ID used for isolation
    doc_id = filename

    # Embed all chunk texts in one call
    texts = [d["text"] for d in docs]
    vectors = embedder.embed_text(texts)

    # Add doc_id to each chunk's metadata for filtering
    for d in docs:
        d["metadata"]["doc_id"] = doc_id

    items = [(d["id"], vectors[i].tolist(), d["metadata"], d["text"]) for i, d in enumerate(docs)]
    indexer.upsert(items)

    # Set this as the active document for queries
    indexer.set_active_document(doc_id)

    uploaded_files.append(filename)
    print(f"✅ Indexed {len(docs)} chunks from {filename} (document isolation enabled)")

    return templates.TemplateResponse(
        "index.html",
        {
            "request": request,
            "uploaded": uploaded_files,
            "answer": f"✅ Uploaded and indexed {filename} ({len(docs)} chunks).",
        },
    )
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
@app.post("/ask")
async def ask_question(request: Request, question: str = Form(...)):
    """
    Answer a question from the indexed document.

    Embeds the question, retrieves the 10 closest chunks, moves chart
    chunks to the front for chart-like questions, asks the reader for an
    answer, renders highlight images for the first hit's source file and
    returns the index page with answer, sources, chunk previews and the
    average retrieval confidence.
    """
    # Step 1: embed the question
    query_vector = embedder.embed_text([question])[0]

    # Step 2: retrieve the closest chunks
    hits = indexer.query(query_vector, top_k=10)

    # Debug log of what was retrieved
    print("\n🔍 Retrieved Chunks for Query:", question)
    for rank, hit in enumerate(hits, start=1):
        meta = hit.get("metadata", {})
        conf = hit.get("score", 0)
        print(f"Chunk {rank}: Page {meta.get('page')} | BBox: {meta.get('bbox')} | Confidence: {conf*100:.1f}%")
        print(f"Text: {hit['text'][:500]}...\n")

    # Chart-like questions get chart chunks first; the stable sort keeps the
    # relative order inside the chart and non-chart groups.
    chart_keywords = ["chart", "graph", "trend", "plot", "increase", "decrease", "growth"]
    lowered_question = question.lower()
    if any(keyword in lowered_question for keyword in chart_keywords):
        try:
            hits = sorted(hits, key=lambda h: h.get("metadata", {}).get("type") != "chart")
            print("[INFO] Prioritized chart-type chunks for chart-related question.")
        except Exception as e:
            print("[WARN] Failed to prioritize chart hits:", e)

    # Step 3: build the numbered context string
    context = "\n".join(
        f"[{n}] {hit['text']} (page: {hit['metadata'].get('page')}, bbox: {hit['metadata'].get('bbox')})"
        for n, hit in enumerate(hits, start=1)
    )

    # Step 4: ask the LLM
    answer = reader.answer_question(query=question, context=context, sources=hits)
    sources = answer.get("sources", [])

    # Visual highlights are best-effort; any failure leaves the list empty
    highlight_urls = []
    try:
        first_source_path = hits[0]["metadata"].get("source") if hits else None
        if first_source_path and os.path.exists(first_source_path):
            highlight_paths = render_highlighted_pages(first_source_path, hits)
            # Turn filesystem paths into URLs relative to BASE_DIR
            highlight_urls = ["/" + os.path.relpath(p, BASE_DIR).replace("\\", "/") for p in highlight_paths]
    except Exception as e:
        print("[WARN] Highlight rendering failed:", e)
        highlight_urls = []

    # Step 5: chunk previews for the UI (text cut to 300 characters)
    chunk_previews = []
    for n, hit in enumerate(hits, start=1):
        chunk_previews.append(
            {
                "index": n,
                "page": hit["metadata"].get("page"),
                "bbox": hit["metadata"].get("bbox"),
                "text": hit["text"][:300] + ("..." if len(hit["text"]) > 300 else ""),
                "confidence": round(hit.get("score", 0) * 100, 1),
            }
        )

    # Average confidence for the retrieved set (0 when nothing was retrieved)
    avg_conf = sum(hit.get("score", 0) for hit in hits) / max(len(hits), 1)

    # Step 6: render the page
    page_context = {
        "request": request,
        "uploaded": uploaded_files,
        "answer": answer["text"],
        "question": question,
        "sources": sources,
        "chunks": chunk_previews,
        "highlight_images": highlight_urls,
        "confidence_avg": round(avg_conf * 100, 1),
    }
    return templates.TemplateResponse("index.html", page_context)
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
@app.post("/clear_cache")
async def clear_cache_route(request: Request):
    """Call clear_cache() and show the index page with a confirmation message."""
    clear_cache()
    page_context = {
        "request": request,
        "uploaded": uploaded_files,
        "answer": "🧹 Cache cleared successfully!",
    }
    return templates.TemplateResponse("index.html", page_context)
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
@app.post("/clear_index")
async def clear_index(request: Request):
    """
    Clear the persistent Chroma index by deleting the storage directory.

    The directory is recreated empty and the module-level `indexer` is
    rebuilt on top of it. On success the uploaded-files list is emptied as
    well, because none of the listed documents is indexed any more. The
    message shown to the user reflects whether the reset actually worked.
    """
    global indexer
    storage_dir = os.path.join(BASE_DIR, "storage", "chroma_db")
    try:
        shutil.rmtree(storage_dir, ignore_errors=True)
        os.makedirs(storage_dir, exist_ok=True)
        # Reinitialize indexer client to the new empty DB
        indexer = ChromaIndexer(embedding_function=embedder.embed_text, persist_directory=storage_dir)
    except Exception as e:
        print("[WARN] clear_index failed:", e)
        message = "⚠️ Failed to clear Chroma index."
    else:
        uploaded_files.clear()
        message = "🧹 Chroma index cleared successfully!"

    return templates.TemplateResponse(
        "index.html",
        {"request": request, "uploaded": uploaded_files, "answer": message},
    )
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
@app.post("/feedback")
async def feedback(request: Request, question: str = Form(...), answer: str = Form(...), correctness: str = Form(...)):
    """
    Store one feedback entry for a RAG answer and show the outcome.

    On success the page shows a confirmation followed by the feedback
    summary; on any error a warning is printed and a failure message is
    shown instead.
    """
    try:
        record_feedback(question=question, answer=answer, correctness=correctness)
        status_text = f"✅ Feedback received! {get_feedback_summary()}"
    except Exception as err:
        print("[WARN] Failed to record feedback:", err)
        status_text = "⚠️ Failed to record feedback"

    page_context = {"request": request, "uploaded": uploaded_files, "answer": status_text}
    return templates.TemplateResponse("index.html", page_context)
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
@app.get("/feedback_dashboard", response_class=HTMLResponse)
async def feedback_dashboard(request: Request):
    """
    Render the feedback dashboard.

    Passes the feedback summary, the total number of entries, the number
    marked "correct" and "incorrect", and the 50 most recent entries
    (newest first) to the template.
    """
    entries = _load_feedback()
    summary = get_feedback_summary()
    verdicts = [entry.get("correctness") for entry in entries]

    page_context = {
        "request": request,
        "summary": summary,
        "total": len(entries),
        "correct": verdicts.count("correct"),
        "incorrect": verdicts.count("incorrect"),
        "feedback_data": entries[::-1][:50],  # show latest 50
    }
    return templates.TemplateResponse("feedback_dashboard.html", page_context)
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
@app.post("/train_feedback_model")
async def train_feedback_model(request: Request):
    """
    Run the fine-tuning script from the UI and show its outcome.

    The script one directory above BASE_DIR is executed with the same
    interpreter that runs this server, so it sees the same installed
    packages. A non-zero exit status or a failure to start the process is
    reported on the dashboard instead of raising.
    """
    import sys  # local import: not part of this module's top-level imports

    script_path = os.path.join(BASE_DIR, "..", "train_feedback_embeddings.py")

    try:
        print(f"🚀 Launching fine-tuning process: {script_path}")
        process = subprocess.run(
            # A bare "python" may resolve to another interpreter or not exist
            [sys.executable or "python", script_path],
            capture_output=True,
            text=True,
            check=True,
        )
        output = process.stdout[-1000:]  # keep only the tail of the log
        message = "✅ Fine-tuning complete. Model updated successfully!"
    except subprocess.CalledProcessError as e:
        output = e.stderr or str(e)
        message = "❌ Fine-tuning failed."
    except OSError as e:
        # The interpreter could not be started at all
        output = str(e)
        message = "❌ Fine-tuning failed."

    summary = get_feedback_summary()
    data = _load_feedback()

    return templates.TemplateResponse(
        "feedback_dashboard.html",
        {
            "request": request,
            "summary": summary,
            # Same counters that the feedback_dashboard route provides
            "total": len(data),
            "correct": sum(1 for x in data if x.get("correctness") == "correct"),
            "incorrect": sum(1 for x in data if x.get("correctness") == "incorrect"),
            "feedback_data": data[::-1][:50],
            "train_output": output,
            "message": message,
        },
    )
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
@app.get("/benchmark_dashboard", response_class=HTMLResponse)
async def benchmark_dashboard(request: Request):
    """Render the benchmark page in its initial state, with no results or plots."""
    empty_state = {
        "request": request,
        "results": None,
        "plot_precision": None,
        "plot_recall": None,
        "plot_mrr": None,
    }
    return templates.TemplateResponse("benchmark_dashboard.html", empty_state)
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
@app.post("/run_benchmark")
async def run_benchmark(request: Request, models: str = Form(...), chunk_size: int = Form(200), top_k: int = Form(5)):
    """
    Run embedding benchmark across provided models using stored feedback data.

    models: comma-separated SentenceTransformer model names
    chunk_size: maximum characters per corpus chunk (values below 1 are treated as 1)
    top_k: number of chunks retrieved per question (values below 1 are treated as 1)

    Every feedback question is used as a query against the chunked sample
    PDF. A retrieved chunk counts as relevant when it contains the recorded
    answer (case-insensitive). Mean precision, recall and reciprocal rank
    per model are shown as a table and as bar charts. Models that fail to
    load are skipped.
    """
    data = _load_feedback()
    if not data:
        return templates.TemplateResponse(
            "benchmark_dashboard.html",
            {
                "request": request,
                "results": [],
                "message": "⚠️ No feedback data available for benchmarking.",
            },
        )

    # Form values are user-controlled: zero would crash range() / the division
    chunk_size = max(1, chunk_size)
    top_k = max(1, top_k)

    queries = [f["question"] for f in data]
    answers = [f["answer"] for f in data]
    model_names = [m.strip() for m in models.split(",") if m.strip()]

    fallback_corpus = [
        "Yearly sales have been increasing steadily from 2018 to 2024, with a notable jump in 2021.",
        "Charlie achieved the highest score in the table with 98 points.",
        "The event will be held on November 20, 2025 at the downtown auditorium.",
    ]

    # The sample PDF is looked up next to this module first, then in the
    # repository-level samples/ directory one level above BASE_DIR.
    sample_name = "vdoc_rag_test.pdf"
    pdf_path = os.path.join(BASE_DIR, "samples", sample_name)
    if not os.path.exists(pdf_path):
        pdf_path = os.path.normpath(os.path.join(BASE_DIR, "..", "samples", sample_name))

    try:
        raw_chunks = [d["text"] for d in process_pdf(pdf_path)]
    except Exception as e:
        print("[WARN] Could not process sample PDF for benchmark, falling back to small corpus:", e)
        raw_chunks = fallback_corpus

    def split_chunks(texts):
        # Split each text into consecutive pieces of at most chunk_size characters
        return [t[i : i + chunk_size] for t in texts for i in range(0, len(t), chunk_size)]

    chunks = split_chunks(raw_chunks)
    if not chunks:
        # Nothing usable was extracted; an empty corpus cannot be ranked
        chunks = split_chunks(fallback_corpus)

    # Lower-case once instead of once per query
    lowered_chunks = [c.lower() for c in chunks]

    results = []
    for model_name in model_names:
        try:
            print(f"🧠 Evaluating {model_name}...")
            model = SentenceTransformer(model_name)
            chunk_embeddings = model.encode(chunks, normalize_embeddings=True, show_progress_bar=False)
        except Exception as e:
            print(f"[ERROR] Failed to load model {model_name}:", e)
            continue

        precision_scores, recall_scores, mrr_scores = [], [], []

        for q, ans in zip(queries, answers):
            qvec = model.encode([q], normalize_embeddings=True)
            sims = cosine_similarity(qvec, chunk_embeddings)[0]
            top_idx = np.argsort(sims)[::-1][:top_k]

            needle = ans.lower()
            relevant = [1 if needle in lowered_chunks[i] else 0 for i in top_idx]
            total_relevant = sum(1 for c in lowered_chunks if needle in c)

            precision = sum(relevant) / top_k
            recall = sum(relevant) / max(1, total_relevant)
            # Reciprocal rank of the first relevant hit, 0 when there is none
            mrr = 0
            for rank, rel in enumerate(relevant, start=1):
                if rel:
                    mrr = 1 / rank
                    break

            precision_scores.append(precision)
            recall_scores.append(recall)
            mrr_scores.append(mrr)

        results.append({
            "model": model_name,
            "precision": round(np.mean(precision_scores), 3),
            "recall": round(np.mean(recall_scores), 3),
            "mrr": round(np.mean(mrr_scores), 3),
        })

    df = pd.DataFrame(results)
    print(df)

    def make_plot(metric):
        # Render one horizontal bar chart and return it as a base64 data URI
        plt.figure(figsize=(6, 4))
        plt.barh(df["model"], df[metric], color="skyblue")
        plt.title(f"{metric.upper()} Comparison")
        plt.xlabel(metric.upper())
        plt.tight_layout()
        buf = BytesIO()
        plt.savefig(buf, format="png")
        buf.seek(0)
        img_base64 = base64.b64encode(buf.getvalue()).decode("utf-8")
        plt.close()
        return f"data:image/png;base64,{img_base64}"

    # No plots when every model failed to load
    plot_precision = make_plot("precision") if not df.empty else None
    plot_recall = make_plot("recall") if not df.empty else None
    plot_mrr = make_plot("mrr") if not df.empty else None

    return templates.TemplateResponse(
        "benchmark_dashboard.html",
        {
            "request": request,
            "results": results,
            "plot_precision": plot_precision,
            "plot_recall": plot_recall,
            "plot_mrr": plot_mrr,
        },
    )
|
| 446 |
+
|
| 447 |
+
# ---------------------------------------------------------
|
| 448 |
+
# Run app
|
| 449 |
+
# ---------------------------------------------------------
|
| 450 |
+
if __name__ == "__main__":
|
| 451 |
+
uvicorn.run("app.main:app", host="0.0.0.0", port=8000, reload=True)
|
app/reader.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import List, Dict
|
| 3 |
+
from dotenv import dotenv_values
|
| 4 |
+
from transformers import pipeline
|
| 5 |
+
|
| 6 |
+
# --- Load from .env first, then fall back to system environment (for cloud deployment) ---
|
| 7 |
+
env_vars = dotenv_values(".env") # returns a dict from the .env file
|
| 8 |
+
|
| 9 |
+
def get_env(key, default=None):
    """Return the setting named *key*, preferring the parsed .env values.

    The .env mapping is consulted first, then the process environment. A
    missing or empty value at one level falls through to the next, and
    *default* is returned when neither source yields a truthy value.
    """
    for candidate in (env_vars.get(key), os.environ.get(key)):
        if candidate:
            return candidate
    return default
|
| 12 |
+
|
| 13 |
+
# --- FIX: Use the correct modern SDK import (google-genai) and initialize client ---
|
| 14 |
+
genai = None
|
| 15 |
+
_gemini_client = None
|
| 16 |
+
_api_key = get_env("GEMINI_API_KEY")
|
| 17 |
+
try:
|
| 18 |
+
from google import genai
|
| 19 |
+
from google.genai import types
|
| 20 |
+
|
| 21 |
+
# Initialize client - check both .env and system env for cloud deployment
|
| 22 |
+
if _api_key:
|
| 23 |
+
_gemini_client = genai.Client(api_key=_api_key)
|
| 24 |
+
except ImportError:
|
| 25 |
+
pass
|
| 26 |
+
except Exception as e:
|
| 27 |
+
print(f"Warning: Failed to initialize Gemini client. Check API key/configuration. Error: {e}")
|
| 28 |
+
|
| 29 |
+
class LLMReader:
    """
    LLM Reader using Google Gemini (via GEMINI_API_KEY from .env or environment).

    Falls back to a local distilgpt2 text-generation pipeline when the API key
    is missing, the Gemini client could not be created, or an unknown provider
    name is requested.
    """

    # Hugging Face model id used for the local fallback pipeline.
    _LOCAL_MODEL = "distilgpt2"

    def __init__(self, provider: str = "gemini"):
        """
        Pick the backend and load what it needs.

        Args:
            provider: "gemini" (default) or "local", matched case-insensitively.
                Any other value is replaced by "local" after a warning.

        Raises:
            ImportError: Gemini was requested and an API key is set, but the
                google-genai SDK could not be imported.
        """
        self.provider = provider.lower()

        # Load from .env or system environment (for cloud deployment)
        self.model = get_env("VDOCRAG_LLM_MODEL", "gemini-2.5-flash")
        self.api_key = get_env("GEMINI_API_KEY")
        self.client = _gemini_client
        self.local_pipeline = None

        print("=" * 50)
        print("LLMReader Init: Loading GEMINI_API_KEY...")
        if self.api_key:
            # Report presence only: printing any part of the key would leak
            # secret material into stdout / hosted logs.
            print("LLMReader Init: SUCCESS. GEMINI_API_KEY is set.")
        else:
            print("LLMReader Init: FAILED. GEMINI_API_KEY not found.")
        print("=" * 50)

        if self.provider == "gemini":
            # Check for API key first - if missing, fall back to local
            if not self.api_key:
                print("⚠️ No GEMINI_API_KEY found, switching to local model.")
                self.provider = "local"
            elif genai is None:
                raise ImportError("Please install the modern Google GenAI SDK: `pip install google-genai`.")
            elif self.client is None:
                print("⚠️ Failed to initialize Gemini client, switching to local model.")
                self.provider = "local"
        elif self.provider != "local":
            print(f"⚠️ Unknown provider '{self.provider}', defaulting to local.")
            self.provider = "local"

        if self.provider == "local":
            self._load_local_pipeline()

    def _load_local_pipeline(self):
        """Create the local text-generation pipeline unless it already exists."""
        if self.local_pipeline is None:
            print(f"Loading local model: {self._LOCAL_MODEL}...")
            self.local_pipeline = pipeline("text-generation", model=self._LOCAL_MODEL)

    # --------------------------
    # Gemini call (modern SDK)
    # --------------------------
    def _call_gemini(self, query: str, context: str) -> str:
        """
        Ask Gemini to answer *query* from *context*.

        Returns the stripped response text. Failures are not raised: they are
        returned as a string starting with "[Gemini API Error]".
        """
        system_prompt = (
            "You are a precise data analysis assistant. "
            "Given the provided CONTEXT, answer the user's QUESTION accurately. "
            "If calculations are needed, perform them. "
            "Only respond with the final answer and no additional commentary or explanation."
        )

        user_content = f"CONTEXT:\n---\n{context}\n---\nQUESTION: {query}"

        try:
            config = types.GenerateContentConfig(
                system_instruction=system_prompt,
                temperature=0.1
            )
            response = self.client.models.generate_content(
                model=self.model,
                contents=user_content,
                config=config
            )
            text = response.text
        except Exception as e:
            return f"[Gemini API Error] {type(e).__name__}: {e}"

        if text is None:
            # Presumably a blocked or empty candidate; say so instead of
            # surfacing an AttributeError from None.strip().
            return "[Gemini API Error] Empty response: the model returned no text."
        return text.strip()

    # --------------------------
    # Local fallback
    # --------------------------
    def _call_local(self, query: str, context: str) -> str:
        """
        Answer *query* with the local pipeline.

        Returns the text generated after the prompt, or a bracketed failure
        message when nothing new was generated or the (non-empty) context was
        simply echoed back.
        """
        prompt = (
            f"CONTEXT:\n{context}\n\n"
            f"Based on the context, answer the following question:\n"
            f"QUESTION: {query}\n"
            f"ANSWER:"
        )

        result = self.local_pipeline(
            prompt,
            max_new_tokens=100,
            do_sample=True,
            truncation=True
        )
        generated_text = result[0]["generated_text"]

        # Only strip the prompt when the output really starts with it; with
        # truncation=True the echoed prompt may have been shortened, and a
        # blind slice would then cut into the answer.
        if generated_text.startswith(prompt):
            answer = generated_text[len(prompt):].strip()
        else:
            answer = generated_text.strip()

        # "" is a substring of every string, so the echo check must be skipped
        # for an empty context or every answer would be rejected.
        echoed_context = bool(context) and context in answer
        if not answer or echoed_context:
            return "[Local model failed to generate a new answer and may have repeated the context]"
        return answer

    # --------------------------
    # Main answer method
    # --------------------------
    def answer_question(self, query: str, context: str, sources: List[Dict]) -> Dict:
        """
        Answer *query* from *context* and attach provenance for *sources*.

        Args:
            query: the user's question.
            context: retrieved text handed to the model.
            sources: hit dicts; "metadata" (with "page"), "text" and "score"
                are read when present.

        Returns:
            {"text": <answer string>, "sources": [{"page", "text", "score"}, ...]}
            where each source text is cut to its first 200 characters.
        """
        if self.provider == "gemini":
            answer_text = self._call_gemini(query, context)
        elif self.provider == "local":
            answer_text = self._call_local(query, context)
        else:
            answer_text = f"[Error: Unknown provider '{self.provider}']"

        provenance = [
            {
                # Tolerate hits without metadata/text instead of raising KeyError
                "page": (s.get("metadata") or {}).get("page"),
                "text": (s.get("text") or "")[:200],
                "score": s.get("score", 0),
            }
            for s in (sources or [])
        ]

        return {"text": answer_text, "sources": provenance}
|
app/tables.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app/tables.py
|
| 2 |
+
import os
|
| 3 |
+
import uuid
|
| 4 |
+
import pdfplumber
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from typing import List, Dict
|
| 7 |
+
|
| 8 |
+
TABLES_DIR = os.environ.get('VDOCRAG_TABLES_DIR', '/tmp/vdoc_tables')
|
| 9 |
+
os.makedirs(TABLES_DIR, exist_ok=True)
|
| 10 |
+
|
| 11 |
+
def extract_tables_from_pdf(pdf_path: str) -> List[Dict]:
    """
    Extract tables using pdfplumber and save each as CSV.

    Args:
        pdf_path: path of the PDF to scan.

    Returns:
        One metadata dict per table:
        { 'csv_path': str, 'page': int (1-based), 'table_index': int (0-based
          within the page), 'summary_text': str, 'rows': int,
          'bbox': (x0, top, x1, bottom) or None }
    """
    results = []
    with pdfplumber.open(pdf_path) as pdf:
        for pno, page in enumerate(pdf.pages, start=1):
            try:
                tables = page.extract_tables()
            except Exception:
                tables = []

            # Locate the tables once per page rather than once per table.
            # NOTE(review): relies on find_tables() returning tables in the
            # same order as extract_tables() - confirm against pdfplumber.
            try:
                found_tables = page.find_tables() if tables else []
            except Exception:
                found_tables = []

            for tidx, table in enumerate(tables):
                # Convert to DataFrame, using the first row as header when
                # there is more than one row
                try:
                    df = pd.DataFrame(table[1:], columns=table[0]) if len(table) > 1 else pd.DataFrame(table)
                except Exception:
                    df = pd.DataFrame(table)

                fname = f"table_{uuid.uuid4().hex}_p{pno}_t{tidx}.csv"
                csv_path = os.path.join(TABLES_DIR, fname)
                # Save CSV
                try:
                    df.to_csv(csv_path, index=False)
                except Exception:
                    df.to_csv(csv_path, index=False, encoding='utf-8', errors='ignore')

                # Get table bbox (approximate). tidx is already 0-based, so it
                # indexes found_tables directly; the previous `tidx - 1` gave
                # the first table the bbox of the last one.
                table_bbox = found_tables[tidx].bbox if tidx < len(found_tables) else None

                # create a short textual summary: columns and first N rows
                cols = list(df.columns) if len(df.columns) > 0 else []
                top_rows = df.head(5).to_dict(orient='records')
                summary = f"Table (page {pno}) with columns: {cols}. First rows: {top_rows}"

                results.append({
                    'csv_path': csv_path,
                    'page': pno,
                    'table_index': tidx,
                    'summary_text': summary,
                    'rows': len(df),
                    'bbox': table_bbox
                })
    return results
|
app/templates/benchmark_dashboard.html
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<title>📊 Embedding Benchmark Dashboard</title>
|
| 6 |
+
<script src="https://cdn.tailwindcss.com"></script>
|
| 7 |
+
</head>
|
| 8 |
+
<body class="bg-blue-50 text-blue-800 min-h-screen flex flex-col items-center py-8">
|
| 9 |
+
<div class="bg-white shadow-lg rounded-xl p-8 w-full max-w-5xl">
|
| 10 |
+
<h1 class="text-2xl font-bold text-center mb-6">📊 Embedding Model Benchmark</h1>
|
| 11 |
+
|
| 12 |
+
<form action="/run_benchmark" method="post" class="space-y-3 mb-6">
|
| 13 |
+
<label class="block text-sm font-semibold">Enter model names (comma-separated)</label>
|
| 14 |
+
<input type="text" name="models"
|
| 15 |
+
value="all-MiniLM-L6-v2, multi-qa-MiniLM-L6-cos-v1, models/vdoc_feedback_tuned/latest"
|
| 16 |
+
class="border rounded w-full p-2 focus:outline-none focus:ring-2 focus:ring-blue-400"
|
| 17 |
+
required>
|
| 18 |
+
|
| 19 |
+
<div class="flex space-x-4">
|
| 20 |
+
<div class="flex-1">
|
| 21 |
+
<label class="block text-sm font-semibold">Chunk Size</label>
|
| 22 |
+
<input type="number" name="chunk_size" value="200" class="border p-2 rounded w-full">
|
| 23 |
+
</div>
|
| 24 |
+
<div class="flex-1">
|
| 25 |
+
<label class="block text-sm font-semibold">Top-K</label>
|
| 26 |
+
<input type="number" name="top_k" value="5" class="border p-2 rounded w-full">
|
| 27 |
+
</div>
|
| 28 |
+
</div>
|
| 29 |
+
|
| 30 |
+
<button type="submit"
|
| 31 |
+
class="bg-blue-600 text-white px-6 py-2 rounded hover:bg-blue-700 w-full mt-3">
|
| 32 |
+
🚀 Run Benchmark
|
| 33 |
+
</button>
|
| 34 |
+
</form>
|
| 35 |
+
|
| 36 |
+
{% if results %}
|
| 37 |
+
<h2 class="text-xl font-semibold mb-4">📈 Results</h2>
|
| 38 |
+
<table class="w-full border border-gray-300 text-sm mb-6">
|
| 39 |
+
<thead class="bg-blue-100 text-blue-800">
|
| 40 |
+
<tr>
|
| 41 |
+
<th class="border px-3 py-1 text-left">Model</th>
|
| 42 |
+
<th class="border px-3 py-1">Precision</th>
|
| 43 |
+
<th class="border px-3 py-1">Recall</th>
|
| 44 |
+
<th class="border px-3 py-1">MRR</th>
|
| 45 |
+
</tr>
|
| 46 |
+
</thead>
|
| 47 |
+
<tbody>
|
| 48 |
+
{% for r in results %}
|
| 49 |
+
<tr class="border-b hover:bg-blue-50">
|
| 50 |
+
<td class="px-3 py-1">{{ r.model }}</td>
|
| 51 |
+
<td class="px-3 py-1 text-center">{{ r.precision }}</td>
|
| 52 |
+
<td class="px-3 py-1 text-center">{{ r.recall }}</td>
|
| 53 |
+
<td class="px-3 py-1 text-center">{{ r.mrr }}</td>
|
| 54 |
+
</tr>
|
| 55 |
+
{% endfor %}
|
| 56 |
+
</tbody>
|
| 57 |
+
</table>
|
| 58 |
+
|
| 59 |
+
<div class="grid grid-cols-1 md:grid-cols-3 gap-4 mb-6">
|
| 60 |
+
{% if plot_precision %}
|
| 61 |
+
<img src="{{ plot_precision }}" class="rounded shadow">
|
| 62 |
+
{% endif %}
|
| 63 |
+
{% if plot_recall %}
|
| 64 |
+
<img src="{{ plot_recall }}" class="rounded shadow">
|
| 65 |
+
{% endif %}
|
| 66 |
+
{% if plot_mrr %}
|
| 67 |
+
<img src="{{ plot_mrr }}" class="rounded shadow">
|
| 68 |
+
{% endif %}
|
| 69 |
+
</div>
|
| 70 |
+
{% endif %}
|
| 71 |
+
|
| 72 |
+
{% if message %}
|
| 73 |
+
<p class="text-center text-red-700 font-semibold">{{ message }}</p>
|
| 74 |
+
{% endif %}
|
| 75 |
+
|
| 76 |
+
<div class="text-center mt-8">
|
| 77 |
+
<a href="/" class="text-blue-600 hover:underline">← Back to Main Interface</a>
|
| 78 |
+
</div>
|
| 79 |
+
</div>
|
| 80 |
+
</body>
|
| 81 |
+
</html>
|
app/templates/feedback_dashboard.html
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<title>🧠 VDoc Feedback Dashboard</title>
|
| 6 |
+
<script src="https://cdn.tailwindcss.com"></script>
|
| 7 |
+
</head>
|
| 8 |
+
<body class="bg-blue-50 text-blue-800 min-h-screen flex flex-col items-center py-8">
|
| 9 |
+
<div class="bg-white shadow-lg rounded-xl p-8 w-full max-w-4xl">
|
| 10 |
+
<h1 class="text-2xl font-bold mb-4 text-center">🧠 Feedback Dashboard</h1>
|
| 11 |
+
|
| 12 |
+
<p class="text-center text-gray-600 mb-4">{{ summary }}</p>
|
| 13 |
+
|
| 14 |
+
<div class="grid grid-cols-3 gap-4 text-center mb-6">
|
| 15 |
+
<div class="bg-green-100 p-3 rounded-lg">
|
| 16 |
+
<p class="text-xl font-bold text-green-700">{{ correct }}</p>
|
| 17 |
+
<p class="text-sm text-green-800">Correct</p>
|
| 18 |
+
</div>
|
| 19 |
+
<div class="bg-red-100 p-3 rounded-lg">
|
| 20 |
+
<p class="text-xl font-bold text-red-700">{{ incorrect }}</p>
|
| 21 |
+
<p class="text-sm text-red-800">Incorrect</p>
|
| 22 |
+
</div>
|
| 23 |
+
<div class="bg-blue-100 p-3 rounded-lg">
|
| 24 |
+
<p class="text-xl font-bold text-blue-700">{{ total }}</p>
|
| 25 |
+
<p class="text-sm text-blue-800">Total Feedback</p>
|
| 26 |
+
</div>
|
| 27 |
+
</div>
|
| 28 |
+
|
| 29 |
+
<!-- Train model -->
|
| 30 |
+
<form action="/train_feedback_model" method="post" class="text-center mb-8">
|
| 31 |
+
<button type="submit"
|
| 32 |
+
class="bg-blue-600 text-white px-6 py-2 rounded hover:bg-blue-700">
|
| 33 |
+
🚀 Train Model from Feedback
|
| 34 |
+
</button>
|
| 35 |
+
</form>
|
| 36 |
+
|
| 37 |
+
{% if message %}
|
| 38 |
+
<div class="bg-gray-50 border-l-4 border-blue-400 p-3 mb-6">
|
| 39 |
+
<p class="text-gray-700 font-medium">{{ message }}</p>
|
| 40 |
+
<pre class="text-xs text-gray-600 mt-2 whitespace-pre-wrap">{{ train_output }}</pre>
|
| 41 |
+
</div>
|
| 42 |
+
{% endif %}
|
| 43 |
+
|
| 44 |
+
<!-- Feedback log -->
|
| 45 |
+
<h2 class="text-xl font-semibold mb-3">📜 Recent Feedback</h2>
|
| 46 |
+
<div class="max-h-96 overflow-y-auto border rounded p-3 bg-gray-50">
|
| 47 |
+
{% for fb in feedback_data %}
|
| 48 |
+
<div class="mb-3 border-b pb-2">
|
| 49 |
+
<p class="text-sm"><strong>🕓</strong> {{ fb.timestamp }}</p>
|
| 50 |
+
<p class="text-sm"><strong>❓</strong> {{ fb.question }}</p>
|
| 51 |
+
<p class="text-sm"><strong>💬</strong> {{ fb.answer }}</p>
|
| 52 |
+
<p class="text-sm">
|
| 53 |
+
<strong>✅</strong> {{ fb.correctness|capitalize }}
|
| 54 |
+
</p>
|
| 55 |
+
</div>
|
| 56 |
+
{% endfor %}
|
| 57 |
+
</div>
|
| 58 |
+
|
| 59 |
+
<div class="text-center mt-8">
|
| 60 |
+
<a href="/" class="text-blue-600 hover:underline">← Back to Main Interface</a>
|
| 61 |
+
</div>
|
| 62 |
+
</div>
|
| 63 |
+
</body>
|
| 64 |
+
</html>
|
app/templates/index.html
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<title>VDoc RAG - Web UI</title>
|
| 6 |
+
<script src="https://cdn.tailwindcss.com"></script>
|
| 7 |
+
</head>
|
| 8 |
+
<body class="bg-blue-100 text-blue-800 min-h-screen flex flex-col items-center justify-center">
|
| 9 |
+
<div class="bg-white shadow-lg rounded-xl p-8 w-full max-w-2xl">
|
| 10 |
+
<h1 class="text-2xl font-bold text-center mb-6 text-blue-800">📄 VDoc RAG Web Interface</h1>
|
| 11 |
+
|
| 12 |
+
<!-- Upload Form -->
|
| 13 |
+
<form action="/upload" method="post" enctype="multipart/form-data" class="flex flex-col items-center space-y-3 mb-6">
|
| 14 |
+
<input type="file" name="file" accept=".pdf,.png,.jpg,.jpeg" required class="border p-2 rounded w-full">
|
| 15 |
+
<button type="submit" class="bg-blue-600 text-white px-4 py-2 rounded hover:bg-blue-700">Upload & Index</button>
|
| 16 |
+
</form>
|
| 17 |
+
|
| 18 |
+
<div class="mt-6 text-center">
|
| 19 |
+
<a href="/feedback_dashboard"
|
| 20 |
+
class="text-blue-700 font-semibold hover:underline">
|
| 21 |
+
🧠 Open Feedback Dashboard
|
| 22 |
+
</a>
|
| 23 |
+
</div>
|
| 24 |
+
|
| 25 |
+
<div class="mt-3 text-center">
|
| 26 |
+
<a href="/benchmark_dashboard"
|
| 27 |
+
class="text-blue-700 font-semibold hover:underline">
|
| 28 |
+
📊 Open Benchmark Dashboard
|
| 29 |
+
</a>
|
| 30 |
+
</div>
|
| 31 |
+
|
| 32 |
+
<!-- Cache Clear Button -->
|
| 33 |
+
<form action="/clear_cache" method="post" class="mb-6">
|
| 34 |
+
<button type="submit"
|
| 35 |
+
class="bg-red-600 text-white px-4 py-2 rounded hover:bg-red-700 w-full">
|
| 36 |
+
🧹 Clear Cache
|
| 37 |
+
</button>
|
| 38 |
+
</form>
|
| 39 |
+
|
| 40 |
+
<!-- Clear Persistent Index Button -->
|
| 41 |
+
<form action="/clear_index" method="post" class="mb-6">
|
| 42 |
+
<button type="submit"
|
| 43 |
+
class="bg-orange-600 text-white px-4 py-2 rounded hover:bg-orange-700 w-full">
|
| 44 |
+
🗑️ Clear Persistent Index
|
| 45 |
+
</button>
|
| 46 |
+
</form>
|
| 47 |
+
|
| 48 |
+
{% if uploaded %}
|
| 49 |
+
<p class="text-green-600 font-semibold mb-4">Uploaded files: {{ uploaded|join(', ') }}</p>
|
| 50 |
+
{% endif %}
|
| 51 |
+
|
| 52 |
+
<!-- Ask Question -->
|
| 53 |
+
<form action="/ask" method="post" class="space-y-3 mb-4">
|
| 54 |
+
<input type="text" name="question" placeholder="Ask a question about your document..." required
|
| 55 |
+
class="border rounded w-full p-2 focus:outline-none focus:ring-2 focus:ring-blue-400">
|
| 56 |
+
<button type="submit" class="bg-blue-600 text-white px-4 py-2 rounded hover:bg-blue-700 w-full">Ask</button>
|
| 57 |
+
</form>
|
| 58 |
+
|
| 59 |
+
{% if chunks %}
|
| 60 |
+
<div style="margin-top: 2em;">
|
| 61 |
+
<h3>🔍 Retrieved Chunks (Used in Prompt)</h3>
|
| 62 |
+
<ul>
|
| 63 |
+
{% for c in chunks %}
|
| 64 |
+
<li class="border-b border-gray-200 py-2">
|
| 65 |
+
<strong>[{{ c.index }}]</strong>
|
| 66 |
+
(Page: {{ c.page }}, BBox: {{ c.bbox }})<br>
|
| 67 |
+
<code>{{ c.text }}</code><br>
|
| 68 |
+
<span class="text-sm text-gray-500">🔹 Confidence: {{ c.confidence }}%</span>
|
| 69 |
+
</li>
|
| 70 |
+
{% endfor %}
|
| 71 |
+
</ul>
|
| 72 |
+
</div>
|
| 73 |
+
{% endif %}
|
| 74 |
+
|
| 75 |
+
<!-- Answer Section -->
|
| 76 |
+
{% if answer %}
|
| 77 |
+
<div class="bg-blue-50 border rounded-lg p-4 mt-4">
|
| 78 |
+
<h2 class="text-lg font-semibold text-blue-700 mb-2">Answer:</h2>
|
| 79 |
+
<p>{{ answer }}</p>
|
| 80 |
+
{% if sources %}
|
| 81 |
+
<h3 class="font-semibold mt-3">Sources:</h3>
|
| 82 |
+
<ul class="list-disc list-inside text-sm text-blue-700">
|
| 83 |
+
{% for s in sources %}
|
| 84 |
+
<li>Page {{ s.page }} → {{ s.text[:100] }}...</li>
|
| 85 |
+
{% endfor %}
|
| 86 |
+
</ul>
|
| 87 |
+
{% endif %}
|
| 88 |
+
<!-- Feedback Section -->
|
| 89 |
+
<form action="/feedback" method="post" class="mt-3 flex space-x-2">
|
| 90 |
+
<input type="hidden" name="question" value="{{ question }}">
|
| 91 |
+
<input type="hidden" name="answer" value="{{ answer }}">
|
| 92 |
+
<button type="submit" name="correctness" value="correct"
|
| 93 |
+
class="bg-green-600 text-white px-3 py-1 rounded hover:bg-green-700">
|
| 94 |
+
✅ Correct
|
| 95 |
+
</button>
|
| 96 |
+
<button type="submit" name="correctness" value="incorrect"
|
| 97 |
+
class="bg-red-600 text-white px-3 py-1 rounded hover:bg-red-700">
|
| 98 |
+
❌ Incorrect
|
| 99 |
+
</button>
|
| 100 |
+
</form>
|
| 101 |
+
</div>
|
| 102 |
+
<div id="highlight-section">
|
| 103 |
+
{% if highlight_images %}
|
| 104 |
+
<h3>📄 Relevant PDF Pages:</h3>
|
| 105 |
+
<div id="highlight-gallery">
|
| 106 |
+
{% for img in highlight_images %}
|
| 107 |
+
<img src="{{ img }}?v={{ loop.index }}" class="highlight-img"
|
| 108 |
+
style="max-width:80%; margin:10px; border:3px solid red;" />
|
| 109 |
+
{% endfor %}
|
| 110 |
+
</div>
|
| 111 |
+
{% endif %}
|
| 112 |
+
</div>
|
| 113 |
+
|
| 114 |
+
{% if confidence_avg is defined %}
|
| 115 |
+
<p class="text-sm text-gray-600 mt-2">🧠 Average confidence: {{ confidence_avg }}%</p>
|
| 116 |
+
{% endif %}
|
| 117 |
+
|
| 118 |
+
<script>
|
| 119 |
+
// Clear old images before new ones are inserted
|
| 120 |
+
document.addEventListener("DOMContentLoaded", function() {
|
| 121 |
+
const form = document.querySelector("form[action='/ask']");
|
| 122 |
+
if (form) {
|
| 123 |
+
form.addEventListener("submit", () => {
|
| 124 |
+
const gallery = document.getElementById("highlight-gallery");
|
| 125 |
+
if (gallery) gallery.innerHTML = ""; // remove old images
|
| 126 |
+
});
|
| 127 |
+
}
|
| 128 |
+
});
|
| 129 |
+
</script>
|
| 130 |
+
{% endif %}
|
| 131 |
+
</div>
|
| 132 |
+
</body>
|
| 133 |
+
</html>
|
app/utils.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def bbox_to_dict(bbox):
    """Turn a 4-item (x0, y0, x1, y1) box into a dict of int coordinates."""
    left, upper, right, lower = bbox
    corners = (("x0", left), ("y0", upper), ("x1", right), ("y1", lower))
    return {label: int(value) for label, value in corners}
|
app/visual_highlight.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import uuid
|
| 3 |
+
import json
|
| 4 |
+
import ast
|
| 5 |
+
from pdf2image import convert_from_path
|
| 6 |
+
from PIL import Image, ImageDraw
|
| 7 |
+
|
| 8 |
+
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def load_calibration(config_path="highlight_calibration.json"):
    """
    Load calibration values from JSON or fall back to defaults.

    Args:
        config_path: path of the calibration JSON file.

    Returns:
        The dict parsed from the file, or the default calibration
        ({"x_offset": 0, "x_scale": 1.0, "y_offset": 0, "y_scale": 1.0}) when
        the file is missing, unreadable, not valid JSON, or not a JSON object.
    """
    defaults = {"x_offset": 0, "x_scale": 1.0, "y_offset": 0, "y_scale": 1.0}

    if not os.path.exists(config_path):
        print("⚠️ No calibration file found. Using defaults.")
        return defaults

    try:
        with open(config_path, "r", encoding="utf-8") as f:
            calib = json.load(f)
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        print(f"⚠️ Could not read calibration file {config_path}: {e}. Using defaults.")
        return defaults

    if not isinstance(calib, dict):
        # Callers use calib.get(...), so anything but an object is unusable
        print(f"⚠️ Calibration file {config_path} is not a JSON object. Using defaults.")
        return defaults

    print(f"✅ Loaded calibration: {calib}")
    return calib
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def render_highlighted_pages(pdf_path, hits, output_dir=None, dpi=150, pad=100):
    """
    Render the page of the top hit as an image, highlight its bounding box with
    calibration applied, and crop the image around the highlighted area.

    Args:
        pdf_path: path of the PDF to render.
        hits: list of hit dicts; each "metadata" dict supplies "page" (1-based
            page number) and "bbox" (4 numbers, or a stringified list of them).
            Only the first hit is used.
        output_dir: directory for the PNGs; defaults to "highlighted" next to
            this module. Every file already in it is deleted first.
        dpi: resolution passed to pdf2image when rasterising the page.
        pad: margin in pixels kept around the highlighted region when cropping.

    Returns:
        List of paths of the PNG files written (empty when no page could be
        rendered).
    """
    if output_dir is None:
        output_dir = os.path.join(BASE_DIR, "highlighted")
    os.makedirs(output_dir, exist_ok=True)

    calib = load_calibration()
    x_offset = calib.get("x_offset", 0)
    x_scale = calib.get("x_scale", 1.0)
    y_offset = calib.get("y_offset", 0)
    y_scale = calib.get("y_scale", 1.0)

    # Clean previous outputs (best effort: a file that cannot be removed is left)
    for old in os.listdir(output_dir):
        try:
            os.remove(os.path.join(output_dir, old))
        except OSError:
            pass

    # Only the top-ranked hit is highlighted
    hits = hits[:1]

    pages_to_render = set()
    for h in hits:
        page = (h.get("metadata") or {}).get("page")
        if isinstance(page, int) and page >= 1:
            pages_to_render.add(page)
        else:
            print(f"[WARN] Skipping hit with invalid page number: {page}")

    result_paths = []

    for page_num in sorted(pages_to_render):
        # Rasterise just this page instead of the whole document; pdf2image
        # returns an empty list when the page is beyond the end of the PDF.
        rendered = convert_from_path(pdf_path, dpi=dpi, first_page=page_num, last_page=page_num)
        if not rendered:
            print(f"[WARN] Page {page_num} is out of range for {pdf_path}")
            continue

        img = rendered[0].convert("RGBA")
        w_img, h_img = img.size
        overlay = Image.new("RGBA", img.size, (255, 255, 255, 0))
        draw = ImageDraw.Draw(overlay)
        page_bboxes = []

        for h in hits:
            meta = h.get("metadata", {})
            if meta.get("page") != page_num:
                continue
            bbox = meta.get("bbox")
            # Debug raw bbox
            print(f"[DEBUG] page {page_num} raw bbox type: {type(bbox)} value: {bbox}")

            # Safe parsing: accept list/tuple or stringified list
            try:
                if isinstance(bbox, str):
                    bbox = ast.literal_eval(bbox)
                if not bbox or not isinstance(bbox, (list, tuple)) or len(bbox) != 4:
                    print(f"[WARN] Invalid bbox for page {page_num}: {bbox}")
                    continue
                # Apply calibration
                x0, y0, x1, y1 = [float(v) for v in bbox]
                x0 = x0 * x_scale + x_offset
                x1 = x1 * x_scale + x_offset
                y0 = y0 * y_scale + y_offset
                y1 = y1 * y_scale + y_offset
            except Exception as e:
                print(f"[ERROR] Failed to parse bbox for page {page_num}: {bbox} -> {e}")
                continue

            # Normalise corner order and clamp the box to the image
            left, top = max(0, min(x0, x1)), max(0, min(y0, y1))
            right, bottom = min(w_img, max(x0, x1)), min(h_img, max(y0, y1))

            if right <= left or bottom <= top:
                continue

            page_bboxes.append((left, top, right, bottom))
            draw.rectangle(
                [left, top, right, bottom],
                outline=(255, 0, 0),
                width=4,
                fill=(255, 0, 0, 100)
            )

        # Merge highlights with image
        highlighted = Image.alpha_composite(img, overlay)

        # --- Crop around highlighted region (+pad px margin) ---
        if page_bboxes:
            min_x = min(b[0] for b in page_bboxes)
            min_y = min(b[1] for b in page_bboxes)
            max_x = max(b[2] for b in page_bboxes)
            max_y = max(b[3] for b in page_bboxes)

            crop_box = (
                max(0, int(min_x - pad)),
                max(0, int(min_y - pad)),
                int(min(max_x + pad, w_img)),
                int(min(max_y + pad, h_img)),
            )

            cropped = highlighted.crop(crop_box)
        else:
            cropped = highlighted  # fallback if no bbox

        # Log how many boxes were drawn
        print(f"✅ Drew {len(page_bboxes)} boxes on page {page_num}")

        out_path = os.path.join(output_dir, f"highlight_page{page_num}_{uuid.uuid4().hex}.png")
        cropped.convert("RGB").save(out_path)
        result_paths.append(out_path)

        print(f"✅ Highlighted and cropped page {page_num}: {out_path}")

    return result_paths
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
# Example usage
|
| 133 |
+
if __name__ == "__main__":
|
| 134 |
+
hits = [
|
| 135 |
+
{"metadata": {"page": 2, "bbox": [87, 222, 592, 250], "type": "text"}},
|
| 136 |
+
]
|
| 137 |
+
|
| 138 |
+
render_highlighted_pages("samples/vdoc_rag_test.pdf", hits)
|
highlight_calibration.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"x_offset": -33.0,
|
| 3 |
+
"x_scale": 1.0,
|
| 4 |
+
"y_offset": -65.0,
|
| 5 |
+
"y_scale": 1.02
|
| 6 |
+
}
|
notebooks/evaluate_embeddings.ipynb
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"id": "55b021d5",
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"source": [
|
| 8 |
+
"# Embedding & Retrieval Evaluation\n",
|
| 9 |
+
"\n",
|
| 10 |
+
"This notebook benchmarks embedding models and chunk sizes for retrieval quality, using chunks extracted from a sample PDF (with a tiny built-in fallback corpus) and your collected feedback as a small labeled set. Metrics: Precision@K, Recall@K, and MRR."
|
| 11 |
+
]
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"cell_type": "code",
|
| 15 |
+
"execution_count": null,
|
| 16 |
+
"id": "18518993",
|
| 17 |
+
"metadata": {},
|
| 18 |
+
"outputs": [
|
| 19 |
+
{
|
| 20 |
+
"ename": "",
|
| 21 |
+
"evalue": "",
|
| 22 |
+
"output_type": "error",
|
| 23 |
+
"traceback": [
|
| 24 |
+
"\u001b[1;31mFailed to start the Kernel. \n",
|
| 25 |
+
"\u001b[1;31mPermissionError: [WinError 5] Access is denied: 'C:\\\\Users\\\\abhin\\\\.ipython\\\\profile_default\\\\security'. \n",
|
| 26 |
+
"\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
|
| 27 |
+
]
|
| 28 |
+
}
|
| 29 |
+
],
|
| 30 |
+
"source": [
|
| 31 |
+
"# Standard imports\n",
|
| 32 |
+
"import os\n",
|
| 33 |
+
"import json\n",
|
| 34 |
+
"import numpy as np\n",
|
| 35 |
+
"import pandas as pd\n",
|
| 36 |
+
"from tqdm import tqdm\n",
|
| 37 |
+
"from sentence_transformers import SentenceTransformer\n",
|
| 38 |
+
"from sklearn.metrics.pairwise import cosine_similarity\n",
|
| 39 |
+
"import matplotlib.pyplot as plt\n",
|
| 40 |
+
"\n",
|
| 41 |
+
"# Project imports (uses your existing pipeline)\n",
|
| 42 |
+
"from app.feedback_manager import _load_feedback\n",
|
| 43 |
+
"from app.ingest import process_pdf\n",
|
| 44 |
+
"from app.embeddings import TextImageEmbedder\n",
|
| 45 |
+
"\n",
|
| 46 |
+
"# Config\n",
|
| 47 |
+
"BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))\n",
|
| 48 |
+
"PDF_PATH = os.path.join(BASE_DIR, \"samples\", \"vdoc_rag_test.pdf\") # replace with a real sample PDF path\n",
|
| 49 |
+
"STORAGE_DIR = os.path.join(BASE_DIR, \"storage\", \"chroma_db\")\n",
|
| 50 |
+
"\n",
|
| 51 |
+
"MODELS_TO_TEST = [\n",
|
| 52 |
+
" \"all-MiniLM-L6-v2\",\n",
|
| 53 |
+
" \"multi-qa-MiniLM-L6-cos-v1\",\n",
|
| 54 |
+
" \"paraphrase-MiniLM-L3-v2\",\n",
|
| 55 |
+
" os.path.join(BASE_DIR, \"models\", \"vdoc_feedback_tuned\", \"latest\"),\n",
|
| 56 |
+
"]\n",
|
| 57 |
+
"CHUNK_SIZES = [200, 500, 800] # in characters\n",
|
| 58 |
+
"TOP_K = 5\n",
|
| 59 |
+
"\n",
|
| 60 |
+
"print(\"Notebook configured. If the tuned model path does not exist, it will be skipped in runs.\")"
|
| 61 |
+
]
|
| 62 |
+
},
|
| 63 |
+
{
|
| 64 |
+
"cell_type": "code",
|
| 65 |
+
"execution_count": null,
|
| 66 |
+
"id": "863ba97b",
|
| 67 |
+
"metadata": {},
|
| 68 |
+
"outputs": [],
|
| 69 |
+
"source": [
|
| 70 |
+
"# Load feedback (if available)\n",
|
| 71 |
+
"feedback = _load_feedback()\n",
|
| 72 |
+
"print(f\"Loaded {len(feedback)} feedback entries.\")\n",
|
| 73 |
+
"if feedback:\n",
|
| 74 |
+
" sample_queries = [f['question'] for f in feedback]\n",
|
| 75 |
+
" sample_answers = [f['answer'] for f in feedback]\n",
|
| 76 |
+
"else:\n",
|
| 77 |
+
" # fallback small test set\n",
|
| 78 |
+
" sample_queries = [\n",
|
| 79 |
+
" \"What is the trend in yearly sales?\",\n",
|
| 80 |
+
" \"Who scored highest in the table?\",\n",
|
| 81 |
+
" \"What is the event date?\",\n",
|
| 82 |
+
" ]\n",
|
| 83 |
+
" sample_answers = [\"increasing\", \"Charlie\", \"November 20, 2025\"]\n",
|
| 84 |
+
"\n",
|
| 85 |
+
"# Small helper to preview feedback structure\n",
|
| 86 |
+
"if feedback:\n",
|
| 87 |
+
" display(pd.DataFrame(feedback)[['timestamp','question','answer','correctness']].tail(10))"
|
| 88 |
+
]
|
| 89 |
+
},
|
| 90 |
+
{
|
| 91 |
+
"cell_type": "code",
|
| 92 |
+
"execution_count": null,
|
| 93 |
+
"id": "ad0fcb6c",
|
| 94 |
+
"metadata": {},
|
| 95 |
+
"outputs": [],
|
| 96 |
+
"source": [
|
| 97 |
+
"# Helper: process the PDF into chunks (optional - heavy).\n",
|
| 98 |
+
"def load_chunks(pdf_path):\n",
|
| 99 |
+
" if not os.path.exists(pdf_path):\n",
|
| 100 |
+
" raise FileNotFoundError(f\"PDF not found: {pdf_path}\")\n",
|
| 101 |
+
" print(\"Processing PDF into chunks (this may take a while)...\")\n",
|
| 102 |
+
" docs = process_pdf(pdf_path)\n",
|
| 103 |
+
" texts = [d['text'] for d in docs]\n",
|
| 104 |
+
" return texts\n",
|
| 105 |
+
"\n",
|
| 106 |
+
"# Try to load sample chunks if available, otherwise create toy chunks from feedback answers\n",
|
| 107 |
+
"try:\n",
|
| 108 |
+
" chunks = load_chunks(PDF_PATH)\n",
|
| 109 |
+
" print(f\"Total chunks from PDF: {len(chunks)}\")\n",
|
| 110 |
+
"except Exception as e:\n",
|
| 111 |
+
" print(\"Could not process PDF, falling back to feedback-derived tiny corpus:\", e)\n",
|
| 112 |
+
" # fallback corpus built from sample answers/queries for quick runs\n",
|
| 113 |
+
" chunks = [\n",
|
| 114 |
+
" \"Yearly sales have been increasing steadily from 2018 to 2024, with a notable jump in 2021.\",\n",
|
| 115 |
+
" \"Charlie achieved the highest score in the table with 98 points.\",\n",
|
| 116 |
+
" \"The event will be held on November 20, 2025 at the downtown auditorium.\",\n",
|
| 117 |
+
" ]\n",
|
| 118 |
+
" print(f\"Using fallback chunks: {len(chunks)} items\")"
|
| 119 |
+
]
|
| 120 |
+
},
|
| 121 |
+
{
|
| 122 |
+
"cell_type": "code",
|
| 123 |
+
"execution_count": null,
|
| 124 |
+
"id": "5c8b6ffe",
|
| 125 |
+
"metadata": {},
|
| 126 |
+
"outputs": [],
|
| 127 |
+
"source": [
|
| 128 |
+
"# Evaluation function (Precision@K, Recall@K, MRR)\n",
|
| 129 |
+
"def evaluate_model(model_name, chunks, queries, answers, chunk_size, top_k=TOP_K):\n",
|
| 130 |
+
" print(f\"\\n🧠 Evaluating {model_name} (chunk size {chunk_size})\")\n",
|
| 131 |
+
" # Skip model if path does not exist (for tuned model)\n",
|
| 132 |
+
" if os.path.isabs(model_name) and not os.path.exists(model_name):\n",
|
| 133 |
+
" print(f\"- Skipping (path not found): {model_name}\")\n",
|
| 134 |
+
" return None\n",
|
| 135 |
+
"\n",
|
| 136 |
+
" model = SentenceTransformer(model_name)\n",
|
| 137 |
+
"\n",
|
| 138 |
+
" # Split chunks by size\n",
|
| 139 |
+
" split_chunks = []\n",
|
| 140 |
+
" for ch in chunks:\n",
|
| 141 |
+
" for i in range(0, len(ch), chunk_size):\n",
|
| 142 |
+
" split_chunks.append(ch[i:i+chunk_size])\n",
|
| 143 |
+
" chunk_embeddings = model.encode(split_chunks, normalize_embeddings=True, show_progress_bar=False)\n",
|
| 144 |
+
"\n",
|
| 145 |
+
" precision_scores, recall_scores, mrr_scores = [], [], []\n",
|
| 146 |
+
"\n",
|
| 147 |
+
" # Precompute reference counts for recall denominator\n",
|
| 148 |
+
" total_relevant_counts = []\n",
|
| 149 |
+
" for ans in answers:\n",
|
| 150 |
+
" total_relevant_counts.append(sum(1 for c in split_chunks if ans.lower() in c.lower()))\n",
|
| 151 |
+
"\n",
|
| 152 |
+
" for q, ans in tqdm(list(zip(queries, answers)), total=len(queries), desc=f\"Evaluating {model_name}\"):\n",
|
| 153 |
+
" qvec = model.encode([q], normalize_embeddings=True)\n",
|
| 154 |
+
" sims = cosine_similarity(qvec, chunk_embeddings)[0]\n",
|
| 155 |
+
" top_indices = np.argsort(sims)[::-1][:top_k]\n",
|
| 156 |
+
" retrieved_chunks = [split_chunks[i] for i in top_indices]\n",
|
| 157 |
+
"\n",
|
| 158 |
+
" relevant = [1 if ans.lower() in c.lower() else 0 for c in retrieved_chunks]\n",
|
| 159 |
+
" precision = sum(relevant) / top_k\n",
|
| 160 |
+
" recall = sum(relevant) / max(1, total_relevant_counts.pop(0))\n",
|
| 161 |
+
" mrr = 0.0\n",
|
| 162 |
+
" for rank, rel in enumerate(relevant, start=1):\n",
|
| 163 |
+
" if rel == 1:\n",
|
| 164 |
+
" mrr = 1.0 / rank\n",
|
| 165 |
+
" break\n",
|
| 166 |
+
"\n",
|
| 167 |
+
" precision_scores.append(precision)\n",
|
| 168 |
+
" recall_scores.append(recall)\n",
|
| 169 |
+
" mrr_scores.append(mrr)\n",
|
| 170 |
+
"\n",
|
| 171 |
+
" return {\n",
|
| 172 |
+
" \"model\": model_name,\n",
|
| 173 |
+
" \"chunk_size\": chunk_size,\n",
|
| 174 |
+
" \"precision\": float(np.mean(precision_scores)),\n",
|
| 175 |
+
" \"recall\": float(np.mean(recall_scores)),\n",
|
| 176 |
+
" \"mrr\": float(np.mean(mrr_scores)),\n",
|
| 177 |
+
" }"
|
| 178 |
+
]
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"cell_type": "code",
|
| 182 |
+
"execution_count": null,
|
| 183 |
+
"id": "ca934bfc",
|
| 184 |
+
"metadata": {},
|
| 185 |
+
"outputs": [],
|
| 186 |
+
"source": [
|
| 187 |
+
"# Run evaluation across models and chunk sizes\n",
|
| 188 |
+
"results = []\n",
|
| 189 |
+
"for model_name in MODELS_TO_TEST:\n",
|
| 190 |
+
" for cs in CHUNK_SIZES:\n",
|
| 191 |
+
" res = evaluate_model(model_name, chunks, sample_queries, sample_answers, cs)\n",
|
| 192 |
+
" if res:\n",
|
| 193 |
+
" results.append(res)\n",
|
| 194 |
+
"\n",
|
| 195 |
+
"df = pd.DataFrame(results)\n",
|
| 196 |
+
"if not df.empty:\n",
|
| 197 |
+
" display(df)\n",
|
| 198 |
+
"else:\n",
|
| 199 |
+
" print(\"No results to show (models may have been skipped).\")"
|
| 200 |
+
]
|
| 201 |
+
},
|
| 202 |
+
{
|
| 203 |
+
"cell_type": "code",
|
| 204 |
+
"execution_count": null,
|
| 205 |
+
"id": "e6f75729",
|
| 206 |
+
"metadata": {},
|
| 207 |
+
"outputs": [],
|
| 208 |
+
"source": [
|
| 209 |
+
"# Visualization\n",
|
| 210 |
+
"if not df.empty:\n",
|
| 211 |
+
" plt.figure(figsize=(8,5))\n",
|
| 212 |
+
" for m in df['model'].unique():\n",
|
| 213 |
+
" subset = df[df['model'] == m]\n",
|
| 214 |
+
" plt.plot(subset['chunk_size'], subset['precision'], marker='o', label=f\"{m} (Precision)\")\n",
|
| 215 |
+
" plt.title('Precision@5 vs Chunk Size')\n",
|
| 216 |
+
" plt.xlabel('Chunk Size (characters)')\n",
|
| 217 |
+
" plt.ylabel('Precision@5')\n",
|
| 218 |
+
" plt.legend()\n",
|
| 219 |
+
" plt.grid(True)\n",
|
| 220 |
+
" plt.show()\n",
|
| 221 |
+
"\n",
|
| 222 |
+
" plt.figure(figsize=(8,5))\n",
|
| 223 |
+
" for m in df['model'].unique():\n",
|
| 224 |
+
" subset = df[df['model'] == m]\n",
|
| 225 |
+
" plt.plot(subset['chunk_size'], subset['recall'], marker='s', label=f\"{m} (Recall)\")\n",
|
| 226 |
+
" plt.title('Recall@5 vs Chunk Size')\n",
|
| 227 |
+
" plt.xlabel('Chunk Size (characters)')\n",
|
| 228 |
+
" plt.ylabel('Recall@5')\n",
|
| 229 |
+
" plt.legend()\n",
|
| 230 |
+
" plt.grid(True)\n",
|
| 231 |
+
" plt.show()"
|
| 232 |
+
]
|
| 233 |
+
},
|
| 234 |
+
{
|
| 235 |
+
"cell_type": "code",
|
| 236 |
+
"execution_count": null,
|
| 237 |
+
"id": "249d2857",
|
| 238 |
+
"metadata": {},
|
| 239 |
+
"outputs": [],
|
| 240 |
+
"source": [
|
| 241 |
+
"# Save results to CSV for reporting\n",
|
| 242 |
+
"output_csv = os.path.join(BASE_DIR, 'notebooks', 'embedding_benchmark_results.csv')\n",
|
| 243 |
+
"if not df.empty:\n",
|
| 244 |
+
" df.to_csv(output_csv, index=False)\n",
|
| 245 |
+
" print(f\"✅ Benchmark results saved to {output_csv}\")\n",
|
| 246 |
+
"else:\n",
|
| 247 |
+
" print(\"No data to save.\")"
|
| 248 |
+
]
|
| 249 |
+
}
|
| 250 |
+
],
|
| 251 |
+
"metadata": {
|
| 252 |
+
"kernelspec": {
|
| 253 |
+
"display_name": "Python 3",
|
| 254 |
+
"language": "python",
|
| 255 |
+
"name": "python3"
|
| 256 |
+
},
|
| 257 |
+
"language_info": {
|
| 258 |
+
"name": "python",
|
| 259 |
+
"version": "3.13.2"
|
| 260 |
+
}
|
| 261 |
+
},
|
| 262 |
+
"nbformat": 4,
|
| 263 |
+
"nbformat_minor": 5
|
| 264 |
+
}
|
requirements.txt
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn[standard]
|
| 3 |
+
python-multipart
|
| 4 |
+
pdf2image
|
| 5 |
+
pdfplumber
|
| 6 |
+
pytesseract
|
| 7 |
+
Pillow
|
| 8 |
+
sentence-transformers
|
| 9 |
+
transformers
|
| 10 |
+
torch
|
| 11 |
+
chromadb
|
| 12 |
+
numpy
|
| 13 |
+
pandas
|
| 14 |
+
aiofiles
|
| 15 |
+
openai
|
| 16 |
+
layoutparser
|
| 17 |
+
opencv-python-headless
|
| 18 |
+
matplotlib
|
| 19 |
+
scikit-learn
|
| 20 |
+
google-genai
|
| 21 |
+
python-dotenv
|
| 22 |
+
jinja2
|
samples/vdoc_rag_test.pdf
ADDED
|
Binary file (52.1 kB). View file
|
|
|
test.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import uuid
|
| 3 |
+
import json
|
| 4 |
+
from pdf2image import convert_from_path
|
| 5 |
+
from PIL import Image, ImageDraw
|
| 6 |
+
|
| 7 |
+
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def load_calibration(config_path="highlight_calibration.json"):
    """Load highlight calibration values from a JSON file.

    Args:
        config_path: Path to the calibration JSON file. A relative path is
            resolved against the current working directory.

    Returns:
        The parsed calibration mapping when the file exists and contains a
        JSON object. Otherwise the neutral defaults (zero offsets, unit
        scales) are returned and a warning is printed.
    """
    defaults = {"x_offset": 0, "x_scale": 1.0, "y_offset": 0, "y_scale": 1.0}

    # Open directly instead of checking os.path.exists() first: one system
    # call, and no window in which the file can vanish between check and open.
    try:
        with open(config_path, "r", encoding="utf-8") as f:
            calib = json.load(f)
    except FileNotFoundError:
        print("⚠️ No calibration file found. Using defaults.")
        return defaults
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        # A broken calibration file should not take highlighting down.
        print(f"⚠️ Could not read calibration file {config_path}: {exc}. Using defaults.")
        return defaults

    if not isinstance(calib, dict):
        # Callers read the values with .get(), so only a JSON object is usable.
        print(f"⚠️ Calibration file {config_path} does not contain a JSON object. Using defaults.")
        return defaults

    print(f"✅ Loaded calibration: {calib}")
    return calib
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _clear_previous_highlights(output_dir):
    """Remove highlight images that an earlier call left in *output_dir*."""
    for name in os.listdir(output_dir):
        # Only delete files that follow this module's own naming pattern, so a
        # caller-supplied directory never loses unrelated content.
        if not (name.startswith("highlight_page") and name.endswith(".png")):
            continue
        try:
            os.remove(os.path.join(output_dir, name))
        except OSError as exc:
            # Best effort: a leftover image is harmless, but report it.
            print(f"⚠️ Could not remove old highlight {name}: {exc}")


def _calibrated_box(bbox, calib, w_img, h_img):
    """Return *bbox* calibrated and clamped to the image, or None if unusable."""
    if not bbox or len(bbox) != 4:
        return None

    x_offset = calib.get("x_offset", 0)
    x_scale = calib.get("x_scale", 1.0)
    y_offset = calib.get("y_offset", 0)
    y_scale = calib.get("y_scale", 1.0)

    x0, y0, x1, y1 = [float(v) for v in bbox]
    x0 = x0 * x_scale + x_offset
    x1 = x1 * x_scale + x_offset
    y0 = y0 * y_scale + y_offset
    y1 = y1 * y_scale + y_offset

    # Normalise corner order and clamp to the image bounds.
    left, top = max(0, min(x0, x1)), max(0, min(y0, y1))
    right, bottom = min(w_img, max(x0, x1)), min(h_img, max(y0, y1))

    # A box that collapses after clamping lies outside the page.
    if right <= left or bottom <= top:
        return None
    return (left, top, right, bottom)


def render_highlighted_pages(pdf_path, hits, output_dir=None, dpi=150, pad=100):
    """Render the pages referenced by *hits* and highlight their bounding boxes.

    Each page that has at least one hit is rasterised, every usable bounding
    box is drawn as a translucent red rectangle after the calibration from
    ``load_calibration()`` is applied, and the image is cropped to the union
    of the drawn boxes plus *pad* pixels on every side. A page whose hits
    carry no usable box is saved uncropped.

    Args:
        pdf_path: Path of the PDF to render.
        hits: Iterable of dicts with a ``"metadata"`` mapping that holds a
            1-based ``"page"`` number and a four-value ``"bbox"``. Hits
            without a page number, or with a page number below 1, are skipped.
            NOTE(review): bbox values are used as pixel coordinates of the
            rendered image — confirm they were produced at the same *dpi*.
        output_dir: Directory for the output images. Defaults to
            ``<BASE_DIR>/highlighted``. Highlight images left there by
            earlier calls are deleted first.
        dpi: Resolution used to rasterise the PDF pages.
        pad: Padding in pixels added around the highlighted region when
            cropping.

    Returns:
        List of paths of the PNG files written, in ascending page order.
    """
    if output_dir is None:
        output_dir = os.path.join(BASE_DIR, "highlighted")
    os.makedirs(output_dir, exist_ok=True)

    _clear_previous_highlights(output_dir)

    # Group the requested boxes by page in a single pass over the hits.
    boxes_by_page = {}
    for h in hits or []:
        meta = h.get("metadata") or {}
        page_num = meta.get("page")
        if page_num is None:
            continue
        if page_num < 1:
            # Page numbers are 1-based; 0 would index the last rendered page.
            print(f"⚠️ Skipping hit with invalid page number {page_num}")
            continue
        boxes_by_page.setdefault(page_num, []).append(meta.get("bbox"))

    if not boxes_by_page:
        return []

    calib = load_calibration()

    # Rasterise only the span of pages that is needed, not the whole document.
    pages_to_render = sorted(boxes_by_page)
    first_page = pages_to_render[0]
    pdf_images = convert_from_path(
        pdf_path,
        dpi=dpi,
        first_page=first_page,
        last_page=pages_to_render[-1],
    )
    result_paths = []

    for page_num in pages_to_render:
        page_index = page_num - first_page
        if page_index >= len(pdf_images):
            print(f"⚠️ Page {page_num} is beyond the end of the PDF. Skipping.")
            continue

        img = pdf_images[page_index].convert("RGBA")
        w_img, h_img = img.size
        overlay = Image.new("RGBA", img.size, (255, 255, 255, 0))
        draw = ImageDraw.Draw(overlay)
        page_bboxes = []

        for bbox in boxes_by_page[page_num]:
            box = _calibrated_box(bbox, calib, w_img, h_img)
            if box is None:
                continue

            page_bboxes.append(box)
            draw.rectangle(
                list(box),
                outline=(255, 0, 0),
                width=4,
                fill=(255, 0, 0, 100),
            )

        # Merge highlights with image
        highlighted = Image.alpha_composite(img, overlay)

        # Crop around the union of all highlighted boxes plus padding.
        if page_bboxes:
            min_x = min(b[0] for b in page_bboxes)
            min_y = min(b[1] for b in page_bboxes)
            max_x = max(b[2] for b in page_bboxes)
            max_y = max(b[3] for b in page_bboxes)

            crop_box = (
                max(0, int(min_x - pad)),
                max(0, int(min_y - pad)),
                int(min(max_x + pad, w_img)),
                int(min(max_y + pad, h_img)),
            )

            cropped = highlighted.crop(crop_box)
        else:
            cropped = highlighted  # fallback if no bbox

        out_path = os.path.join(output_dir, f"highlight_page{page_num}_{uuid.uuid4().hex}.png")
        cropped.convert("RGB").save(out_path)
        result_paths.append(out_path)

        print(f"✅ Highlighted and cropped page {page_num}: {out_path}")

    return result_paths
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
# Example usage
|
| 117 |
+
if __name__ == "__main__":
    # Resolve the sample relative to this file so the demo runs from any
    # working directory, not only from the repository root.
    sample_pdf = os.path.join(BASE_DIR, "samples", "vdoc_rag_test.pdf")

    hits = [
        {"metadata": {"page": 1, "bbox": [87, 1926, 775, 1957], "type": "text"}},
        {"metadata": {"page": 2, "bbox": [87, 222, 592, 250], "type": "text"}},
    ]

    render_highlighted_pages(sample_pdf, hits)
|
train_feedback_embeddings.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
train_feedback_embeddings.py
|
| 3 |
+
Fine-tune the VDoc-RAG embedding model using stored user feedback.
|
| 4 |
+
|
| 5 |
+
Place this file at the repository root and run:
|
| 6 |
+
|
| 7 |
+
python train_feedback_embeddings.py
|
| 8 |
+
|
| 9 |
+
It will load feedback from `app/feedback.json`, prepare training pairs, fine-tune a
|
| 10 |
+
SentenceTransformer model, and save checkpoints under `models/vdoc_feedback_tuned/`.
|
| 11 |
+
"""
|
| 12 |
+
import os
|
| 13 |
+
import json
|
| 14 |
+
from datetime import datetime
|
| 15 |
+
from torch.utils.data import DataLoader
|
| 16 |
+
|
| 17 |
+
import shutil

try:
    from sentence_transformers import SentenceTransformer, InputExample, losses
except ImportError as e:
    # Chain the original error so the real cause (which package, which
    # missing symbol) stays visible in the traceback.
    raise ImportError("Please install sentence-transformers and torch to run this script: pip install sentence-transformers torch") from e


def _text_field(entry, key):
    """Return ``entry[key]`` as a stripped string; missing or non-string values become ""."""
    value = entry.get(key)
    return value.strip() if isinstance(value, str) else ""


def _build_train_examples(entries):
    """Turn feedback records into (question, answer) examples labelled 1.0 / 0.0.

    Records without a question or an answer, or whose correctness is neither
    "correct" nor "incorrect", are skipped.
    """
    examples = []
    for fb in entries:
        if not isinstance(fb, dict):
            continue
        question = _text_field(fb, "question")
        answer = _text_field(fb, "answer")
        correctness = _text_field(fb, "correctness").lower()
        if not question or not answer:
            continue
        if correctness not in ("correct", "incorrect"):
            continue
        label = 1.0 if correctness == "correct" else 0.0
        examples.append(InputExample(texts=[question, answer], label=label))
    return examples


def _publish_latest(checkpoint_dir, latest_path):
    """Make *latest_path* refer to *checkpoint_dir*: symlink if possible, else copy."""
    # islink()/lexists() also detect a dangling symlink, which
    # os.path.exists() reports as absent; leaving one in place would make
    # both the symlink and the copy fallback fail.
    if os.path.islink(latest_path):
        os.unlink(latest_path)
    elif os.path.isdir(latest_path):
        shutil.rmtree(latest_path)
    elif os.path.lexists(latest_path):
        os.remove(latest_path)

    try:
        os.symlink(checkpoint_dir, latest_path, target_is_directory=True)
    except (OSError, NotImplementedError):
        # On Windows, symlink may fail — copy instead
        shutil.copytree(checkpoint_dir, latest_path)
        print(f"📁 Copied model to {latest_path} (symlink not supported).")
    else:
        print(f"🔗 Symlink created: {latest_path} → {checkpoint_dir}")


# --- Paths ---
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
APP_DIR = os.path.join(BASE_DIR, "app")
FEEDBACK_PATH = os.path.join(APP_DIR, "feedback.json")
OUTPUT_DIR = os.path.join(BASE_DIR, "models", "vdoc_feedback_tuned")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Step 1: Load Feedback ---
if not os.path.exists(FEEDBACK_PATH):
    raise FileNotFoundError(f"❌ No feedback.json found at {FEEDBACK_PATH}")

with open(FEEDBACK_PATH, "r", encoding="utf-8") as f:
    feedback = json.load(f)

if not feedback:
    raise ValueError("⚠️ feedback.json is empty — collect feedback first!")

# --- Step 2: Prepare Training Data ---
train_examples = _build_train_examples(feedback)

if len(train_examples) < 5:
    raise ValueError(f"⚠️ Too few feedback entries ({len(train_examples)}). Need at least 5 to fine-tune meaningfully.")

print(f"✅ Loaded {len(train_examples)} feedback samples for training.")

# --- Step 3: Load Base Model ---
# The base model can be overridden through the VDOCRAG_FEEDBACK_BASE variable.
base_model = os.environ.get("VDOCRAG_FEEDBACK_BASE", "all-MiniLM-L6-v2")
print(f"📦 Loading base model: {base_model}")
model = SentenceTransformer(base_model)

# --- Step 4: Create DataLoader and Loss ---
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)
train_loss = losses.CosineSimilarityLoss(model)

# --- Step 5: Train ---
# NOTE(review): with only a handful of feedback samples the run may have fewer
# than 10 optimizer steps, so all of it would fall inside warm-up — confirm
# that warmup_steps=10 is intended for small feedback sets.
print("🚀 Starting fine-tuning...")
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=10,
    show_progress_bar=True,
)

# --- Step 6: Save Fine-tuned Model ---
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
save_path = os.path.join(OUTPUT_DIR, f"checkpoint_{timestamp}")
os.makedirs(save_path, exist_ok=True)
model.save(save_path)
print(f"✅ Fine-tuned model saved at: {save_path}")

# --- Step 7: Create "latest" symlink / pointer ---
latest_path = os.path.join(OUTPUT_DIR, "latest")
_publish_latest(save_path, latest_path)

print("\n🎉 Training complete! Your VDoc-RAG can now use:")
print(" models/vdoc_feedback_tuned/latest/")
|