aditya9128 committed on
Commit
4e3cee0
·
0 Parent(s):

Initial commit: VDoc-RAG - Intelligent Document Q&A with RAG

Browse files
.env.example ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Copy this to .env and fill in your values
2
+ GEMINI_API_KEY=your_gemini_api_key_here
.gitignore ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ #.idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ # .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ # PyPI configuration file
195
+ .pypirc
196
+
197
+ # Cursor
198
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
200
+ # refer to https://docs.cursor.com/context/ignore-files
201
+ .cursorignore
202
+ .cursorindexingignore
203
+
204
+ # Marimo
205
+ marimo/_static/
206
+ marimo/_lsp/
207
+ __marimo__/
208
+
209
+ # =========================
210
+ # VDoc-RAG Project Specific
211
+ # =========================
212
+ # Runtime/Generated directories
213
# Ignore the *contents* of each runtime directory rather than the directory
# itself: git cannot re-include a file (the .gitkeep negations below) when its
# parent directory is excluded.
app/cache/*
app/uploads/*
app/storage/*
app/charts/*
app/highlighted/*
app/tmp/*
app/tables/*
storage/

# Keep directory structure with .gitkeep
!app/cache/.gitkeep
!app/uploads/.gitkeep
!app/storage/.gitkeep
!app/charts/.gitkeep
!app/highlighted/.gitkeep
!app/tmp/.gitkeep
!app/tables/.gitkeep
230
+
231
+ *.png
Dockerfile ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ # Install system dependencies
4
# libgl1 replaces libgl1-mesa-glx: the latter was only a transitional package
# pulling in libgl1 and is absent from newer Debian releases, so the old name
# breaks the build when the python:3.11-slim base image moves forward.
RUN apt-get update && apt-get install -y \
    tesseract-ocr \
    poppler-utils \
    libgl1 \
    libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*
10
+
11
+ # Set working directory
12
+ WORKDIR /app
13
+
14
+ # Copy requirements first for better caching
15
+ COPY requirements.txt .
16
+
17
+ # Install Python dependencies
18
+ RUN pip install --no-cache-dir -r requirements.txt
19
+
20
+ # Copy the rest of the application
21
+ COPY . .
22
+
23
+ # Create necessary directories with proper permissions for HF Spaces
24
+ RUN mkdir -p /app/app/storage/chroma_db \
25
+ /app/app/uploads \
26
+ /app/app/tmp \
27
+ /app/app/highlighted \
28
+ /app/app/charts \
29
+ /app/app/tables \
30
+ /app/app/cache \
31
+ && chmod -R 777 /app/app/storage \
32
+ && chmod -R 777 /app/app/uploads \
33
+ && chmod -R 777 /app/app/tmp \
34
+ && chmod -R 777 /app/app/highlighted \
35
+ && chmod -R 777 /app/app/charts \
36
+ && chmod -R 777 /app/app/tables \
37
+ && chmod -R 777 /app/app/cache
38
+
39
+ # Expose port (HF Spaces uses 7860)
40
+ EXPOSE 7860
41
+
42
+ # Run the application
43
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
HR_TESTING_GUIDE.md ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # VDoc-RAG Demo Testing Guide
2
+
3
+ ## What is this?
4
+ VDoc-RAG is an AI-powered document Q&A system that can:
5
+ - Extract text from PDFs (including charts and tables)
6
+ - Answer questions about uploaded documents
7
+ - Show confidence scores and source attribution
8
+
9
+ ---
10
+
11
+ ## How to Test
12
+
13
+ ### 1. Open the App
14
+ Click the link provided: `[YOUR_NGROK_URL]`
15
+
16
+ ### 2. Upload a Document
17
+ - Click **"Choose File"** and select any PDF
18
+ - Click **"Upload & Index"**
19
+ - Wait for: `✅ Uploaded and indexed [filename] (X chunks)`
20
+
21
+ ### 3. Ask Questions
22
+ Try these example questions:
23
+ - "What is this document about?"
24
+ - "Summarize the main points"
25
+ - "What are the key dates mentioned?"
26
+ - "Describe any charts or graphs"
27
+
28
+ ### 4. Review the Response
29
+ You'll see:
30
+ - **Answer**: AI-generated response
31
+ - **Sources**: Which parts of the document were used
32
+ - **Confidence Score**: How relevant the retrieved content is
33
+
34
+ ### 5. Provide Feedback
35
+ Click **✅ Correct** or **❌ Incorrect** to rate the answer
36
+
37
+ ---
38
+
39
+ ## Additional Features to Explore
40
+
41
+ | Page | What it Shows |
42
+ |------|---------------|
43
+ | `/feedback_dashboard` | Feedback statistics and model fine-tuning |
44
+ | `/benchmark_dashboard` | Embedding model evaluation metrics |
45
+
46
+ ---
47
+
48
+ ## Technical Highlights
49
+
50
+ - **RAG Pipeline**: Retrieval-Augmented Generation with ChromaDB
51
+ - **OCR**: Tesseract for text extraction from images/PDFs
52
+ - **Embeddings**: Sentence-transformers (all-MiniLM-L6-v2)
53
+ - **LLM**: Google Gemini for answer generation
54
+ - **Chart Detection**: CLIP + OpenCV for visual understanding
55
+
56
+ ---
57
+
58
+ ## Sample Test PDF
59
+ A sample document is pre-loaded. Upload your own PDF to test with real documents!
60
+
61
+ ---
62
+
63
+ *Built with FastAPI, ChromaDB, Sentence-Transformers, and Google Gemini*
README.md ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 📄 VDoc-RAG (Visually-Rich Document Retrieval-Augmented Generation)
2
+
3
+ VDoc-RAG is an advanced multimodal system that answers questions from visually-rich documents (PDFs, reports, flyers) by combining OCR, table and chart reasoning, semantic embeddings, and LLMs.
4
+
5
+ ---
6
+
7
+ ## 🚀 Features
8
+
9
+ - 🧠 **RAG Pipeline** with persistent ChromaDB
10
+ - 🪄 **OCR + Table + Chart understanding**
11
+ - 📊 **Chart Reasoning** (Pix2Struct + OCR-based)
12
+ - 🔐 **Environment-based API key handling**
13
+ - 🧮 **Confidence Scoring** via cosine similarity
14
+ - 🧾 **Feedback Loop** for self-improving embeddings
15
+ - 📈 **Benchmark Dashboard** for evaluating embedding models
16
+ - 💾 **Persistent Storage** (DuckDB + Parquet backend)
17
+
18
+ ---
19
+
20
+ ## ⚙️ Quickstart (Windows)
21
+
22
+ ### 1️⃣ Install Dependencies
23
+
24
+ Install:
25
+ - **Tesseract OCR** → [Tesseract Wiki](https://github.com/UB-Mannheim/tesseract/wiki)
26
+ - **Poppler for Windows** → [Poppler Releases](https://github.com/oschwartz10612/poppler-windows/releases)
27
+
28
+ Add both to your system PATH.
29
+
30
+ ### 2️⃣ Create Virtual Environment
31
+ ```bash
32
+ python -m venv venv
33
+ venv\Scripts\activate
34
+ pip install -r requirements.txt
35
+ ```
36
+
37
+ ### 3️⃣ Run the App
38
+ ```bash
39
+ uvicorn app.main:app --reload --port 8000
40
+ ```
41
+
42
+ Open → [http://127.0.0.1:8000](http://127.0.0.1:8000)
43
+
44
+ ---
45
+
46
+ ## 🖥️ Web Interfaces
47
+
48
+ | Route | Page | Description |
49
+ |------|-------|--------------|
50
+ | `/` | Main Interface | Upload, query, visualize highlights |
51
+ | `/feedback_dashboard` | Feedback Loop | View stats, fine-tune model |
52
+ | `/benchmark_dashboard` | Benchmarking | Evaluate embeddings (Precision/Recall/MRR) |
53
+
54
+ ---
55
+
56
+ ## 📁 Project Structure
57
+
58
+ ```
59
+ vdoc-rag-mvp/
60
+ ├─ app/
61
+ │ ├─ ingest.py # OCR, table & chart extraction
62
+ │ ├─ chart_reasoner.py # Chart summarization and trend detection
63
+ │ ├─ indexer.py # Persistent ChromaDB retrieval
64
+ │ ├─ reader.py # LLM question answering
65
+ │ ├─ feedback_manager.py # Feedback collection system
66
+ │ ├─ main.py # FastAPI server + dashboards
67
+ │ └─ visual_highlight.py # Highlight relevant regions
68
+
69
+ ├─ models/vdoc_feedback_tuned/ # Fine-tuned embedding model
70
+ ├─ storage/chroma_db/ # Persistent vector store
71
+ ├─ notebooks/evaluate_embeddings.ipynb # Benchmarking notebook
72
+ └─ templates/ # HTML UIs (main, feedback, benchmark)
73
+ ```
74
+
75
+ ---
76
+
77
+ ## 🧠 Models Used
78
+
79
+ | Type | Model | Purpose |
80
+ |------|--------|----------|
81
+ | Embedding | `all-MiniLM-L6-v2` (base), `multi-qa-MiniLM`, feedback-tuned variant | Semantic encoding |
82
+ | LLM Reader | Gemini / DistilGPT2 | Context-based answering |
83
+ | Chart Reasoning | Pix2Struct / OCR fallback | Visual trend analysis |
84
+ | Vector Store | ChromaDB (DuckDB + Parquet) | Persistent retrieval |
85
+ | Fine-tuning | SentenceTransformer + CosineLoss | Feedback-based learning |
86
+
87
+ ---
88
+
89
+ ## 🧩 Evaluation
90
+
91
+ - **Confidence Scoring**: cosine similarity between query & chunks
92
+ - **Precision / Recall / MRR**: benchmark dashboards & notebook
93
+ - **Feedback-driven fine-tuning**: iterative model improvement
94
+
95
+ ---
96
+
97
+ ## 🧠 Author’s Note
98
+
99
+ VDoc-RAG demonstrates how retrieval-augmented generation can evolve from plain text retrieval into **visually grounded document reasoning**, enabling future systems that can read, reason, and learn continuously.
100
+
101
+ ---
102
+
103
+ **Developed as a full multimodal RAG research framework** — suitable for academic reports, enterprise document intelligence, and AI reasoning pipelines.
104
+
105
+
106
+ Tesseract
107
+ pix2struct
108
+
109
+ sentence transformer
110
+ MiniLM-L6-v2.
111
+ Gemini API
README_HF.md ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: VDoc-RAG
3
+ emoji: 📄
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: docker
7
+ pinned: false
8
+ license: mit
9
+ ---
10
+
11
+ # 📄 VDoc-RAG (Visually-Rich Document RAG)
12
+
13
+ An AI-powered document Q&A system that answers questions from PDFs with charts, tables, and images.
14
+
15
+ ## Features
16
+
17
+ - 🧠 **RAG Pipeline** with ChromaDB vector store
18
+ - 📊 **Chart & Table Understanding** via OCR
19
+ - 🔐 **Gemini LLM** for answer generation
20
+ - 🧮 **Confidence Scoring** via cosine similarity
21
+ - 🧾 **Feedback Loop** for improvement
22
+
23
+ ## How to Use
24
+
25
+ 1. Upload a PDF document
26
+ 2. Ask questions about the content
27
+ 3. Get AI-generated answers with sources
28
+
29
+ ## Tech Stack
30
+
31
+ - FastAPI + Uvicorn
32
+ - Sentence-Transformers (all-MiniLM-L6-v2)
33
+ - ChromaDB
34
+ - Google Gemini
35
+ - Tesseract OCR
app/cache_manager.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import hashlib
3
+ import json
4
+ import shutil
5
+
6
# Cached chunk files live in a "cache" folder next to this module; the folder
# is created at import time so later reads and writes can assume it exists.
_MODULE_DIR = os.path.dirname(__file__)
CACHE_DIR = os.path.join(_MODULE_DIR, "cache")
os.makedirs(CACHE_DIR, exist_ok=True)
8
+
9
+
10
def _hash_file(path: str) -> str:
    """Return the hex SHA-256 digest of the file at *path*.

    The file is read in 8 KiB pieces so large PDFs are never held in memory
    all at once.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF.
        for block in iter(lambda: stream.read(8192), b""):
            digest.update(block)
    return digest.hexdigest()
20
+
21
+
22
def get_cache_path(pdf_path: str) -> str:
    """Return the cache file for a PDF: ``<CACHE_DIR>/<sha256 of its bytes>.json``.

    The name is derived from the file's content, not its path, so the same
    document maps to the same cache entry wherever it is stored.
    """
    return os.path.join(CACHE_DIR, f"{_hash_file(pdf_path)}.json")
25
+
26
+
27
def save_chunks_to_cache(pdf_path: str, chunks) -> str:
    """Store *chunks* as pretty-printed UTF-8 JSON in the PDF's cache file.

    Args:
        pdf_path: Path of the source PDF; its content hash names the cache file.
        chunks: JSON-serializable chunk data.

    Returns:
        The path of the cache file that was written.

    Raises:
        TypeError / ValueError: If *chunks* cannot be serialized to JSON.
    """
    cache_file = get_cache_path(pdf_path)
    # Serialize BEFORE opening the file: opening with "w" truncates, so a
    # serialization error part-way through would otherwise leave a corrupt
    # (half-written) cache entry behind.
    payload = json.dumps(chunks, indent=2, ensure_ascii=False)
    with open(cache_file, "w", encoding="utf-8") as out:
        out.write(payload)
    return cache_file
32
+
33
+
34
def load_chunks_from_cache(pdf_path: str):
    """Return the cached chunks for *pdf_path*, or ``None`` on a cache miss.

    A missing cache file is a miss. A cache file that is not valid JSON
    (for example, truncated by an interrupted write) is also reported as a
    miss, so the caller can re-extract the document instead of crashing.
    """
    cache_file = get_cache_path(pdf_path)
    try:
        # Open directly instead of checking os.path.exists() first: the file
        # could disappear between the check and the open.
        with open(cache_file, "r", encoding="utf-8") as src:
            return json.load(src)
    except FileNotFoundError:
        return None
    except json.JSONDecodeError as exc:
        print(f"[cache_manager] Ignoring corrupt cache file {cache_file}: {exc}")
        return None
40
+
41
+
42
def clear_cache() -> bool:
    """Remove the whole cache directory, then recreate it empty.

    Returns:
        Always ``True``.
    """
    # Removal errors are ignored, so an already-missing directory is fine.
    shutil.rmtree(CACHE_DIR, ignore_errors=True)
    os.makedirs(CACHE_DIR, exist_ok=True)
    return True
app/calibrate.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz
2
+ import matplotlib.pyplot as plt
3
+ from matplotlib.widgets import Slider, Button
4
+ from PIL import Image
5
+ import io
6
+ import json
7
+ import os
8
+
9
# Input PDF and the JSON file the chosen calibration is written to.
pdf_path = "samples/vdoc_rag_test.pdf"
config_path = "highlight_calibration.json"

# Sample retrieval hits (page numbers are 1-based) whose boxes are overlaid.
hits = [
    {"metadata": {"page": 1, "bbox": [87, 1926, 775, 1957], "type": "text"}},
    {"metadata": {"page": 2, "bbox": [87, 222, 592, 250], "type": "text"}},
]

# Open the PDF and rasterize its first two pages at 150 DPI.
doc = fitz.open(pdf_path)
pix1, pix2 = (doc[page_no].get_pixmap(dpi=150) for page_no in range(2))
img1, img2 = (Image.open(io.BytesIO(pix.tobytes("png"))) for pix in (pix1, pix2))

# One figure, two pages side by side, with room at the bottom for the sliders.
fig, axes = plt.subplots(1, 2, figsize=(16, 10))
plt.subplots_adjust(bottom=0.25)
for ax, page_img, title in zip(axes, (img1, img2), ("Page 1", "Page 2")):
    ax.imshow(page_img)
    ax.set_title(title, fontsize=12)
    ax.axis("off")

# Rendered page sizes in pixels, kept for reference.
(img1_w, img1_h), (img2_w, img2_h) = img1.size, img2.size

# Draw one translucent red rectangle per hit on the axes of its page.
rects_page1, rects_page2 = [], []
_targets = {0: (rects_page1, axes[0]), 1: (rects_page2, axes[1])}
for h in hits:
    meta = h["metadata"]
    left, top, right, bottom = map(float, meta["bbox"])
    rect = plt.Rectangle(
        (left, top), right - left, bottom - top,
        linewidth=2, edgecolor='r', facecolor='r', alpha=0.4,
    )
    target = _targets.get(meta["page"] - 1)
    if target is not None:  # hits on other pages are not displayed
        target[0].append(rect)
        target[1].add_patch(rect)

# Shared controls: four stacked slider strips plus a save button.
axcolor = 'lightgoldenrodyellow'
ax_x_offset, ax_x_scale, ax_y_offset, ax_y_scale = (
    plt.axes([0.25, strip_bottom, 0.65, 0.03], facecolor=axcolor)
    for strip_bottom in (0.12, 0.09, 0.06, 0.03)
)
ax_save = plt.axes([0.85, 0.17, 0.10, 0.04])

slider_x_offset = Slider(ax_x_offset, 'X Offset', -500, 500, valinit=0, valstep=0.5)
slider_x_scale = Slider(ax_x_scale, 'X Scale', 0.3, 2.0, valinit=1.0, valstep=0.002)
slider_y_offset = Slider(ax_y_offset, 'Y Offset', -1500, 1500, valinit=0, valstep=0.5)
slider_y_scale = Slider(ax_y_scale, 'Y Scale', 0.3, 2.0, valinit=1.0, valstep=0.002)
btn_save = Button(ax_save, '💾 Save', color=axcolor, hovercolor='0.9')
70
+
71
def update(val):
    """Slider callback: re-apply the current offset/scale to every highlight.

    Each rectangle is recomputed from the ORIGINAL bbox of its own hit as
    ``coord * scale + offset`` on each axis, then the figure title is
    refreshed with the current values.

    Args:
        val: The changed slider value passed by matplotlib (unused; all four
            sliders are re-read so the result is always consistent).
    """
    xo, xs = slider_x_offset.val, slider_x_scale.val
    yo, ys = slider_y_offset.val, slider_y_scale.val

    for page_idx, page_rects in enumerate((rects_page1, rects_page2)):
        # Rectangles are appended per page in `hits` order, so pairing them
        # with the hits of the same page gives each rectangle its own bbox
        # (previously every rectangle on a page reused a single fixed hit).
        page_hits = [hit for hit in hits if hit["metadata"]["page"] - 1 == page_idx]
        for rect, hit in zip(page_rects, page_hits):
            x0, y0, x1, y1 = [float(v) for v in hit["metadata"]["bbox"]]
            x0 = x0 * xs + xo
            x1 = x1 * xs + xo
            y0 = y0 * ys + yo
            y1 = y1 * ys + yo
            # Anchored at (x0, y1) with a negative height: covers the same
            # area as anchoring at (x0, y0) with a positive height.
            rect.set_xy((x0, y1))
            rect.set_width(x1 - x0)
            rect.set_height(y0 - y1)

    fig.suptitle(
        f"Xo={xo:.1f}, Xs={xs:.3f} | Yo={yo:.1f}, Ys={ys:.3f}",
        fontsize=11, color='darkred'
    )
    fig.canvas.draw_idle()
104
+
105
# All four sliders trigger the same redraw callback.
for slider in (slider_x_offset, slider_x_scale, slider_y_offset, slider_y_scale):
    slider.on_changed(update)
107
+
108
def save_values(event):
    """Button callback: write the current slider values to ``config_path`` as JSON.

    Args:
        event: The matplotlib click event (unused).
    """
    calib = {
        "x_offset": slider_x_offset.val,
        "x_scale": slider_x_scale.val,
        "y_offset": slider_y_offset.val,
        "y_scale": slider_y_scale.val,
    }
    with open(config_path, "w") as f:
        json.dump(calib, f, indent=2)
    print(f"✅ Saved combined calibration: {calib}")
118
+
119
+ btn_save.on_clicked(save_values)
120
+
121
+ plt.show()
app/chart_detect.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/chart_detect.py
2
+ import cv2
3
+ import os
4
+ import uuid
5
+ import numpy as np
6
+ from PIL import Image
7
+ import matplotlib.pyplot as plt
8
+
9
+ # 🗂️ Ensure charts dir exists inside project
10
# Cropped chart images are written to a "charts" folder beside this module;
# the folder is created at import time.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
CHARTS_DIR = os.path.join(BASE_DIR, "charts")
os.makedirs(name=CHARTS_DIR, exist_ok=True)
13
+
14
def _ensure_bgr(img_or_path):
    """
    Accept file path, PIL.Image, or ndarray → return OpenCV BGR ndarray.

    Args:
        img_or_path: A path readable by ``cv2.imread``, a ``PIL.Image.Image``,
            or a NumPy array.

    Returns:
        The image as an ndarray. Paths and PIL images are converted to BGR;
        an ndarray is returned unchanged (assumed to already be a BGR image —
        TODO confirm against callers).

    Raises:
        ValueError: If the path cannot be read or the type is unsupported.
    """
    if isinstance(img_or_path, str):
        img = cv2.imread(img_or_path)
        if img is None:
            raise ValueError(f"[chart_detect] cv2.imread failed: {img_or_path}")
        return img
    if isinstance(img_or_path, Image.Image):
        # Normalize to 3-channel RGB first: grayscale / RGBA / palette images
        # would otherwise make the RGB→BGR conversion fail.
        return cv2.cvtColor(np.array(img_or_path.convert("RGB")), cv2.COLOR_RGB2BGR)
    if isinstance(img_or_path, np.ndarray):
        # Was `return img`, an undefined name in this branch (NameError).
        return img_or_path
    raise ValueError("[chart_detect] Unsupported image type.")
28
+
29
def detect_charts(image_or_path, min_area=15000, debug=False, visualize=False):
    """
    Detect chart-like rectangular regions in a page image.
    Saves cropped charts into CHARTS_DIR and returns metadata list.
    Each item: {"bbox": (x0,y0,x1,y1), "image_path": "<abs path>"}

    Args:
        image_or_path: File path, PIL image, or ndarray of the page.
        min_area: Nominal minimum bounding-box area in pixels; boxes smaller
            than half of this are discarded.
        debug: Print contour and detection counts.
        visualize: Show the detections in a matplotlib window.

    Returns:
        Detected charts sorted by bbox area, largest first. An empty list is
        returned if the image cannot be loaded.
    """
    try:
        img = _ensure_bgr(image_or_path)
    except Exception as e:
        print("[chart_detect] load error:", e)
        return []

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    gray = cv2.equalizeHist(gray)
    blur = cv2.GaussianBlur(gray, (5, 5), 0)

    # Canny edge detection — lowered thresholds for faint edges
    edges = cv2.Canny(blur, 30, 100)
    contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    h_img, w_img = img.shape[:2]

    if debug:
        print(f"[chart_detect] Found {len(contours)} raw contours")

    # Pass 1: decide the final bounding boxes. Cropping is deferred until all
    # merging is done, so every saved image matches the bbox reported for it.
    boxes = []
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        area = w * h
        aspect = w / (h + 1e-8)

        # 🔧 More forgiving filtering
        if area < min_area * 0.5:
            continue
        if w > 0.98 * w_img or h > 0.98 * h_img:
            continue  # almost the whole page: likely the page border
        if not (0.1 < aspect < 10.0):
            continue

        # Merge into an existing box whose top-left corner is within 50 px.
        merged = False
        for i, (px0, py0, px1, py1) in enumerate(boxes):
            if abs(x - px0) < 50 and abs(y - py0) < 50:
                boxes[i] = (min(px0, x), min(py0, y), max(px1, x + w), max(py1, y + h))
                merged = True
                break
        if merged:
            continue

        # Slight padding (10% per side, capped at 40 px), clamped to the image.
        pad_x = int(min(0.1 * w, 40))
        pad_y = int(min(0.1 * h, 40))
        boxes.append((
            max(0, x - pad_x),
            max(0, y - pad_y),
            min(w_img, x + w + pad_x),
            min(h_img, y + h + pad_y),
        ))

    # Pass 2: crop and save each final box.
    charts = []
    for (x0, y0, x1, y1) in boxes:
        crop = img[y0:y1, x0:x1]
        crop_path = os.path.join(CHARTS_DIR, f"chart_{uuid.uuid4().hex}.png")
        try:
            # imwrite reports most failures by returning False, not by raising.
            saved = cv2.imwrite(crop_path, crop)
        except Exception as e:
            print(f"[chart_detect] Failed saving {crop_path}: {e}")
            continue
        if not saved:
            print(f"[chart_detect] Failed saving {crop_path}: cv2.imwrite returned False")
            continue
        charts.append({"bbox": (x0, y0, x1, y1), "image_path": crop_path})

    # Sort by size (largest first)
    charts.sort(key=lambda c: (c["bbox"][2] - c["bbox"][0]) * (c["bbox"][3] - c["bbox"][1]), reverse=True)

    if debug:
        print(f"[chart_detect] ✅ Detected {len(charts)} likely chart(s). Saved to {CHARTS_DIR}")

    # 🧠 Optional: Visualize results
    if visualize:
        vis = img.copy()
        for c in charts:
            x0, y0, x1, y1 = c["bbox"]
            cv2.rectangle(vis, (x0, y0), (x1, y1), (0, 255, 0), 3)
        plt.figure(figsize=(12, 10))
        plt.imshow(cv2.cvtColor(vis, cv2.COLOR_BGR2RGB))
        plt.title(f"Detected {len(charts)} chart(s)")
        plt.axis("off")
        plt.show()

    return charts
119
+
120
+ # Manual debug run
121
if __name__ == "__main__":
    # Manual smoke test: run detection on a sample page and print each hit.
    test_image = "samples/vdoc_rag_test_page1.png"  # example path
    results = detect_charts(test_image, debug=True, visualize=True)
    print(*results, sep="\n", end="\n" if results else "")
app/chart_reasoner.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import re
import json
from typing import List, Dict, Any, Optional

import pytesseract
from PIL import Image
import numpy as np
import cv2

# Optional Pix2Struct captioning.
# The imports and this loader block must appear only once: each
# from_pretrained() call loads the model, so a duplicated block loads the
# weights twice at import time.
USE_PIX2STRUCT = False
try:
    from transformers import AutoProcessor, AutoModelForVision2Seq

    _pix2_processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
    _pix2_model = AutoModelForVision2Seq.from_pretrained("google/pix2struct-textcaps-base")
    USE_PIX2STRUCT = True
    print("[chart_reasoner] Pix2Struct/TextCaps available for chart captioning.")
except Exception:
    # Deliberate best-effort: any import or load failure disables captioning.
    USE_PIX2STRUCT = False
    print("[chart_reasoner] Pix2Struct/TextCaps not available — will use OCR/geometric fallback.")

# Optional CLIP embeddings via sentence-transformers
USE_CLIP = False
try:
    from sentence_transformers import SentenceTransformer

    _clip_model = SentenceTransformer("clip-ViT-B-32")
    USE_CLIP = True
    print("[chart_reasoner] CLIP (sentence-transformers) available for chart embeddings.")
except Exception:
    # Deliberate best-effort: without CLIP, no embedding is attached to results.
    USE_CLIP = False
55
+
56
+
57
def preprocess_for_ocr(image_path: str) -> Image.Image:
    """Load an image and binarize it so text inside colored charts OCRs better.

    Pipeline: grayscale → histogram equalization → mean adaptive threshold
    (block size 21, constant 10).

    Raises:
        ValueError: If ``cv2.imread`` cannot read *image_path*.
    """
    bgr = cv2.imread(image_path)
    if bgr is None:
        raise ValueError(f"Could not read image: {image_path}")
    equalized = cv2.equalizeHist(cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY))
    binary = cv2.adaptiveThreshold(
        equalized, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 21, 10
    )
    return Image.fromarray(binary)
67
+
68
+
69
def _extract_numbers_from_text(text: str) -> List[float]:
    """Pull numeric values out of OCR text, in order of appearance.

    Handles thousands separators (``1,234``), percent signs (``12.5%`` → 12.5)
    and accounting-style negatives (``(500)`` → -500.0, also ``(12)%`` and
    ``(12%)``). A number that merely touches an unbalanced parenthesis or a
    sentence period (``2023)``, ``42.``) is still extracted. Tokens that do
    not parse as a float after cleanup (e.g. ``1.2.3``) are skipped.
    """
    nums: List[float] = []
    # The trailing `\)?` lets "(12%)" be seen as one fully wrapped token.
    for raw in re.findall(r"\(?-?\d[\d,\.\)\(]*%?\)?", text):
        token = raw.replace("%", "").replace(",", "").rstrip(".")
        # Fully wrapped in parentheses → accounting notation for a negative.
        negative = token.startswith("(") and token.endswith(")")
        # Drop wrapping or stray parentheses and periods at the edges only;
        # interior ones (e.g. "1)2") still make the token unparseable.
        token = token.strip("().")
        try:
            val = float(token)
        except ValueError:
            continue
        nums.append(-val if negative else val)
    return nums
87
+
88
+
89
def analyze_bar_chart(image_path: str, debug_save: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Find vertical bar-like shapes in a chart image and infer a simple trend.

    Args:
        image_path: Chart image to analyze.
        debug_save: If given, path where an overlay image with the detected
            bars outlined in green is written (best effort; errors ignored).

    Returns:
        ``None`` if the image cannot be read or no bar-like contour is found.
        Otherwise a dict with ``bar_count``, ``heights`` (pixels, left to
        right), ``normalized_heights`` (relative to the tallest bar),
        ``trend`` ("increasing" / "decreasing" / "flat", comparing the last
        bar with the first) and ``bars_xywh``.
    """
    image = cv2.imread(image_path)
    if image is None:
        return None

    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    edge_map = cv2.Canny(cv2.GaussianBlur(grayscale, (5, 5), 0), 50, 150)
    contours, _ = cv2.findContours(edge_map, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    image_height = image.shape[0]

    def _looks_like_bar(box) -> bool:
        """Vertical-bar heuristic on an (x, y, w, h) box."""
        _, _, bw, bh = box
        if bw < 6 or bh < 10:
            return False  # too small to be a bar
        if bh / (bw + 1e-8) < 1.2:
            return False  # not clearly taller than wide
        # Boxes spanning almost the full height are likely the page border.
        return not bh > 0.9 * image_height

    candidates = (cv2.boundingRect(c) for c in contours)
    # Keep bar-like boxes, ordered left to right.
    bars = sorted((box for box in candidates if _looks_like_bar(box)), key=lambda b: b[0])
    if not bars:
        return None

    heights = [int(box[3]) for box in bars]
    tallest = max(heights)
    first, last = heights[0], heights[-1]
    if last > first:
        trend = "increasing"
    elif last < first:
        trend = "decreasing"
    else:
        trend = "flat"

    # Debug overlay is strictly best effort and must never break the analysis.
    if debug_save:
        try:
            overlay = image.copy()
            for (bx, by, bw, bh) in bars:
                cv2.rectangle(overlay, (bx, by), (bx + bw, by + bh), (0, 255, 0), 2)
            cv2.imwrite(debug_save, overlay)
        except Exception:
            pass

    return {
        "bar_count": len(bars),
        "heights": heights,
        "normalized_heights": [height / tallest for height in heights],
        "trend": trend,
        "bars_xywh": bars,
    }
150
+
151
+
152
def process_chart_crop(image_path: str) -> Dict[str, Any]:
    """Main entry: returns a textual summary and structured analysis for a chart image.

    Combines up to three sources, each one best effort:
    a Pix2Struct caption (if the model loaded), geometric bar analysis, and
    OCR of the chart text with numbers extracted from it.

    Args:
        image_path: Path of the cropped chart image.

    Returns:
        ``{"summary_text": str, "structured": dict}``. ``structured`` holds
        ``ocr_text`` and ``numbers``, plus bar fields when bars were found and
        ``clip_vector`` when CLIP is available. If the image is missing or OCR
        fails completely, ``summary_text`` is an ``[Error] ...`` message and
        ``structured`` is empty.
    """
    if not os.path.exists(image_path):
        return {"summary_text": f"[Error] Chart image not found: {image_path}", "structured": {}}

    pix_caption = None
    if USE_PIX2STRUCT:
        try:
            # Context manager closes the underlying file handle; convert()
            # returns an independent in-memory copy.
            with Image.open(image_path) as raw_img:
                img = raw_img.convert("RGB")
            inputs = _pix2_processor(images=img, text="Describe this chart.", return_tensors="pt")
            outputs = _pix2_model.generate(**inputs, max_new_tokens=128)
            try:
                pix_caption = _pix2_processor.decode(outputs[0], skip_special_tokens=True)
            except Exception:
                # Fall back to decoding with the model's tokenizer directly.
                from transformers import AutoTokenizer

                tokenizer = AutoTokenizer.from_pretrained("google/pix2struct-textcaps-base")
                pix_caption = tokenizer.decode(outputs[0], skip_special_tokens=True)
        except Exception as e:
            print("[chart_reasoner] Pix2Struct failed:", e)
            pix_caption = None

    # Geometric analysis (bars)
    bar_info = None
    try:
        debug_overlay = None
        # VDOCRAG_CHARTS_DIR overrides the overlay folder; the default is the
        # "charts" folder beside this module.
        # NOTE(review): an overlay is written whenever that folder exists, not
        # only when the environment variable is set — confirm this is intended.
        charts_dir = os.environ.get("VDOCRAG_CHARTS_DIR", os.path.join(os.path.dirname(__file__), "charts"))
        if os.path.isdir(charts_dir):
            debug_overlay = os.path.join(charts_dir, f"debug_{os.path.basename(image_path)}")
        bar_info = analyze_bar_chart(image_path, debug_save=debug_overlay)
    except Exception as e:
        print("[chart_reasoner] analyze_bar_chart error:", e)
        bar_info = None

    # OCR with preprocessing to capture axis labels / numbers
    ocr_text = ""
    try:
        proc_img = preprocess_for_ocr(image_path)
        ocr_text = pytesseract.image_to_string(proc_img, config="--psm 6")
    except Exception as e:
        try:
            # fallback to raw OCR on the unprocessed image
            with Image.open(image_path) as raw_img:
                ocr_text = pytesseract.image_to_string(raw_img)
        except Exception as e2:
            return {"summary_text": f"[Error] OCR failure: {e} / {e2}", "structured": {}}

    nums = _extract_numbers_from_text(ocr_text)
    structured: Dict[str, Any] = {"ocr_text": ocr_text.strip(), "numbers": nums}

    summary_parts = []
    if pix_caption:
        summary_parts.append(pix_caption.strip())

    if ocr_text.strip():
        # Collapse whitespace and cap the OCR excerpt at 300 characters.
        summary_parts.append("OCR summary: " + " ".join(ocr_text.strip().split())[:300])

    if bar_info:
        structured.update({
            "bar_count": bar_info.get("bar_count"),
            "bar_heights": bar_info.get("heights"),
            "bar_trend": bar_info.get("trend"),
            "bars_xywh": bar_info.get("bars_xywh"),
        })
        summary_parts.append(f"Bar chart trend: {bar_info.get('trend')} (left→right)")

    # Optional CLIP embedding for retrieval (embeds the summary TEXT, falling
    # back to the raw OCR text when there is no summary).
    if USE_CLIP:
        try:
            emb = _clip_model.encode([" ".join(summary_parts) or ocr_text], normalize_embeddings=True)[0]
            structured["clip_vector"] = [float(x) for x in np.asarray(emb).tolist()]
        except Exception as e:
            print("[chart_reasoner] CLIP encode failed:", e)

    final_summary = " | ".join(summary_parts) if summary_parts else (ocr_text.strip() or "No description available.")

    return {"summary_text": final_summary, "structured": structured}
230
+
231
+
232
+ __all__ = ["process_chart_crop"]
app/debug_chunks.json ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "text_9078364bec07451fbe7900a99835907b",
4
+ "text": "VDoc RAG Test Document",
5
+ "metadata": {
6
+ "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
7
+ "page": 1,
8
+ "bbox": null,
9
+ "type": "text"
10
+ }
11
+ },
12
+ {
13
+ "id": "text_0195b06b4eed4e3a9de12f8f73380390",
14
+ "text": "Contains Charts, Tables, and Flyers",
15
+ "metadata": {
16
+ "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
17
+ "page": 1,
18
+ "bbox": null,
19
+ "type": "text"
20
+ }
21
+ },
22
+ {
23
+ "id": "text_c6ec891227d44d42ab809baf2469bbc8",
24
+ "text": "Sample Data Table:",
25
+ "metadata": {
26
+ "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
27
+ "page": 1,
28
+ "bbox": null,
29
+ "type": "text"
30
+ }
31
+ },
32
+ {
33
+ "id": "text_9e2edb5e2e4f42b9a344a594b904a859",
34
+ "text": "ID Name Score Category",
35
+ "metadata": {
36
+ "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
37
+ "page": 1,
38
+ "bbox": null,
39
+ "type": "text"
40
+ }
41
+ },
42
+ {
43
+ "id": "text_c94b49b4eedd4f4a82a76fad15610f69",
44
+ "text": "1 Alice 85 A",
45
+ "metadata": {
46
+ "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
47
+ "page": 1,
48
+ "bbox": null,
49
+ "type": "text"
50
+ }
51
+ },
52
+ {
53
+ "id": "text_db03ed6418594c0fb3451cb7ba032342",
54
+ "text": "2 Bob 78 B",
55
+ "metadata": {
56
+ "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
57
+ "page": 1,
58
+ "bbox": null,
59
+ "type": "text"
60
+ }
61
+ },
62
+ {
63
+ "id": "text_24e15dd936584136ad45cdb48b74f695",
64
+ "text": "3 Charlie 92 A+",
65
+ "metadata": {
66
+ "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
67
+ "page": 1,
68
+ "bbox": null,
69
+ "type": "text"
70
+ }
71
+ },
72
+ {
73
+ "id": "text_9d77d0b5b759433d899289bb7a1b79d9",
74
+ "text": "4 David 64 C",
75
+ "metadata": {
76
+ "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
77
+ "page": 1,
78
+ "bbox": null,
79
+ "type": "text"
80
+ }
81
+ },
82
+ {
83
+ "id": "text_3d67b96a270944bf9b35ba358d534830",
84
+ "text": "5 Eva 88 A",
85
+ "metadata": {
86
+ "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
87
+ "page": 1,
88
+ "bbox": null,
89
+ "type": "text"
90
+ }
91
+ },
92
+ {
93
+ "id": "text_7aabcd11cb544fd99fd75c591885b5e8",
94
+ "text": "Flyer Section: Upcoming AI Workshop",
95
+ "metadata": {
96
+ "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
97
+ "page": 1,
98
+ "bbox": null,
99
+ "type": "text"
100
+ }
101
+ },
102
+ {
103
+ "id": "text_86c5a632d7de4c258d9bced4e8de84b4",
104
+ "text": "Join us for an engaging AI Workshop covering:",
105
+ "metadata": {
106
+ "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
107
+ "page": 1,
108
+ "bbox": null,
109
+ "type": "text"
110
+ }
111
+ },
112
+ {
113
+ "id": "text_4dd5041c98f046a499bf076093f0503a",
114
+ "text": "- Machine Learning Basics",
115
+ "metadata": {
116
+ "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
117
+ "page": 1,
118
+ "bbox": null,
119
+ "type": "text"
120
+ }
121
+ },
122
+ {
123
+ "id": "text_30472772897c453491075424dbdf9927",
124
+ "text": "- LLM Applications",
125
+ "metadata": {
126
+ "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
127
+ "page": 1,
128
+ "bbox": null,
129
+ "type": "text"
130
+ }
131
+ },
132
+ {
133
+ "id": "text_e7f7640c0daa4146b3a30ec41d79b3e8",
134
+ "text": "- RAG (Retrieval Augmented Generation) Systems",
135
+ "metadata": {
136
+ "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
137
+ "page": 1,
138
+ "bbox": null,
139
+ "type": "text"
140
+ }
141
+ },
142
+ {
143
+ "id": "table_2791fed25a254479a185f8403f6ca385",
144
+ "text": "[{\"ID\": \"1\", \"Name\": \"Alice\", \"Score\": \"85\", \"Category\": \"A\"}, {\"ID\": \"2\", \"Name\": \"Bob\", \"Score\": \"78\", \"Category\": \"B\"}, {\"ID\": \"3\", \"Name\": \"Charlie\", \"Score\": \"92\", \"Category\": \"A+\"}, {\"ID\": \"4\", \"Name\": \"David\", \"Score\": \"64\", \"Category\": \"C\"}, {\"ID\": \"5\", \"Name\": \"Eva\", \"Score\": \"88\", \"Category\": \"A\"}]",
145
+ "metadata": {
146
+ "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
147
+ "page": 1,
148
+ "bbox": null,
149
+ "type": "table"
150
+ }
151
+ },
152
+ {
153
+ "id": "text_5d5f395ca35940d7aff54b1224f51940",
154
+ "text": "Date: November 20, 2025",
155
+ "metadata": {
156
+ "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
157
+ "page": 2,
158
+ "bbox": null,
159
+ "type": "text"
160
+ }
161
+ },
162
+ {
163
+ "id": "text_49338ecabed84edb8c71fa11dda41ff2",
164
+ "text": "Venue: Innovation Hall, Tech Park",
165
+ "metadata": {
166
+ "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
167
+ "page": 2,
168
+ "bbox": null,
169
+ "type": "text"
170
+ }
171
+ },
172
+ {
173
+ "id": "text_dd6a5828c8e64a628e0cec709e40172e",
174
+ "text": "Register now at: www.aiworkshop2025.com",
175
+ "metadata": {
176
+ "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
177
+ "page": 2,
178
+ "bbox": null,
179
+ "type": "text"
180
+ }
181
+ }
182
+ ]
app/embeddings.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import SentenceTransformer
2
+ import os
3
+ import numpy as np
4
+
5
+
6
class TextImageEmbedder:
    """Sentence-embedding wrapper that prefers a locally fine-tuned model.

    Model resolution order:
      1. an explicit ``text_model_name`` passed by the caller;
      2. the checkpoint directory ``../models/vdoc_feedback_tuned/latest``
         (relative to this file) when that path exists;
      3. the base ``all-MiniLM-L6-v2`` model.
    """

    def __init__(self, text_model_name=None):
        base_name = "all-MiniLM-L6-v2"
        tuned_dir = os.path.abspath(
            os.path.join(os.path.dirname(__file__), "..", "models", "vdoc_feedback_tuned", "latest")
        )

        if text_model_name:
            chosen = text_model_name
        elif os.path.exists(tuned_dir):
            print(f"🧠 Using fine-tuned embedding model: {tuned_dir}")
            chosen = tuned_dir
        else:
            print(f"📦 Using base embedding model: {base_name}")
            chosen = base_name

        self.text_model = SentenceTransformer(chosen)

    def embed_text(self, texts):
        """Encode a string or a list of strings.

        A bare string is wrapped into a one-element list first, so the result
        is always whatever the model's ``encode`` returns for a batch.
        Embeddings are requested with ``normalize_embeddings=True``.
        """
        batch = [texts] if isinstance(texts, str) else texts
        return self.text_model.encode(batch, show_progress_bar=False, normalize_embeddings=True)

    def embed_text_sync(self, text):
        """Encode a single text and return just its embedding (first row)."""
        return self.embed_text([text])[0]
app/feedback.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "timestamp": "2025-11-10T14:19:13",
4
+ "question": "what trend does the bar graph show",
5
+ "answer": "increasing (left→right)",
6
+ "correctness": "correct",
7
+ "sources": []
8
+ },
9
+ {
10
+ "timestamp": "2026-03-11T02:25:53",
11
+ "question": "what is this document about?",
12
+ "answer": "This document is about an upcoming AI Workshop covering Machine Learning Basics, scheduled for November 20, 2025. It also contains a section describing a chart or graph with data from 2018 to 2024.",
13
+ "correctness": "correct",
14
+ "sources": []
15
+ },
16
+ {
17
+ "timestamp": "2026-03-11T02:29:29",
18
+ "question": "what is this document about?",
19
+ "answer": "This document is about an upcoming AI Workshop covering Machine Learning Basics, scheduled for November 20, 2025. It also contains a section describing a chart or graph with data from 2018 to 2024.",
20
+ "correctness": "correct",
21
+ "sources": []
22
+ },
23
+ {
24
+ "timestamp": "2026-03-11T02:29:40",
25
+ "question": "what is this document about?",
26
+ "answer": "This document is about an upcoming AI Workshop covering Machine Learning Basics, scheduled for November 20, 2025. It also contains a section describing a chart or graph with data from 2018 to 2024.",
27
+ "correctness": "correct",
28
+ "sources": []
29
+ }
30
+ ]
app/feedback_manager.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from datetime import datetime
4
+
5
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
6
+ FEEDBACK_FILE = os.path.join(BASE_DIR, "feedback.json")
7
+
8
+
9
def _load_feedback():
    """Return the stored feedback entries as a list.

    Reads ``FEEDBACK_FILE`` as UTF-8 JSON. A missing file, a file that cannot
    be parsed, or a payload that is not a JSON array all yield ``[]`` so that
    callers can always ``append`` to / iterate over the result. Parse failures
    are reported on stdout instead of being swallowed silently.
    """
    try:
        with open(FEEDBACK_FILE, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        return []
    except ValueError as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError.
        print(f"[WARN] Could not parse feedback file {FEEDBACK_FILE}: {exc}")
        return []

    if not isinstance(data, list):
        print(f"[WARN] Feedback file {FEEDBACK_FILE} does not contain a list; ignoring it.")
        return []
    return data
17
+
18
+
19
def _save_feedback(data):
    """Persist ``data`` to ``FEEDBACK_FILE`` as pretty-printed UTF-8 JSON.

    The payload is written to a sibling ``.tmp`` file first and then moved
    over the real file with ``os.replace``, so an interrupted or failed write
    (e.g. a non-serialisable entry) never leaves a truncated feedback file.
    """
    tmp_path = f"{FEEDBACK_FILE}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        os.replace(tmp_path, FEEDBACK_FILE)
    finally:
        # After a successful replace the temp file is gone; this only cleans
        # up the leftover from a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
22
+
23
+
24
def record_feedback(question, answer, correctness, sources=None):
    """Append one feedback entry to the feedback store and return it.

    ``correctness`` is expected to be 'correct', 'incorrect' or 'partial'.
    The entry is stamped with the local time (second resolution); a missing
    or empty ``sources`` is stored as an empty list.
    """
    stamp = datetime.now().isoformat(timespec="seconds")
    entry = dict(
        timestamp=stamp,
        question=question,
        answer=answer,
        correctness=correctness,
        sources=sources or [],
    )

    history = _load_feedback()
    history.append(entry)
    _save_feedback(history)

    print(f"📝 Feedback recorded ({correctness}) for: {question[:60]}...")
    return entry
41
+
42
+
43
def get_feedback_summary():
    """Return a one-line summary of stored feedback.

    Counts entries whose ``correctness`` is exactly 'correct' or 'incorrect';
    any other value only contributes to the total.
    """
    entries = _load_feedback()
    if not entries:
        return "No feedback yet."

    verdicts = [entry.get("correctness") for entry in entries]
    correct = verdicts.count("correct")
    incorrect = verdicts.count("incorrect")
    total = len(entries)
    return f"Feedback Stats — ✅ {correct} correct, ❌ {incorrect} incorrect, total {total}"
app/highlight_calibration.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "x_offset": -33.0,
3
+ "x_scale": 1.0,
4
+ "y_offset": -65.0,
5
+ "y_scale": 1.02
6
+ }
app/indexer.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/indexer.py
2
+ import chromadb
3
+ import json
4
+ import os
5
+ import numpy as np
6
+
7
+
8
class ChromaIndexer:
    """Thin wrapper around a single Chroma collection named ``vdoc``.

    The collection is opened through ``chromadb.PersistentClient`` rooted at
    ``persist_directory``; if that client cannot be created the wrapper falls
    back to an in-memory ``chromadb.Client``. ``active_doc_id`` is used as the
    default ``doc_id`` filter for :meth:`query`.
    """

    def __init__(self, embedding_function=None, persist_directory="./storage/chroma_db"):
        os.makedirs(persist_directory, exist_ok=True)
        self.embedding_function = embedding_function
        self.persist_directory = persist_directory
        self.active_doc_id = None  # Track currently active document

        try:
            self.client = chromadb.PersistentClient(path=persist_directory)
        except Exception:
            # Deliberate best-effort: keep the app usable without persistence.
            print("[indexer] PersistentClient not available, falling back to in-memory client.")
            self.client = chromadb.Client()

        self.collection = self.client.get_or_create_collection(
            "vdoc",
            metadata={"description": "VDoc-RAG persistent storage"},
        )

        print(f"✅ Chroma index loaded from: {persist_directory}")

    def clear(self):
        """Drop and recreate the ``vdoc`` collection and reset the active document.

        Used for document isolation: call before indexing a new document.
        Failures are reported on stdout and leave the current state untouched.
        """
        try:
            self.client.delete_collection("vdoc")
            self.collection = self.client.get_or_create_collection(
                "vdoc",
                metadata={"description": "VDoc-RAG persistent storage"},
            )
            self.active_doc_id = None
            print("🗑️ Cleared all chunks from index (document isolation)")
        except Exception as e:
            print(f"[WARN] Failed to clear collection: {e}")

    def set_active_document(self, doc_id: str):
        """Set the document whose chunks :meth:`query` filters on by default."""
        self.active_doc_id = doc_id
        print(f"📄 Active document set to: {doc_id}")

    def _sanitize_metadata(self, metadata):
        """Return a copy of ``metadata`` holding only scalar values.

        ``str``/``int``/``float``/``bool`` values pass through. ``None`` values
        are dropped (NOTE(review): some Chroma releases reject ``None``
        metadata values; readers using ``.get`` see ``None`` either way).
        Anything else is JSON-encoded, falling back to ``str()``.
        """
        clean_meta = {}
        for key, value in metadata.items():
            if value is None:
                continue
            if isinstance(value, (str, int, float, bool)):
                clean_meta[key] = value
                continue
            try:
                clean_meta[key] = json.dumps(value)
            except (TypeError, ValueError):
                clean_meta[key] = str(value)
        return clean_meta

    def upsert(self, items):
        """Upsert ``(id, embedding, metadata, document)`` tuples into the collection.

        An empty ``items`` is a no-op rather than a call with empty lists.
        """
        if not items:
            return

        self.collection.upsert(
            ids=[it[0] for it in items],
            embeddings=[it[1] for it in items],
            metadatas=[self._sanitize_metadata(it[2]) for it in items],
            documents=[it[3] for it in items],
        )
        print(f"💾 Upserted {len(items)} chunks into persistent Chroma collection.")

    @staticmethod
    def _first_row(res, key):
        """Return ``res[key][0]`` or ``None`` when the key is missing/empty."""
        rows = res.get(key)
        if rows is None or len(rows) == 0:
            return None
        return rows[0]

    @staticmethod
    def _similarity(qarr, embeddings, distances, i):
        """Cosine similarity of hit ``i`` to the query, clamped to [0, 1].

        Prefers the stored embedding. If it is unavailable, falls back to the
        returned distance: the collection is created without an explicit
        distance space, so Chroma's default squared-L2 applies, and for
        unit-length vectors ``cos = 1 - d / 2``. This assumes normalised
        embeddings, which the project's embedder requests.
        """
        try:
            vec = np.asarray(embeddings[i], dtype=np.float32)
            cos = float(np.dot(qarr, vec) / (np.linalg.norm(qarr) * np.linalg.norm(vec) + 1e-8))
        except (TypeError, IndexError, ValueError):
            try:
                cos = 1.0 - float(distances[i]) / 2.0
            except (TypeError, IndexError, ValueError):
                cos = 0.0
        return max(0.0, min(1.0, cos))

    def query(self, qvec, top_k=5, doc_id=None):
        """Return the ``top_k`` nearest chunks for a query embedding.

        Args:
            qvec: query embedding (numpy vector or list).
            top_k: number of results requested from Chroma.
            doc_id: optional document filter; defaults to ``active_doc_id``.

        Returns:
            List of ``{"id", "text", "metadata", "score"}`` dicts sorted by
            descending score, where score is a cosine similarity in [0, 1]
            rounded to 4 decimals. ``text``/``metadata`` are ``None`` when
            Chroma returned none for that hit.
        """
        filter_doc = doc_id or self.active_doc_id

        query_params = {
            "query_embeddings": [qvec],
            "n_results": top_k,
            "include": ["embeddings", "metadatas", "documents", "distances"],
        }
        if filter_doc:
            query_params["where"] = {"doc_id": filter_doc}

        res = self.collection.query(**query_params)
        if not res or not res.get("ids"):
            return []

        ids = res["ids"][0]
        embeddings = self._first_row(res, "embeddings")
        distances = self._first_row(res, "distances")
        documents = self._first_row(res, "documents")
        metadatas = self._first_row(res, "metadatas")

        qarr = np.asarray(qvec, dtype=np.float32)

        out = []
        for i, chunk_id in enumerate(ids):
            out.append({
                "id": chunk_id,
                "text": documents[i] if documents is not None else None,
                "metadata": metadatas[i] if metadatas is not None else None,
                "score": round(self._similarity(qarr, embeddings, distances, i), 4),
            })

        out.sort(key=lambda hit: hit["score"], reverse=True)
        return out
app/ingest.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/ingest.py
2
+ """
3
+ PDF → Images → OCR text → Table extraction → Chart detection & reasoning
4
+ Generates chunks (text/table/chart) with metadata for embedding and indexing.
5
+ """
6
+
7
+ import os
8
+ import uuid
9
+ from pdf2image import convert_from_path
10
+ from PIL import Image
11
+ import pytesseract
12
+ import pdfplumber
13
+ from app.tables import extract_tables_from_pdf
14
+ from app.chart_detect import detect_charts
15
+ from app.chart_reasoner import process_chart_crop
16
+ from app.cache_manager import load_chunks_from_cache, save_chunks_to_cache
17
+
18
+ # Project-local temporary/storage directories
19
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
20
+ TMP_DIR = os.path.join(BASE_DIR, "tmp")
21
+ TABLES_DIR = os.path.join(BASE_DIR, "tables")
22
+ CHARTS_DIR = os.path.join(BASE_DIR, "charts")
23
+ os.makedirs(TMP_DIR, exist_ok=True)
24
+ os.makedirs(TABLES_DIR, exist_ok=True)
25
+ os.makedirs(CHARTS_DIR, exist_ok=True)
26
+
27
+
28
def pdf_to_images(pdf_path, dpi=200):
    """Return per-page image paths for a document, for OCR and visual analysis.

    PDFs are rasterised with ``convert_from_path`` at ``dpi`` and each page is
    saved as a uniquely named PNG under the project-local ``TMP_DIR``.
    Inputs that are already images (``.png``/``.jpg``/``.jpeg``, the other
    upload types the app accepts) cannot be rasterised as PDFs, so they are
    returned unchanged as a single-page list.
    """
    extension = os.path.splitext(str(pdf_path))[1].lower()
    if extension in (".png", ".jpg", ".jpeg"):
        return [pdf_path]

    pages = convert_from_path(pdf_path, dpi=dpi)
    # Use project-local tmp directory to avoid the system temp folder.
    os.makedirs(TMP_DIR, exist_ok=True)

    paths = []
    for page_no, page in enumerate(pages, start=1):
        page_path = os.path.join(TMP_DIR, f"page_{uuid.uuid4().hex}_{page_no}.png")
        page.save(page_path, "PNG")
        paths.append(page_path)
    return paths
41
+
42
+
43
def ocr_image_to_blocks(image_path, min_words_per_line=3):
    """Run OCR on an image and merge words into line-level text blocks.

    Words are grouped by Tesseract's ``(block_num, par_num, line_num)``
    triple. ``line_num`` alone restarts inside every block/paragraph, so
    grouping on it would merge unrelated lines from different regions.

    Args:
        image_path: path of the image to OCR.
        min_words_per_line: lines with fewer words than this are dropped.

    Returns:
        List of ``{"text": str, "bbox": (left, top, right, bottom)}`` in
        first-seen order, where the bbox is the union of the line's word boxes.
    """
    # Close the file handle as soon as the pixel data has been converted.
    with Image.open(image_path) as raw:
        img = raw.convert("RGB")
    data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT, config="--oem 3 --psm 6")

    lines = {}
    for i, raw_text in enumerate(data["text"]):
        word = raw_text.strip()
        if not word:
            continue
        key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
        left, top = data["left"][i], data["top"][i]
        line = lines.setdefault(key, {"words": [], "boxes": []})
        line["words"].append(word)
        line["boxes"].append((left, top, left + data["width"][i], top + data["height"][i]))

    blocks = []
    for line in lines.values():
        if len(line["words"]) < min_words_per_line:
            continue
        lefts, tops, rights, bottoms = zip(*line["boxes"])
        blocks.append({
            "text": " ".join(line["words"]).strip(),
            "bbox": (min(lefts), min(tops), max(rights), max(bottoms)),
        })
    return blocks
78
+
79
+
80
def _ocr_page_to_line_chunks(image_path, source, page_no):
    """OCR one page image and return one ``text`` chunk per recognised line."""
    # Close the file handle as soon as the pixel data has been converted.
    with Image.open(image_path) as raw:
        img = raw.convert("RGB")
    data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT, config="--oem 3 --psm 6")

    # Group on (block, paragraph, line): Tesseract restarts line_num inside
    # each block/paragraph, so line_num alone would merge adjacent regions
    # whose lines happen to share a number.
    lines = {}
    for i, raw_text in enumerate(data["text"]):
        word = raw_text.strip()
        if not word:
            continue
        key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
        left, top = data["left"][i], data["top"][i]
        line = lines.setdefault(key, {"words": [], "boxes": []})
        line["words"].append(word)
        line["boxes"].append((left, top, left + data["width"][i], top + data["height"][i]))

    chunks = []
    for line in lines.values():
        lefts, tops, rights, bottoms = zip(*line["boxes"])
        chunks.append({
            "id": f"{uuid.uuid4().hex}",
            "text": " ".join(line["words"]),
            "metadata": {
                "source": source,
                "page": page_no,
                "bbox": (min(lefts), min(tops), max(rights), max(bottoms)),
                "type": "text",
            },
        })
    return chunks


def process_pdf(path):
    """Turn a document into text, table and chart chunks.

    Steps:
      1. Return cached chunks for ``path`` when the cache has any.
      2. Rasterise the document and OCR each page into line-level text chunks.
      3. Extract tables via ``extract_tables_from_pdf`` (best-effort).
      4. Detect chart regions per page and summarise each crop with
         ``process_chart_crop`` (best-effort, per page).
      5. Save the chunks to the cache (best-effort).

    Returns:
        List of ``{"id", "text", "metadata"}`` chunks; ``metadata["type"]`` is
        ``"text"``, ``"table"`` or ``"chart"``.
    """
    cached = load_chunks_from_cache(path)
    if cached:
        print(f"✅ Using cached chunks for {os.path.basename(path)}")
        return cached

    items = []

    # 1️⃣ OCR text extraction (page images)
    images = pdf_to_images(path)
    for pno, imgpath in enumerate(images, start=1):
        items.extend(_ocr_page_to_line_chunks(imgpath, path, pno))

    # 2️⃣ Table extraction (structured CSVs)
    try:
        for t in extract_tables_from_pdf(path):
            items.append({
                "id": f"{uuid.uuid4().hex}",
                "text": t["summary_text"],
                "metadata": {
                    "source": path,
                    "page": t["page"],
                    "type": "table",
                    "csv_path": t["csv_path"],
                    "rows": t["rows"],
                    "bbox": t.get("bbox"),
                },
            })
    except Exception as e:
        # Tables are optional; keep whatever was extracted before the failure.
        print("[WARN] Table extraction failed:", e)

    # 3️⃣ Chart detection + reasoning
    for pno, imgpath in enumerate(images, start=1):
        try:
            for c in detect_charts(imgpath, debug=True):
                crop_path = c["image_path"]

                # Run reasoning model or OCR heuristic
                chart_res = process_chart_crop(crop_path)

                items.append({
                    "id": f"chart_{uuid.uuid4().hex}",
                    "text": chart_res.get("summary_text", "Chart region detected."),
                    "metadata": {
                        "source": path,
                        "page": pno,
                        "type": "chart",
                        "bbox": c["bbox"],
                        "image_path": crop_path,
                        "structured": chart_res.get("structured", {}),
                    },
                })
        except Exception as e:
            print(f"[WARN] Chart detection/reasoning failed on page {pno}:", e)

    # Save to cache for future reuse
    try:
        save_chunks_to_cache(path, items)
        print(f"💾 Cached {len(items)} chunks for {os.path.basename(path)}")
    except Exception as e:
        print("[WARN] Failed to save cache:", e)

    return items
app/main.py ADDED
@@ -0,0 +1,451 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import uvicorn
3
+ from fastapi import FastAPI, UploadFile, File, Form, Request, HTTPException
4
+ from fastapi.responses import HTMLResponse
5
+ from fastapi.staticfiles import StaticFiles
6
+ from fastapi.templating import Jinja2Templates
7
+
8
+ from app.ingest import process_pdf
9
+ from app.indexer import ChromaIndexer
10
+ from app.embeddings import TextImageEmbedder
11
+ from app.reader import LLMReader
12
+ from app.visual_highlight import render_highlighted_pages
13
+ from app.cache_manager import clear_cache
14
+ from app.feedback_manager import record_feedback, get_feedback_summary, _load_feedback
15
+ import shutil
16
+ import subprocess
17
+ import pandas as pd
18
+ import numpy as np
19
+ import matplotlib.pyplot as plt
20
+ from io import BytesIO
21
+ import base64
22
+ from sentence_transformers import SentenceTransformer
23
+ from sklearn.metrics.pairwise import cosine_similarity
24
+
25
+ # ---------------------------------------------------------
26
+ # Initialization
27
+ # ---------------------------------------------------------
28
+ app = FastAPI(title="VDoc RAG - Web UI")
29
+
30
+ # ---------------------------------------------------------
31
+ # Directories
32
+ # ---------------------------------------------------------
33
+ # Get absolute path to this file’s directory
34
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
35
+
36
+ # Define template and static directories relative to BASE_DIR
37
+ TEMPLATE_DIR = os.path.join(BASE_DIR, "templates")
38
+ STATIC_DIR = os.path.join(BASE_DIR, "static")
39
+
40
+ # Ensure directories exist
41
+ os.makedirs(TEMPLATE_DIR, exist_ok=True)
42
+ os.makedirs(STATIC_DIR, exist_ok=True)
43
+
44
+ # Mount static directory
45
+ app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
46
+ # Serve highlighted images
47
+ HIGHLIGHTED_DIR = os.path.join(BASE_DIR, "highlighted")
48
+ os.makedirs(HIGHLIGHTED_DIR, exist_ok=True)
49
+ app.mount("/highlighted", StaticFiles(directory=HIGHLIGHTED_DIR), name="highlighted")
50
+
51
+ # Load Jinja2 templates safely
52
+ templates = Jinja2Templates(directory=TEMPLATE_DIR)
53
+
54
+ # ---------------------------------------------------------
55
+ # Core Components
56
+ # ---------------------------------------------------------
57
+ embedder = TextImageEmbedder()
58
+ # Use a project-local persistent directory for Chroma
59
+ STORAGE_DIR = os.path.join(BASE_DIR, "storage", "chroma_db")
60
+ indexer = ChromaIndexer(embedding_function=embedder.embed_text, persist_directory=STORAGE_DIR)
61
+ reader_provider = os.environ.get("VDOCRAG_READER_PROVIDER", "gemini")
62
+ reader = LLMReader(provider=reader_provider)
63
+
64
+ uploaded_files = [] # track uploaded docs for display
65
+
66
+ # ---------------------------------------------------------
67
+ # Routes
68
+ # ---------------------------------------------------------
69
@app.get("/", response_class=HTMLResponse)
async def home(request: Request):
    """Render the main upload + query page with no answer yet.

    Also logs which template directory is in use and whether ``index.html``
    is present there.
    """
    print(f"✅ Using templates from: {TEMPLATE_DIR}")
    index_template = os.path.join(TEMPLATE_DIR, "index.html")
    if os.path.exists(index_template):
        print("✅ index.html found!")
    else:
        print("❌ index.html not found in:", TEMPLATE_DIR)

    page_context = {"request": request, "uploaded": uploaded_files, "answer": None}
    return templates.TemplateResponse("index.html", page_context)
82
+
83
+
84
@app.post("/upload")
async def upload_file(request: Request, file: UploadFile = File(...)):
    """Handle PDF/image upload: save, chunk, embed and index the document.

    Raises:
        HTTPException(400): missing filename or unsupported extension.
    """
    # The filename is client-controlled: keep only its final path component
    # (for both "/" and "\\" separators) so it cannot escape the uploads dir.
    safe_name = os.path.basename((file.filename or "").replace("\\", "/"))
    if not safe_name or not safe_name.lower().endswith((".pdf", ".png", ".jpg", ".jpeg")):
        raise HTTPException(status_code=400, detail="Unsupported file type")

    # Save uploaded file temporarily
    temp_dir = os.path.join(BASE_DIR, "uploads")
    os.makedirs(temp_dir, exist_ok=True)
    path = os.path.join(temp_dir, safe_name)

    with open(path, "wb") as f:
        f.write(await file.read())

    # 🔒 Document Isolation: Clear old chunks before indexing new document
    indexer.clear()
    uploaded_files.clear()  # Reset uploaded files list

    # Extract and process text chunks
    docs = process_pdf(path)
    if len(docs) == 0:
        return templates.TemplateResponse(
            "index.html",
            {
                "request": request,
                "uploaded": uploaded_files,
                "answer": "⚠️ No content extracted from file.",
            },
        )

    # The (sanitised) filename doubles as the document ID for isolation.
    doc_id = safe_name

    # Embed chunk texts
    texts = [d["text"] for d in docs]
    vectors = embedder.embed_text(texts)

    # Add doc_id to each chunk's metadata so queries can filter on it
    for d in docs:
        d["metadata"]["doc_id"] = doc_id

    items = [(d["id"], vectors[i].tolist(), d["metadata"], d["text"]) for i, d in enumerate(docs)]
    indexer.upsert(items)

    # Set this as the active document for queries
    indexer.set_active_document(doc_id)

    uploaded_files.append(safe_name)
    print(f"✅ Indexed {len(docs)} chunks from {safe_name} (document isolation enabled)")

    return templates.TemplateResponse(
        "index.html",
        {
            "request": request,
            "uploaded": uploaded_files,
            "answer": f"✅ Uploaded and indexed {safe_name} ({len(docs)} chunks).",
        },
    )
142
+
143
+
144
@app.post("/ask")
async def ask_question(request: Request, question: str = Form(...)):
    """Answer a user question from the indexed document.

    Embeds the question, retrieves the top 10 chunks, moves chart chunks to
    the front for chart-related questions, asks the LLM reader, renders
    highlight images for the first hit's source (best-effort), and renders the
    page with answer, sources, chunk previews and average confidence.
    """
    # Step 1 — Embed question
    qvec = embedder.embed_text([question])[0]

    # Step 2 — Retrieve top chunks
    hits = indexer.query(qvec, top_k=10)

    # The indexer may return None for a hit's text or metadata; normalise once
    # so the slicing and .get() calls below cannot raise.
    for h in hits:
        h["metadata"] = h.get("metadata") or {}
        h["text"] = h.get("text") or ""

    # Debug log
    print("\n🔍 Retrieved Chunks for Query:", question)
    for i, h in enumerate(hits):
        meta = h["metadata"]
        conf = h.get("score", 0)
        print(f"Chunk {i+1}: Page {meta.get('page')} | BBox: {meta.get('bbox')} | Confidence: {conf*100:.1f}%")
        print(f"Text: {h['text'][:500]}...\n")

    # Prioritize chart-type hits for chart-related questions. The sort is
    # stable, so relative score order is kept within each group.
    chart_keywords = ["chart", "graph", "trend", "plot", "increase", "decrease", "growth"]
    if any(k in question.lower() for k in chart_keywords):
        hits = sorted(hits, key=lambda h: h["metadata"].get("type") != "chart")
        print("[INFO] Prioritized chart-type chunks for chart-related question.")

    # Step 3 — Build context string
    context_blocks = [
        f"[{i+1}] {h['text']} (page: {h['metadata'].get('page')}, bbox: {h['metadata'].get('bbox')})"
        for i, h in enumerate(hits)
    ]
    context = "\n".join(context_blocks)

    # Step 4 — Ask LLM
    answer = reader.answer_question(query=question, context=context, sources=hits)
    sources = answer.get("sources", [])

    # 🖼️ Generate visual highlights (best-effort; the answer is shown regardless)
    highlight_urls = []
    try:
        first_source_path = hits[0]["metadata"].get("source") if hits else None
        if first_source_path and os.path.exists(first_source_path):
            highlight_paths = render_highlighted_pages(first_source_path, hits)
            # convert to web URLs for template
            highlight_urls = ["/" + os.path.relpath(p, BASE_DIR).replace("\\", "/") for p in highlight_paths]
    except Exception as e:
        print("[WARN] Highlight rendering failed:", e)
        highlight_urls = []

    # Step 5 — Prepare chunk previews for UI
    chunk_previews = [
        {
            "index": i + 1,
            "page": h["metadata"].get("page"),
            "bbox": h["metadata"].get("bbox"),
            "text": h["text"][:300] + ("..." if len(h["text"]) > 300 else ""),
            "confidence": round(h.get("score", 0) * 100, 1),
        }
        for i, h in enumerate(hits)
    ]

    # Average confidence for the retrieved set (0 when nothing was retrieved)
    avg_conf = sum(h.get("score", 0) for h in hits) / max(len(hits), 1)

    # Step 6 — Render page
    return templates.TemplateResponse(
        "index.html",
        {
            "request": request,
            "uploaded": uploaded_files,
            "answer": answer["text"],
            "question": question,
            "sources": sources,
            "chunks": chunk_previews,
            "highlight_images": highlight_urls,
            "confidence_avg": round(avg_conf * 100, 1),
        },
    )
224
+
225
+
226
@app.post("/clear_cache")
async def clear_cache_route(request: Request):
    """Wipe the cached chunk data, then re-render the index page with a notice."""
    clear_cache()
    page_context = {
        "request": request,
        "uploaded": uploaded_files,
        "answer": "🧹 Cache cleared successfully!",
    }
    return templates.TemplateResponse("index.html", page_context)
238
+
239
+
240
@app.post("/clear_index")
async def clear_index(request: Request):
    """Reset the persistent Chroma index and report what actually happened.

    Deletes ``<BASE_DIR>/storage/chroma_db``, recreates it empty and rebinds the
    module-level ``indexer`` to a fresh ``ChromaIndexer`` on that directory.

    Returns:
        The rendered ``index.html`` page whose ``answer`` is a success message,
        or a warning when any of the reset steps raised.
    """
    global indexer

    storage_dir = os.path.join(BASE_DIR, "storage", "chroma_db")
    try:
        shutil.rmtree(storage_dir, ignore_errors=True)
        os.makedirs(storage_dir, exist_ok=True)
        # Reinitialize indexer client to the new empty DB
        indexer = ChromaIndexer(embedding_function=embedder.embed_text, persist_directory=storage_dir)
    except Exception as e:
        print("[WARN] clear_index failed:", e)
        # Do not claim success when the reset did not complete.
        message = "⚠️ Failed to clear Chroma index"
    else:
        message = "🧹 Chroma index cleared successfully!"

    return templates.TemplateResponse(
        "index.html",
        {"request": request, "uploaded": uploaded_files, "answer": message},
    )
256
+
257
+
258
@app.post("/feedback")
async def feedback(
    request: Request,
    question: str = Form(...),
    answer: str = Form(...),
    correctness: str = Form(...),
):
    """Store one feedback entry for an answer and show the outcome on the index page."""
    try:
        record_feedback(question=question, answer=answer, correctness=correctness)
        status_text = f"✅ Feedback received! {get_feedback_summary()}"
    except Exception as err:
        # Best effort: a storage problem must not break the page.
        print("[WARN] Failed to record feedback:", err)
        status_text = "⚠️ Failed to record feedback"

    page_context = {"request": request, "uploaded": uploaded_files, "answer": status_text}
    return templates.TemplateResponse("index.html", page_context)
273
+
274
+
275
@app.get("/feedback_dashboard", response_class=HTMLResponse)
async def feedback_dashboard(request: Request):
    """Render the feedback dashboard: summary, counters and the 50 newest entries."""
    entries = _load_feedback()
    summary_text = get_feedback_summary()

    def count_with(label):
        # Number of entries whose "correctness" field equals *label*.
        return len([entry for entry in entries if entry.get("correctness") == label])

    newest_first = entries[::-1][:50]  # show latest 50
    page_context = {
        "request": request,
        "summary": summary_text,
        "total": len(entries),
        "correct": count_with("correct"),
        "incorrect": count_with("incorrect"),
        "feedback_data": newest_first,
    }
    return templates.TemplateResponse("feedback_dashboard.html", page_context)
295
+
296
+
297
@app.post("/train_feedback_model")
async def train_feedback_model(request: Request):
    """Run the fine-tuning script from the UI and show its outcome.

    The script ``train_feedback_embeddings.py`` (one directory above
    ``BASE_DIR``) is executed with the interpreter that runs this server, in a
    worker thread so the event loop stays responsive while it runs.

    Returns:
        The rendered ``feedback_dashboard.html`` page with the status
        ``message``, the last 1000 characters of the script output in
        ``train_output``, and the same summary/counter fields the regular
        dashboard route provides.
    """
    # Function-scope imports: only this handler needs them.
    import asyncio
    import sys

    script_path = os.path.abspath(os.path.join(BASE_DIR, "..", "train_feedback_embeddings.py"))

    def _run_training():
        # sys.executable keeps the script inside the server's own environment;
        # a bare "python" may resolve to a different interpreter or not at all.
        return subprocess.run(
            [sys.executable or "python", script_path],
            capture_output=True,
            text=True,
            check=True,
        )

    try:
        print(f"🚀 Launching fine-tuning process: {script_path}")
        # subprocess.run blocks; off-load it so other requests are still served.
        process = await asyncio.get_running_loop().run_in_executor(None, _run_training)
        output = process.stdout[-1000:]
        message = "✅ Fine-tuning complete. Model updated successfully!"
    except subprocess.CalledProcessError as e:
        # The end of stderr holds the actual error, so keep the tail.
        output = (e.stderr or str(e))[-1000:]
        message = "❌ Fine-tuning failed."
    except OSError as e:
        # The interpreter or script could not be launched at all.
        output = str(e)
        message = "❌ Fine-tuning failed."

    data = _load_feedback()
    return templates.TemplateResponse(
        "feedback_dashboard.html",
        {
            "request": request,
            "summary": get_feedback_summary(),
            # The template renders these counters; without them they show up empty.
            "total": len(data),
            "correct": sum(1 for x in data if x.get("correctness") == "correct"),
            "incorrect": sum(1 for x in data if x.get("correctness") == "incorrect"),
            "feedback_data": data[::-1][:50],
            "train_output": output,
            "message": message,
        },
    )
326
+
327
+
328
@app.get("/benchmark_dashboard", response_class=HTMLResponse)
async def benchmark_dashboard(request: Request):
    """Show the benchmark page in its initial state, with no results or plots."""
    page_context = {"request": request}
    # Nothing has been computed yet, so every result slot is None.
    for slot in ("results", "plot_precision", "plot_recall", "plot_mrr"):
        page_context[slot] = None
    return templates.TemplateResponse("benchmark_dashboard.html", page_context)
341
+
342
+
343
@app.post("/run_benchmark")
async def run_benchmark(request: Request, models: str = Form(...), chunk_size: int = Form(200), top_k: int = Form(5)):
    """
    Run embedding benchmark across provided models using stored feedback data.

    Each feedback record contributes a (question, answer) pair. A corpus chunk
    counts as relevant to a question when it contains the answer text
    (case-insensitive). For every model that loads, the handler reports mean
    Precision@K, Recall@K and MRR and renders one bar chart per metric.

    Args:
        request: Incoming request, forwarded to the template.
        models: Comma-separated SentenceTransformer model names or paths.
        chunk_size: Character length of the corpus sub-chunks; must be >= 1.
        top_k: Number of chunks retrieved per question; must be >= 1.

    Returns:
        The rendered ``benchmark_dashboard.html`` page. When the input is
        invalid or nothing can be evaluated it carries an explanatory
        ``message`` and an empty result list.
    """

    def render(results, message=None, plots=None):
        # Single place that builds the template context for every exit path.
        plots = plots or {}
        return templates.TemplateResponse(
            "benchmark_dashboard.html",
            {
                "request": request,
                "results": results,
                "plot_precision": plots.get("precision"),
                "plot_recall": plots.get("recall"),
                "plot_mrr": plots.get("mrr"),
                "message": message,
            },
        )

    # chunk_size == 0 would break range(); top_k == 0 would divide by zero.
    if chunk_size < 1 or top_k < 1:
        return render([], "⚠️ Chunk size and Top-K must be at least 1.")

    data = _load_feedback()
    if not data:
        return render([], "⚠️ No feedback data available for benchmarking.")

    # Keep only records with a real question and answer: an empty answer is a
    # substring of every chunk and would mark the whole corpus as relevant.
    # NOTE(review): answers rated "incorrect" are still used as relevance
    # labels, as before — confirm whether they should be excluded.
    labelled = [(f.get("question"), f.get("answer")) for f in data]
    labelled = [
        (q, a)
        for q, a in labelled
        if isinstance(q, str) and isinstance(a, str) and q.strip() and a.strip()
    ]
    if not labelled:
        return render([], "⚠️ No usable question/answer pairs in the feedback data.")
    queries = [q for q, _ in labelled]
    answers = [a.lower() for _, a in labelled]

    # NOTE(review): these names come straight from the form and are handed to
    # SentenceTransformer (Hub download or local path) — consider an allow-list
    # if this page is reachable by untrusted users.
    model_names = [m.strip() for m in models.split(",") if m.strip()]
    if not model_names:
        return render([], "⚠️ Please provide at least one model name.")

    pdf_path = os.path.join(BASE_DIR, "samples", "vdoc_rag_test.pdf")
    try:
        raw_chunks = [d["text"] for d in process_pdf(pdf_path)]
    except Exception as e:
        print("[WARN] Could not process sample PDF for benchmark, falling back to small corpus:", e)
        raw_chunks = [
            "Yearly sales have been increasing steadily from 2018 to 2024, with a notable jump in 2021.",
            "Charlie achieved the highest score in the table with 98 points.",
            "The event will be held on November 20, 2025 at the downtown auditorium.",
        ]

    # Split raw_chunks into sub-chunks by character length
    chunks = []
    for ch in raw_chunks:
        if not ch:
            continue
        for i in range(0, len(ch), chunk_size):
            chunks.append(ch[i : i + chunk_size])
    if not chunks:
        return render([], "⚠️ The benchmark corpus is empty.")

    # Relevance depends only on the text, not on the model: compute it once.
    chunks_lower = [c.lower() for c in chunks]
    relevance = [[ans in c for c in chunks_lower] for ans in answers]
    relevant_totals = [sum(row) for row in relevance]

    results = []
    for model_name in model_names:
        try:
            print(f"🧠 Evaluating {model_name}...")
            # NOTE(review): loading and encoding are blocking calls inside an
            # async handler; other requests wait while a benchmark runs.
            model = SentenceTransformer(model_name)
            chunk_embeddings = model.encode(chunks, normalize_embeddings=True, show_progress_bar=False)
            # Encode all questions in one batch instead of one call per question.
            query_embeddings = model.encode(queries, normalize_embeddings=True, show_progress_bar=False)
        except Exception as e:
            print(f"[ERROR] Failed to load model {model_name}:", e)
            continue

        sims = cosine_similarity(query_embeddings, chunk_embeddings)
        precision_scores, recall_scores, mrr_scores = [], [], []

        for row, flags, n_relevant in zip(sims, relevance, relevant_totals):
            top_idx = np.argsort(row)[::-1][:top_k]
            retrieved = [flags[i] for i in top_idx]
            n_hits = sum(retrieved)
            precision_scores.append(n_hits / top_k)
            recall_scores.append(n_hits / max(1, n_relevant))
            # Reciprocal rank of the first relevant chunk, 0 when none was found.
            mrr_scores.append(next((1 / rank for rank, rel in enumerate(retrieved, start=1) if rel), 0))

        results.append({
            "model": model_name,
            "precision": round(float(np.mean(precision_scores)), 3),
            "recall": round(float(np.mean(recall_scores)), 3),
            "mrr": round(float(np.mean(mrr_scores)), 3),
        })

    if not results:
        return render([], "⚠️ None of the requested models could be evaluated.")

    df = pd.DataFrame(results)
    print(df)

    def make_plot(metric):
        # Explicit figure/axes objects avoid pyplot's shared "current figure",
        # and the figure is closed even if drawing fails.
        fig, ax = plt.subplots(figsize=(6, 4))
        try:
            ax.barh(df["model"], df[metric], color="skyblue")
            ax.set_title(f"{metric.upper()} Comparison")
            ax.set_xlabel(metric.upper())
            fig.tight_layout()
            buf = BytesIO()
            fig.savefig(buf, format="png")
        finally:
            plt.close(fig)
        img_base64 = base64.b64encode(buf.getvalue()).decode("utf-8")
        return f"data:image/png;base64,{img_base64}"

    plots = {metric: make_plot(metric) for metric in ("precision", "recall", "mrr")}
    return render(results, plots=plots)
446
+
447
+ # ---------------------------------------------------------
448
+ # Run app
449
+ # ---------------------------------------------------------
450
if __name__ == "__main__":
    # Direct-execution entry point: serve the app by its import string.
    server_options = {"host": "0.0.0.0", "port": 8000, "reload": True}
    uvicorn.run("app.main:app", **server_options)
app/reader.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import List, Dict
3
+ from dotenv import dotenv_values
4
+ from transformers import pipeline
5
+
6
+ # --- Load from .env first, then fall back to system environment (for cloud deployment) ---
7
+ env_vars = dotenv_values(".env") # returns a dict from the .env file
8
+
9
def get_env(key, default=None):
    """Return *key* from the parsed .env values, else from ``os.environ``, else *default*.

    A falsy value (missing or empty string) at one level falls through to the next.
    """
    for candidate in (env_vars.get(key), os.environ.get(key)):
        if candidate:
            return candidate
    return default
12
+
13
+ # --- FIX: Use the correct modern SDK import (google-genai) and initialize client ---
14
# Both names stay None unless the SDK imports and a client can be built.
_gemini_client = None
genai = None
_api_key = get_env("GEMINI_API_KEY")
try:
    from google import genai
    from google.genai import types

    # Only build a client when a key was found in .env or the system environment.
    if _api_key:
        _gemini_client = genai.Client(api_key=_api_key)
except ImportError:
    # SDK not installed: leave genai / _gemini_client as None.
    pass
except Exception as e:
    print(f"Warning: Failed to initialize Gemini client. Check API key/configuration. Error: {e}")
28
+
29
class LLMReader:
    """
    LLM Reader using Google Gemini (via GEMINI_API_KEY from .env or environment)
    Falls back to a local small model if unavailable.

    Attributes set by ``__init__``:
        provider: ``"gemini"`` or ``"local"`` after fallback resolution.
        model: Gemini model name (``VDOCRAG_LLM_MODEL``, default ``gemini-2.5-flash``).
        api_key: Value of ``GEMINI_API_KEY`` or ``None``.
        client: The module-level Gemini client, or ``None``.
        local_pipeline: ``distilgpt2`` text-generation pipeline when the
            provider is ``"local"``, otherwise ``None``.
    """

    def __init__(self, provider: str = "gemini"):
        """Resolve the provider and load whatever backend it needs.

        Args:
            provider: ``"gemini"`` or ``"local"`` (case-insensitive); any other
                value is replaced by ``"local"``.

        Raises:
            ImportError: If Gemini is requested, a key is present, but the
                ``google-genai`` SDK is not installed.
        """
        self.provider = provider.lower()

        # Load from .env or system environment (for cloud deployment)
        self.model = get_env("VDOCRAG_LLM_MODEL", "gemini-2.5-flash")
        self.api_key = get_env("GEMINI_API_KEY")
        self.client = _gemini_client
        self.local_pipeline = None

        print("=" * 50)
        print("LLMReader Init: Loading GEMINI_API_KEY...")
        if self.api_key:
            # Never echo any part of the secret: logs are often shared or stored.
            print("LLMReader Init: SUCCESS. GEMINI_API_KEY is set.")
        else:
            print("LLMReader Init: FAILED. GEMINI_API_KEY not found.")
        print("=" * 50)

        if self.provider not in ("gemini", "local"):
            print(f"⚠️ Unknown provider '{self.provider}', defaulting to local.")
            self.provider = "local"

        if self.provider == "gemini":
            # Check for API key first - if missing, fall back to local
            if not self.api_key:
                print("⚠️ No GEMINI_API_KEY found, switching to local model.")
                self.provider = "local"
            elif genai is None:
                raise ImportError("Please install the modern Google GenAI SDK: `pip install google-genai`.")
            elif self.client is None:
                print("⚠️ Failed to initialize Gemini client, switching to local model.")
                self.provider = "local"

        # Single loading point for every path that ended up on the local model.
        if self.provider == "local":
            print("Loading local model: distilgpt2...")
            self.local_pipeline = pipeline("text-generation", model="distilgpt2")

    # --------------------------
    # Gemini call (modern SDK)
    # --------------------------
    def _call_gemini(self, query: str, context: str) -> str:
        """Ask Gemini to answer *query* from *context*.

        Returns:
            The stripped answer text, or a ``[Gemini API Error] ...`` string
            when the call fails or yields no text. Errors are never raised.
        """
        system_prompt = (
            "You are a precise data analysis assistant. "
            "Given the provided CONTEXT, answer the user's QUESTION accurately. "
            "If calculations are needed, perform them. "
            "Only respond with the final answer and no additional commentary or explanation."
        )

        user_content = f"CONTEXT:\n---\n{context}\n---\nQUESTION: {query}"

        try:
            config = types.GenerateContentConfig(
                system_instruction=system_prompt,
                temperature=0.1
            )
            response = self.client.models.generate_content(
                model=self.model,
                contents=user_content,
                config=config
            )
            text = response.text
            if not text:
                # Report a missing answer plainly instead of failing on None.strip().
                return "[Gemini API Error] Empty response from model"
            return text.strip()
        except Exception as e:
            return f"[Gemini API Error] {type(e).__name__}: {e}"

    # --------------------------
    # Local fallback
    # --------------------------
    def _call_local(self, query: str, context: str) -> str:
        """Generate an answer with the local text-generation pipeline.

        Returns:
            The text generated after the prompt, or a bracketed failure note
            when nothing new was produced or the context was echoed back.
        """
        prompt = (
            f"CONTEXT:\n{context}\n\n"
            f"Based on the context, answer the following question:\n"
            f"QUESTION: {query}\n"
            f"ANSWER:"
        )

        result = self.local_pipeline(
            prompt,
            max_new_tokens=100,
            do_sample=True,
            truncation=True
        )
        generated_text = result[0]["generated_text"]
        answer = generated_text[len(prompt):].strip()

        # An empty context is a substring of everything; only test for an echo
        # when there is a context to echo.
        echoed_context = bool(context) and context in answer
        if not answer or echoed_context:
            return "[Local model failed to generate a new answer and may have repeated the context]"
        return answer

    # --------------------------
    # Main answer method
    # --------------------------
    def answer_question(self, query: str, context: str, sources: List[Dict]) -> Dict:
        """Answer *query* from *context* and attach provenance for *sources*.

        Args:
            query: The user's question.
            context: Text given to the model as grounding.
            sources: Retrieved hits; ``metadata.page``, ``text`` and ``score``
                are read from each and may be missing.

        Returns:
            ``{"text": <answer>, "sources": [{"page", "text", "score"}, ...]}``
            where each source text is cut to 200 characters.
        """
        if self.provider == "gemini":
            answer_text = self._call_gemini(query, context)
        elif self.provider == "local":
            answer_text = self._call_local(query, context)
        else:
            answer_text = f"[Error: Unknown provider '{self.provider}']"

        # Tolerate partial hits so a missing field cannot discard a finished answer.
        provenance = [
            {
                "page": (s.get("metadata") or {}).get("page"),
                "text": (s.get("text") or "")[:200],
                "score": s.get("score", 0),
            }
            for s in sources
        ]

        return {"text": answer_text, "sources": provenance}
app/tables.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/tables.py
2
+ import os
3
+ import uuid
4
+ import pdfplumber
5
+ import pandas as pd
6
+ from typing import List, Dict
7
+
8
+ TABLES_DIR = os.environ.get('VDOCRAG_TABLES_DIR', '/tmp/vdoc_tables')
9
+ os.makedirs(TABLES_DIR, exist_ok=True)
10
+
11
def extract_tables_from_pdf(pdf_path: str) -> List[Dict]:
    """
    Extract tables using pdfplumber and save each as CSV.

    Args:
        pdf_path: Path of the PDF to scan.

    Returns:
        One metadata dict per extracted table:
        [{ 'csv_path': str, 'page': int, 'table_index': int,
           'summary_text': str, 'rows': int, 'bbox': tuple | None }]
        ``page`` is 1-based, ``table_index`` is 0-based within the page and
        ``bbox`` is ``(x0, top, x1, bottom)`` or ``None`` when unavailable.
    """
    results = []
    with pdfplumber.open(pdf_path) as pdf:
        for pno, page in enumerate(pdf.pages, start=1):
            try:
                tables = page.extract_tables()
            except Exception:
                tables = []

            # Locate the tables once per page. find_tables() and extract_tables()
            # yield tables in the same order, so index tidx maps to the same table.
            try:
                found_tables = page.find_tables() if tables else []
            except Exception:
                found_tables = []

            for tidx, table in enumerate(tables):
                # Convert to DataFrame (first row is treated as the header)
                try:
                    df = pd.DataFrame(table[1:], columns=table[0]) if len(table) > 1 else pd.DataFrame(table)
                except Exception:
                    df = pd.DataFrame(table)

                fname = f"table_{uuid.uuid4().hex}_p{pno}_t{tidx}.csv"
                csv_path = os.path.join(TABLES_DIR, fname)
                # Save CSV
                try:
                    df.to_csv(csv_path, index=False)
                except Exception:
                    df.to_csv(csv_path, index=False, encoding='utf-8', errors='ignore')

                # Bounding box of THIS table. Indexing with tidx - 1 paired the
                # first table with the last one's box and shifted all the others.
                table_bbox = found_tables[tidx].bbox if tidx < len(found_tables) else None

                # create a short textual summary: columns and first N rows
                cols = list(df.columns) if len(df.columns) > 0 else []
                top_rows = df.head(5).to_dict(orient='records')
                summary = f"Table (page {pno}) with columns: {cols}. First rows: {top_rows}"

                results.append({
                    'csv_path': csv_path,
                    'page': pno,
                    'table_index': tidx,
                    'summary_text': summary,
                    'rows': len(df),
                    'bbox': table_bbox
                })
    return results
app/templates/benchmark_dashboard.html ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <title>📊 Embedding Benchmark Dashboard</title>
6
+ <script src="https://cdn.tailwindcss.com"></script>
7
+ </head>
8
+ <body class="bg-blue-50 text-blue-800 min-h-screen flex flex-col items-center py-8">
9
+ <div class="bg-white shadow-lg rounded-xl p-8 w-full max-w-5xl">
10
+ <h1 class="text-2xl font-bold text-center mb-6">📊 Embedding Model Benchmark</h1>
11
+
12
+ <form action="/run_benchmark" method="post" class="space-y-3 mb-6">
13
+ <label class="block text-sm font-semibold">Enter model names (comma-separated)</label>
14
+ <input type="text" name="models"
15
+ value="all-MiniLM-L6-v2, multi-qa-MiniLM-L6-cos-v1, models/vdoc_feedback_tuned/latest"
16
+ class="border rounded w-full p-2 focus:outline-none focus:ring-2 focus:ring-blue-400"
17
+ required>
18
+
19
+ <div class="flex space-x-4">
20
+ <div class="flex-1">
21
+ <label class="block text-sm font-semibold">Chunk Size</label>
22
+ <input type="number" name="chunk_size" value="200" class="border p-2 rounded w-full">
23
+ </div>
24
+ <div class="flex-1">
25
+ <label class="block text-sm font-semibold">Top-K</label>
26
+ <input type="number" name="top_k" value="5" class="border p-2 rounded w-full">
27
+ </div>
28
+ </div>
29
+
30
+ <button type="submit"
31
+ class="bg-blue-600 text-white px-6 py-2 rounded hover:bg-blue-700 w-full mt-3">
32
+ 🚀 Run Benchmark
33
+ </button>
34
+ </form>
35
+
36
+ {% if results %}
37
+ <h2 class="text-xl font-semibold mb-4">📈 Results</h2>
38
+ <table class="w-full border border-gray-300 text-sm mb-6">
39
+ <thead class="bg-blue-100 text-blue-800">
40
+ <tr>
41
+ <th class="border px-3 py-1 text-left">Model</th>
42
+ <th class="border px-3 py-1">Precision</th>
43
+ <th class="border px-3 py-1">Recall</th>
44
+ <th class="border px-3 py-1">MRR</th>
45
+ </tr>
46
+ </thead>
47
+ <tbody>
48
+ {% for r in results %}
49
+ <tr class="border-b hover:bg-blue-50">
50
+ <td class="px-3 py-1">{{ r.model }}</td>
51
+ <td class="px-3 py-1 text-center">{{ r.precision }}</td>
52
+ <td class="px-3 py-1 text-center">{{ r.recall }}</td>
53
+ <td class="px-3 py-1 text-center">{{ r.mrr }}</td>
54
+ </tr>
55
+ {% endfor %}
56
+ </tbody>
57
+ </table>
58
+
59
+ <div class="grid grid-cols-1 md:grid-cols-3 gap-4 mb-6">
60
+ {% if plot_precision %}
61
+ <img src="{{ plot_precision }}" class="rounded shadow">
62
+ {% endif %}
63
+ {% if plot_recall %}
64
+ <img src="{{ plot_recall }}" class="rounded shadow">
65
+ {% endif %}
66
+ {% if plot_mrr %}
67
+ <img src="{{ plot_mrr }}" class="rounded shadow">
68
+ {% endif %}
69
+ </div>
70
+ {% endif %}
71
+
72
+ {% if message %}
73
+ <p class="text-center text-red-700 font-semibold">{{ message }}</p>
74
+ {% endif %}
75
+
76
+ <div class="text-center mt-8">
77
+ <a href="/" class="text-blue-600 hover:underline">← Back to Main Interface</a>
78
+ </div>
79
+ </div>
80
+ </body>
81
+ </html>
app/templates/feedback_dashboard.html ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <title>🧠 VDoc Feedback Dashboard</title>
6
+ <script src="https://cdn.tailwindcss.com"></script>
7
+ </head>
8
+ <body class="bg-blue-50 text-blue-800 min-h-screen flex flex-col items-center py-8">
9
+ <div class="bg-white shadow-lg rounded-xl p-8 w-full max-w-4xl">
10
+ <h1 class="text-2xl font-bold mb-4 text-center">🧠 Feedback Dashboard</h1>
11
+
12
+ <p class="text-center text-gray-600 mb-4">{{ summary }}</p>
13
+
14
+ <div class="grid grid-cols-3 gap-4 text-center mb-6">
15
+ <div class="bg-green-100 p-3 rounded-lg">
16
+ <p class="text-xl font-bold text-green-700">{{ correct }}</p>
17
+ <p class="text-sm text-green-800">Correct</p>
18
+ </div>
19
+ <div class="bg-red-100 p-3 rounded-lg">
20
+ <p class="text-xl font-bold text-red-700">{{ incorrect }}</p>
21
+ <p class="text-sm text-red-800">Incorrect</p>
22
+ </div>
23
+ <div class="bg-blue-100 p-3 rounded-lg">
24
+ <p class="text-xl font-bold text-blue-700">{{ total }}</p>
25
+ <p class="text-sm text-blue-800">Total Feedback</p>
26
+ </div>
27
+ </div>
28
+
29
+ <!-- Train model -->
30
+ <form action="/train_feedback_model" method="post" class="text-center mb-8">
31
+ <button type="submit"
32
+ class="bg-blue-600 text-white px-6 py-2 rounded hover:bg-blue-700">
33
+ 🚀 Train Model from Feedback
34
+ </button>
35
+ </form>
36
+
37
+ {% if message %}
38
+ <div class="bg-gray-50 border-l-4 border-blue-400 p-3 mb-6">
39
+ <p class="text-gray-700 font-medium">{{ message }}</p>
40
+ <pre class="text-xs text-gray-600 mt-2 whitespace-pre-wrap">{{ train_output }}</pre>
41
+ </div>
42
+ {% endif %}
43
+
44
+ <!-- Feedback log -->
45
+ <h2 class="text-xl font-semibold mb-3">📜 Recent Feedback</h2>
46
+ <div class="max-h-96 overflow-y-auto border rounded p-3 bg-gray-50">
47
+ {% for fb in feedback_data %}
48
+ <div class="mb-3 border-b pb-2">
49
+ <p class="text-sm"><strong>🕓</strong> {{ fb.timestamp }}</p>
50
+ <p class="text-sm"><strong>❓</strong> {{ fb.question }}</p>
51
+ <p class="text-sm"><strong>💬</strong> {{ fb.answer }}</p>
52
+ <p class="text-sm">
53
+ <strong>✅</strong> {{ fb.correctness|capitalize }}
54
+ </p>
55
+ </div>
56
+ {% endfor %}
57
+ </div>
58
+
59
+ <div class="text-center mt-8">
60
+ <a href="/" class="text-blue-600 hover:underline">← Back to Main Interface</a>
61
+ </div>
62
+ </div>
63
+ </body>
64
+ </html>
app/templates/index.html ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <title>VDoc RAG - Web UI</title>
6
+ <script src="https://cdn.tailwindcss.com"></script>
7
+ </head>
8
+ <body class="bg-blue-100 text-blue-800 min-h-screen flex flex-col items-center justify-center">
9
+ <div class="bg-white shadow-lg rounded-xl p-8 w-full max-w-2xl">
10
+ <h1 class="text-2xl font-bold text-center mb-6 text-blue-800">📄 VDoc RAG Web Interface</h1>
11
+
12
+ <!-- Upload Form -->
13
+ <form action="/upload" method="post" enctype="multipart/form-data" class="flex flex-col items-center space-y-3 mb-6">
14
+ <input type="file" name="file" accept=".pdf,.png,.jpg,.jpeg" required class="border p-2 rounded w-full">
15
+ <button type="submit" class="bg-blue-600 text-white px-4 py-2 rounded hover:bg-blue-700">Upload & Index</button>
16
+ </form>
17
+
18
+ <div class="mt-6 text-center">
19
+ <a href="/feedback_dashboard"
20
+ class="text-blue-700 font-semibold hover:underline">
21
+ 🧠 Open Feedback Dashboard
22
+ </a>
23
+ </div>
24
+
25
+ <div class="mt-3 text-center">
26
+ <a href="/benchmark_dashboard"
27
+ class="text-blue-700 font-semibold hover:underline">
28
+ 📊 Open Benchmark Dashboard
29
+ </a>
30
+ </div>
31
+
32
+ <!-- Cache Clear Button -->
33
+ <form action="/clear_cache" method="post" class="mb-6">
34
+ <button type="submit"
35
+ class="bg-red-600 text-white px-4 py-2 rounded hover:bg-red-700 w-full">
36
+ 🧹 Clear Cache
37
+ </button>
38
+ </form>
39
+
40
+ <!-- Clear Persistent Index Button -->
41
+ <form action="/clear_index" method="post" class="mb-6">
42
+ <button type="submit"
43
+ class="bg-orange-600 text-white px-4 py-2 rounded hover:bg-orange-700 w-full">
44
+ 🗑️ Clear Persistent Index
45
+ </button>
46
+ </form>
47
+
48
+ {% if uploaded %}
49
+ <p class="text-green-600 font-semibold mb-4">Uploaded files: {{ uploaded|join(', ') }}</p>
50
+ {% endif %}
51
+
52
+ <!-- Ask Question -->
53
+ <form action="/ask" method="post" class="space-y-3 mb-4">
54
+ <input type="text" name="question" placeholder="Ask a question about your document..." required
55
+ class="border rounded w-full p-2 focus:outline-none focus:ring-2 focus:ring-blue-400">
56
+ <button type="submit" class="bg-blue-600 text-white px-4 py-2 rounded hover:bg-blue-700 w-full">Ask</button>
57
+ </form>
58
+
59
+ {% if chunks %}
60
+ <div style="margin-top: 2em;">
61
+ <h3>🔍 Retrieved Chunks (Used in Prompt)</h3>
62
+ <ul>
63
+ {% for c in chunks %}
64
+ <li class="border-b border-gray-200 py-2">
65
+ <strong>[{{ c.index }}]</strong>
66
+ (Page: {{ c.page }}, BBox: {{ c.bbox }})<br>
67
+ <code>{{ c.text }}</code><br>
68
+ <span class="text-sm text-gray-500">🔹 Confidence: {{ c.confidence }}%</span>
69
+ </li>
70
+ {% endfor %}
71
+ </ul>
72
+ </div>
73
+ {% endif %}
74
+
75
+ <!-- Answer Section -->
76
+ {% if answer %}
77
+ <div class="bg-blue-50 border rounded-lg p-4 mt-4">
78
+ <h2 class="text-lg font-semibold text-blue-700 mb-2">Answer:</h2>
79
+ <p>{{ answer }}</p>
80
+ {% if sources %}
81
+ <h3 class="font-semibold mt-3">Sources:</h3>
82
+ <ul class="list-disc list-inside text-sm text-blue-700">
83
+ {% for s in sources %}
84
+ <li>Page {{ s.page }} → {{ s.text[:100] }}...</li>
85
+ {% endfor %}
86
+ </ul>
87
+ {% endif %}
88
+ <!-- Feedback Section -->
89
+ <form action="/feedback" method="post" class="mt-3 flex space-x-2">
90
+ <input type="hidden" name="question" value="{{ question }}">
91
+ <input type="hidden" name="answer" value="{{ answer }}">
92
+ <button type="submit" name="correctness" value="correct"
93
+ class="bg-green-600 text-white px-3 py-1 rounded hover:bg-green-700">
94
+ ✅ Correct
95
+ </button>
96
+ <button type="submit" name="correctness" value="incorrect"
97
+ class="bg-red-600 text-white px-3 py-1 rounded hover:bg-red-700">
98
+ ❌ Incorrect
99
+ </button>
100
+ </form>
101
+ </div>
102
+ <div id="highlight-section">
103
+ {% if highlight_images %}
104
+ <h3>📄 Relevant PDF Pages:</h3>
105
+ <div id="highlight-gallery">
106
+ {% for img in highlight_images %}
107
+ <img src="{{ img }}?v={{ loop.index }}" class="highlight-img"
108
+ style="max-width:80%; margin:10px; border:3px solid red;" />
109
+ {% endfor %}
110
+ </div>
111
+ {% endif %}
112
+ </div>
113
+
114
+ {% if confidence_avg is defined %}
115
+ <p class="text-sm text-gray-600 mt-2">🧠 Average confidence: {{ confidence_avg }}%</p>
116
+ {% endif %}
117
+
118
+ <script>
119
+ // Clear old images before new ones are inserted
120
+ document.addEventListener("DOMContentLoaded", function() {
121
+ const form = document.querySelector("form[action='/ask']");
122
+ if (form) {
123
+ form.addEventListener("submit", () => {
124
+ const gallery = document.getElementById("highlight-gallery");
125
+ if (gallery) gallery.innerHTML = ""; // remove old images
126
+ });
127
+ }
128
+ });
129
+ </script>
130
+ {% endif %}
131
+ </div>
132
+ </body>
133
+ </html>
app/utils.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
def bbox_to_dict(bbox):
    """Turn a 4-item ``(x0, y0, x1, y1)`` box into a dict of ``int`` coordinates."""
    # Unpacking first keeps the "exactly four values" requirement.
    x0, y0, x1, y1 = bbox
    labels = ('x0', 'y0', 'x1', 'y1')
    return {label: int(value) for label, value in zip(labels, (x0, y0, x1, y1))}
app/visual_highlight.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import uuid
3
+ import json
4
+ import ast
5
+ from pdf2image import convert_from_path
6
+ from PIL import Image, ImageDraw
7
+
8
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
9
+
10
+
11
def load_calibration(config_path="highlight_calibration.json"):
    """Load calibration values from JSON or fallback to defaults.

    Args:
        config_path: Path of the calibration JSON file.

    Returns:
        The JSON object stored in the file. The defaults
        ``{"x_offset": 0, "x_scale": 1.0, "y_offset": 0, "y_scale": 1.0}``
        are returned when the file is missing, unreadable, not valid JSON,
        or does not contain a JSON object.
    """
    defaults = {"x_offset": 0, "x_scale": 1.0, "y_offset": 0, "y_scale": 1.0}

    if not os.path.exists(config_path):
        print("⚠️ No calibration file found. Using defaults.")
        return defaults

    try:
        with open(config_path, "r", encoding="utf-8") as f:
            calib = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both malformed JSON and undecodable bytes.
        print(f"⚠️ Could not read calibration file {config_path}: {e}. Using defaults.")
        return defaults

    if not isinstance(calib, dict):
        # Callers read the values with .get(); anything but an object is unusable.
        print(f"⚠️ Calibration file {config_path} does not contain a JSON object. Using defaults.")
        return defaults

    print(f"✅ Loaded calibration: {calib}")
    return calib
21
+
22
+
23
def _hit_page(hit):
    """Return the 1-based page number from a hit's metadata, or None if unusable."""
    page = (hit.get("metadata") or {}).get("page")
    try:
        page = int(page)
    except (TypeError, ValueError):
        return None
    return page if page >= 1 else None


def render_highlighted_pages(pdf_path, hits, output_dir=None, dpi=150):
    """
    Render PDF pages as images and highlight bounding boxes with calibration applied.
    Crops the output image tightly around highlighted area (+100 px padding).

    Only the first entry of *hits* is rendered. Every file already present in
    *output_dir* is deleted before new images are written.

    Args:
        pdf_path: Path of the PDF to rasterise.
        hits: Retrieval hits; ``metadata.page`` (1-based) and ``metadata.bbox``
            (four numbers, or their string form) are read from each.
        output_dir: Target directory; defaults to ``<BASE_DIR>/highlighted``.
        dpi: Rasterisation resolution passed to pdf2image.

    Returns:
        Paths of the PNG files written, one per rendered page. Hits without a
        usable page number, or pointing past the end of the PDF, are skipped.
    """
    if output_dir is None:
        output_dir = os.path.join(BASE_DIR, "highlighted")
    os.makedirs(output_dir, exist_ok=True)

    calib = load_calibration()
    X_OFFSET = calib.get("x_offset", 0)
    X_SCALE = calib.get("x_scale", 1.0)
    Y_OFFSET = calib.get("y_offset", 0)
    Y_SCALE = calib.get("y_scale", 1.0)

    # Clean previous outputs
    for old in os.listdir(output_dir):
        try:
            os.remove(os.path.join(output_dir, old))
        except Exception:
            pass

    hits = hits[:1]

    pages_to_render = sorted({p for p in (_hit_page(h) for h in hits) if p is not None})
    result_paths = []

    for page_num in pages_to_render:
        # Rasterise only the page that is needed instead of the whole document.
        rendered = convert_from_path(pdf_path, dpi=dpi, first_page=page_num, last_page=page_num)
        if not rendered:
            print(f"[WARN] Page {page_num} not found in {pdf_path}")
            continue

        img = rendered[0].convert("RGBA")
        w_img, h_img = img.size
        overlay = Image.new("RGBA", img.size, (255, 255, 255, 0))
        draw = ImageDraw.Draw(overlay)
        page_bboxes = []

        for h in hits:
            if _hit_page(h) != page_num:
                continue
            meta = h.get("metadata") or {}
            bbox = meta.get("bbox")
            # Debug raw bbox
            print(f"[DEBUG] page {page_num} raw bbox type: {type(bbox)} value: {bbox}")

            # Safe parsing: accept list/tuple or stringified list
            try:
                if isinstance(bbox, str):
                    bbox = ast.literal_eval(bbox)
                if not bbox or not isinstance(bbox, (list, tuple)) or len(bbox) != 4:
                    print(f"[WARN] Invalid bbox for page {page_num}: {bbox}")
                    continue
                # Apply calibration
                x0, y0, x1, y1 = [float(v) for v in bbox]
                x0 = x0 * X_SCALE + X_OFFSET
                x1 = x1 * X_SCALE + X_OFFSET
                y0 = y0 * Y_SCALE + Y_OFFSET
                y1 = y1 * Y_SCALE + Y_OFFSET
            except Exception as e:
                print(f"[ERROR] Failed to parse bbox for page {page_num}: {bbox} -> {e}")
                continue

            # Normalise corner order and clamp to the image area.
            left, top = max(0, min(x0, x1)), max(0, min(y0, y1))
            right, bottom = min(w_img, max(x0, x1)), min(h_img, max(y0, y1))

            if right <= left or bottom <= top:
                continue

            page_bboxes.append((left, top, right, bottom))
            draw.rectangle(
                [left, top, right, bottom],
                outline=(255, 0, 0),
                width=4,
                fill=(255, 0, 0, 100)
            )

        # Merge highlights with image
        highlighted = Image.alpha_composite(img, overlay)

        # --- 🧭 Crop around highlighted region (+100px padding) ---
        if page_bboxes:
            min_x = min(b[0] for b in page_bboxes)
            min_y = min(b[1] for b in page_bboxes)
            max_x = max(b[2] for b in page_bboxes)
            max_y = max(b[3] for b in page_bboxes)

            pad = 100
            crop_box = (
                max(0, int(min_x - pad)),
                max(0, int(min_y - pad)),
                int(min(max_x + pad, w_img)),
                int(min(max_y + pad, h_img)),
            )

            cropped = highlighted.crop(crop_box)
        else:
            cropped = highlighted  # fallback if no bbox

        # Log how many boxes were drawn
        print(f"✅ Drew {len(page_bboxes)} boxes on page {page_num}")

        out_path = os.path.join(output_dir, f"highlight_page{page_num}_{uuid.uuid4().hex}.png")
        cropped.convert("RGB").save(out_path)
        result_paths.append(out_path)

        print(f"✅ Highlighted and cropped page {page_num}: {out_path}")

    return result_paths
130
+
131
+
132
+ # Example usage
133
if __name__ == "__main__":
    # Manual check: highlight one hard-coded box on page 2 of the sample PDF.
    hits = [{"metadata": {"page": 2, "bbox": [87, 222, 592, 250], "type": "text"}}]
    sample_pdf = "samples/vdoc_rag_test.pdf"
    render_highlighted_pages(sample_pdf, hits)
highlight_calibration.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "x_offset": -33.0,
3
+ "x_scale": 1.0,
4
+ "y_offset": -65.0,
5
+ "y_scale": 1.02
6
+ }
notebooks/evaluate_embeddings.ipynb ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "55b021d5",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Embedding & Retrieval Evaluation\n",
9
+ "\n",
10
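+ "This notebook benchmarks embedding models and chunk sizes for retrieval quality. Chunks come from the project's PDF ingestion pipeline (or a small built-in fallback corpus) and are ranked in memory by cosine similarity, with collected feedback serving as a small labeled set. Metrics: Precision@K, Recall@K, and MRR."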
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": null,
16
+ "id": "18518993",
17
+ "metadata": {},
18
+ "outputs": [
19
+ {
20
+ "ename": "",
21
+ "evalue": "",
22
+ "output_type": "error",
23
+ "traceback": [
24
+ "\u001b[1;31mFailed to start the Kernel. \n",
25
+ "\u001b[1;31mPermissionError: [WinError 5] Access is denied: 'C:\\\\Users\\\\abhin\\\\.ipython\\\\profile_default\\\\security'. \n",
26
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
27
+ ]
28
+ }
29
+ ],
30
+ "source": [
31
+ "# Standard imports\n",
32
+ "import os\n",
33
+ "import json\n",
34
+ "import numpy as np\n",
35
+ "import pandas as pd\n",
36
+ "from tqdm import tqdm\n",
37
+ "from sentence_transformers import SentenceTransformer\n",
38
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
39
+ "import matplotlib.pyplot as plt\n",
40
+ "\n",
41
+ "# Project imports (uses your existing pipeline)\n",
42
+ "from app.feedback_manager import _load_feedback\n",
43
+ "from app.ingest import process_pdf\n",
44
+ "from app.embeddings import TextImageEmbedder\n",
45
+ "\n",
46
+ "# Config\n",
47
+ "BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))\n",
48
+ "PDF_PATH = os.path.join(BASE_DIR, \"samples\", \"vdoc_rag_test.pdf\") # replace with a real sample PDF path\n",
49
+ "STORAGE_DIR = os.path.join(BASE_DIR, \"storage\", \"chroma_db\")\n",
50
+ "\n",
51
+ "MODELS_TO_TEST = [\n",
52
+ " \"all-MiniLM-L6-v2\",\n",
53
+ " \"multi-qa-MiniLM-L6-cos-v1\",\n",
54
+ " \"paraphrase-MiniLM-L3-v2\",\n",
55
+ " os.path.join(BASE_DIR, \"models\", \"vdoc_feedback_tuned\", \"latest\"),\n",
56
+ "]\n",
57
+ "CHUNK_SIZES = [200, 500, 800] # in characters\n",
58
+ "TOP_K = 5\n",
59
+ "\n",
60
+ "print(\"Notebook configured. If the tuned model path does not exist, it will be skipped in runs.\")"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "execution_count": null,
66
+ "id": "863ba97b",
67
+ "metadata": {},
68
+ "outputs": [],
69
+ "source": [
70
+ "# Load feedback (if available)\n",
71
+ "feedback = _load_feedback()\n",
72
+ "print(f\"Loaded {len(feedback)} feedback entries.\")\n",
73
+ "if feedback:\n",
74
+ " sample_queries = [f['question'] for f in feedback]\n",
75
+ " sample_answers = [f['answer'] for f in feedback]\n",
76
+ "else:\n",
77
+ " # fallback small test set\n",
78
+ " sample_queries = [\n",
79
+ " \"What is the trend in yearly sales?\",\n",
80
+ " \"Who scored highest in the table?\",\n",
81
+ " \"What is the event date?\",\n",
82
+ " ]\n",
83
+ " sample_answers = [\"increasing\", \"Charlie\", \"November 20, 2025\"]\n",
84
+ "\n",
85
+ "# Small helper to preview feedback structure\n",
86
+ "if feedback:\n",
87
+ " display(pd.DataFrame(feedback)[['timestamp','question','answer','correctness']].tail(10))"
88
+ ]
89
+ },
90
+ {
91
+ "cell_type": "code",
92
+ "execution_count": null,
93
+ "id": "ad0fcb6c",
94
+ "metadata": {},
95
+ "outputs": [],
96
+ "source": [
97
+ "# Helper: process the PDF into chunks (optional - heavy).\n",
98
+ "def load_chunks(pdf_path):\n",
99
+ " if not os.path.exists(pdf_path):\n",
100
+ " raise FileNotFoundError(f\"PDF not found: {pdf_path}\")\n",
101
+ " print(\"Processing PDF into chunks (this may take a while)...\")\n",
102
+ " docs = process_pdf(pdf_path)\n",
103
+ " texts = [d['text'] for d in docs]\n",
104
+ " return texts\n",
105
+ "\n",
106
+ "# Try to load sample chunks if available, otherwise create toy chunks from feedback answers\n",
107
+ "try:\n",
108
+ " chunks = load_chunks(PDF_PATH)\n",
109
+ " print(f\"Total chunks from PDF: {len(chunks)}\")\n",
110
+ "except Exception as e:\n",
111
+ " print(\"Could not process PDF, falling back to feedback-derived tiny corpus:\", e)\n",
112
+ " # fallback corpus built from sample answers/queries for quick runs\n",
113
+ " chunks = [\n",
114
+ " \"Yearly sales have been increasing steadily from 2018 to 2024, with a notable jump in 2021.\",\n",
115
+ " \"Charlie achieved the highest score in the table with 98 points.\",\n",
116
+ " \"The event will be held on November 20, 2025 at the downtown auditorium.\",\n",
117
+ " ]\n",
118
+ " print(f\"Using fallback chunks: {len(chunks)} items\")"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "code",
123
+ "execution_count": null,
124
+ "id": "5c8b6ffe",
125
+ "metadata": {},
126
+ "outputs": [],
127
+ "source": [
128
+ "# Evaluation function (Precision@K, Recall@K, MRR)\n",
129
+ "def evaluate_model(model_name, chunks, queries, answers, chunk_size, top_k=TOP_K):\n",
130
+ " print(f\"\\n🧠 Evaluating {model_name} (chunk size {chunk_size})\")\n",
131
+ " # Skip model if path does not exist (for tuned model)\n",
132
+ " if os.path.isabs(model_name) and not os.path.exists(model_name):\n",
133
+ " print(f\"- Skipping (path not found): {model_name}\")\n",
134
+ " return None\n",
135
+ "\n",
136
+ " model = SentenceTransformer(model_name)\n",
137
+ "\n",
138
+ " # Split chunks by size\n",
139
+ " split_chunks = []\n",
140
+ " for ch in chunks:\n",
141
+ " for i in range(0, len(ch), chunk_size):\n",
142
+ " split_chunks.append(ch[i:i+chunk_size])\n",
143
+ " chunk_embeddings = model.encode(split_chunks, normalize_embeddings=True, show_progress_bar=False)\n",
144
+ "\n",
145
+ " precision_scores, recall_scores, mrr_scores = [], [], []\n",
146
+ "\n",
147
+ " # Precompute reference counts for recall denominator\n",
148
+ " total_relevant_counts = []\n",
149
+ " for ans in answers:\n",
150
+ " total_relevant_counts.append(sum(1 for c in split_chunks if ans.lower() in c.lower()))\n",
151
+ "\n",
152
+ " for q, ans in tqdm(list(zip(queries, answers)), total=len(queries), desc=f\"Evaluating {model_name}\"):\n",
153
+ " qvec = model.encode([q], normalize_embeddings=True)\n",
154
+ " sims = cosine_similarity(qvec, chunk_embeddings)[0]\n",
155
+ " top_indices = np.argsort(sims)[::-1][:top_k]\n",
156
+ " retrieved_chunks = [split_chunks[i] for i in top_indices]\n",
157
+ "\n",
158
+ " relevant = [1 if ans.lower() in c.lower() else 0 for c in retrieved_chunks]\n",
159
+ " precision = sum(relevant) / top_k\n",
160
+ " recall = sum(relevant) / max(1, total_relevant_counts.pop(0))\n",
161
+ " mrr = 0.0\n",
162
+ " for rank, rel in enumerate(relevant, start=1):\n",
163
+ " if rel == 1:\n",
164
+ " mrr = 1.0 / rank\n",
165
+ " break\n",
166
+ "\n",
167
+ " precision_scores.append(precision)\n",
168
+ " recall_scores.append(recall)\n",
169
+ " mrr_scores.append(mrr)\n",
170
+ "\n",
171
+ " return {\n",
172
+ " \"model\": model_name,\n",
173
+ " \"chunk_size\": chunk_size,\n",
174
+ " \"precision\": float(np.mean(precision_scores)),\n",
175
+ " \"recall\": float(np.mean(recall_scores)),\n",
176
+ " \"mrr\": float(np.mean(mrr_scores)),\n",
177
+ " }"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "code",
182
+ "execution_count": null,
183
+ "id": "ca934bfc",
184
+ "metadata": {},
185
+ "outputs": [],
186
+ "source": [
187
+ "# Run evaluation across models and chunk sizes\n",
188
+ "results = []\n",
189
+ "for model_name in MODELS_TO_TEST:\n",
190
+ " for cs in CHUNK_SIZES:\n",
191
+ " res = evaluate_model(model_name, chunks, sample_queries, sample_answers, cs)\n",
192
+ " if res:\n",
193
+ " results.append(res)\n",
194
+ "\n",
195
+ "df = pd.DataFrame(results)\n",
196
+ "if not df.empty:\n",
197
+ " display(df)\n",
198
+ "else:\n",
199
+ " print(\"No results to show (models may have been skipped).\")"
200
+ ]
201
+ },
202
+ {
203
+ "cell_type": "code",
204
+ "execution_count": null,
205
+ "id": "e6f75729",
206
+ "metadata": {},
207
+ "outputs": [],
208
+ "source": [
209
+ "# Visualization\n",
210
+ "if not df.empty:\n",
211
+ " plt.figure(figsize=(8,5))\n",
212
+ " for m in df['model'].unique():\n",
213
+ " subset = df[df['model'] == m]\n",
214
+ " plt.plot(subset['chunk_size'], subset['precision'], marker='o', label=f\"{m} (Precision)\")\n",
215
+ " plt.title('Precision@5 vs Chunk Size')\n",
216
+ " plt.xlabel('Chunk Size (characters)')\n",
217
+ " plt.ylabel('Precision@5')\n",
218
+ " plt.legend()\n",
219
+ " plt.grid(True)\n",
220
+ " plt.show()\n",
221
+ "\n",
222
+ " plt.figure(figsize=(8,5))\n",
223
+ " for m in df['model'].unique():\n",
224
+ " subset = df[df['model'] == m]\n",
225
+ " plt.plot(subset['chunk_size'], subset['recall'], marker='s', label=f\"{m} (Recall)\")\n",
226
+ " plt.title('Recall@5 vs Chunk Size')\n",
227
+ " plt.xlabel('Chunk Size (characters)')\n",
228
+ " plt.ylabel('Recall@5')\n",
229
+ " plt.legend()\n",
230
+ " plt.grid(True)\n",
231
+ " plt.show()"
232
+ ]
233
+ },
234
+ {
235
+ "cell_type": "code",
236
+ "execution_count": null,
237
+ "id": "249d2857",
238
+ "metadata": {},
239
+ "outputs": [],
240
+ "source": [
241
+ "# Save results to CSV for reporting\n",
242
+ "output_csv = os.path.join(BASE_DIR, 'notebooks', 'embedding_benchmark_results.csv')\n",
243
+ "if not df.empty:\n",
244
+ " df.to_csv(output_csv, index=False)\n",
245
+ " print(f\"✅ Benchmark results saved to {output_csv}\")\n",
246
+ "else:\n",
247
+ " print(\"No data to save.\")"
248
+ ]
249
+ }
250
+ ],
251
+ "metadata": {
252
+ "kernelspec": {
253
+ "display_name": "Python 3",
254
+ "language": "python",
255
+ "name": "python3"
256
+ },
257
+ "language_info": {
258
+ "name": "python",
259
+ "version": "3.13.2"
260
+ }
261
+ },
262
+ "nbformat": 4,
263
+ "nbformat_minor": 5
264
+ }
requirements.txt ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ python-multipart
4
+ pdf2image
5
+ pdfplumber
6
+ pytesseract
7
+ Pillow
8
+ sentence-transformers
9
+ transformers
10
+ torch
11
+ chromadb
12
+ numpy
13
+ pandas
14
+ aiofiles
15
+ openai
16
+ layoutparser
17
+ opencv-python-headless
18
+ matplotlib
19
+ scikit-learn
20
+ google-genai
21
+ python-dotenv
22
+ jinja2
samples/vdoc_rag_test.pdf ADDED
Binary file (52.1 kB). View file
 
test.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import uuid
3
+ import json
4
+ from pdf2image import convert_from_path
5
+ from PIL import Image, ImageDraw
6
+
7
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
8
+
9
+
10
def load_calibration(config_path="highlight_calibration.json"):
    """Load highlight calibration values from a JSON file.

    Args:
        config_path: Path to the calibration JSON file. A relative path is
            resolved against the current working directory.

    Returns:
        dict: The parsed calibration mapping when the file exists and contains
        a JSON object. Otherwise the identity calibration
        ``{"x_offset": 0, "x_scale": 1.0, "y_offset": 0, "y_scale": 1.0}``.
    """
    defaults = {"x_offset": 0, "x_scale": 1.0, "y_offset": 0, "y_scale": 1.0}

    if not os.path.exists(config_path):
        print("⚠️ No calibration file found. Using defaults.")
        return defaults

    try:
        with open(config_path, "r", encoding="utf-8") as f:
            calib = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError; an
        # unreadable calibration file should not abort rendering.
        print(f"⚠️ Could not read calibration file {config_path}: {e}. Using defaults.")
        return defaults

    if not isinstance(calib, dict):
        # Callers use calib.get(...), so anything but a JSON object is unusable.
        print(f"⚠️ Calibration file {config_path} does not contain a JSON object. Using defaults.")
        return defaults

    print(f"✅ Loaded calibration: {calib}")
    return calib
20
+
21
+
22
def render_highlighted_pages(pdf_path, hits, output_dir=None, dpi=150, pad=100):
    """
    Render PDF pages as images and highlight calibrated bounding boxes.

    Each page referenced by ``hits`` is rasterised, every usable bbox on that
    page is drawn as a translucent red rectangle, and the result is cropped to
    the union of the page's boxes plus ``pad`` pixels on every side (clamped to
    the page). A page with no usable bbox is saved uncropped.

    Args:
        pdf_path: Path of the PDF to render.
        hits: Iterable of dicts shaped like
            ``{"metadata": {"page": <1-based int>, "bbox": [x0, y0, x1, y1]}}``.
            After calibration the bbox values are treated as pixel coordinates
            of the rendered page image.
        output_dir: Directory for the PNGs; defaults to ``BASE_DIR/highlighted``.
            Files already in this directory are deleted before rendering.
        dpi: Resolution passed to ``convert_from_path``.
        pad: Padding in pixels added around the highlighted region when cropping.

    Returns:
        list[str]: Paths of the PNG files written, in ascending page order.
    """
    if output_dir is None:
        output_dir = os.path.join(BASE_DIR, "highlighted")
    os.makedirs(output_dir, exist_ok=True)

    calib = load_calibration()
    x_offset = calib.get("x_offset", 0)
    x_scale = calib.get("x_scale", 1.0)
    y_offset = calib.get("y_offset", 0)
    y_scale = calib.get("y_scale", 1.0)

    # Clean previous outputs (best effort: report what cannot be removed,
    # e.g. sub-directories or locked files, but keep going).
    for old in os.listdir(output_dir):
        try:
            os.remove(os.path.join(output_dir, old))
        except OSError as e:
            print(f"[WARN] Could not remove old output {old}: {e}")

    # hits is walked once per page below, so materialise one-shot iterables.
    hits = list(hits)
    pages_to_render = sorted({h["metadata"]["page"] for h in hits})
    if not pages_to_render:
        return []

    pdf_images = convert_from_path(pdf_path, dpi=dpi)
    result_paths = []

    for page_num in pages_to_render:
        page_index = page_num - 1
        # Guard explicitly: a page of 0 or less would otherwise wrap around via
        # negative indexing and silently highlight the wrong page.
        if not 0 <= page_index < len(pdf_images):
            print(f"[ERROR] Page {page_num} is out of range (PDF has {len(pdf_images)} pages); skipping.")
            continue

        img = pdf_images[page_index].convert("RGBA")
        w_img, h_img = img.size
        overlay = Image.new("RGBA", img.size, (255, 255, 255, 0))
        draw = ImageDraw.Draw(overlay)
        page_bboxes = []

        for h in hits:
            meta = h["metadata"]
            if meta["page"] != page_num:
                continue
            bbox = meta.get("bbox")
            if not bbox or len(bbox) != 4:
                continue

            try:
                x0, y0, x1, y1 = [float(v) for v in bbox]
            except (TypeError, ValueError) as e:
                print(f"[ERROR] Failed to parse bbox for page {page_num}: {bbox} -> {e}")
                continue

            # Apply calibration
            x0 = x0 * x_scale + x_offset
            x1 = x1 * x_scale + x_offset
            y0 = y0 * y_scale + y_offset
            y1 = y1 * y_scale + y_offset

            # Normalise corner order and clamp to the page image.
            left, top = max(0, min(x0, x1)), max(0, min(y0, y1))
            right, bottom = min(w_img, max(x0, x1)), min(h_img, max(y0, y1))

            # Nothing left to draw once the box is clamped to the page.
            if right <= left or bottom <= top:
                continue

            page_bboxes.append((left, top, right, bottom))
            draw.rectangle(
                [left, top, right, bottom],
                outline=(255, 0, 0),
                width=4,
                fill=(255, 0, 0, 100),
            )

        # Merge highlights with image
        highlighted = Image.alpha_composite(img, overlay)

        # Crop around the union of all highlighted boxes (+pad px on each side).
        if page_bboxes:
            min_x = min(b[0] for b in page_bboxes)
            min_y = min(b[1] for b in page_bboxes)
            max_x = max(b[2] for b in page_bboxes)
            max_y = max(b[3] for b in page_bboxes)

            crop_box = (
                max(0, int(min_x - pad)),
                max(0, int(min_y - pad)),
                int(min(max_x + pad, w_img)),
                int(min(max_y + pad, h_img)),
            )
            cropped = highlighted.crop(crop_box)
        else:
            cropped = highlighted  # fallback if no bbox

        out_path = os.path.join(output_dir, f"highlight_page{page_num}_{uuid.uuid4().hex}.png")
        cropped.convert("RGB").save(out_path)
        result_paths.append(out_path)

        print(f"✅ Highlighted and cropped page {page_num}: {out_path}")

    return result_paths
114
+
115
+
116
# Example usage: one text hit on each of the first two pages of the sample PDF.
if __name__ == "__main__":
    demo_boxes = {
        1: [87, 1926, 775, 1957],
        2: [87, 222, 592, 250],
    }
    demo_hits = [
        {"metadata": {"page": page, "bbox": box, "type": "text"}}
        for page, box in demo_boxes.items()
    ]

    render_highlighted_pages("samples/vdoc_rag_test.pdf", demo_hits)
train_feedback_embeddings.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ train_feedback_embeddings.py
3
+ Fine-tune the VDoc-RAG embedding model using stored user feedback.
4
+
5
+ Place this file at the repository root and run:
6
+
7
+ python train_feedback_embeddings.py
8
+
9
+ It will load feedback from `app/feedback.json`, prepare training pairs, fine-tune a
10
+ SentenceTransformer model, and save checkpoints under `models/vdoc_feedback_tuned/`.
11
+ """
12
+ import os
13
+ import json
14
+ from datetime import datetime
15
+ from torch.utils.data import DataLoader
16
+
17
+ try:
18
+ from sentence_transformers import SentenceTransformer, InputExample, losses
19
+ except Exception as e:
20
+ raise ImportError("Please install sentence-transformers and torch to run this script: pip install sentence-transformers torch")
21
+
22
# --- Paths ---
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
APP_DIR = os.path.join(BASE_DIR, "app")
FEEDBACK_PATH = os.path.join(APP_DIR, "feedback.json")
OUTPUT_DIR = os.path.join(BASE_DIR, "models", "vdoc_feedback_tuned")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Step 1: Load Feedback ---
if not os.path.exists(FEEDBACK_PATH):
    raise FileNotFoundError(f"❌ No feedback.json found at {FEEDBACK_PATH}")

with open(FEEDBACK_PATH, "r", encoding="utf-8") as f:
    feedback = json.load(f)

if not feedback:
    raise ValueError("⚠️ feedback.json is empty — collect feedback first!")

# --- Step 2: Prepare Training Data ---
# Each usable entry becomes a (question, answer) pair labelled 1.0 when the
# answer was marked "correct" and 0.0 when it was marked "incorrect".
train_examples = []
for fb in feedback:
    # `or ""` also covers keys that are present but null in the JSON, which
    # `.get(key, "")` alone would pass through as None.
    question = (fb.get("question") or "").strip()
    answer = (fb.get("answer") or "").strip()
    correctness = (fb.get("correctness") or "").lower()
    if not question or not answer:
        continue
    if correctness not in ("correct", "incorrect"):
        continue
    label = 1.0 if correctness == "correct" else 0.0
    train_examples.append(InputExample(texts=[question, answer], label=label))

if len(train_examples) < 5:
    raise ValueError(f"⚠️ Too few feedback entries ({len(train_examples)}). Need at least 5 to fine-tune meaningfully.")

print(f"✅ Loaded {len(train_examples)} feedback samples for training.")

# --- Step 3: Load Base Model ---
base_model = os.environ.get("VDOCRAG_FEEDBACK_BASE", "all-MiniLM-L6-v2")
print(f"📦 Loading base model: {base_model}")
model = SentenceTransformer(base_model)

# --- Step 4: Create DataLoader and Loss ---
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)
train_loss = losses.CosineSimilarityLoss(model)

# --- Step 5: Train ---
# Cap warm-up at 10% of the run (and at the previous fixed value of 10). With
# only a handful of feedback samples the whole run is one or two optimizer
# steps, and a fixed 10-step warm-up would never let the learning rate ramp up.
epochs = 1
total_steps = len(train_dataloader) * epochs
warmup_steps = min(10, total_steps // 10)

print("🚀 Starting fine-tuning...")
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    show_progress_bar=True,
)

# --- Step 6: Save Fine-tuned Model ---
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
save_path = os.path.join(OUTPUT_DIR, f"checkpoint_{timestamp}")
os.makedirs(save_path, exist_ok=True)
model.save(save_path)
print(f"✅ Fine-tuned model saved at: {save_path}")

# --- Step 7: Create "latest" symlink / pointer ---
import shutil

latest_path = os.path.join(OUTPUT_DIR, "latest")
try:
    # Check islink first: os.path.exists() is False for a dangling symlink
    # (e.g. the previous checkpoint was deleted), which would leave the stale
    # link in place and make both the symlink and the copy fallback fail.
    if os.path.islink(latest_path):
        os.unlink(latest_path)
    elif os.path.lexists(latest_path):
        shutil.rmtree(latest_path)
    os.symlink(save_path, latest_path, target_is_directory=True)
    print(f"🔗 Symlink created: {latest_path} → {save_path}")
except (OSError, NotImplementedError):
    # On Windows, symlink may fail — copy instead
    if os.path.islink(latest_path):
        os.unlink(latest_path)
    elif os.path.lexists(latest_path):
        shutil.rmtree(latest_path, ignore_errors=True)
    shutil.copytree(save_path, latest_path)
    print(f"📁 Copied model to {latest_path} (symlink not supported).")

print("\n🎉 Training complete! Your VDoc-RAG can now use:")
print(" models/vdoc_feedback_tuned/latest/")