Spaces:

adi9128
/

vdoc_rag

Sleeping

App Files Files Community

aditya9128 committed on Mar 10

Commit

4e3cee0

0 Parent(s):

Initial commit: VDoc-RAG - Intelligent Document Q&A with RAG

Browse files

Files changed (31) hide show

.env.example +2 -0
.gitignore +231 -0
Dockerfile +43 -0
HR_TESTING_GUIDE.md +63 -0
README.md +111 -0
README_HF.md +35 -0
app/cache_manager.py +46 -0
app/calibrate.py +121 -0
app/chart_detect.py +125 -0
app/chart_reasoner.py +232 -0
app/debug_chunks.json +182 -0
app/embeddings.py +30 -0
app/feedback.json +30 -0
app/feedback_manager.py +50 -0
app/highlight_calibration.json +6 -0
app/indexer.py +131 -0
app/ingest.py +214 -0
app/main.py +451 -0
app/reader.py +145 -0
app/tables.py +58 -0
app/templates/benchmark_dashboard.html +81 -0
app/templates/feedback_dashboard.html +64 -0
app/templates/index.html +133 -0
app/utils.py +3 -0
app/visual_highlight.py +138 -0
highlight_calibration.json +6 -0
notebooks/evaluate_embeddings.ipynb +264 -0
requirements.txt +22 -0
samples/vdoc_rag_test.pdf +0 -0
test.py +123 -0
train_feedback_embeddings.py +104 -0

.env.example ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # Copy this to .env and fill in your values
2	+ GEMINI_API_KEY=your_gemini_api_key_here

.gitignore ADDED Viewed

	@@ -0,0 +1,231 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[codz]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py.cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#uv.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+#poetry.toml
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#   pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+#   https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+#pdm.lock
+#pdm.toml
+.pdm-python
+.pdm-build/
+# pixi
+#   Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+#pixi.lock
+#   Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+#   in the .venv directory. It is recommended not to include this directory in version control.
+.pixi
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.envrc
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+# Abstra
+# Abstra is an AI-powered process automation framework.
+# Ignore directories containing user credentials, local state, and settings.
+# Learn more at https://abstra.io/docs
+.abstra/
+# Visual Studio Code
+#  Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+#  that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+#  and can be added to the global gitignore or merged into this file. However, if you prefer,
+#  you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+# Ruff stuff:
+.ruff_cache/
+# PyPI configuration file
+.pypirc
+# Cursor
+#  Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+#  exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+#  refer to https://docs.cursor.com/context/ignore-files
+.cursorignore
+.cursorindexingignore
+# Marimo
+marimo/_static/
+marimo/_lsp/
+__marimo__/
+# =========================
+# VDoc-RAG Project Specific
+# =========================
# Runtime/Generated directories
# Ignore the *contents* of each directory (trailing "/*") rather than the
# directory itself: git cannot re-include a file whose parent directory is
# excluded, so the .gitkeep negations below only work with this form.
app/cache/*
app/uploads/*
app/storage/*
app/charts/*
app/highlighted/*
app/tmp/*
app/tables/*
storage/
# Keep directory structure with .gitkeep
!app/cache/.gitkeep
!app/uploads/.gitkeep
!app/storage/.gitkeep
!app/charts/.gitkeep
!app/highlighted/.gitkeep
!app/tmp/.gitkeep
!app/tables/.gitkeep
*.png

Dockerfile ADDED Viewed

	@@ -0,0 +1,43 @@

+FROM python:3.11-slim
# Install system dependencies
# libgl1 provides libGL.so.1 (needed by OpenCV). It replaces libgl1-mesa-glx,
# a transitional package that current Debian releases no longer ship.
RUN apt-get update && apt-get install -y \
    tesseract-ocr \
    poppler-utils \
    libgl1 \
    libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*
+# Set working directory
+WORKDIR /app
+# Copy requirements first for better caching
+COPY requirements.txt .
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the rest of the application
+COPY . .
+# Create necessary directories with proper permissions for HF Spaces
+RUN mkdir -p /app/app/storage/chroma_db \
+    /app/app/uploads \
+    /app/app/tmp \
+    /app/app/highlighted \
+    /app/app/charts \
+    /app/app/tables \
+    /app/app/cache \
+    && chmod -R 777 /app/app/storage \
+    && chmod -R 777 /app/app/uploads \
+    && chmod -R 777 /app/app/tmp \
+    && chmod -R 777 /app/app/highlighted \
+    && chmod -R 777 /app/app/charts \
+    && chmod -R 777 /app/app/tables \
+    && chmod -R 777 /app/app/cache
+# Expose port (HF Spaces uses 7860)
+EXPOSE 7860
+# Run the application
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]

HR_TESTING_GUIDE.md ADDED Viewed

	@@ -0,0 +1,63 @@

+# VDoc-RAG Demo Testing Guide
+## What is this?
+VDoc-RAG is an AI-powered document Q&A system that can:
+- Extract text from PDFs (including charts and tables)
+- Answer questions about uploaded documents
+- Show confidence scores and source attribution
+---
+## How to Test
+### 1. Open the App
+Click the link provided: `[YOUR_NGROK_URL]`
+### 2. Upload a Document
+- Click **"Choose File"** and select any PDF
+- Click **"Upload & Index"**
+- Wait for: `✅ Uploaded and indexed [filename] (X chunks)`
+### 3. Ask Questions
+Try these example questions:
+- "What is this document about?"
+- "Summarize the main points"
+- "What are the key dates mentioned?"
+- "Describe any charts or graphs"
+### 4. Review the Response
+You'll see:
+- **Answer**: AI-generated response
+- **Sources**: Which parts of the document were used
+- **Confidence Score**: How relevant the retrieved content is
+### 5. Provide Feedback
+Click **✅ Correct** or **❌ Incorrect** to rate the answer
+---
+## Additional Features to Explore
+| Page | What it Shows |
+|------|---------------|
+| `/feedback_dashboard` | Feedback statistics and model fine-tuning |
+| `/benchmark_dashboard` | Embedding model evaluation metrics |
+---
+## Technical Highlights
+- **RAG Pipeline**: Retrieval-Augmented Generation with ChromaDB
+- **OCR**: Tesseract for text extraction from images/PDFs
+- **Embeddings**: Sentence-transformers (all-MiniLM-L6-v2)
+- **LLM**: Google Gemini for answer generation
+- **Chart Detection**: CLIP + OpenCV for visual understanding
+---
+## Sample Test PDF
+A sample document is pre-loaded. Upload your own PDF to test with real documents!
+---
+*Built with FastAPI, ChromaDB, Sentence-Transformers, and Google Gemini*

README.md ADDED Viewed

	@@ -0,0 +1,111 @@

+# 📄 VDoc-RAG (Visually-Rich Document Retrieval-Augmented Generation)
+VDoc-RAG is an advanced multimodal system that answers questions from visually-rich documents (PDFs, reports, flyers) by combining OCR, table and chart reasoning, semantic embeddings, and LLMs.
+---
+## 🚀 Features
+- 🧠 **RAG Pipeline** with persistent ChromaDB
+- 🪄 **OCR + Table + Chart understanding**
+- 📊 **Chart Reasoning** (Pix2Struct + OCR-based)
+- 🔐 **Environment-based API key handling**
+- 🧮 **Confidence Scoring** via cosine similarity
+- 🧾 **Feedback Loop** for self-improving embeddings
+- 📈 **Benchmark Dashboard** for evaluating embedding models
+- 💾 **Persistent Storage** (DuckDB + Parquet backend)
+---
+## ⚙️ Quickstart (Windows)
+### 1️⃣ Install Dependencies
+Install:
+- **Tesseract OCR** → [Tesseract Wiki](https://github.com/UB-Mannheim/tesseract/wiki)
+- **Poppler for Windows** → [Poppler Releases](https://github.com/oschwartz10612/poppler-windows/releases)
+Add both to your system PATH.
+### 2️⃣ Create Virtual Environment
+```bash
+python -m venv venv
+venv\Scripts\activate
+pip install -r requirements.txt
+```
+### 3️⃣ Run the App
+```bash
+uvicorn app.main:app --reload --port 8000
+```
+Open → [http://127.0.0.1:8000](http://127.0.0.1:8000)
+---
+## 🖥️ Web Interfaces
+| Route | Page | Description |
+|------|-------|--------------|
+| `/` | Main Interface | Upload, query, visualize highlights |
+| `/feedback_dashboard` | Feedback Loop | View stats, fine-tune model |
+| `/benchmark_dashboard` | Benchmarking | Evaluate embeddings (Precision/Recall/MRR) |
+---
+## 📁 Project Structure
+```
+vdoc-rag-mvp/
+├─ app/
+│  ├─ ingest.py              # OCR, table & chart extraction
+│  ├─ chart_reasoner.py      # Chart summarization and trend detection
+│  ├─ indexer.py             # Persistent ChromaDB retrieval
+│  ├─ reader.py              # LLM question answering
+│  ├─ feedback_manager.py    # Feedback collection system
+│  ├─ main.py                # FastAPI server + dashboards
+│  └─ visual_highlight.py    # Highlight relevant regions
+│
+├─ models/vdoc_feedback_tuned/  # Fine-tuned embedding model
+├─ storage/chroma_db/           # Persistent vector store
+├─ notebooks/evaluate_embeddings.ipynb  # Benchmarking notebook
+└─ templates/                   # HTML UIs (main, feedback, benchmark)
+```
+---
+## 🧠 Models Used
+| Type | Model | Purpose |
+|------|--------|----------|
+| Embedding | `all-MiniLM-L6-v2` (base), `multi-qa-MiniLM`, feedback-tuned variant | Semantic encoding |
+| LLM Reader | Gemini / DistilGPT2 | Context-based answering |
+| Chart Reasoning | Pix2Struct / OCR fallback | Visual trend analysis |
+| Vector Store | ChromaDB (DuckDB + Parquet) | Persistent retrieval |
+| Fine-tuning | SentenceTransformer + CosineLoss | Feedback-based learning |
+---
+## 🧩 Evaluation
+- **Confidence Scoring**: cosine similarity between query & chunks
+- **Precision / Recall / MRR**: benchmark dashboards & notebook
+- **Feedback-driven fine-tuning**: iterative model improvement
+---
+## 🧠 Author’s Note
+VDoc-RAG demonstrates how retrieval-augmented generation can evolve from plain text retrieval into **visually grounded document reasoning**, enabling future systems that can read, reason, and learn continuously.
+---
+**Developed as a full multimodal RAG research framework** — suitable for academic reports, enterprise document intelligence, and AI reasoning pipelines.
+**Key technologies:** Tesseract OCR, Pix2Struct, Sentence-Transformers (all-MiniLM-L6-v2), Gemini API

README_HF.md ADDED Viewed

	@@ -0,0 +1,35 @@

+---
+title: VDoc-RAG
+emoji: 📄
+colorFrom: blue
+colorTo: indigo
+sdk: docker
+pinned: false
+license: mit
+---
+# 📄 VDoc-RAG (Visually-Rich Document RAG)
+An AI-powered document Q&A system that answers questions from PDFs with charts, tables, and images.
+## Features
+- 🧠 **RAG Pipeline** with ChromaDB vector store
+- 📊 **Chart & Table Understanding** via OCR
+- 🔐 **Gemini LLM** for answer generation
+- 🧮 **Confidence Scoring** via cosine similarity
+- 🧾 **Feedback Loop** for improvement
+## How to Use
+1. Upload a PDF document
+2. Ask questions about the content
+3. Get AI-generated answers with sources
+## Tech Stack
+- FastAPI + Uvicorn
+- Sentence-Transformers (all-MiniLM-L6-v2)
+- ChromaDB
+- Google Gemini
+- Tesseract OCR

app/cache_manager.py ADDED Viewed

	@@ -0,0 +1,46 @@

+import os
+import hashlib
+import json
+import shutil
# Cached chunk files live in a "cache" folder next to this module.
# The folder is created at import time so later reads/writes can assume it exists.
_MODULE_DIR = os.path.dirname(__file__)
CACHE_DIR = os.path.join(_MODULE_DIR, "cache")
os.makedirs(CACHE_DIR, exist_ok=True)
+def _hash_file(path: str) -> str:
+    """Compute SHA256 fingerprint for a file."""
+    h = hashlib.sha256()
+    with open(path, "rb") as f:
+        while True:
+            chunk = f.read(8192)
+            if not chunk:
+                break
+            h.update(chunk)
+    return h.hexdigest()
def get_cache_path(pdf_path: str) -> str:
    """Return the cache file path for ``pdf_path``.

    The file name is the SHA-256 of the PDF's contents plus ``.json``, so
    the same bytes map to the same cache entry regardless of file name.
    """
    fingerprint = _hash_file(pdf_path)
    cache_name = f"{fingerprint}.json"
    return os.path.join(CACHE_DIR, cache_name)
def save_chunks_to_cache(pdf_path: str, chunks) -> str:
    """Serialize ``chunks`` as JSON into the cache entry for ``pdf_path``.

    The data is written to a temporary sibling file and then moved into
    place with ``os.replace``. If serialization fails part-way (for example
    ``chunks`` holds a non-JSON-serializable value), no truncated cache
    file is left behind for later loads to trip over.

    Returns:
        The path of the cache file that was written.

    Raises:
        TypeError / ValueError: if ``chunks`` cannot be encoded as JSON.
        OSError: if the PDF cannot be read or the cache cannot be written.
    """
    path = get_cache_path(pdf_path)
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(chunks, f, indent=2, ensure_ascii=False)
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temp file is gone; it only
        # lingers when something above raised.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    return path
def load_chunks_from_cache(pdf_path: str):
    """Return the cached chunks for ``pdf_path``, or ``None`` on a cache miss.

    A missing cache file is a miss. A cache file that cannot be decoded as
    UTF-8 JSON (for example one truncated by an interrupted write) is also
    treated as a miss, so the caller can rebuild it instead of crashing.

    Raises:
        OSError: if the PDF itself cannot be read for fingerprinting.
    """
    path = get_cache_path(pdf_path)
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return None
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses: the entry is corrupt, so report "not cached".
        return None
def clear_cache() -> bool:
    """Remove every cached entry and leave an empty cache directory behind.

    Removal errors are ignored. The directory is recreated afterwards so
    later saves still have somewhere to write. Always returns ``True``.
    """
    # Drop the whole tree in one go, then put the (now empty) folder back.
    shutil.rmtree(CACHE_DIR, ignore_errors=True)
    os.makedirs(CACHE_DIR, exist_ok=True)
    return True

app/calibrate.py ADDED Viewed

	@@ -0,0 +1,121 @@

+import fitz
+import matplotlib.pyplot as plt
+from matplotlib.widgets import Slider, Button
+from PIL import Image
+import io
+import json
+import os
# Both paths are relative to the directory the script is launched from.
pdf_path = "samples/vdoc_rag_test.pdf"
config_path = "highlight_calibration.json"

# Example hits whose bounding boxes are overlaid on the rendered pages.
hits = [
    {"metadata": {"page": 1, "bbox": [87, 1926, 775, 1957], "type": "text"}},
    {"metadata": {"page": 2, "bbox": [87, 222, 592, 250], "type": "text"}},
]

# Load the PDF and rasterise its first two pages at 150 DPI.
doc = fitz.open(pdf_path)
pix1, pix2 = (doc[n].get_pixmap(dpi=150) for n in (0, 1))
img1, img2 = (Image.open(io.BytesIO(p.tobytes("png"))) for p in (pix1, pix2))

# One figure, two pages side by side; the bottom quarter is kept free for sliders.
fig, axes = plt.subplots(1, 2, figsize=(16, 10))
plt.subplots_adjust(bottom=0.25)
for ax, page_img, label in zip(axes, (img1, img2), ("Page 1", "Page 2")):
    ax.imshow(page_img)
    ax.set_title(label, fontsize=12)
    ax.axis("off")

# Keep reference sizes
img1_w, img1_h = img1.size
img2_w, img2_h = img2.size

# Build one translucent red rectangle per hit and attach it to the axes of
# the page it belongs to. Hits on other pages are created but not shown.
rects_page1, rects_page2 = [], []
page_targets = {0: (rects_page1, axes[0]), 1: (rects_page2, axes[1])}
for h in hits:
    meta = h["metadata"]
    page_idx = meta["page"] - 1
    x0, y0, x1, y1 = (float(v) for v in meta["bbox"])
    rect = plt.Rectangle((x0, y0), x1 - x0, y1 - y0,
                         linewidth=2, edgecolor='r', facecolor='r', alpha=0.4)
    target = page_targets.get(page_idx)
    if target is not None:
        bucket, page_ax = target
        bucket.append(rect)
        page_ax.add_patch(rect)

# Shared sliders: a single offset/scale pair per axis drives both pages.
axcolor = 'lightgoldenrodyellow'
ax_x_offset = plt.axes([0.25, 0.12, 0.65, 0.03], facecolor=axcolor)
ax_x_scale  = plt.axes([0.25, 0.09, 0.65, 0.03], facecolor=axcolor)
ax_y_offset = plt.axes([0.25, 0.06, 0.65, 0.03], facecolor=axcolor)
ax_y_scale  = plt.axes([0.25, 0.03, 0.65, 0.03], facecolor=axcolor)
ax_save     = plt.axes([0.85, 0.17, 0.10, 0.04])

slider_x_offset = Slider(ax_x_offset, 'X Offset', -500, 500, valinit=0, valstep=0.5)
slider_x_scale  = Slider(ax_x_scale,  'X Scale',  0.3, 2.0, valinit=1.0, valstep=0.002)
slider_y_offset = Slider(ax_y_offset, 'Y Offset', -1500, 1500, valinit=0, valstep=0.5)
slider_y_scale  = Slider(ax_y_scale,  'Y Scale',  0.3, 2.0, valinit=1.0, valstep=0.002)
btn_save        = Button(ax_save, '💾 Save', color=axcolor, hovercolor='0.9')
def update(val):
    """Slider callback: re-position every highlight rectangle.

    Each rectangle is recomputed from its own hit's original bbox as
    ``coord * scale + offset`` on each axis, using the current slider
    values. Rectangles are paired with the hits of their page in order
    (they were created in hit order), so this keeps working when a page
    has more than one hit, instead of reusing one hard-coded hit per page.

    Args:
        val: New value of the slider that fired; unused because all four
            sliders are re-read on every call.
    """
    xo, xs = slider_x_offset.val, slider_x_scale.val
    yo, ys = slider_y_offset.val, slider_y_scale.val

    for page_no, rects in ((1, rects_page1), (2, rects_page2)):
        page_hits = [hit for hit in hits if hit["metadata"]["page"] == page_no]
        for patch, hit in zip(rects, page_hits):
            x0, y0, x1, y1 = (float(v) for v in hit["metadata"]["bbox"])
            x0 = x0 * xs + xo
            x1 = x1 * xs + xo
            y0 = y0 * ys + yo
            y1 = y1 * ys + yo
            # Anchored at the bottom edge with a negative height: this
            # covers the same y0..y1 span as anchoring at y0.
            patch.set_xy((x0, y1))
            patch.set_width(x1 - x0)
            patch.set_height(y0 - y1)

    fig.suptitle(
        f"Xo={xo:.1f}, Xs={xs:.3f} | Yo={yo:.1f}, Ys={ys:.3f}",
        fontsize=11, color='darkred'
    )
    fig.canvas.draw_idle()
# Any slider movement triggers a full redraw of the overlays.
for s in (slider_x_offset, slider_x_scale, slider_y_offset, slider_y_scale):
    s.on_changed(update)


def save_values(event):
    """Button callback: write the current slider values to ``config_path`` as JSON.

    Args:
        event: Matplotlib click event; unused.
    """
    calib = {
        "x_offset": slider_x_offset.val,
        "x_scale": slider_x_scale.val,
        "y_offset": slider_y_offset.val,
        "y_scale": slider_y_scale.val,
    }
    with open(config_path, "w") as out:
        json.dump(calib, out, indent=2)
    print(f"✅ Saved combined calibration: {calib}")


btn_save.on_clicked(save_values)

# Blocks until the calibration window is closed.
plt.show()

app/chart_detect.py ADDED Viewed

	@@ -0,0 +1,125 @@

+# app/chart_detect.py
+import cv2
+import os
+import uuid
+import numpy as np
+from PIL import Image
+import matplotlib.pyplot as plt
# Chart crops are written to a "charts" folder beside this module; the folder
# is created at import time so detect_charts() can write into it directly.
_THIS_FILE = os.path.abspath(__file__)
BASE_DIR = os.path.dirname(_THIS_FILE)
CHARTS_DIR = os.path.join(BASE_DIR, "charts")
os.makedirs(CHARTS_DIR, exist_ok=True)
+def _ensure_bgr(img_or_path):
+    """
+    Accept file path, PIL.Image, or ndarray → return OpenCV BGR ndarray.
+    """
+    if isinstance(img_or_path, str):
+        img = cv2.imread(img_or_path)
+        if img is None:
+            raise ValueError(f"[chart_detect] cv2.imread failed: {img_or_path}")
+        return img
+    if isinstance(img_or_path, Image.Image):
+        return cv2.cvtColor(np.array(img_or_path), cv2.COLOR_RGB2BGR)
+    if isinstance(img_or_path, np.ndarray):
+        return img
+    raise ValueError("[chart_detect] Unsupported image type.")
def detect_charts(image_or_path, min_area=15000, debug=False, visualize=False):
    """Detect chart-like rectangular regions in a page image.

    Pipeline: grayscale -> histogram equalization -> Gaussian blur -> Canny
    edges -> external contours. Each contour's bounding box is filtered by
    area, page coverage, and aspect ratio. Boxes whose top-left corner lies
    within 50 px of an already accepted box are merged into it. Every
    surviving box is then cropped and saved as a PNG in ``CHARTS_DIR``.

    Crops are written only after all merging is done, so each saved image
    matches the final ``bbox`` reported for it.

    Args:
        image_or_path: File path, PIL image, or ndarray (see ``_ensure_bgr``).
        min_area: Nominal minimum box area in pixels. The filter actually
            rejects boxes smaller than half this value.
        debug: Print contour and detection counts.
        visualize: Show the detections in a matplotlib window.

    Returns:
        A list of ``{"bbox": (x0, y0, x1, y1), "image_path": "<abs path>"}``
        dicts, largest area first. Empty if the image cannot be loaded.
        Boxes whose crop could not be saved are omitted.
    """
    try:
        img = _ensure_bgr(image_or_path)
    except Exception as e:
        print("[chart_detect] load error:", e)
        return []

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    gray = cv2.equalizeHist(gray)
    blur = cv2.GaussianBlur(gray, (5, 5), 0)
    # Canny edge detection - lowered thresholds for faint edges
    edges = cv2.Canny(blur, 30, 100)
    contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    h_img, w_img = img.shape[:2]

    if debug:
        print(f"[chart_detect] Found {len(contours)} raw contours")

    # Pass 1: collect candidate boxes, merging near-duplicates.
    boxes = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        aspect = w / (h + 1e-8)
        # Forgiving filtering: half of min_area is enough to qualify.
        if w * h < min_area * 0.5:
            continue
        # A box spanning (almost) the whole page is the page border, not a chart.
        if w > 0.98 * w_img or h > 0.98 * h_img:
            continue
        if not (0.1 < aspect < 10.0):
            continue

        for idx, (px0, py0, px1, py1) in enumerate(boxes):
            # Overlap or close enough: grow the existing box to cover both.
            if abs(x - px0) < 50 and abs(y - py0) < 50:
                boxes[idx] = (
                    min(px0, x),
                    min(py0, y),
                    max(px1, x + w),
                    max(py1, y + h),
                )
                break
        else:
            # New box: add slight padding, clamped to the image bounds.
            pad_x = int(min(0.1 * w, 40))
            pad_y = int(min(0.1 * h, 40))
            boxes.append((
                max(0, x - pad_x),
                max(0, y - pad_y),
                min(w_img, x + w + pad_x),
                min(h_img, y + h + pad_y),
            ))

    # Pass 2: crop and save each final box.
    charts = []
    for x0, y0, x1, y1 in boxes:
        crop = img[y0:y1, x0:x1]
        crop_name = f"chart_{uuid.uuid4().hex}.png"
        crop_path = os.path.join(CHARTS_DIR, crop_name)
        try:
            saved = cv2.imwrite(crop_path, crop)
        except Exception as e:
            print(f"[chart_detect] Failed saving {crop_path}: {e}")
            continue
        # imwrite reports some failures by returning False, not by raising.
        if not saved:
            print(f"[chart_detect] Failed saving {crop_path}: cv2.imwrite returned False")
            continue
        charts.append({"bbox": (x0, y0, x1, y1), "image_path": crop_path})

    # Sort by size (largest first)
    charts.sort(
        key=lambda item: (item["bbox"][2] - item["bbox"][0]) * (item["bbox"][3] - item["bbox"][1]),
        reverse=True,
    )

    if debug:
        print(f"[chart_detect] ✅ Detected {len(charts)} likely chart(s). Saved to {CHARTS_DIR}")

    # Optional: visualize results
    if visualize:
        vis = img.copy()
        for item in charts:
            x0, y0, x1, y1 = item["bbox"]
            cv2.rectangle(vis, (x0, y0), (x1, y1), (0, 255, 0), 3)
        plt.figure(figsize=(12, 10))
        plt.imshow(cv2.cvtColor(vis, cv2.COLOR_BGR2RGB))
        plt.title(f"Detected {len(charts)} chart(s)")
        plt.axis("off")
        plt.show()

    return charts
# Manual debug run: detect charts on a sample page image and print each hit.
if __name__ == "__main__":
    test_image = "samples/vdoc_rag_test_page1.png"  # example path
    for r in detect_charts(test_image, debug=True, visualize=True):
        print(r)

app/chart_reasoner.py ADDED Viewed

	@@ -0,0 +1,232 @@

import os
import re
import json
from typing import List, Dict, Any, Optional

import pytesseract
from PIL import Image
import numpy as np
import cv2

# Optional Pix2Struct captioning.
# The model is loaded once at import time. The imports and this loading block
# used to appear twice in the file, which loaded the same checkpoint twice.
# Any failure (missing package, no network, ...) just disables captioning.
USE_PIX2STRUCT = False
try:
    from transformers import AutoProcessor, AutoModelForVision2Seq
    _pix2_processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
    _pix2_model = AutoModelForVision2Seq.from_pretrained("google/pix2struct-textcaps-base")
    USE_PIX2STRUCT = True
    print("[chart_reasoner] Pix2Struct/TextCaps available for chart captioning.")
except Exception:
    USE_PIX2STRUCT = False
    print("[chart_reasoner] Pix2Struct/TextCaps not available — will use OCR/geometric fallback.")

# Optional CLIP embeddings via sentence-transformers.
# Same best-effort pattern: when the model is unavailable, process_chart_crop
# simply omits the "clip_vector" field.
USE_CLIP = False
try:
    from sentence_transformers import SentenceTransformer
    _clip_model = SentenceTransformer("clip-ViT-B-32")
    USE_CLIP = True
    print("[chart_reasoner] CLIP (sentence-transformers) available for chart embeddings.")
except Exception:
    USE_CLIP = False
def preprocess_for_ocr(image_path: str) -> Image.Image:
    """Return a binarized copy of the image at ``image_path`` for OCR.

    The image is converted to grayscale, histogram-equalized to boost
    contrast, then adaptively thresholded (mean method, 21-pixel window,
    offset 10) so text inside colored chart areas stands out.

    Raises:
        ValueError: if OpenCV cannot read the file.
    """
    bgr = cv2.imread(image_path)
    if bgr is None:
        raise ValueError(f"Could not read image: {image_path}")
    equalized = cv2.equalizeHist(cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY))
    binary = cv2.adaptiveThreshold(
        equalized, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 21, 10
    )
    return Image.fromarray(binary)
+def _extract_numbers_from_text(text: str) -> List[float]:
+    matches = re.findall(r"\(?-?\d[\d,\.\)\(]*%?", text)
+    nums: List[float] = []
+    for m in matches:
+        s = m.strip()
+        negative = False
+        if s.startswith("(") and s.endswith(")"):
+            negative = True
+            s = s[1:-1]
+        s = s.replace("%", "").replace(",", "")
+        try:
+            val = float(s)
+            if negative:
+                val = -val
+            nums.append(val)
+        except Exception:
+            continue
+    return nums
def analyze_bar_chart(image_path: str, debug_save: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Detect vertical bars in a chart image and infer a simple trend.

    Bars are approximated by external Canny contours whose bounding boxes
    are at least 6 px wide and 10 px tall, at least 1.2x taller than wide,
    and at most 90% of the image height.

    Args:
        image_path: Path of the chart image.
        debug_save: Optional path for an overlay image with the detected
            bars outlined. Saving is best effort: a failure is logged and
            does not affect the returned analysis.

    Returns:
        ``None`` if the image cannot be read or no bar-like contour is
        found. Otherwise a dict with ``bar_count``, ``heights`` (pixels,
        left to right), ``normalized_heights`` (heights / tallest),
        ``trend`` ("increasing", "decreasing", or "flat", comparing the
        first and last bar only), and ``bars_xywh``.
    """
    img = cv2.imread(image_path)
    if img is None:
        return None

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (5, 5), 0)
    edges = cv2.Canny(blur, 50, 150)
    contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    h_img = img.shape[0]

    bars = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        # Vertical bar heuristic: taller than wide, reasonable size.
        if w < 6 or h < 10:
            continue
        if h / (w + 1e-8) < 1.2:
            continue
        # Ignore boxes that almost cover the image (likely page border).
        if h > 0.9 * h_img:
            continue
        bars.append((x, y, w, h))

    if not bars:
        return None

    # Sort left-to-right so "first" and "last" follow the x axis.
    bars.sort(key=lambda b: b[0])
    heights = [int(b[3]) for b in bars]
    # Every kept bar is at least 10 px tall, so the maximum is never zero.
    max_h = max(heights)
    norm = [h / max_h for h in heights]

    first, last = heights[0], heights[-1]
    if last > first:
        trend = "increasing"
    elif last < first:
        trend = "decreasing"
    else:
        trend = "flat"

    res = {
        "bar_count": len(bars),
        "heights": heights,
        "normalized_heights": norm,
        "trend": trend,
        "bars_xywh": bars,
    }

    # Debug overlay: best effort, but failures are reported, not swallowed.
    if debug_save:
        try:
            overlay = img.copy()
            for (x, y, w, h) in bars:
                cv2.rectangle(overlay, (x, y), (x + w, y + h), (0, 255, 0), 2)
            if not cv2.imwrite(debug_save, overlay):
                print(f"[chart_reasoner] Could not write debug overlay: {debug_save}")
        except Exception as e:
            print(f"[chart_reasoner] Debug overlay failed for {debug_save}: {e}")

    return res
def process_chart_crop(image_path: str) -> Dict[str, Any]:
    """Main entry: build a textual summary and structured analysis for a chart image.

    Steps, each best effort so one failing stage does not abort the rest:
      1. Pix2Struct caption (only when the model loaded at import).
      2. Geometric bar analysis via ``analyze_bar_chart``.
      3. OCR on a preprocessed image, falling back to raw OCR.
      4. Number extraction from the OCR text.
      5. CLIP embedding of the summary text (only when CLIP loaded).

    Args:
        image_path: Path of the cropped chart image.

    Returns:
        ``{"summary_text": str, "structured": dict}``. ``structured`` always
        has ``ocr_text`` and ``numbers`` on success. It may also have the
        bar fields and ``clip_vector``. If the file is missing or both OCR
        attempts fail, ``summary_text`` starts with "[Error]" and
        ``structured`` is empty.
    """
    if not os.path.exists(image_path):
        return {"summary_text": f"[Error] Chart image not found: {image_path}", "structured": {}}

    pix_caption = None
    if USE_PIX2STRUCT:
        try:
            # convert() loads the pixels into a new image, so the file
            # handle can be released right away.
            with Image.open(image_path) as src:
                img = src.convert("RGB")
            inputs = _pix2_processor(images=img, text="Describe this chart.", return_tensors="pt")
            outputs = _pix2_model.generate(**inputs, max_new_tokens=128)
            try:
                pix_caption = _pix2_processor.decode(outputs[0], skip_special_tokens=True)
            except Exception:
                # Fall back to a standalone tokenizer if the processor cannot decode.
                from transformers import AutoTokenizer
                tokenizer = AutoTokenizer.from_pretrained("google/pix2struct-textcaps-base")
                pix_caption = tokenizer.decode(outputs[0], skip_special_tokens=True)
        except Exception as e:
            print("[chart_reasoner] Pix2Struct failed:", e)
            pix_caption = None

    # Geometric analysis (bars)
    bar_info = None
    try:
        # A debug overlay is written whenever the charts directory exists.
        # VDOCRAG_CHARTS_DIR only overrides where that directory is; the
        # default is the "charts" folder beside this module.
        debug_overlay = None
        charts_dir = os.environ.get("VDOCRAG_CHARTS_DIR", os.path.join(os.path.dirname(__file__), "charts"))
        if os.path.isdir(charts_dir):
            debug_overlay = os.path.join(charts_dir, f"debug_{os.path.basename(image_path)}")
        bar_info = analyze_bar_chart(image_path, debug_save=debug_overlay)
    except Exception as e:
        print("[chart_reasoner] analyze_bar_chart error:", e)
        bar_info = None

    # OCR with preprocessing to capture axis labels / numbers
    ocr_text = ""
    try:
        proc_img = preprocess_for_ocr(image_path)
        ocr_text = pytesseract.image_to_string(proc_img, config="--psm 6")
    except Exception as e:
        try:
            # Fallback to raw OCR; the context manager closes the file handle.
            with Image.open(image_path) as raw_img:
                ocr_text = pytesseract.image_to_string(raw_img)
        except Exception as e2:
            return {"summary_text": f"[Error] OCR failure: {e} / {e2}", "structured": {}}

    nums = _extract_numbers_from_text(ocr_text)
    structured: Dict[str, Any] = {"ocr_text": ocr_text.strip(), "numbers": nums}

    summary_parts = []
    if pix_caption:
        summary_parts.append(pix_caption.strip())
    if ocr_text.strip():
        # Collapse whitespace and cap the OCR excerpt at 300 characters.
        summary_parts.append("OCR summary: " + " ".join(ocr_text.strip().split())[:300])
    if bar_info:
        structured.update({
            "bar_count": bar_info.get("bar_count"),
            "bar_heights": bar_info.get("heights"),
            "bar_trend": bar_info.get("trend"),
            "bars_xywh": bar_info.get("bars_xywh"),
        })
        summary_parts.append(f"Bar chart trend: {bar_info.get('trend')} (left→right)")

    # Optional CLIP embedding for retrieval (text embedding of the summary).
    if USE_CLIP:
        try:
            emb = _clip_model.encode([" ".join(summary_parts) or ocr_text], normalize_embeddings=True)[0]
            structured["clip_vector"] = [float(x) for x in np.asarray(emb).tolist()]
        except Exception as e:
            print("[chart_reasoner] CLIP encode failed:", e)

    final_summary = " | ".join(summary_parts) if summary_parts else (ocr_text.strip() or "No description available.")
    return {"summary_text": final_summary, "structured": structured}


__all__ = ["process_chart_crop"]

app/debug_chunks.json ADDED Viewed

	@@ -0,0 +1,182 @@

+[
+  {
+    "id": "text_9078364bec07451fbe7900a99835907b",
+    "text": "VDoc RAG Test Document",
+    "metadata": {
+      "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
+      "page": 1,
+      "bbox": null,
+      "type": "text"
+    }
+  },
+  {
+    "id": "text_0195b06b4eed4e3a9de12f8f73380390",
+    "text": "Contains Charts, Tables, and Flyers",
+    "metadata": {
+      "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
+      "page": 1,
+      "bbox": null,
+      "type": "text"
+    }
+  },
+  {
+    "id": "text_c6ec891227d44d42ab809baf2469bbc8",
+    "text": "Sample Data Table:",
+    "metadata": {
+      "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
+      "page": 1,
+      "bbox": null,
+      "type": "text"
+    }
+  },
+  {
+    "id": "text_9e2edb5e2e4f42b9a344a594b904a859",
+    "text": "ID Name Score Category",
+    "metadata": {
+      "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
+      "page": 1,
+      "bbox": null,
+      "type": "text"
+    }
+  },
+  {
+    "id": "text_c94b49b4eedd4f4a82a76fad15610f69",
+    "text": "1 Alice 85 A",
+    "metadata": {
+      "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
+      "page": 1,
+      "bbox": null,
+      "type": "text"
+    }
+  },
+  {
+    "id": "text_db03ed6418594c0fb3451cb7ba032342",
+    "text": "2 Bob 78 B",
+    "metadata": {
+      "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
+      "page": 1,
+      "bbox": null,
+      "type": "text"
+    }
+  },
+  {
+    "id": "text_24e15dd936584136ad45cdb48b74f695",
+    "text": "3 Charlie 92 A+",
+    "metadata": {
+      "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
+      "page": 1,
+      "bbox": null,
+      "type": "text"
+    }
+  },
+  {
+    "id": "text_9d77d0b5b759433d899289bb7a1b79d9",
+    "text": "4 David 64 C",
+    "metadata": {
+      "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
+      "page": 1,
+      "bbox": null,
+      "type": "text"
+    }
+  },
+  {
+    "id": "text_3d67b96a270944bf9b35ba358d534830",
+    "text": "5 Eva 88 A",
+    "metadata": {
+      "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
+      "page": 1,
+      "bbox": null,
+      "type": "text"
+    }
+  },
+  {
+    "id": "text_7aabcd11cb544fd99fd75c591885b5e8",
+    "text": "Flyer Section: Upcoming AI Workshop",
+    "metadata": {
+      "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
+      "page": 1,
+      "bbox": null,
+      "type": "text"
+    }
+  },
+  {
+    "id": "text_86c5a632d7de4c258d9bced4e8de84b4",
+    "text": "Join us for an engaging AI Workshop covering:",
+    "metadata": {
+      "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
+      "page": 1,
+      "bbox": null,
+      "type": "text"
+    }
+  },
+  {
+    "id": "text_4dd5041c98f046a499bf076093f0503a",
+    "text": "- Machine Learning Basics",
+    "metadata": {
+      "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
+      "page": 1,
+      "bbox": null,
+      "type": "text"
+    }
+  },
+  {
+    "id": "text_30472772897c453491075424dbdf9927",
+    "text": "- LLM Applications",
+    "metadata": {
+      "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
+      "page": 1,
+      "bbox": null,
+      "type": "text"
+    }
+  },
+  {
+    "id": "text_e7f7640c0daa4146b3a30ec41d79b3e8",
+    "text": "- RAG (Retrieval Augmented Generation) Systems",
+    "metadata": {
+      "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
+      "page": 1,
+      "bbox": null,
+      "type": "text"
+    }
+  },
+  {
+    "id": "table_2791fed25a254479a185f8403f6ca385",
+    "text": "[{\"ID\": \"1\", \"Name\": \"Alice\", \"Score\": \"85\", \"Category\": \"A\"}, {\"ID\": \"2\", \"Name\": \"Bob\", \"Score\": \"78\", \"Category\": \"B\"}, {\"ID\": \"3\", \"Name\": \"Charlie\", \"Score\": \"92\", \"Category\": \"A+\"}, {\"ID\": \"4\", \"Name\": \"David\", \"Score\": \"64\", \"Category\": \"C\"}, {\"ID\": \"5\", \"Name\": \"Eva\", \"Score\": \"88\", \"Category\": \"A\"}]",
+    "metadata": {
+      "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
+      "page": 1,
+      "bbox": null,
+      "type": "table"
+    }
+  },
+  {
+    "id": "text_5d5f395ca35940d7aff54b1224f51940",
+    "text": "Date: November 20, 2025",
+    "metadata": {
+      "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
+      "page": 2,
+      "bbox": null,
+      "type": "text"
+    }
+  },
+  {
+    "id": "text_49338ecabed84edb8c71fa11dda41ff2",
+    "text": "Venue: Innovation Hall, Tech Park",
+    "metadata": {
+      "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
+      "page": 2,
+      "bbox": null,
+      "type": "text"
+    }
+  },
+  {
+    "id": "text_dd6a5828c8e64a628e0cec709e40172e",
+    "text": "Register now at: www.aiworkshop2025.com",
+    "metadata": {
+      "source": "C:\\Users\\abhin\\OneDrive\\Desktop\\vdoc-rag-mvp\\app\\uploads\\vdoc_rag_test.pdf",
+      "page": 2,
+      "bbox": null,
+      "type": "text"
+    }
+  }
+]

app/embeddings.py ADDED Viewed

	@@ -0,0 +1,30 @@

+from sentence_transformers import SentenceTransformer
+import os
+import numpy as np
+class TextImageEmbedder:
+    def __init__(self, text_model_name=None):
+        # Automatically load fine-tuned model if available
+        default_model = "all-MiniLM-L6-v2"
+        tuned_model = os.path.join(os.path.dirname(__file__), "..", "models", "vdoc_feedback_tuned", "latest")
+        if text_model_name:
+            model_to_use = text_model_name
+        elif os.path.exists(os.path.abspath(tuned_model)):
+            tuned_path = os.path.abspath(tuned_model)
+            print(f"🧠 Using fine-tuned embedding model: {tuned_path}")
+            model_to_use = tuned_path
+        else:
+            print(f"📦 Using base embedding model: {default_model}")
+            model_to_use = default_model
+        self.text_model = SentenceTransformer(model_to_use)
+    def embed_text(self, texts):
+        if isinstance(texts, str):
+            texts = [texts]
+        return self.text_model.encode(texts, show_progress_bar=False, normalize_embeddings=True)
+    def embed_text_sync(self, text):
+        return self.embed_text([text])[0]

app/feedback.json ADDED Viewed

	@@ -0,0 +1,30 @@

+[
+  {
+    "timestamp": "2025-11-10T14:19:13",
+    "question": "what trend does the bar graph show",
+    "answer": "increasing (left→right)",
+    "correctness": "correct",
+    "sources": []
+  },
+  {
+    "timestamp": "2026-03-11T02:25:53",
+    "question": "what is this document about?",
+    "answer": "This document is about an upcoming AI Workshop covering Machine Learning Basics, scheduled for November 20, 2025. It also contains a section describing a chart or graph with data from 2018 to 2024.",
+    "correctness": "correct",
+    "sources": []
+  },
+  {
+    "timestamp": "2026-03-11T02:29:29",
+    "question": "what is this document about?",
+    "answer": "This document is about an upcoming AI Workshop covering Machine Learning Basics, scheduled for November 20, 2025. It also contains a section describing a chart or graph with data from 2018 to 2024.",
+    "correctness": "correct",
+    "sources": []
+  },
+  {
+    "timestamp": "2026-03-11T02:29:40",
+    "question": "what is this document about?",
+    "answer": "This document is about an upcoming AI Workshop covering Machine Learning Basics, scheduled for November 20, 2025. It also contains a section describing a chart or graph with data from 2018 to 2024.",
+    "correctness": "correct",
+    "sources": []
+  }
+]

app/feedback_manager.py ADDED Viewed

	@@ -0,0 +1,50 @@

+import os
+import json
+from datetime import datetime
+# Directory containing this module; the feedback store lives next to the code.
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+# JSON file that _load_feedback/_save_feedback read and write.
+FEEDBACK_FILE = os.path.join(BASE_DIR, "feedback.json")
+def _load_feedback():
+    if not os.path.exists(FEEDBACK_FILE):
+        return []
+    with open(FEEDBACK_FILE, "r", encoding="utf-8") as f:
+        try:
+            return json.load(f)
+        except Exception:
+            return []
+def _save_feedback(data):
+    with open(FEEDBACK_FILE, "w", encoding="utf-8") as f:
+        json.dump(data, f, indent=2, ensure_ascii=False)
+def record_feedback(question, answer, correctness, sources=None):
+    """
+    Store user feedback about RAG answer correctness.
+    correctness: 'correct' | 'incorrect' | 'partial'
+    """
+    entry = {
+        "timestamp": datetime.now().isoformat(timespec="seconds"),
+        "question": question,
+        "answer": answer,
+        "correctness": correctness,
+        "sources": sources or [],
+    }
+    data = _load_feedback()
+    data.append(entry)
+    _save_feedback(data)
+    print(f"📝 Feedback recorded ({correctness}) for: {question[:60]}...")
+    return entry
+def get_feedback_summary():
+    data = _load_feedback()
+    total = len(data)
+    if total == 0:
+        return "No feedback yet."
+    correct = sum(1 for x in data if x.get("correctness") == "correct")
+    incorrect = sum(1 for x in data if x.get("correctness") == "incorrect")
+    return f"Feedback Stats — ✅ {correct} correct, ❌ {incorrect} incorrect, total {total}"

app/highlight_calibration.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "x_offset": -33.0,
+  "x_scale": 1.0,
+  "y_offset": -65.0,
+  "y_scale": 1.02
+}

app/indexer.py ADDED Viewed

	@@ -0,0 +1,131 @@

+# app/indexer.py
+import chromadb
+import json
+import os
+import numpy as np
+class ChromaIndexer:
+    def __init__(self, embedding_function=None, persist_directory="./storage/chroma_db"):
+        """
+        Persistent Chroma DB (DuckDB + Parquet) wrapper.
+        Stores vectors and metadata to disk so index survives restarts.
+        """
+        os.makedirs(persist_directory, exist_ok=True)
+        self.embedding_function = embedding_function
+        self.persist_directory = persist_directory
+        self.active_doc_id = None  # Track currently active document
+        # Use the PersistentClient backed by the provided directory
+        try:
+            self.client = chromadb.PersistentClient(path=persist_directory)
+        except Exception:
+            # fallback to in-memory client if PersistentClient not available
+            print("[indexer] PersistentClient not available, falling back to in-memory client.")
+            self.client = chromadb.Client()
+        self.collection = self.client.get_or_create_collection(
+            "vdoc",
+            metadata={"description": "VDoc-RAG persistent storage"},
+        )
+        print(f"✅ Chroma index loaded from: {persist_directory}")
+    def clear(self):
+        """
+        Clear all documents from the collection.
+        Used for document isolation - clear before indexing new document.
+        """
+        try:
+            # Delete and recreate collection
+            self.client.delete_collection("vdoc")
+            self.collection = self.client.get_or_create_collection(
+                "vdoc",
+                metadata={"description": "VDoc-RAG persistent storage"},
+            )
+            self.active_doc_id = None
+            print("🗑️ Cleared all chunks from index (document isolation)")
+        except Exception as e:
+            print(f"[WARN] Failed to clear collection: {e}")
+    def set_active_document(self, doc_id: str):
+        """Set the currently active document for querying."""
+        self.active_doc_id = doc_id
+        print(f"📄 Active document set to: {doc_id}")
+    def _sanitize_metadata(self, metadata):
+        clean_meta = {}
+        for k, v in metadata.items():
+            if isinstance(v, (str, int, float, bool)) or v is None:
+                clean_meta[k] = v
+            else:
+                try:
+                    clean_meta[k] = json.dumps(v)
+                except Exception:
+                    clean_meta[k] = str(v)
+        return clean_meta
+    def upsert(self, items):
+        ids = [it[0] for it in items]
+        embeddings = [it[1] for it in items]
+        metadatas = [self._sanitize_metadata(it[2]) for it in items]
+        documents = [it[3] for it in items]
+        self.collection.upsert(
+            ids=ids,
+            embeddings=embeddings,
+            metadatas=metadatas,
+            documents=documents,
+        )
+        print(f"💾 Upserted {len(items)} chunks into persistent Chroma collection.")
+    def query(self, qvec, top_k=5, doc_id=None):
+        """
+        qvec: numpy vector or list (query embedding)
+        doc_id: optional document ID to filter results (for document isolation)
+        Returns list of {id, text, metadata, score (cosine sim 0–1)}
+        """
+        # Use provided doc_id or fall back to active document
+        filter_doc = doc_id or self.active_doc_id
+        query_params = {
+            "query_embeddings": [qvec],
+            "n_results": top_k,
+            "include": ["embeddings", "metadatas", "documents", "distances"],
+        }
+        # Add document filter if specified
+        if filter_doc:
+            query_params["where"] = {"doc_id": filter_doc}
+        res = self.collection.query(**query_params)
+        out = []
+        if not res or "ids" not in res or len(res["ids"]) == 0:
+            return out
+        qvec = np.array(qvec, dtype=np.float32)
+        for i in range(len(res["ids"][0])):
+            try:
+                chunk_vec = np.array(res["embeddings"][0][i], dtype=np.float32)
+                cos_sim = float(
+                    np.dot(qvec, chunk_vec) / (np.linalg.norm(qvec) * np.linalg.norm(chunk_vec) + 1e-8)
+                )
+                cos_sim = max(0.0, min(1.0, cos_sim))
+            except Exception:
+                try:
+                    dist = res.get("distances", [[0]])[0][i]
+                    cos_sim = max(0.0, min(1.0, 1.0 - float(dist)))
+                except Exception:
+                    cos_sim = 0.0
+            out.append({
+                "id": res["ids"][0][i],
+                "text": res.get("documents", [[None]])[0][i],
+                "metadata": res.get("metadatas", [[None]])[0][i],
+                "score": round(cos_sim, 4),
+            })
+        out.sort(key=lambda x: x["score"], reverse=True)
+        return out

app/ingest.py ADDED Viewed

	@@ -0,0 +1,214 @@

+# app/ingest.py
+"""
+PDF → Images → OCR text → Table extraction → Chart detection & reasoning
+Generates chunks (text/table/chart) with metadata for embedding and indexing.
+"""
+import os
+import uuid
+from pdf2image import convert_from_path
+from PIL import Image
+import pytesseract
+import pdfplumber
+from app.tables import extract_tables_from_pdf
+from app.chart_detect import detect_charts
+from app.chart_reasoner import process_chart_crop
+from app.cache_manager import load_chunks_from_cache, save_chunks_to_cache
+# Project-local temporary/storage directories
+# (all created at import time, next to this module)
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+# Page images rendered by pdf_to_images() are written here.
+TMP_DIR = os.path.join(BASE_DIR, "tmp")
+# NOTE(review): TABLES_DIR and CHARTS_DIR are not referenced elsewhere in this module;
+# presumably app.tables / app.chart_detect write into them — confirm.
+TABLES_DIR = os.path.join(BASE_DIR, "tables")
+CHARTS_DIR = os.path.join(BASE_DIR, "charts")
+os.makedirs(TMP_DIR, exist_ok=True)
+os.makedirs(TABLES_DIR, exist_ok=True)
+os.makedirs(CHARTS_DIR, exist_ok=True)
+def pdf_to_images(pdf_path, dpi=200):
+    """
+    Convert a PDF into page-wise PNG images for OCR and visual analysis.
+    """
+    pages = convert_from_path(pdf_path, dpi=dpi)
+    paths = []
+    # Use project-local tmp directory to avoid system temp folder
+    os.makedirs(TMP_DIR, exist_ok=True)
+    for i, p in enumerate(pages, start=1):
+        ppath = os.path.join(TMP_DIR, f"page_{uuid.uuid4().hex}_{i}.png")
+        p.save(ppath, "PNG")
+        paths.append(ppath)
+    return paths
+def ocr_image_to_blocks(image_path, min_words_per_line=3):
+    """
+    Run OCR on an image and merge words into line-level text blocks.
+    This preserves full sentences like 'Venue: Delhi Convention Hall'.
+    """
+    img = Image.open(image_path).convert("RGB")
+    data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT, config="--oem 3 --psm 6")
+    n = len(data["text"])
+    lines = {}
+    for i in range(n):
+        txt = data["text"][i].strip()
+        if not txt:
+            continue
+        line_no = data["line_num"][i]
+        if line_no not in lines:
+            lines[line_no] = {"words": [], "lefts": [], "tops": [], "rights": [], "bottoms": []}
+        lines[line_no]["words"].append(txt)
+        lines[line_no]["lefts"].append(data["left"][i])
+        lines[line_no]["tops"].append(data["top"][i])
+        lines[line_no]["rights"].append(data["left"][i] + data["width"][i])
+        lines[line_no]["bottoms"].append(data["top"][i] + data["height"][i])
+    blocks = []
+    for ln, d in lines.items():
+        if len(d["words"]) < min_words_per_line:
+            continue
+        text = " ".join(d["words"]).strip()
+        bbox = (
+            min(d["lefts"]),
+            min(d["tops"]),
+            max(d["rights"]),
+            max(d["bottoms"]),
+        )
+        blocks.append({"text": text, "bbox": bbox})
+    return blocks
+def process_pdf(path):
+    """
+    Process a PDF or image file:
+    - Extract text chunks (OCR)
+    - Extract tables (pdfplumber)
+    - Detect charts (layoutparser or OpenCV)
+    - Run chart reasoning model (Donut/Pix2Struct/heuristics)
+    Returns: list of document chunks {id, text, metadata}
+    """
+    # Check cache first
+    cached = load_chunks_from_cache(path)
+    if cached:
+        print(f"✅ Using cached chunks for {os.path.basename(path)}")
+        return cached
+    items = []
+    # 1️⃣ OCR text extraction (page images)
+    images = pdf_to_images(path)
+    for pno, imgpath in enumerate(images, start=1):
+        img = Image.open(imgpath).convert("RGB")
+        data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT, config="--oem 3 --psm 6")
+        n = len(data["text"])
+        current_line = None
+        line_words, lefts, tops, rights, bottoms = [], [], [], [], []
+        for i in range(n):
+            text = data["text"][i].strip()
+            if not text:
+                continue
+            line_num = data["line_num"][i]
+            # Start new line if changed
+            if current_line is None:
+                current_line = line_num
+            if line_num != current_line:
+                # finalize previous line
+                if line_words:
+                    doc = {
+                        "id": f"{uuid.uuid4().hex}",
+                        "text": " ".join(line_words),
+                        "metadata": {
+                            "source": path,
+                            "page": pno,
+                            "bbox": (min(lefts), min(tops), max(rights), max(bottoms)),
+                            "type": "text",
+                        },
+                    }
+                    items.append(doc)
+                # reset
+                current_line = line_num
+                line_words, lefts, tops, rights, bottoms = [], [], [], [], []
+            # collect current word
+            line_words.append(text)
+            lefts.append(data["left"][i])
+            tops.append(data["top"][i])
+            rights.append(data["left"][i] + data["width"][i])
+            bottoms.append(data["top"][i] + data["height"][i])
+        # flush last line
+        if line_words:
+            doc = {
+                "id": f"{uuid.uuid4().hex}",
+                "text": " ".join(line_words),
+                "metadata": {
+                    "source": path,
+                    "page": pno,
+                    "bbox": (min(lefts), min(tops), max(rights), max(bottoms)),
+                    "type": "text",
+                },
+            }
+            items.append(doc)
+    # 2️⃣ Table extraction (structured CSVs)
+    try:
+        tables = extract_tables_from_pdf(path)
+        for t in tables:
+            doc = {
+                "id": f"{uuid.uuid4().hex}",
+                "text": t["summary_text"],
+                "metadata": {
+                    "source": path,
+                    "page": t["page"],
+                    "type": "table",
+                    "csv_path": t["csv_path"],
+                    "rows": t["rows"],
+                    "bbox": t.get("bbox"),
+                },
+            }
+            items.append(doc)
+    except Exception as e:
+        print("[WARN] Table extraction failed:", e)
+    # 3️⃣ Chart detection + reasoning
+    for pno, imgpath in enumerate(images, start=1):
+        try:
+            chart_crops = detect_charts(imgpath, debug=True)
+            for c in chart_crops:
+                crop_path = c["image_path"]
+                bbox = c["bbox"]
+                # Run reasoning model or OCR heuristic
+                chart_res = process_chart_crop(crop_path)
+                summary = chart_res.get("summary_text", "Chart region detected.")
+                structured = chart_res.get("structured", {})
+                doc = {
+                    "id": f"chart_{uuid.uuid4().hex}",
+                    "text": summary,
+                    "metadata": {
+                        "source": path,
+                        "page": pno,
+                        "type": "chart",
+                        "bbox": bbox,
+                        "image_path": crop_path,
+                        "structured": structured,
+                    },
+                }
+                items.append(doc)
+        except Exception as e:
+            print(f"[WARN] Chart detection/reasoning failed on page {pno}:", e)
+    # Save to cache for future reuse
+    try:
+        save_chunks_to_cache(path, items)
+        print(f"💾 Cached {len(items)} chunks for {os.path.basename(path)}")
+    except Exception as e:
+        print("[WARN] Failed to save cache:", e)
+    return items

app/main.py ADDED Viewed

	@@ -0,0 +1,451 @@

+import os
+import uvicorn
+from fastapi import FastAPI, UploadFile, File, Form, Request, HTTPException
+from fastapi.responses import HTMLResponse
+from fastapi.staticfiles import StaticFiles
+from fastapi.templating import Jinja2Templates
+from app.ingest import process_pdf
+from app.indexer import ChromaIndexer
+from app.embeddings import TextImageEmbedder
+from app.reader import LLMReader
+from app.visual_highlight import render_highlighted_pages
+from app.cache_manager import clear_cache
+from app.feedback_manager import record_feedback, get_feedback_summary, _load_feedback
+import shutil
+import subprocess
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from io import BytesIO
+import base64
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+# ---------------------------------------------------------
+# Initialization
+# ---------------------------------------------------------
+app = FastAPI(title="VDoc RAG - Web UI")
+# ---------------------------------------------------------
+# Directories
+# ---------------------------------------------------------
+# Get absolute path to this file’s directory
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+# Define template and static directories relative to BASE_DIR
+TEMPLATE_DIR = os.path.join(BASE_DIR, "templates")
+STATIC_DIR = os.path.join(BASE_DIR, "static")
+# Ensure directories exist
+# (done before the mounts below, which are pointed at these directories)
+os.makedirs(TEMPLATE_DIR, exist_ok=True)
+os.makedirs(STATIC_DIR, exist_ok=True)
+# Mount static directory
+app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
+# Serve highlighted images
+# (the /ask route builds its highlight URLs relative to BASE_DIR)
+HIGHLIGHTED_DIR = os.path.join(BASE_DIR, "highlighted")
+os.makedirs(HIGHLIGHTED_DIR, exist_ok=True)
+app.mount("/highlighted", StaticFiles(directory=HIGHLIGHTED_DIR), name="highlighted")
+# Load Jinja2 templates safely
+templates = Jinja2Templates(directory=TEMPLATE_DIR)
+# ---------------------------------------------------------
+# Core Components
+# ---------------------------------------------------------
+# Module-level singletons shared by all routes; created once at import time.
+embedder = TextImageEmbedder()
+# Use a project-local persistent directory for Chroma
+STORAGE_DIR = os.path.join(BASE_DIR, "storage", "chroma_db")
+indexer = ChromaIndexer(embedding_function=embedder.embed_text, persist_directory=STORAGE_DIR)
+# Reader backend is selected via the VDOCRAG_READER_PROVIDER environment variable ("gemini" if unset).
+reader_provider = os.environ.get("VDOCRAG_READER_PROVIDER", "gemini")
+reader = LLMReader(provider=reader_provider)
+uploaded_files = []  # track uploaded docs for display
+# ---------------------------------------------------------
+# Routes
+# ---------------------------------------------------------
+@app.get("/", response_class=HTMLResponse)
+async def home(request: Request):
+    """Render main upload + query interface."""
+    print(f"✅ Using templates from: {TEMPLATE_DIR}")
+    if not os.path.exists(os.path.join(TEMPLATE_DIR, "index.html")):
+        print("❌ index.html not found in:", TEMPLATE_DIR)
+    else:
+        print("✅ index.html found!")
+    return templates.TemplateResponse(
+        "index.html",
+        {"request": request, "uploaded": uploaded_files, "answer": None},
+    )
+@app.post("/upload")
+async def upload_file(request: Request, file: UploadFile = File(...)):
+    """Handle PDF/image upload and indexing."""
+    if not file.filename.lower().endswith((".pdf", ".png", ".jpg", ".jpeg")):
+        raise HTTPException(status_code=400, detail="Unsupported file type")
+    # Save uploaded file temporarily
+    temp_dir = os.path.join(BASE_DIR, "uploads")
+    os.makedirs(temp_dir, exist_ok=True)
+    path = os.path.join(temp_dir, file.filename)
+    with open(path, "wb") as f:
+        f.write(await file.read())
+    # 🔒 Document Isolation: Clear old chunks before indexing new document
+    indexer.clear()
+    uploaded_files.clear()  # Reset uploaded files list
+    # Extract and process text chunks
+    docs = process_pdf(path)
+    if len(docs) == 0:
+        return templates.TemplateResponse(
+            "index.html",
+            {
+                "request": request,
+                "uploaded": uploaded_files,
+                "answer": "⚠️ No content extracted from file.",
+            },
+        )
+    # Generate document ID for isolation
+    doc_id = file.filename
+    # Embed and index chunks with doc_id metadata
+    texts = [d["text"] for d in docs]
+    vectors = embedder.embed_text(texts)
+    # Add doc_id to each chunk's metadata for filtering
+    for d in docs:
+        d["metadata"]["doc_id"] = doc_id
+    items = [(d["id"], vectors[i].tolist(), d["metadata"], d["text"]) for i, d in enumerate(docs)]
+    indexer.upsert(items)
+    # Set this as the active document for queries
+    indexer.set_active_document(doc_id)
+    uploaded_files.append(file.filename)
+    print(f"✅ Indexed {len(docs)} chunks from {file.filename} (document isolation enabled)")
+    return templates.TemplateResponse(
+        "index.html",
+        {
+            "request": request,
+            "uploaded": uploaded_files,
+            "answer": f"✅ Uploaded and indexed {file.filename} ({len(docs)} chunks).",
+        },
+    )
+@app.post("/ask")
+async def ask_question(request: Request, question: str = Form(...)):
+    """Handle user query, retrieve relevant chunks, and generate LLM answer."""
+    # Step 1 — Embed question
+    qvec = embedder.embed_text([question])[0]
+    # Step 2 — Retrieve top chunks
+    hits = indexer.query(qvec, top_k=10)
+    # Debug log
+    print("\n🔍 Retrieved Chunks for Query:", question)
+    for i, h in enumerate(hits):
+        meta = h.get("metadata", {})
+        conf = h.get("score", 0)
+        print(f"Chunk {i+1}: Page {meta.get('page')} | BBox: {meta.get('bbox')} | Confidence: {conf*100:.1f}%")
+        print(f"Text: {h['text'][:500]}...\n")
+    # Prioritize chart-type hits for chart-related questions
+    chart_keywords = ["chart", "graph", "trend", "plot", "increase", "decrease", "growth"]
+    if any(k in question.lower() for k in chart_keywords):
+        try:
+            hits = sorted(hits, key=lambda h: h.get("metadata", {}).get("type") != "chart")
+            print("[INFO] Prioritized chart-type chunks for chart-related question.")
+        except Exception as e:
+            print("[WARN] Failed to prioritize chart hits:", e)
+    # Step 3 — Build context string
+    context_blocks = [
+        f"[{i+1}] {h['text']} (page: {h['metadata'].get('page')}, bbox: {h['metadata'].get('bbox')})"
+        for i, h in enumerate(hits)
+    ]
+    context = "\n".join(context_blocks)
+    # Step 4 — Ask LLM
+    answer = reader.answer_question(query=question, context=context, sources=hits)
+    sources = answer.get("sources", [])
+    # 🖼️ Generate visual highlights
+    try:
+        first_source_path = hits[0]["metadata"].get("source") if hits else None
+        highlight_paths = []
+        if first_source_path and os.path.exists(first_source_path):
+            highlight_paths = render_highlighted_pages(first_source_path, hits)
+            # convert to web URLs for template
+            highlight_urls = ["/" + os.path.relpath(p, BASE_DIR).replace("\\", "/") for p in highlight_paths]
+        else:
+            highlight_urls = []
+    except Exception as e:
+        print("[WARN] Highlight rendering failed:", e)
+        highlight_urls = []
+    # Step 5 — Prepare chunk previews for UI
+    chunk_previews = [
+        {
+            "index": i + 1,
+            "page": h["metadata"].get("page"),
+            "bbox": h["metadata"].get("bbox"),
+            "text": h["text"][:300] + ("..." if len(h["text"]) > 300 else ""),
+            "confidence": round(h.get("score", 0) * 100, 1),
+        }
+        for i, h in enumerate(hits)
+    ]
+    # Average confidence for the retrieved set
+    avg_conf = sum(h.get("score", 0) for h in hits) / max(len(hits), 1)
+    # Step 6 — Render page
+    return templates.TemplateResponse(
+        "index.html",
+        {
+            "request": request,
+            "uploaded": uploaded_files,
+            "answer": answer["text"],
+            "question": question,
+            "sources": sources,
+            "chunks": chunk_previews,
+                "highlight_images": highlight_urls,
+                "confidence_avg": round(avg_conf * 100, 1),
+        },
+    )
+@app.post("/clear_cache")
+async def clear_cache_route(request: Request):
+    """Clear all cached chunk data and re-render the index with a message."""
+    clear_cache()
+    return templates.TemplateResponse(
+        "index.html",
+        {
+            "request": request,
+            "uploaded": uploaded_files,
+            "answer": "🧹 Cache cleared successfully!",
+        },
+    )
+@app.post("/clear_index")
+async def clear_index(request: Request):
+    """Clear the persistent Chroma index by deleting the storage directory."""
+    storage_dir = os.path.join(BASE_DIR, "storage", "chroma_db")
+    try:
+        shutil.rmtree(storage_dir, ignore_errors=True)
+        os.makedirs(storage_dir, exist_ok=True)
+        # Reinitialize indexer client to the new empty DB
+        global indexer
+        indexer = ChromaIndexer(embedding_function=embedder.embed_text, persist_directory=storage_dir)
+    except Exception as e:
+        print("[WARN] clear_index failed:", e)
+    return templates.TemplateResponse(
+        "index.html",
+        {"request": request, "uploaded": uploaded_files, "answer": "🧹 Chroma index cleared successfully!"},
+    )
@app.post("/feedback")
async def feedback(request: Request, question: str = Form(...), answer: str = Form(...), correctness: str = Form(...)):
    """Store a correct/incorrect verdict for an answer and show the outcome on the index page."""
    # Start from the failure text; it is replaced only if recording and
    # summarising both succeed.
    status_text = "⚠️ Failed to record feedback"
    try:
        record_feedback(question=question, answer=answer, correctness=correctness)
        status_text = f"✅ Feedback received! {get_feedback_summary()}"
    except Exception as exc:
        print("[WARN] Failed to record feedback:", exc)
    context = {"request": request, "uploaded": uploaded_files, "answer": status_text}
    return templates.TemplateResponse("index.html", context)
@app.get("/feedback_dashboard", response_class=HTMLResponse)
async def feedback_dashboard(request: Request):
    """Render the feedback dashboard: summary text, verdict counts and the newest entries."""
    entries = _load_feedback()
    summary_text = get_feedback_summary()
    verdicts = [entry.get("correctness") for entry in entries]
    context = {
        "request": request,
        "summary": summary_text,
        "total": len(entries),
        "correct": verdicts.count("correct"),
        "incorrect": verdicts.count("incorrect"),
        # Newest first, capped at 50 entries.
        "feedback_data": entries[::-1][:50],
    }
    return templates.TemplateResponse("feedback_dashboard.html", context)
@app.post("/train_feedback_model")
async def train_feedback_model(request: Request):
    """Run the feedback fine-tuning script and show its result on the dashboard.

    The script is started with the interpreter that is running this server,
    so it uses the same environment and installed packages. The last 1000
    characters of stdout are shown on success; stderr (or the error text) is
    shown on failure, including when the process cannot be launched at all.

    NOTE(review): ``subprocess.run`` is synchronous, so this handler blocks
    until the script finishes.
    """
    import sys  # function-scope import: only needed to locate the interpreter

    script_path = os.path.join(BASE_DIR, "..", "train_feedback_embeddings.py")
    try:
        print(f"🚀 Launching fine-tuning process: {script_path}")
        process = subprocess.run(
            [sys.executable or "python", script_path],
            capture_output=True,
            text=True,
            check=True,
        )
        output = process.stdout[-1000:]
        message = "✅ Fine-tuning complete. Model updated successfully!"
    except subprocess.CalledProcessError as e:
        output = e.stderr or str(e)
        message = "❌ Fine-tuning failed."
    except OSError as e:
        # Interpreter or script could not be started; without this branch
        # `output` and `message` would be unbound below.
        output = str(e)
        message = "❌ Fine-tuning failed."
    summary = get_feedback_summary()
    entries = _load_feedback()
    verdicts = [entry.get("correctness") for entry in entries]
    return templates.TemplateResponse(
        "feedback_dashboard.html",
        {
            "request": request,
            "summary": summary,
            # The dashboard template renders these counters; supply them so
            # they are not blank after a training run.
            "total": len(entries),
            "correct": verdicts.count("correct"),
            "incorrect": verdicts.count("incorrect"),
            "feedback_data": entries[::-1][:50],
            "train_output": output,
            "message": message,
        },
    )
@app.get("/benchmark_dashboard", response_class=HTMLResponse)
async def benchmark_dashboard(request: Request):
    """Show the benchmark page in its initial state, with no results or plots."""
    context = {"request": request}
    # Every result slot starts out as None until a benchmark is run.
    context.update(dict.fromkeys(("results", "plot_precision", "plot_recall", "plot_mrr")))
    return templates.TemplateResponse("benchmark_dashboard.html", context)
@app.post("/run_benchmark")
async def run_benchmark(request: Request, models: str = Form(...), chunk_size: int = Form(200), top_k: int = Form(5)):
    """
    Run embedding benchmark across provided models using stored feedback data.

    Each feedback question is used as a query and its recorded answer as the
    relevance label: a chunk counts as relevant when it contains the answer
    text (case-insensitive). Precision@K, Recall@K and MRR are averaged over
    all queries for each model and rendered as a table plus bar charts.

    Args:
        request: Incoming request, passed through to the template.
        models: Comma-separated SentenceTransformer model names or paths.
        chunk_size: Characters per sub-chunk; values below 1 are raised to 1.
        top_k: Chunks retrieved per query; values below 1 are raised to 1.

    Returns:
        The rendered ``benchmark_dashboard.html`` response. ``message`` lists
        models that could not be evaluated, if any.
    """
    data = _load_feedback()
    if not data:
        return templates.TemplateResponse(
            "benchmark_dashboard.html",
            {
                "request": request,
                "results": [],
                "message": "⚠️ No feedback data available for benchmarking.",
            },
        )
    # A non-positive chunk size is an invalid range() step and a non-positive
    # top_k divides by zero in the precision formula.
    chunk_size = max(1, chunk_size)
    top_k = max(1, top_k)
    queries = [f["question"] for f in data]
    answers = [f["answer"] for f in data]
    model_names = [m.strip() for m in models.split(",") if m.strip()]

    # Other routes in this file reach repository-root files via BASE_DIR/..,
    # so look for the sample PDF there as well as directly under BASE_DIR.
    sample_candidates = [
        os.path.join(BASE_DIR, "samples", "vdoc_rag_test.pdf"),
        os.path.join(BASE_DIR, "..", "samples", "vdoc_rag_test.pdf"),
    ]
    pdf_path = next((p for p in sample_candidates if os.path.exists(p)), sample_candidates[0])
    try:
        raw_chunks = [d["text"] for d in process_pdf(pdf_path)]
    except Exception as e:
        print("[WARN] Could not process sample PDF for benchmark, falling back to small corpus:", e)
        raw_chunks = []

    def split_chunks(texts):
        """Cut each text into consecutive pieces of at most chunk_size characters."""
        return [t[i : i + chunk_size] for t in texts if t for i in range(0, len(t), chunk_size)]

    fallback_corpus = [
        "Yearly sales have been increasing steadily from 2018 to 2024, with a notable jump in 2021.",
        "Charlie achieved the highest score in the table with 98 points.",
        "The event will be held on November 20, 2025 at the downtown auditorium.",
    ]
    # Also fall back when the PDF yielded no text, so the corpus is never empty.
    chunks = split_chunks(raw_chunks) or split_chunks(fallback_corpus)

    # These depend only on the corpus and the feedback, not on the model,
    # so compute them once instead of once per model and query.
    chunks_lower = [c.lower() for c in chunks]
    needles = [ans.lower() for ans in answers]
    relevant_totals = [sum(1 for c in chunks_lower if needle in c) for needle in needles]

    results = []
    failed_models = []
    for model_name in model_names:
        try:
            print(f"🧠 Evaluating {model_name}...")
            model = SentenceTransformer(model_name)
            chunk_embeddings = model.encode(chunks, normalize_embeddings=True, show_progress_bar=False)
        except Exception as e:
            print(f"[ERROR] Failed to load model {model_name}:", e)
            failed_models.append(model_name)
            continue
        precision_scores, recall_scores, mrr_scores = [], [], []
        for q, needle, total_relevant in zip(queries, needles, relevant_totals):
            qvec = model.encode([q], normalize_embeddings=True)
            sims = cosine_similarity(qvec, chunk_embeddings)[0]
            top_idx = np.argsort(sims)[::-1][:top_k]
            relevant = [1 if needle in chunks_lower[i] else 0 for i in top_idx]
            hits = sum(relevant)
            precision_scores.append(hits / top_k)
            recall_scores.append(hits / max(1, total_relevant))
            # Reciprocal rank of the first relevant chunk, 0 when none was retrieved.
            mrr_scores.append(next((1 / rank for rank, rel in enumerate(relevant, start=1) if rel), 0))
        results.append({
            "model": model_name,
            "precision": round(float(np.mean(precision_scores)), 3),
            "recall": round(float(np.mean(recall_scores)), 3),
            "mrr": round(float(np.mean(mrr_scores)), 3),
        })
    df = pd.DataFrame(results)
    print(df)

    def make_plot(metric):
        """Render one metric as a horizontal bar chart and return it as a PNG data URI."""
        fig = plt.figure(figsize=(6, 4))
        plt.barh(df["model"], df[metric], color="skyblue")
        plt.title(f"{metric.upper()} Comparison")
        plt.xlabel(metric.upper())
        plt.tight_layout()
        buf = BytesIO()
        plt.savefig(buf, format="png")
        plt.close(fig)
        img_base64 = base64.b64encode(buf.getvalue()).decode("utf-8")
        return f"data:image/png;base64,{img_base64}"

    plot_precision = make_plot("precision") if not df.empty else None
    plot_recall = make_plot("recall") if not df.empty else None
    plot_mrr = make_plot("mrr") if not df.empty else None
    message = f"⚠️ Could not evaluate: {', '.join(failed_models)}" if failed_models else None
    return templates.TemplateResponse(
        "benchmark_dashboard.html",
        {
            "request": request,
            "results": results,
            "plot_precision": plot_precision,
            "plot_recall": plot_recall,
            "plot_mrr": plot_mrr,
            "message": message,
        },
    )
# ---------------------------------------------------------
# Script entry point
# ---------------------------------------------------------
if __name__ == "__main__":
    # Serve on all interfaces, port 8000, with auto-reload enabled.
    uvicorn.run(
        "app.main:app",
        host="0.0.0.0",
        port=8000,
        reload=True,
    )

app/reader.py ADDED Viewed

	@@ -0,0 +1,145 @@

+import os
+from typing import List, Dict
+from dotenv import dotenv_values
+from transformers import pipeline
# --- Settings come from the .env file first, then from the process environment (cloud deployment) ---
env_vars = dotenv_values(".env")  # dict parsed from the .env file


def get_env(key, default=None):
    """Return the first non-empty value for *key* from .env, then os.environ, else *default*."""
    for source in (env_vars, os.environ):
        value = source.get(key)
        if value:
            return value
    return default
# --- Optional Gemini setup: import the google-genai SDK and build a client ---
# All three names are pre-bound so the rest of the module can test them
# against None instead of risking a NameError.
genai = None
types = None
_gemini_client = None
_api_key = get_env("GEMINI_API_KEY")
try:
    from google import genai
    from google.genai import types
    # Initialize client - check both .env and system env for cloud deployment
    if _api_key:
        _gemini_client = genai.Client(api_key=_api_key)
except ImportError:
    # SDK missing or only partly importable: treat it as unavailable as a
    # whole, so `genai` is never set while `types` is not.
    genai = None
    types = None
except Exception as e:
    print(f"Warning: Failed to initialize Gemini client. Check API key/configuration. Error: {e}")
class LLMReader:
    """
    Answer questions over retrieved context with Google Gemini, or with a
    local ``distilgpt2`` text-generation pipeline as fallback.

    The local pipeline is used when "local" is requested, when the provider
    name is unknown, when no GEMINI_API_KEY is configured, or when the Gemini
    client could not be created. If a key is configured but the google-genai
    SDK is not importable, construction raises ImportError.
    """

    def __init__(self, provider: str = "gemini"):
        """Resolve the effective provider and load the local model if needed.

        Args:
            provider: "gemini" or "local" (case-insensitive). Any other value
                is replaced by "local" with a warning.

        Raises:
            ImportError: "gemini" was requested with an API key present but
                the google-genai SDK is not installed.
        """
        self.provider = provider.lower()
        # Load from .env or system environment (for cloud deployment)
        self.model = get_env("VDOCRAG_LLM_MODEL", "gemini-2.5-flash")
        self.api_key = get_env("GEMINI_API_KEY")
        self.client = _gemini_client
        self.local_pipeline = None
        print("=" * 50)
        print("LLMReader Init: Loading GEMINI_API_KEY...")
        if self.api_key:
            # Report presence only: key fragments must never reach the logs.
            print("LLMReader Init: SUCCESS. GEMINI_API_KEY is set.")
        else:
            print("LLMReader Init: FAILED. GEMINI_API_KEY not found.")
        print("=" * 50)
        # Normalise unknown names first so the branches below see only
        # "gemini" or "local".
        if self.provider not in ("gemini", "local"):
            print(f"⚠️ Unknown provider '{self.provider}', defaulting to local.")
            self.provider = "local"
        if self.provider == "gemini":
            # Check for API key first - if missing, fall back to local
            if not self.api_key:
                print("⚠️ No GEMINI_API_KEY found, switching to local model.")
                self.provider = "local"
            elif genai is None:
                raise ImportError("Please install the modern Google GenAI SDK: `pip install google-genai`.")
            elif self.client is None:
                print("⚠️ Failed to initialize Gemini client, switching to local model.")
                self.provider = "local"
        if self.provider == "local":
            print("Loading local model: distilgpt2...")
            self.local_pipeline = pipeline("text-generation", model="distilgpt2")

    # --------------------------
    # Gemini call (modern SDK)
    # --------------------------
    def _call_gemini(self, query: str, context: str) -> str:
        """Ask Gemini to answer *query* from *context*.

        Returns the stripped response text. Failures are returned as a
        "[Gemini API Error] ..." string instead of being raised.
        """
        system_prompt = (
            "You are a precise data analysis assistant. "
            "Given the provided CONTEXT, answer the user's QUESTION accurately. "
            "If calculations are needed, perform them. "
            "Only respond with the final answer and no additional commentary or explanation."
        )
        user_content = f"CONTEXT:\n---\n{context}\n---\nQUESTION: {query}"
        try:
            config = types.GenerateContentConfig(
                system_instruction=system_prompt,
                temperature=0.1
            )
            response = self.client.models.generate_content(
                model=self.model,
                contents=user_content,
                config=config
            )
            text = response.text
            if not text:
                # Report a response without text explicitly instead of
                # failing on `.strip()` of a missing value.
                return "[Gemini API Error] Empty response: the model returned no text"
            return text.strip()
        except Exception as e:
            return f"[Gemini API Error] {type(e).__name__}: {e}"

    # --------------------------
    # Local fallback
    # --------------------------
    def _call_local(self, query: str, context: str) -> str:
        """Generate an answer with the local pipeline.

        Returns the text generated after the prompt, or a fixed failure
        message when nothing new was generated or the context was echoed back.
        """
        prompt = (
            f"CONTEXT:\n{context}\n\n"
            "Based on the context, answer the following question:\n"
            f"QUESTION: {query}\n"
            "ANSWER:"
        )
        result = self.local_pipeline(
            prompt,
            max_new_tokens=100,
            do_sample=True,
            truncation=True
        )
        generated_text = result[0]["generated_text"]
        answer = generated_text[len(prompt):].strip()
        # An empty string is a substring of everything, so only test for an
        # echoed context when there is a context to echo.
        echoed_context = bool(context) and context in answer
        if not answer or echoed_context:
            return "[Local model failed to generate a new answer and may have repeated the context]"
        return answer

    # --------------------------
    # Main answer method
    # --------------------------
    def answer_question(self, query: str, context: str, sources: List[Dict]) -> Dict:
        """Answer *query* from *context* and attach provenance for *sources*.

        Args:
            query: The user's question.
            context: Text handed to the model as grounding.
            sources: Retrieved hits. ``metadata["page"]``, ``text`` and
                ``score`` are read from each; missing fields become None,
                an empty string and 0 respectively.

        Returns:
            ``{"text": <answer>, "sources": [{"page", "text", "score"}, ...]}``
            with each source text cut to its first 200 characters.
        """
        if self.provider == "gemini":
            answer_text = self._call_gemini(query, context)
        elif self.provider == "local":
            answer_text = self._call_local(query, context)
        else:
            answer_text = f"[Error: Unknown provider '{self.provider}']"
        provenance = [
            {
                "page": (s.get("metadata") or {}).get("page"),
                "text": (s.get("text") or "")[:200],
                "score": s.get("score", 0),
            }
            for s in sources
        ]
        return {"text": answer_text, "sources": provenance}

app/tables.py ADDED Viewed

	@@ -0,0 +1,58 @@

+# app/tables.py
+import os
+import uuid
+import pdfplumber
+import pandas as pd
+from typing import List, Dict
TABLES_DIR = os.environ.get('VDOCRAG_TABLES_DIR', '/tmp/vdoc_tables')
os.makedirs(TABLES_DIR, exist_ok=True)


def extract_tables_from_pdf(pdf_path: str) -> List[Dict]:
    """
    Extract tables using pdfplumber and save each as CSV.

    Args:
        pdf_path: Path of the PDF to scan.

    Returns:
        One metadata dict per extracted table:
        [{ 'csv_path': str, 'page': int (1-based), 'table_index': int (0-based
        within the page), 'summary_text': str, 'rows': int,
        'bbox': (x0, top, x1, bottom) or None }]
    """
    results = []
    with pdfplumber.open(pdf_path) as pdf:
        for pno, page in enumerate(pdf.pages, start=1):
            try:
                tables = page.extract_tables()
            except Exception as e:
                # Best effort: a page that cannot be parsed yields no tables.
                print(f"[WARN] Table extraction failed on page {pno}: {e}")
                tables = []
            if not tables:
                continue
            # Locate the tables once per page; the bbox for each extracted
            # table is looked up by its position in this list.
            try:
                found_tables = page.find_tables()
            except Exception as e:
                print(f"[WARN] Could not locate table boxes on page {pno}: {e}")
                found_tables = []
            for tidx, table in enumerate(tables):
                # Convert to DataFrame, using the first row as header when
                # there is more than one row.
                try:
                    df = pd.DataFrame(table[1:], columns=table[0]) if len(table) > 1 else pd.DataFrame(table)
                except Exception:
                    df = pd.DataFrame(table)
                fname = f"table_{uuid.uuid4().hex}_p{pno}_t{tidx}.csv"
                csv_path = os.path.join(TABLES_DIR, fname)
                # Save CSV
                try:
                    df.to_csv(csv_path, index=False)
                except Exception:
                    df.to_csv(csv_path, index=False, encoding='utf-8', errors='ignore')
                # Bounding box of this table. tidx is 0-based, so it indexes
                # found_tables directly; subtracting 1 would pair the first
                # table with the last table's box.
                table_bbox = found_tables[tidx].bbox if tidx < len(found_tables) else None
                # create a short textual summary: columns and first N rows
                cols = list(df.columns) if len(df.columns) > 0 else []
                top_rows = df.head(5).to_dict(orient='records')
                summary = f"Table (page {pno}) with columns: {cols}. First rows: {top_rows}"
                results.append({
                    'csv_path': csv_path,
                    'page': pno,
                    'table_index': tidx,
                    'summary_text': summary,
                    'rows': len(df),
                    'bbox': table_bbox
                })
    return results

app/templates/benchmark_dashboard.html ADDED Viewed

	@@ -0,0 +1,81 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8">
+  <title>📊 Embedding Benchmark Dashboard</title>
+  <script src="https://cdn.tailwindcss.com"></script>
+</head>
+<body class="bg-blue-50 text-blue-800 min-h-screen flex flex-col items-center py-8">
+  <div class="bg-white shadow-lg rounded-xl p-8 w-full max-w-5xl">
+    <h1 class="text-2xl font-bold text-center mb-6">📊 Embedding Model Benchmark</h1>
+    <form action="/run_benchmark" method="post" class="space-y-3 mb-6">
+      <label class="block text-sm font-semibold">Enter model names (comma-separated)</label>
+      <input type="text" name="models"
+             value="all-MiniLM-L6-v2, multi-qa-MiniLM-L6-cos-v1, models/vdoc_feedback_tuned/latest"
+             class="border rounded w-full p-2 focus:outline-none focus:ring-2 focus:ring-blue-400"
+             required>
+      <div class="flex space-x-4">
+        <div class="flex-1">
+          <label class="block text-sm font-semibold">Chunk Size</label>
+          <input type="number" name="chunk_size" value="200" class="border p-2 rounded w-full">
+        </div>
+        <div class="flex-1">
+          <label class="block text-sm font-semibold">Top-K</label>
+          <input type="number" name="top_k" value="5" class="border p-2 rounded w-full">
+        </div>
+      </div>
+      <button type="submit"
+              class="bg-blue-600 text-white px-6 py-2 rounded hover:bg-blue-700 w-full mt-3">
+        🚀 Run Benchmark
+      </button>
+    </form>
+    {% if results %}
+    <h2 class="text-xl font-semibold mb-4">📈 Results</h2>
+    <table class="w-full border border-gray-300 text-sm mb-6">
+      <thead class="bg-blue-100 text-blue-800">
+        <tr>
+          <th class="border px-3 py-1 text-left">Model</th>
+          <th class="border px-3 py-1">Precision</th>
+          <th class="border px-3 py-1">Recall</th>
+          <th class="border px-3 py-1">MRR</th>
+        </tr>
+      </thead>
+      <tbody>
+        {% for r in results %}
+        <tr class="Border-b hover:bg-blue-50">
+          <td class="px-3 py-1">{{ r.model }}</td>
+          <td class="px-3 py-1 text-center">{{ r.precision }}</td>
+          <td class="px-3 py-1 text-center">{{ r.recall }}</td>
+          <td class="px-3 py-1 text-center">{{ r.mrr }}</td>
+        </tr>
+        {% endfor %}
+      </tbody>
+    </table>
+    <div class="grid grid-cols-1 md:grid-cols-3 gap-4 mb-6">
+      {% if plot_precision %}
+        <img src="{{ plot_precision }}" class="rounded shadow">
+      {% endif %}
+      {% if plot_recall %}
+        <img src="{{ plot_recall }}" class="rounded shadow">
+      {% endif %}
+      {% if plot_mrr %}
+        <img src="{{ plot_mrr }}" class="rounded shadow">
+      {% endif %}
+    </div>
+    {% endif %}
+    {% if message %}
+      <p class="text-center text-red-700 font-semibold">{{ message }}</p>
+    {% endif %}
+    <div class="text-center mt-8">
+      <a href="/" class="text-blue-600 hover:underline">← Back to Main Interface</a>
+    </div>
+  </div>
+</body>
+</html>

app/templates/feedback_dashboard.html ADDED Viewed

	@@ -0,0 +1,64 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8">
+  <title>🧠 VDoc Feedback Dashboard</title>
+  <script src="https://cdn.tailwindcss.com"></script>
+</head>
+<body class="bg-blue-50 text-blue-800 min-h-screen flex flex-col items-center py-8">
+  <div class="bg-white shadow-lg rounded-xl p-8 w-full max-w-4xl">
+    <h1 class="text-2xl font-bold mb-4 text-center">🧠 Feedback Dashboard</h1>
+    <p class="text-center text-gray-600 mb-4">{{ summary }}</p>
+    <div class="grid grid-cols-3 gap-4 text-center mb-6">
+      <div class="bg-green-100 p-3 rounded-lg">
+        <p class="text-xl font-bold text-green-700">{{ correct }}</p>
+        <p class="text-sm text-green-800">Correct</p>
+      </div>
+      <div class="bg-red-100 p-3 rounded-lg">
+        <p class="text-xl font-bold text-red-700">{{ incorrect }}</p>
+        <p class="text-sm text-red-800">Incorrect</p>
+      </div>
+      <div class="bg-blue-100 p-3 rounded-lg">
+        <p class="text-xl font-bold text-blue-700">{{ total }}</p>
+        <p class="text-sm text-blue-800">Total Feedback</p>
+      </div>
+    </div>
+    <!-- Train model -->
+    <form action="/train_feedback_model" method="post" class="text-center mb-8">
+      <button type="submit"
+              class="bg-blue-600 text-white px-6 py-2 rounded hover:bg-blue-700">
+        🚀 Train Model from Feedback
+      </button>
+    </form>
+    {% if message %}
+      <div class="bg-gray-50 border-l-4 border-blue-400 p-3 mb-6">
+        <p class="text-gray-700 font-medium">{{ message }}</p>
+        <pre class="text-xs text-gray-600 mt-2 whitespace-pre-wrap">{{ train_output }}</pre>
+      </div>
+    {% endif %}
+    <!-- Feedback log -->
+    <h2 class="text-xl font-semibold mb-3">📜 Recent Feedback</h2>
+    <div class="max-h-96 overflow-y-auto border rounded p-3 bg-gray-50">
+      {% for fb in feedback_data %}
+        <div class="mb-3 border-b pb-2">
+          <p class="text-sm"><strong>🕓</strong> {{ fb.timestamp }}</p>
+          <p class="text-sm"><strong>❓</strong> {{ fb.question }}</p>
+          <p class="text-sm"><strong>💬</strong> {{ fb.answer }}</p>
+          <p class="text-sm">
+            <strong>✅</strong> {{ fb.correctness|capitalize }}
+          </p>
+        </div>
+      {% endfor %}
+    </div>
+    <div class="text-center mt-8">
+      <a href="/" class="text-blue-600 hover:underline">← Back to Main Interface</a>
+    </div>
+  </div>
+</body>
+</html>

app/templates/index.html ADDED Viewed

	@@ -0,0 +1,133 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8">
+  <title>VDoc RAG - Web UI</title>
+  <script src="https://cdn.tailwindcss.com"></script>
+</head>
+<body class="bg-blue-100 text-blue-800 min-h-screen flex flex-col items-center justify-center">
+  <div class="bg-white shadow-lg rounded-xl p-8 w-full max-w-2xl">
+    <h1 class="text-2xl font-bold text-center mb-6 text-blue-800">📄 VDoc RAG Web Interface</h1>
+    <!-- Upload Form -->
+    <form action="/upload" method="post" enctype="multipart/form-data" class="flex flex-col items-center space-y-3 mb-6">
+      <input type="file" name="file" accept=".pdf,.png,.jpg,.jpeg" required class="border p-2 rounded w-full">
+      <button type="submit" class="bg-blue-600 text-white px-4 py-2 rounded hover:bg-blue-700">Upload & Index</button>
+    </form>
+    <div class="mt-6 text-center">
+      <a href="/feedback_dashboard"
+         class="text-blue-700 font-semibold hover:underline">
+         🧠 Open Feedback Dashboard
+      </a>
+    </div>
+    <div class="mt-3 text-center">
+      <a href="/benchmark_dashboard"
+         class="text-blue-700 font-semibold hover:underline">
+         📊 Open Benchmark Dashboard
+      </a>
+    </div>
+    <!-- Cache Clear Button -->
+    <form action="/clear_cache" method="post" class="mb-6">
+      <button type="submit"
+              class="bg-red-600 text-white px-4 py-2 rounded hover:bg-red-700 w-full">
+        🧹 Clear Cache
+      </button>
+    </form>
+    <!-- Clear Persistent Index Button -->
+    <form action="/clear_index" method="post" class="mb-6">
+      <button type="submit"
+              class="bg-orange-600 text-white px-4 py-2 rounded hover:bg-orange-700 w-full">
+        🗑️ Clear Persistent Index
+      </button>
+    </form>
+    {% if uploaded %}
+      <p class="text-green-600 font-semibold mb-4">Uploaded files: {{ uploaded|join(', ') }}</p>
+    {% endif %}
+    <!-- Ask Question -->
+    <form action="/ask" method="post" class="space-y-3 mb-4">
+      <input type="text" name="question" placeholder="Ask a question about your document..." required
+             class="border rounded w-full p-2 focus:outline-none focus:ring-2 focus:ring-blue-400">
+      <button type="submit" class="bg-blue-600 text-white px-4 py-2 rounded hover:bg-blue-700 w-full">Ask</button>
+    </form>
+    {% if chunks %}
+    <div style="margin-top: 2em;">
+      <h3>🔍 Retrieved Chunks (Used in Prompt)</h3>
+      <ul>
+        {% for c in chunks %}
+          <li class="border-b border-gray-200 py-2">
+            <strong>[{{ c.index }}]</strong>
+            (Page: {{ c.page }}, BBox: {{ c.bbox }})<br>
+            <code>{{ c.text }}</code><br>
+            <span class="text-sm text-gray-500">🔹 Confidence: {{ c.confidence }}%</span>
+          </li>
+        {% endfor %}
+      </ul>
+    </div>
+    {% endif %}
+    <!-- Answer Section -->
+    {% if answer %}
+      <div class="bg-blue-50 border rounded-lg p-4 mt-4">
+        <h2 class="text-lg font-semibold text-blue-700 mb-2">Answer:</h2>
+        <p>{{ answer }}</p>
+        {% if sources %}
+          <h3 class="font-semibold mt-3">Sources:</h3>
+          <ul class="list-disc list-inside text-sm text-blue-700">
+            {% for s in sources %}
+              <li>Page {{ s.page }} → {{ s.text[:100] }}...</li>
+            {% endfor %}
+          </ul>
+        {% endif %}
+        <!-- Feedback Section -->
+        <form action="/feedback" method="post" class="mt-3 flex space-x-2">
+          <input type="hidden" name="question" value="{{ question }}">
+          <input type="hidden" name="answer" value="{{ answer }}">
+          <button type="submit" name="correctness" value="correct"
+                  class="bg-green-600 text-white px-3 py-1 rounded hover:bg-green-700">
+            ✅ Correct
+          </button>
+          <button type="submit" name="correctness" value="incorrect"
+                  class="bg-red-600 text-white px-3 py-1 rounded hover:bg-red-700">
+            ❌ Incorrect
+          </button>
+        </form>
+      </div>
+      <div id="highlight-section">
+        {% if highlight_images %}
+          <h3>📄 Relevant PDF Pages:</h3>
+          <div id="highlight-gallery">
+            {% for img in highlight_images %}
+              <img src="{{ img }}?v={{ loop.index }}" class="highlight-img"
+                   style="max-width:80%; margin:10px; border:3px solid red;" />
+            {% endfor %}
+          </div>
+        {% endif %}
+      </div>
+      {% if confidence_avg is defined %}
+        <p class="text-sm text-gray-600 mt-2">🧠 Average confidence: {{ confidence_avg }}%</p>
+      {% endif %}
+      <script>
+        // Clear old images before new ones are inserted
+        document.addEventListener("DOMContentLoaded", function() {
+          const form = document.querySelector("form[action='/ask']");
+          if (form) {
+            form.addEventListener("submit", () => {
+              const gallery = document.getElementById("highlight-gallery");
+              if (gallery) gallery.innerHTML = ""; // remove old images
+            });
+          }
+        });
+      </script>
+    {% endif %}
+  </div>
+</body>
+</html>

app/utils.py ADDED Viewed

	@@ -0,0 +1,3 @@

def bbox_to_dict(bbox):
    """Turn a 4-item (x0, y0, x1, y1) box into a dict of int coordinates."""
    # Unpack first so a box of the wrong length still raises ValueError.
    x0, y0, x1, y1 = bbox
    corner_names = ('x0', 'y0', 'x1', 'y1')
    return {name: int(value) for name, value in zip(corner_names, (x0, y0, x1, y1))}

app/visual_highlight.py ADDED Viewed

	@@ -0,0 +1,138 @@

+import os
+import uuid
+import json
+import ast
+from pdf2image import convert_from_path
+from PIL import Image, ImageDraw
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
def load_calibration(config_path="highlight_calibration.json"):
    """Load highlight calibration values from a JSON file.

    Args:
        config_path: Path to the calibration JSON. The default is a relative
            path, so it is resolved against the current working directory.

    Returns:
        The parsed JSON object, or the neutral defaults (zero offsets, unit
        scales) when the file is missing, unreadable, not valid JSON, or does
        not contain a JSON object.
    """
    defaults = {"x_offset": 0, "x_scale": 1.0, "y_offset": 0, "y_scale": 1.0}
    if not os.path.exists(config_path):
        print("⚠️ No calibration file found. Using defaults.")
        return defaults
    try:
        with open(config_path, "r", encoding="utf-8") as f:
            calib = json.load(f)
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        # A broken calibration file falls back to defaults instead of raising.
        print(f"⚠️ Could not read calibration file {config_path}: {e}. Using defaults.")
        return defaults
    if not isinstance(calib, dict):
        # Callers read values with .get(), which needs a JSON object.
        print(f"⚠️ Calibration file {config_path} does not contain a JSON object. Using defaults.")
        return defaults
    print(f"✅ Loaded calibration: {calib}")
    return calib
def render_highlighted_pages(pdf_path, hits, output_dir=None, dpi=150, pad=100):
    """
    Render PDF pages as images and highlight bounding boxes with calibration applied.

    Only the first hit is used. Its page is rasterised, the calibrated bbox is
    drawn as a translucent red rectangle, and the image is cropped around the
    highlighted area with ``pad`` pixels of margin on each side.

    Args:
        pdf_path: Path of the PDF to render.
        hits: Retrieval hits; ``metadata["page"]`` (1-based) and
            ``metadata["bbox"]`` (4 numbers, or their string form) are read.
        output_dir: Where PNGs are written; defaults to ``highlighted`` next to
            this module. Every existing file in this directory is deleted first.
        dpi: Rasterisation resolution passed to pdf2image.
        pad: Margin in pixels kept around the highlighted region when cropping.

    Returns:
        Paths of the PNG files written, one per rendered page.
    """
    if output_dir is None:
        output_dir = os.path.join(BASE_DIR, "highlighted")
    os.makedirs(output_dir, exist_ok=True)
    calib = load_calibration()
    X_OFFSET = calib.get("x_offset", 0)
    X_SCALE  = calib.get("x_scale", 1.0)
    Y_OFFSET = calib.get("y_offset", 0)
    Y_SCALE  = calib.get("y_scale", 1.0)
    # Clean previous outputs (best effort)
    for old in os.listdir(output_dir):
        try:
            os.remove(os.path.join(output_dir, old))
        except Exception:
            pass
    hits = hits[:1]
    # Keep only usable 1-based page numbers. A missing page would fail on the
    # arithmetic below and 0 or a negative number would address the wrong page.
    pages_to_render = sorted({
        page
        for page in ((h.get("metadata") or {}).get("page") for h in hits)
        if isinstance(page, int) and page >= 1
    })
    result_paths = []
    for page_num in pages_to_render:
        # Rasterise just this page instead of the whole document.
        rendered = convert_from_path(pdf_path, dpi=dpi, first_page=page_num, last_page=page_num)
        if not rendered:
            print(f"[WARN] Page {page_num} could not be rendered from {pdf_path}; skipping")
            continue
        img = rendered[0].convert("RGBA")
        w_img, h_img = img.size
        overlay = Image.new("RGBA", img.size, (255, 255, 255, 0))
        draw = ImageDraw.Draw(overlay)
        page_bboxes = []
        for h in hits:
            meta = h.get("metadata") or {}
            if meta.get("page") != page_num:
                continue
            bbox = meta.get("bbox")
            # Debug raw bbox
            print(f"[DEBUG] page {page_num} raw bbox type: {type(bbox)} value: {bbox}")
            # Safe parsing: accept list/tuple or stringified list
            try:
                if isinstance(bbox, str):
                    bbox = ast.literal_eval(bbox)
                if not bbox or not isinstance(bbox, (list, tuple)) or len(bbox) != 4:
                    print(f"[WARN] Invalid bbox for page {page_num}: {bbox}")
                    continue
                # Apply calibration
                x0, y0, x1, y1 = [float(v) for v in bbox]
                x0 = x0 * X_SCALE + X_OFFSET
                x1 = x1 * X_SCALE + X_OFFSET
                y0 = y0 * Y_SCALE + Y_OFFSET
                y1 = y1 * Y_SCALE + Y_OFFSET
            except Exception as e:
                print(f"[ERROR] Failed to parse bbox for page {page_num}: {bbox} -> {e}")
                continue
            # Normalise corner order and clamp to the image bounds.
            left, top = max(0, min(x0, x1)), max(0, min(y0, y1))
            right, bottom = min(w_img, max(x0, x1)), min(h_img, max(y0, y1))
            if right <= left or bottom <= top:
                continue
            page_bboxes.append((left, top, right, bottom))
            draw.rectangle(
                [left, top, right, bottom],
                outline=(255, 0, 0),
                width=4,
                fill=(255, 0, 0, 100)
            )
        # Merge highlights with image
        highlighted = Image.alpha_composite(img, overlay)
        # --- Crop around highlighted region (+pad px on each side) ---
        if page_bboxes:
            min_x = min(b[0] for b in page_bboxes)
            min_y = min(b[1] for b in page_bboxes)
            max_x = max(b[2] for b in page_bboxes)
            max_y = max(b[3] for b in page_bboxes)
            crop_box = (
                max(0, int(min_x - pad)),
                max(0, int(min_y - pad)),
                int(min(max_x + pad, w_img)),
                int(min(max_y + pad, h_img)),
            )
            cropped = highlighted.crop(crop_box)
        else:
            cropped = highlighted  # fallback if no bbox
        # Log how many boxes were drawn
        print(f"✅ Drew {len(page_bboxes)} boxes on page {page_num}")
        out_path = os.path.join(output_dir, f"highlight_page{page_num}_{uuid.uuid4().hex}.png")
        cropped.convert("RGB").save(out_path)
        result_paths.append(out_path)
        print(f"✅ Highlighted and cropped page {page_num}: {out_path}")
    return result_paths
# Demo: highlight one hard-coded text box on page 2 of the sample PDF
if __name__ == "__main__":
    demo_hit = {"metadata": {"page": 2, "bbox": [87, 222, 592, 250], "type": "text"}}
    render_highlighted_pages("samples/vdoc_rag_test.pdf", [demo_hit])

highlight_calibration.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "x_offset": -33.0,
+  "x_scale": 1.0,
+  "y_offset": -65.0,
+  "y_scale": 1.02
+}

notebooks/evaluate_embeddings.ipynb ADDED Viewed

	@@ -0,0 +1,264 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "55b021d5",
+   "metadata": {},
+   "source": [
+    "# Embedding & Retrieval Evaluation\n",
+    "\n",
+    "This notebook benchmarks embedding models and chunk sizes for retrieval quality using your project's Chroma index and collected feedback as a small labeled set. Metrics: Precision@K, Recall@K, and MRR."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "18518993",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31mFailed to start the Kernel. \n",
+      "\u001b[1;31mPermissionError: [WinError 5] Access is denied: 'C:\\\\Users\\\\abhin\\\\.ipython\\\\profile_default\\\\security'. \n",
+      "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
+     ]
+    }
+   ],
+   "source": [
+    "# Standard imports\n",
+    "import os\n",
+    "import json\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from tqdm import tqdm\n",
+    "from sentence_transformers import SentenceTransformer\n",
+    "from sklearn.metrics.pairwise import cosine_similarity\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "# Project imports (uses your existing pipeline)\n",
+    "from app.feedback_manager import _load_feedback\n",
+    "from app.ingest import process_pdf\n",
+    "from app.embeddings import TextImageEmbedder\n",
+    "\n",
+    "# Config\n",
+    "BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))\n",
+    "PDF_PATH = os.path.join(BASE_DIR, \"samples\", \"vdoc_rag_test.pdf\")  # replace with a real sample PDF path\n",
+    "STORAGE_DIR = os.path.join(BASE_DIR, \"storage\", \"chroma_db\")\n",
+    "\n",
+    "MODELS_TO_TEST = [\n",
+    "    \"all-MiniLM-L6-v2\",\n",
+    "    \"multi-qa-MiniLM-L6-cos-v1\",\n",
+    "    \"paraphrase-MiniLM-L3-v2\",\n",
+    "    os.path.join(BASE_DIR, \"models\", \"vdoc_feedback_tuned\", \"latest\"),\n",
+    "]\n",
+    "CHUNK_SIZES = [200, 500, 800]  # in characters\n",
+    "TOP_K = 5\n",
+    "\n",
+    "print(\"Notebook configured. If the tuned model path does not exist, it will be skipped in runs.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "863ba97b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load feedback (if available)\n",
+    "feedback = _load_feedback()\n",
+    "print(f\"Loaded {len(feedback)} feedback entries.\")\n",
+    "if feedback:\n",
+    "    sample_queries = [f['question'] for f in feedback]\n",
+    "    sample_answers = [f['answer'] for f in feedback]\n",
+    "else:\n",
+    "    # fallback small test set\n",
+    "    sample_queries = [\n",
+    "        \"What is the trend in yearly sales?\",\n",
+    "        \"Who scored highest in the table?\",\n",
+    "        \"What is the event date?\",\n",
+    "    ]\n",
+    "    sample_answers = [\"increasing\", \"Charlie\", \"November 20, 2025\"]\n",
+    "\n",
+    "# Small helper to preview feedback structure\n",
+    "if feedback:\n",
+    "    display(pd.DataFrame(feedback)[['timestamp','question','answer','correctness']].tail(10))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ad0fcb6c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Helper: process the PDF into chunks (optional - heavy).\n",
+    "def load_chunks(pdf_path):\n",
+    "    if not os.path.exists(pdf_path):\n",
+    "        raise FileNotFoundError(f\"PDF not found: {pdf_path}\")\n",
+    "    print(\"Processing PDF into chunks (this may take a while)...\")\n",
+    "    docs = process_pdf(pdf_path)\n",
+    "    texts = [d['text'] for d in docs]\n",
+    "    return texts\n",
+    "\n",
+    "# Try to load sample chunks if available, otherwise create toy chunks from feedback answers\n",
+    "try:\n",
+    "    chunks = load_chunks(PDF_PATH)\n",
+    "    print(f\"Total chunks from PDF: {len(chunks)}\")\n",
+    "except Exception as e:\n",
+    "    print(\"Could not process PDF, falling back to feedback-derived tiny corpus:\", e)\n",
+    "    # fallback corpus built from sample answers/queries for quick runs\n",
+    "    chunks = [\n",
+    "        \"Yearly sales have been increasing steadily from 2018 to 2024, with a notable jump in 2021.\",\n",
+    "        \"Charlie achieved the highest score in the table with 98 points.\",\n",
+    "        \"The event will be held on November 20, 2025 at the downtown auditorium.\",\n",
+    "    ]\n",
+    "    print(f\"Using fallback chunks: {len(chunks)} items\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5c8b6ffe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Evaluation function (Precision@K, Recall@K, MRR)\n",
+    "def evaluate_model(model_name, chunks, queries, answers, chunk_size, top_k=TOP_K):\n",
+    "    print(f\"\\n🧠 Evaluating {model_name} (chunk size {chunk_size})\")\n",
+    "    # Skip model if path does not exist (for tuned model)\n",
+    "    if os.path.isabs(model_name) and not os.path.exists(model_name):\n",
+    "        print(f\"- Skipping (path not found): {model_name}\")\n",
+    "        return None\n",
+    "\n",
+    "    model = SentenceTransformer(model_name)\n",
+    "\n",
+    "    # Split chunks by size\n",
+    "    split_chunks = []\n",
+    "    for ch in chunks:\n",
+    "        for i in range(0, len(ch), chunk_size):\n",
+    "            split_chunks.append(ch[i:i+chunk_size])\n",
+    "    chunk_embeddings = model.encode(split_chunks, normalize_embeddings=True, show_progress_bar=False)\n",
+    "\n",
+    "    precision_scores, recall_scores, mrr_scores = [], [], []\n",
+    "\n",
+    "    # Precompute reference counts for recall denominator\n",
+    "    total_relevant_counts = []\n",
+    "    for ans in answers:\n",
+    "        total_relevant_counts.append(sum(1 for c in split_chunks if ans.lower() in c.lower()))\n",
+    "\n",
+    "    for q, ans in tqdm(list(zip(queries, answers)), total=len(queries), desc=f\"Evaluating {model_name}\"):\n",
+    "        qvec = model.encode([q], normalize_embeddings=True)\n",
+    "        sims = cosine_similarity(qvec, chunk_embeddings)[0]\n",
+    "        top_indices = np.argsort(sims)[::-1][:top_k]\n",
+    "        retrieved_chunks = [split_chunks[i] for i in top_indices]\n",
+    "\n",
+    "        relevant = [1 if ans.lower() in c.lower() else 0 for c in retrieved_chunks]\n",
+    "        precision = sum(relevant) / top_k\n",
+    "        recall = sum(relevant) / max(1, total_relevant_counts.pop(0))\n",
+    "        mrr = 0.0\n",
+    "        for rank, rel in enumerate(relevant, start=1):\n",
+    "            if rel == 1:\n",
+    "                mrr = 1.0 / rank\n",
+    "                break\n",
+    "\n",
+    "        precision_scores.append(precision)\n",
+    "        recall_scores.append(recall)\n",
+    "        mrr_scores.append(mrr)\n",
+    "\n",
+    "    return {\n",
+    "        \"model\": model_name,\n",
+    "        \"chunk_size\": chunk_size,\n",
+    "        \"precision\": float(np.mean(precision_scores)),\n",
+    "        \"recall\": float(np.mean(recall_scores)),\n",
+    "        \"mrr\": float(np.mean(mrr_scores)),\n",
+    "    }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ca934bfc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Run evaluation across models and chunk sizes\n",
+    "results = []\n",
+    "for model_name in MODELS_TO_TEST:\n",
+    "    for cs in CHUNK_SIZES:\n",
+    "        res = evaluate_model(model_name, chunks, sample_queries, sample_answers, cs)\n",
+    "        if res:\n",
+    "            results.append(res)\n",
+    "\n",
+    "df = pd.DataFrame(results)\n",
+    "if not df.empty:\n",
+    "    display(df)\n",
+    "else:\n",
+    "    print(\"No results to show (models may have been skipped).\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e6f75729",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Visualization\n",
+    "if not df.empty:\n",
+    "    plt.figure(figsize=(8,5))\n",
+    "    for m in df['model'].unique():\n",
+    "        subset = df[df['model'] == m]\n",
+    "        plt.plot(subset['chunk_size'], subset['precision'], marker='o', label=f\"{m} (Precision)\")\n",
+    "    plt.title('Precision@5 vs Chunk Size')\n",
+    "    plt.xlabel('Chunk Size (characters)')\n",
+    "    plt.ylabel('Precision@5')\n",
+    "    plt.legend()\n",
+    "    plt.grid(True)\n",
+    "    plt.show()\n",
+    "\n",
+    "    plt.figure(figsize=(8,5))\n",
+    "    for m in df['model'].unique():\n",
+    "        subset = df[df['model'] == m]\n",
+    "        plt.plot(subset['chunk_size'], subset['recall'], marker='s', label=f\"{m} (Recall)\")\n",
+    "    plt.title('Recall@5 vs Chunk Size')\n",
+    "    plt.xlabel('Chunk Size (characters)')\n",
+    "    plt.ylabel('Recall@5')\n",
+    "    plt.legend()\n",
+    "    plt.grid(True)\n",
+    "    plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "249d2857",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save results to CSV for reporting\n",
+    "output_csv = os.path.join(BASE_DIR, 'notebooks', 'embedding_benchmark_results.csv')\n",
+    "if not df.empty:\n",
+    "    df.to_csv(output_csv, index=False)\n",
+    "    print(f\"✅ Benchmark results saved to {output_csv}\")\n",
+    "else:\n",
+    "    print(\"No data to save.\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.13.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,22 @@

+fastapi
+uvicorn[standard]
+python-multipart
+pdf2image
+pdfplumber
+pytesseract
+Pillow
+sentence-transformers
+transformers
+torch
+chromadb
+numpy
+pandas
+aiofiles
+openai
+layoutparser
+opencv-python-headless
+matplotlib
+scikit-learn
+google-genai
+python-dotenv
+jinja2

samples/vdoc_rag_test.pdf ADDED Viewed

Binary file (52.1 kB). View file

test.py ADDED Viewed

	@@ -0,0 +1,123 @@

+import os
+import uuid
+import json
+from pdf2image import convert_from_path
+from PIL import Image, ImageDraw
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+def load_calibration(config_path="highlight_calibration.json"):
+    """Load calibration values from JSON or fallback to defaults."""
+    if os.path.exists(config_path):
+        with open(config_path, "r") as f:
+            calib = json.load(f)
+        print(f"✅ Loaded calibration: {calib}")
+        return calib
+    else:
+        print("⚠️ No calibration file found. Using defaults.")
+        return {"x_offset": 0, "x_scale": 1.0, "y_offset": 0, "y_scale": 1.0}
+def render_highlighted_pages(pdf_path, hits, output_dir=None, dpi=150):
+    """
+    Render PDF pages as images and highlight bounding boxes with calibration applied.
+    Crops the output image tightly around highlighted area (+20 px padding).
+    """
+    if output_dir is None:
+        output_dir = os.path.join(BASE_DIR, "highlighted")
+    os.makedirs(output_dir, exist_ok=True)
+    calib = load_calibration()
+    X_OFFSET = calib.get("x_offset", 0)
+    X_SCALE  = calib.get("x_scale", 1.0)
+    Y_OFFSET = calib.get("y_offset", 0)
+    Y_SCALE  = calib.get("y_scale", 1.0)
+    # Clean previous outputs
+    for old in os.listdir(output_dir):
+        try:
+            os.remove(os.path.join(output_dir, old))
+        except Exception:
+            pass
+    pages_to_render = sorted({h["metadata"]["page"] for h in hits})
+    pdf_images = convert_from_path(pdf_path, dpi=dpi)
+    result_paths = []
+    for page_num in pages_to_render:
+        page_index = page_num - 1
+        img = pdf_images[page_index].convert("RGBA")
+        w_img, h_img = img.size
+        overlay = Image.new("RGBA", img.size, (255, 255, 255, 0))
+        draw = ImageDraw.Draw(overlay)
+        page_bboxes = []
+        for h in hits:
+            meta = h["metadata"]
+            if meta["page"] != page_num:
+                continue
+            bbox = meta["bbox"]
+            if not bbox or len(bbox) != 4:
+                continue
+            # Apply calibration
+            x0, y0, x1, y1 = [float(v) for v in bbox]
+            x0 = x0 * X_SCALE + X_OFFSET
+            x1 = x1 * X_SCALE + X_OFFSET
+            y0 = y0 * Y_SCALE + Y_OFFSET
+            y1 = y1 * Y_SCALE + Y_OFFSET
+            left, top = max(0, min(x0, x1)), max(0, min(y0, y1))
+            right, bottom = min(w_img, max(x0, x1)), min(h_img, max(y0, y1))
+            if right <= left or bottom <= top:
+                continue
+            page_bboxes.append((left, top, right, bottom))
+            draw.rectangle(
+                [left, top, right, bottom],
+                outline=(255, 0, 0),
+                width=4,
+                fill=(255, 0, 0, 100)
+            )
+        # Merge highlights with image
+        highlighted = Image.alpha_composite(img, overlay)
+        # --- 🧭 Crop around highlighted region (+20px padding) ---
+        if page_bboxes:
+            min_x = min(b[0] for b in page_bboxes)
+            min_y = min(b[1] for b in page_bboxes)
+            max_x = max(b[2] for b in page_bboxes)
+            max_y = max(b[3] for b in page_bboxes)
+            pad = 100
+            crop_box = (
+                max(0, int(min_x - pad)),
+                max(0, int(min_y - pad)),
+                int(min(max_x + pad, w_img)),
+                int(min(max_y + pad, h_img)),
+            )
+            cropped = highlighted.crop(crop_box)
+        else:
+            cropped = highlighted  # fallback if no bbox
+        out_path = os.path.join(output_dir, f"highlight_page{page_num}_{uuid.uuid4().hex}.png")
+        cropped.convert("RGB").save(out_path)
+        result_paths.append(out_path)
+        print(f"✅ Highlighted and cropped page {page_num}: {out_path}")
+    return result_paths
+# Example usage
+if __name__ == "__main__":
+    hits = [
+        {"metadata": {"page": 1, "bbox": [87, 1926, 775, 1957], "type": "text"}},
+        {"metadata": {"page": 2, "bbox": [87, 222, 592, 250], "type": "text"}},
+    ]
+    render_highlighted_pages("samples/vdoc_rag_test.pdf", hits)

train_feedback_embeddings.py ADDED Viewed

	@@ -0,0 +1,104 @@

+"""
+train_feedback_embeddings.py
+Fine-tune the VDoc-RAG embedding model using stored user feedback.
+Place this file at the repository root and run:
+    python train_feedback_embeddings.py
+It will load feedback from `app/feedback.json`, prepare training pairs, fine-tune a
+SentenceTransformer model, and save checkpoints under `models/vdoc_feedback_tuned/`.
+"""
+import os
+import json
+from datetime import datetime
+from torch.utils.data import DataLoader
+try:
+    from sentence_transformers import SentenceTransformer, InputExample, losses
+except Exception as e:
+    raise ImportError("Please install sentence-transformers and torch to run this script: pip install sentence-transformers torch")
+# --- Paths ---
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+APP_DIR = os.path.join(BASE_DIR, "app")
+FEEDBACK_PATH = os.path.join(APP_DIR, "feedback.json")
+OUTPUT_DIR = os.path.join(BASE_DIR, "models", "vdoc_feedback_tuned")
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+# --- Step 1: Load Feedback ---
+if not os.path.exists(FEEDBACK_PATH):
+    raise FileNotFoundError(f"❌ No feedback.json found at {FEEDBACK_PATH}")
+with open(FEEDBACK_PATH, "r", encoding="utf-8") as f:
+    feedback = json.load(f)
+if not feedback:
+    raise ValueError("⚠️ feedback.json is empty — collect feedback first!")
+# --- Step 2: Prepare Training Data ---
+train_examples = []
+for fb in feedback:
+    question = fb.get("question", "").strip()
+    answer = fb.get("answer", "").strip()
+    correctness = (fb.get("correctness") or "").lower()
+    if not question or not answer:
+        continue
+    if correctness not in ("correct", "incorrect"):
+        continue
+    label = 1.0 if correctness == "correct" else 0.0
+    train_examples.append(InputExample(texts=[question, answer], label=label))
+if len(train_examples) < 5:
+    raise ValueError(f"⚠️ Too few feedback entries ({len(train_examples)}). Need at least 5 to fine-tune meaningfully.")
+print(f"✅ Loaded {len(train_examples)} feedback samples for training.")
+# --- Step 3: Load Base Model ---
+base_model = os.environ.get("VDOCRAG_FEEDBACK_BASE", "all-MiniLM-L6-v2")
+print(f"📦 Loading base model: {base_model}")
+model = SentenceTransformer(base_model)
+# --- Step 4: Create DataLoader and Loss ---
+train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)
+train_loss = losses.CosineSimilarityLoss(model)
+# --- Step 5: Train ---
+print("🚀 Starting fine-tuning...")
+model.fit(
+    train_objectives=[(train_dataloader, train_loss)],
+    epochs=1,
+    warmup_steps=10,
+    show_progress_bar=True,
+)
+# --- Step 6: Save Fine-tuned Model ---
+timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+save_path = os.path.join(OUTPUT_DIR, f"checkpoint_{timestamp}")
+os.makedirs(save_path, exist_ok=True)
+model.save(save_path)
+print(f"✅ Fine-tuned model saved at: {save_path}")
+# --- Step 7: Create "latest" symlink / pointer ---
+latest_path = os.path.join(OUTPUT_DIR, "latest")
+try:
+    if os.path.exists(latest_path):
+        if os.path.islink(latest_path):
+            os.unlink(latest_path)
+        else:
+            import shutil
+            shutil.rmtree(latest_path)
+    os.symlink(save_path, latest_path, target_is_directory=True)
+    print(f"🔗 Symlink created: {latest_path} → {save_path}")
+except Exception:
+    # On Windows, symlink may fail — copy instead
+    import shutil
+    if os.path.exists(latest_path):
+        shutil.rmtree(latest_path, ignore_errors=True)
+    shutil.copytree(save_path, latest_path)
+    print(f"📁 Copied model to {latest_path} (symlink not supported).")
+print("\n🎉 Training complete! Your VDoc-RAG can now use:")
+print(f"   models/vdoc_feedback_tuned/latest/")