Add RAG Document Summarizer application
- .dockerignore +69 -0
- Dockerfile +38 -0
- README.md +36 -7
- app/__init__.py +1 -0
- app/__pycache__/__init__.cpython-312.pyc +0 -0
- app/__pycache__/chunking.cpython-312.pyc +0 -0
- app/__pycache__/document_loader.cpython-312.pyc +0 -0
- app/__pycache__/main.cpython-312.pyc +0 -0
- app/__pycache__/summarizer.cpython-312.pyc +0 -0
- app/__pycache__/vector_store.cpython-312.pyc +0 -0
- app/chunking.py +33 -0
- app/document_loader.py +419 -0
- app/main.py +882 -0
- app/summarizer.py +403 -0
- app/vector_store.py +55 -0
- requirements.txt +22 -0
.dockerignore
ADDED
@@ -0,0 +1,69 @@
# Git
.git
.gitignore

# Python
__pycache__
*.pyc
*.pyo
*.pyd
.Python
env
pip-log.txt
pip-delete-this-directory.txt
.tox
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.log
.git
.mypy_cache
.pytest_cache
.hypothesis

# Virtual environments
venv/
env/
ENV/

# IDE
.vscode/
.idea/
*.swp
*.swo
*~

# OS
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db

# Project specific
uploaded_docs/*
!uploaded_docs/.gitkeep
offload/*
!offload/.gitkeep
*.zip
*.pdf
*.docx
*.pptx

# Docker
.dockerignore

# Documentation
README.md
IMPROVEMENTS.md
*.md

# Deployment scripts
deploy.*

pytesseract
Dockerfile
ADDED
@@ -0,0 +1,38 @@
# Use official Python image
FROM python:3.10-slim

# System dependencies for OCR and PDF processing
RUN apt-get update && \
    apt-get install -y tesseract-ocr poppler-utils curl unzip libgl1 libglib2.0-0 && \
    rm -rf /var/lib/apt/lists/*

# Set workdir
WORKDIR /app

# Copy application code
COPY app/ ./app
COPY requirements.txt .
RUN mkdir -p ./uploaded_docs

# Install Python dependencies
RUN pip install --upgrade pip && pip install -r requirements.txt

# Download and install ngrok
RUN curl -s https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.tgz | tar -xz -C /usr/local/bin

# Expose FastAPI and ngrok dashboard ports
EXPOSE 8000 4040

# Copy entrypoint script
COPY --from=busybox:latest /bin/sh /bin/sh
COPY --from=busybox:latest /bin/sleep /bin/sleep

# Entrypoint script to start both FastAPI and ngrok
COPY docker-entrypoint.sh /docker-entrypoint.sh
RUN chmod +x /docker-entrypoint.sh

# Set environment variables (ngrok authtoken and Mistral API key can be set at runtime)
ENV NGROK_AUTHTOKEN=""
ENV MISTRAL_API_KEY=""

ENTRYPOINT ["/docker-entrypoint.sh"]
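Note that docker-entrypoint.sh is copied by the Dockerfile but does not appear among the files added in this commit, so its behavior is an assumption. A minimal sketch of an equivalent entrypoint, written in Python for illustration: register the ngrok authtoken if one is set, open a tunnel to the FastAPI port, then run uvicorn in the foreground.

# Hypothetical stand-in for docker-entrypoint.sh (the actual script is not in
# this commit). Registers the ngrok authtoken if provided, opens a tunnel to
# port 8000, then serves the FastAPI app with uvicorn.
import os
import subprocess

token = os.environ.get("NGROK_AUTHTOKEN")
if token:
    subprocess.run(["ngrok", "config", "add-authtoken", token], check=True)
    subprocess.Popen(["ngrok", "http", "8000"])  # tunnel runs in the background

subprocess.run(
    ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"],
    check=True,
)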
README.md
CHANGED
@@ -1,12 +1,41 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: RAG Document Summarizer
+emoji: 📄
+colorFrom: blue
+colorTo: purple
 sdk: docker
 pinned: false
-license:
-short_description: AI document
+license: mit
+short_description: Advanced AI-powered document summarization and question answering using RAG
 ---
 
-
+# RAG Document Summarizer
+
+A modern, production-ready Retrieval-Augmented Generation (RAG) application for document summarization and question answering. Built with FastAPI, LangChain, and the Mistral AI API, this project enables advanced document processing, chunking, vector search, and AI-powered summarization and querying.
+
+## Features
+- **Document Upload:** Supports PDF, DOCX, PPTX, and TXT files
+- **OCR Support:** Extracts text from scanned PDFs using Tesseract OCR
+- **Chunking:** Splits documents into manageable chunks for efficient processing
+- **Vector Store:** Embeds and stores chunks for fast similarity search
+- **AI Summarization:** Uses the Mistral AI API for high-quality summaries and answers
+- **Modern UI:** Clean, responsive web interface
+
+## How to Use
+1. Upload your documents (PDF, DOCX, PPTX, TXT)
+2. The system automatically processes and chunks your documents
+3. Ask questions about your documents using the query interface
+4. Get AI-powered summaries and answers based on your document content
+
+## Technical Stack
+- **Backend:** FastAPI, Python 3.10+
+- **AI/ML:** LangChain, Mistral AI API, Sentence Transformers
+- **Vector Database:** ChromaDB
+- **Document Processing:** PyPDF2, pdfplumber, unstructured, pytesseract
+- **Frontend:** Modern HTML/CSS/JavaScript with Tailwind CSS
+
+## Environment Variables
+- `MISTRAL_API_KEY`: Required for AI-powered features (get one from [Mistral AI](https://mistral.ai/))
+
+## License
+MIT License
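Since the Space exposes plain HTTP endpoints, the upload flow can also be driven from a script rather than the web UI. A minimal sketch using requests against the /upload route defined in app/main.py; the base URL and file name are placeholders.

import requests

BASE_URL = "http://localhost:8000"  # placeholder; use your Space's URL

# Upload a document; the server loads, summarizes, chunks, and indexes it,
# then returns the summary and processing metadata as JSON.
with open("report.pdf", "rb") as f:  # hypothetical file
    resp = requests.post(f"{BASE_URL}/upload", files={"file": ("report.pdf", f)})
resp.raise_for_status()
info = resp.json()
print(info["classification"], "-", info["chunk_count"], "chunks")
print(info["summary"])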
app/__init__.py
ADDED
@@ -0,0 +1 @@
# This file makes the app directory a Python package
app/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (139 Bytes)
app/__pycache__/chunking.cpython-312.pyc
ADDED
Binary file (1.43 kB)
app/__pycache__/document_loader.cpython-312.pyc
ADDED
Binary file (19.8 kB)
app/__pycache__/main.cpython-312.pyc
ADDED
Binary file (41.1 kB)
app/__pycache__/summarizer.cpython-312.pyc
ADDED
Binary file (16.1 kB)
app/__pycache__/vector_store.cpython-312.pyc
ADDED
Binary file (3.08 kB)
app/chunking.py
ADDED
@@ -0,0 +1,33 @@
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

def chunk_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> list:
    """
    Split text into chunks using RecursiveCharacterTextSplitter

    Args:
        text: Text to split into chunks
        chunk_size: Size of each chunk
        chunk_overlap: Overlap between chunks

    Returns:
        List of text chunks
    """
    try:
        if not text or not text.strip():
            print("[WARNING] Empty or None text provided for chunking")
            return []

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )

        chunks = text_splitter.split_text(text)
        print(f"[INFO] Created {len(chunks)} chunks from text")
        return chunks
    except Exception as e:
        print(f"[ERROR] Text chunking failed: {e}")
        # Return the original text as a single chunk as fallback
        return [text] if text else []
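A quick usage sketch of chunk_text as it is called from app/main.py: with the default chunk_size of 1000 and chunk_overlap of 200, consecutive chunks share about 200 characters, so each new chunk advances roughly 800 characters through the text.

from app.chunking import chunk_text

text = "word " * 1000  # ~5000 characters of filler
chunks = chunk_text(text, chunk_size=1000, chunk_overlap=200)
# Each chunk is at most 1000 characters and overlaps the previous one by
# ~200, so expect on the order of 5000 / 800 ≈ 6 chunks here.
print(len(chunks), "chunks; first chunk is", len(chunks[0]), "characters")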
app/document_loader.py
ADDED
@@ -0,0 +1,419 @@
from langchain_community.document_loaders import PyPDFLoader, PDFPlumberLoader, UnstructuredPDFLoader, PyMuPDFLoader
from langchain_community.document_loaders import UnstructuredPowerPointLoader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from langchain_community.document_loaders import TextLoader
from langchain.schema import Document
import os
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
import numpy as np
import cv2
from pdf2image import convert_from_path
import tempfile
import shutil

class DocumentLoader:
    def __init__(self, file_path: str):
        self.file_path = file_path
        self.extension = os.path.splitext(file_path)[1].lower()

    def load(self):
        """Load documents with enhanced PDF processing for scanned documents"""
        try:
            if self.extension == ".pdf":
                return self._load_pdf_with_ocr()
            elif self.extension == ".pptx":
                return UnstructuredPowerPointLoader(self.file_path).load()
            elif self.extension == ".docx":
                return UnstructuredWordDocumentLoader(self.file_path).load()
            elif self.extension == ".txt":
                return TextLoader(self.file_path).load()
            else:
                raise ValueError(f"Unsupported file type: {self.extension}")
        except Exception as e:
            print(f"[ERROR] Document loading failed for {self.file_path}: {e}")
            # Return a basic error document
            return [Document(
                page_content=f"Error loading document: {str(e)}. Please ensure the file is not corrupted and is in a supported format.",
                metadata={"page": 1, "source": self.file_path, "error": str(e)}
            )]

    def _load_pdf_with_ocr(self):
        """Enhanced PDF loading with OCR support for scanned documents"""
        try:
            # First, try to extract text using PyMuPDF (most reliable for text-based PDFs)
            print("[INFO] Attempting to extract text using PyMuPDF...")
            documents = self._extract_text_with_pymupdf()

            # Check if we got meaningful text content
            total_text = " ".join([doc.page_content for doc in documents])
            if len(total_text.strip()) > 50:  # If we have substantial text, use it
                print(f"[INFO] Successfully extracted {len(total_text)} characters using PyMuPDF")
                return documents

            # If text extraction failed or returned minimal content, try OCR
            print(f"[INFO] Text extraction returned minimal content ({len(total_text)} chars). Attempting OCR...")
            documents = self._extract_text_with_ocr()

            if documents:
                total_text = " ".join([doc.page_content for doc in documents])
                print(f"[INFO] Successfully extracted {len(total_text)} characters using OCR")
                return documents

            # If OCR also fails, try other PDF loaders as fallback
            print("[INFO] OCR failed. Trying alternative PDF loaders...")
            documents = self._try_alternative_pdf_loaders()

            if documents:
                total_text = " ".join([doc.page_content for doc in documents])
                print(f"[INFO] Successfully extracted {len(total_text)} characters using alternative loaders")
                return documents

            # If all methods fail, create a placeholder document with instructions
            print("[WARNING] All text extraction methods failed. Creating placeholder document.")
            return [Document(
                page_content="This appears to be a scanned document or image-based PDF. To enable full text extraction, please install Tesseract OCR. For now, you can still use the document for basic operations.",
                metadata={"page": 1, "source": self.file_path, "method": "placeholder"}
            )]

        except Exception as e:
            print(f"[ERROR] PDF processing failed: {e}")
            # Final fallback to basic PDF loader
            return PyPDFLoader(self.file_path).load()

    def _extract_text_with_pymupdf(self):
        """Extract text using PyMuPDF (handles most PDF types well)"""
        try:
            doc = fitz.open(self.file_path)
            documents = []

            for page_num in range(len(doc)):
                page = doc.load_page(page_num)

                # Try to extract text
                text = page.get_text()

                # If text is empty or very short, try to get text with more options
                if not text or len(text.strip()) < 10:
                    text = page.get_text("text")

                # If still no text, fall back to the structured dict output and
                # rebuild the text from its spans (kept in a separate variable
                # so `text` always stays a string)
                if not text or len(text.strip()) < 10:
                    page_dict = page.get_text("dict")
                    if "blocks" in page_dict:
                        text_content = []
                        for block in page_dict["blocks"]:
                            if "lines" in block:
                                for line in block["lines"]:
                                    for span in line["spans"]:
                                        text_content.append(span["text"])
                        text = " ".join(text_content)

                if text and len(text.strip()) > 0:
                    documents.append(Document(
                        page_content=text.strip(),
                        metadata={"page": page_num + 1, "source": self.file_path}
                    ))

            doc.close()
            return documents

        except Exception as e:
            print(f"[WARNING] PyMuPDF extraction failed: {e}")
            return []

    def _extract_text_with_ocr(self):
        """Extract text from scanned PDFs using OCR"""
        try:
            # Check if Tesseract is available and configure it
            try:
                import pytesseract

                # Set Tesseract executable path explicitly (Windows install location)
                tesseract_path = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
                if os.path.exists(tesseract_path):
                    pytesseract.pytesseract.tesseract_cmd = tesseract_path
                    print(f"[INFO] Tesseract found at: {tesseract_path}")
                else:
                    # Try to find tesseract in PATH
                    import subprocess
                    try:
                        result = subprocess.run(['tesseract', '--version'], capture_output=True, text=True)
                        if result.returncode == 0:
                            print("[INFO] Tesseract found in PATH")
                        else:
                            raise Exception("Tesseract not found in PATH")
                    except Exception:
                        raise Exception("Tesseract executable not found")

                # Test if tesseract is working
                version = pytesseract.get_tesseract_version()
                print(f"[INFO] Tesseract version: {version}")

            except Exception as e:
                print(f"[WARNING] Tesseract not available: {e}")
                print("[INFO] Skipping OCR - Tesseract needs to be installed for OCR functionality")
                return []

            # Convert PDF to images
            print("[INFO] Converting PDF to images for OCR...")
            # Specify the Poppler path explicitly (Windows install location)
            poppler_path = r"C:\poppler\poppler-23.11.0\Library\bin"
            images = convert_from_path(self.file_path, dpi=300, poppler_path=poppler_path)

            documents = []

            for page_num, image in enumerate(images):
                print(f"[INFO] Processing page {page_num + 1} with OCR...")

                # Convert PIL image to OpenCV format for preprocessing
                img_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

                # Preprocess image for better OCR (returns multiple versions)
                processed_images = self._preprocess_image_for_ocr(img_cv)

                # Convert all processed images to PIL format
                pil_images = []
                for processed_img in processed_images:
                    try:
                        pil_img = Image.fromarray(processed_img)
                        pil_images.append(pil_img)
                    except Exception:
                        # If conversion fails, use original image
                        pil_images.append(image)

                # Perform OCR with multiple attempts and configurations
                best_text = ""
                best_length = 0

                # OCR configurations to try (in order of preference); comments
                # follow Tesseract's documented page segmentation modes
                ocr_configs = [
                    # Single uniform block of text (default choice)
                    {"config": "--oem 3 --psm 6", "name": "default"},
                    # Uniform block restricted to an alphanumeric whitelist
                    {"config": "--oem 3 --psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?;:()[]{}'\"- ", "name": "alphanumeric"},
                    # Fully automatic page segmentation (no OSD)
                    {"config": "--oem 3 --psm 3", "name": "sparse_text"},
                    # Single text line
                    {"config": "--oem 3 --psm 7", "name": "single_line"},
                    # Single word
                    {"config": "--oem 3 --psm 8", "name": "single_word"},
                    # Single word in a circle
                    {"config": "--oem 3 --psm 9", "name": "circular_text"},
                    # Single character
                    {"config": "--oem 3 --psm 10", "name": "single_char"},
                    # Sparse text
                    {"config": "--oem 3 --psm 11", "name": "sparse_text_alt"},
                    # Sparse text with OSD
                    {"config": "--oem 3 --psm 12", "name": "raw_line"},
                    # Raw line, bypassing most layout analysis
                    {"config": "--oem 3 --psm 13", "name": "uniform_block"}
                ]

                try:
                    # Try OCR on all preprocessed images with all configurations
                    for img_idx, pil_image in enumerate(pil_images):
                        for config in ocr_configs:
                            try:
                                text = pytesseract.image_to_string(
                                    pil_image,
                                    config=config["config"],
                                    lang='eng'  # Specify English language
                                )

                                # Clean the text
                                cleaned_text = self._clean_ocr_text(text)

                                # Check if this configuration produced better results
                                if len(cleaned_text.strip()) > best_length:
                                    best_text = cleaned_text
                                    best_length = len(cleaned_text.strip())
                                    print(f"[INFO] Better OCR result with image {img_idx+1}, config {config['name']}: {best_length} characters")

                            except Exception as config_error:
                                print(f"[DEBUG] OCR config {config['name']} failed for image {img_idx+1}: {config_error}")
                                continue

                    # Use the best result
                    if best_text and len(best_text.strip()) > 10:
                        documents.append(Document(
                            page_content=best_text.strip(),
                            metadata={"page": page_num + 1, "source": self.file_path, "method": "OCR"}
                        ))
                        print(f"[INFO] OCR extracted {len(best_text)} characters from page {page_num + 1}")
                    else:
                        print(f"[WARNING] OCR returned minimal text for page {page_num + 1} (best: {best_length} chars)")

                except Exception as e:
                    print(f"[WARNING] OCR failed for page {page_num + 1}: {e}")
                    continue

            return documents

        except Exception as e:
            print(f"[ERROR] OCR processing failed: {e}")
            return []

    def _preprocess_image_for_ocr(self, image):
        """Preprocess image for better OCR results"""
        try:
            # Convert to grayscale
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

            # Apply noise reduction
            denoised = cv2.fastNlMeansDenoising(gray)

            # Try multiple preprocessing approaches
            processed_images = []

            # Approach 1: Adaptive thresholding
            try:
                thresh1 = cv2.adaptiveThreshold(
                    denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
                )
                processed_images.append(thresh1)
            except Exception:
                pass

            # Approach 2: Otsu thresholding
            try:
                _, thresh2 = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
                processed_images.append(thresh2)
            except Exception:
                pass

            # Approach 3: Simple thresholding
            try:
                _, thresh3 = cv2.threshold(denoised, 127, 255, cv2.THRESH_BINARY)
                processed_images.append(thresh3)
            except Exception:
                pass

            # Approach 4: Original grayscale (sometimes works better)
            processed_images.append(denoised)

            # Apply morphological operations to clean up
            cleaned_images = []
            for img in processed_images:
                try:
                    # Small kernel for fine details
                    kernel_small = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
                    cleaned_small = cv2.morphologyEx(img, cv2.MORPH_CLOSE, kernel_small)
                    cleaned_images.append(cleaned_small)

                    # Medium kernel for general cleaning
                    kernel_medium = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
                    cleaned_medium = cv2.morphologyEx(img, cv2.MORPH_CLOSE, kernel_medium)
                    cleaned_images.append(cleaned_medium)

                except Exception:
                    cleaned_images.append(img)

            # Return all processed images for testing
            return cleaned_images

        except Exception as e:
            print(f"[WARNING] Image preprocessing failed: {e}")
            return [image]

    def _clean_ocr_text(self, text):
        """Clean and improve OCR text"""
        if not text:
            return text

        # Remove excessive whitespace per line (preserving line breaks so the
        # newline-based noise filter below still has individual lines to inspect)
        text = '\n'.join(' '.join(line.split()) for line in text.splitlines())

        # Fix common OCR confusions. Note these are blanket replacements, so
        # they also rewrite characters that were recognized correctly (every
        # digit 0 and 1, and every lowercase l, is altered).
        text = text.replace('|', 'I')   # pipe misread as capital I
        text = text.replace('0', 'O')   # digit zero -> letter O
        text = text.replace('1', 'l')   # digit one -> lowercase l
        text = text.replace('l', 'I')   # lowercase l -> capital I
        text = text.replace('rn', 'm')  # 'rn' misread as 'm'
        text = text.replace('cl', 'd')  # 'cl' misread as 'd'
        text = text.replace('vv', 'w')  # 'vv' misread as 'w'

        # Remove lines that are likely noise (very short lines)
        lines = text.split('\n')
        cleaned_lines = []
        for line in lines:
            line = line.strip()
            # Keep lines with more than 2 characters and not just punctuation
            if len(line) > 2 and not all(c in '.,!?;:()[]{}' for c in line):
                cleaned_lines.append(line)

        # Join lines and clean up
        result = '\n'.join(cleaned_lines)

        # Remove excessive newlines
        result = '\n'.join(line for line in result.split('\n') if line.strip())

        return result

    def _try_alternative_pdf_loaders(self):
        """Try alternative PDF loaders if primary methods fail"""
        loaders = [
            ("PDFPlumberLoader", lambda: PDFPlumberLoader(self.file_path).load()),
            ("UnstructuredPDFLoader", lambda: UnstructuredPDFLoader(self.file_path).load()),
            ("PyPDFLoader", lambda: PyPDFLoader(self.file_path).load())
        ]

        for loader_name, loader_func in loaders:
            try:
                print(f"[INFO] Trying {loader_name}...")
                documents = loader_func()
                total_text = " ".join([doc.page_content for doc in documents])
                if len(total_text.strip()) > 10:
                    print(f"[INFO] {loader_name} successfully extracted {len(total_text)} characters")
                    return documents
            except Exception as e:
                print(f"[WARNING] {loader_name} failed: {e}")
                continue

        return []

    def get_page_count(self):
        """Get page count for different document types"""
        if self.extension == ".pdf":
            try:
                # Try PyMuPDF first (most reliable)
                doc = fitz.open(self.file_path)
                page_count = len(doc)
                doc.close()
                return page_count
            except Exception:
                try:
                    # Fallback to PyPDF2
                    import PyPDF2
                    with open(self.file_path, "rb") as f:
                        reader = PyPDF2.PdfReader(f)
                        return len(reader.pages)
                except Exception:
                    return None
        elif self.extension == ".pptx":
            try:
                from pptx import Presentation
                prs = Presentation(self.file_path)
                return len(prs.slides)
            except Exception:
                return None
        elif self.extension == ".docx":
            try:
                from docx import Document as DocxDocument
                doc = DocxDocument(self.file_path)
                # DOCX doesn't have strict pages, but we can estimate by section breaks or paragraphs
                return max(1, len(doc.paragraphs) // 30)  # Rough estimate: 30 paragraphs per page
            except Exception:
                return None
        elif self.extension == ".txt":
            try:
                with open(self.file_path, "r", encoding="utf-8") as f:
                    words = f.read().split()
                return max(1, len(words) // 500)
            except Exception:
                return None
        else:
            return None
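A short usage sketch of the loader's fallback chain (PyMuPDF text extraction, then OCR, then the alternative LangChain loaders); the file path is a placeholder.

from app.document_loader import DocumentLoader

loader = DocumentLoader("uploaded_docs/report.pdf")  # hypothetical path
docs = loader.load()  # falls back from PyMuPDF to OCR to alternative loaders

print(f"{loader.get_page_count()} pages")
for doc in docs:
    # metadata records the page number and, for OCR-extracted pages, the method
    print(doc.metadata.get("page"), doc.metadata.get("method", "text"), len(doc.page_content))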
app/main.py
ADDED
@@ -0,0 +1,882 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import FastAPI, UploadFile, File, Form
|
2 |
+
from fastapi.responses import HTMLResponse, JSONResponse
|
3 |
+
from typing import List, Optional
|
4 |
+
import os
|
5 |
+
from .document_loader import DocumentLoader
|
6 |
+
from .chunking import chunk_text
|
7 |
+
from .vector_store import add_to_vector_store, similarity_search
|
8 |
+
from .summarizer import DocumentSummarizer, clean_markdown_formatting
|
9 |
+
|
10 |
+
# Remove Qwen/transformers imports and model initialization
|
11 |
+
|
12 |
+
app = FastAPI(title="RAG Document Summarizer", version="1.0.0")
|
13 |
+
|
14 |
+
print("[INFO] RAG Application starting up...")
|
15 |
+
|
16 |
+
# Global exception handler to ensure all errors return JSON
|
17 |
+
@app.exception_handler(Exception)
|
18 |
+
async def global_exception_handler(request, exc):
|
19 |
+
print(f"[ERROR] Unhandled exception: {exc}")
|
20 |
+
return JSONResponse(
|
21 |
+
status_code=500,
|
22 |
+
content={"error": f"Internal server error: {str(exc)}"}
|
23 |
+
)
|
24 |
+
|
25 |
+
# Remove Qwen2-0.5B model instance for queries (CPU-optimized)
|
26 |
+
|
27 |
+
def initialize_qwen_model():
|
28 |
+
"""Initialize Qwen2-0.5B model for query responses (CPU-optimized)"""
|
29 |
+
# This function is no longer needed as Qwen model is removed.
|
30 |
+
# Keeping it for now, but it will not initialize the model.
|
31 |
+
print("[INFO] Qwen model is no longer available. Using simulated responses for queries.")
|
32 |
+
return False
|
33 |
+
|
34 |
+
# Initialize model on startup (non-blocking)
|
35 |
+
@app.on_event("startup")
|
36 |
+
async def startup_event():
|
37 |
+
print("[INFO] Starting RAG application...")
|
38 |
+
# Initialize model in background to avoid blocking startup
|
39 |
+
import asyncio
|
40 |
+
asyncio.create_task(initialize_qwen_model_async())
|
41 |
+
|
42 |
+
async def initialize_qwen_model_async():
|
43 |
+
"""Initialize Qwen model asynchronously to avoid blocking startup"""
|
44 |
+
try:
|
45 |
+
initialize_qwen_model()
|
46 |
+
except Exception as e:
|
47 |
+
print(f"[WARNING] Model initialization failed: {e}")
|
48 |
+
print("[INFO] Application will continue with simulated responses")
|
49 |
+
|
50 |
+
@app.get("/health")
|
51 |
+
async def health_check():
|
52 |
+
"""Simple health check endpoint"""
|
53 |
+
return {"status": "healthy", "message": "RAG application is running"}
|
54 |
+
|
55 |
+
@app.get("/", response_class=HTMLResponse)
|
56 |
+
async def read_root():
|
57 |
+
return """
|
58 |
+
<!DOCTYPE html>
|
59 |
+
<html lang="en">
|
60 |
+
<head>
|
61 |
+
<meta charset="UTF-8">
|
62 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
63 |
+
<title>AI Document Summarizer & Query Resolver</title>
|
64 |
+
<script src="https://cdn.tailwindcss.com"></script>
|
65 |
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf-lib/1.17.1/pdf-lib.min.js"></script>
|
66 |
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/mammoth/1.5.0/mammoth.browser.min.js"></script>
|
67 |
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/jszip/3.10.1/jszip.min.js"></script>
|
68 |
+
<style>
|
69 |
+
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
|
70 |
+
|
71 |
+
:root {
|
72 |
+
--pastel-blue: #89CFF0;
|
73 |
+
--pastel-green: #A8E6CF;
|
74 |
+
--pastel-purple: #D7A9E3;
|
75 |
+
--pastel-pink: #F5B7B1;
|
76 |
+
}
|
77 |
+
|
78 |
+
* {
|
79 |
+
font-family: 'Inter', sans-serif;
|
80 |
+
}
|
81 |
+
|
82 |
+
body {
|
83 |
+
background: linear-gradient(135deg, #1a1a1a 0%, #2a2a2a 100%);
|
84 |
+
color: #e0e0e0;
|
85 |
+
line-height: 1.6;
|
86 |
+
}
|
87 |
+
|
88 |
+
.glass-effect {
|
89 |
+
background: rgba(0, 0, 0, 0.3);
|
90 |
+
border: 1px solid rgba(255, 255, 255, 0.1);
|
91 |
+
backdrop-filter: blur(10px);
|
92 |
+
}
|
93 |
+
|
94 |
+
.file-upload-area {
|
95 |
+
background: linear-gradient(135deg, rgba(30, 30, 30, 0.5) 0%, rgba(50, 50, 50, 0.5) 100%);
|
96 |
+
border: 2px dashed rgba(255, 255, 255, 0.2);
|
97 |
+
transition: all 0.3s ease;
|
98 |
+
}
|
99 |
+
|
100 |
+
.file-upload-area:hover {
|
101 |
+
border-color: var(--pastel-blue);
|
102 |
+
}
|
103 |
+
|
104 |
+
.chunk-card {
|
105 |
+
background: linear-gradient(135deg, rgba(0, 0, 0, 0.2) 0%, rgba(0, 0, 0, 0.1) 100%);
|
106 |
+
border: 1px solid rgba(255, 255, 255, 0.1);
|
107 |
+
}
|
108 |
+
|
109 |
+
.progress-bar {
|
110 |
+
background: linear-gradient(90deg, var(--pastel-blue) 0%, var(--pastel-green) 100%);
|
111 |
+
}
|
112 |
+
|
113 |
+
@keyframes float {
|
114 |
+
0%, 100% { transform: translateY(0px); }
|
115 |
+
50% { transform: translateY(-5px); }
|
116 |
+
}
|
117 |
+
|
118 |
+
.floating-element {
|
119 |
+
animation: float 5s ease-in-out infinite;
|
120 |
+
}
|
121 |
+
|
122 |
+
.query-bubble {
|
123 |
+
background: linear-gradient(135deg, #333 0%, #444 100%);
|
124 |
+
border-radius: 20px 20px 5px 20px;
|
125 |
+
}
|
126 |
+
|
127 |
+
.response-bubble {
|
128 |
+
background: linear-gradient(135deg, rgba(0, 0, 0, 0.2) 0%, rgba(0, 0, 0, 0.1) 100%);
|
129 |
+
border: 1px solid rgba(255, 255, 255, 0.1);
|
130 |
+
border-radius: 20px 20px 20px 5px;
|
131 |
+
}
|
132 |
+
|
133 |
+
h1 {
|
134 |
+
font-size: 3.75rem;
|
135 |
+
font-weight: 700;
|
136 |
+
}
|
137 |
+
|
138 |
+
h2 {
|
139 |
+
font-size: 2.25rem;
|
140 |
+
font-weight: 600;
|
141 |
+
}
|
142 |
+
</style>
|
143 |
+
</head>
|
144 |
+
<body>
|
145 |
+
<!-- Background Effects -->
|
146 |
+
<div class="fixed inset-0 overflow-hidden pointer-events-none">
|
147 |
+
<div class="absolute top-20 left-10 w-64 h-64 bg-[var(--pastel-blue)] rounded-full opacity-5 blur-3xl floating-element"></div>
|
148 |
+
<div class="absolute top-40 right-20 w-48 h-48 bg-[var(--pastel-green)] rounded-full opacity-5 blur-2xl floating-element" style="animation-delay: 1s;"></div>
|
149 |
+
<div class="absolute bottom-20 left-1/3 w-56 h-56 bg-[var(--pastel-purple)] rounded-full opacity-5 blur-3xl floating-element" style="animation-delay: 2s;"></div>
|
150 |
+
</div>
|
151 |
+
|
152 |
+
<!-- Header -->
|
153 |
+
<header class="relative z-10 py-8">
|
154 |
+
<div class="container mx-auto px-6">
|
155 |
+
<div class="text-center">
|
156 |
+
<h1 class="text-6xl font-bold mb-4">AI Document Summarizer</h1>
|
157 |
+
<p class="text-xl mb-6">Advanced Document Processing and Query Resolution</p>
|
158 |
+
</div>
|
159 |
+
</div>
|
160 |
+
</header>
|
161 |
+
|
162 |
+
<!-- Main Content -->
|
163 |
+
<main class="container mx-auto px-6 pb-16">
|
164 |
+
<!-- File Upload Section -->
|
165 |
+
<div class="glass-effect rounded-3xl p-10 mb-8">
|
166 |
+
<h2 class="text-4xl font-bold mb-6 text-center">Document Upload</h2>
|
167 |
+
|
168 |
+
<div id="fileUploadArea" class="file-upload-area rounded-2xl p-12 text-center cursor-pointer">
|
169 |
+
<div class="mb-4">
|
170 |
+
<svg class="w-16 h-16 mx-auto text-[var(--pastel-blue)] mb-4" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
171 |
+
<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M7 16a4 4 0 01-.88-7.903A5 5 0 1115.9 6L16 6a5 5 0 011 9.9M15 13l-3-3m0 0l-3 3m3-3v12"></path>
|
172 |
+
</svg>
|
173 |
+
</div>
|
174 |
+
<p class="text-2xl font-semibold mb-2">Drop your documents here</p>
|
175 |
+
<p class="text-gray-400 mb-4">or click to browse</p>
|
176 |
+
<div class="flex justify-center space-x-2 text-sm text-gray-500">
|
177 |
+
<span>Supports: PDF, DOCX, PPTX, TXT</span>
|
178 |
+
<span>•</span>
|
179 |
+
<span>Max size: 100MB</span>
|
180 |
+
</div>
|
181 |
+
</div>
|
182 |
+
|
183 |
+
<input type="file" id="fileInput" multiple accept=".pdf,.docx,.pptx,.txt" class="hidden">
|
184 |
+
|
185 |
+
<!-- Processing Status -->
|
186 |
+
<div id="processingStatus" class="mt-6 hidden">
|
187 |
+
<div class="bg-gray-800 rounded-xl p-4">
|
188 |
+
<div class="flex items-center justify-between mb-2">
|
189 |
+
<span class="font-medium">Processing Document...</span>
|
190 |
+
<span id="processingPercentage" class="text-[var(--pastel-blue)] font-bold">0%</span>
|
191 |
+
</div>
|
192 |
+
<div class="w-full bg-gray-700 rounded-full h-2">
|
193 |
+
<div id="progressBar" class="progress-bar h-2 rounded-full" style="width: 0%"></div>
|
194 |
+
</div>
|
195 |
+
<div id="processingSteps" class="mt-4 space-y-2"></div>
|
196 |
+
</div>
|
197 |
+
</div>
|
198 |
+
|
199 |
+
<!-- File List -->
|
200 |
+
<div id="fileList" class="mt-6 space-y-3"></div>
|
201 |
+
</div>
|
202 |
+
|
203 |
+
<!-- Document Analysis & Summary -->
|
204 |
+
<div id="documentAnalysis" class="glass-effect rounded-3xl p-10 mb-8 hidden">
|
205 |
+
<h2 class="text-4xl font-bold mb-6">Document Analysis</h2>
|
206 |
+
|
207 |
+
<div class="grid md:grid-cols-2 lg:grid-cols-3 gap-6 mb-8">
|
208 |
+
<div class="chunk-card rounded-xl p-6">
|
209 |
+
<h3 class="text-lg font-semibold mb-2">Document Type</h3>
|
210 |
+
<p id="documentType" class="text-gray-400">-</p>
|
211 |
+
</div>
|
212 |
+
|
213 |
+
<div class="chunk-card rounded-xl p-6">
|
214 |
+
<h3 class="text-lg font-semibold mb-2">Page Count</h3>
|
215 |
+
<p id="pageCount" class="text-gray-400">-</p>
|
216 |
+
</div>
|
217 |
+
|
218 |
+
<div class="chunk-card rounded-xl p-6">
|
219 |
+
<h3 class="text-lg font-semibold mb-2">Chunks Created</h3>
|
220 |
+
<p id="chunkCount" class="text-gray-400">-</p>
|
221 |
+
</div>
|
222 |
+
</div>
|
223 |
+
|
224 |
+
<div class="chunk-card rounded-xl p-6">
|
225 |
+
<h3 class="text-xl font-semibold mb-4">Document Summary</h3>
|
226 |
+
<div id="documentSummary" class="text-gray-400 leading-relaxed">
|
227 |
+
<div class="processing-animation">Generating summary...</div>
|
228 |
+
</div>
|
229 |
+
</div>
|
230 |
+
</div>
|
231 |
+
|
232 |
+
<!-- Query Interface -->
|
233 |
+
<div class="glass-effect rounded-3xl p-10 mb-8">
|
234 |
+
<h2 class="text-4xl font-bold mb-6">Query Resolver</h2>
|
235 |
+
|
236 |
+
<div class="mb-6">
|
237 |
+
<div class="relative">
|
238 |
+
<input
|
239 |
+
type="text"
|
240 |
+
id="queryInput"
|
241 |
+
placeholder="Ask anything about your document..."
|
242 |
+
class="w-full px-6 py-4 bg-gray-800 border border-gray-700 rounded-2xl text-white placeholder-gray-500 focus:outline-none focus:border-[var(--pastel-blue)]"
|
243 |
+
disabled
|
244 |
+
>
|
245 |
+
<button
|
246 |
+
id="querySubmit"
|
247 |
+
class="absolute right-2 top-2 px-6 py-2 bg-gray-800 hover:bg-[var(--pastel-blue)]/20 rounded-xl text-white font-medium transition-all duration-200 disabled:opacity-50"
|
248 |
+
disabled
|
249 |
+
>
|
250 |
+
Ask
|
251 |
+
</button>
|
252 |
+
</div>
|
253 |
+
<div class="mt-3 flex flex-wrap gap-2">
|
254 |
+
<button class="suggestion-btn px-4 py-2 bg-gray-800 hover:bg-[var(--pastel-blue)]/20 rounded-full text-sm text-gray-400 transition-all duration-200">
|
255 |
+
What are the key points?
|
256 |
+
</button>
|
257 |
+
<button class="suggestion-btn px-4 py-2 bg-gray-800 hover:bg-[var(--pastel-blue)]/20 rounded-full text-sm text-gray-400 transition-all duration-200">
|
258 |
+
Explain the main concepts
|
259 |
+
</button>
|
260 |
+
<button class="suggestion-btn px-4 py-2 bg-gray-800 hover:bg-[var(--pastel-blue)]/20 rounded-full text-sm text-gray-400 transition-all duration-200">
|
261 |
+
What conclusions are drawn?
|
262 |
+
</button>
|
263 |
+
</div>
|
264 |
+
</div>
|
265 |
+
|
266 |
+
<div id="queryHistory" class="space-y-4 max-h-96 overflow-y-auto"></div>
|
267 |
+
</div>
|
268 |
+
</main>
|
269 |
+
|
270 |
+
<script>
|
271 |
+
// Global state
|
272 |
+
let documents = [];
|
273 |
+
let currentDocument = null;
|
274 |
+
let documentChunks = [];
|
275 |
+
let isProcessing = false;
|
276 |
+
|
277 |
+
// Initialize application
|
278 |
+
document.addEventListener('DOMContentLoaded', function() {
|
279 |
+
initializeFileUpload();
|
280 |
+
initializeQueryInterface();
|
281 |
+
initializeSuggestions();
|
282 |
+
});
|
283 |
+
|
284 |
+
function initializeFileUpload() {
|
285 |
+
const fileUploadArea = document.getElementById('fileUploadArea');
|
286 |
+
const fileInput = document.getElementById('fileInput');
|
287 |
+
|
288 |
+
fileUploadArea.addEventListener('click', () => {
|
289 |
+
if (!isProcessing) {
|
290 |
+
fileInput.click();
|
291 |
+
}
|
292 |
+
});
|
293 |
+
|
294 |
+
fileInput.addEventListener('change', (e) => {
|
295 |
+
handleFiles(e.target.files);
|
296 |
+
});
|
297 |
+
|
298 |
+
fileUploadArea.addEventListener('dragover', (e) => {
|
299 |
+
e.preventDefault();
|
300 |
+
fileUploadArea.classList.add('dragover');
|
301 |
+
});
|
302 |
+
|
303 |
+
fileUploadArea.addEventListener('dragleave', () => {
|
304 |
+
fileUploadArea.classList.remove('dragover');
|
305 |
+
});
|
306 |
+
|
307 |
+
fileUploadArea.addEventListener('drop', (e) => {
|
308 |
+
e.preventDefault();
|
309 |
+
fileUploadArea.classList.remove('dragover');
|
310 |
+
handleFiles(e.dataTransfer.files);
|
311 |
+
});
|
312 |
+
}
|
313 |
+
|
314 |
+
function initializeQueryInterface() {
|
315 |
+
const queryInput = document.getElementById('queryInput');
|
316 |
+
const querySubmit = document.getElementById('querySubmit');
|
317 |
+
|
318 |
+
querySubmit.addEventListener('click', () => {
|
319 |
+
const query = queryInput.value.trim();
|
320 |
+
if (query) {
|
321 |
+
processQuery(query);
|
322 |
+
queryInput.value = '';
|
323 |
+
}
|
324 |
+
});
|
325 |
+
|
326 |
+
queryInput.addEventListener('keypress', (e) => {
|
327 |
+
if (e.key === 'Enter') {
|
328 |
+
querySubmit.click();
|
329 |
+
}
|
330 |
+
});
|
331 |
+
}
|
332 |
+
|
333 |
+
function initializeSuggestions() {
|
334 |
+
document.querySelectorAll('.suggestion-btn').forEach(btn => {
|
335 |
+
btn.addEventListener('click', () => {
|
336 |
+
const query = btn.textContent.trim();
|
337 |
+
document.getElementById('queryInput').value = query;
|
338 |
+
document.getElementById('querySubmit').click();
|
339 |
+
});
|
340 |
+
});
|
341 |
+
}
|
342 |
+
|
343 |
+
async function handleFiles(files) {
|
344 |
+
if (isProcessing) return;
|
345 |
+
|
346 |
+
for (const file of files) {
|
347 |
+
if (validateFile(file)) {
|
348 |
+
await processDocument(file);
|
349 |
+
}
|
350 |
+
}
|
351 |
+
}
|
352 |
+
|
353 |
+
function validateFile(file) {
|
354 |
+
const allowedTypes = [
|
355 |
+
'application/pdf',
|
356 |
+
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
357 |
+
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
|
358 |
+
'text/plain'
|
359 |
+
];
|
360 |
+
|
361 |
+
if (!allowedTypes.includes(file.type)) {
|
362 |
+
showNotification('Unsupported file type. Please upload PDF, DOCX, PPTX, or TXT files.', 'error');
|
363 |
+
return false;
|
364 |
+
}
|
365 |
+
|
366 |
+
if (file.size > 100 * 1024 * 1024) {
|
367 |
+
showNotification('File too large. Maximum size is 100MB.', 'error');
|
368 |
+
return false;
|
369 |
+
}
|
370 |
+
|
371 |
+
return true;
|
372 |
+
}
|
373 |
+
|
374 |
+
async function processDocument(file) {
|
375 |
+
isProcessing = true;
|
376 |
+
showProcessingStatus();
|
377 |
+
|
378 |
+
try {
|
379 |
+
updateProcessingStep('Uploading document...', 10);
|
380 |
+
|
381 |
+
// Create FormData for file upload
|
382 |
+
const formData = new FormData();
|
383 |
+
formData.append('file', file);
|
384 |
+
|
385 |
+
updateProcessingStep('Processing document...', 30);
|
386 |
+
|
387 |
+
// Send file to backend
|
388 |
+
const response = await fetch('/upload', {
|
389 |
+
method: 'POST',
|
390 |
+
body: formData
|
391 |
+
});
|
392 |
+
|
393 |
+
if (!response.ok) {
|
394 |
+
let errorMessage = 'Upload failed';
|
395 |
+
let errorText = '';
|
396 |
+
try {
|
397 |
+
// Try to parse as JSON
|
398 |
+
const errorData = await response.json();
|
399 |
+
errorMessage = errorData.error || 'Upload failed';
|
400 |
+
} catch (jsonError) {
|
401 |
+
// Only try to read as text if .json() fails and errorText is still empty
|
402 |
+
if (!errorText) {
|
403 |
+
errorText = await response.text();
|
404 |
+
console.error('Non-JSON error response:', errorText);
|
405 |
+
}
|
406 |
+
errorMessage = `Server error (${response.status}): ${response.statusText}`;
|
407 |
+
}
|
408 |
+
throw new Error(errorMessage);
|
409 |
+
}
|
410 |
+
|
411 |
+
updateProcessingStep('Analyzing content...', 70);
|
412 |
+
|
413 |
+
// Only read the response body once
|
414 |
+
const result = await response.json();
|
415 |
+
|
416 |
+
updateProcessingStep('Processing complete!', 100);
|
417 |
+
|
418 |
+
const document = {
|
419 |
+
id: Date.now(),
|
420 |
+
name: file.name,
|
421 |
+
type: getFileType(file.name),
|
422 |
+
size: file.size,
|
423 |
+
pageCount: result.page_estimate || 1,
|
424 |
+
chunks: result.chunk_count || 0,
|
425 |
+
summary: result.summary,
|
426 |
+
classification: result.classification,
|
427 |
+
processingMethod: result.processing_method
|
428 |
+
};
|
429 |
+
|
430 |
+
documents.push(document);
|
431 |
+
currentDocument = document;
|
432 |
+
|
433 |
+
displayDocumentInfo(document);
|
434 |
+
enableQueryInterface();
|
435 |
+
|
436 |
+
setTimeout(() => {
|
437 |
+
hideProcessingStatus();
|
438 |
+
showNotification('Document processed successfully!', 'success');
|
439 |
+
}, 1000);
|
440 |
+
|
441 |
+
} catch (error) {
|
442 |
+
console.error('Error processing document:', error);
|
443 |
+
showNotification('Error processing document: ' + error.message, 'error');
|
444 |
+
hideProcessingStatus();
|
445 |
+
}
|
446 |
+
|
447 |
+
isProcessing = false;
|
448 |
+
}
|
449 |
+
|
450 |
+
function getFileType(filename) {
|
451 |
+
const extension = filename.split('.').pop().toLowerCase();
|
452 |
+
const typeMap = {
|
453 |
+
'pdf': 'PDF Document',
|
454 |
+
'docx': 'Word Document',
|
455 |
+
'pptx': 'PowerPoint Presentation',
|
456 |
+
'txt': 'Text Document'
|
457 |
+
};
|
458 |
+
return typeMap[extension] || 'Unknown';
|
459 |
+
}
|
460 |
+
|
461 |
+
function displayDocumentInfo(docData) {
|
462 |
+
document.getElementById('documentType').textContent = docData.type;
|
463 |
+
document.getElementById('pageCount').textContent = `${docData.pageCount} pages (${docData.classification})`;
|
464 |
+
document.getElementById('chunkCount').textContent = `${docData.chunks} chunks`;
|
465 |
+
|
466 |
+
const summaryElement = document.getElementById('documentSummary');
|
467 |
+
summaryElement.innerHTML = '';
|
468 |
+
|
469 |
+
let i = 0;
|
470 |
+
const summary = docData.summary;
|
471 |
+
const typeInterval = setInterval(() => {
|
472 |
+
if (i < summary.length) {
|
473 |
+
summaryElement.textContent += summary.charAt(i);
|
474 |
+
i++;
|
475 |
+
} else {
|
476 |
+
clearInterval(typeInterval);
|
477 |
+
}
|
478 |
+
}, 20);
|
479 |
+
|
480 |
+
document.getElementById('documentAnalysis').classList.remove('hidden');
|
481 |
+
}
|
482 |
+
|
483 |
+
function enableQueryInterface() {
|
484 |
+
document.getElementById('queryInput').disabled = false;
|
485 |
+
document.getElementById('querySubmit').disabled = false;
|
486 |
+
document.querySelectorAll('.suggestion-btn').forEach(btn => {
|
487 |
+
btn.disabled = false;
|
488 |
+
});
|
489 |
+
}
|
490 |
+
|
491 |
+
async function processQuery(query) {
|
492 |
+
if (!currentDocument) return;
|
493 |
+
|
494 |
+
addQueryToHistory(query);
|
495 |
+
|
496 |
+
try {
|
497 |
+
const formData = new FormData();
|
498 |
+
formData.append('filename', currentDocument.name);
|
499 |
+
formData.append('query', query);
|
500 |
+
|
501 |
+
const response = await fetch('/query', {
|
502 |
+
method: 'POST',
|
503 |
+
body: formData
|
504 |
+
});
|
505 |
+
|
506 |
+
if (!response.ok) {
|
507 |
+
let errorMessage = 'Query failed';
|
508 |
+
try {
|
509 |
+
const errorData = await response.json();
|
510 |
+
errorMessage = errorData.error || 'Query failed';
|
511 |
+
} catch (jsonError) {
|
512 |
+
// If response is not JSON (e.g., HTML error page), get text content
|
513 |
+
const errorText = await response.text();
|
514 |
+
console.error('Non-JSON error response:', errorText);
|
515 |
+
errorMessage = `Server error (${response.status}): ${response.statusText}`;
|
516 |
+
}
|
517 |
+
throw new Error(errorMessage);
|
518 |
+
}
|
519 |
+
|
520 |
+
const result = await response.json();
|
521 |
+
addResponseToHistory(result.answer);
|
522 |
+
|
523 |
+
} catch (error) {
|
524 |
+
console.error('Error processing query:', error);
|
525 |
+
addResponseToHistory('Sorry, I encountered an error while processing your query. Please try again.');
|
526 |
+
}
|
527 |
+
}
|
528 |
+
|
529 |
+
function addQueryToHistory(query) {
|
530 |
+
const historyContainer = document.getElementById('queryHistory');
|
531 |
+
const queryElement = document.createElement('div');
|
532 |
+
queryElement.className = 'query-bubble p-4 ml-8';
|
533 |
+
queryElement.innerHTML = `
|
534 |
+
<div class="flex items-start">
|
535 |
+
<div class="flex-shrink-0 w-8 h-8 bg-gray-700 rounded-full flex items-center justify-center mr-3 mt-1">
|
536 |
+
<span class="text-sm">U</span>
|
537 |
+
</div>
|
538 |
+
<div class="flex-1">
|
539 |
+
<p class="font-medium">${query}</p>
|
540 |
+
<p class="text-sm text-gray-500 mt-1">${new Date().toLocaleTimeString()}</p>
|
541 |
+
</div>
|
542 |
+
</div>
|
543 |
+
`;
|
544 |
+
historyContainer.appendChild(queryElement);
|
545 |
+
historyContainer.scrollTop = historyContainer.scrollHeight;
|
546 |
+
}
|
547 |
+
|
548 |
+
function addResponseToHistory(response) {
|
549 |
+
const historyContainer = document.getElementById('queryHistory');
|
550 |
+
const responseElement = document.createElement('div');
|
551 |
+
responseElement.className = 'response-bubble p-4 mr-8';
|
552 |
+
responseElement.innerHTML = `
|
553 |
+
<div class="flex items-start">
|
554 |
+
<div class="flex-shrink-0 w-8 h-8 bg-[var(--pastel-blue)] rounded-full flex items-center justify-center mr-3 mt-1">
|
555 |
+
<span class="text-sm">A</span>
|
556 |
+
</div>
|
557 |
+
<div class="flex-1">
|
558 |
+
<div class="typing-indicator mb-2">
|
559 |
+
<span class="inline-block w-2 h-2 bg-[var(--pastel-blue)] rounded-full animate-pulse"></span>
|
560 |
+
<span class="inline-block w-2 h-2 bg-[var(--pastel-blue)] rounded-full animate-pulse ml-1" style="animation-delay: 0.2s;"></span>
|
561 |
+
<span class="inline-block w-2 h-2 bg-[var(--pastel-blue)] rounded-full animate-pulse ml-1" style="animation-delay: 0.4s;"></span>
|
562 |
+
</div>
|
563 |
+
<p class="response-text hidden leading-relaxed"></p>
|
564 |
+
<p class="text-sm text-gray-500 mt-2">${new Date().toLocaleTimeString()}</p>
|
565 |
+
</div>
|
566 |
+
</div>
|
567 |
+
`;
|
568 |
+
historyContainer.appendChild(responseElement);
|
569 |
+
historyContainer.scrollTop = historyContainer.scrollHeight;
|
570 |
+
|
571 |
+
setTimeout(() => {
|
572 |
+
const typingIndicator = responseElement.querySelector('.typing-indicator');
|
573 |
+
const responseText = responseElement.querySelector('.response-text');
|
574 |
+
|
575 |
+
typingIndicator.classList.add('hidden');
|
576 |
+
responseText.classList.remove('hidden');
|
577 |
+
|
578 |
+
let i = 0;
|
579 |
+
const typeInterval = setInterval(() => {
|
580 |
+
if (i < response.length) {
|
581 |
+
responseText.textContent += response.charAt(i);
|
582 |
+
i++;
|
583 |
+
historyContainer.scrollTop = historyContainer.scrollHeight;
|
584 |
+
} else {
|
585 |
+
clearInterval(typeInterval);
|
586 |
+
}
|
587 |
+
}, 30);
|
588 |
+
}, 1500);
|
589 |
+
}
|
590 |
+
|
591 |
+
function showProcessingStatus() {
|
592 |
+
document.getElementById('processingStatus').classList.remove('hidden');
|
593 |
+
document.getElementById('fileUploadArea').style.opacity = '0.5';
|
594 |
+
document.getElementById('fileUploadArea').style.pointerEvents = 'none';
|
595 |
+
}
|
596 |
+
|
597 |
+
function hideProcessingStatus() {
|
598 |
+
document.getElementById('processingStatus').classList.add('hidden');
|
599 |
+
document.getElementById('fileUploadArea').style.opacity = '1';
|
600 |
+
document.getElementById('fileUploadArea').style.pointerEvents = 'auto';
|
601 |
+
}
|
602 |
+
|
603 |
+
function updateProcessingStep(message, percentage) {
|
604 |
+
const stepsContainer = document.getElementById('processingSteps');
|
605 |
+
const progressBar = document.getElementById('progressBar');
|
606 |
+
const percentageDisplay = document.getElementById('processingPercentage');
|
607 |
+
|
608 |
+
progressBar.style.width = percentage + '%';
|
609 |
+
percentageDisplay.textContent = percentage + '%';
|
610 |
+
|
611 |
+
const stepElement = document.createElement('div');
|
612 |
+
stepElement.className = 'flex items-center text-sm text-gray-400';
|
613 |
+
stepElement.innerHTML = `
|
614 |
+
<div class="w-2 h-2 bg-[var(--pastel-blue)] rounded-full mr-3 flex-shrink-0"></div>
|
615 |
+
<span>${message}</span>
|
616 |
+
`;
|
617 |
+
stepsContainer.appendChild(stepElement);
|
618 |
+
|
619 |
+
while (stepsContainer.children.length > 3) {
|
620 |
+
stepsContainer.removeChild(stepsContainer.firstChild);
|
621 |
+
}
|
622 |
+
}
|
623 |
+
|
624 |
+
function showNotification(message, type = 'info') {
|
625 |
+
const notification = document.createElement('div');
|
626 |
+
const bgColor = type === 'error' ? 'bg-red-500' : type === 'success' ? 'bg-green-500' : 'bg-blue-500';
|
627 |
+
|
628 |
+
notification.className = `fixed top-4 right-4 ${bgColor} text-white px-6 py-3 rounded-lg shadow-lg z-50 transform translate-x-full transition-transform duration-300`;
|
629 |
+
notification.textContent = message;
|
630 |
+
|
631 |
+
document.body.appendChild(notification);
|
632 |
+
|
633 |
+
setTimeout(() => {
|
634 |
+
notification.classList.remove('translate-x-full');
|
635 |
+
}, 100);
|
636 |
+
|
637 |
+
setTimeout(() => {
|
638 |
+
notification.classList.add('translate-x-full');
|
639 |
+
setTimeout(() => {
|
640 |
+
if (notification.parentNode) {
|
641 |
+
notification.parentNode.removeChild(notification);
|
642 |
+
}
|
643 |
+
}, 300);
|
644 |
+
}, 4000);
|
645 |
+
}
|
646 |
+
</script>
|
647 |
+
</body>
|
648 |
+
</html>
|
649 |
+
"""
|
650 |
+
|
651 |
+
@app.post("/upload")
|
652 |
+
async def upload_document(file: UploadFile = File(...)):
|
653 |
+
"""Upload and process a document with improved error handling and logging"""
|
654 |
+
try:
|
655 |
+
print(f"[INFO] Received file: {file.filename}")
|
656 |
+
upload_dir = "uploaded_docs"
|
657 |
+
try:
|
658 |
+
os.makedirs(upload_dir, exist_ok=True)
|
659 |
+
except Exception as e:
|
660 |
+
print(f"[ERROR] Failed to create upload directory: {e}")
|
661 |
+
return JSONResponse(status_code=500, content={"error": f"Failed to create upload directory: {str(e)}"})
|
662 |
+
|
663 |
+
file_location = os.path.join(upload_dir, file.filename)
|
664 |
+
try:
|
665 |
+
with open(file_location, "wb") as f:
|
666 |
+
f.write(await file.read())
|
667 |
+
print(f"[INFO] File saved to: {file_location}")
|
668 |
+
except Exception as e:
|
669 |
+
print(f"[ERROR] Failed to save file: {e}")
|
670 |
+
return JSONResponse(status_code=500, content={"error": f"Failed to save file: {str(e)}"})
|
671 |
+
|
672 |
+
try:
|
673 |
+
loader = DocumentLoader(file_location)
|
674 |
+
documents = loader.load()
|
675 |
+
print(f"[INFO] Loaded {len(documents)} document(s) from file.")
|
676 |
+
# Get real page/slide count
|
677 |
+
page_count = loader.get_page_count() or 1
|
678 |
+
except Exception as e:
|
679 |
+
print(f"[ERROR] Document loading failed: {e}")
|
680 |
+
return JSONResponse(status_code=400, content={"error": f"Document loading failed: {str(e)}"})
|
681 |
+
|
682 |
+
try:
|
683 |
+
text_content = " ".join([doc.page_content for doc in documents])
|
684 |
+
print(f"[INFO] Extracted text content, length: {len(text_content)} characters.")
|
685 |
+
except Exception as e:
|
686 |
+
print(f"[ERROR] Failed to extract text: {e}")
|
687 |
+
return JSONResponse(status_code=500, content={"error": f"Failed to extract text: {str(e)}"})
|
688 |
+
|
689 |
+
try:
|
690 |
+
summarizer = DocumentSummarizer()
|
691 |
+
summary_result = await summarizer.summarize_document(text_content)
|
692 |
+
print(f"[INFO] Document summarized. Classification: {summary_result.get('classification')}")
|
693 |
+
except Exception as e:
|
694 |
+
print(f"[ERROR] Summarization failed: {e}")
|
695 |
+
return JSONResponse(status_code=500, content={"error": f"Summarization failed: {str(e)}"})
|
696 |
+
|
697 |
+
try:
|
698 |
+
chunks = chunk_text(text_content)
|
699 |
+
print(f"[INFO] Created {len(chunks)} chunk(s) for vector store.")
|
700 |
+
except Exception as e:
|
701 |
+
print(f"[ERROR] Chunking failed: {e}")
|
702 |
+
return JSONResponse(status_code=500, content={"error": f"Chunking failed: {str(e)}"})
|
703 |
+
|
704 |
+
try:
|
705 |
+
add_to_vector_store(chunks)
|
706 |
+
print(f"[INFO] Chunks added to vector store.")
|
707 |
+
except Exception as e:
|
708 |
+
print(f"[ERROR] Vector store addition failed: {e}")
|
709 |
+
return JSONResponse(status_code=500, content={"error": f"Vector store addition failed: {str(e)}"})
|
710 |
+
|
711 |
+
# Store chunks for small document queries (in-memory, keyed by filename)
|
712 |
+
if not hasattr(app.state, 'doc_chunks'):
|
713 |
+
app.state.doc_chunks = {}
|
714 |
+
app.state.doc_chunks[file.filename] = chunks
|
715 |
+
|
716 |
+
return {
|
717 |
+
"filename": file.filename,
|
718 |
+
"summary": summary_result["summary"],
|
719 |
+
"classification": summary_result["classification"],
|
720 |
+
"chunk_count": summary_result["chunk_count"],
|
721 |
+
"processing_method": summary_result["processing_method"],
|
722 |
+
"page_estimate": page_count
|
723 |
+
}
|
724 |
+
|
725 |
+
except Exception as e:
|
726 |
+
print(f"[ERROR] Unexpected error: {e}")
|
727 |
+
return JSONResponse(
|
728 |
+
status_code=500,
|
729 |
+
content={"error": f"Unexpected error processing document: {str(e)}"}
|
730 |
+
)
|
731 |
+
|
732 |
+
@app.post("/summarize")
|
733 |
+
async def summarize_document(filename: str = Form(...)):
|
734 |
+
"""Generate summary for a specific document"""
|
735 |
+
try:
|
736 |
+
file_location = f"uploaded_docs/{filename}"
|
737 |
+
if not os.path.exists(file_location):
|
738 |
+
return JSONResponse(
|
739 |
+
status_code=404,
|
740 |
+
content={"error": "Document not found"}
|
741 |
+
)
|
742 |
+
|
743 |
+
# Load and process document
|
744 |
+
loader = DocumentLoader(file_location)
|
745 |
+
documents = loader.load()
|
746 |
+
text_content = " ".join([doc.page_content for doc in documents])
|
747 |
+
|
748 |
+
# Generate summary
|
749 |
+
summarizer = DocumentSummarizer()
|
750 |
+
summary_result = await summarizer.summarize_document(text_content)
|
751 |
+
|
752 |
+
return {
|
753 |
+
"filename": filename,
|
754 |
+
"summary": summary_result["summary"],
|
755 |
+
"classification": summary_result["classification"],
|
756 |
+
"chunk_count": summary_result["chunk_count"],
|
757 |
+
"processing_method": summary_result["processing_method"]
|
758 |
+
}
|
759 |
+
|
760 |
+
except Exception as e:
|
761 |
+
return JSONResponse(
|
762 |
+
status_code=500,
|
763 |
+
content={"error": f"Error summarizing document: {str(e)}"}
|
764 |
+
)
|
765 |
+
|
766 |
+
@app.post("/query")
|
767 |
+
async def query_document(filename: str = Form(...), query: str = Form(...)):
|
768 |
+
"""Query a document using RAG pipeline"""
|
769 |
+
try:
|
770 |
+
# Try to get all chunks for small documents
|
771 |
+
chunks = None
|
772 |
+
if hasattr(app.state, 'doc_chunks') and filename in app.state.doc_chunks:
|
773 |
+
chunks = app.state.doc_chunks[filename]
|
774 |
+
|
775 |
+
# If we have all chunks, check if the document is small
|
776 |
+
is_small_doc = False
|
777 |
+
if chunks is not None:
|
778 |
+
# Heuristic: if number of chunks < 20, treat as small document
|
779 |
+
is_small_doc = len(chunks) < 20
|
780 |
+
|
781 |
+
if is_small_doc:
|
782 |
+
# Use all chunks as context
|
783 |
+
context_chunks = chunks
|
784 |
+
else:
|
785 |
+
# Use similarity search for large documents or if chunks not available
|
786 |
+
search_results = similarity_search(query, top_k=5)
|
787 |
+
context_chunks = search_results.get("documents", [[]])[0]
|
788 |
+
|
789 |
+
context = " ".join(context_chunks)
|
790 |
+
|
791 |
+
# Generate a more intelligent response based on the actual context
|
792 |
+
if not context_chunks:
|
793 |
+
answer = f"I couldn't find specific information in the document that directly answers your question: '{query}'. The document may not contain relevant content for this query."
|
794 |
+
else:
|
795 |
+
# Create a more contextual response based on the found chunks
|
796 |
+
answer = generate_contextual_response(query, context_chunks)
|
797 |
+
|
798 |
+
return {
|
799 |
+
"filename": filename,
|
800 |
+
"query": query,
|
801 |
+
"answer": answer,
|
802 |
+
"context_chunks": len(context_chunks)
|
803 |
+
}
|
804 |
+
|
805 |
+
except Exception as e:
|
806 |
+
return JSONResponse(
|
807 |
+
status_code=500,
|
808 |
+
content={"error": f"Error processing query: {str(e)}"}
|
809 |
+
)
|
810 |
+
|
811 |
+
def generate_contextual_response(query: str, context_chunks: List[str]) -> str:
    """Generate an answer to the query from the retrieved context chunks via the Mistral API."""
    full_context = " ".join(context_chunks)
    # Trim very long contexts before sending them to the API
    if len(full_context) > 8000:
        sentences = full_context.split('. ')
        if len(sentences) > 20:
            relevant_sentences = sentences[:5] + sentences[-5:]
            full_context = '. '.join(relevant_sentences)
    # Use Mistral API for contextual response
    from app.summarizer import DocumentSummarizer
    summarizer = DocumentSummarizer()
    prompt = f"You are a helpful assistant that answers questions based on document content. Provide comprehensive, accurate answers using the given context. Use plain text format without markdown. Provide detailed responses that fully address the user's question.\n\nQuestion: {query}\n\nContext: {full_context}\n\nAnswer (comprehensive, plain text):"
    return summarizer.call_mistral_api(prompt)

def generate_simulated_response(query: str, full_context: str) -> str:
    """Generate a simulated response when the LLM API is not available"""

    # Analyze the query type and generate an appropriate response
    query_lower = query.lower()

    if any(word in query_lower for word in ["key", "main", "important", "points", "summary"]):
        # Extract key points from the context
        sentences = full_context.split('. ')
        key_points = []
        for sentence in sentences[:min(5, len(sentences))]:  # Allow up to 5 key points
            if len(sentence.strip()) > 10:  # Include only meaningful sentences
                key_points.append(sentence.strip())

        if key_points:
            answer = "Based on the document content, here are the key points:\n\n"
            for i, point in enumerate(key_points, 1):
                answer += f"{i}. {point}\n"
        else:
            answer = "The document contains information about your query, but I couldn't extract specific key points from the available content."

    elif any(word in query_lower for word in ["explain", "what is", "how", "why"]):
        # Provide an explanatory response with more content
        if len(full_context) > 300:
            # Take more content for explanations
            relevant_part = full_context[:1000] + "..." if len(full_context) > 1000 else full_context
            answer = f"Based on the document, here's what I found regarding your question '{query}':\n\n{relevant_part}"
        else:
            answer = f"The document provides the following information about your query: {full_context}"

    elif any(word in query_lower for word in ["conclusion", "result", "find", "found"]):
        # Look for conclusions or results
        sentences = full_context.split('. ')
        conclusion_sentences = []
        for sentence in sentences:
            if any(word in sentence.lower() for word in ["conclude", "result", "therefore", "thus", "finally", "overall"]):
                conclusion_sentences.append(sentence)

        if conclusion_sentences:
            answer = "Based on the document analysis, here are the conclusions related to your query:\n\n"
            for sentence in conclusion_sentences[:3]:  # Allow up to 3 conclusions
                answer += f"• {sentence}\n"
        else:
            answer = "The document contains relevant information about your query, but I couldn't identify specific conclusions from the available content."

    else:
        # General response with more content
        if len(full_context) > 300:
            # Take up to 8 sentences for general responses
            sentences = full_context.split('. ')
            summary_sentences = sentences[:min(8, len(sentences))]
            summary = '. '.join(summary_sentences)
            answer = f"Regarding your question '{query}', the document contains the following relevant information:\n\n{summary}"
        else:
            answer = f"The document provides this information related to your query: {full_context}"

    # Clean markdown formatting from the answer
    answer = clean_markdown_formatting(answer)
    return answer
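For context, a minimal client sketch of the endpoints above — not part of this commit. It assumes a local instance at http://localhost:8000 and a file named report.pdf; both are placeholders.

# Hypothetical client sketch: exercising /upload and /query with requests.
import requests

BASE_URL = "http://localhost:8000"  # assumption: local uvicorn instance

# Upload a document; the response carries the summary, classification,
# chunk count, processing method, and page estimate.
with open("report.pdf", "rb") as fh:
    upload = requests.post(f"{BASE_URL}/upload", files={"file": ("report.pdf", fh)})
upload.raise_for_status()
print(upload.json()["summary"])

# Query the same document through the RAG pipeline (form-encoded fields).
answer = requests.post(
    f"{BASE_URL}/query",
    data={"filename": "report.pdf", "query": "What are the key findings?"},
)
answer.raise_for_status()
print(answer.json()["answer"])

Because /query falls back to similarity search only when the in-memory chunk cache misses or the document is large, querying right after an upload in the same process exercises the small-document path.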
app/summarizer.py
ADDED
@@ -0,0 +1,403 @@
from typing import List, Dict, Any
import asyncio
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import requests
import os

# Remove top-level import of transformers and torch
# from transformers import AutoTokenizer, AutoModelForCausalLM
# import torch

def clean_markdown_formatting(text: str) -> str:
    """
    Clean markdown formatting from text and convert to plain text

    Args:
        text: Text that may contain markdown formatting

    Returns:
        Cleaned plain text without markdown
    """
    if not text:
        return text

    # Remove markdown headers (# ## ### etc.)
    text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE)

    # Remove bold formatting (**text** or __text__)
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
    text = re.sub(r'__(.*?)__', r'\1', text)

    # Remove italic formatting (*text* or _text_)
    text = re.sub(r'\*(.*?)\*', r'\1', text)
    text = re.sub(r'_(.*?)_', r'\1', text)

    # Remove code formatting (`text`)
    text = re.sub(r'`(.*?)`', r'\1', text)

    # Remove links [text](url) -> text
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)

    # Remove inline links [text] -> text
    text = re.sub(r'\[([^\]]+)\]', r'\1', text)

    # Remove strikethrough ~~text~~
    text = re.sub(r'~~(.*?)~~', r'\1', text)

    # Remove blockquotes (> text)
    text = re.sub(r'^>\s+', '', text, flags=re.MULTILINE)

    # Remove horizontal rules (---, ***, ___)
    text = re.sub(r'^[-*_]{3,}$', '', text, flags=re.MULTILINE)

    # Clean up extra whitespace
    text = re.sub(r'\n\s*\n\s*\n', '\n\n', text)  # Remove excessive line breaks
    text = re.sub(r' +', ' ', text)  # Remove multiple spaces
    text = re.sub(r'\n +', '\n', text)  # Remove leading spaces after line breaks

    # Clean up the text
    text = text.strip()

    return text

MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY", "")
if not MISTRAL_API_KEY:
    print("[WARNING] MISTRAL_API_KEY environment variable is not set. API calls will fail.")
MISTRAL_API_URL = "https://api.mistral.ai/v1/chat/completions"

class DocumentSummarizer:
    def __init__(self, chunk_size=1200, chunk_overlap=200):
        """
        Initialize the document summarizer (CPU-optimized, API-based version)

        Args:
            chunk_size: Size of text chunks for processing (optimized for CPU)
            chunk_overlap: Overlap between chunks (reduced for memory efficiency)
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", ". ", "! ", "? ", " ", ""]
        )

    def classify_document_size(self, text: str) -> Dict[str, Any]:
        """
        Classify document as small or large based on content length

        Args:
            text: Document text content

        Returns:
            Dict with classification info
        """
        words = len(text.split())
        pages_estimate = words // 500  # Rough estimate: 500 words per page
        is_large = pages_estimate > 15

        return {
            "is_large": is_large,
            "word_count": words,
            "page_estimate": pages_estimate,
            "classification": "Large Document" if is_large else "Small Document"
        }

    def create_chunks(self, text: str) -> List[Document]:
        """
        Create text chunks using RecursiveCharacterTextSplitter

        Args:
            text: Document text content

        Returns:
            List of Document chunks
        """
        chunks = self.text_splitter.split_text(text)
        return [Document(page_content=chunk, metadata={"chunk_id": i})
                for i, chunk in enumerate(chunks)]

    def _truncate_text_for_model(self, text: str, max_tokens: int = 4000) -> str:
        """
        Truncate text to fit within model context limits

        Args:
            text: Text to truncate
            max_tokens: Maximum tokens allowed

        Returns:
            Truncated text (assumes roughly 4 characters per token)
        """
        return text[:max_tokens * 4]

    def call_mistral_api(self, prompt: str) -> str:
        headers = {
            "Authorization": f"Bearer {MISTRAL_API_KEY}",
            "Content-Type": "application/json"
        }
        data = {
            "model": "mistral-medium",
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "max_tokens": 500,
            "temperature": 0.3,
            "top_p": 0.8
        }
        try:
            response = requests.post(MISTRAL_API_URL, headers=headers, json=data, timeout=60)
            response.raise_for_status()
            result = response.json()
            return result["choices"][0]["message"]["content"].strip()
        except Exception as e:
            print(f"[WARNING] Error calling Mistral API: {e}")
            return "[Error: Unable to generate summary with Mistral AI API.]"

    async def generate_chunk_summary(self, chunk: Document) -> str:
        """
        Generate summary for a single chunk using the Mistral API

        Args:
            chunk: Document chunk to summarize

        Returns:
            Summary text for the chunk
        """
        truncated_content = self._truncate_text_for_model(chunk.page_content, max_tokens=3000)
        prompt = f"""You are an expert document summarizer. Create comprehensive summaries that capture key information from text chunks. Provide summaries in plain text format without markdown formatting.\n\nText to summarize:\n{truncated_content}\n\nSummary:"""
        response = self.call_mistral_api(prompt)
        return clean_markdown_formatting(response)

    def _simulate_chunk_summary(self, text: str) -> str:
        """
        Simulate chunk summary generation (fallback when the LLM API is not available)

        Args:
            text: Text to summarize

        Returns:
            Simulated summary
        """
        # Create a balanced summary simulation
        words = text.split()
        if len(words) < 30:
            return text

        # Split into sentences and take key information
        sentences = text.split('. ')
        if len(sentences) <= 2:
            return text

        # For longer text, create a meaningful summary
        if len(sentences) > 4:
            # Take first, middle, and last sentence for context
            summary_sentences = [sentences[0]]  # Introduction
            middle_idx = len(sentences) // 2
            summary_sentences.append(sentences[middle_idx])  # Key point
            summary_sentences.append(sentences[-1])  # Conclusion
        else:
            # For shorter text, take the first 2 sentences
            summary_sentences = sentences[:2]

        summary = '. '.join(summary_sentences)
        return summary + ('.' if not summary.endswith('.') else '')

    async def summarize_small_document(self, chunks: List[Document]) -> str:
        """
        Summarize small documents (≤15 pages) by summarizing all chunks and combining

        Args:
            chunks: List of document chunks

        Returns:
            Combined document summary
        """
        print(f"Processing small document with {len(chunks)} chunks...")

        # Generate summaries for all chunks
        chunk_summaries = []
        for i, chunk in enumerate(chunks):
            print(f"Summarizing chunk {i+1}/{len(chunks)}...")
            summary = await self.generate_chunk_summary(chunk)
            chunk_summaries.append(summary)

        # Combine all chunk summaries
        combined_summary = " ".join(chunk_summaries)

        # Generate final summary from combined summaries
        final_summary = await self.generate_final_summary(combined_summary, "small")

        return final_summary

    async def summarize_large_document(self, chunks: List[Document]) -> str:
        """
        Summarize large documents (>15 pages) using hierarchical summarization

        Args:
            chunks: List of document chunks

        Returns:
            Hierarchical document summary
        """
        print(f"Processing large document with {len(chunks)} chunks using hierarchical summarization...")

        # Step 1: Generate chunk-level summaries
        chunk_summaries = []
        for i, chunk in enumerate(chunks):
            print(f"Generating chunk summary {i+1}/{len(chunks)}...")
            summary = await self.generate_chunk_summary(chunk)
            chunk_summaries.append(summary)

        # Step 2: Group summaries into sections (for very large documents)
        if len(chunk_summaries) > 50:
            section_summaries = await self._create_section_summaries(chunk_summaries)
        else:
            section_summaries = chunk_summaries

        # Step 3: Generate section-level summaries
        section_level_summaries = []
        for i, section in enumerate(section_summaries):
            print(f"Generating section summary {i+1}/{len(section_summaries)}...")
            if isinstance(section, list):
                combined_section = " ".join(section)
            else:
                combined_section = section
            section_summary = await self.generate_chunk_summary(
                Document(page_content=combined_section, metadata={"section_id": i})
            )
            section_level_summaries.append(section_summary)

        # Step 4: Generate final hierarchical summary
        final_combined = " ".join(section_level_summaries)
        final_summary = await self.generate_final_summary(final_combined, "large")

        return final_summary

    async def _create_section_summaries(self, chunk_summaries: List[str]) -> List[List[str]]:
        """
        Group chunk summaries into sections for very large documents

        Args:
            chunk_summaries: List of chunk summaries

        Returns:
            List of section summaries (each section is a list of chunk summaries)
        """
        section_size = max(10, len(chunk_summaries) // 10)  # Create ~10 sections
        sections = []

        for i in range(0, len(chunk_summaries), section_size):
            section = chunk_summaries[i:i + section_size]
            sections.append(section)

        return sections

    async def generate_final_summary(self, combined_text: str, doc_type: str) -> str:
        """
        Generate final summary from combined text using the Mistral API

        Args:
            combined_text: Combined text to summarize
            doc_type: Type of document (small/large)

        Returns:
            Final document summary
        """
        prompt = f"""You are an expert document summarizer. Create a final summary for the following combined text. Provide a comprehensive, plain text summary.\n\nText:\n{combined_text}\n\nFinal Summary:"""
        response = self.call_mistral_api(prompt)
        return clean_markdown_formatting(response)

    def _simulate_final_summary(self, combined_text: str, doc_type: str) -> str:
        """
        Simulate final summary generation (fallback when the LLM API is not available)

        Args:
            combined_text: Combined text to summarize
            doc_type: Type of document (small/large)

        Returns:
            Simulated final summary
        """
        # Create a balanced final summary
        sentences = combined_text.split('. ')

        if len(sentences) <= 3:
            return combined_text

        # For small documents, take key sentences for better context
        if doc_type == "small":
            if len(sentences) <= 5:
                summary_sentences = sentences
            else:
                # Take introduction, key point, and conclusion for small docs
                summary_sentences = [sentences[0]]  # Introduction
                middle_idx = len(sentences) // 2
                summary_sentences.append(sentences[middle_idx])  # Key point
                summary_sentences.append(sentences[-1])  # Conclusion
        else:
            # For large documents, create a comprehensive summary
            if len(sentences) <= 6:
                summary_sentences = sentences
            else:
                # Take introduction, 2 key points, and conclusion
                summary_sentences = [sentences[0]]  # Introduction
                # Take 2 key points from different parts
                quarter_idx = len(sentences) // 4
                three_quarter_idx = (3 * len(sentences)) // 4
                summary_sentences.append(sentences[quarter_idx])  # First key point
                summary_sentences.append(sentences[three_quarter_idx])  # Second key point
                summary_sentences.append(sentences[-1])  # Conclusion

        summary = '. '.join(summary_sentences)
        return summary + ('.' if not summary.endswith('.') else '')

    async def summarize_document(self, text: str) -> Dict[str, Any]:
        """
        Main method to summarize a document

        Args:
            text: Document text content

        Returns:
            Dict with summary results
        """
        # Classify document size
        classification = self.classify_document_size(text)

        # Create chunks
        chunks = self.create_chunks(text)

        # Generate summary based on document size
        if classification["is_large"]:
            summary = await self.summarize_large_document(chunks)
            processing_method = "Hierarchical Summarization"
        else:
            summary = await self.summarize_small_document(chunks)
            processing_method = "Chunk-wise Summarization"

        return {
            "summary": summary,
            "classification": classification["classification"],
            "word_count": classification["word_count"],
            "page_estimate": classification["page_estimate"],
            "chunk_count": len(chunks),
            "processing_method": processing_method
        }

async def summarize_text(text: str, llm_model=None) -> Dict[str, Any]:
    """
    Convenience function to summarize text

    Args:
        text: Text to summarize
        llm_model: Ignored; kept for backward compatibility with the removed local-model version

    Returns:
        Dict with summary results
    """
    # DocumentSummarizer no longer accepts an llm_model argument, so do not pass one.
    summarizer = DocumentSummarizer()
    return await summarizer.summarize_document(text)
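To see the small/large branching in summarize_document outside the web app, a minimal driver sketch — not part of this commit. The file name sample.txt is a placeholder, and a valid MISTRAL_API_KEY in the environment is assumed.

# Hypothetical driver sketch for DocumentSummarizer.
import asyncio
from app.summarizer import DocumentSummarizer

async def main():
    summarizer = DocumentSummarizer(chunk_size=1200, chunk_overlap=200)
    with open("sample.txt") as fh:  # any plain-text document
        text = fh.read()
    result = await summarizer.summarize_document(text)
    # The result dict mirrors what the /upload endpoint returns.
    print(result["classification"], "-", result["processing_method"])
    print(result["summary"])

asyncio.run(main())

Documents estimated at more than 15 pages (word count // 500) take the hierarchical path; everything else gets plain chunk-wise summarization.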
app/vector_store.py
ADDED
@@ -0,0 +1,55 @@
import chromadb

chroma_client = chromadb.Client()
model = None

def get_model():
    # Lazily load the embedding model so startup does not block on the download
    global model
    if model is None:
        try:
            from sentence_transformers import SentenceTransformer
            model = SentenceTransformer('all-MiniLM-L6-v2')
        except Exception as e:
            print(f"[ERROR] Could not load SentenceTransformer: {e}")
            model = None
    return model

COLLECTION_NAME = "documents"

# Ensure collection exists
if not chroma_client.list_collections() or COLLECTION_NAME not in [c.name for c in chroma_client.list_collections()]:
    chroma_client.create_collection(COLLECTION_NAME)
collection = chroma_client.get_collection(COLLECTION_NAME)

def add_to_vector_store(chunks, metadatas=None):
    try:
        if not chunks:
            print("[WARNING] No chunks provided to vector store")
            return
        model_instance = get_model()
        if model_instance is None:
            print("[ERROR] Embedding model not available.")
            return
        embeddings = model_instance.encode(chunks).tolist()
        ids = [f"chunk_{i}" for i in range(len(chunks))]
        collection.add(documents=chunks, embeddings=embeddings, ids=ids, metadatas=metadatas)
        print(f"[INFO] Added {len(chunks)} chunks to vector store")
    except Exception as e:
        print(f"[ERROR] Failed to add chunks to vector store: {e}")
        # Don't raise the exception to prevent the entire upload from failing

def similarity_search(query, top_k=5):
    try:
        if not query or not query.strip():
            return {"documents": [[]], "metadatas": [[]], "distances": [[]]}
        model_instance = get_model()
        if model_instance is None:
            print("[ERROR] Embedding model not available.")
            return {"documents": [[]], "metadatas": [[]], "distances": [[]]}
        embedding = model_instance.encode([query]).tolist()[0]
        results = collection.query(query_embeddings=[embedding], n_results=top_k)
        return results
    except Exception as e:
        print(f"[ERROR] Similarity search failed: {e}")
        # Return empty results instead of failing
        return {"documents": [[]], "metadatas": [[]], "distances": [[]]}
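A quick sketch of the store's round-trip API — not part of this commit, with invented example strings. It shows the nested per-query list shape that Chroma query results use, which is why /query above indexes into ["documents"][0]. Note that ids restart at chunk_0 on every call, so a second upload overwrites the first document's entries under the same ids.

# Hypothetical usage sketch for app/vector_store.py.
from app.vector_store import add_to_vector_store, similarity_search

chunks = [
    "The 2023 report shows revenue grew 12% year over year.",
    "Headcount remained flat while infrastructure costs fell.",
]
add_to_vector_store(chunks)

results = similarity_search("How did revenue change?", top_k=1)
# Chroma returns parallel lists nested per query; take the first query's hits.
print(results["documents"][0])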
requirements.txt
ADDED
@@ -0,0 +1,22 @@
fastapi
uvicorn
langchain
chromadb
pypdf2
pdfplumber
unstructured
pymupdf
python-pptx
python-docx
sentence-transformers
pydantic
httpx
python-multipart
langchain-community
requests
pytesseract
opencv-python
pillow
numpy
pdf2image
# torch, transformers, onnxruntime removed for CPU-only, API-based summarization