import os
import json
import tempfile
from datetime import datetime
from flask import Flask, render_template, request, jsonify, session, redirect, url_for
import google.generativeai as genai
from sentence_transformers import SentenceTransformer
# Removed ChromaDB and added Qdrant
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, Filter, FieldCondition, MatchValue, PointStruct, SearchParams
# LangChain splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
import arxiv
import PyPDF2
from docx import Document
import requests
from werkzeug.utils import secure_filename
from dotenv import load_dotenv
import uuid
import re
from bs4 import BeautifulSoup
import logging
import numpy as np

# Load environment variables
load_dotenv()

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = Flask(__name__)
# SECURITY NOTE(review): the fallback key below is a public constant. Sessions
# signed with it can be forged, so SECRET_KEY should always be set in production.
app.secret_key = os.getenv('SECRET_KEY', 'research-radar-secret-key-2024')
if not os.getenv('SECRET_KEY'):
    logger.warning("⚠️ SECRET_KEY not set. Using the built-in development key.")

# Configuration
UPLOAD_FOLDER = 'uploads'
ALLOWED_EXTENSIONS = {'txt', 'pdf', 'docx'}
MAX_CONTENT_LENGTH = 16 * 1024 * 1024  # 16MB max file size

app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['MAX_CONTENT_LENGTH'] = MAX_CONTENT_LENGTH

# Ensure directories exist
os.makedirs(UPLOAD_FOLDER, exist_ok=True)


# Qdrant helpers
def ensure_qdrant_collection(collection_name: str, vector_size: int) -> None:
    """Create the Qdrant collection and its ``document_id`` index if missing.

    Does nothing when the module-level ``qdrant_client`` is unavailable.

    Args:
        collection_name: Name of the collection to look up or create.
        vector_size: Dimensionality of the vectors stored in the collection.

    Raises:
        Exception: Whatever ``create_collection`` raises when the collection
            could not be looked up and also could not be created.
    """
    if not qdrant_client:
        return
    try:
        qdrant_client.get_collection(collection_name)
    except Exception:
        # Use create_collection, not recreate_collection: the lookup can also
        # fail for transient reasons, and recreate_collection would then drop
        # an existing collection together with all stored documents.
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
        )
        logger.info(f"✅ Created Qdrant collection: {collection_name}")
    # Ensure payload index for document_id exists for efficient filtering/scrolling
    try:
        qdrant_client.create_payload_index(
            collection_name=collection_name,
            field_name="document_id",
            field_schema="keyword"
        )
    except Exception as e:
        # Best effort: most likely the index already exists.
        logger.debug(f"Payload index for 'document_id' not created: {e}")


# Initialize models and services
try:
    # Configure Gemini API
    gemini_api_key = os.getenv('GEMINI_API_KEY')
    if gemini_api_key:
        genai.configure(api_key=gemini_api_key)
        gemini_model = genai.GenerativeModel('gemini-2.5-flash')
        logger.info("✅ Gemini API initialized successfully")
    else:
        gemini_model = None
        logger.warning("⚠️ Gemini API key not found. AI features will be limited.")

    # Initialize sentence transformer for embeddings (local model)
    from config import Config
    local_model_path = Config.LOCAL_MODEL_PATH
    if os.path.exists(local_model_path):
        embedding_model = SentenceTransformer(local_model_path)
        logger.info(f"✅ Local sentence transformer model loaded from: {local_model_path}")
    else:
        # Fallback to downloading if local model not found
        embedding_model = SentenceTransformer(Config.EMBEDDING_MODEL)
        logger.warning(f"⚠️ Local model not found at {local_model_path}, downloading {Config.EMBEDDING_MODEL} from HuggingFace")

    # Determine vector size dynamically
    try:
        _probe_vec = embedding_model.encode(["probe text"])
        VECTOR_SIZE = int(_probe_vec.shape[-1]) if hasattr(_probe_vec, 'shape') else len(_probe_vec[0])
    except Exception:
        VECTOR_SIZE = 384  # fallback for all-MiniLM-L6-v2

    # Initialize Qdrant client
    qdrant_url = os.getenv('QDRANT_URL')
    qdrant_api_key = os.getenv('QDRANT_API_KEY')
    qdrant_client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key, timeout=120)
    logger.info("✅ Qdrant client initialized")

    # Ensure default collection exists
    ensure_qdrant_collection('research_papers', VECTOR_SIZE)

except Exception as e:
    # Any failure disables all services; the helpers below check for None.
    logger.error(f"❌ Initialization error: {e}")
    embedding_model = None
    gemini_model = None
    qdrant_client = None
    VECTOR_SIZE = None


def allowed_file(filename):
    """Return True when ``filename`` has an extension in ALLOWED_EXTENSIONS."""
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS


def extract_text_from_pdf(file_path):
    """Extract text from a PDF file.

    Returns the text of every page, each followed by a newline, or an empty
    string when the file cannot be read.
    """
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # ``or ""`` guards against a page yielding no text, so one such
            # page cannot abort extraction of the whole document.
            return "".join((page.extract_text() or "") + "\n" for page in pdf_reader.pages)
    except Exception as e:
        logger.error(f"PDF extraction error: {e}")
        return ""


def extract_text_from_docx(file_path):
    """Extract paragraph text from a DOCX file ("" on failure)."""
    try:
        doc = Document(file_path)
        return "".join((paragraph.text or "") + "\n" for paragraph in doc.paragraphs)
    except Exception as e:
        logger.error(f"DOCX extraction error: {e}")
        return ""


def extract_text_from_txt(file_path):
    """Read a UTF-8 text file and return its content ("" on failure)."""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except Exception as e:
        logger.error(f"TXT extraction error: {e}")
        return ""


def process_document(file_path, filename):
    """Extract text from an uploaded document based on its file extension.

    Args:
        file_path: Path of the file on disk.
        filename: Name whose extension selects the extractor.

    Returns:
        The extracted text, or "" for unsupported or extension-less names.
    """
    if '.' not in filename:
        return ""
    file_extension = filename.rsplit('.', 1)[1].lower()
    if file_extension == 'pdf':
        return extract_text_from_pdf(file_path)
    elif file_extension == 'docx':
        return extract_text_from_docx(file_path)
    elif file_extension == 'txt':
        return extract_text_from_txt(file_path)
    else:
        return ""


def search_arxiv_papers(query, max_results=10):
    """Search arXiv by relevance and return a list of paper dicts.

    Returns an empty list when the search fails.
    """
    try:
        client = arxiv.Client()
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.Relevance
        )
        return [
            {
                'title': result.title,
                'authors': [author.name for author in result.authors],
                'summary': result.summary,
                'url': result.entry_id,
                'pdf_url': result.pdf_url,
                'published': result.published.strftime('%Y-%m-%d'),
                'category': result.primary_category
            }
            for result in client.results(search)
        ]
    except Exception as e:
        logger.error(f"arXiv search error: {e}")
        return []


def generate_summary(text, max_length=500):
    """Generate a summary of ``text`` using the Gemini API.

    Only the first 80000 characters of ``text`` are sent. Returns a
    human-readable message instead of raising when the model is not
    configured or the call fails.
    """
    try:
        if not gemini_model:
            return "Summary generation unavailable - API not configured"

        prompt = f"""
        Please provide a comprehensive summary of this research paper/document in approximately {max_length} words.
        Focus on:
        1. Main research question/objective
        2. Key methodology
        3. Important findings
        4. Conclusions and implications

        Text to summarize:
        {text[:80000]}
        """

        response = gemini_model.generate_content(prompt)
        return response.text
    except Exception as e:
        logger.error(f"Summary generation error: {e}")
        return "Error generating summary. Please try again."


# Text chunking using LangChain
def chunk_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 200):
    """Split ``text`` into overlapping chunks with a recursive character splitter."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""]
    )
    return splitter.split_text(text)


def add_document_to_vector_db(text, metadata, doc_id, collection_name="research_papers"):
    """Chunk, embed and store a document in Qdrant for chat functionality.

    Args:
        text: Full document text.
        metadata: Extra payload fields copied onto every chunk (may be None).
        doc_id: Identifier stored as ``document_id`` on every chunk.
        collection_name: Target Qdrant collection.

    Returns:
        True when the chunks were stored; False when services are unavailable,
        the text produced no chunks, or storing failed.
    """
    try:
        if not embedding_model or not qdrant_client or not VECTOR_SIZE:
            return False
        ensure_qdrant_collection(collection_name, VECTOR_SIZE)

        # Split text using recursive text splitter
        chunks = chunk_text(text, chunk_size=1200, chunk_overlap=250)
        if not chunks:
            return False

        embeddings = embedding_model.encode(chunks)
        vectors = embeddings.tolist() if hasattr(embeddings, 'tolist') else embeddings

        base_payload = dict(metadata or {})
        # The document listing reads 'word_count' from the payload, so store it
        # unless the caller already supplied one.
        base_payload.setdefault('word_count', len(text.split()))

        total_chunks = len(chunks)
        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=vector,
                payload={
                    **base_payload,
                    'document_id': doc_id,
                    'chunk_index': i,
                    'total_chunks': total_chunks,
                    'content': chunk,
                }
            )
            for i, (chunk, vector) in enumerate(zip(chunks, vectors))
        ]

        qdrant_client.upsert(collection_name=collection_name, points=points, wait=True)
        return True
    except Exception as e:
        logger.error(f"Vector DB error: {e}")
        return False


def query_vector_db(query, doc_id, collection_name="research_papers", n_results=3):
    """Query Qdrant for the chunks of ``doc_id`` most similar to ``query``.

    Returns:
        ``{'documents': [[chunk_text, ...]]}`` on success. An empty list is
        returned when services are unavailable or the query fails; callers
        rely on this shape, so it is kept as is.
    """
    try:
        if not embedding_model or not qdrant_client or not VECTOR_SIZE:
            return []
        ensure_qdrant_collection(collection_name, VECTOR_SIZE)

        query_embedding = embedding_model.encode([query])
        query_vector = query_embedding[0].tolist() if hasattr(query_embedding, 'tolist') else list(query_embedding[0])

        flt = Filter(must=[FieldCondition(key="document_id", match=MatchValue(value=doc_id))])
        results = qdrant_client.search(
            collection_name=collection_name,
            query_vector=query_vector,
            limit=n_results,
            query_filter=flt,
            with_payload=True,
            with_vectors=False
        )
        documents = [
            (getattr(r, 'payload', None) or {}).get('content', '')
            for r in results or []
        ]
        return {'documents': [documents]}
    except Exception as e:
        logger.error(f"Vector DB query error: {e}")
        return []


def get_all_chunks_for_document(doc_id: str, collection_name: str = "research_papers"):
    """Retrieve all chunk texts for a document from Qdrant, ordered by chunk_index.

    Returns an empty list when Qdrant is unavailable or the scroll fails.
    """
    try:
        if not qdrant_client:
            return []
        all_points = []
        next_offset = None
        flt = Filter(must=[FieldCondition(key="document_id", match=MatchValue(value=doc_id))])
        while True:
            points, next_offset = qdrant_client.scroll(
                collection_name=collection_name,
                scroll_filter=flt,
                limit=500,
                offset=next_offset,
                with_payload=True,
                with_vectors=False
            )
            all_points.extend(points)
            if not next_offset:
                break
        # Order by chunk_index
        all_points.sort(key=lambda p: (p.payload or {}).get('chunk_index', 0))
        return [(p.payload or {}).get('content', '') for p in all_points]
    except Exception as e:
        logger.error(f"Qdrant scroll error: {e}")
        return []


def _build_document_record(doc_id, payload, title):
    """Build the document-metadata dict returned by the document helpers."""
    return {
        'document_id': doc_id,
        'title': title,
        'authors': payload.get('authors', ['Unknown']),
        'published': payload.get('published', 'Unknown Date'),
        'category': payload.get('category', 'Research'),
        'filename': payload.get('filename', ''),
        'pdf_url': payload.get('pdf_url', ''),
        'type': payload.get('type', 'document'),
        'upload_date': payload.get('upload_date', ''),
        'total_chunks': payload.get('total_chunks', 0),
        'word_count': payload.get('word_count', 0)
    }


def _listing_title(payload):
    """Return the title shown in the document listing for a chunk payload."""
    doc_type = payload.get('type', 'document')
    # ``or`` also covers a stored title of None.
    title = payload.get('title') or 'Untitled Document'
    if doc_type == 'arxiv_paper' and payload.get('pdf_url'):
        # Extract arXiv ID from URL for better title
        pdf_url = payload.get('pdf_url', '')
        if 'arxiv.org/pdf/' in pdf_url:
            arxiv_id = pdf_url.split('/')[-1].replace('.pdf', '')
            title = f"arXiv:{arxiv_id}"
        elif 'arxiv.org/abs/' in pdf_url:
            arxiv_id = pdf_url.split('/')[-1]
            title = f"arXiv:{arxiv_id}"
    elif doc_type == 'uploaded_document' and payload.get('filename'):
        title = payload.get('filename')
    return title


def get_all_documents(collection_name: str = "research_papers"):
    """Get all unique documents from Qdrant with their metadata.

    Scrolls every point in the collection and keeps the metadata of the first
    chunk seen per ``document_id``.

    Returns:
        A list of document dicts sorted by ``upload_date`` (newest first), or
        an empty list when Qdrant is unavailable or the scroll fails.
    """
    try:
        if not qdrant_client:
            return []

        # Get all points to extract unique documents
        all_points = []
        next_offset = None
        while True:
            points, next_offset = qdrant_client.scroll(
                collection_name=collection_name,
                limit=1000,
                offset=next_offset,
                with_payload=True,
                with_vectors=False
            )
            all_points.extend(points)
            if not next_offset:
                break

        # Group by document_id and extract metadata
        documents = {}
        for point in all_points:
            payload = point.payload or {}
            doc_id = payload.get('document_id')
            if not doc_id or doc_id in documents:
                continue
            documents[doc_id] = _build_document_record(doc_id, payload, _listing_title(payload))

        # Convert to list and sort by upload date (newest first)
        doc_list = list(documents.values())
        doc_list.sort(key=lambda x: x.get('upload_date', ''), reverse=True)
        return doc_list
    except Exception as e:
        logger.error(f"Error getting documents: {e}")
        return []


def get_document_metadata(doc_id: str, collection_name: str = "research_papers"):
    """Get metadata for a specific document.

    Returns:
        A metadata dict built from one stored chunk, or None when the document
        does not exist, Qdrant is unavailable, or the lookup fails.
    """
    try:
        if not qdrant_client:
            return None

        # Get first chunk to extract metadata
        flt = Filter(must=[FieldCondition(key="document_id", match=MatchValue(value=doc_id))])
        results = qdrant_client.scroll(
            collection_name=collection_name,
            scroll_filter=flt,
            limit=1,
            with_payload=True,
            with_vectors=False
        )

        if results and results[0]:
            payload = results[0][0].payload or {}
            title = payload.get('title') or 'Untitled Document'
            return _build_document_record(doc_id, payload, title)
        return None
    except Exception as e:
        logger.error(f"Error getting document metadata: {e}")
        return None


# Paper ingestion helpers

# Matches old-style IDs ("hep-th/9901001", optional version) before falling
# back to new-style IDs ("2301.12345v2"); the ID is captured in group 1.
_ARXIV_URL_RE = re.compile(
    r"arxiv\.org/(?:abs|pdf)/([a-z\-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?|[\w.\-]+)"
)


def resolve_pdf_url(url_or_pdf: str) -> str:
    """Convert an arXiv abs/pdf URL to a direct PDF URL.

    Non-arXiv URLs are returned unchanged; an empty input yields "".
    """
    if not url_or_pdf:
        return ''
    if 'arxiv.org/pdf/' in url_or_pdf and url_or_pdf.endswith('.pdf'):
        return url_or_pdf
    # convert arXiv abs to pdf
    m = _ARXIV_URL_RE.search(url_or_pdf)
    if m:
        arxiv_id = m.group(1)
        if not arxiv_id.endswith('.pdf'):
            return f"https://arxiv.org/pdf/{arxiv_id}.pdf"
        return f"https://arxiv.org/pdf/{arxiv_id}"
    return url_or_pdf


def download_pdf_to_temp(pdf_url: str) -> str:
    """Download ``pdf_url`` to a temporary ``.pdf`` file and return its path.

    The caller is responsible for deleting the returned file.

    SECURITY NOTE(review): ``pdf_url`` comes from the request body and is
    fetched server-side without host validation (SSRF risk) - confirm whether
    an allow-list is needed.

    Raises:
        requests.RequestException: On connection errors or a non-2xx status.
    """
    tmp_path = None
    try:
        # The context manager releases the connection even if streaming fails.
        with requests.get(pdf_url, stream=True, timeout=30) as r:
            r.raise_for_status()
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
                tmp_path = tmp.name
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        tmp.write(chunk)
        return tmp_path
    except Exception:
        # Do not leave a partial download behind.
        if tmp_path and os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError as cleanup_error:
                logger.warning(f"Could not remove temp file {tmp_path}: {cleanup_error}")
        raise


def ingest_paper(pdf_url: str, paper_meta: dict = None) -> tuple:
    """Download PDF, extract text, chunk, embed and store in Qdrant.

    On success the new document is also set as the active document in the
    Flask session, so this must run inside a request context.

    Args:
        pdf_url: PDF or arXiv URL (resolved via ``resolve_pdf_url``).
        paper_meta: Optional extra metadata; entries whose value is None are
            ignored so they cannot overwrite defaults or be stored as nulls.

    Returns:
        ``(doc_id, word_count)``, or ``(None, 0)`` when no text could be
        extracted or the vectors could not be stored.
    """
    pdf_url = resolve_pdf_url(pdf_url)
    doc_id = str(uuid.uuid4())
    tmp_path = None
    try:
        tmp_path = download_pdf_to_temp(pdf_url)
        text_content = extract_text_from_pdf(tmp_path)
        if not text_content.strip():
            return None, 0
        metadata = {
            'source': 'arxiv',
            'pdf_url': pdf_url,
            'type': 'arxiv_paper'
        }
        if paper_meta:
            metadata.update({k: v for k, v in paper_meta.items() if v is not None})
        ok = add_document_to_vector_db(text_content, metadata, doc_id)
        if not ok:
            return None, 0
        # set active document
        session['active_document_id'] = doc_id
        return doc_id, len(text_content.split())
    finally:
        if tmp_path and os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError as e:
                logger.warning(f"Could not remove temp file {tmp_path}: {e}")


def generate_summary_from_qdrant(doc_id: str, max_chars: int = 80000) -> str:
    """Summarize a stored document from its Qdrant chunks.

    Chunks are concatenated in order until adding the next one would exceed
    ``max_chars``; the result is passed to ``generate_summary``.
    """
    chunks = get_all_chunks_for_document(doc_id)
    if not chunks:
        return "No content available to summarize."
    # Concatenate up to max_chars
    parts = []
    used = 0
    for chunk in chunks:
        if used + len(chunk) > max_chars:
            break
        parts.append(chunk + '\n')
        used += len(chunk) + 1
    return generate_summary(''.join(parts))


def generate_chat_response(question, context_docs):
    """Generate a chat response using Gemini with the given context chunks.

    Returns a human-readable message instead of raising when the model is not
    configured or the call fails.
    """
    try:
        if not gemini_model:
            return "Chat functionality unavailable - API not configured"

        context = "\n\n".join(context_docs) if context_docs else ""

        prompt = f"""
        You are a research assistant helping users understand academic papers.
        Answer the following question based on the provided context from research papers.
        If the context doesn't contain relevant information, say so politely and suggest what information would be needed.

        Context from research papers:
        {context}

        Question: {question}

        Please provide a clear, accurate, and helpful response.
        """

        response = gemini_model.generate_content(prompt)
        return response.text
    except Exception as e:
        logger.error(f"Chat response error: {e}")
        return "Error generating response. Please try again."
# Routes
@app.route('/')
def index():
    """Main page"""
    return render_template('index.html')


@app.route('/search', methods=['POST'])
def search_papers():
    """Search arXiv papers for the JSON field ``query``."""
    try:
        # silent=True: a missing or invalid JSON body becomes {} so the
        # validation below answers 400 instead of crashing into a 500.
        data = request.get_json(silent=True) or {}
        query = (data.get('query') or '').strip()

        if not query:
            return jsonify({'error': 'Query is required'}), 400

        papers = search_arxiv_papers(query, max_results=10)
        return jsonify({'papers': papers})

    except Exception as e:
        logger.error(f"Search failed: {e}", exc_info=True)
        return jsonify({'error': f'Search failed: {str(e)}'}), 500


@app.route('/ingest-paper', methods=['POST'])
def ingest_paper_endpoint():
    """Ingest a paper PDF by URL: download, chunk, embed, store in Qdrant."""
    try:
        data = request.get_json(silent=True) or {}
        pdf_url = data.get('pdf_url') or data.get('url')
        if not pdf_url:
            return jsonify({'error': 'pdf_url is required'}), 400

        # Only forward metadata the client actually sent, so missing fields
        # are not stored as nulls.
        paper_meta = {
            key: data[key]
            for key in ('title', 'authors', 'published')
            if data.get(key) is not None
        }
        doc_id, word_count = ingest_paper(pdf_url, paper_meta=paper_meta)
        if not doc_id:
            return jsonify({'error': 'Failed to ingest paper'}), 500
        return jsonify({'success': True, 'doc_id': doc_id, 'word_count': word_count})
    except Exception as e:
        logger.error(f"Ingestion failed: {e}", exc_info=True)
        return jsonify({'error': f'Ingestion failed: {str(e)}'}), 500


@app.route('/upload', methods=['POST'])
def upload_file():
    """Handle file upload: extract text, summarize and index for chat."""
    try:
        if 'file' not in request.files:
            return jsonify({'error': 'No file selected'}), 400

        file = request.files['file']
        if file.filename == '':
            return jsonify({'error': 'No file selected'}), 400

        if not (file and allowed_file(file.filename)):
            return jsonify({'error': 'Invalid file type'}), 400

        # secure_filename can strip the whole stem (e.g. non-ASCII names) and
        # with it the dot, so take the extension from the validated original
        # name and make sure the sanitized name still carries it.
        extension = file.filename.rsplit('.', 1)[1].lower()
        filename = secure_filename(file.filename) or f"upload.{extension}"
        if not filename.lower().endswith(f".{extension}"):
            filename = f"{filename}.{extension}"

        # Generate a unique ID for this document session
        doc_id = str(uuid.uuid4())

        # Use a temporary file to avoid cluttering the upload folder
        tmp_file_path = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{filename}") as tmp_file:
                tmp_file_path = tmp_file.name
                # Write through the open handle instead of reopening the path.
                file.save(tmp_file)

            # Extract text from document
            text_content = process_document(tmp_file_path, filename)
        finally:
            # Clean up temporary file immediately, even if extraction raised
            if tmp_file_path and os.path.exists(tmp_file_path):
                try:
                    os.remove(tmp_file_path)
                except OSError as cleanup_error:
                    logger.warning(f"Could not remove temp file {tmp_file_path}: {cleanup_error}")

        if not text_content.strip():
            return jsonify({'error': 'Could not extract text from file'}), 400

        # Generate summary
        summary = generate_summary(text_content)

        # Add to vector database for chat
        metadata = {
            'filename': file.filename,
            'upload_date': datetime.now().isoformat(),
            'type': 'uploaded_document'
        }
        indexed = add_document_to_vector_db(text_content, metadata, doc_id)

        if indexed:
            # Store the active document ID in the session
            session['active_document_id'] = doc_id
        else:
            # The summary is still useful, but chat has nothing to search.
            logger.warning(f"Document {doc_id} could not be indexed for chat")

        return jsonify({
            'success': True,
            'filename': file.filename,
            'summary': summary,
            'word_count': len(text_content.split()),
            'doc_id': doc_id,  # Send doc_id to frontend
            'indexed': bool(indexed)
        })

    except Exception as e:
        logger.error(f"Upload failed: {e}", exc_info=True)
        return jsonify({'error': f'Upload failed: {str(e)}'}), 500


def _lookup_arxiv_metadata(paper_url):
    """Look up arXiv metadata for ``paper_url``; return None when unavailable."""
    if not paper_url:
        return None
    try:
        # Extract arXiv ID from URL
        arxiv_id = paper_url.rstrip('/').split('/')[-1].replace('.pdf', '')
        if not arxiv_id:
            return None
        client = arxiv.Client()
        search = arxiv.Search(id_list=[arxiv_id])
        for result in client.results(search):
            return {
                'title': result.title,
                'authors': [author.name for author in result.authors],
                'summary': result.summary,
                'url': result.entry_id,
                'pdf_url': result.pdf_url,
                'published': result.published.strftime('%Y-%m-%d')
            }
    except Exception as e:
        # Metadata is optional; ingestion can proceed from the URL alone.
        logger.warning(f"arXiv metadata lookup failed: {e}")
    return None


@app.route('/summarize-paper', methods=['POST'])
def summarize_paper():
    """Summarize paper: if doc_id provided, summarize from Qdrant; else ingest then summarize."""
    try:
        data = request.get_json(silent=True) or {}
        doc_id = data.get('doc_id')
        paper_url = (data.get('url') or '').strip()
        pdf_url = data.get('pdf_url')

        if not doc_id and not (paper_url or pdf_url):
            return jsonify({'error': 'doc_id or url/pdf_url is required'}), 400

        # If doc_id not provided, ingest first
        paper_data = None
        if not doc_id:
            # If only abs URL provided, try resolve via arxiv client for metadata
            paper_data = _lookup_arxiv_metadata(paper_url)

            if pdf_url:
                ingest_pdf = pdf_url
            elif paper_data and paper_data.get('pdf_url'):
                ingest_pdf = paper_data['pdf_url']
            else:
                ingest_pdf = resolve_pdf_url(paper_url)

            new_doc_id, _ = ingest_paper(ingest_pdf, paper_meta=paper_data or {})
            if not new_doc_id:
                return jsonify({'error': 'Failed to ingest paper'}), 500
            doc_id = new_doc_id
            session['active_document_id'] = doc_id

        # Summarize from Qdrant chunks
        summary = generate_summary_from_qdrant(doc_id)
        return jsonify({
            'success': True,
            'summary': summary,
            'doc_id': doc_id,
            'paper': paper_data
        })

    except Exception as e:
        logger.error(f"Summarize request failed: {e}", exc_info=True)
        return jsonify({'error': f'Request failed: {str(e)}'}), 500


@app.route('/chat', methods=['POST'])
def chat():
    """Handle chat queries for the active document"""
    try:
        data = request.get_json(silent=True) or {}
        # Accept both 'message' and 'question' for backward compatibility
        question = (data.get('message') or data.get('question') or '').strip()
        doc_id = session.get('active_document_id')

        if not question:
            return jsonify({'error': 'Message is required'}), 400

        # If no active document, provide general help
        if not doc_id:
            if not gemini_model:
                return jsonify({'error': 'AI service is not available. Please check your API configuration.'}), 500

            # Generate a general response without document context
            try:
                prompt = f"""
                You are a helpful AI research assistant for Research Radar. The user asked: "{question}"

                Since no document is currently loaded, provide a helpful response about:
                1. How to use Research Radar (search papers, upload documents, chat features)
                2. General research guidance if the question is research-related
                3. Suggest they upload a document or search for papers to get more specific help

                Keep your response friendly and informative.
                """

                response = gemini_model.generate_content(prompt)
                return jsonify({
                    'success': True,
                    'response': response.text,
                    'context_found': False,
                    'no_document': True
                })
            except Exception as e:
                # Fall back to a canned greeting, but keep the failure visible.
                logger.error(f"General chat response failed: {e}")
                return jsonify({
                    'success': True,
                    'response': "Hello! I'm your AI research assistant. To get started, please upload a document or search for papers using the navigation above. Then I can help you analyze content, answer questions, and provide insights about your research materials.",
                    'context_found': False,
                    'no_document': True
                })

        # Query vector database for relevant context from the active document
        search_results = query_vector_db(question, doc_id)

        context_docs = []
        # query_vector_db returns a dict on success and [] on failure.
        if search_results and isinstance(search_results, dict) and 'documents' in search_results:
            context_docs = search_results['documents'][0]

        # Generate response
        response = generate_chat_response(question, context_docs)

        return jsonify({
            'success': True,
            'response': response,
            'context_found': len(context_docs) > 0
        })

    except Exception as e:
        logger.error(f"Chat failed: {e}", exc_info=True)
        return jsonify({'error': f'Chat failed: {str(e)}'}), 500


@app.route('/documents', methods=['GET'])
def get_documents():
    """Get all documents from the vector database"""
    try:
        documents = get_all_documents()
        return jsonify({'success': True, 'documents': documents})
    except Exception as e:
        return jsonify({'error': f'Failed to get documents: {str(e)}'}), 500


# The <doc_id> URL variable supplies the view's ``doc_id`` argument.
@app.route('/documents/<doc_id>', methods=['GET'])
def get_document(doc_id):
    """Get a specific document's metadata"""
    try:
        metadata = get_document_metadata(doc_id)
        if not metadata:
            return jsonify({'error': 'Document not found'}), 404
        return jsonify({'success': True, 'document': metadata})
    except Exception as e:
        return jsonify({'error': f'Failed to get document: {str(e)}'}), 500


@app.route('/documents/<doc_id>/summary', methods=['GET'])
def get_document_summary(doc_id):
    """Get summary for a specific document"""
    try:
        # Check existence first so an unknown ID does not trigger a model call.
        metadata = get_document_metadata(doc_id)
        if not metadata:
            return jsonify({'error': 'Document not found'}), 404

        summary = generate_summary_from_qdrant(doc_id)
        return jsonify({
            'success': True,
            'summary': summary,
            'document': metadata
        })
    except Exception as e:
        return jsonify({'error': f'Failed to get summary: {str(e)}'}), 500


@app.route('/documents/<doc_id>/activate', methods=['POST'])
def activate_document(doc_id):
    """Set a document as the active document for chat"""
    try:
        metadata = get_document_metadata(doc_id)
        if not metadata:
            return jsonify({'error': 'Document not found'}), 404

        session['active_document_id'] = doc_id
        return jsonify({
            'success': True,
            'message': 'Document activated',
            'document': metadata
        })
    except Exception as e:
        return jsonify({'error': f'Failed to activate document: {str(e)}'}), 500


@app.route('/documents/<doc_id>', methods=['DELETE'])
def delete_document(doc_id):
    """Delete a document from Qdrant"""
    try:
        if not qdrant_client:
            return jsonify({'error': 'Vector database not available'}), 500

        # Delete all points for this document
        flt = Filter(must=[FieldCondition(key="document_id", match=MatchValue(value=doc_id))])
        qdrant_client.delete(
            collection_name="research_papers",
            points_selector=flt
        )

        # Do not keep chatting against a document that no longer exists.
        if session.get('active_document_id') == doc_id:
            session.pop('active_document_id', None)

        return jsonify({
            'success': True,
            'message': 'Document deleted successfully'
        })
    except Exception as e:
        return jsonify({'error': f'Failed to delete document: {str(e)}'}), 500


@app.route('/documents', methods=['DELETE'])
def clear_all_documents():
    """Clear all documents from Qdrant"""
    try:
        if not qdrant_client:
            return jsonify({'error': 'Vector database not available'}), 500

        # ``points_selector=None`` is not a valid selector, so drop the whole
        # collection and create it again empty (with its payload index).
        qdrant_client.delete_collection(collection_name="research_papers")
        ensure_qdrant_collection("research_papers", VECTOR_SIZE)

        session.pop('active_document_id', None)

        return jsonify({
            'success': True,
            'message': 'All documents cleared successfully'
        })
    except Exception as e:
        return jsonify({'error': f'Failed to clear documents: {str(e)}'}), 500


@app.route('/clear-session', methods=['POST'])
def clear_session():
    """Clear the active document from the session"""
    session.pop('active_document_id', None)
    return jsonify({'success': True, 'message': 'Session cleared.'})


@app.route('/health')
def health_check():
    """Health check endpoint"""
    return jsonify({
        'status': 'healthy',
        'gemini_available': gemini_model is not None,
        'embeddings_available': embedding_model is not None,
        'vector_db_available': qdrant_client is not None
    })


if __name__ == '__main__':
    print("🚀 Research Radar - Starting Flask Application...")
    print("📚 Features: arXiv search, document upload, AI summaries, chat functionality")
    print("🔑 Make sure to set GEMINI_API_KEY in your .env file")
    print("🗄 Using Qdrant as Vector DB. Ensure Qdrant is reachable via QDRANT_URL")

    # Get port from environment variable (for Hugging Face Spaces)
    port = int(os.environ.get('PORT', 5000))
    debug = os.environ.get('FLASK_ENV') == 'development'

    print(f"🌐 Access the app at: http://localhost:{port}")
    app.run(debug=debug, host='0.0.0.0', port=port)