# Pix-Agent: app/utils/pdf_processor.py
import os
import logging
import uuid
from typing import List, Dict, Any, Optional

from app.utils.pinecone_fix import PineconeConnectionManager

# Langchain imports for document processing
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import google.generativeai as genai
# Configure logger
logger = logging.getLogger(__name__)
class PDFProcessor:
"""Process PDF files and create embeddings in Pinecone"""
def __init__(self, index_name="testbot768", namespace="Default", api_key=None, vector_db_id=None, mock_mode=False, correlation_id=None):
self.index_name = index_name
self.namespace = namespace
self.api_key = api_key
self.vector_db_id = vector_db_id
self.pinecone_index = None
self.mock_mode = mock_mode
self.correlation_id = correlation_id or str(uuid.uuid4())[:8]
self.google_api_key = os.environ.get("GOOGLE_API_KEY")
# Initialize Pinecone connection if not in mock mode
if not self.mock_mode and self.api_key:
try:
# Use connection manager from pinecone_fix
logger.info(f"[{self.correlation_id}] Initializing Pinecone connection to {self.index_name}")
self.pinecone_index = PineconeConnectionManager.get_index(self.api_key, self.index_name)
logger.info(f"[{self.correlation_id}] Successfully connected to Pinecone index {self.index_name}")
except Exception as e:
logger.error(f"[{self.correlation_id}] Failed to initialize Pinecone: {str(e)}")
# Fall back to mock mode if connection fails
self.mock_mode = True
logger.warning(f"[{self.correlation_id}] Falling back to mock mode due to connection error")
async def process_pdf(self, file_path, document_id=None, metadata=None, progress_callback=None):
"""Process a PDF file and create vector embeddings
This method:
1. Extracts text from PDF using PyPDFLoader
2. Splits text into chunks using RecursiveCharacterTextSplitter
3. Creates embeddings using Google Gemini model
4. Stores embeddings in Pinecone
"""
logger.info(f"[{self.correlation_id}] Processing PDF: {file_path}")
if self.mock_mode:
logger.info(f"[{self.correlation_id}] MOCK: Processing PDF {file_path}")
# Mock implementation - just return success
if progress_callback:
await progress_callback(None, document_id, "embedding_complete", 1.0, "Mock processing completed")
return {"success": True, "message": "PDF processed successfully"}
try:
# Initialize metadata if not provided
if metadata is None:
metadata = {}
# Ensure document_id is included
if document_id is None:
document_id = str(uuid.uuid4())
# Add document_id to metadata
metadata["document_id"] = document_id
# The namespace to use might be in vdb-X format if vector_db_id provided
actual_namespace = f"vdb-{self.vector_db_id}" if self.vector_db_id else self.namespace
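            # e.g. vector_db_id=3 yields the namespace "vdb-3"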
# 1. Extract text from PDF
logger.info(f"[{self.correlation_id}] Extracting text from PDF: {file_path}")
if progress_callback:
await progress_callback(None, document_id, "text_extraction", 0.2, "Extracting text from PDF")
loader = PyPDFLoader(file_path)
documents = loader.load()
total_text_length = sum(len(doc.page_content) for doc in documents)
logger.info(f"[{self.correlation_id}] Extracted {len(documents)} pages, total text length: {total_text_length}")
# 2. Split text into chunks
if progress_callback:
await progress_callback(None, document_id, "chunking", 0.4, "Splitting text into chunks")
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000,
chunk_overlap=100,
length_function=len,
separators=["\n\n", "\n", " ", ""]
)
chunks = text_splitter.split_documents(documents)
logger.info(f"[{self.correlation_id}] Split into {len(chunks)} chunks")
# 3. Create embeddings
if progress_callback:
await progress_callback(None, document_id, "embedding", 0.6, "Creating embeddings")
# Initialize Google Gemini for embeddings
if not self.google_api_key:
raise ValueError("Google API key not found in environment variables")
genai.configure(api_key=self.google_api_key)
# First, get the expected dimensions from Pinecone
logger.info(f"[{self.correlation_id}] Checking Pinecone index dimensions")
if not self.pinecone_index:
self.pinecone_index = PineconeConnectionManager.get_index(self.api_key, self.index_name)
stats = self.pinecone_index.describe_index_stats()
pinecone_dimension = stats.dimension
logger.info(f"[{self.correlation_id}] Pinecone index dimension: {pinecone_dimension}")
# Create embedding model
embedding_model = GoogleGenerativeAIEmbeddings(
model="models/embedding-001",
google_api_key=self.google_api_key,
task_type="retrieval_document" # Use document embedding mode for longer text
)
# Get a sample embedding to check dimensions
sample_embedding = embedding_model.embed_query("test")
embedding_dimension = len(sample_embedding)
logger.info(f"[{self.correlation_id}] Generated embeddings with dimension: {embedding_dimension}")
            # Handle a dimension mismatch between the embedding model and the index
if embedding_dimension != pinecone_dimension:
logger.warning(f"[{self.correlation_id}] Embedding dimension mismatch: got {embedding_dimension}, need {pinecone_dimension}")
if embedding_dimension < pinecone_dimension:
                    # Upscale by duplicating each value (e.g. 768 -> 1536) or, when the
                    # target is not an exact doubling, by zero-padding; both preserve the
                    # semantic information in the original embedding
logger.info(f"[{self.correlation_id}] Using duplication strategy to upscale from {embedding_dimension} to {pinecone_dimension}")
if embedding_dimension * 2 == pinecone_dimension:
# Perfect doubling (768 -> 1536)
def adjust_embedding(embedding):
# Duplicate each value to double the dimension
return [val for val in embedding for _ in range(2)]
else:
# Generic padding with zeros
pad_size = pinecone_dimension - embedding_dimension
def adjust_embedding(embedding):
return embedding + [0.0] * pad_size
else:
# Truncation strategy - take first pinecone_dimension values
logger.info(f"[{self.correlation_id}] Will truncate embeddings from {embedding_dimension} to {pinecone_dimension}")
def adjust_embedding(embedding):
return embedding[:pinecone_dimension]
else:
# No adjustment needed
def adjust_embedding(embedding):
return embedding
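            # Why the adjustments above are safe for similarity search: duplicating
            # each value scales every vector's norm by sqrt(2), leaving cosine
            # similarity unchanged and preserving rankings under dot product and
            # euclidean distance; zero-padding changes neither norms nor dot
            # products. Truncation is lossy and only a fallback.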
# Process in batches to avoid memory issues
batch_size = 10
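            # Each iteration issues one embed_documents call for batch_size chunks;
            # a small batch keeps memory bounded and stays within typical
            # embedding-API request limits.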
vectors_to_upsert = []
for i in range(0, len(chunks), batch_size):
batch = chunks[i:i+batch_size]
# Extract text content
texts = [chunk.page_content for chunk in batch]
# Create embeddings for batch
embeddings = embedding_model.embed_documents(texts)
# Prepare vectors for Pinecone
for j, (chunk, embedding) in enumerate(zip(batch, embeddings)):
# Adjust embedding dimensions if needed
adjusted_embedding = adjust_embedding(embedding)
# Verify dimensions are correct
if len(adjusted_embedding) != pinecone_dimension:
raise ValueError(f"Dimension mismatch after adjustment: got {len(adjusted_embedding)}, expected {pinecone_dimension}")
# Create metadata for this chunk
chunk_metadata = {
"document_id": document_id,
"page": chunk.metadata.get("page", 0),
"chunk_id": f"{document_id}-chunk-{i+j}",
"text": chunk.page_content[:1000], # Store first 1000 chars of text
**metadata # Include original metadata
}
# Create vector record
vector = {
"id": f"{document_id}-{i+j}",
"values": adjusted_embedding,
"metadata": chunk_metadata
}
vectors_to_upsert.append(vector)
logger.info(f"[{self.correlation_id}] Processed batch {i//batch_size + 1}/{(len(chunks)-1)//batch_size + 1}")
# 4. Store embeddings in Pinecone
if progress_callback:
await progress_callback(None, document_id, "storing", 0.8, f"Storing {len(vectors_to_upsert)} vectors in Pinecone")
logger.info(f"[{self.correlation_id}] Upserting {len(vectors_to_upsert)} vectors to Pinecone index {self.index_name}, namespace {actual_namespace}")
# Use PineconeConnectionManager for better error handling
result = PineconeConnectionManager.upsert_vectors_with_validation(
self.pinecone_index,
vectors_to_upsert,
namespace=actual_namespace
)
logger.info(f"[{self.correlation_id}] Successfully upserted {result.get('upserted_count', 0)} vectors to Pinecone")
if progress_callback:
await progress_callback(None, document_id, "embedding_complete", 1.0, "Processing completed")
# Return success with stats
return {
"success": True,
"document_id": document_id,
"chunks_processed": len(chunks),
"total_text_length": total_text_length,
"vectors_created": len(vectors_to_upsert),
"vectors_upserted": result.get('upserted_count', 0),
"message": "PDF processed successfully"
}
except Exception as e:
logger.error(f"[{self.correlation_id}] Error processing PDF: {str(e)}")
return {
"success": False,
"error": f"Error processing PDF: {str(e)}"
}
async def list_namespaces(self):
"""List all namespaces in the Pinecone index"""
if self.mock_mode:
logger.info(f"[{self.correlation_id}] MOCK: Listing namespaces")
return {"success": True, "namespaces": ["test"]}
try:
if not self.pinecone_index:
self.pinecone_index = PineconeConnectionManager.get_index(self.api_key, self.index_name)
# Get index stats which includes namespaces
stats = self.pinecone_index.describe_index_stats()
namespaces = list(stats.get("namespaces", {}).keys())
return {
"success": True,
"namespaces": namespaces
}
except Exception as e:
logger.error(f"[{self.correlation_id}] Error listing namespaces: {str(e)}")
return {
"success": False,
"error": f"Error listing namespaces: {str(e)}"
}
async def delete_namespace(self):
"""Delete all vectors in a namespace"""
if self.mock_mode:
logger.info(f"[{self.correlation_id}] MOCK: Deleting namespace '{self.namespace}'")
return {
"success": True,
"namespace": self.namespace,
"deleted_count": 100,
"message": f"Successfully deleted namespace '{self.namespace}'"
}
try:
if not self.pinecone_index:
self.pinecone_index = PineconeConnectionManager.get_index(self.api_key, self.index_name)
logger.info(f"[{self.correlation_id}] Deleting namespace '{self.namespace}' from index '{self.index_name}'")
# Check if namespace exists
stats = self.pinecone_index.describe_index_stats()
namespaces = stats.get("namespaces", {})
if self.namespace in namespaces:
vector_count = namespaces[self.namespace].get("vector_count", 0)
# Delete all vectors in namespace
self.pinecone_index.delete(delete_all=True, namespace=self.namespace)
return {
"success": True,
"namespace": self.namespace,
"deleted_count": vector_count,
"message": f"Successfully deleted namespace '{self.namespace}' with {vector_count} vectors"
}
else:
return {
"success": True,
"namespace": self.namespace,
"deleted_count": 0,
"message": f"Namespace '{self.namespace}' does not exist - nothing to delete"
}
except Exception as e:
logger.error(f"[{self.correlation_id}] Error deleting namespace: {str(e)}")
return {
"success": False,
"namespace": self.namespace,
"error": f"Error deleting namespace: {str(e)}"
}
async def delete_document(self, document_id):
"""Delete vectors associated with a specific document ID"""
logger.info(f"[{self.correlation_id}] Deleting vectors for document '{document_id}' from namespace '{self.namespace}'")
if self.mock_mode:
logger.info(f"[{self.correlation_id}] MOCK: Deleting document vectors for '{document_id}'")
# In mock mode, simulate deleting 10 vectors
return {
"success": True,
"document_id": document_id,
"namespace": self.namespace,
"deleted_count": 10,
"message": f"Successfully deleted vectors for document '{document_id}' from namespace '{self.namespace}'"
}
try:
if not self.pinecone_index:
self.pinecone_index = PineconeConnectionManager.get_index(self.api_key, self.index_name)
# Use metadata filtering to find vectors with matching document_id
# The specific namespace to use might be vdb-X format if vector_db_id provided
actual_namespace = f"vdb-{self.vector_db_id}" if self.vector_db_id else self.namespace
            # Probe for vectors with this document ID. The dummy query vector is
            # sized to the index's actual dimension (hardcoding 1536 would fail
            # on e.g. 768-dimensional indexes); only the metadata filter matters.
            dim = self.pinecone_index.describe_index_stats().dimension
            results = self.pinecone_index.query(
                vector=[0.0] * dim,
                top_k=1,
                include_metadata=True,
                filter={"document_id": document_id},
                namespace=actual_namespace
            )
# If no vectors found, return success with warning
if len(results.get("matches", [])) == 0:
logger.warning(f"[{self.correlation_id}] No vectors found for document '{document_id}' in namespace '{actual_namespace}'")
return {
"success": True,
"document_id": document_id,
"namespace": actual_namespace,
"deleted_count": 0,
"warning": f"No vectors found for document '{document_id}' in namespace '{actual_namespace}'",
"message": f"Successfully deleted 0 vectors for document '{document_id}' from namespace '{actual_namespace}'"
}
# Delete vectors by filter
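            # Note: delete-by-metadata-filter is available on pod-based Pinecone
            # indexes; serverless indexes generally require deleting by explicit
            # vector IDs instead.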
result = self.pinecone_index.delete(
filter={"document_id": document_id},
namespace=actual_namespace
)
            # Pinecone's delete response does not always include a count; default to 0
            deleted_count = result.get("deleted_count", 0)
return {
"success": True,
"document_id": document_id,
"namespace": actual_namespace,
"deleted_count": deleted_count,
"message": f"Successfully deleted {deleted_count} vectors for document '{document_id}' from namespace '{actual_namespace}'"
}
except Exception as e:
logger.error(f"[{self.correlation_id}] Error deleting document vectors: {str(e)}")
return {
"success": False,
"document_id": document_id,
"error": f"Error deleting document vectors: {str(e)}"
}
async def list_documents(self):
"""List all documents in the Pinecone index"""
if self.mock_mode:
logger.info(f"[{self.correlation_id}] MOCK: Listing documents in namespace '{self.namespace}'")
return {
"success": True,
"namespace": self.namespace,
"documents": [
{"id": "doc1", "title": "Sample Document 1"},
{"id": "doc2", "title": "Sample Document 2"}
]
}
try:
if not self.pinecone_index:
self.pinecone_index = PineconeConnectionManager.get_index(self.api_key, self.index_name)
# The namespace to use might be in vdb-X format if vector_db_id provided
actual_namespace = f"vdb-{self.vector_db_id}" if self.vector_db_id else self.namespace
# Get index stats
stats = self.pinecone_index.describe_index_stats()
namespaces = stats.get("namespaces", {})
total_vectors = namespaces.get(actual_namespace, {}).get("vector_count", 0)
            # Sample up to 100 vectors and collect unique document IDs from their
            # metadata. Note: because at most 100 chunks are inspected, the listing
            # may be incomplete for large namespaces.
            results = self.pinecone_index.query(
                vector=[0.0] * stats.dimension,  # dummy vector sized to the index; only metadata is used
                top_k=100,
                include_metadata=True,
                namespace=actual_namespace
            )
# Extract unique document IDs from metadata
document_map = {}
matches = results.get("matches", [])
for match in matches:
metadata = match.get("metadata", {})
doc_id = metadata.get("document_id")
if doc_id and doc_id not in document_map:
document_map[doc_id] = {
"id": doc_id,
"title": metadata.get("title", "Unknown"),
"chunks": 1
}
elif doc_id:
document_map[doc_id]["chunks"] += 1
documents = list(document_map.values())
return {
"success": True,
"namespace": actual_namespace,
"index_name": self.index_name,
"total_vectors": total_vectors,
"documents": documents
}
except Exception as e:
logger.error(f"[{self.correlation_id}] Error listing documents: {str(e)}")
return {
"success": False,
"error": f"Error listing documents: {str(e)}"
}
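

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative addition, not part of the original
# module). It assumes PINECONE_API_KEY and GOOGLE_API_KEY are set in the
# environment; the "sample.pdf" path and "Sample" title are hypothetical.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import asyncio

    async def _demo():
        processor = PDFProcessor(
            index_name="testbot768",
            namespace="Default",
            api_key=os.environ.get("PINECONE_API_KEY"),
        )
        # Falls back to mock mode automatically if the Pinecone connection fails
        result = await processor.process_pdf("sample.pdf", metadata={"title": "Sample"})
        print(result)

    asyncio.run(_demo())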