Spaces:

MathWizard1729
/

PDF_Chatbot_Gradio_ChromaDB

Sleeping

App Files Files Community

MathWizard1729 committed on May 27

Commit

9ce288c

verified ·

1 Parent(s): db62c2d

Delete indexer.py

Browse files

Files changed (1) hide show

indexer.py +0 -145

indexer.py DELETED Viewed

@@ -1,145 +0,0 @@
-import os
-import logging
-from dotenv import load_dotenv
-from PyPDF2 import PdfReader
-from langchain.docstore.document import Document
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.embeddings import BedrockEmbeddings
-from langchain_chroma import Chroma
-from botocore.exceptions import ClientError
-# Set up logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-def load_environment():
-    """Load environment variables from .env file or system environment."""
-    load_dotenv()
-    required_vars = ['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_REGION']
-    for var in required_vars:
-        if not os.getenv(var):
-            logger.error(f"Missing environment variable: {var}")
-            raise ValueError(f"Missing environment variable: {var}")
-    logger.info("Environment variables loaded successfully")
-def load_uploaded_pdfs(uploaded_files):
-    """Load and extract text from uploaded PDF files."""
-    documents = []
-    pdf_count = 0
-    try:
-        for file_path in uploaded_files:
-            pdf_count += 1
-            file_name = os.path.basename(file_path)
-            logger.info(f"Loading uploaded PDF: {file_name}")
-            # Open the file from the provided path
-            with open(file_path, 'rb') as pdf_file:
-                pdf_reader = PdfReader(pdf_file)
-                # Extract text from each page
-                text = ""
-                for page_num, page in enumerate(pdf_reader.pages):
-                    page_text = page.extract_text() or ""
-                    text += page_text
-                    # Create a LangChain Document for each page
-                    documents.append(Document(
-                        page_content=page_text,
-                        metadata={"source": file_name, "page": page_num + 1}
-                    ))
-                if not text.strip():
-                    logger.warning(f"No text extracted from {file_name}")
-        if not documents:
-            logger.warning("No PDF files provided or no text extracted")
-        else:
-            logger.info(f"Loaded {len(documents)} pages from {pdf_count} PDFs")
-        return documents, pdf_count
-    except Exception as e:
-        logger.error(f"Error loading PDFs: {str(e)}")
-        raise
-def split_documents(documents):
-    """Split documents into chunks for embedding."""
-    try:
-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=1000,
-            chunk_overlap=200,
-            length_function=len
-        )
-        chunks = text_splitter.split_documents(documents)
-        logger.info(f"Split documents into {len(chunks)} chunks")
-        return chunks
-    except Exception as e:
-        logger.error(f"Error splitting documents: {str(e)}")
-        raise
-def initialize_embeddings():
-    """Initialize Amazon Bedrock embeddings."""
-    try:
-        embeddings = BedrockEmbeddings(
-            model_id="amazon.titan-embed-text-v1",
-            region_name=os.getenv("AWS_REGION")
-        )
-        logger.info("Initialized Bedrock embeddings")
-        return embeddings
-    except ClientError as e:
-        logger.error(f"Error initializing Bedrock embeddings: {str(e)}")
-        raise
-def store_in_chroma(chunks, embeddings, db_directory="./chroma_db", collection_name="pdf_rag"):
-    """Store document chunks and embeddings in Chroma vector database."""
-    try:
-        # Clear existing Chroma database if it exists
-        if os.path.exists(db_directory):
-            import shutil
-            shutil.rmtree(db_directory)
-            logger.info(f"Cleared existing Chroma database at {db_directory}")
-        os.makedirs(db_directory, exist_ok=True)
-        vector_store = Chroma.from_documents(
-            documents=chunks,
-            embedding=embeddings,
-            collection_name=collection_name,
-            persist_directory=db_directory
-        )
-        logger.info(f"Stored {len(chunks)} chunks in Chroma vector database at {db_directory}")
-        return vector_store
-    except Exception as e:
-        logger.error(f"Error storing in Chroma: {str(e)}")
-        raise
-def index_uploaded_pdfs(uploaded_files, db_directory="./chroma_db"):
-    """Index uploaded PDF files and return vector store and summary."""
-    try:
-        # Load environment variables
-        load_environment()
-        # Load and process PDFs
-        documents, pdf_count = load_uploaded_pdfs(uploaded_files)
-        if not documents:
-            return None, {"pdf_count": 0, "page_count": 0, "chunk_count": 0, "db_location": db_directory}
-        # Split documents into chunks
-        chunks = split_documents(documents)
-        # Initialize embeddings
-        embeddings = initialize_embeddings()
-        # Store in Chroma
-        vector_store = store_in_chroma(chunks, embeddings, db_directory)
-        # Summary
-        summary = {
-            "pdf_count": pdf_count,
-            "page_count": len(documents),
-            "chunk_count": len(chunks),
-            "db_location": db_directory
-        }
-        logger.info("Indexing Summary:")
-        logger.info(f"  Number of PDFs processed: {summary['pdf_count']}")
-        logger.info(f"  Total pages loaded: {summary['page_count']}")
-        logger.info(f"  Total chunks created: {summary['chunk_count']}")
-        logger.info(f"  Chroma database location: {summary['db_location']}")
-        return vector_store, summary
-    except Exception as e:
-        logger.error(f"Indexing failed: {str(e)}")
-        raise