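# Assumed dependencies (a sketch; exact package names can vary across
# langchain releases):
#   pip install langchain langchain-community faiss-cpu sentence-transformers pdfminer.six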
import os
import logging

from langchain_community.document_loaders import PDFMinerLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def create_faiss_index():
    documents = []
    docs_dir = "docs"  # Directory where PDF files are stored

    # Check if the 'docs' directory exists
    if not os.path.exists(docs_dir):
        logger.error(f"The directory '{docs_dir}' does not exist.")
        return

    # Walk through the 'docs' directory and load PDF files
    for root, dirs, files in os.walk(docs_dir):
        for file in files:
            if file.endswith(".pdf"):
                file_path = os.path.join(root, file)
                logger.info(f"Loading document: {file_path}")
                try:
                    loader = PDFMinerLoader(file_path)
                    loaded_docs = loader.load()
                    if loaded_docs:
                        logger.info(f"Loaded {len(loaded_docs)} documents from {file_path}")
                        documents.extend(loaded_docs)
                    else:
                        logger.warning(f"No documents loaded from {file_path}")
                except Exception as e:
                    logger.error(f"Error loading {file_path}: {e}")

    # Check if any documents were loaded
    if not documents:
        logger.error("No documents were loaded. Check the 'docs' directory and file paths.")
        return
    logger.info(f"Loaded {len(documents)} documents.")

    # Split documents into text chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)
    logger.info(f"Created {len(texts)} text chunks.")

    # Check if text chunks were created
    if not texts:
        logger.error("No text chunks created. Check the text splitting process.")
        return
    try:
        # Initialize embeddings using a HuggingFace sentence-transformers model
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        logger.info("Embeddings initialized successfully.")
    except Exception as e:
        logger.error(f"Failed to initialize embeddings: {e}")
        return

    try:
        # Create a FAISS index and save it. FAISS.from_documents builds and
        # sizes the underlying index from the embedding model itself, so a
        # faiss.IndexFlatL2 does not need to be constructed manually (and
        # HuggingFaceEmbeddings has no `embedding_size` attribute).
        vector_store = FAISS.from_documents(texts, embeddings)
        vector_store.save_local("faiss_index")
        logger.info(f"Created FAISS index with {len(texts)} vectors.")
    except Exception as e:
        logger.error(f"Failed to create FAISS index: {e}")
if __name__ == "__main__":
    create_faiss_index()
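
# Example usage (a sketch, not part of the original script): reload the saved
# index and run a similarity search. Recent langchain_community releases
# require allow_dangerous_deserialization=True when loading a local index,
# because the docstore is unpickled from disk; only enable this for indexes
# you built yourself.
#
#     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
#     store = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
#     for doc in store.similarity_search("example query", k=4):
#         print(doc.page_content[:200])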