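"""Build a FAISS vector index from a directory of PDF documents.

The script loads PDFs with PyPDFLoader, splits them into overlapping chunks,
embeds the chunks with a HuggingFace sentence-transformer model, and saves the
resulting FAISS index to disk.
"""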
import os
import logging
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
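# Note: these import paths assume an older LangChain release. On LangChain >= 0.1
# the same classes live in the langchain_community package (e.g.
# langchain_community.vectorstores.FAISS); adjust the imports if needed.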
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def load_documents(docs_dir):
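    """Recursively load all PDF files under docs_dir into LangChain Documents."""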
    documents = []
    for root, dirs, files in os.walk(docs_dir):
        for file in files:
            if file.endswith(".pdf"):
                file_path = os.path.join(root, file)
                logger.info(f"Loading document: {file_path}")
                try:
                    loader = PyPDFLoader(file_path)
                    loaded_docs = loader.load()
                    if loaded_docs:
                        documents.extend(loaded_docs)
                        logger.info(f"Loaded {len(loaded_docs)} pages from {file_path}.")
                    else:
                        logger.warning(f"No content extracted from {file_path}. Possibly encrypted or empty.")
                except Exception as e:
                    logger.error(f"Error loading {file_path}: {e}")
    return documents

def split_text(documents):
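    """Split the loaded documents into overlapping chunks suitable for embedding."""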
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)
    if not texts:
        logger.error("No text chunks were created. Check the text splitting process.")
        return None
    logger.info(f"Created {len(texts)} text chunks.")
    for i, text in enumerate(texts[:5]):  # Sample the first 5 chunks
        # Each chunk is a Document; log the first 100 characters of its text
        logger.debug(f"Sample chunk {i}: {text.page_content[:100]}...")
    return texts

def create_embeddings():
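    """Create the HuggingFace sentence-transformer embedding model and sanity-check it."""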
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    try:
        # Embed a short sample string to confirm the model works
        sample_embedding = embeddings.embed_query("sample text")
        logger.debug(f"Sample embedding: {sample_embedding[:5]}... (truncated for brevity)")
    except Exception as e:
        logger.error(f"Error generating sample embedding: {e}")
        return None
    return embeddings

def create_faiss_index(texts, embeddings):
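    """Embed the text chunks and build a FAISS vector store from them."""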
    try:
        db = FAISS.from_documents(texts, embeddings)
        logger.info(f"Created FAISS index with {len(texts)} vectors")
        # Directly check the size of the underlying FAISS index
        if db.index.ntotal > 0:
            logger.info(f"FAISS index contains {db.index.ntotal} vectors.")
        else:
            logger.error("FAISS index contains 0 vectors after creation. Check the data and embeddings.")
    except Exception as e:
        logger.error(f"Failed to create FAISS index: {e}")
        return None
    return db

def save_faiss_index(db, index_path):
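    """Persist the FAISS vector store to disk at index_path."""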
    try:
        db.save_local(index_path)
        logger.info(f"FAISS index saved to {index_path}")
    except Exception as e:
        logger.error(f"Failed to save FAISS index to {index_path}: {e}")

def main():
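    """Run the full pipeline: load PDFs, chunk, embed, index, and save."""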
    docs_dir = "docs"  # Adjust to your document directory
    index_path = "faiss_index"
    logger.info("Starting document processing...")
    # Load documents
    documents = load_documents(docs_dir)
    if not documents:
        logger.error("No documents were loaded. Exiting.")
        return
    # Split text into chunks
    texts = split_text(documents)
    if texts is None:
        logger.error("Text splitting failed. Exiting.")
        return
    # Create embeddings
    embeddings = create_embeddings()
    if embeddings is None:
        logger.error("Embeddings creation failed. Exiting.")
        return
    # Create FAISS index
    db = create_faiss_index(texts, embeddings)
    if db is None:
        logger.error("FAISS index creation failed. Exiting.")
        return
    # Save FAISS index
    save_faiss_index(db, index_path)

if __name__ == "__main__":
    main()
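
# A minimal sketch of how the saved index could be queried later (assumes the same
# embedding model; newer LangChain releases may also require
# allow_dangerous_deserialization=True when loading a pickle-backed local index):
#
#   embeddings = create_embeddings()
#   db = FAISS.load_local("faiss_index", embeddings)
#   for doc in db.similarity_search("your question here", k=4):
#       print(doc.metadata.get("source"), doc.page_content[:200])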