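# RAG pipeline over "A Brief History of Time" (PDF) using LangChain, Ollama,
# and Chroma: load the PDF, split it into chunks, embed and index the chunks,
# then answer questions with a RetrievalQA chain.
#
# Assumes a local Ollama server with the `nomic-embed-text` and `qwen:1.8b`
# models pulled, and the PDF present at data/ABriefHistoryofTime.pdf.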
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama
load_dotenv()

DATA_PATH = 'data/'
FILE_NAME = 'ABriefHistoryofTime.pdf'
CHROMA_PATH = "chroma_db"

def load_documents():
    pdf_path = os.path.join(DATA_PATH, FILE_NAME)
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    # Drop pages that contain no extractable text.
    documents = [doc for doc in documents if doc.page_content.strip() != ""]
    print(type(documents[0]))  # debug: should be a Document
    print(f"Loaded {len(documents)} pages from PDF {pdf_path}")
    return documents
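
# Note: PyPDFLoader returns one Document per page (with the page number in
# doc.metadata), which is why the count printed above is a page count.
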
def split_documents(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        is_separator_regex=False,
    )
    all_splits = text_splitter.split_documents(documents)
    print(f"Split into {len(all_splits)} chunks")
    return all_splits
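
# chunk_overlap=200 repeats the tail of each chunk at the head of the next, so
# sentences that straddle a chunk boundary still appear intact in one chunk.
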
def get_embedding_function(model_name="nomic-embed-text"):
    embeddings = OllamaEmbeddings(model=model_name)
    print(f"Initialized embeddings with model {model_name}")
    return embeddings
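
# The embedding model must be available locally, e.g. via:
#   ollama pull nomic-embed-text
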
def get_vector_store(embedding_function, persist_directory=CHROMA_PATH):
    vectorstore = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding_function,
    )
    print(f"Vector store initialized/loaded from: {persist_directory}")
    return vectorstore
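
# get_vector_store() is not called in the indexing run below, but on later
# runs it can reopen the persisted index without re-embedding, e.g.:
#   vectorstore = get_vector_store(get_embedding_function())
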
def index_documents(chunks, embedding_function, persist_directory=CHROMA_PATH):
    print(f"Indexing {len(chunks)} chunks")
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embedding_function,
        persist_directory=persist_directory,
    )
    # With Chroma >= 0.4 persistence is automatic; persist() is kept here for
    # older versions and is deprecated on newer ones.
    vectorstore.persist()
    print(f"Indexing complete. Data saved to: {persist_directory}")
    return vectorstore
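
# Caveat: re-running this script adds the same chunks to the existing
# collection again (Chroma does not deduplicate by content); delete the
# chroma_db/ directory or pass stable ids to avoid duplicate entries.
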
loaded_docs = load_documents()
print(f"Document type: {type(loaded_docs)}")       # should be a list
print(f"Number of docs: {len(loaded_docs)}")       # should be > 0
print(f"First item type: {type(loaded_docs[0])}")  # should be a LangChain Document

for i, doc in enumerate(loaded_docs[:3]):
    print(f"\nDoc {i} content preview:\n{doc.page_content[:300]}")

chunks = split_documents(loaded_docs)
if chunks:
    print("Sample split:", chunks[0].page_content[:300])

embedding_function = get_embedding_function()
vector_chroma_store = index_documents(chunks, embedding_function=embedding_function)

def load_llm(model_name="qwen:1.8b"):
    llm = Ollama(model=model_name)
    print(f"Loaded LLM: {model_name}")
    return llm
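
# Any Ollama chat model works here; qwen:1.8b is a small model suited to
# local testing. Pull it first with:
#   ollama pull qwen:1.8b
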
def create_qa_chain(llm, vector_store):
    retriever = vector_store.as_retriever()
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=True,  # optional: include retrieved context in the result
    )
    print("QA Chain initialized")
    return qa_chain
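
# RetrievalQA.from_chain_type defaults to the "stuff" chain type: all
# retrieved chunks are concatenated into a single prompt for the LLM.
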
def ask_question(qa_chain, question):
    print(f"\nQuestion: {question}")
    result = qa_chain.invoke({"query": question})
    print(f"\nAnswer:\n{result['result']}")
    return result
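
# Because return_source_documents=True, the retrieved chunks are available on
# the result, e.g.:
#   for doc in result["source_documents"]:
#       print(doc.metadata.get("page"), doc.page_content[:100])
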
llm = load_llm()
qa_chain = create_qa_chain(llm, vector_store=vector_chroma_store)
ask_question(qa_chain, "What is the main idea of the first chapter?")
ask_question(qa_chain, "Who is the author of A Brief History of Time?")