import os

from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama

load_dotenv()

DATA_PATH = "data/"
FILE_NAME = "ABriefHistoryofTime.pdf"
CHROMA_PATH = "chroma_db"


def load_documents():
    """Load the PDF and drop any blank pages."""
    pdf_path = os.path.join(DATA_PATH, FILE_NAME)
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    documents = [doc for doc in documents if doc.page_content.strip() != ""]
    print(type(documents[0]))
    print(f"Loaded {len(documents)} pages from pdf {pdf_path}")
    return documents


def split_documents(documents):
    """Split pages into overlapping chunks for retrieval."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        is_separator_regex=False,
    )
    all_splits = text_splitter.split_documents(documents)
    print(f"Split into {len(all_splits)} chunks")
    return all_splits


def get_embedding_function(model_name="nomic-embed-text"):
    """Create the Ollama embedding function used for indexing and querying."""
    embeddings = OllamaEmbeddings(model=model_name)
    print(f"Initialized embeddings with model {model_name}")
    return embeddings


def get_vector_store(embedding_function, persist_directory=CHROMA_PATH):
    """Open (or create) a persistent Chroma store without re-indexing."""
    vectorstore = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding_function,
    )
    print(f"Vector store initialized/loaded from: {persist_directory}")
    return vectorstore


def index_documents(chunks, embedding_function, persist_directory=CHROMA_PATH):
    """Embed the chunks and write them to a persistent Chroma store."""
    print(f"Indexing {len(chunks)} chunks")
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embedding_function,
        persist_directory=persist_directory,
    )
    # chromadb >= 0.4 persists automatically when persist_directory is set,
    # so an explicit vectorstore.persist() call is no longer needed.
    print(f"Indexing complete. Data saved to: {persist_directory}")
    return vectorstore


loaded_docs = load_documents()
print(f"Document type: {type(loaded_docs)}")       # should be a list
print(f"Number of docs: {len(loaded_docs)}")       # should be > 0
print(f"First item type: {type(loaded_docs[0])}")  # should be langchain_core.documents.Document

for i, doc in enumerate(loaded_docs[:3]):
    print(f"\nšŸ“„ Doc {i} content preview:\n{doc.page_content[:300]}")

chunks = split_documents(loaded_docs)
if chunks:
    print("Sample split:", chunks[0].page_content[:300])

embedding_function = get_embedding_function()
vector_chroma_store = index_documents(chunks, embedding_function=embedding_function)


def load_llm(model_name="qwen:1.8b"):
    """Load a local LLM served by Ollama."""
    llm = Ollama(model=model_name)
    print(f"āœ… Loaded LLM: {model_name}")
    return llm


def create_qa_chain(llm, vector_store):
    """Wire the retriever and LLM into a RetrievalQA chain."""
    retriever = vector_store.as_retriever()
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=True,  # optional: also return the retrieved context
    )
    print("āœ… QA Chain initialized")
    return qa_chain


def ask_question(qa_chain, question):
    """Run one query through the chain and print the answer."""
    print(f"\nā“ Question: {question}")
    result = qa_chain.invoke({"query": question})
    print(f"\nšŸ’¬ Answer:\n{result['result']}")
    return result


llm = load_llm()
qa_chain = create_qa_chain(llm, vector_store=vector_chroma_store)

ask_question(qa_chain, "What is the main idea of the first chapter?")
ask_question(qa_chain, "Who is the author of A Brief History of Time?")
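
# --- Optional: querying an existing index on later runs (sketch) ---
# A minimal sketch, not part of the flow above: once chroma_db has been
# populated by a previous run, the index can be reopened with the
# get_vector_store() helper instead of re-loading and re-embedding the PDF.
# The embedding model passed here must match the one used at indexing time,
# or similarity search will return meaningless neighbors.
#
# store = get_vector_store(get_embedding_function())
# qa = create_qa_chain(load_llm(), vector_store=store)
# ask_question(qa, "What is the main idea of the first chapter?")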