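# Minimal local RAG pipeline: load a PDF with PyPDFLoader, split it into
# overlapping chunks, embed the chunks with Ollama's nomic-embed-text model,
# index them in a persistent Chroma store, and answer questions over the book
# with a RetrievalQA chain backed by a local Ollama LLM.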
import os

from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama

load_dotenv()  # pick up environment overrides (e.g. OLLAMA_HOST) from a .env file

DATA_PATH = "data/"
FILE_NAME = "ABriefHistoryofTime.pdf"
CHROMA_PATH = "chroma_db"

def load_documents():
    """Load the PDF as one Document per page, dropping empty pages."""
    pdf_path = os.path.join(DATA_PATH, FILE_NAME)
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    # Scanned or blank pages come back with empty text; filter them out.
    documents = [doc for doc in documents if doc.page_content.strip() != ""]
    print(f"Loaded {len(documents)} pages from {pdf_path}")
    return documents

def split_documents(documents):
    """Split page documents into overlapping chunks sized for embedding."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,    # characters per chunk (length_function=len)
        chunk_overlap=200,  # overlap preserves context across chunk boundaries
        length_function=len,
        is_separator_regex=False,
    )
    all_splits = text_splitter.split_documents(documents)
    print(f"Split into {len(all_splits)} chunks")
    return all_splits

def get_embedding_function(model_name="nomic-embed-text"):
    """Return an Ollama embedding function (the model must be pulled locally)."""
    embeddings = OllamaEmbeddings(model=model_name)
    print(f"Initialized embeddings with model {model_name}")
    return embeddings
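
# Optional smoke test (assumes an Ollama server is running locally with
# nomic-embed-text already pulled); uncomment to check the embedding dimension:
# print(len(get_embedding_function().embed_query("test")))  # typically 768 for nomic-embed-text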

def get_vector_store(embedding_function, persist_directory=CHROMA_PATH):
    """Open (or create) a persistent Chroma store without re-indexing."""
    vectorstore = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding_function,
    )
    print(f"Vector store initialized/loaded from: {persist_directory}")
    return vectorstore
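
# Not called below, but useful on re-runs: once chroma_db exists, reloading via
# get_vector_store(get_embedding_function()) skips the costly re-embedding that
# index_documents() performs.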

def index_documents(chunks, embedding_function, persist_directory=CHROMA_PATH):
    """Embed the chunks and write them to a persistent Chroma index."""
    print(f"Indexing {len(chunks)} chunks")
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embedding_function,
        persist_directory=persist_directory,
    )
    # Chroma >= 0.4 persists automatically; persist() is kept for older releases.
    vectorstore.persist()
    print(f"Indexing complete. Data saved to: {persist_directory}")
    return vectorstore

# Build the pipeline: load the PDF, sanity-check it, split, embed, and index.
loaded_docs = load_documents()
print(f"Document type: {type(loaded_docs)}")       # should be a list
print(f"Number of docs: {len(loaded_docs)}")       # should be > 0
print(f"First item type: {type(loaded_docs[0])}")  # should be langchain_core.documents.Document
for i, doc in enumerate(loaded_docs[:3]):
    print(f"\nDoc {i} content preview:\n{doc.page_content[:300]}")

chunks = split_documents(loaded_docs)
if chunks:
    print("Sample split:", chunks[0].page_content[:300])

embedding_function = get_embedding_function()
vector_chroma_store = index_documents(chunks, embedding_function=embedding_function)
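
# Retrieval sanity check before wiring in the LLM: fetch the single closest
# chunk for a probe query (the query text here is just an illustration).
probe_hits = vector_chroma_store.similarity_search("What is a black hole?", k=1)
if probe_hits:
    print("Top retrieved chunk preview:", probe_hits[0].page_content[:200])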

def load_llm(model_name="qwen:1.8b"):
    """Load a local Ollama LLM (the model must be pulled locally)."""
    llm = Ollama(model=model_name)
    print(f"Loaded LLM: {model_name}")
    return llm

def create_qa_chain(llm, vector_store):
    """Build a RetrievalQA chain over the vector store."""
    # as_retriever() defaults to top-k similarity search; pass
    # search_kwargs={"k": ...} to tune how much context reaches the LLM.
    retriever = vector_store.as_retriever()
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=True,  # include the retrieved context in results
    )
    print("QA chain initialized")
    return qa_chain

def ask_question(qa_chain, question):
    print(f"\nQuestion: {question}")
    # invoke() is the current chain API; calling the chain directly is deprecated.
    result = qa_chain.invoke({"query": question})
    print(f"\nAnswer:\n{result['result']}")
    return result

llm = load_llm()
qa_chain = create_qa_chain(llm, vector_store=vector_chroma_store)
ask_question(qa_chain, "What is the main idea of the first chapter?")
result = ask_question(qa_chain, "Who is the author of A Brief History of Time?")
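
# Because the chain was built with return_source_documents=True, each result
# also carries the retrieved chunks; preview the context behind the last answer.
for i, src in enumerate(result.get("source_documents", [])):
    print(f"Source {i} (page {src.metadata.get('page', '?')}): {src.page_content[:150]}")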