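# RAG pipeline over "A Brief History of Time" (PDF) using LangChain, Ollama,
# and Chroma: load the PDF, split it into chunks, embed and index the chunks,
# then answer questions with a RetrievalQA chain.
#
# Assumes a local Ollama server with the `nomic-embed-text` and `qwen:1.8b`
# models pulled, and the PDF present at data/ABriefHistoryofTime.pdf.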
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama
load_dotenv()

DATA_PATH = 'data/'
FILE_NAME = 'ABriefHistoryofTime.pdf'
CHROMA_PATH = "chroma_db"

def load_documents():
    pdf_path = os.path.join(DATA_PATH, FILE_NAME)
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    # Drop pages that contain no extractable text.
    documents = [doc for doc in documents if doc.page_content.strip() != ""]
    print(type(documents[0]))  # debug: should be a Document
    print(f"Loaded {len(documents)} pages from PDF {pdf_path}")
    return documents
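
# Note: PyPDFLoader returns one Document per page (with the page number in
# doc.metadata), which is why the count printed above is a page count.
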
def split_documents(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        is_separator_regex=False,
    )
    all_splits = text_splitter.split_documents(documents)
    print(f"Split into {len(all_splits)} chunks")
    return all_splits
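
# chunk_overlap=200 repeats the tail of each chunk at the head of the next, so
# sentences that straddle a chunk boundary still appear intact in one chunk.
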
def get_embedding_function(model_name="nomic-embed-text"):
    embeddings = OllamaEmbeddings(model=model_name)
    print(f"Initialized embeddings with model {model_name}")
    return embeddings
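
# The embedding model must be available locally, e.g. via:
#   ollama pull nomic-embed-text
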
def get_vector_store(embedding_function, persist_directory=CHROMA_PATH):
    vectorstore = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding_function,
    )
    print(f"Vector store initialized/loaded from: {persist_directory}")
    return vectorstore
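
# get_vector_store() is not called in the indexing run below, but on later
# runs it can reopen the persisted index without re-embedding, e.g.:
#   vectorstore = get_vector_store(get_embedding_function())
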
def index_documents(chunks, embedding_function, persist_directory=CHROMA_PATH):
    print(f"Indexing {len(chunks)} chunks")
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embedding_function,
        persist_directory=persist_directory,
    )
    # With Chroma >= 0.4 persistence is automatic; persist() is kept here for
    # older versions and is deprecated on newer ones.
    vectorstore.persist()
    print(f"Indexing complete. Data saved to: {persist_directory}")
    return vectorstore
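
# Caveat: re-running this script adds the same chunks to the existing
# collection again (Chroma does not deduplicate by content); delete the
# chroma_db/ directory or pass stable ids to avoid duplicate entries.
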
loaded_docs = load_documents()
print(f"Document type: {type(loaded_docs)}")       # should be a list
print(f"Number of docs: {len(loaded_docs)}")       # should be > 0
print(f"First item type: {type(loaded_docs[0])}")  # should be a LangChain Document

for i, doc in enumerate(loaded_docs[:3]):
    print(f"\nDoc {i} content preview:\n{doc.page_content[:300]}")

chunks = split_documents(loaded_docs)
if chunks:
    print("Sample split:", chunks[0].page_content[:300])

embedding_function = get_embedding_function()
vector_chroma_store = index_documents(chunks, embedding_function=embedding_function)

def load_llm(model_name="qwen:1.8b"):
    llm = Ollama(model=model_name)
    print(f"Loaded LLM: {model_name}")
    return llm
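
# Any Ollama chat model works here; qwen:1.8b is a small model suited to
# local testing. Pull it first with:
#   ollama pull qwen:1.8b
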
def create_qa_chain(llm, vector_store):
    retriever = vector_store.as_retriever()
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=True,  # optional: include retrieved context in the result
    )
    print("QA Chain initialized")
    return qa_chain
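
# RetrievalQA.from_chain_type defaults to the "stuff" chain type: all
# retrieved chunks are concatenated into a single prompt for the LLM.
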
def ask_question(qa_chain, question):
    print(f"\nQuestion: {question}")
    result = qa_chain.invoke({"query": question})
    print(f"\nAnswer:\n{result['result']}")
    return result
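
# Because return_source_documents=True, the retrieved chunks are available on
# the result, e.g.:
#   for doc in result["source_documents"]:
#       print(doc.metadata.get("page"), doc.page_content[:100])
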
llm = load_llm()
qa_chain = create_qa_chain(llm, vector_store=vector_chroma_store)
ask_question(qa_chain, "What is the main idea of the first chapter?")
ask_question(qa_chain, "Who is the author of A Brief History of Time?")