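# Minimal local RAG pipeline: load a PDF with PyPDFLoader, split it into
# overlapping chunks, embed the chunks with Ollama's nomic-embed-text model,
# index them in a persistent Chroma store, and answer questions over the book
# with a RetrievalQA chain backed by a local Ollama LLM.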
import os

from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama

load_dotenv()  # pick up environment overrides (e.g. OLLAMA_HOST) from a .env file

DATA_PATH = "data/"
FILE_NAME = "ABriefHistoryofTime.pdf"
CHROMA_PATH = "chroma_db"

def load_documents():
    """Load the PDF as one Document per page, dropping empty pages."""
    pdf_path = os.path.join(DATA_PATH, FILE_NAME)
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    # Scanned or blank pages come back with empty text; filter them out.
    documents = [doc for doc in documents if doc.page_content.strip() != ""]
    print(f"Loaded {len(documents)} pages from {pdf_path}")
    return documents

def split_documents(documents):
    """Split page documents into overlapping chunks sized for embedding."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,    # characters per chunk (length_function=len)
        chunk_overlap=200,  # overlap preserves context across chunk boundaries
        length_function=len,
        is_separator_regex=False,
    )
    all_splits = text_splitter.split_documents(documents)
    print(f"Split into {len(all_splits)} chunks")
    return all_splits

def get_embedding_function(model_name="nomic-embed-text"):
    """Return an Ollama embedding function (the model must be pulled locally)."""
    embeddings = OllamaEmbeddings(model=model_name)
    print(f"Initialized embeddings with model {model_name}")
    return embeddings
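
# Optional smoke test (assumes an Ollama server is running locally with
# nomic-embed-text already pulled); uncomment to check the embedding dimension:
# print(len(get_embedding_function().embed_query("test")))  # typically 768 for nomic-embed-text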

def get_vector_store(embedding_function, persist_directory=CHROMA_PATH):
    """Open (or create) a persistent Chroma store without re-indexing."""
    vectorstore = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding_function,
    )
    print(f"Vector store initialized/loaded from: {persist_directory}")
    return vectorstore
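
# Not called below, but useful on re-runs: once chroma_db exists, reloading via
# get_vector_store(get_embedding_function()) skips the costly re-embedding that
# index_documents() performs.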

def index_documents(chunks, embedding_function, persist_directory=CHROMA_PATH):
    """Embed the chunks and write them to a persistent Chroma index."""
    print(f"Indexing {len(chunks)} chunks")
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embedding_function,
        persist_directory=persist_directory,
    )
    # Chroma >= 0.4 persists automatically; persist() is kept for older releases.
    vectorstore.persist()
    print(f"Indexing complete. Data saved to: {persist_directory}")
    return vectorstore

# Build the pipeline: load the PDF, sanity-check it, split, embed, and index.
loaded_docs = load_documents()
print(f"Document type: {type(loaded_docs)}")       # should be a list
print(f"Number of docs: {len(loaded_docs)}")       # should be > 0
print(f"First item type: {type(loaded_docs[0])}")  # should be langchain_core.documents.Document
for i, doc in enumerate(loaded_docs[:3]):
    print(f"\nDoc {i} content preview:\n{doc.page_content[:300]}")

chunks = split_documents(loaded_docs)
if chunks:
    print("Sample split:", chunks[0].page_content[:300])

embedding_function = get_embedding_function()
vector_chroma_store = index_documents(chunks, embedding_function=embedding_function)
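
# Retrieval sanity check before wiring in the LLM: fetch the single closest
# chunk for a probe query (the query text here is just an illustration).
probe_hits = vector_chroma_store.similarity_search("What is a black hole?", k=1)
if probe_hits:
    print("Top retrieved chunk preview:", probe_hits[0].page_content[:200])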

def load_llm(model_name="qwen:1.8b"):
    """Load a local Ollama LLM (the model must be pulled locally)."""
    llm = Ollama(model=model_name)
    print(f"Loaded LLM: {model_name}")
    return llm

def create_qa_chain(llm, vector_store):
    """Build a RetrievalQA chain over the vector store."""
    # as_retriever() defaults to top-k similarity search; pass
    # search_kwargs={"k": ...} to tune how much context reaches the LLM.
    retriever = vector_store.as_retriever()
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=True,  # include the retrieved context in results
    )
    print("QA chain initialized")
    return qa_chain

def ask_question(qa_chain, question):
    print(f"\nQuestion: {question}")
    # invoke() is the current chain API; calling the chain directly is deprecated.
    result = qa_chain.invoke({"query": question})
    print(f"\nAnswer:\n{result['result']}")
    return result

llm = load_llm()
qa_chain = create_qa_chain(llm, vector_store=vector_chroma_store)
ask_question(qa_chain, "What is the main idea of the first chapter?")
result = ask_question(qa_chain, "Who is the author of A Brief History of Time?")
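
# Because the chain was built with return_source_documents=True, each result
# also carries the retrieved chunks; preview the context behind the last answer.
for i, src in enumerate(result.get("source_documents", [])):
    print(f"Source {i} (page {src.metadata.get('page', '?')}): {src.page_content[:150]}")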