# Import necessary libraries
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_community.llms import HuggingFacePipeline
from langchain.chains.question_answering import load_qa_chain
# Load and process documents
data_dir = "data"  # renamed from `dir`, which shadows the Python builtin

def load_docs(directory):
    # Load every file in the directory as a LangChain Document
    loader = DirectoryLoader(directory)
    docs = loader.load()
    return docs

docs = load_docs(data_dir)

def split_docs(docs, chunk_size=512, chunk_overlap=20):
    # Split documents into overlapping chunks so each fits the embedding model
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(docs)

docs = split_docs(docs)
# Initialize embeddings and vector store
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
persist_directory = "chroma_db"
vectordb = Chroma.from_documents(docs, embeddings, persist_directory=persist_directory)
vectordb.persist()

# Reload the persisted index so queries run against the on-disk collection
new_db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

def get_similar_docs(query, k=2, score=False):
    # Retrieve the k most similar chunks, optionally with similarity scores
    if score:
        similar_docs = new_db.similarity_search_with_score(query, k=k)
    else:
        similar_docs = new_db.similarity_search(query, k=k)
    return similar_docs
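# Example usage of the retriever (hypothetical query, not from the original app):
#   hits = get_similar_docs("What topics do the documents cover?", k=2, score=True)
#   for doc, distance in hits:  # similarity_search_with_score returns (Document, score) pairs
#       print(distance, doc.page_content[:80])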
# Load the LLM from Hugging Face; alternative checkpoints are kept below for reference
# model_name = "HuggingFaceH4/zephyr-7b-beta"
# model = AutoModelForCausalLM.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained("gpt2")
# tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama_v1.1")
model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama_v1.1")
text_generation_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    temperature=0.2,         # low temperature for focused, less random answers
    do_sample=True,
    repetition_penalty=1.1,  # discourage the model from repeating itself
    return_full_text=True,   # keep the prompt so the answer can be located after "Helpful Answer:"
    max_new_tokens=400,
)
# Alternative: text_generation_pipeline = pipeline("text-generation", model="bigscience/bloom-1b7")
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)
chain = load_qa_chain(llm, chain_type="stuff")
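# Note: the default "stuff" chain packs all retrieved chunks into a single prompt
# that ends with the literal cue "Helpful Answer:", roughly:
#   Use the following pieces of context to answer the question at the end. ...
#   {context}
#   Question: {question}
#   Helpful Answer:
# Because the pipeline was built with return_full_text=True, the generation echoes
# this whole prompt back, so the helper below keeps only the text after the cue.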
def get_helpful_answer(text):
    # Find the index of "Helpful Answer:"
    index = text.find("Helpful Answer:")
    # If "Helpful Answer:" is not found, return an empty string
    if index == -1:
        return ""
    # Skip past "Helpful Answer:" so the slice starts at the answer itself
    index += len("Helpful Answer:")
    # Return the remainder of the generation, trimmed of whitespace
    return text[index:].strip()
def get_answer(query):
    # Retrieve relevant chunks, run the QA chain, then strip the echoed prompt
    similar_docs = get_similar_docs(query)
    answer = chain.run(input_documents=similar_docs, question=query)
    return get_helpful_answer(answer)
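# Minimal smoke test, assuming the "data" directory holds at least one document;
# the query below is a placeholder, not part of the original app.
if __name__ == "__main__":
    sample_query = "What are the documents in the data folder about?"
    print(get_answer(sample_query))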