import os

import tiktoken
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.memory import ConversationSummaryBufferMemory
from langchain_groq import ChatGroq

# Tokenizer used to measure chunk sizes in tokens rather than characters.
tokenizer = tiktoken.get_encoding("cl100k_base")

# Every file under data/ is indexed into the vector store.
FILE_NAMES = os.listdir("data")
SYSTEM_PROMPT = """
You are the LIC Customer Service Chatbot.
Use the following pieces of context to answer the user's question.
If you don't know the answer, just say that you don't know; don't try to make up an answer.
context: {context}
previous message summary: {previous_message_summary}
"""

human_template = "{question}"
NLP_MODEL_NAME = "llama3-70b-8192"
REASONING_MODEL_NAME = "mixtral-8x7b-32768"
REASONING_MODEL_TEMPERATURE = 0
NLP_MODEL_TEMPERATURE = 0
NLP_MODEL_MAX_TOKENS = 5400

# Chunking parameters, measured in tokens via tiktoken_len below.
VECTOR_MAX_TOKENS = 100
VECTORS_TOKEN_OVERLAP_SIZE = 20
NUMBER_OF_VECTORS_FOR_RAG = 7
def tiktoken_len(text):
    """Return the token count of `text` under the cl100k_base encoding."""
    tokens = tokenizer.encode(text, disallowed_special=())
    return len(tokens)
def get_vectorstore():
    """Embed every document under data/ and load it into a Chroma store."""
    model_name = "BAAI/bge-small-en"
    model_kwargs = {"device": "cpu"}
    encode_kwargs = {"normalize_embeddings": True}
    hf = HuggingFaceBgeEmbeddings(
        model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
    )

    # The splitter is token-aware: chunk size and overlap are counted with
    # tiktoken_len, matching how the embedding model sees the text.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=VECTOR_MAX_TOKENS,
        chunk_overlap=VECTORS_TOKEN_OVERLAP_SIZE,
        length_function=tiktoken_len,
        separators=["\n\n\n", "\n\n", "\n", " ", ""],
    )

    all_splits = []
    for file_name in FILE_NAMES:
        if file_name.endswith(".pdf"):
            # Concatenate every page, not just the first one.
            loader = PyPDFLoader(os.path.join("data", file_name))
            data = "\n".join(page.page_content for page in loader.load())
        else:
            with open(os.path.join("data", file_name), "r") as f:
                data = f.read()
        all_splits += text_splitter.split_text(data)

    return Chroma.from_texts(texts=all_splits, embedding=hf)
# Never hard-code the Groq API key; read it from the environment instead.
chat = ChatGroq(
    temperature=0,
    groq_api_key=os.environ["GROQ_API_KEY"],
    model_name="llama3-8b-8192",
    streaming=True,
)

# Summarising buffer memory keeps a rolling summary of the conversation.
rag_memory = ConversationSummaryBufferMemory(llm=chat, max_token_limit=3000)

my_vector_store = get_vectorstore()
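
# --- Usage sketch (not part of the original file): one plausible way to wire
# the pieces above into a RAG answer. The helper name `answer_question` and
# the use of ChatPromptTemplate are illustrative assumptions, not something
# this script defines elsewhere.
from langchain_core.prompts import ChatPromptTemplate

rag_prompt = ChatPromptTemplate.from_messages(
    [("system", SYSTEM_PROMPT), ("human", human_template)]
)

def answer_question(question: str) -> str:
    # Retrieve the top-k chunks most similar to the question.
    docs = my_vector_store.similarity_search(question, k=NUMBER_OF_VECTORS_FOR_RAG)
    context = "\n\n".join(doc.page_content for doc in docs)
    # Pull the rolling conversation summary kept by the buffer memory.
    previous_summary = rag_memory.load_memory_variables({}).get("history", "")
    messages = rag_prompt.format_messages(
        context=context,
        previous_message_summary=previous_summary,
        question=question,
    )
    response = chat.invoke(messages)
    # Record the exchange so future summaries include it.
    rag_memory.save_context({"input": question}, {"output": response.content})
    return response.content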