# -*- coding: utf-8 -*- """app.ipynb Automatically generated by Colaboratory. Original file is located at https://colab.research.google.com/drive/14JJlKx1Oj4px4gdVwHn55FstUl2Dvh9z """ #|export import os from langchain.document_loaders import PyPDFLoader from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.vectorstores import Chroma from langchain.chains import ConversationalRetrievalChain from langchain.embeddings import HuggingFaceEmbeddings from langchain.llms import HuggingFacePipeline from langchain.chains import ConversationChain from langchain.memory import ConversationBufferMemory from langchain.llms import HuggingFaceHub import pandas as pd from pathlib import Path import chromadb import gradio as gr from transformers import AutoTokenizer import transformers import torch import tqdm import accelerate #|export def initialize_database(file_path): # Create list of documents (when valid) collection_name = Path(file_path).stem # Fix potential issues from naming convention ## Remove space collection_name = collection_name.replace(" ","-") ## Limit lenght to 50 characters collection_name = collection_name[:50] ## Enforce start and end as alphanumeric character if not collection_name[0].isalnum(): collection_name[0] = 'A' if not collection_name[-1].isalnum(): collection_name[-1] = 'Z' # print('list_file_path: ', list_file_path) print('Collection name: ', collection_name) # Load document and create splits doc_splits = load_doc(file_path) # Create or load vector database # global vector_db vector_db = create_db(doc_splits, collection_name) return vector_db, collection_name, "Complete!" #|export def load_doc(file_path): loader = PyPDFLoader(file_path) pages = loader.load() text_splitter = RecursiveCharacterTextSplitter(chunk_size = 600, chunk_overlap = 50) doc_splits = text_splitter.split_documents(pages) return doc_splits #|export def create_db(splits, collection_name): embedding = HuggingFaceEmbeddings() new_client = chromadb.EphemeralClient() vectordb = Chroma.from_documents( documents=splits, embedding=embedding, client=new_client, collection_name=collection_name, # persist_directory=default_persist_directory ) return vectordb #|export splt = load_doc('/content/data.pdf') #|export vec = initialize_database('/content/data.pdf') #|export vec_cre = create_db(splt, 'data') vec_cre #|export def initialize_llmchain(temperature, max_tokens, top_k, vector_db): memory = ConversationBufferMemory( memory_key="chat_history", output_key='answer', return_messages=True ) llm = HuggingFaceHub( repo_id='mistralai/Mixtral-8x7B-Instruct-v0.1', model_kwargs={"temperature": temperature, "max_new_tokens": max_tokens, "top_k": top_k, "load_in_8bit": True} ) retriever=vector_db.as_retriever() qa_chain = ConversationalRetrievalChain.from_llm( llm, retriever=retriever, chain_type="stuff", memory=memory, # combine_docs_chain_kwargs={"prompt": your_prompt}) return_source_documents=True, #return_generated_question=False, verbose=False, ) return qa_chain #|export qa = initialize_llmchain(0.7, 1024, 1, vec_cre) #|export def format_chat_history(message, chat_history): formatted_chat_history = [] for user_message, bot_message in chat_history: formatted_chat_history.append(f"User: {user_message}") formatted_chat_history.append(f"Assistant: {bot_message}") return formatted_chat_history #|export def conversation(message, history): formatted_chat_history = format_chat_history(message, history) response = qa({"question": message, "chat_history": formatted_chat_history}) response_answer = response["answer"] if response_answer.find("Helpful Answer:") != -1: response_answer = response_answer.split("Helpful Answer:")[-1] return response_answer #|export gr.ChatInterface(conversation).launch()