import os
import shutil
import sys

sys.path.append('../..')

from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import GitLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import DocArrayInMemorySearch


# Load a repository from GitHub using LangChain's GitLoader.
# url: clone URL, branch: branch to check out,
# file_filter: comma-separated file extensions to keep (e.g. ".py,.md").
def loader(url: str, branch: str, file_filter: str):
    repo_path = "./github_repo"
    if os.path.exists(repo_path):
        shutil.rmtree(repo_path)  # Remove any previous clone so GitLoader can clone fresh

    loader = GitLoader(
        clone_url=url,
        repo_path=repo_path,
        branch=branch,
        # The whole repo is cloned, but only files matching the given extensions are loaded
        file_filter=lambda file_path: file_path.endswith(tuple(file_filter.split(','))),
    )
    data = loader.load()
    return data


# Split the loaded documents into overlapping chunks using the recursive character text splitter.
def split_data(data):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=150,
        length_function=len,   # Measure chunk length in characters
        add_start_index=True,  # Record each chunk's starting position in its metadata
    )
    chunks = splitter.split_documents(data)
    return chunks


# Embed the chunks and ingest them into an in-memory vector store of documents.
def ingest_chunks(chunks):
    embedding = OpenAIEmbeddings()
    vector_store = DocArrayInMemorySearch.from_documents(chunks, embedding)

    # The clone is no longer needed once the chunks are embedded
    repo_path = "./github_repo"
    if os.path.exists(repo_path):
        shutil.rmtree(repo_path)

    return vector_store


# Build a conversational retrieval chain that answers questions from the vector store.
def retrieval(vector_store):
    # Select the chat model
    llm_name = "gpt-3.5-turbo"
    llm = ChatOpenAI(model=llm_name, temperature=0.7)

    # Prompt template used by the combine-documents ("stuff") step
    template = """You're a Git code summarisation assistant who searches through "SOURCE DOCUMENTS" and provides helpful summaries with "CODE SNIPPETS".
Given the following extracted parts of a long document and a question, create a final answer with "CODE SNIPPETS" from "SOURCE DOCUMENTS".
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
=========
QUESTION: {question}
=========
CONTEXT: {context}
=========
FINAL ANSWER:"""
    PROMPT = PromptTemplate(input_variables=["context", "question"], template=template)

    # Conversation memory; input_key and output_key must be set explicitly because the
    # chain returns more than one output when return_source_documents=True
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        input_key="question",
        output_key="answer",
        return_messages=True,
    )

    # Retriever over the vector store; this could also be a contextual compression retriever
    retriever = vector_store.as_retriever(
        search_type="similarity",  # search_type can be "similarity" or "mmr"
        search_kwargs={"k": 5},
    )

    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type="stuff",  # chain_type can be "refine", "stuff", or "map_reduce"
        retriever=retriever,
        memory=memory,
        return_source_documents=True,  # Adds "source_documents" to the output alongside "answer"
        combine_docs_chain_kwargs={"prompt": PROMPT},
    )
    return chain


# QA system wiring together all of the components above.
class ConversationalResponse:
    def __init__(self, url, branch, file_filter):
        self.url = url
        self.branch = branch
        self.file_filter = file_filter
        self.data = loader(self.url, self.branch, self.file_filter)
        self.chunks = split_data(self.data)
        self.vector_store = ingest_chunks(self.chunks)
        self.chain = retrieval(self.vector_store)

    def __call__(self, question):
        response = self.chain(question)
        return response['answer']
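

# Example usage: a minimal sketch, assuming OPENAI_API_KEY is set in the
# environment. The repository URL, branch, and extension filter below are
# illustrative placeholders, not values this module prescribes.
if __name__ == "__main__":
    qa = ConversationalResponse(
        url="https://github.com/hwchase17/langchain",  # hypothetical example repo
        branch="master",
        file_filter=".py,.md",
    )
    # Each call retrieves relevant chunks and returns only the "answer" field;
    # follow-up questions reuse the conversation memory inside the chain.
    print(qa("What does the text splitter in this repository do?"))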