learn-ai

Sleeping

App Files Files Community

dh-mc committed on Aug 4, 2023

Commit

4359eb6

1 Parent(s): 2826548

code complete

Browse files

Files changed (4) hide show

app.py +1 -0
app_modules/llm_chat_chain.py +2 -1
app_modules/llm_loader.py +4 -2
server.py +39 -81

app.py CHANGED Viewed

@@ -6,6 +6,7 @@ from timeit import default_timer as timer
 import gradio as gr
 from anyio.from_thread import start_blocking_portal
 from app_modules.init import app_init
 from app_modules.utils import print_llm_response, remove_extra_spaces

 import gradio as gr
 from anyio.from_thread import start_blocking_portal
 from app_modules.init import app_init
 from app_modules.utils import print_llm_response, remove_extra_spaces

app_modules/llm_chat_chain.py CHANGED Viewed

@@ -1,7 +1,8 @@
 from langchain.chains import ConversationalRetrievalChain
 from langchain.chains.base import Chain
 from langchain.memory import ConversationBufferMemory
-from langchain import LLMChain, PromptTemplate
 from app_modules.llm_inference import LLMInference

+from langchain import LLMChain, PromptTemplate
 from langchain.chains import ConversationalRetrievalChain
 from langchain.chains.base import Chain
 from langchain.memory import ConversationBufferMemory
 from app_modules.llm_inference import LLMInference

app_modules/llm_loader.py CHANGED Viewed

@@ -93,7 +93,7 @@ class LLMLoader:
     def __init__(self, llm_model_type, max_tokens_limit: int = 2048):
         self.llm_model_type = llm_model_type
         self.llm = None
-        self.streamer = TextIteratorStreamer("")
         self.max_tokens_limit = max_tokens_limit
         self.search_kwargs = {"k": 4}
@@ -138,7 +138,9 @@ class LLMLoader:
             bnb_8bit_use_double_quant=load_quantized_model == "8bit",
         )
-        callbacks = [self.streamer]
         if custom_handler is not None:
             callbacks.append(custom_handler)

     def __init__(self, llm_model_type, max_tokens_limit: int = 2048):
         self.llm_model_type = llm_model_type
         self.llm = None
+        self.streamer = None
         self.max_tokens_limit = max_tokens_limit
         self.search_kwargs = {"k": 4}
             bnb_8bit_use_double_quant=load_quantized_model == "8bit",
         )
+        callbacks = []
+        if self.streamer is not None:
+            callbacks.append(self.streamer)
         if custom_handler is not None:
             callbacks.append(custom_handler)

server.py CHANGED Viewed

@@ -1,74 +1,21 @@
 """Main entrypoint for the app."""
 import json
 import os
-import time
-from queue import Queue
 from timeit import default_timer as timer
 from typing import List, Optional
-from langchain.embeddings import HuggingFaceInstructEmbeddings
-from langchain.vectorstores.chroma import Chroma
-from langchain.vectorstores.faiss import FAISS
 from lcserve import serving
 from pydantic import BaseModel
-from app_modules.presets import *
-from app_modules.qa_chain import QAChain
-from app_modules.utils import *
-# Constants
-init_settings()
-# https://github.com/huggingface/transformers/issues/17611
-os.environ["CURL_CA_BUNDLE"] = ""
-hf_embeddings_device_type, hf_pipeline_device_type = get_device_types()
-print(f"hf_embeddings_device_type: {hf_embeddings_device_type}")
-print(f"hf_pipeline_device_type: {hf_pipeline_device_type}")
-hf_embeddings_model_name = (
-    os.environ.get("HF_EMBEDDINGS_MODEL_NAME") or "hkunlp/instructor-xl"
-)
-n_threds = int(os.environ.get("NUMBER_OF_CPU_CORES") or "4")
-index_path = os.environ.get("FAISS_INDEX_PATH") or os.environ.get("CHROMADB_INDEX_PATH")
-using_faiss = os.environ.get("FAISS_INDEX_PATH") is not None
-llm_model_type = os.environ.get("LLM_MODEL_TYPE")
 chat_history_enabled = os.environ.get("CHAT_HISTORY_ENABLED") == "true"
-show_param_settings = os.environ.get("SHOW_PARAM_SETTINGS") == "true"
-share_gradio_app = os.environ.get("SHARE_GRADIO_APP") == "true"
-streaming_enabled = True  # llm_model_type in ["openai", "llamacpp"]
-start = timer()
-embeddings = HuggingFaceInstructEmbeddings(
-    model_name=hf_embeddings_model_name,
-    model_kwargs={"device": hf_embeddings_device_type},
-)
-end = timer()
-print(f"Completed in {end - start:.3f}s")
-start = timer()
-print(f"Load index from {index_path} with {'FAISS' if using_faiss else 'Chroma'}")
-if not os.path.isdir(index_path):
-    raise ValueError(f"{index_path} does not exist!")
-elif using_faiss:
-    vectorstore = FAISS.load_local(index_path, embeddings)
-else:
-    vectorstore = Chroma(embedding_function=embeddings, persist_directory=index_path)
-end = timer()
-print(f"Completed in {end - start:.3f}s")
-start = timer()
-qa_chain = QAChain(vectorstore, llm_model_type)
-qa_chain.init(n_threds=n_threds, hf_pipeline_device_type=hf_pipeline_device_type)
-end = timer()
-print(f"Completed in {end - start:.3f}s")
 class ChatResponse(BaseModel):
@@ -80,30 +27,41 @@ class ChatResponse(BaseModel):
 @serving(websocket=True)
-def chat(question: str, history: Optional[List], **kwargs) -> str:
     # Get the `streaming_handler` from `kwargs`. This is used to stream data to the client.
-    streaming_handler = kwargs.get("streaming_handler") if streaming_enabled else None
-    chat_history = []
-    if chat_history_enabled:
-        for element in history:
-            item = (element[0] or "", element[1] or "")
-            chat_history.append(item)
-    start = timer()
-    result = qa_chain.call(
-        {"question": question, "chat_history": chat_history}, streaming_handler
-    )
-    end = timer()
-    print(f"Completed in {end - start:.3f}s")
-    resp = ChatResponse(sourceDocs=result["source_documents"])
-    if not streaming_enabled:
-        resp.token = remove_extra_spaces(result["answer"])
-        print(resp.token)
-    return json.dumps(resp.dict())
 if __name__ == "__main__":
-    print_llm_response(json.loads(chat("What is PCI DSS?", [])))

 """Main entrypoint for the app."""
 import json
 import os
 from timeit import default_timer as timer
 from typing import List, Optional
 from lcserve import serving
 from pydantic import BaseModel
+from app_modules.init import app_init
+from app_modules.llm_chat_chain import ChatChain
+from app_modules.utils import print_llm_response
+llm_loader, qa_chain = app_init()
 chat_history_enabled = os.environ.get("CHAT_HISTORY_ENABLED") == "true"
+uuid_to_chat_chain_mapping = dict()
 class ChatResponse(BaseModel):
 @serving(websocket=True)
+def chat(
+    question: str, history: Optional[List] = [], uuid: Optional[str] = None, **kwargs
+) -> str:
+    print(f"uuid: {uuid}")
     # Get the `streaming_handler` from `kwargs`. This is used to stream data to the client.
+    streaming_handler = kwargs.get("streaming_handler")
+    if uuid is None:
+        chat_history = []
+        if chat_history_enabled:
+            for element in history:
+                item = (element[0] or "", element[1] or "")
+                chat_history.append(item)
+        start = timer()
+        result = qa_chain.call_chain(
+            {"question": question, "chat_history": chat_history}, streaming_handler
+        )
+        end = timer()
+        print(f"Completed in {end - start:.3f}s")
+        resp = ChatResponse(sourceDocs=result["source_documents"])
+        return json.dumps(resp.dict())
+    else:
+        if uuid in uuid_to_chat_chain_mapping:
+            chat = uuid_to_chat_chain_mapping[uuid]
+        else:
+            chat = ChatChain(llm_loader)
+            uuid_to_chat_chain_mapping[uuid] = chat
+        result = chat.call_chain({"question": question}, streaming_handler)
+        print(f"result: {result}")
+        resp = ChatResponse(sourceDocs=[])
+        return json.dumps(resp.dict())
 if __name__ == "__main__":
+    print_llm_response(json.loads(chat("What's deep learning?", [])))