import logging
import os
from buster.busterbot import Buster, BusterConfig
from buster.completers import ChatGPTCompleter, DocumentAnswerer
from buster.formatters.documents import DocumentsFormatterJSON
from buster.formatters.prompts import PromptFormatter
from buster.retriever import DeepLakeRetriever, Retriever
from buster.tokenizers import GPTTokenizer
from buster.validators import QuestionAnswerValidator, Validator

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Required: an Activeloop token is needed to fetch the hosted Deep Lake dataset.
ACTIVELOOP_TOKEN = os.getenv("ACTIVELOOP_TOKEN")
if ACTIVELOOP_TOKEN is None:
    logger.warning("No Activeloop token found; you will not be able to fetch data.")
DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "wiki_tai_langchain")
DEEPLAKE_ORG = os.getenv("DEEPLAKE_ORG", "towards_ai")

# To use a local dataset instead, set this environment variable; it overrides the settings above.
DEEPLAKE_DATASET_PATH = os.getenv(
    "DEEPLAKE_DATASET_PATH", f"hub://{DEEPLAKE_ORG}/{DEEPLAKE_DATASET}"
)
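# e.g. a hypothetical local override: export DEEPLAKE_DATASET_PATH=./my_local_deeplake_dataset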
logger.info(f"{DEEPLAKE_DATASET_PATH=}")

example_questions = [
    "What is the LLaMA model?",
    "What is a Large Language Model?",
    "What is an embedding?",
]
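
# The BusterConfig below groups every sub-config consumed by setup_buster():
# question validation, document retrieval, answer completion, tokenization,
# and document/prompt formatting.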
buster_cfg = BusterConfig(
    validator_cfg={
        "unknown_response_templates": [
            "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
        ],
        "unknown_threshold": 0.85,
        "embedding_model": "text-embedding-ada-002",
        "use_reranking": True,
        "invalid_question_response": "This question does not seem relevant to my current knowledge.",
        "check_question_prompt": """You are a chatbot, answering questions about large language models and artificial intelligence.
Users will ask all sorts of questions, and some might be tangentially related.
Users will learn to build LLM-powered apps, with LangChain & Deep Lake among other technologies.
As long as a question is somewhat related to the topic, respond 'true'. If a question is completely unrelated, respond 'false'.
For example:
Q: How can I set up my own chatbot?
true
Q: What is the meaning of life?
false
A user will now submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.""",
        "completion_kwargs": {
            "model": "gpt-3.5-turbo",
            "stream": False,
            "temperature": 0,
        },
    },
    retriever_cfg={
        "path": f"{DEEPLAKE_DATASET_PATH}",
        "top_k": 3,
        "thresh": 0.7,
        "max_tokens": 2000,
        "embedding_model": "text-embedding-ada-002",
        "exec_option": "compute_engine",
        "use_tql": True,
    },
    documents_answerer_cfg={
        "no_documents_message": "No blog posts are available for this question.",
    },
    completion_cfg={
        "completion_kwargs": {
            "model": "gpt-3.5-turbo",
            "stream": True,
            "temperature": 0,
        },
    },
    tokenizer_cfg={
        "model_name": "gpt-3.5-turbo",
    },
    documents_formatter_cfg={
        "max_tokens": 3500,
        "columns": ["content", "source", "title"],
    },
    prompt_formatter_cfg={
        "max_tokens": 3500,
        "text_before_docs": (
            "You are a chatbot assistant answering users' questions about towardsAI content, a blog about applied artificial intelligence (AI). "
            "You are provided information found in the json documentation. "
            "Only respond with information inside the json documentation. DO NOT use additional information, even if you know the answer. "
            "If the answer is in the documentation, summarize it in a helpful way to the user. "
            "If the documentation does not discuss the topic related to the question, kindly respond that you cannot answer the question because it is not part of your knowledge. "
            "Here is the information you can use (json documentation): "
        ),
        "text_after_docs": (
            "REMEMBER:\n"
            "You are a chatbot assistant answering users' questions about towardsAI content, a blog about applied artificial intelligence (AI). "
            "You are provided information found in the json documentation. "
            "Here are the rules you must follow:\n"
            "* Only respond with information inside the json documentation. DO NOT provide additional information, even if you know the answer. "
            "* If the answer is in the documentation, summarize it in a helpful way to the user. "
            "* If the documentation does not discuss the topic related to the question, kindly respond that you cannot answer the question because it is not part of your knowledge. "
            "* Only summarize the information in the json documentation, do not respond otherwise. "
            "* Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
            "* Do not reference any links, urls or hyperlinks in your answers.\n"
            "* Make sure to format your answers in Markdown format, including code block and snippets.\n"
            "* If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n"
            "'I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the topics I'm trained on. Is there anything else I can assist you with?'\n"
            "For example:\n"
            "What is the meaning of life for a qa bot?\n"
            "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the topics I'm trained on. Is there anything else I can assist you with?\n"
            "Now answer the following question:\n"
        ),
    },
)
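

# Wires the retriever, tokenizer, document/prompt formatters, completer and
# validator defined in buster_cfg into a single Buster instance.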
def setup_buster(buster_cfg):
    retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
    tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
    document_answerer: DocumentAnswerer = DocumentAnswerer(
        completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
        documents_formatter=DocumentsFormatterJSON(
            tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg
        ),
        prompt_formatter=PromptFormatter(
            tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg
        ),
        **buster_cfg.documents_answerer_cfg,
    )
    validator: Validator = QuestionAnswerValidator(**buster_cfg.validator_cfg)
    buster: Buster = Buster(
        retriever=retriever, document_answerer=document_answerer, validator=validator
    )
    return buster
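

# A minimal usage sketch. The method and attribute names used here
# (Buster.process_input, Completion.answer_text) are assumptions about the
# buster API and may differ between versions; adjust to the installed release.
if __name__ == "__main__":
    buster = setup_buster(buster_cfg)
    for question in example_questions:
        completion = buster.process_input(question)  # assumed entry point
        logger.info(f"Q: {question}")
        logger.info(f"A: {completion.answer_text}")  # assumed attribute name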