File size: 10,653 Bytes
e9698e9
b4b5bdf
e9698e9
5c3db13
e9698e9
37f8a37
7710388
e9698e9
362f139
b4b5bdf
e9698e9
362f139
 
 
a61504d
 
5c3db13
 
 
362f139
 
a61504d
5c3db13
 
 
 
 
a61504d
 
e9698e9
 
 
6f8159c
 
362f139
e9698e9
51727c4
362f139
 
 
 
 
3b8bb94
362f139
 
b350792
362f139
 
b4b5bdf
a3a378d
01b468b
3fcc7da
01b468b
a3a378d
 
362f139
 
 
 
 
 
 
 
 
e9698e9
 
 
362f139
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e9698e9
362f139
 
 
 
 
 
 
 
 
 
e9698e9
 
37f8a37
71cffeb
362f139
 
 
e0e448c
 
a5371c1
975bc54
e9698e9
 
 
 
 
 
362f139
e9698e9
 
 
 
 
362f139
e9698e9
 
362f139
7710388
e9698e9
 
362f139
e9698e9
0b9f9a6
37f8a37
0b9f9a6
 
51727c4
4c27275
e9698e9
 
 
0b9f9a6
 
e9698e9
0b9f9a6
4c27275
51727c4
0b9f9a6
 
51727c4
 
362f139
 
e9698e9
 
 
 
 
51727c4
 
 
 
 
 
7710388
51727c4
 
 
 
 
 
 
362f139
51727c4
 
 
0b9f9a6
51727c4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import logging
import os

import deeplake
from buster.busterbot import Buster, BusterConfig
from buster.completers import ChatGPTCompleter, DocumentAnswerer
from buster.formatters.documents import DocumentsFormatterJSON
from buster.formatters.prompts import PromptFormatter
from buster.llm_utils import get_openai_embedding_constructor
from buster.retriever import DeepLakeRetriever, Retriever
from buster.tokenizers import GPTTokenizer
from buster.validators import Validator
from dotenv import load_dotenv

from utils import init_mongo_db

# Module-level logger; the root logger is configured once at INFO level.
# (A second, identical logger/basicConfig pair used to follow the MongoDB
# setup; basicConfig does nothing once the root logger has a handler, so the
# duplicate was dropped.)
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Load variables from a local .env file (python-dotenv) before reading them.
load_dotenv()

# MongoDB is optional: when MONGODB_URI is unset or empty, `mongo_db` is None
# and a warning is logged instead of failing at import time.
MONGODB_URI = os.getenv("MONGODB_URI")
if MONGODB_URI:
    mongo_db = init_mongo_db(uri=MONGODB_URI, db_name="towardsai-buster")
else:
    mongo_db = None
    logger.warning("No mongodb uri found, you will not be able to save data.")

# Activeloop token is read from the environment; a missing token is only
# reported, it does not stop module import.
ACTIVELOOP_TOKEN = os.environ.get("ACTIVELOOP_TOKEN")
if ACTIVELOOP_TOKEN is None:
    logger.warning("No activeloop token found.")


# Directory of the Deep Lake dataset, relative to the working directory.
DEEPLAKE_DATASET_PATH = "local_dataset"
if not os.path.exists(DEEPLAKE_DATASET_PATH):
    # Imported lazily: only needed when the dataset is not on disk yet.
    from huggingface_hub import snapshot_download

    # Fetch the dataset repository snapshot into the current directory.
    snapshot_download(
        repo_type="dataset",
        repo_id="towardsai-tutors/buster-ai-tutor-data",
        local_dir=".",
    )
else:
    logger.info(f"{DEEPLAKE_DATASET_PATH=}")

# Sample questions exposed by this module.
example_questions = [
    "What is the LLama model?",
    "What is a Large Language Model?",
    "What is an embedding?",
]

# Keyword arguments forwarded to the client (see the `client_kwargs` entries
# in the configuration below).
client_kwargs = dict(timeout=60, max_retries=0)

# Embedding callable built once and reused wherever the configuration asks
# for an embedding function/model.
embedding_fn = get_openai_embedding_constructor(
    client_kwargs=client_kwargs,
    model="text-embedding-3-small",
)

# Central Buster configuration. Each `*_cfg` dict is unpacked as keyword
# arguments into the matching component by `setup_buster` (validator,
# retriever, document answerer, completer, tokenizer, documents formatter and
# prompt formatter).
buster_cfg = BusterConfig(
    validator_cfg={
        "question_validator_cfg": {
            # NOTE(review): "[email protected]" looks like an e-mail
            # obfuscation placeholder rather than a real address - confirm the
            # intended contact address before shipping this message.
            "invalid_question_response": "This question does not seem relevant my AI knowledge. If the question is related to AI, please send us feedback! \n PS: I'm still learning, so I might not know the answer to your question, you can also try without acronyms in your question. Email us at [email protected] for any issue with the bot!",
            "completion_kwargs": {
                "model": "gpt-4o-mini",
                "stream": False,
                "temperature": 1,
            },
            "client_kwargs": client_kwargs,
            # check_question_prompt is a system prompt
            # NOTE(review): the leading "    # " on every continuation line is
            # inside the triple-quoted string, so it is sent as part of the
            # prompt text - confirm that this is intended.
            "check_question_prompt": """You are a chatbot, answering questions about large language models and artificial intelligence.
    # Your job is to determine whether user's question is valid or not. Users will not always submit a question either.
    # Users will ask all sorts of questions, and some might be tangentially related to artificial intelligence (AI), machine learning (ML) and natural language processing (NLP).
    # Users will learn to build LLM-powered apps, with LangChain, LlamaIndex & Deep Lake among other technologies including OpenAI, RAG and more.
    # As long as a question is somewhat related to the topic of AI, ML, NLP, RAG, data and techniques used in AI like vectors, memories, embeddings, tokenization, encoding, databases, RAG (Retrieval-Augmented Generation), Langchain, LlamaIndex, LLM (Large Language Models), Preprocessing techniques, Document loading, Chunking, Indexing of document segments, Embedding models, Chains, Memory modules, Vector stores, Chat models, Sequential chains, Information Retrieval, Data connectors, LlamaHub, Node objects, Query engines, Fine-tuning, Activeloop’s Deep Memory, Prompt engineering, Synthetic training dataset, Inference, Recall rates, Query construction, Query expansion, Query transformation, Re-ranking, Cohere Reranker, Recursive retrieval, Small-to-big retrieval, Hybrid searches, Hit Rate, Mean Reciprocal Rank (MRR), GPT-4, Agents, OpenGPTs, Zero-shot ReAct, Conversational Agent, OpenAI Assistants API, Hugging Face Inference API, Code Interpreter, Knowledge Retrieval, Function Calling, Whisper, Dall-E 3, GPT-4 Vision, Unstructured, Deep Lake, FaithfulnessEvaluator, RAGAS, LangSmith, LangChain Hub, LangServe, REST API, respond 'true'. If a question is on a different subject or unrelated, respond 'false'.
    # Make sure the question is a valid question.
    # Here is a list of acronyms and concepts related to Artificial Intelligence AI that you can accept from users, they can be uppercase or lowercase:
    # [TQL, Deep Memory, LLM, Llama, llamaindex, llama-index, lang chain, langchain, llama index, GPT, NLP, RLHF, RLAIF, Mistral, SFT, Cohere, NanoGPT, ReAct, LoRA, QLoRA, LMMOps, Alpaca, Flan, Weights and Biases, W&B, IDEFICS, Flamingo, LLaVA, BLIP, Falcon]
    # Here are some examples:
    # Q: How can I setup my own chatbot?
    # true
    # Q: What is the meaning of life?
    # false
    # Q: What is rlhf?
    # true
    # Q:
    # """,
        },
        "answer_validator_cfg": {
            "unknown_response_templates": [
                "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
            ],
            "unknown_threshold": 0.3,  # compare the embedding of the response to the embedding of the prompt-engineered "I don't know" embedding. if above threshold, we assume answer is not relevant
            "embedding_fn": embedding_fn,
        },
        "documents_validator_cfg": {
            "completion_kwargs": {
                "model": "gpt-4o-mini",
                "stream": False,
                "temperature": 1,
            },
            "client_kwargs": client_kwargs,
        },
        "use_reranking": True,
        "validate_documents": False,
    },
    retriever_cfg={
        # The constant is already a str, so no f-string wrapper is needed.
        "path": DEEPLAKE_DATASET_PATH,
        "top_k": 5,
        "thresh": 0.2,
        "max_tokens": 100_000,
        "embedding_model": embedding_fn,
        "exec_option": "compute_engine",
        "use_tql": True,
        "deep_memory": False,
        "activeloop_token": ACTIVELOOP_TOKEN,
    },
    documents_answerer_cfg={
        "no_documents_message": "No blog posts are available for this question.",
    },
    completion_cfg={
        "completion_kwargs": {
            "model": "gpt-4o-mini",
            "stream": True,
            "temperature": 0,
        },
    },
    tokenizer_cfg={
        "model_name": "gpt-4o-mini",
    },
    documents_formatter_cfg={
        "max_tokens": 100_000,
        "columns": ["content", "source", "title"],
    },
    prompt_formatter_cfg={
        "max_tokens": 100_000,
        # Adjacent string literals are concatenated by Python with nothing in
        # between, so every fragment below must end with its own separator
        # (a space or a newline); otherwise sentences fuse together.
        "text_before_docs": (
            "You are a witty AI teacher, helpfully answering questions from students of an applied artificial intelligence course on Large Language Models (LLMs or llm). Topics covered include training models, fine tuning models, giving memory to LLMs, prompting, hallucinations and bias, vector databases, transformer architectures, embeddings, Langchain, making LLMs interact with tool use, AI agents, reinforcement learning with human feedback. Questions should be understood with this context. "
            "You are provided information found in the json documentation. "
            "Only respond with information inside the json documentation. DO NOT use additional information, even if you know the answer. "
            "If the answer is in the documentation, answer the question (depending on the questions and the variety of relevant information in the json documentation, answer in 5 paragraphs. "
            "If the documentation does not discuss the topic related to the question, kindly respond that you cannot answer the question because it is not part of your knowledge. "
            "Here is the information you can use (json documentation) in order: "
        ),
        "text_after_docs": (
            "REMEMBER:\n"
            "You are a witty AI teacher, helpfully answering questions from students of an applied artificial intelligence course on Large Language Models (LLMs or llm). Topics covered include training models, fine tuning models, giving memory to LLMs, prompting, hallucinations and bias, vector databases, transformer architectures, embeddings, Langchain, making LLMs interact with tool use, AI agents, reinforcement learning with human feedback. Questions should be understood with this context. "
            "You are provided information found in the json documentation. "
            "Here are the rules you must follow:\n"
            "* Only respond with information inside the json documentation. DO NOT provide additional information, even if you know the answer. "
            "* If the answer is in the documentation, answer the question (depending on the questions and the variety of relevant information in the json documentation. Your answer needs to be pertinent and not redundant giving a clear explanation as if you were a teacher. "
            "* If the documentation does not discuss the topic related to the question, kindly respond that you cannot answer the question because it is not part of your knowledge. "
            "* Only use information summarized from the json documentation, do not respond otherwise. "
            "* Do not refer to the json documentation directly, but use the instructions provided within it to answer questions. "
            "* Do not reference any links, urls or hyperlinks in your answers.\n"
            "* Make sure to format your answers in Markdown format, including code block and snippets.\n"
            "* If the documents retrieved do not answer the question, simply reply with:\n"
            # Newline keeps the canned reply separate from the final instruction.
            "I'm sorry, but I couldn't find any relevant information in the documents retrieved. If you have any other questions, feel free to ask!\n"
            "Now answer the following question:\n"
        ),
    },
)


def setup_buster(buster_cfg):
    """Assemble a ``Buster`` instance from the sub-configs on *buster_cfg*.

    Components are instantiated in this order: retriever, tokenizer,
    completer, documents formatter, prompt formatter, document answerer,
    validator, and finally the ``Buster`` object itself.

    Args:
        buster_cfg: Configuration object exposing ``retriever_cfg``,
            ``tokenizer_cfg``, ``completion_cfg``, ``documents_formatter_cfg``,
            ``prompt_formatter_cfg``, ``documents_answerer_cfg`` and
            ``validator_cfg``; each is unpacked as keyword arguments.

    Returns:
        The ``Buster`` built from the retriever, document answerer and
        validator.
    """
    cfg = buster_cfg

    doc_retriever = DeepLakeRetriever(**cfg.retriever_cfg)
    gpt_tokenizer = GPTTokenizer(**cfg.tokenizer_cfg)

    # Both formatters share the same tokenizer instance.
    completer = ChatGPTCompleter(**cfg.completion_cfg)
    docs_formatter = DocumentsFormatterJSON(
        tokenizer=gpt_tokenizer, **cfg.documents_formatter_cfg
    )
    prompt_formatter = PromptFormatter(
        tokenizer=gpt_tokenizer, **cfg.prompt_formatter_cfg
    )
    answerer = DocumentAnswerer(
        completer=completer,
        documents_formatter=docs_formatter,
        prompt_formatter=prompt_formatter,
        **cfg.documents_answerer_cfg,
    )

    answer_validator = Validator(**cfg.validator_cfg)

    return Buster(
        retriever=doc_retriever,
        document_answerer=answerer,
        validator=answer_validator,
    )