File size: 3,228 Bytes
e322dc3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from langchain.document_loaders import ApifyDatasetLoader
from langchain.utilities import ApifyWrapper
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.cohere import CohereEmbeddings
from langchain.vectorstores.deeplake import DeepLake
from langchain_cohere import CohereRerank
from langchain.retrievers import ContextualCompressionRetriever
from langchain.memory import ConversationBufferWindowMemory
import os
from langchain.chains.conversational_retrieval.base import ConversationalRetrievalChain
from langchain_groq import ChatGroq
from dotenv import load_dotenv
load_dotenv()

def get_and_load_data():
    """Crawl a Wikipedia page with an Apify actor, split the text into
    chunks, embed them with Cohere, and store them in a Deep Lake dataset.

    Side effects: runs the ``apify/website-content-crawler`` actor
    (network call) and writes documents to the
    ``hub://gneyapandya1234/educational_chatbot`` Deep Lake dataset.
    Returns None.
    """
    # NOTE(review): ApifyWrapper reads its token from the APIFY_API_TOKEN
    # environment variable; the previous `os.getenv("apify")` value was
    # never used anywhere, so the dead local was dropped — confirm the
    # token env var is actually set before running.
    apify = ApifyWrapper()

    loader = apify.call_actor(
        actor_id="apify/website-content-crawler",
        run_input={"startUrls": [{"url": "https://en.wikipedia.org/wiki/Artificial_intelligence"}]},
        # Map each crawled item to a LangChain Document. .get() guards
        # against items missing the "text"/"url"/"metadata" keys entirely
        # (the old dataset_item["text"] would raise KeyError).
        dataset_mapping_function=lambda dataset_item: Document(
            page_content=dataset_item.get("text") or "No content available",
            metadata={
                "source": dataset_item.get("url", ""),
                "title": dataset_item.get("metadata", {}).get("title", ""),
            },
        ),
    )
    docs = loader.load()

    # Chunk the pages so each piece fits comfortably in the embedding model.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=20, length_function=len
    )
    docs_split = text_splitter.split_documents(docs)

    embeddings = CohereEmbeddings(model="embed-english-v2.0")
    username = "gneyapandya1234"
    db_id = "educational_chatbot"

    dbs = DeepLake(dataset_path=f"hub://{username}/{db_id}", embedding_function=embeddings)
    dbs.add_documents(docs_split)
    
def deeplake():
    """Open the Deep Lake vector store read-only and build a reranking
    retriever on top of it.

    Returns:
        tuple: ``(dbs, compressor_retriever, retriever)`` where
            dbs: the DeepLake vector store handle,
            compressor_retriever: retriever with Cohere reranking applied,
            retriever: the plain similarity retriever (no reranking).
    """
    embeddings = CohereEmbeddings(model="embed-english-v2.0")
    dbs = DeepLake(
        dataset_path="hub://gneyapandya1234/educational_chatbot",
        read_only=True,  # ingestion happens in get_and_load_data()
        embedding_function=embeddings,
    )

    # Fetch a generous candidate pool (20 chunks, cosine distance);
    # the reranker below narrows this down to the best 5.
    retriever = dbs.as_retriever()
    retriever.search_kwargs["distance_metric"] = "cos"
    retriever.search_kwargs["fetch_k"] = 20
    retriever.search_kwargs["k"] = 20

    # Cohere rerank keeps only the 5 most relevant chunks per query.
    compressor = CohereRerank(
        model="rerank-english-v2.0",
        top_n=5,
    )
    compressor_retriever = ContextualCompressionRetriever(
        base_compressor=compressor, base_retriever=retriever
    )
    print("Done")  # fixed typo: message previously read "DOne"
    return dbs, compressor_retriever, retriever

def memory():
    """Build a sliding-window conversation memory (last 3 exchanges).

    Returns:
        ConversationBufferWindowMemory keyed for use with a
        ConversationalRetrievalChain: reads/writes "chat_history",
        stores the chain's "answer" output, and returns message objects.
    """
    return ConversationBufferWindowMemory(
        k=3,
        memory_key="chat_history",
        return_messages=True,
        output_key="answer",
    )
def create_llm():
    """Instantiate the Groq-hosted Llama 3 70B chat model.

    Reads the API key from the GROQ_API_KEY environment variable.
    """
    groq_key = os.getenv("GROQ_API_KEY")
    return ChatGroq(api_key=groq_key, model="llama3-70b-8192")

def chain(llm, compression_retriever, memory):
    """Assemble the conversational retrieval QA chain.

    Args:
        llm: chat model (see create_llm()).
        compression_retriever: retriever supplying context documents.
        memory: conversation memory holding recent chat history.

    Returns:
        A ConversationalRetrievalChain configured to run verbosely and
        to include the source documents in its output.
    """
    qa_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=compression_retriever,
        memory=memory,
        verbose=True,
        return_source_documents=True,
    )
    return qa_chain
def final_function():
    """Wire everything together: LLM + memory + reranking retriever.

    Returns:
        tuple: ``(qa, mem)`` — the conversational QA chain and the
        memory object it was built with.
    """
    language_model = create_llm()
    conversation_memory = memory()
    # deeplake() also returns the raw store and plain retriever; only the
    # reranking retriever is needed here.
    _, reranking_retriever, _ = deeplake()
    qa = chain(language_model, reranking_retriever, conversation_memory)
    return qa, conversation_memory