File size: 4,891 Bytes
c4d1b84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c43af2c
c4d1b84
 
 
 
 
 
 
 
06f21ae
c4d1b84
 
 
96c2b60
c4d1b84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eedadc8
 
 
 
 
 
c4d1b84
 
 
 
 
 
 
 
 
 
 
 
 
eedadc8
c4d1b84
 
 
 
 
06f21ae
c4d1b84
c43af2c
 
 
eedadc8
 
 
 
 
 
c4d1b84
 
eedadc8
c4d1b84
 
eedadc8
c4d1b84
 
 
380f174
c4d1b84
 
 
 
 
 
 
c43af2c
661818a
c43af2c
 
661818a
c43af2c
c4d1b84
 
90fd8c5
c4d1b84
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import gradio as gr
import os

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from langchain_community.llms import HuggingFaceEndpoint

from pathlib import Path
import chromadb
from unidecode import unidecode

from transformers import AutoTokenizer
from transformers import pipeline
import transformers
import torch
import tqdm
import accelerate

def load_doc(file_path):
    """Load a PDF and cut its pages into overlapping chunks.

    Args:
        file_path: Path of the PDF passed to ``PyPDFLoader``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` on
        the loaded pages, using ``chunk_size=1024`` and ``chunk_overlap=128``.
    """
    pdf_pages = PyPDFLoader(file_path).load()
    chunker = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=128)
    return chunker.split_documents(pdf_pages)



# Split 'data.pdf' into chunks once at import time; these chunks are later
# passed to create_db to build the vector store used by the QA chain.
splt = load_doc('data.pdf')

def initialize_database(file_path):
    """Build a vector database for one PDF, using a name derived from the file.

    The collection name starts from the file stem and is then adjusted:
    spaces become dashes, the name is cut to 50 characters, names shorter
    than 3 characters are padded, and a non-alphanumeric first or last
    character is replaced.

    Args:
        file_path: Path of the PDF to load, split and index.

    Returns:
        A ``(vector_db, collection_name, status)`` tuple, where ``status`` is
        always the string ``"Complete!"``.
    """
    collection_name = Path(file_path).stem
    # Fix potential issues from the naming convention.
    ## Remove spaces
    collection_name = collection_name.replace(" ", "-")
    ## Limit length to 50 characters
    collection_name = collection_name[:50]
    ## Pad very short (or empty) names; this also guarantees the indexing
    ## below never hits an empty string.
    if len(collection_name) < 3:
        collection_name += "xyz"
    ## Enforce start and end as alphanumeric characters. Strings are
    ## immutable, so the name is rebuilt by slicing instead of item assignment.
    if not collection_name[0].isalnum():
        collection_name = "A" + collection_name[1:]
    if not collection_name[-1].isalnum():
        collection_name = collection_name[:-1] + "Z"
    print('Collection name: ', collection_name)
    # Load the document and create its splits, then index them.
    doc_splits = load_doc(file_path)
    vector_db = create_db(doc_splits, collection_name)
    return vector_db, collection_name, "Complete!"

def create_db(splits, collection_name):
    """Embed document chunks into a new Chroma collection.

    Args:
        splits: Document chunks to index.
        collection_name: Name given to the Chroma collection.

    Returns:
        The vector store returned by ``Chroma.from_documents``, built with a
        default ``HuggingFaceEmbeddings`` instance and a fresh
        ``chromadb.EphemeralClient``.
    """
    return Chroma.from_documents(
        documents=splits,
        embedding=HuggingFaceEmbeddings(),
        client=chromadb.EphemeralClient(),
        collection_name=collection_name,
    )

# Index 'data.pdf' through the full initialization path; the result is a
# (vector_db, collection_name, status) tuple.
# NOTE(review): `vec` is not referenced again in the visible code, and this
# call loads and embeds the same PDF that `vec_cre` indexes below - confirm
# whether it is still needed.
vec = initialize_database('data.pdf')

# Vector store built from the pre-split chunks; this is the one handed to the QA chain.
vec_cre = create_db(splt, 'data')


def initialize_llmchain(temperature, max_tokens, top_k, vector_db):
    """Create the retrieval question-answering chain over a vector store.

    No conversation memory is attached to the chain: it is meant to answer
    from the document only, so callers supply ``chat_history`` themselves on
    each call. To make the chatbot conversational, a
    ``ConversationBufferMemory`` (``memory_key="chat_history"``,
    ``output_key='answer'``, ``return_messages=True``) could be passed as
    ``memory=`` to ``from_llm``.

    Args:
        temperature: Sampling temperature forwarded to the endpoint.
        max_tokens: Forwarded to the endpoint as ``max_new_tokens``.
        top_k: Top-k sampling value forwarded to the endpoint.
        vector_db: Vector store whose ``as_retriever()`` feeds the chain.

    Returns:
        A ``ConversationalRetrievalChain`` that also returns its source
        documents.
    """
    # NOTE(review): `load_in_8bit` is passed straight to the remote endpoint
    # wrapper - confirm the installed langchain version accepts it.
    endpoint_llm = HuggingFaceEndpoint(
        repo_id='mistralai/Mixtral-8x7B-Instruct-v0.1',
        temperature=temperature,
        max_new_tokens=max_tokens,
        top_k=top_k,
        load_in_8bit=True,
    )
    return ConversationalRetrievalChain.from_llm(
        endpoint_llm,
        retriever=vector_db.as_retriever(),
        chain_type="stuff",
        return_source_documents=True,
        verbose=False,
    )

# Question-answering chain over the indexed document
# (temperature 0.6, max 1024 new tokens, top_k 40).
qa = initialize_llmchain(0.6, 1024, 40, vec_cre)

# English-to-French translation pipeline. Using it is not advised because it
# adds latency to every answer.
pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr")


# def format_chat_history(message, chat_history):
#     formatted_chat_history = []
#     for user_message, bot_message in chat_history:
#         formatted_chat_history.append(f"User: {user_message}")
#         formatted_chat_history.append(f"Assistant: {bot_message}")
#     return formatted_chat_history

def conversation(message, history):
    """Answer one chat message from the indexed document.

    Args:
        message: The user's question. The suffix " According to the document"
            is appended before it is sent to the QA chain.
        history: Chat history passed in by the chat interface. It is not
            used: every question is sent with an empty ``chat_history``.

    Returns:
        The answer text. When the raw answer contains the marker
        "Helpful Answer:", only the text after its last occurrence is
        returned.
    """
    # Generate the response using the QA chain.
    response = qa({"question": message + " According to the document", "chat_history": []})
    response_answer = response["answer"]
    answer_marker = "Helpful Answer:"
    if answer_marker in response_answer:
        response_answer = response_answer.split(answer_marker)[-1]

    # The retrieved passages are available in response["source_documents"]
    # (page text in `.page_content`, page index in `.metadata["page"]`) if you
    # want to show where the answer came from. They are deliberately not
    # indexed here: fewer than three hits or missing page metadata would
    # otherwise crash the reply for values that are never returned.

    # For a French reply instead, return:
    # pipe(response_answer)[0]['translation_text'] + " (Traduis d'anglais en français)"
    return response_answer
    




# Expose `conversation` through a Gradio chat interface and start the app.
gr.ChatInterface(conversation).launch()