File size: 5,804 Bytes
51fe9d2
 
0489db2
 
 
 
51fe9d2
 
0489db2
 
 
 
 
7a7c4d5
0489db2
51fe9d2
0489db2
 
 
7a7c4d5
 
 
 
51fe9d2
0489db2
7a7c4d5
51fe9d2
0489db2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51fe9d2
0489db2
 
 
 
51fe9d2
 
 
d5bd88b
 
0489db2
 
7a7c4d5
0489db2
 
 
 
 
 
 
 
7a7c4d5
 
d5bd88b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7a7c4d5
d5bd88b
 
 
 
 
 
0489db2
d5bd88b
0489db2
d5bd88b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51fe9d2
d5bd88b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import streamlit as st
from openai.error import OpenAIError
from .utils import *
from typing import Text, Union

# Whether the uploader accepts several files at once; forwarded to
# st.file_uploader(accept_multiple_files=...) in qa_main. Kept False because
# the downstream parsing/embedding pipeline handles a single document.
multiple_files = False

def clear_submit():
    """Reset the ``file_submitted`` session flag to False.

    Registered as the file uploader's ``on_change`` callback, so picking a
    new file marks the previous submission as stale.
    """
    # Streamlit session state supports attribute-style assignment, which is
    # equivalent to the key-style form.
    st.session_state.file_submitted = False

def set_openai_api_key(api_key: Text) -> bool:
    """Validates the given OpenAI API key and stores it in the session state.

    Args:
        api_key (Text): OpenAI API key entered by the user.

    Returns:
        bool: True if the key looks valid and was stored, False otherwise.
    """
    # OpenAI keys always start with "sk-", but their length varies by key
    # type (newer project keys such as "sk-proj-..." are longer than the
    # legacy 51 characters), so only check the prefix and a sane minimum
    # length instead of an exact length of 51.
    if not (api_key.startswith("sk-") and len(api_key) >= 40):
        st.error("Invalid OpenAI API key! Please provide a valid key.")
        return False

    st.session_state["OPENAI_API_KEY"] = api_key
    st.session_state["api_key_configured"] = True
    return True

def file_to_doc(file: Union[PDFFile, DocxFile, TxtFile, CodeFile]):
    """Converts an uploaded file to a document using specialized parsers.

    Args:
        file: Uploaded file object exposing a ``name`` attribute.

    Returns:
        The parsed document, or None (after showing a Streamlit error) when
        the file type is not supported.
    """
    # NOTE(review): the original text-file branch was
    # ``file.name.split["."][1] in [".txt", ...]`` — a TypeError (subscripting
    # the bound method), and even when called, split(".") strips the dot so
    # the membership test could never match. endswith with a tuple of
    # extensions handles both problems and multi-dot filenames.
    if file.name.endswith(".pdf"):
        doc = parse_pdf(file)
    elif file.name.endswith(".docx"):
        doc = parse_docx(file)
    elif file.name.endswith((".txt", ".py", ".json", ".html", ".css", ".md")):
        doc = parse_txt(file)
    else:
        st.error("File type not yet supported! Supported files: [.pdf, .docx, .txt, .py, .json, .html, .css, .md]")
        doc = None

    return doc

# this function can be used to define a single doc processing pipeline
# def document_embedding_pipeline(file:Union[PDFFile, DocxFile, TxtFile, CodeFile]) -> None:  

def qa_main():
    """Renders the chat-with-your-file page.

    Flow: (1) collect and validate the user's OpenAI API key, (2) let the
    user upload one document, parse it and embed it into a search index,
    (3) run a chat loop whose answers are grounded in the indexed document.
    Persistent values (API key, chat history, submission flag) live in
    st.session_state so they survive Streamlit reruns.
    """
    st.markdown("<h2>This app allows to chat with files!</h2>", unsafe_allow_html=True)
    # BUGFIX: repaired broken grammar in the user-facing intro sentence.
    st.write("Just upload a file and start chatting with a version of GPT4 that has read the file!")

    index = None
    doc = None

    # Each stage is gated on the previous one succeeding.
    upload_document_greenlight = False
    uploaded_processed_document_greenlight = False
    # OpenAI API Key - TODO: consider adding a key valid for everyone
    st.header("Configure OpenAI API Key")
    st.warning('Please enter your OpenAI API Key!', icon='⚠️')
    user_secret = st.text_input(
        "Insert your OpenAI API key here ([get your API key](https://platform.openai.com/account/api-keys)).",
        type="password",
        placeholder="Paste your OpenAI API key here (sk-...)",
        help="You can get your API key from https://platform.openai.com/account/api-keys.",
        value=st.session_state.get("OPENAI_API_KEY", ""),
    )
    if user_secret:
        if set_openai_api_key(user_secret):
            st.success('OpenAI API key successfully provided!', icon='✅')
            upload_document_greenlight = True

    if upload_document_greenlight:
        # File that needs to be queried
        st.header("Upload a file")
        uploaded_file = st.file_uploader(
            "Upload a pdf, docx, or txt file (scanned documents not supported)",
            type=["pdf", "docx", "txt", "py", "json", "html", "css", "md"],
            help="Scanned documents are not supported yet 🥲",
            on_change=clear_submit,
            accept_multiple_files=multiple_files,
        )

        # reading the uploaded file
        if uploaded_file is not None:
            # toggle internal file submission state to True
            st.session_state["file_submitted"] = True
            # parse the file using custom parsers
            doc = file_to_doc(uploaded_file)
            # BUGFIX: file_to_doc returns None for unsupported types; the
            # original code then crashed on tuple(None). Skip processing
            # instead (file_to_doc already showed the error).
            if doc is not None:
                # converts the files into a list of documents
                text = text_to_docs(text=tuple(doc))

                try:
                    with st.spinner("Indexing the document... This might take a while!"):
                        index = embed_docs(tuple(text))
                        st.session_state["api_key_configured"] = True
                except OpenAIError as e:
                    # BUGFIX: st.error's second positional argument is the
                    # icon, so the original st.error("...", e._message) never
                    # displayed the message — format it into the body.
                    st.error(f"OpenAI error encountered: {e._message}")
                else:
                    # BUGFIX: only open the chat once the index was actually
                    # built; previously this flag was set even after an
                    # OpenAIError, so the chat later called
                    # search_docs(None, ...).
                    uploaded_processed_document_greenlight = True

    if uploaded_processed_document_greenlight:
        if "messages" not in st.session_state:
            st.session_state["messages"] = []

        # replay the conversation so far on each rerun
        for message in st.session_state.messages:
            with st.chat_message(message["role"]):
                st.markdown(message["content"])

        if prompt := st.chat_input("Ask the document something..."):
            st.session_state.messages.append({"role": "user", "content": prompt})
            with st.chat_message("user"):
                st.markdown(prompt)

            with st.chat_message("assistant"):
                message_placeholder = st.empty()
                # retrieving the most relevant sources
                sources = search_docs(index, prompt)
                # producing the answer, live (character by character, with a
                # cursor glyph while streaming)
                full_response = ""
                for answer_bit in get_answer(sources, prompt)["output_text"]:
                    full_response += answer_bit
                    message_placeholder.markdown(full_response + "▌")

                message_placeholder.markdown(full_response)

            st.session_state.messages.append({"role": "assistant", "content": full_response})

# This might be useful to add memory to the chatbot using a lower-level approach
# llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")

# memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True, output_key='answer')
# retriever = your_vector_store.as_retriever()

# # Create the multipurpose chain
# qachat = ConversationalRetrievalChain.from_llm(
#     llm=ChatOpenAI(temperature=0),
#     memory=memory,
#     retriever=retriever, 
#     return_source_documents=True
# )

# qachat("Ask your question here...")