Dao3 davila7 commited on
Commit
6ce432e
·
0 Parent(s):

Duplicate from davila7/filegpt

Browse files

Co-authored-by: Daniel Avila <[email protected]>

.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README 2.md ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <h1 align="center">
2
+ FileGPT 🤖
3
+ </h1>
4
+
5
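+ Read the article to know how it works: <a href="https://medium.com/@dan.avila7/file-gpt-conversaci%C3%B3n-por-chat-con-un-archivo-698d17570358">Medium Article (Spanish)</a>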
6
+
7
+ With File GPT you will be able to extract all the information from a file.
8
+ You will obtain the transcription and the embedding of each segment, and you can also ask questions about the file through a chat.
9
+
10
+ All code was written with the help of <a href="https://codegpt.co">Code GPT</a>
11
+
12
+ <a href="https://codegpt.co" target="_blank"><img width="753" alt="Captura de Pantalla 2023-02-08 a la(s) 9 16 43 p  m" src="https://user-images.githubusercontent.com/6216945/217699939-eca3ae47-c488-44da-9cf6-c7caef69e1a7.png"></a>
13
+
14
+ <hr>
15
+ <br>
16
+
17
+ # Features
18
+
19
+ - Read any pdf, docx, txt or csv file
20
+ - Embed text segments with LangChain and OpenAI (**text-embedding-ada-002**)
21
+ - Chat with the file using **streamlit-chat** and LangChain QA with sources (**text-davinci-003**)
22
+
23
+ # Running Locally
24
+
25
+ 1. Clone the repository
26
+
27
+ ```bash
28
+ git clone https://github.com/davila7/file-gpt
29
+ cd file-gpt
30
+ ```
31
+ 2. Install dependencies
32
+
33
+ The following dependencies are installed from the requirements.txt file:
34
+
35
+ * openai
36
+ * pypdf
37
+ * scikit-learn
38
+ * numpy
39
+ * tiktoken
40
+ * docx2txt
41
+ * langchain
42
+ * pydantic
43
+ * typing
44
+ * faiss-cpu
45
+ * streamlit_chat
46
+
47
+ ```bash
48
+ pip install -r requirements.txt
49
+ ```
50
+ 3. Run the Streamlit server
51
+
52
+ ```bash
53
+ streamlit run app.py
54
+ ```
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: FileGPT
3
+ emoji: 🐢
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: streamlit
7
+ sdk_version: 1.17.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ duplicated_from: davila7/filegpt
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__init__.py ADDED
File without changes
__pycache__/embeddings.cpython-310.pyc ADDED
Binary file (4.42 kB). View file
 
__pycache__/prompts.cpython-310.pyc ADDED
Binary file (2.19 kB). View file
 
__pycache__/utils.cpython-310.pyc ADDED
Binary file (5.33 kB). View file
 
app.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from streamlit_chat import message
3
+ import os
4
+ from utils import (
5
+ parse_docx,
6
+ parse_pdf,
7
+ parse_txt,
8
+ parse_csv,
9
+ parse_pptx,
10
+ search_docs,
11
+ embed_docs,
12
+ text_to_docs,
13
+ get_answer,
14
+ parse_any,
15
+ get_sources,
16
+ wrap_text_in_html,
17
+ )
18
+ from openai.error import OpenAIError
19
+
20
def clear_submit():
    """Reset the ``submit`` flag in Streamlit session state to ``False``.

    Passed as the ``on_change`` callback of the input widgets in this script.
    """
    st.session_state["submit"] = False
22
+
23
def set_openai_api_key(api_key: str):
    """Store ``api_key`` in Streamlit session state under ``OPENAI_API_KEY``."""
    st.session_state["OPENAI_API_KEY"] = api_key
25
+
26
st.markdown('<h1>File GPT 🤖<small> by <a href="https://codegpt.co">Code GPT</a></small></h1>', unsafe_allow_html=True)

# Sidebar: collect the API key, accept an upload, then parse and index it.
# `index` and `doc` stay None until a file has been parsed and embedded.
index = None
doc = None

# File-name suffix -> dedicated parser. Any other accepted upload is read
# as UTF-8 text through parse_any.
_PARSERS = {
    ".pdf": parse_pdf,
    ".docx": parse_docx,
    ".csv": parse_csv,
    ".txt": parse_txt,
    ".pptx": parse_pptx,
}

with st.sidebar:
    user_secret = st.text_input(
        "OpenAI API Key",
        type="password",
        placeholder="Paste your OpenAI API key here (sk-...)",
        help="You can get your API key from https://platform.openai.com/account/api-keys.",
        value=st.session_state.get("OPENAI_API_KEY", ""),
    )
    if user_secret:
        set_openai_api_key(user_secret)

    uploaded_file = st.file_uploader(
        "Upload a pdf, docx, or txt file",
        type=["pdf", "docx", "txt", "csv", "pptx", "js", "py", "json", "html", "css", "md"],
        help="Scanned documents are not supported yet!",
        on_change=clear_submit,
    )

    if uploaded_file is not None:
        # Compare the suffix case-insensitively so a name such as
        # "REPORT.PDF" is not routed to the plain-text fallback.
        filename = uploaded_file.name.lower()
        parser = next(
            (fn for suffix, fn in _PARSERS.items() if filename.endswith(suffix)),
            parse_any,
        )
        doc = parser(uploaded_file)
        text = text_to_docs(doc)
        st.write(text)
        try:
            with st.spinner("Indexing document... This may take a while⏳"):
                index = embed_docs(text)
                st.session_state["api_key_configured"] = True
        except OpenAIError as e:
            # `index` stays None when embedding fails.
            st.error(e._message)
72
+
73
# Two tabs: a static introduction and the chat interface.
tab1, tab2 = st.tabs(["Intro", "Chat with the File"])

with tab1:
    # --- What the tool does ---
    st.markdown("### How does it work?")
    st.write("File GPT is a tool that allows you to ask questions about a document and get answers from the document. The tool uses the OpenAI API to embed the document and then uses the Embedding API to find the most similar documents to the question. The tool then uses LangChain to obtain the answer from the most similar documents.")
    st.write("The tool is currently in beta and is not perfect. It is recommended to use it with short documents.")
    st.write("""---""")

    # --- Usage instructions ---
    st.markdown("### How to use it?")
    st.write("To use the tool you must first add your OpenAI API Key and then upload a document. The tool currently supports the following file types: pdf, docx, txt, csv, pptx. Once the document is uploaded, the tool will index the document and embed it. This may take a while depending on the size of the document. Once the document is indexed, you can ask questions about the document. The tool will return the answer to the question and the source of the answer.")
    st.markdown('<p>Read the article to know more details: <a target="_blank" href="https://medium.com/@dan.avila7/file-gpt-conversaci%C3%B3n-por-chat-con-un-archivo-698d17570358">Medium Article (Spanish)</a></p>', unsafe_allow_html=True)

    # --- Tooling credits ---
    st.write("## File GPT was written with the following tools:")
    st.markdown("#### Code GPT")
    st.write('All code was written with the help of Code GPT. Visit https://codegpt.co to get the extension.')
    st.markdown("#### Streamlit")
    st.write('The design was written with <a target="_blank" href="https://streamlit.io/">Streamlit</a>.', unsafe_allow_html=True)
    st.markdown("#### LangChain")
    st.write('Question answering with source <a target="_blank" href="https://langchain.readthedocs.io/en/latest/use_cases/question_answering.html#adding-in-sources">Langchain QA</a>.', unsafe_allow_html=True)
    st.markdown("#### Embedding")
    st.write('<a target="_blank" href="https://platform.openai.com/docs/guides/embeddings">Embedding</a> is done via the OpenAI API with "text-embedding-ada-002"', unsafe_allow_html=True)
    st.write("Please note that you must have credits in your OpenAI account to use this tool. Each file uploaded to the platform consumes credits for embedding and each query consumes credits to obtain the response.")
    st.markdown("""---""")

    # --- Author and repository links ---
    st.write('Author: <a target="_blank" href="https://www.linkedin.com/in/daniel-avila-arias/">Daniel Avila</a>', unsafe_allow_html=True)
    st.write('Repo: <a target="_blank" href="https://github.com/davila7/file-gpt">Github</a>', unsafe_allow_html=True)
    st.write("This software was developed with Code GPT, for more information visit: https://codegpt.co", unsafe_allow_html=True)
96
+
97
with tab2:
    st.write('To obtain an API Key you must create an OpenAI account at the following link: https://openai.com/api/')

    # Chat history is kept in session state: `past` holds the questions,
    # `generated` holds the answers at the same positions.
    if 'generated' not in st.session_state:
        st.session_state['generated'] = []

    if 'past' not in st.session_state:
        st.session_state['past'] = []

    def get_text():
        """Render the question box once an API key is set; return its text, else None."""
        if not user_secret:
            return None
        st.header("Ask me something about the document:")
        return st.text_area("You:", on_change=clear_submit)

    user_input = get_text()

    button = st.button("Submit")
    if button or st.session_state.get("submit"):
        if not user_input:
            st.error("Please enter a question!")
        elif index is None:
            # Nothing has been indexed (no upload yet, or embedding failed);
            # searching would fail on a None index.
            st.error("Please upload a document first!")
        else:
            st.session_state["submit"] = True
            try:
                # search_docs embeds the query through the OpenAI API, so it
                # is covered by the same error handling as get_answer.
                sources = search_docs(index, user_input)
                answer = get_answer(sources, user_input)
                st.session_state.past.append(user_input)
                # Keep only the answer text; drop the trailing SOURCES section.
                st.session_state.generated.append(answer["output_text"].split("SOURCES: ")[0])
            except OpenAIError as e:
                st.error(e._message)
            if st.session_state['generated']:
                # Newest exchange first.
                for i in reversed(range(len(st.session_state['generated']))):
                    message(st.session_state["generated"][i], key=str(i))
                    message(st.session_state['past'][i], is_user=True, key=str(i) + '_user')
embeddings.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Wrapper around OpenAI embedding models."""
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ from pydantic import BaseModel, Extra, root_validator
5
+
6
+ from langchain.embeddings.base import Embeddings
7
+ from langchain.utils import get_from_dict_or_env
8
+
9
+ from tenacity import (
10
+ retry,
11
+ retry_if_exception_type,
12
+ stop_after_attempt,
13
+ wait_exponential,
14
+ )
15
+ from openai.error import Timeout, APIError, APIConnectionError, RateLimitError
16
+
17
+
18
class OpenAIEmbeddings(BaseModel, Embeddings):
    """Wrapper around OpenAI embedding models.

    To use, you should have the ``openai`` python package installed, and the
    environment variable ``OPENAI_API_KEY`` set with your API key or pass it
    as a named parameter to the constructor.

    Example:
        .. code-block:: python

            from embeddings import OpenAIEmbeddings
            openai = OpenAIEmbeddings(openai_api_key="my-api-key")
    """

    client: Any  #: :meta private:
    document_model_name: str = "text-embedding-ada-002"
    query_model_name: str = "text-embedding-ada-002"
    openai_api_key: Optional[str] = None

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    # TODO: deprecate this
    @root_validator(pre=True, allow_reuse=True)
    def get_model_names(cls, values: Dict) -> Dict:
        """Expand a legacy ``model_name`` into document/query model names.

        Raises:
            ValueError: if ``model_name`` is given together with
                ``document_model_name`` or ``query_model_name``.
        """
        if "model_name" in values:
            for explicit_key in ("document_model_name", "query_model_name"):
                if explicit_key in values:
                    raise ValueError(
                        f"Both `model_name` and `{explicit_key}` were provided, "
                        "but only one should be."
                    )
            model_name = values.pop("model_name")
            values["document_model_name"] = f"text-search-{model_name}-doc-001"
            values["query_model_name"] = f"text-search-{model_name}-query-001"
        return values

    @root_validator(allow_reuse=True)
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that api key and python package exists in environment.

        Sets ``openai.api_key`` module-wide and stores ``openai.Embedding``
        as ``client``.

        Raises:
            ValueError: if the ``openai`` package cannot be imported.
        """
        openai_api_key = get_from_dict_or_env(
            values, "openai_api_key", "OPENAI_API_KEY"
        )
        # Only the import is guarded, so unrelated failures are not
        # misreported as a missing package.
        try:
            import openai
        except ImportError as err:
            raise ValueError(
                "Could not import openai python package. "
                "Please install it with `pip install openai`."
            ) from err

        openai.api_key = openai_api_key
        values["client"] = openai.Embedding
        return values

    @retry(
        reraise=True,
        stop=stop_after_attempt(100),
        wait=wait_exponential(multiplier=1, min=10, max=60),
        retry=(
            retry_if_exception_type(Timeout)
            | retry_if_exception_type(APIError)
            | retry_if_exception_type(APIConnectionError)
            | retry_if_exception_type(RateLimitError)
        ),
    )
    def _embedding_func(self, text: str, *, engine: str) -> List[float]:
        """Call out to OpenAI's embedding endpoint with exponential backoff."""
        # replace newlines, which can negatively affect performance.
        text = text.replace("\n", " ")
        return self.client.create(input=[text], engine=engine)["data"][0]["embedding"]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Call out to OpenAI's embedding endpoint for embedding search docs.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        # One request per text, each retried independently by _embedding_func.
        return [
            self._embedding_func(text, engine=self.document_model_name)
            for text in texts
        ]

    def embed_query(self, text: str) -> List[float]:
        """Call out to OpenAI's embedding endpoint for embedding query text.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        return self._embedding_func(text, engine=self.query_model_name)
prompts.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.prompts import PromptTemplate
2
+
3
# Few-shot prompt for question answering with sources. It is kept short to
# reduce the number of tokens in the prompt.
template = """Create a final answer to the given questions using the provided document excerpts(in no particular order) as references. ALWAYS include a "SOURCES" section in your answer including only the minimal set of sources needed to answer the question. If you are unable to answer the question, simply state that you do not know. Do not attempt to fabricate an answer and leave the SOURCES section empty.
---------
QUESTION: What is the purpose of ARPA-H?
=========
Content: More support for patients and families. \n\nTo get there, I call on Congress to fund ARPA-H, the Advanced Research Projects Agency for Health. \n\nIt’s based on DARPA—the Defense Department project that led to the Internet, GPS, and so much more. \n\nARPA-H will have a singular purpose—to drive breakthroughs in cancer, Alzheimer’s, diabetes, and more.
Source: 1-32
Content: While we’re at it, let’s make sure every American can get the health care they need. \n\nWe’ve already made historic investments in health care. \n\nWe’ve made it easier for Americans to get the care they need, when they need it. \n\nWe’ve made it easier for Americans to get the treatments they need, when they need them. \n\nWe’ve made it easier for Americans to get the medications they need, when they need them.
Source: 1-33
Content: The V.A. is pioneering new ways of linking toxic exposures to disease, already helping veterans get the care they deserve. \n\nWe need to extend that same care to all Americans. \n\nThat’s why I’m calling on Congress to pass legislation that would establish a national registry of toxic exposures, and provide health care and financial assistance to those affected.
Source: 1-30
=========
FINAL ANSWER: The purpose of ARPA-H is to drive breakthroughs in cancer, Alzheimer’s, diabetes, and more.
SOURCES: 1-32
---------
QUESTION: {question}
=========
{summaries}
=========
FINAL ANSWER:"""

# `summaries` and `question` are the two placeholders filled into the template.
STUFF_PROMPT = PromptTemplate(template=template, input_variables=["summaries", "question"])
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ openai
2
+ pypdf
3
+ scikit-learn
4
+ numpy
5
+ tiktoken
6
+ docx2txt
7
+ langchain
8
+ pydantic
9
+ typing
10
+ faiss-cpu
11
+ streamlit_chat
12
+ python-pptx
utils.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
2
+ from langchain.vectorstores.faiss import FAISS
3
+ from langchain import OpenAI, Cohere
4
+ from langchain.chains.qa_with_sources import load_qa_with_sources_chain
5
+ from embeddings import OpenAIEmbeddings
6
+ from langchain.llms import OpenAI
7
+ from langchain.docstore.document import Document
8
+ from langchain.vectorstores import FAISS, VectorStore
9
+ import docx2txt
10
+ from typing import List, Dict, Any
11
+ import re
12
+ import numpy as np
13
+ from io import StringIO
14
+ from io import BytesIO
15
+ import streamlit as st
16
+ from prompts import STUFF_PROMPT
17
+ from pypdf import PdfReader
18
+ from openai.error import AuthenticationError
19
+ import pptx
20
+
21
@st.experimental_memo()
def parse_docx(file: BytesIO) -> str:
    """Extract the text of a .docx file and collapse runs of blank lines."""
    raw_text = docx2txt.process(file)
    # Two newlines separated only by whitespace become one empty line.
    return re.sub(r"\n\s*\n", "\n\n", raw_text)
27
+
28
+
29
@st.experimental_memo()
def parse_pdf(file: BytesIO) -> List[str]:
    """Extract the text of every PDF page and return one cleaned string per page."""

    def _clean(page_text: str) -> str:
        # Merge hyphenated words
        page_text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", page_text)
        # Fix newlines in the middle of sentences
        page_text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", page_text.strip())
        # Remove multiple newlines
        return re.sub(r"\n\s*\n", "\n\n", page_text)

    reader = PdfReader(file)
    return [_clean(page.extract_text()) for page in reader.pages]
45
+
46
+
47
@st.experimental_memo()
def parse_txt(file: BytesIO) -> str:
    """Read a file as UTF-8 text and collapse runs of blank lines."""
    decoded = file.read().decode("utf-8")
    # Two newlines separated only by whitespace become one empty line.
    return re.sub(r"\n\s*\n", "\n\n", decoded)
53
+
54
@st.experimental_memo()
def parse_pptx(file: BytesIO) -> str:
    """Concatenate the text of every text-bearing shape in a presentation.

    Shapes are visited slide by slide, and each shape's text is followed by
    a newline.
    """
    presentation = pptx.Presentation(file)
    return "".join(
        shape.text_frame.text + '\n'
        for slide in presentation.slides
        for shape in slide.shapes
        if shape.has_text_frame
    )
66
+
67
@st.experimental_memo()
def parse_csv(uploaded_file):
    """Return the uploaded CSV file's contents as one UTF-8 decoded string.

    The CSV is not parsed into rows or columns; the raw text is returned.
    """
    decoded = uploaded_file.getvalue().decode("utf-8")
    return StringIO(decoded).read()
84
+
85
@st.experimental_memo()
def parse_any(uploaded_file):
    """Return the uploaded file's contents decoded as UTF-8 text."""
    decoded = uploaded_file.getvalue().decode("utf-8")
    return StringIO(decoded).read()
90
+
91
@st.cache(allow_output_mutation=True)
def text_to_docs(text: str) -> List[Document]:
    """Convert a string or list of strings into chunked Documents with metadata.

    Args:
        text: A single string (treated as one page) or a list of strings
            (one per page).

    Returns:
        One Document per chunk. Each carries ``page`` (1-based page number),
        ``chunk`` (0-based index within the page) and ``source``
        (``"<page>-<chunk>"``) in its metadata.
    """
    # Take a single string as one page
    pages = [text] if isinstance(text, str) else text

    # The splitter configuration is the same for every page, so build it once.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        chunk_overlap=0,
    )

    doc_chunks = []
    for page_number, page in enumerate(pages, start=1):
        for chunk_number, chunk in enumerate(text_splitter.split_text(page)):
            doc_chunks.append(
                Document(
                    page_content=chunk,
                    metadata={
                        "page": page_number,
                        "chunk": chunk_number,
                        "source": f"{page_number}-{chunk_number}",
                    },
                )
            )
    return doc_chunks
122
+
123
+
124
@st.cache(allow_output_mutation=True, show_spinner=False)
def embed_docs(docs: List[Document]) -> VectorStore:
    """Embed a list of Documents and return a FAISS index built from them.

    Raises:
        AuthenticationError: if no ``OPENAI_API_KEY`` is stored in the
            Streamlit session state.
    """
    api_key = st.session_state.get("OPENAI_API_KEY")
    if not api_key:
        raise AuthenticationError(
            "Enter your OpenAI API key in the sidebar. You can get a key at https://platform.openai.com/account/api-keys."
        )

    # Embed the chunks
    embedder = OpenAIEmbeddings(openai_api_key=api_key)  # type: ignore
    return FAISS.from_documents(docs, embedder)
138
+
139
+
140
@st.cache(allow_output_mutation=True)
def search_docs(index: VectorStore, query: str) -> List[Document]:
    """Return the 5 chunks in ``index`` most similar to ``query``."""
    return index.similarity_search(query, k=5)
148
+
149
+
150
@st.cache(allow_output_mutation=True)
def get_answer(docs: List[Document], query: str) -> Dict[str, Any]:
    """Answer ``query`` from ``docs`` with a "stuff" QA-with-sources chain.

    The chain uses an OpenAI LLM at temperature 0, keyed by the
    ``OPENAI_API_KEY`` in session state, and the ``STUFF_PROMPT`` template.
    Only the chain outputs are returned.
    """
    llm = OpenAI(temperature=0, openai_api_key=st.session_state.get("OPENAI_API_KEY"))  # type: ignore
    qa_chain = load_qa_with_sources_chain(llm, chain_type="stuff", prompt=STUFF_PROMPT)
    chain_inputs = {"input_documents": docs, "question": query}
    return qa_chain(chain_inputs, return_only_outputs=True)
161
+
162
+
163
@st.cache(allow_output_mutation=True)
def get_sources(answer: Dict[str, Any], docs: List[Document]) -> List[Document]:
    """Return the Documents cited in the SOURCES section of an answer.

    Args:
        answer: Chain output whose ``"output_text"`` ends with a
            ``SOURCES:`` section listing comma-separated source keys.
        docs: Candidate Documents, each with a ``"source"`` metadata key.

    Returns:
        The Documents from ``docs`` whose source key is cited, in their
        original order. Empty when the answer has no SOURCES section.
    """
    # Use the last "SOURCES:" marker, in case the answer text mentions it earlier.
    _, marker, sources_section = answer["output_text"].rpartition("SOURCES:")
    if not marker:
        return []

    # Strip each key so "1-2,1-3" and a trailing newline still match.
    source_keys = {key.strip() for key in sources_section.split(",")}
    return [doc for doc in docs if doc.metadata["source"] in source_keys]
176
+
177
+
178
def wrap_text_in_html(text: str) -> str:
    """Wrap each newline-separated line of ``text`` in ``<p>`` tags.

    Args:
        text: A string, or a list of strings (one per page). Pages in a
            list are separated by a horizontal rule.

    Returns:
        An HTML fragment. The input text is HTML-escaped, so markup inside
        a document is shown literally instead of being interpreted.
    """
    import html  # local import keeps this block self-contained

    if isinstance(text, list):
        # Escape each page first so the <hr/> separators stay real markup.
        text = "\n<hr/>\n".join(html.escape(page) for page in text)
    else:
        text = html.escape(text)
    return "".join(f"<p>{line}</p>" for line in text.split("\n"))