bondares commited on
Commit
3006f55
·
0 Parent(s):
.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ *.index filter=lfs diff=lfs merge=lfs -text
36
+ *.db filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ __pycache__/
2
+ .vscode
3
+ huggingface/
4
+ .env
5
+ .streamlit
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.7.4-stretch

WORKDIR /home/user

RUN apt-get update && apt-get install -y curl git pkg-config cmake

# copy code
COPY setup.py /home/user
COPY utils.py /home/user
COPY app.py /home/user
COPY requirements.txt /home/user
# BUGFIX: utils.setup_pipelines() imports config and loads faiss.json,
# faiss.index and the SQLite store; config.py points at pipeline/*.yml.
# None of these were copied before, so the container could not start.
COPY config.py /home/user
COPY pipeline /home/user/pipeline
COPY faiss.index /home/user
COPY faiss.json /home/user
COPY openai_faiss_document_store.db /home/user

# install as a package
RUN pip install --upgrade pip
RUN pip install -r requirements.txt
# Warm the model/pipeline caches at build time so first request is fast.
RUN python3 -c "from utils import get_pipelines;get_pipelines()"

EXPOSE 8501

# cmd for running the API
CMD ["python", "-m", "streamlit", "run", "app.py"]
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Ai Advisor
3
+ emoji: 📈
4
+ colorFrom: yellow
5
+ colorTo: red
6
+ sdk: streamlit
7
+ sdk_version: 1.17.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ cwd = os.getcwd()
4
+ os.environ['PYTORCH_TRANSFORMERS_CACHE'] = os.path.join(cwd, 'huggingface/transformers/')
5
+ os.environ['TRANSFORMERS_CACHE'] = os.path.join(cwd, 'huggingface/transformers/')
6
+ os.environ['HF_HOME'] = os.path.join(cwd, 'huggingface/')
7
+ # import sys
8
+ import logging
9
+ from json import JSONDecodeError
10
+ from pathlib import Path
11
+
12
+ # import zipfile
13
+ import pandas as pd
14
+ import streamlit as st
15
+ from markdown import markdown
16
+
17
+ from utils import get_backlink, get_pipelines, query, send_feedback, upload_doc
18
+
19
+ # Adjust to a question that you would like users to see in the search bar when they load the UI:
20
+ DEFAULT_QUESTION_AT_STARTUP = os.getenv(
21
+ "DEFAULT_QUESTION_AT_STARTUP", "How to get TPS?")
22
+ DEFAULT_ANSWER_AT_STARTUP = os.getenv(
23
+ "DEFAULT_ANSWER_AT_STARTUP", "You must file a Form I-765")
24
+
25
+ # Sliders
26
+ DEFAULT_DOCS_FROM_RETRIEVER = int(
27
+ os.getenv("DEFAULT_DOCS_FROM_RETRIEVER", "5"))
28
+ DEFAULT_NUMBER_OF_ANSWERS = int(os.getenv("DEFAULT_NUMBER_OF_ANSWERS", "1"))
29
+
30
+
31
+ # Whether the file upload should be enabled or not
32
+ DISABLE_FILE_UPLOAD = bool(os.getenv("DISABLE_FILE_UPLOAD", "True"))
33
+
34
+ LANG_MAP = {"English": "English", "Ukrainian": "Ukrainian", "russian": "russian"}
35
+
36
+
37
+ pipelines = get_pipelines()
38
+
39
+
40
def set_state_if_absent(key, value):
    """Seed ``st.session_state[key]`` with *value* unless it is already set."""
    if key in st.session_state:
        return
    st.session_state[key] = value
44
+
45
def main():
    """Render the Streamlit UI: sidebar options, search bar, query execution and results.

    Runs top-to-bottom on every Streamlit rerun; persistent values live in
    ``st.session_state`` (seeded below via ``set_state_if_absent``).
    """

    st.set_page_config(page_title="AI advisor")

    # Persistent state — survives Streamlit reruns.
    set_state_if_absent("question", DEFAULT_QUESTION_AT_STARTUP)
    set_state_if_absent("answer", DEFAULT_ANSWER_AT_STARTUP)
    set_state_if_absent("results", None)
    set_state_if_absent("raw_json", None)
    set_state_if_absent("random_question_requested", False)

    # Small callback to reset the interface in case the text of the question changes
    def reset_results(*args):
        st.session_state.answer = None
        st.session_state.results = None
        st.session_state.raw_json = None

    # Title
    st.write("# AI Immigration advisor")

    # Sidebar: language selection maps the display label through LANG_MAP.
    st.sidebar.header("Options")
    language = LANG_MAP[st.sidebar.selectbox(
        "Select language: ", ("English", "Ukrainian", "russian"))]
    # Debug mode is hard-disabled here; flip to the checkbox below to expose
    # the top-k sliders and raw JSON panes.
    debug = False
    # debug = st.sidebar.checkbox("Show debug info")
    if debug:
        top_k_reader = st.sidebar.slider(
            "Max. number of answers",
            min_value=1,
            max_value=100,
            value=DEFAULT_NUMBER_OF_ANSWERS,
            step=1,
            on_change=reset_results,
        )

        top_k_retriever = st.sidebar.slider(
            "Max. number of documents from retriever",
            min_value=1,
            max_value=100,
            value=DEFAULT_DOCS_FROM_RETRIEVER,
            step=1,
            on_change=reset_results,
        )
    else:
        # Production path: fixed, env-driven defaults.
        top_k_reader = DEFAULT_NUMBER_OF_ANSWERS
        top_k_retriever = DEFAULT_DOCS_FROM_RETRIEVER
    # File upload block (disabled unless DISABLE_FILE_UPLOAD is falsy).
    if not DISABLE_FILE_UPLOAD:
        st.sidebar.write("## File Upload:")
        data_files = st.sidebar.file_uploader(
            "", type=["pdf", "txt", "docx"], accept_multiple_files=True)
        for data_file in data_files:
            # Upload file via the REST API and show a checkmark per file.
            if data_file:
                raw_json = upload_doc(data_file)
                st.sidebar.write(str(data_file.name) + "    &nbsp;&nbsp; ✅ ")
                if debug:
                    st.subheader("REST API JSON response")
                    st.sidebar.write(raw_json)

    # Search bar — editing the text also triggers reset_results.
    question = st.text_input(
        "", value=st.session_state.question, max_chars=100, on_change=reset_results)
    col1, col2 = st.columns(2)
    col1.markdown(
        "<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)
    col2.markdown(
        "<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)

    # Run button
    run_pressed = col1.button("Run")

    # Fire a query when the button is pressed OR the question text changed.
    run_query = (
        run_pressed or question != st.session_state.question
    ) and not st.session_state.random_question_requested

    # Get results for query
    if run_query and question:
        reset_results()
        st.session_state.question = question

        with st.spinner("🧠 &nbsp;&nbsp; Performing neural search on documents... \n "):
            try:
                st.session_state.results, st.session_state.raw_json = query(
                    pipelines, question, top_k_reader=top_k_reader, top_k_retriever=top_k_retriever, language=language
                )
            except JSONDecodeError as je:
                st.error(
                    "👓 &nbsp;&nbsp; An error occurred reading the results. Is the document store working?")
                return
            except Exception as e:
                logging.exception(e)
                # Crude overload detection based on the error text / HTTP 503.
                if "The server is busy processing requests" in str(e) or "503" in str(e):
                    st.error(
                        "🧑‍🌾 &nbsp;&nbsp; All our workers are busy! Try again later.")
                else:
                    st.error(
                        "🐞 &nbsp;&nbsp; An error occurred during the request.")
                return

    if st.session_state.results:

        st.write("## Results:")

        for count, result in enumerate(st.session_state.results):
            if result["answer"]:
                answer, context = result["answer"], result["context"]
                start_idx = context.find(answer)
                end_idx = start_idx + len(answer)
                # Hack due to this bug: https://github.com/streamlit/streamlit/issues/3190
                st.write(
                    markdown(f"**Answer:** {answer}"), unsafe_allow_html=True)
                # Prefer a markdown backlink when the document meta has url+title;
                # otherwise fall back to the plain "source" string from utils.query.
                source = ""
                url, title = get_backlink(result)
                if url and title:
                    source = f"[{result['document']['meta']['title']}]({result['document']['meta']['url']})"
                else:
                    source = f"{result['source']}"
                st.markdown(f"**Source:** {source}")

            else:
                st.info(
                    "🤔 &nbsp;&nbsp; Unsure whether any of the documents contain an answer to your question. Try to reformulate it!"
                )

            st.write("___")

    if debug:
        st.subheader("REST API JSON response")
        st.write(st.session_state.raw_json)


# Streamlit executes the script on every rerun; no __main__ guard is used here.
main()
config.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Environment-overridable configuration for the pipelines and REST helpers."""
import os
from pathlib import Path


# Declarative Haystack pipeline definition shipped with the app.
PIPELINE_YAML_PATH = os.getenv(
    "PIPELINE_YAML_PATH", str((Path(__file__).parent / "pipeline" / "pipelines.haystack-pipeline.yml").absolute())
)
# Pre-built FAISS vector index stored next to this module (tracked via git-lfs).
FAISS_INDEX_PATH = os.getenv(
    "FAISS_INDEX_PATH", str((Path(__file__).parent / "faiss.index").absolute())
)
# Pipeline names as declared inside the YAML file above.
QUERY_PIPELINE_NAME = os.getenv("QUERY_PIPELINE_NAME", "query")
INDEXING_PIPELINE_NAME = os.getenv("INDEXING_PIPELINE_NAME", "indexing")

# Directory where files posted to the upload endpoint are stored
# (created on demand by utils.setup_pipelines()).
FILE_UPLOAD_PATH = os.getenv("FILE_UPLOAD_PATH", str((Path(__file__).parent / "file-upload").absolute()))

LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
ROOT_PATH = os.getenv("ROOT_PATH", "/")

# Throttle for concurrent requests handled per worker process.
CONCURRENT_REQUEST_PER_WORKER = int(os.getenv("CONCURRENT_REQUEST_PER_WORKER", "4"))
faiss.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa1c641d84f7bf7e58968610c31e913a3d2c44e341fde639c0c8a0ce48b16d19
3
+ size 24035373
faiss.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"sql_url": "sqlite:///openai_faiss_document_store.db", "embedding_dim": 1024, "faiss_index_factory_str": "Flat", "similarity": "cosine"}
openai_faiss_document_store.db ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:532271b901d948f42992d4ccc4e9e3b5b9b7ebb9ea1547ce6d7adf09c3b81bda
3
+ size 17989632
pipeline/__init__.py ADDED
File without changes
pipeline/pipelines.haystack-pipeline.yml ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # To allow your IDE to autocomplete and validate your YAML pipelines, name them as <name of your choice>.haystack-pipeline.yml
2
+
3
+ version: 1.7.0
4
+
5
+ components: # define all the building-blocks for Pipeline
6
+ - name: DocumentStore
7
+ type: FAISSDocumentStore # consider using MilvusDocumentStore or WeaviateDocumentStore for scaling to large number of documents
8
+ params:
9
+ faiss_index_path: faiss.index
10
+ # faiss_config_path: rest_api/faiss.json
11
+ # sql_url: sqlite:///rest_api/faiss_document_store.db
12
+ - name: Retriever
13
+ type: DensePassageRetriever
14
+ params:
15
+ document_store: DocumentStore # params can reference other components defined in the YAML
16
+ passage_embedding_model: vblagoje/dpr-ctx_encoder-single-lfqa-wiki
17
+ query_embedding_model: vblagoje/dpr-question_encoder-single-lfqa-wiki
18
+ - name: Generator # custom-name for the component; helpful for visualization & debugging
19
+ type: Seq2SeqGenerator # Haystack Class name for the component
20
+ params:
21
+ model_name_or_path: vblagoje/bart_lfqa
22
+ max_length: 300
23
+ min_length: 10
24
+ # - name: TextFileConverter
25
+ # type: TextConverter
26
+ # - name: PDFFileConverter
27
+ # type: PDFToTextConverter
28
+ # - name: Preprocessor
29
+ # type: PreProcessor
30
+ # params:
31
+ # split_by: word
32
+ # split_length: 300
33
+ # - name: FileTypeClassifier
34
+ # type: FileTypeClassifier
35
+
36
+ pipelines:
37
+ - name: query # generative-qa Pipeline
38
+ nodes:
39
+ - name: Retriever
40
+ inputs: [Query]
41
+ - name: Generator
42
+ inputs: [Retriever]
43
+ # - name: indexing
44
+ # nodes:
45
+ # - name: FileTypeClassifier
46
+ # inputs: [File]
47
+ # - name: TextFileConverter
48
+ # inputs: [FileTypeClassifier.output_1]
49
+ # - name: PDFFileConverter
50
+ # inputs: [FileTypeClassifier.output_2]
51
+ # - name: Preprocessor
52
+ # inputs: [PDFFileConverter, TextFileConverter]
53
+ # - name: Retriever
54
+ # inputs: [Preprocessor]
55
+ # - name: DocumentStore
56
+ # inputs: [Retriever]
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ farm-haystack==1.13.2
2
+ streamlit>=1.2.0, <2
3
+ st-annotated-text>=2.0.0, <3
4
+ markdown>=3.3.4, <4
5
+ faiss-cpu==1.7.2
6
+ openai==0.27.2
setup.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Packaging script for the Haystack demo UI."""
import logging
from pathlib import Path

from setuptools import setup, find_packages


VERSION = "0.0.0"
try:
    # After git clone, VERSION.txt is in the root folder.
    # BUGFIX: Path.read_text() closes the file; the previous open(...).read()
    # leaked the handle. strip() drops the trailing newline from the file.
    VERSION = (Path(__file__).parent.parent / "VERSION.txt").read_text().strip()
except Exception:
    try:
        # In Docker, VERSION.txt is in the same folder
        VERSION = (Path(__file__).parent / "VERSION.txt").read_text().strip()
    except Exception:
        # Fall back to the 0.0.0 placeholder but leave a trace in the logs.
        logging.exception("No VERSION.txt found!")

setup(
    name="farm-haystack-ui",
    version=VERSION,
    description="Demo UI for Haystack (https://github.com/deepset-ai/haystack)",
    author="deepset.ai",
    author_email="[email protected]",
    url=" https://github.com/deepset-ai/haystack/tree/master/ui",
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Intended Audience :: Science/Research",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Operating System :: OS Independent",
        "Programming Language :: Python",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
    ],
    packages=find_packages(),
    python_requires=">=3.7, <4",
    # Keep the farm-haystack pin in sync with requirements.txt (1.13.2);
    # the previous 1.7.0 pin contradicted the runtime requirements.
    install_requires=["streamlit>=1.2.0, <2", "st-annotated-text>=2.0.0, <3", "markdown>=3.3.4, <4", "farm-haystack==1.13.2"],
)
test/__init__.py ADDED
File without changes
test/test_ui_utils.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Smoke tests for the readiness helper, with requests.get mocked out."""
from unittest.mock import patch

# NOTE(review): this imports `ui.utils.haystack_is_ready`, but the utils.py in
# this repository neither lives in a `ui` package nor defines
# `haystack_is_ready` — these tests look copied from the upstream Haystack UI
# and will fail here. Confirm the intended module path / helper.
from ui.utils import haystack_is_ready


def test_haystack_is_ready():
    # HTTP 200 from the status endpoint => ready.
    with patch("requests.get") as mocked_get:
        mocked_get.return_value.status_code = 200
        assert haystack_is_ready()


def test_haystack_is_ready_fail():
    # Any 4xx response => not ready.
    with patch("requests.get") as mocked_get:
        mocked_get.return_value.status_code = 400
        assert not haystack_is_ready()
utils.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Pipeline construction, querying and REST helpers for the Streamlit UI."""
from typing import List, Dict, Any, Tuple, Optional
import yaml
import os

# Hugging Face cache locations must be writable (Spaces sandbox) and must be
# exported BEFORE haystack/transformers are imported further down.
cwd = os.getcwd()
os.environ["PYTORCH_TRANSFORMERS_CACHE"] = os.path.join(
    cwd, "huggingface/transformers/"
)
os.environ["TRANSFORMERS_CACHE"] = os.path.join(cwd, "huggingface/transformers/")
os.environ["HF_HOME"] = os.path.join(cwd, "huggingface/")
import logging
import json
from time import sleep, time

import requests
import streamlit as st

from pathlib import Path

from haystack.pipelines.base import Pipeline
from haystack.nodes import EmbeddingRetriever, Shaper
from haystack.document_stores import FAISSDocumentStore, InMemoryDocumentStore
from haystack.errors import PipelineConfigError
import openai

# OpenAI API key is read from Streamlit secrets once, at import time.
openai.api_key = st.secrets["gpt35_api_key"]

logger = logging.getLogger(__name__)
# Lazily-built pipeline registry; populated on first get_pipelines() call.
pipelines = None

# Chat prompt templates: {0} in the system prompt is the answer language;
# {0}/{1} in the user prompt are the retrieved paragraphs and the question.
system_prompt_template = """You are an US experienced immigration attorney. Your answer should be in your own words, be detailed and be no longer than 350 words.
You should ask for more information or clarifications to give a more precise answer for an each client's case.
Synthesize a comprehensive answer from your knowledge and the following topk most relevant paragraphs and the given question.
Give an answer in the {0} language.
"""
users_prompt_template = """
Paragraphs: {0}

Question: {1}
"""
# Since each instance of FAISSDocumentStore creates an in-memory FAISS index, the Indexing & Query Pipelines would
# end up with different indices. The same applies for InMemoryDocumentStore.
UNSUPPORTED_DOC_STORES = (FAISSDocumentStore, InMemoryDocumentStore)

# REST endpoint pieces used by send_feedback() / upload_doc().
API_ENDPOINT = os.getenv("API_ENDPOINT", "http://localhost:8000")
STATUS = "initialized"
HS_VERSION = "hs_version"
DOC_REQUEST = "query"
DOC_FEEDBACK = "feedback"
DOC_UPLOAD = "file-upload"
+
55
+
56
+ def query(
57
+ pipelines, query, filters={}, language="en", top_k_reader=3, top_k_retriever=5
58
+ ) -> Tuple[List[Dict[str, Any]], Dict[str, str]]:
59
+ """
60
+ Send a query to the REST API and parse the answer.
61
+ Returns both a ready-to-use representation of the results and the raw JSON.
62
+ """
63
+ query_pipeline = pipelines.get("query_pipeline", None)
64
+ start_time = time()
65
+
66
+ params = {
67
+ "retriever": {"top_k": top_k_retriever},
68
+ }
69
+
70
+ lang = language.lower() or "english"
71
+
72
+ response = query_pipeline.run(
73
+ query=query,
74
+ params=params,
75
+ )
76
+ context = ""
77
+ sources = []
78
+ for doc in response["documents"]:
79
+ doc = doc.to_dict()
80
+ doc_name = doc["meta"].get("name")
81
+ doc_url = doc["meta"].get("url")
82
+ source = (
83
+ "https://www.uscis.gov/sites/default/files/document/forms/" + doc_name
84
+ if doc_name
85
+ else doc_url
86
+ )
87
+ if not source.endswith('.txt'):
88
+ sources.append(source)
89
+ if len(context)<top_k_reader:
90
+ context += " " + doc.get("content")
91
+ # Ensure answers and documents exist, even if they're empty lists
92
+ if not "documents" in response:
93
+ response["documents"] = []
94
+
95
+ # prepare openAI api call
96
+ messages = []
97
+ system_prompt = system_prompt_template.format(lang)
98
+ user_prompt = users_prompt_template.format(context, response["query"])
99
+ messages.append({"role": "system", "content": system_prompt})
100
+ messages.append({"role": "user", "content": user_prompt})
101
+
102
+ openai_response = openai.ChatCompletion.create(
103
+ model="gpt-3.5-turbo", messages=messages
104
+ )
105
+ bot_response = openai_response["choices"][0]["message"]["content"]
106
+ response["answers"] = [bot_response]
107
+ logger.info(
108
+ json.dumps(
109
+ {
110
+ "request": query,
111
+ "response": response,
112
+ "time": f"{(time() - start_time):.2f}",
113
+ },
114
+ default=str,
115
+ )
116
+ )
117
+
118
+ # Format response
119
+ results = []
120
+ answers = response["answers"]
121
+ documents = response["documents"]
122
+ for answer, doc in zip(answers, documents):
123
+ doc = doc.to_dict()
124
+ if answer:
125
+ context = doc.get("content")
126
+ results.append(
127
+ {
128
+ "context": "..." + context if context else "",
129
+ "answer": answer,
130
+ "source": "\n".join(sources),
131
+ "_raw": answer,
132
+ }
133
+ )
134
+ else:
135
+ results.append({"context": None, "answer": None, "_raw": answer})
136
+ return results, response
137
+
138
+
139
+ def send_feedback(
140
+ query, answer_obj, is_correct_answer, is_correct_document, document
141
+ ) -> None:
142
+ """
143
+ Send a feedback (label) to the REST API
144
+ """
145
+ url = f"{API_ENDPOINT}/{DOC_FEEDBACK}"
146
+ req = {
147
+ "query": query,
148
+ "document": document,
149
+ "is_correct_answer": is_correct_answer,
150
+ "is_correct_document": is_correct_document,
151
+ "origin": "user-feedback",
152
+ "answer": answer_obj,
153
+ }
154
+ response_raw = requests.post(url, json=req)
155
+ if response_raw.status_code >= 400:
156
+ raise ValueError(
157
+ f"An error was returned [code {response_raw.status_code}]: {response_raw.json()}"
158
+ )
159
+
160
+
161
+ def upload_doc(file):
162
+ url = f"{API_ENDPOINT}/{DOC_UPLOAD}"
163
+ files = [("files", file)]
164
+ response = requests.post(url, files=files).json()
165
+ return response
166
+
167
+
168
+ def get_backlink(result) -> Tuple[Optional[str], Optional[str]]:
169
+ if result.get("document", None):
170
+ doc = result["document"]
171
+ if isinstance(doc, dict):
172
+ if doc.get("meta", None):
173
+ if isinstance(doc["meta"], dict):
174
+ if doc["meta"].get("url", None) and doc["meta"].get("title", None):
175
+ return doc["meta"]["url"], doc["meta"]["title"]
176
+ return None, None
177
+
178
+
179
+ def setup_pipelines() -> Dict[str, Any]:
180
+ # Re-import the configuration variables
181
+ import config # pylint: disable=reimported
182
+
183
+ pipelines = {}
184
+ document_store = FAISSDocumentStore(
185
+ faiss_config_path="faiss.json", faiss_index_path="faiss.index"
186
+ )
187
+ retriever = EmbeddingRetriever(
188
+ document_store=document_store,
189
+ batch_size=128,
190
+ embedding_model="ada",
191
+ api_key=st.secrets["api_key"],
192
+ max_seq_len=1024,
193
+ )
194
+
195
+ shaper = Shaper(
196
+ func="join_documents", inputs={"documents": "documents"}, outputs=["documents"]
197
+ )
198
+
199
+ pipe = Pipeline()
200
+ pipe.add_node(component=retriever, name="retriever", inputs=["Query"])
201
+
202
+ logging.info(f"Loaded pipeline nodes: {pipe.graph.nodes.keys()}")
203
+ pipelines["query_pipeline"] = pipe
204
+
205
+ # Find document store
206
+
207
+ logging.info(f"Loaded docstore: {document_store}")
208
+ pipelines["document_store"] = document_store
209
+
210
+ # Load indexing pipeline (if available)
211
+ try:
212
+ indexing_pipeline = Pipeline.load_from_yaml(
213
+ Path(config.PIPELINE_YAML_PATH), pipeline_name=config.INDEXING_PIPELINE_NAME
214
+ )
215
+ docstore = indexing_pipeline.get_document_store()
216
+ if isinstance(docstore, UNSUPPORTED_DOC_STORES):
217
+ indexing_pipeline = None
218
+ raise PipelineConfigError(
219
+ "Indexing pipelines with FAISSDocumentStore or InMemoryDocumentStore are not supported by the REST APIs."
220
+ )
221
+
222
+ except PipelineConfigError as e:
223
+ indexing_pipeline = None
224
+ logger.error(f"{e.message}\nFile Upload API will not be available.")
225
+
226
+ finally:
227
+ pipelines["indexing_pipeline"] = indexing_pipeline
228
+
229
+ # Create directory for uploaded files
230
+ os.makedirs(config.FILE_UPLOAD_PATH, exist_ok=True)
231
+
232
+ return pipelines
233
+
234
+
235
+ def get_pipelines():
236
+ global pipelines # pylint: disable=global-statement
237
+ if not pipelines:
238
+ pipelines = setup_pipelines()
239
+ return pipelines