rodrigomasini bondares committed on
Commit
b1f727f
·
0 Parent(s):

Duplicate from bondares/ai_advisor

Browse files

Co-authored-by: Stanislav Bondarenko <[email protected]>

.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ *.index filter=lfs diff=lfs merge=lfs -text
36
+ *.db filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ __pycache__/
2
+ .vscode
3
+ huggingface/
4
+ .env
5
+ .streamlit
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.7.4-stretch

WORKDIR /home/user

RUN apt-get update && apt-get install -y curl git pkg-config cmake

# copy code
COPY setup.py /home/user
COPY utils.py /home/user
COPY app.py /home/user
COPY config.py /home/user
COPY requirements.txt /home/user

# Copy the retrieval artifacts: utils.setup_pipelines() imports `config`,
# opens faiss.json/faiss.index, and config points at pipeline/*.yml — without
# these files the warm-up step below (and the running app) fails.
COPY faiss.json /home/user
COPY faiss.index /home/user
COPY openai_faiss_document_store.db /home/user
COPY pipeline /home/user/pipeline

# install as a package
RUN pip install --upgrade pip
RUN pip install -r requirements.txt
# NOTE(review): importing utils reads st.secrets["gpt35_api_key"] at module
# import time, so this warm-up requires Streamlit secrets to be available at
# build time — confirm how secrets are injected for this image.
RUN python3 -c "from utils import get_pipelines;get_pipelines()"

EXPOSE 8501

# cmd for running the API
CMD ["python", "-m", "streamlit", "run", "app.py"]
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Ai Advisor
3
+ emoji: 📈
4
+ colorFrom: yellow
5
+ colorTo: red
6
+ sdk: streamlit
7
+ sdk_version: 1.17.0
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: bondares/ai_advisor
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

# Point all HuggingFace caches inside the working directory so the writable
# app directory is used instead of the (possibly read-only) home directory.
# Must happen before any transformers-dependent import.
cwd = os.getcwd()
os.environ['PYTORCH_TRANSFORMERS_CACHE'] = os.path.join(cwd, 'huggingface/transformers/')
os.environ['TRANSFORMERS_CACHE'] = os.path.join(cwd, 'huggingface/transformers/')
os.environ['HF_HOME'] = os.path.join(cwd, 'huggingface/')
import logging
from json import JSONDecodeError
from pathlib import Path

import pandas as pd
import streamlit as st
from markdown import markdown

from utils import get_backlink, get_pipelines, query, send_feedback, upload_doc

# Adjust to a question that you would like users to see in the search bar when they load the UI:
DEFAULT_QUESTION_AT_STARTUP = os.getenv(
    "DEFAULT_QUESTION_AT_STARTUP", "How to get TPS?")
DEFAULT_ANSWER_AT_STARTUP = os.getenv(
    "DEFAULT_ANSWER_AT_STARTUP", "You must file a Form I-765")

# Sliders
DEFAULT_DOCS_FROM_RETRIEVER = int(
    os.getenv("DEFAULT_DOCS_FROM_RETRIEVER", "5"))
DEFAULT_NUMBER_OF_ANSWERS = int(os.getenv("DEFAULT_NUMBER_OF_ANSWERS", "1"))


# Whether the file upload should be enabled or not.
# BUG FIX: the original `bool(os.getenv(..., "True"))` was True for ANY
# non-empty value — including "False" — so upload could never be enabled via
# the environment. Parse the common falsy spellings explicitly instead.
DISABLE_FILE_UPLOAD = os.getenv("DISABLE_FILE_UPLOAD", "True").strip().lower() not in (
    "", "0", "false", "no", "off")

LANG_MAP = {"English": "English", "Ukrainian": "Ukrainian", "russian": "russian"}


# Build (or fetch the cached) retrieval pipelines once at import time.
pipelines = get_pipelines()
38
+
39
+
40
def set_state_if_absent(key, value):
    """Seed *key* in Streamlit's session state unless it is already present."""
    if key in st.session_state:
        return
    st.session_state[key] = value
43
+
44
+
45
def main():
    """Render the Streamlit UI: persistent state, sidebar options, search bar
    and the formatted answer/source list returned by ``utils.query``."""

    st.set_page_config(page_title="AI advisor")

    # Persistent state
    set_state_if_absent("question", DEFAULT_QUESTION_AT_STARTUP)
    set_state_if_absent("answer", DEFAULT_ANSWER_AT_STARTUP)
    set_state_if_absent("results", None)
    set_state_if_absent("raw_json", None)
    set_state_if_absent("random_question_requested", False)

    # Small callback to reset the interface in case the text of the question changes
    def reset_results(*args):
        st.session_state.answer = None
        st.session_state.results = None
        st.session_state.raw_json = None

    # Title
    st.write("# AI Immigration advisor")

    # Sidebar
    st.sidebar.header("Options")
    language = st.sidebar.selectbox(
        "Select language: ", ("English", "Ukrainian", "Spanish", "French", "Italian", "Arabic", "Hindi", "Portuguese", "Mandarin Chinese", "Japanese", "russian"))
    # Debug mode is hard-disabled while the checkbox below stays commented out
    # (a duplicated `debug = False` assignment was removed).
    debug = False
    # debug = st.sidebar.checkbox("Show debug info")
    if debug:
        top_k_reader = st.sidebar.slider(
            "Max. number of answers",
            min_value=1,
            max_value=100,
            value=DEFAULT_NUMBER_OF_ANSWERS,
            step=1,
            on_change=reset_results,
        )

        top_k_retriever = st.sidebar.slider(
            "Max. number of documents from retriever",
            min_value=1,
            max_value=100,
            value=DEFAULT_DOCS_FROM_RETRIEVER,
            step=1,
            on_change=reset_results,
        )
    else:
        top_k_reader = DEFAULT_NUMBER_OF_ANSWERS
        top_k_retriever = DEFAULT_DOCS_FROM_RETRIEVER
    # File upload block
    if not DISABLE_FILE_UPLOAD:
        st.sidebar.write("## File Upload:")
        data_files = st.sidebar.file_uploader(
            "", type=["pdf", "txt", "docx"], accept_multiple_files=True)
        for data_file in data_files:
            # Upload file
            if data_file:
                raw_json = upload_doc(data_file)
                st.sidebar.write(str(data_file.name) + " &nbsp;&nbsp; ✅ ")
                if debug:
                    st.subheader("REST API JSON response")
                    st.sidebar.write(raw_json)

    # Search bar
    question = st.text_input(
        "", value=st.session_state.question, max_chars=100, on_change=reset_results)
    col1, col2 = st.columns(2)
    col1.markdown(
        "<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)
    col2.markdown(
        "<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)

    # Run button
    run_pressed = col1.button("Run")

    # Re-run the search when the button is pressed OR the question text changed.
    run_query = (
        run_pressed or question != st.session_state.question
    ) and not st.session_state.random_question_requested

    # Get results for query
    if run_query and question:
        reset_results()
        st.session_state.question = question

        with st.spinner("🧠 &nbsp;&nbsp; Performing neural search on documents... \n "):
            try:
                st.session_state.results, st.session_state.raw_json = query(
                    pipelines, question, top_k_reader=top_k_reader, top_k_retriever=top_k_retriever, language=language
                )
            except JSONDecodeError:
                st.error(
                    "👓 &nbsp;&nbsp; An error occurred reading the results. Is the document store working?")
                return
            except Exception as e:
                logging.exception(e)
                if "The server is busy processing requests" in str(e) or "503" in str(e):
                    st.error(
                        "🧑‍🌾 &nbsp;&nbsp; All our workers are busy! Try again later.")
                else:
                    st.error(
                        "🐞 &nbsp;&nbsp; An error occurred during the request.")
                return

    if st.session_state.results:

        st.write("## Results:")

        for count, result in enumerate(st.session_state.results):
            if result["answer"]:
                answer = result["answer"]
                st.write(
                    markdown(f"**Answer:** {answer}"), unsafe_allow_html=True)
                # Prefer a markdown backlink when the document carries url+title,
                # otherwise fall back to the plain source string built by query().
                url, title = get_backlink(result)
                if url and title:
                    source = f"[{result['document']['meta']['title']}]({result['document']['meta']['url']})"
                else:
                    source = f"{result['source']}"
                st.markdown(f"**Source:** {source}")

            else:
                st.info(
                    "🤔 &nbsp;&nbsp; Unsure whether any of the documents contain an answer to your question. Try to reformulate it!"
                )

        st.write("___")

    if debug:
        st.subheader("REST API JSON response")
        st.write(st.session_state.raw_json)


main()
config.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from pathlib import Path


def _sibling(*parts: str) -> str:
    """Absolute path of a file or directory located next to this module."""
    return str(Path(__file__).parent.joinpath(*parts).absolute())


# Each setting may be overridden via the environment variable of the same name.
PIPELINE_YAML_PATH = os.getenv(
    "PIPELINE_YAML_PATH", _sibling("pipeline", "pipelines.haystack-pipeline.yml")
)
FAISS_INDEX_PATH = os.getenv("FAISS_INDEX_PATH", _sibling("faiss.index"))
QUERY_PIPELINE_NAME = os.getenv("QUERY_PIPELINE_NAME", "query")
INDEXING_PIPELINE_NAME = os.getenv("INDEXING_PIPELINE_NAME", "indexing")

FILE_UPLOAD_PATH = os.getenv("FILE_UPLOAD_PATH", _sibling("file-upload"))

LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
ROOT_PATH = os.getenv("ROOT_PATH", "/")

CONCURRENT_REQUEST_PER_WORKER = int(os.getenv("CONCURRENT_REQUEST_PER_WORKER", "4"))
faiss.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa1c641d84f7bf7e58968610c31e913a3d2c44e341fde639c0c8a0ce48b16d19
3
+ size 24035373
faiss.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"sql_url": "sqlite:///openai_faiss_document_store.db", "embedding_dim": 1024, "faiss_index_factory_str": "Flat", "similarity": "cosine"}
openai_faiss_document_store.db ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:532271b901d948f42992d4ccc4e9e3b5b9b7ebb9ea1547ce6d7adf09c3b81bda
3
+ size 17989632
pipeline/__init__.py ADDED
File without changes
pipeline/pipelines.haystack-pipeline.yml ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # To allow your IDE to autocomplete and validate your YAML pipelines, name them as <name of your choice>.haystack-pipeline.yml
2
+
3
+ version: 1.7.0
4
+
5
+ components: # define all the building-blocks for Pipeline
6
+ - name: DocumentStore
7
+ type: FAISSDocumentStore # consider using MilvusDocumentStore or WeaviateDocumentStore for scaling to large number of documents
8
+ params:
9
+ faiss_index_path: faiss.index
10
+ # faiss_config_path: rest_api/faiss.json
11
+ # sql_url: sqlite:///rest_api/faiss_document_store.db
12
+ - name: Retriever
13
+ type: DensePassageRetriever
14
+ params:
15
+ document_store: DocumentStore # params can reference other components defined in the YAML
16
+ passage_embedding_model: vblagoje/dpr-ctx_encoder-single-lfqa-wiki
17
+ query_embedding_model: vblagoje/dpr-question_encoder-single-lfqa-wiki
18
+ - name: Generator # custom-name for the component; helpful for visualization & debugging
19
+ type: Seq2SeqGenerator # Haystack Class name for the component
20
+ params:
21
+ model_name_or_path: vblagoje/bart_lfqa
22
+ max_length: 300
23
+ min_length: 10
24
+ # - name: TextFileConverter
25
+ # type: TextConverter
26
+ # - name: PDFFileConverter
27
+ # type: PDFToTextConverter
28
+ # - name: Preprocessor
29
+ # type: PreProcessor
30
+ # params:
31
+ # split_by: word
32
+ # split_length: 300
33
+ # - name: FileTypeClassifier
34
+ # type: FileTypeClassifier
35
+
36
+ pipelines:
37
+ - name: query # generative-qa Pipeline
38
+ nodes:
39
+ - name: Retriever
40
+ inputs: [Query]
41
+ - name: Generator
42
+ inputs: [Retriever]
43
+ # - name: indexing
44
+ # nodes:
45
+ # - name: FileTypeClassifier
46
+ # inputs: [File]
47
+ # - name: TextFileConverter
48
+ # inputs: [FileTypeClassifier.output_1]
49
+ # - name: PDFFileConverter
50
+ # inputs: [FileTypeClassifier.output_2]
51
+ # - name: Preprocessor
52
+ # inputs: [PDFFileConverter, TextFileConverter]
53
+ # - name: Retriever
54
+ # inputs: [Preprocessor]
55
+ # - name: DocumentStore
56
+ # inputs: [Retriever]
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ farm-haystack==1.13.2
2
+ streamlit>=1.2.0, <2
3
+ st-annotated-text>=2.0.0, <3
4
+ markdown>=3.3.4, <4
5
+ faiss-cpu==1.7.2
6
+ openai==0.27.2
setup.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging
from pathlib import Path

from setuptools import setup, find_packages


# Resolve the package version from VERSION.txt, falling back to a sentinel so
# local installs still work when the file is missing.
# FIX: use Path.read_text instead of bare open(...).read(), which leaked the
# file handle.
VERSION = "0.0.0"
try:
    # After git clone, VERSION.txt is in the root folder
    VERSION = (Path(__file__).parent.parent / "VERSION.txt").read_text()
except Exception:
    try:
        # In Docker, VERSION.txt is in the same folder
        VERSION = (Path(__file__).parent / "VERSION.txt").read_text()
    except Exception:
        logging.exception("No VERSION.txt found!")

setup(
    name="farm-haystack-ui",
    version=VERSION,
    description="Demo UI for Haystack (https://github.com/deepset-ai/haystack)",
    author="deepset.ai",
    author_email="[email protected]",
    url=" https://github.com/deepset-ai/haystack/tree/master/ui",
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Intended Audience :: Science/Research",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Operating System :: OS Independent",
        "Programming Language :: Python",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
    ],
    packages=find_packages(),
    python_requires=">=3.7, <4",
    # NOTE(review): farm-haystack is pinned to 1.7.0 here but to 1.13.2 in
    # requirements.txt — confirm which pin is authoritative.
    install_requires=["streamlit>=1.2.0, <2", "st-annotated-text>=2.0.0, <3", "markdown>=3.3.4, <4", "farm-haystack==1.7.0"],
)
test/__init__.py ADDED
File without changes
test/test_ui_utils.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from unittest.mock import patch

# NOTE(review): this module imports `ui.utils`, but in this repo the module
# lives at top-level `utils` and defines no `haystack_is_ready` — these tests
# look stale (carried over from the upstream haystack UI repo); confirm.
from ui.utils import haystack_is_ready


def test_haystack_is_ready():
    # A mocked HTTP 200 must make haystack_is_ready() return truthy.
    with patch("requests.get") as mocked_get:
        mocked_get.return_value.status_code = 200
        assert haystack_is_ready()


def test_haystack_is_ready_fail():
    # A mocked HTTP 400 must make haystack_is_ready() return falsy.
    with patch("requests.get") as mocked_get:
        mocked_get.return_value.status_code = 400
        assert not haystack_is_ready()
utils.py ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import List, Dict, Any, Tuple, Optional
import yaml  # NOTE(review): unused in this module's visible code — confirm before removing
import os

# Redirect all HuggingFace caches into the (writable) working directory BEFORE
# any transformers-dependent import below.
cwd = os.getcwd()
os.environ["PYTORCH_TRANSFORMERS_CACHE"] = os.path.join(
    cwd, "huggingface/transformers/"
)
os.environ["TRANSFORMERS_CACHE"] = os.path.join(cwd, "huggingface/transformers/")
os.environ["HF_HOME"] = os.path.join(cwd, "huggingface/")
import logging
import json
from time import sleep, time  # NOTE(review): `sleep` is unused here

import requests
import streamlit as st

from pathlib import Path

from haystack.pipelines.base import Pipeline
from haystack.nodes import EmbeddingRetriever, Shaper
from haystack.document_stores import FAISSDocumentStore, InMemoryDocumentStore
from haystack.errors import PipelineConfigError
# from googletrans import Translator
import openai

# Import-time side effect: configures the OpenAI client. This raises KeyError
# if the Streamlit secret "gpt35_api_key" is absent, so the app cannot start
# without it.
openai.api_key = st.secrets["gpt35_api_key"]

logger = logging.getLogger(__name__)
# Module-level singleton populated lazily by get_pipelines().
pipelines = None

# System prompt for the ChatCompletion call; {0} is the answer language name.
system_prompt_template = """You are an US experienced immigration attorney. Your answer should be in your own words, be detailed and be no longer than 350 words.
You should ask for more information or clarifications to give a more precise answer for an each client's case.
Synthesize a comprehensive answer from your knowledge and the following topk most relevant paragraphs and the given question.
Do not tell them to have a consultation with a certified attorney. Clients have been warned that you are our assistant and that's enough.
Give an answer in the {0} language.
"""
# User prompt; {0} = retrieved paragraphs, {1} = the question.
users_prompt_template = """
Paragraphs: {0}

Question: {1}
"""
# Since each instance of FAISSDocumentStore creates an in-memory FAISS index, the Indexing & Query Pipelines would
# end up with different indices. The same applies for InMemoryDocumentStore.
UNSUPPORTED_DOC_STORES = (FAISSDocumentStore, InMemoryDocumentStore)

# Base URL and route names of the REST API used by send_feedback/upload_doc.
API_ENDPOINT = os.getenv("API_ENDPOINT", "http://localhost:8000")
STATUS = "initialized"
HS_VERSION = "hs_version"
DOC_REQUEST = "query"
DOC_FEEDBACK = "feedback"
DOC_UPLOAD = "file-upload"

# translator = Translator()
56
+
57
def query(
    pipelines, query, filters=None, language="en", top_k_reader=3, top_k_retriever=5
) -> Tuple[List[Dict[str, Any]], Dict[str, str]]:
    """
    Run the retrieval pipeline, then build a GPT-3.5 answer from the retrieved
    paragraphs.

    Returns both a ready-to-use representation of the results and the raw
    pipeline response.

    :param pipelines: dict produced by ``get_pipelines()``; must contain
        ``"query_pipeline"``.
    :param query: the user's question.
    :param filters: reserved for retriever filters (currently unused; was a
        mutable ``{}`` default, now None-safe).
    :param language: language name for the answer, e.g. "English".
    :param top_k_reader: see NOTE in the loop below.
    :param top_k_retriever: number of documents fetched by the retriever.
    """
    filters = filters or {}  # avoid a shared mutable default argument
    query_pipeline = pipelines.get("query_pipeline", None)
    start_time = time()

    params = {
        "retriever": {"top_k": top_k_retriever},
    }

    lang = language.lower() or "english"

    response = query_pipeline.run(
        query=query,
        params=params,
    )
    # Ensure documents exist, even if they're an empty list, BEFORE iterating
    # them. (The original performed this check only after the loop, where it
    # could no longer prevent a KeyError.)
    if "documents" not in response:
        response["documents"] = []

    context = ""
    sources = []
    for doc in response["documents"]:
        doc = doc.to_dict()
        doc_name = doc["meta"].get("name")
        doc_url = doc["meta"].get("url")
        source = (
            "https://www.uscis.gov/sites/default/files/document/forms/" + doc_name
            if doc_name
            else doc_url
        )
        # Guard: both meta name and url may be missing, leaving source None
        # (the original would raise AttributeError on .endswith).
        if source and not source.endswith('.txt'):
            sources.append(source)
        # NOTE(review): this compares the *character* length of the context
        # against top_k_reader (a small document count), so effectively only
        # the first document's content is ever appended. Behavior preserved —
        # confirm whether `len(sources) < top_k_reader` was intended.
        if len(context) < top_k_reader:
            context += " " + doc.get("content")

    # prepare openAI api call
    messages = []
    system_prompt = system_prompt_template.format(lang)
    user_prompt = users_prompt_template.format(context, response["query"])
    messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": user_prompt})

    openai_response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo", messages=messages
    )
    bot_response = openai_response["choices"][0]["message"]["content"]
    response["answers"] = [bot_response]
    logger.info(
        json.dumps(
            {
                "request": query,
                "response": response,
                "time": f"{(time() - start_time):.2f}",
            },
            default=str,
        )
    )

    # Format response
    results = []
    answers = response["answers"]
    documents = response["documents"]
    for answer, doc in zip(answers, documents):
        doc = doc.to_dict()
        if answer:
            context = doc.get("content")
            results.append(
                {
                    "context": "..." + context if context else "",
                    "answer": answer,
                    "source": "\n".join(sources),
                    "_raw": answer,
                }
            )
        else:
            results.append({"context": None, "answer": None, "_raw": answer})
    return results, response
138
+
139
+
140
def send_feedback(
    query, answer_obj, is_correct_answer, is_correct_document, document
) -> None:
    """POST a user-feedback label for (query, answer, document) to the REST API.

    Raises ValueError when the API responds with a 4xx/5xx status.
    """
    feedback_url = f"{API_ENDPOINT}/{DOC_FEEDBACK}"
    payload = {
        "query": query,
        "document": document,
        "is_correct_answer": is_correct_answer,
        "is_correct_document": is_correct_document,
        "origin": "user-feedback",
        "answer": answer_obj,
    }
    response_raw = requests.post(feedback_url, json=payload)
    status = response_raw.status_code
    if status >= 400:
        raise ValueError(
            f"An error was returned [code {status}]: {response_raw.json()}"
        )
160
+
161
+
162
def upload_doc(file):
    """Upload a single file to the REST API's file-upload endpoint and return the parsed JSON reply."""
    upload_url = f"{API_ENDPOINT}/{DOC_UPLOAD}"
    reply = requests.post(upload_url, files=[("files", file)])
    return reply.json()
167
+
168
+
169
def get_backlink(result) -> Tuple[Optional[str], Optional[str]]:
    """Extract (url, title) from result["document"]["meta"].

    Returns (None, None) unless the document and its meta are dicts and both
    "url" and "title" are present and non-empty.
    """
    doc = result.get("document", None)
    if not isinstance(doc, dict) or not doc:
        return None, None
    meta = doc.get("meta", None)
    if not isinstance(meta, dict):
        return None, None
    url = meta.get("url", None)
    title = meta.get("title", None)
    if url and title:
        return url, title
    return None, None
178
+
179
+
180
def setup_pipelines() -> Dict[str, Any]:
    """
    Build the objects used by the app and return them as a dict with keys
    "query_pipeline", "document_store" and "indexing_pipeline" (the last is
    None when no indexing pipeline can be loaded).
    """
    # Re-import the configuration variables
    import config  # pylint: disable=reimported

    pipelines = {}

    # Document store backed by the pre-built FAISS index shipped with the app.
    document_store = FAISSDocumentStore(
        faiss_config_path="faiss.json", faiss_index_path="faiss.index"
    )
    # Query-side retriever using OpenAI "ada" embeddings.
    retriever = EmbeddingRetriever(
        document_store=document_store,
        batch_size=128,
        embedding_model="ada",
        api_key=st.secrets["api_key"],
        max_seq_len=1024,
    )

    # NOTE: a Shaper node used to be constructed here but was never added to
    # any pipeline — the dead code has been removed.

    pipe = Pipeline()
    pipe.add_node(component=retriever, name="retriever", inputs=["Query"])

    logging.info(f"Loaded pipeline nodes: {pipe.graph.nodes.keys()}")
    pipelines["query_pipeline"] = pipe

    logging.info(f"Loaded docstore: {document_store}")
    pipelines["document_store"] = document_store

    # Load indexing pipeline (if available). Pre-initialize so the `finally`
    # block never sees an unbound name if load_from_yaml raises something
    # other than PipelineConfigError.
    indexing_pipeline = None
    try:
        indexing_pipeline = Pipeline.load_from_yaml(
            Path(config.PIPELINE_YAML_PATH), pipeline_name=config.INDEXING_PIPELINE_NAME
        )
        docstore = indexing_pipeline.get_document_store()
        if isinstance(docstore, UNSUPPORTED_DOC_STORES):
            indexing_pipeline = None
            raise PipelineConfigError(
                "Indexing pipelines with FAISSDocumentStore or InMemoryDocumentStore are not supported by the REST APIs."
            )

    except PipelineConfigError as e:
        indexing_pipeline = None
        logger.error(f"{e.message}\nFile Upload API will not be available.")

    finally:
        pipelines["indexing_pipeline"] = indexing_pipeline

    # Create directory for uploaded files
    os.makedirs(config.FILE_UPLOAD_PATH, exist_ok=True)

    return pipelines
234
+
235
+
236
def get_pipelines():
    """Return the process-wide pipelines dict, building it lazily on first call."""
    global pipelines  # pylint: disable=global-statement
    if pipelines:
        return pipelines
    pipelines = setup_pipelines()
    return pipelines