Docfile · whitead committed
Commit 46a26c2 · 0 parent(s)

Duplicate from whitead/paper-qa

Co-authored-by: Andrew White <[email protected]>

Files changed (4):
  1. .gitattributes +34 -0
  2. README.md +21 -0
  3. app.py +207 -0
  4. requirements.txt +4 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,21 @@
+ ---
+ title: Paper Qa
+ emoji: ❓
+ colorFrom: indigo
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 3.18.0
+ app_file: app.py
+ license: mit
+ duplicated_from: whitead/paper-qa
+ ---
+
+ # Paper QA
+
+ This tool lets you ask questions of your uploaded text or PDF documents.
+ It uses OpenAI's GPT models, so you must enter your API key below. This
+ tool is under active development and currently uses many tokens - up to
+ 10,000 for a single query. That is $0.10-0.20 per query, so please be careful!
+
+ * [PaperQA](https://github.com/whitead/paper-qa) is the code used to build this tool.
+ * [langchain](https://github.com/hwchase17/langchain) is the main library this tool utilizes.
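For context, a minimal sketch of driving paper-qa directly from Python, assuming the 0.x API that app.py below relies on (`Docs.add` and `Docs.query`); the file names, citation strings, and keys here are illustrative only:

```python
import os
import paperqa

# The library reads the OpenAI key from the environment,
# just as app.py sets it from the textbox.
os.environ["OPENAI_API_KEY"] = "sk-..."  # placeholder, not a real key

docs = paperqa.Docs()
# Hypothetical documents; any local PDF or text file works.
docs.add("manuscript.pdf", "White et al., 2023", key="white2023")
docs.add("notes.txt", "Lab notes, 2023", key="notes2023")

# One-shot query; app.py uses the streaming variant, query_gen.
answer = docs.query("What method does the paper propose?")
print(answer.formatted_answer)
```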
app.py ADDED
@@ -0,0 +1,207 @@
+ import gradio as gr
+ import paperqa
+ import pickle
+ import pandas as pd
+ from pathlib import Path
+ import requests
+ import zipfile
+ import io
+ import tempfile
+ import os
+
+
+ css_style = """
+
+ .gradio-container {
+     font-family: "IBM Plex Mono";
+ }
+ """
+
+
+ def request_pathname(files, data, openai_api_key):
+     # add newly uploaded files to the dataset, skipping duplicates
+     if files is None:
+         # nothing uploaded yet; keep all four outputs consistent
+         return [[0, 0]], data, data, validate_dataset(pd.DataFrame(data), openai_api_key)
+     for file in files:
+         # make sure we're not duplicating things in the dataset
+         if file.name in [x[0] for x in data]:
+             continue
+         data.append([file.name, None, None])
+     return [[len(data), 0]], data, data, validate_dataset(pd.DataFrame(data), openai_api_key)
+
+
+ def validate_dataset(dataset, openapi):
+     # status string shown while waiting for documents and/or an API key
+     docs_ready = len(dataset) > 0 and dataset.iloc[-1, 0] != ""
+     if docs_ready and type(openapi) is str and len(openapi) > 0:
+         return "✨Ready✨"
+     elif docs_ready:
+         return "⚠️Waiting for key⚠️"
+     elif type(openapi) is str and len(openapi) > 0:
+         return "⚠️Waiting for documents⚠️"
+     else:
+         return "⚠️Waiting for documents and key⚠️"
+
+
+ def make_stats(docs):
+     # [docs, chunks] counts for the stats table
+     return [[len(docs.doc_previews), sum([x[0] for x in docs.doc_previews])]]
+
+
+ # , progress=gr.Progress()):
+ def do_ask(question, button, openapi, dataset, length, do_marg, k, max_sources, docs):
+     # generator callback: each yield streams updated outputs to the UI
+     passages = ""
+     docs_ready = dataset.iloc[-1, 0] != ""
+     if button == "✨Ready✨" and type(openapi) is str and len(openapi) > 0 and docs_ready:
+         os.environ['OPENAI_API_KEY'] = openapi.strip()
+         if docs is None:
+             docs = paperqa.Docs()
+         # dataset is pandas dataframe
+         for _, row in dataset.iterrows():
+             try:
+                 docs.add(row['filepath'], row['citation string'],
+                          key=row['key'], disable_check=True)
+                 yield "", "", "", docs, make_stats(docs)
+             except Exception:
+                 # skip files that fail to parse
+                 pass
+     else:
+         yield "", "", "", docs, [[0, 0]]
+         # not ready yet; bail out before touching the index
+         return
+     # progress(0, "Building Index...")
+     docs._build_faiss_index()
+     # progress(0.25, "Querying...")
+     for i, result in enumerate(docs.query_gen(question,
+                                               length_prompt=f'use {length:d} words',
+                                               marginal_relevance=do_marg,
+                                               k=k, max_sources=max_sources)):
+         # progress(0.25 + 0.1 * i, "Generating Context" + str(i))
+         yield result.formatted_answer, result.context, passages, docs, make_stats(docs)
+     # progress(1.0, "Done!")
+     # format the passages
+     for i, (key, passage) in enumerate(result.passages.items()):
+         passages += 'Disabled for now'
+     yield result.formatted_answer, result.context, passages, docs, make_stats(docs)
+
+
+ def download_repo(gh_repo, data, openai_api_key, pbar=gr.Progress()):
+     # download zipped version of repo
+     r = requests.get(f'https://api.github.com/repos/{gh_repo}/zipball')
+     if r.status_code == 200:
+         pbar(0.01, 'Downloaded')
+
+         # iterate through files in zip
+         with zipfile.ZipFile(io.BytesIO(r.content)) as z:
+             for i, f in enumerate(z.namelist()):
+                 # skip directories
+                 if f.endswith('/'):
+                     continue
+                 # try to read as plaintext (skip binary files)
+                 try:
+                     text = z.read(f).decode('utf-8')
+                 except UnicodeDecodeError:
+                     continue
+                 # check if it's bigger than 100kb or smaller than 10 bytes
+                 if len(text) > 1e5 or len(text) < 10:
+                     continue
+                 # strip off the first directory of f
+                 rel_path = '/'.join(f.split('/')[1:])
+                 key = os.path.basename(f)
+                 citation = f'[{rel_path}](https://github.com/{gh_repo}/tree/main/{rel_path})'
+                 # make sure we're not duplicating things in the dataset
+                 # (dedupe on the citation; temp-file paths are always unique)
+                 if citation in [x[1] for x in data]:
+                     continue
+                 # have to save to temporary file so we have a path
+                 with tempfile.NamedTemporaryFile(delete=False) as tmp:
+                     tmp.write(text.encode('utf-8'))
+                     tmp.flush()
+                     path = tmp.name
+                 data.append([path, citation, key])
+                 yield [[len(data), 0]], data, data, validate_dataset(pd.DataFrame(data), openai_api_key)
+                 pbar((i + 1) / len(z.namelist()) * 0.99,
+                      f'Added {f}')
+         pbar(1.0, 'Done')
+     else:
+         raise ValueError('Unknown Github Repo')
+     return data
+
+
+ with gr.Blocks(css=css_style) as demo:
+
+     docs = gr.State(None)
+     data = gr.State([])
+     openai_api_key = gr.State('')  # superseded by the Textbox below
+
+     gr.Markdown(f"""
+     # Document Question and Answer (v{paperqa.__version__})
+
+     *By Andrew White ([@andrewwhite01](https://twitter.com/andrewwhite01))*
+
+     This tool lets you ask questions of your uploaded text or PDF documents,
+     or of a scraped GitHub repo.
+     It uses OpenAI's GPT models, so you must enter your API key below. This
+     tool is under active development and currently uses many tokens - up to 10,000
+     for a single query. That is $0.10-0.20 per query, so please be careful!
+
+     * [PaperQA](https://github.com/whitead/paper-qa) is the code used to build this tool.
+     * [langchain](https://github.com/hwchase17/langchain) is the main library this tool utilizes.
+
+     1. Enter API Key ([What is that?](https://platform.openai.com/account/api-keys))
+     2. Upload your documents
+     3. Ask a question
+     """)
+     openai_api_key = gr.Textbox(
+         label="OpenAI API Key", placeholder="sk-...", type="password")
+     with gr.Tab('File Upload'):
+         uploaded_files = gr.File(
+             label="Your Documents Upload (PDF or txt)", file_count="multiple")
+     with gr.Tab('Github Repo'):
+         gh_repo = gr.Textbox(
+             label="Github Repo", placeholder="whitead/paper-qa")
+         download = gr.Button("Download Repo")
+
+     with gr.Accordion("See Docs:", open=False):
+         dataset = gr.Dataframe(
+             headers=["filepath", "citation string", "key"],
+             datatype=["str", "str", "str"],
+             col_count=(3, "fixed"),
+             interactive=False,
+             label="Documents and Citations",
+             overflow_row_behaviour='paginate',
+             max_rows=5
+         )
+     buildb = gr.Textbox("⚠️Waiting for documents and key...",
+                         label="Status", interactive=False, show_label=True,
+                         max_lines=1)
+     stats = gr.Dataframe(headers=['Docs', 'Chunks'],
+                          datatype=['number', 'number'],
+                          col_count=(2, "fixed"),
+                          interactive=False,
+                          label="Doc Stats")
+     openai_api_key.change(validate_dataset, inputs=[
+         dataset, openai_api_key], outputs=[buildb])
+     dataset.change(validate_dataset, inputs=[
+         dataset, openai_api_key], outputs=[buildb])
+     uploaded_files.change(request_pathname, inputs=[
+         uploaded_files, data, openai_api_key], outputs=[stats, data, dataset, buildb])
+     download.click(fn=download_repo, inputs=[
+         gh_repo, data, openai_api_key], outputs=[stats, data, dataset, buildb])
+     query = gr.Textbox(
+         placeholder="Enter your question here...", label="Question")
+     with gr.Row():
+         length = gr.Slider(25, 200, value=100, step=5,
+                            label='Words in answer')
+         marg = gr.Checkbox(True, label='Max marginal relevance')
+         k = gr.Slider(1, 20, value=10, step=1,
+                       label='Chunks to examine')
+         sources = gr.Slider(1, 10, value=5, step=1,
+                             label='Contexts to include')
+
+     ask = gr.Button("Ask Question")
+     answer = gr.Markdown(label="Answer")
+     with gr.Accordion("Context", open=True):
+         context = gr.Markdown(label="Context")
+
+     with gr.Accordion("Raw Text", open=False):
+         passages = gr.Markdown(label="Passages")
+     ask.click(fn=do_ask, inputs=[query, buildb,
+                                  openai_api_key, dataset,
+                                  length, marg, k, sources,
+                                  docs], outputs=[answer, context, passages, docs, stats])
+
+ demo.queue(concurrency_count=20)
+ demo.launch(show_error=True)
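One detail worth noting: `do_ask` and `download_repo` above are generator callbacks, so each `yield` streams an intermediate update to every wired output, which is why the app calls `demo.queue()`. A minimal self-contained sketch of that pattern in gradio 3.x (hypothetical component names and handler, not part of this commit):

```python
import time
import gradio as gr


def slow_count(n):
    # a generator event handler: every yield pushes an
    # intermediate value to the wired output component
    total = 0
    for i in range(int(n)):
        total += i
        time.sleep(0.5)  # stand-in for real work (parsing, querying)
        yield f"partial sum after {i + 1} steps: {total}"


with gr.Blocks() as sketch:
    n = gr.Slider(1, 10, value=5, step=1, label="Steps")
    out = gr.Markdown()
    gr.Button("Run").click(fn=slow_count, inputs=[n], outputs=[out])

# queue() is required for streaming generator outputs in gradio 3.x
sketch.queue()
# sketch.launch()
```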
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ paper-qa>=0.0.21
+ gradio
+ requests
+ transformers