alanwnl committed on
Commit
39c23d5
·
1 Parent(s): 5669a2c

Add application file

Browse files
Files changed (1) hide show
  1. app.py +83 -62
app.py CHANGED
@@ -27,7 +27,7 @@ def pdf_to_text(path, start_page=1, end_page=None):
27
 
28
  text_list = []
29
 
30
- for i in range(start_page-1, end_page):
31
  text = doc.load_page(i).get_text("text")
32
  text = preprocess(text)
33
  text_list.append(text)
@@ -40,13 +40,14 @@ def text_to_chunks(texts, word_length=150, start_page=1):
40
  text_toks = [t.split(' ') for t in texts]
41
  page_nums = []
42
  chunks = []
43
-
44
  for idx, words in enumerate(text_toks):
45
  for i in range(0, len(words), word_length):
46
- chunk = words[i:i+word_length]
47
- if (i+word_length) > len(words) and (len(chunk) < word_length) and (
48
- len(text_toks) != (idx+1)):
49
- text_toks[idx+1] = chunk + text_toks[idx+1]
 
50
  continue
51
  chunk = ' '.join(chunk).strip()
52
  chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
@@ -55,12 +56,12 @@ def text_to_chunks(texts, word_length=150, start_page=1):
55
 
56
 
57
  class SemanticSearch:
58
-
59
  def __init__(self):
60
- self.use = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
 
61
  self.fitted = False
62
-
63
-
64
  def fit(self, data, batch=1000, n_neighbors=5):
65
  self.data = data
66
  self.embeddings = self.get_text_embedding(data, batch=batch)
@@ -68,29 +69,26 @@ class SemanticSearch:
68
  self.nn = NearestNeighbors(n_neighbors=n_neighbors)
69
  self.nn.fit(self.embeddings)
70
  self.fitted = True
71
-
72
-
73
  def __call__(self, text, return_data=True):
74
  inp_emb = self.use([text])
75
  neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
76
-
77
  if return_data:
78
  return [self.data[i] for i in neighbors]
79
  else:
80
  return neighbors
81
-
82
-
83
  def get_text_embedding(self, texts, batch=1000):
84
  embeddings = []
85
  for i in range(0, len(texts), batch):
86
- text_batch = texts[i:(i+batch)]
87
  emb_batch = self.use(text_batch)
88
  embeddings.append(emb_batch)
89
  embeddings = np.vstack(embeddings)
90
  return embeddings
91
 
92
 
93
-
94
  def load_recommender(path, start_page=1):
95
  global recommender
96
  texts = pdf_to_text(path, start_page=start_page)
@@ -100,37 +98,38 @@ def load_recommender(path, start_page=1):
100
 
101
 
102
  ####################
103
- def generate_text(openAI_key,prompt,engine="chatgpt"):
 
 
 
 
104
  openai.api_type = "azure"
105
- openai.api_base = "https://api.hku.hk"
106
- openai.api_version = "2023-03-15-preview"
107
  openai.api_key = openAI_key
108
- completions = openai.ChatCompletion.create(
109
- engine="chatgpt",
110
- max_tokens=1024,
111
- n=1,
112
- stop=None,
113
- temperature=0.7,
114
- messages=[
115
- {"role": "user", "content": prompt}
116
- ]
117
- )
118
  print(completions)
119
  message = completions['choices'][0]['message']['content']
120
  return message
121
 
122
 
123
-
124
-
125
- #####################
126
-
127
- def generate_answer(question,openAI_key):
128
  topn_chunks = recommender(question)
 
 
129
  prompt = ""
130
  prompt += 'search results:\n\n'
131
  for c in topn_chunks:
132
  prompt += c + '\n\n'
133
-
134
  prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
135
  "Cite each reference using [ Page Number] notation (every result has this number at the beginning). "\
136
  "Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
@@ -139,21 +138,20 @@ def generate_answer(question,openAI_key):
139
  "If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier "\
140
  "search results which has nothing to do with the question. Only answer what is asked. The "\
141
  "answer should be short and concise. \n\nQuery: {question}\nAnswer: "
142
-
143
  prompt += f"Query: {question}\nAnswer:"
144
- answer = generate_text(openAI_key, prompt,"chatgpt")
 
145
  return answer
146
 
147
 
148
-
149
-
150
-
151
- def question_answer(url, file, question,openAI_key):
152
- if openAI_key.strip()=='':
153
  return '[ERROR]: Please enter you Open AI Key. Get your key here : https://platform.openai.com/account/api-keys'
154
  if url.strip() == '' and file == None:
155
  return '[ERROR]: Both URL and PDF is empty. Provide atleast one.'
156
-
157
  if url.strip() != '' and file != None:
158
  return '[ERROR]: Both URL and PDF is provided. Please provide only one (eiter URL or PDF).'
159
 
@@ -172,13 +170,17 @@ def question_answer(url, file, question,openAI_key):
172
  if question.strip() == '':
173
  return '[ERROR]: Question field is empty'
174
 
175
- return generate_answer(question,openAI_key)
176
-
177
- recommender = SemanticSearch()
178
-
179
- title = 'PDF GPT'
180
- description = """ PDF GPT allows you to chat with your PDF file using Universal Sentence Encoder and Open AI. It gives hallucination free response than other tools as the embeddings are better than OpenAI. The returned response can even cite the page number in square brackets([]) where the information is located, adding credibility to the responses and helping to locate pertinent information quickly."""
181
 
 
 
 
 
 
 
 
182
 
183
  with gr.Blocks() as demo:
184
 
@@ -186,21 +188,40 @@ with gr.Blocks() as demo:
186
  gr.Markdown(description)
187
 
188
  with gr.Row():
189
-
190
  with gr.Group():
191
- gr.Markdown(f'<p style="text-align:center">Get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a></p>')
192
- openAI_key=gr.Textbox(label='Enter your OpenAI API key here')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  url = gr.Textbox(label='Enter PDF URL here')
194
  gr.Markdown("<center><h4>OR<h4></center>")
195
- file = gr.File(label='Upload your PDF/ Research Paper / Book here', file_types=['.pdf'])
196
- question = gr.Textbox(label='Enter your question here')
197
- btn = gr.Button(value='Submit')
198
- btn.style(full_width=True)
199
-
200
  with gr.Group():
 
 
201
  answer = gr.Textbox(label='The answer to your question is :')
202
 
203
- btn.click(question_answer, inputs=[url, file, question,openAI_key], outputs=[answer])
204
- #openai.api_key = os.getenv('Your_Key_Here')
205
- demo.launch()
206
-
 
 
 
 
 
27
 
28
  text_list = []
29
 
30
+ for i in range(start_page - 1, end_page):
31
  text = doc.load_page(i).get_text("text")
32
  text = preprocess(text)
33
  text_list.append(text)
 
40
  text_toks = [t.split(' ') for t in texts]
41
  page_nums = []
42
  chunks = []
43
+
44
  for idx, words in enumerate(text_toks):
45
  for i in range(0, len(words), word_length):
46
+ chunk = words[i:i + word_length]
47
+ if (i + word_length) > len(words) and (
48
+ len(chunk) < word_length) and (len(text_toks)
49
+ != (idx + 1)):
50
+ text_toks[idx + 1] = chunk + text_toks[idx + 1]
51
  continue
52
  chunk = ' '.join(chunk).strip()
53
  chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
 
56
 
57
 
58
  class SemanticSearch:
59
+
60
  def __init__(self):
61
+ self.use = hub.load(
62
+ "https://tfhub.dev/google/universal-sentence-encoder/4")
63
  self.fitted = False
64
+
 
65
  def fit(self, data, batch=1000, n_neighbors=5):
66
  self.data = data
67
  self.embeddings = self.get_text_embedding(data, batch=batch)
 
69
  self.nn = NearestNeighbors(n_neighbors=n_neighbors)
70
  self.nn.fit(self.embeddings)
71
  self.fitted = True
72
+
 
73
  def __call__(self, text, return_data=True):
74
  inp_emb = self.use([text])
75
  neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
76
+
77
  if return_data:
78
  return [self.data[i] for i in neighbors]
79
  else:
80
  return neighbors
81
+
 
82
  def get_text_embedding(self, texts, batch=1000):
83
  embeddings = []
84
  for i in range(0, len(texts), batch):
85
+ text_batch = texts[i:(i + batch)]
86
  emb_batch = self.use(text_batch)
87
  embeddings.append(emb_batch)
88
  embeddings = np.vstack(embeddings)
89
  return embeddings
90
 
91
 
 
92
  def load_recommender(path, start_page=1):
93
  global recommender
94
  texts = pdf_to_text(path, start_page=start_page)
 
98
 
99
 
100
  ####################
101
+ def generate_text(openAI_key,
102
+ openAI_base,
103
+ openAI_API_version,
104
+ prompt,
105
+ engine="chatgpt"):
106
  openai.api_type = "azure"
107
+ openai.api_base = openAI_base
108
+ openai.api_version = openAI_API_version
109
  openai.api_key = openAI_key
110
+ completions = openai.ChatCompletion.create(engine="chatgpt",
111
+ max_tokens=1024,
112
+ n=1,
113
+ stop=None,
114
+ temperature=1.0,
115
+ messages=[{
116
+ "role": "user",
117
+ "content": prompt
118
+ }])
 
119
  print(completions)
120
  message = completions['choices'][0]['message']['content']
121
  return message
122
 
123
 
124
+ def generate_answer(question, openAI_key, openAI_base, openAI_API_version):
 
 
 
 
125
  topn_chunks = recommender(question)
126
+ print(len(topn_chunks))
127
+ print(*topn_chunks, sep = "\n")
128
  prompt = ""
129
  prompt += 'search results:\n\n'
130
  for c in topn_chunks:
131
  prompt += c + '\n\n'
132
+
133
  prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
134
  "Cite each reference using [ Page Number] notation (every result has this number at the beginning). "\
135
  "Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
 
138
  "If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier "\
139
  "search results which has nothing to do with the question. Only answer what is asked. The "\
140
  "answer should be short and concise. \n\nQuery: {question}\nAnswer: "
141
+
142
  prompt += f"Query: {question}\nAnswer:"
143
+ answer = generate_text(openAI_key, openAI_base, openAI_API_version, prompt,
144
+ "chatgpt")
145
  return answer
146
 
147
 
148
+ def question_answer(url, file, question, openAI_key, openAI_base,
149
+ openAI_API_version):
150
+ if openAI_key.strip() == '':
 
 
151
  return '[ERROR]: Please enter you Open AI Key. Get your key here : https://platform.openai.com/account/api-keys'
152
  if url.strip() == '' and file == None:
153
  return '[ERROR]: Both URL and PDF is empty. Provide atleast one.'
154
+
155
  if url.strip() != '' and file != None:
156
  return '[ERROR]: Both URL and PDF is provided. Please provide only one (eiter URL or PDF).'
157
 
 
170
  if question.strip() == '':
171
  return '[ERROR]: Question field is empty'
172
 
173
+ return generate_answer(question, openAI_key, openAI_base,
174
+ openAI_API_version)
175
+ recommender = SemanticSearch()
 
 
 
176
 
177
+ title = 'PDF GPT Azure'
178
+ description = """
179
+ PDF GPT allows you to chat with your PDF file using Universal Sentence Encoder and Open AI.
180
+ It gives hallucination free response than other tools as the embeddings are better than OpenAI.
181
+ The returned response can even cite the page number in square brackets([]) where the information is located,
182
+ adding credibility to the responses and helping to locate pertinent information quickly.
183
+ """
184
 
185
  with gr.Blocks() as demo:
186
 
 
188
  gr.Markdown(description)
189
 
190
  with gr.Row():
191
+
192
  with gr.Group():
193
+ gr.Markdown(
194
+ f'<p style="text-align:center">Get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a></p>'
195
+ )
196
+ gr.Dropdown(label="API Type",
197
+ choices=["azure", "OpenAI"],
198
+ info="Azure or Open AI",
199
+ value="azure"),
200
+ #####################
201
+ ##
202
+ ## REMEMBER to remove the key before public deploy
203
+ ##
204
+ #####################
205
+ openAI_key = gr.Textbox(label='Enter your Azure OpenAI API key here')
206
+ openAI_base = gr.Textbox(label='api_base',
207
+ value="https://api.hku.hk")
208
+ openAI_API_version = gr.Textbox(label='API version',
209
+ value="2023-03-15-preview")
210
  url = gr.Textbox(label='Enter PDF URL here')
211
  gr.Markdown("<center><h4>OR<h4></center>")
212
+ file = gr.File(label='Upload your PDF/ Research Paper / Book here',
213
+ file_types=['.pdf'])
214
+
 
 
215
  with gr.Group():
216
+ question = gr.Textbox(label='Enter your question here')
217
+ btn = gr.Button(value='Submit', scale=1)
218
  answer = gr.Textbox(label='The answer to your question is :')
219
 
220
+ btn.click(question_answer,
221
+ inputs=[
222
+ url, file, question, openAI_key, openAI_base,
223
+ openAI_API_version
224
+ ],
225
+ outputs=[answer])
226
+ #openai.api_key = os.getenv('Your_Key_Here')
227
+ demo.launch()