Tj committed on
Commit
3a2942a
·
1 Parent(s): ac11709

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -26
app.py CHANGED
@@ -40,7 +40,7 @@ def text_to_chunks(texts, word_length=150, start_page=1):
40
  text_toks = [t.split(' ') for t in texts]
41
  page_nums = []
42
  chunks = []
43
-
44
  for idx, words in enumerate(text_toks):
45
  for i in range(0, len(words), word_length):
46
  chunk = words[i:i+word_length]
@@ -54,11 +54,11 @@ def text_to_chunks(texts, word_length=150, start_page=1):
54
  return chunks
55
 
56
  class SemanticSearch:
57
-
58
  def __init__(self):
59
  self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
60
  self.fitted = False
61
-
62
  def fit(self, data, batch=1000, n_neighbors=5):
63
  self.data = data
64
  self.embeddings = self.get_text_embedding(data, batch=batch)
@@ -66,16 +66,16 @@ class SemanticSearch:
66
  self.nn = NearestNeighbors(n_neighbors=n_neighbors)
67
  self.nn.fit(self.embeddings)
68
  self.fitted = True
69
-
70
  def __call__(self, text, return_data=True):
71
  inp_emb = self.use([text])
72
  neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
73
-
74
  if return_data:
75
  return [self.data[i] for i in neighbors]
76
  else:
77
  return neighbors
78
-
79
  def get_text_embedding(self, texts, batch=1000):
80
  embeddings = []
81
  for i in range(0, len(texts), batch):
@@ -85,18 +85,15 @@ class SemanticSearch:
85
  embeddings = np.vstack(embeddings)
86
  return embeddings
87
 
88
-
89
- def load_recommender():
90
  global recommender
91
- download_pdf(PDF_URL, 'corpus.pdf')
92
- texts = pdf_to_text('corpus.pdf', start_page=1)
93
- chunks = text_to_chunks(texts, start_page=1)
94
  recommender.fit(chunks)
95
- return 'Corpus Loaded.'
96
-
97
-
98
- def generate_text(prompt, engine="text-davinci-003"):
99
- openai.api_key = OPENAI_API_KEY
100
  completions = openai.Completion.create(
101
  engine=engine,
102
  prompt=prompt,
@@ -108,13 +105,13 @@ def generate_text(prompt, engine="text-davinci-003"):
108
  message = completions.choices[0].text
109
  return message
110
 
111
- def generate_answer(question):
112
  topn_chunks = recommender(question)
113
  prompt = ""
114
  prompt += 'search results:\n\n'
115
  for c in topn_chunks:
116
  prompt += c + '\n\n'
117
-
118
  prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
119
  "Cite each reference using [ Page Number] notation (every result has this number at the beginning). "\
120
  "Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
@@ -125,25 +122,24 @@ def generate_answer(question):
125
  "answer should be short and concise. Answer step-by-step. \n\nQuery: {question}\nAnswer: "
126
 
127
  prompt += f"Query: {question}\nAnswer:"
128
- answer = generate_text(prompt, "text-davinci-003")
129
  return answer
130
 
131
- def question_answer(url, question):
132
- if url.strip() == '':
133
- return '[ERROR]: URL is empty. Provide a valid URL.'
134
-
135
  download_pdf(url, 'corpus.pdf')
136
  load_recommender('corpus.pdf')
137
 
138
  if question.strip() == '':
139
  return '[ERROR]: Question field is empty'
140
 
141
- return generate_answer(question)
142
 
143
  recommender = SemanticSearch()
144
 
145
  title = 'PDF GPT'
146
- description = """ PDF GPT allows you to chat with your PDF file using Universal Sentence Encoder and Open AI. It gives hallucination free response than other tools as the embeddings are better than OpenAI. The returned response can even cite the page number in square brackets([]) where the information is located, adding credibility to the responses and helping to locate pertinent information quickly."""
147
 
148
  with gr.Blocks() as demo:
149
  gr.Markdown(f'<center><h1>{title}</h1></center>')
@@ -158,6 +154,7 @@ with gr.Blocks() as demo:
158
  with gr.Group():
159
  answer = gr.Textbox(label='The answer to your question is :')
160
 
161
- btn.click(question_answer, inputs=[PDF_URL, question], outputs=[answer])
162
 
163
  demo.launch()
 
 
40
  text_toks = [t.split(' ') for t in texts]
41
  page_nums = []
42
  chunks = []
43
+
44
  for idx, words in enumerate(text_toks):
45
  for i in range(0, len(words), word_length):
46
  chunk = words[i:i+word_length]
 
54
  return chunks
55
 
56
  class SemanticSearch:
57
+
58
  def __init__(self):
59
  self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
60
  self.fitted = False
61
+
62
  def fit(self, data, batch=1000, n_neighbors=5):
63
  self.data = data
64
  self.embeddings = self.get_text_embedding(data, batch=batch)
 
66
  self.nn = NearestNeighbors(n_neighbors=n_neighbors)
67
  self.nn.fit(self.embeddings)
68
  self.fitted = True
69
+
70
  def __call__(self, text, return_data=True):
71
  inp_emb = self.use([text])
72
  neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
73
+
74
  if return_data:
75
  return [self.data[i] for i in neighbors]
76
  else:
77
  return neighbors
78
+
79
  def get_text_embedding(self, texts, batch=1000):
80
  embeddings = []
81
  for i in range(0, len(texts), batch):
 
85
  embeddings = np.vstack(embeddings)
86
  return embeddings
87
 
88
+ def load_recommender(path, start_page=1):
 
89
  global recommender
90
+ texts = pdf_to_text(path, start_page=start_page)
91
+ chunks = text_to_chunks(texts, start_page=start_page)
 
92
  recommender.fit(chunks)
93
+ return 'Corpus Loaded'
94
+
95
+ def generate_text(openAI_key, prompt, engine="text-davinci-003"):
96
+ openai.api_key = openAI_key
 
97
  completions = openai.Completion.create(
98
  engine=engine,
99
  prompt=prompt,
 
105
  message = completions.choices[0].text
106
  return message
107
 
108
+ def generate_answer(question, openAI_key):
109
  topn_chunks = recommender(question)
110
  prompt = ""
111
  prompt += 'search results:\n\n'
112
  for c in topn_chunks:
113
  prompt += c + '\n\n'
114
+
115
  prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
116
  "Cite each reference using [ Page Number] notation (every result has this number at the beginning). "\
117
  "Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
 
122
  "answer should be short and concise. Answer step-by-step. \n\nQuery: {question}\nAnswer: "
123
 
124
  prompt += f"Query: {question}\nAnswer:"
125
+ answer = generate_text(openAI_key, prompt, "text-davinci-003")
126
  return answer
127
 
128
+ def question_answer(question):
129
+ openAI_key = OPENAI_API_KEY
130
+ url = PDF_URL
 
131
  download_pdf(url, 'corpus.pdf')
132
  load_recommender('corpus.pdf')
133
 
134
  if question.strip() == '':
135
  return '[ERROR]: Question field is empty'
136
 
137
+ return generate_answer(question, openAI_key)
138
 
139
  recommender = SemanticSearch()
140
 
141
  title = 'PDF GPT'
142
+ description = """PDF GPT allows you to chat with your PDF file using Universal Sentence Encoder and Open AI. It gives hallucination free response than other tools as the embeddings are better than OpenAI. The returned response can even cite the page number in square brackets([]) where the information is located, adding credibility to the responses and helping to locate pertinent information quickly."""
143
 
144
  with gr.Blocks() as demo:
145
  gr.Markdown(f'<center><h1>{title}</h1></center>')
 
154
  with gr.Group():
155
  answer = gr.Textbox(label='The answer to your question is :')
156
 
157
+ btn.click(question_answer, inputs=[question], outputs=[answer])
158
 
159
  demo.launch()
160
+