SYNOPSYS\anmolm committed on
Commit
05b0d1b
·
1 Parent(s): b693f0f

Copy pdfChatter files

Browse files
Files changed (2) hide show
  1. app.py +193 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import urllib.request
2
+ import fitz
3
+ import re
4
+ import numpy as np
5
+ import tensorflow_hub as hub
6
+ import openai
7
+ import gradio as gr
8
+ import os
9
+ from sklearn.neighbors import NearestNeighbors
10
+
11
def download_pdf(url, output_path):
    """Download the PDF at ``url`` and save it to ``output_path``.

    Args:
        url: Address of the PDF. Surrounding whitespace is ignored.
        output_path: Local file path the download is written to.

    Raises:
        ValueError: If ``url`` does not use the http or https scheme.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urlparse

    url = url.strip()
    # urlretrieve also honours schemes such as file://, which would let a
    # user-supplied "URL" copy arbitrary local files into the corpus.
    scheme = urlparse(url).scheme.lower()
    if scheme not in ('http', 'https'):
        raise ValueError(
            f'Unsupported URL scheme {scheme!r}; only http and https are allowed.'
        )
    urllib.request.urlretrieve(url, output_path)
13
+
14
+
15
def preprocess(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Newlines count as whitespace, so line breaks are flattened too. Leading
    and trailing whitespace is reduced to one space, not removed.

    Args:
        text: Raw text, e.g. as extracted from a PDF page.

    Returns:
        The text with each whitespace run replaced by one space.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', text)
19
+
20
+
21
def pdf_to_text(path, start_page=1, end_page=None):
    """Extract the whitespace-normalised text of a page range of a PDF.

    Args:
        path: Path of the PDF file to open.
        start_page: First page to read, 1-based.
        end_page: Last page to read, 1-based and inclusive. ``None`` means
            the last page of the document; values past the end are clamped.

    Returns:
        A list with one preprocessed text string per page read.

    Raises:
        ValueError: If ``start_page`` is smaller than 1.
    """
    if start_page < 1:
        # A zero/negative start would turn into a negative page index.
        raise ValueError('start_page must be >= 1')

    doc = fitz.open(path)
    try:
        total_pages = doc.page_count
        last_page = total_pages if end_page is None else min(end_page, total_pages)
        return [
            preprocess(doc.load_page(page_index).get_text("text"))
            for page_index in range(start_page - 1, last_page)
        ]
    finally:
        # Release the document even when reading a page fails.
        doc.close()
37
+
38
+
39
def text_to_chunks(texts, word_length=150, start_page=1):
    """Split per-page texts into labelled chunks of about ``word_length`` words.

    Each page is split on single spaces and cut into consecutive slices of
    ``word_length`` tokens. A short trailing slice of a page that is not the
    last one is prepended to the next page instead of being emitted, so it
    ends up labelled with the next page's number. Chunks that contain no
    text after stripping are skipped.

    Args:
        texts: List of page texts, in page order.
        word_length: Maximum number of space-separated tokens per chunk.
        start_page: Page number to assign to ``texts[0]``.

    Returns:
        A list of strings of the form ``[Page no. N] "chunk text"``.
    """
    pages = [page_text.split(' ') for page_text in texts]
    last_index = len(pages) - 1
    chunks = []

    for idx, words in enumerate(pages):
        for i in range(0, len(words), word_length):
            piece = words[i:i + word_length]
            # Only the final slice of a page can be short; carry it forward
            # unless this is the last page.
            if len(piece) < word_length and idx < last_index:
                pages[idx + 1] = piece + pages[idx + 1]
                continue
            body = ' '.join(piece).strip()
            if not body:
                # Blank pages would otherwise yield an empty quoted chunk.
                continue
            chunks.append(f'[Page no. {idx + start_page}] "{body}"')
    return chunks
55
+
56
+
57
class SemanticSearch:
    """Nearest-neighbour lookup over sentence embeddings of a text corpus.

    ``fit`` embeds the corpus and builds the neighbour index; calling the
    instance with a query returns the closest corpus entries.
    """

    def __init__(self):
        # Encoder loaded from TF Hub; it is called with a list of strings.
        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
        self.fitted = False

    def fit(self, data, batch=1000, n_neighbors=5):
        """Embed ``data`` and build the nearest-neighbour index.

        Args:
            data: Sequence of text chunks to index.
            batch: Number of chunks embedded per encoder call.
            n_neighbors: Neighbours returned per query; capped at the
                number of chunks.

        Raises:
            ValueError: If ``data`` is empty.
        """
        if len(data) == 0:
            # Fail with a clear message instead of an error from np.vstack.
            raise ValueError('Cannot fit SemanticSearch on an empty corpus.')
        self.data = data
        self.embeddings = self.get_text_embedding(data, batch=batch)
        n_neighbors = min(n_neighbors, len(self.embeddings))
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
        self.nn.fit(self.embeddings)
        self.fitted = True

    def __call__(self, text, return_data=True):
        """Return the corpus entries (or their indices) nearest to ``text``.

        Args:
            text: Query string.
            return_data: If true return the matching chunks, otherwise the
                neighbour indices.

        Raises:
            RuntimeError: If ``fit`` has not completed successfully yet.
        """
        if not self.fitted:
            raise RuntimeError('SemanticSearch must be fitted before it can be queried.')
        inp_emb = self.use([text])
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]

        if return_data:
            return [self.data[i] for i in neighbors]
        return neighbors

    def get_text_embedding(self, texts, batch=1000):
        """Embed ``texts`` in slices of ``batch`` and stack the results row-wise."""
        embeddings = [
            self.use(texts[start:start + batch])
            for start in range(0, len(texts), batch)
        ]
        return np.vstack(embeddings)
91
+
92
+
93
+
94
def load_recommender(path, start_page=1):
    """Index the PDF at ``path`` in the module-level ``recommender``.

    The pages from ``start_page`` onward are converted to text, split into
    page-labelled chunks, and passed to ``recommender.fit``.

    Returns:
        The status string ``'Corpus Loaded.'``.
    """
    page_texts = pdf_to_text(path, start_page=start_page)
    recommender.fit(text_to_chunks(page_texts, start_page=start_page))
    return 'Corpus Loaded.'
100
+
101
+
102
def generate_text(openAI_key,prompt, engine="text-davinci-003"):
    """Request a single text completion for ``prompt`` from the OpenAI API.

    Works with both generations of the ``openai`` package: releases that
    expose the ``openai.OpenAI`` client class (1.0 and later) and the older
    module-level ``openai.Completion`` interface.

    Args:
        openAI_key: API key used for this request.
        prompt: Full prompt text to complete.
        engine: Name of the completion model to use.

    Returns:
        The text of the first completion choice.
    """
    # NOTE(review): the default model name is kept for backward
    # compatibility; confirm it is still served by the API.
    params = dict(
        prompt=prompt,
        max_tokens=512,
        n=1,
        stop=None,
        temperature=0.7,
    )

    if hasattr(openai, 'OpenAI'):
        # openai>=1.0: the module-level Completion API was removed and the
        # model is selected with `model` instead of `engine`. A per-call
        # client also avoids mutating the module-wide API key.
        client = openai.OpenAI(api_key=openAI_key)
        completions = client.completions.create(model=engine, **params)
    else:
        # Legacy openai<1.0 interface.
        openai.api_key = openAI_key
        completions = openai.Completion.create(engine=engine, **params)

    return completions.choices[0].text
114
+
115
+
116
def generate_answer(question,openAI_key):
    """Answer ``question`` from the corpus chunks most similar to it.

    The module-level ``recommender`` supplies the closest chunks; they are
    placed in a prompt together with the answering instructions and the
    query, and the prompt is sent to ``generate_text``.

    Args:
        question: The user's question.
        openAI_key: API key forwarded to ``generate_text``.

    Returns:
        The completion text returned by ``generate_text``.
    """
    topn_chunks = recommender(question)
    search_results = ''.join(chunk + '\n\n' for chunk in topn_chunks)

    # The query is appended once, below. The previous instruction text ended
    # with an unformatted "Query: {question}\nAnswer: " placeholder, so the
    # prompt carried that literal text ahead of the real query.
    instructions = (
        "Instructions: Compose a comprehensive reply to the query using the search results given. "
        "Cite each reference using [ Page Number] notation (every result has this number at the beginning). "
        "Citation should be done at the end of each sentence. If the search results mention multiple subjects "
        "with the same name, create separate answers for each. Only include information found in the results and "
        "don't add any additional information. Make sure the answer is correct and don't output false content. "
        "If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier "
        "search results which has nothing to do with the question. Only answer what is asked. The "
        "answer should be short and concise. \n\n"
    )

    prompt = 'search results:\n\n' + search_results + instructions
    prompt += f"Query: {question}\nAnswer:"
    return generate_text(openAI_key, prompt,"text-davinci-003")
135
+
136
+
137
def question_answer(url, file, question,openAI_key):
    """Validate the form inputs, load the PDF, and answer the question.

    All inputs are validated before any PDF is downloaded or indexed, so an
    invalid request returns immediately.

    Args:
        url: URL of a PDF, or an empty string when a file is uploaded.
        file: Uploaded file object exposing ``.name``, or ``None``.
        question: The user's question.
        openAI_key: API key used for the completion request.

    Returns:
        The generated answer, or a string starting with ``[ERROR]:`` that
        describes which input is missing or conflicting.
    """
    if openAI_key.strip()=='':
        return '[ERROR]: Please enter you Open AI Key. Get your key here : https://platform.openai.com/account/api-keys'

    has_url = url.strip() != ''
    has_file = file is not None

    if not has_url and not has_file:
        return '[ERROR]: Both URL and PDF is empty. Provide atleast one.'

    if has_url and has_file:
        return '[ERROR]: Both URL and PDF is provided. Please provide only one (eiter URL or PDF).'

    # Checked before loading: there is no point downloading and embedding
    # the document when there is nothing to ask.
    if question.strip() == '':
        return '[ERROR]: Question field is empty'

    if has_url:
        download_pdf(url, 'corpus.pdf')
        load_recommender('corpus.pdf')
    else:
        old_file_name = file.name
        # Drops the 8 characters in front of the 4-character extension;
        # presumably a random suffix added by the upload widget - TODO confirm.
        file_name = old_file_name[:-12] + old_file_name[-4:]
        os.rename(old_file_name, file_name)
        load_recommender(file_name)

    return generate_answer(question,openAI_key)
162
+
163
+
164
# Shared search index; (re)fitted by load_recommender for each request.
recommender = SemanticSearch()

title = 'Pdf Chatbot'
description = """ Pdf Chatbot uses Universal Sentence Encoder and Open AI."""


# Gradio UI: inputs on the left, the answer box on the right.
with gr.Blocks() as demo:

    gr.Markdown(f'<center><h1>{title}</h1></center>')
    gr.Markdown(description)

    with gr.Row():

        with gr.Group():
            gr.Markdown('<p style="text-align:center">Get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a></p>')
            # Masked input so the secret key is not displayed on screen.
            openAI_key = gr.Textbox(label='Enter your OpenAI API key here', type='password')
            url = gr.Textbox(label='Enter PDF URL here')
            # Closing tag fixed: the heading was previously left unclosed.
            gr.Markdown("<center><h4>OR</h4></center>")
            file = gr.File(label='Upload your PDF/ Research Paper / Book here', file_types=['.pdf'])
            question = gr.Textbox(label='Enter your question here')
            btn = gr.Button(value='Submit')
            btn.style(full_width=True)

        with gr.Group():
            answer = gr.Textbox(label='The answer to your question is :')

    # Input order must match the parameter order of question_answer.
    btn.click(question_answer, inputs=[url, file, question, openAI_key], outputs=[answer])

demo.launch()
193
+
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio
2
+ PyMuPDF
3
+ numpy
4
+ scikit-learn
5
+ tensorflow
6
+ tensorflow-hub
7
+ openai