hamza50 committed on
Commit
84cebb3
·
1 Parent(s): 4cc8654

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +211 -4
app.py CHANGED
@@ -1,7 +1,214 @@
1
  import gradio as gr
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- iface.launch()
 
1
  import gradio as gr
2
 
3
+ import spacy
4
+ from spacy.lang.en.stop_words import STOP_WORDS
5
+ from string import punctuation
6
+ from collections import Counter
7
+ from heapq import nlargest
8
+ import os
9
+ nlp = spacy.load("en_core_web_sm")
10
+ from sentence_transformers import SentenceTransformer, CrossEncoder, util
11
+ import datetime
12
+
13
+ from spacy import displacy
14
+
15
+ import matplotlib.pyplot as plt
16
+ from wordcloud import WordCloud
17
+ from matplotlib import pyplot as plt
18
+
19
+ import nltk
20
+ from rank_bm25 import BM25Okapi
21
+ from sklearn.feature_extraction import _stop_words
22
+ import string
23
+ from tqdm.autonotebook import tqdm
24
+
25
+ import pandas as pd
26
+
27
+ import scipy.spatial
28
+ import pickle
29
+ from sentence_transformers import SentenceTransformer, util
30
+ import torch
31
+ import time
32
+ import torch
33
+ import transformers
34
+ from transformers import BartTokenizer, BartForConditionalGeneration
35
+ from string import punctuation
36
+ # tr = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
37
+
38
+ import numpy as np
39
+ from sentence_transformers import SentenceTransformer
40
+ import scipy.spatial
41
+
42
+
43
+
44
+
45
+
46
+
47
+ #import os
48
+
49
def load_model():
    """Instantiate the three pretrained models the app relies on.

    Returns:
        A 3-tuple, in this order:
        ``SentenceTransformer('all-MiniLM-L6-v2')``,
        ``SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')`` and
        ``CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')``.
    """
    sentence_embedder = SentenceTransformer('all-MiniLM-L6-v2')
    retrieval_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
    reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    return sentence_embedder, retrieval_encoder, reranker


# Built once at import time and shared by every request.
embedder, bi_encoder, cross_encoder = load_model()
53
+
54
+
55
+
56
+
57
def lower_case(input_str):
    """Return ``input_str`` converted to lower case."""
    return input_str.lower()
60
+
61
import re

# ---------------------------------------------------------------------------
# Data loading and corpus construction (runs once at import time).
# ---------------------------------------------------------------------------

# Columns read below: Hotel, text, description, price_per_night.
# Presumably one row per review -- TODO confirm against the CSV.
df_all = pd.read_csv('paris_clean_newer.csv')

# Collapse all review texts of a hotel into a single string per hotel.
# NOTE(review): ''.join uses no separator, so the last word of one review
# is glued to the first word of the next -- confirm this is intended.
df_combined = (
    df_all.sort_values(['Hotel'])
    .groupby('Hotel', sort=False)
    .text.apply(''.join)
    .reset_index(name='all_review')
)

# Per-hotel summaries from a second CSV; only the join key and summary are kept.
df_combined_paris_summary = pd.read_csv('df_combined_paris.csv')
df_combined_paris_summary = df_combined_paris_summary[['Hotel', 'summary']]

# df_combined = pd.read_csv('df_combined.csv')

# Keep only ASCII letters, digits and whitespace.  The class previously read
# ``A-z``, which also matched the characters between ``Z`` and ``a`` in ASCII
# ([ \ ] ^ _ `) and so silently left them in the text.  A raw string is used
# so ``\s`` is not treated as an (invalid) string escape.
df_combined['all_review'] = df_combined['all_review'].apply(
    lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x)
)
df_combined['all_review'] = df_combined['all_review'].apply(lower_case)

# One row per distinct (Hotel, description, price_per_night), plus its summary.
df_basic = df_all[['Hotel', 'description', 'price_per_night']].drop_duplicates()
df_basic = df_basic.merge(df_combined_paris_summary, how='left')
df_combined_e = df_combined.merge(df_basic)

# Searchable document per row: description + cleaned reviews + price.
# Presumably price_per_night is a string column (it is concatenated with
# ``+``) -- TODO confirm.
df_combined_e['all_review'] = (
    df_combined_e['description']
    + df_combined_e['all_review']
    + df_combined_e['price_per_night']
)

df = df_combined_e.copy()

# Map document text -> hotel name.  Going through a dict collapses rows whose
# document text is identical.
df_sentences = df_combined_e.set_index("all_review")
df_sentences = df_sentences["Hotel"].to_dict()
df_sentences_list = list(df_sentences.keys())
df_sentences_list = [str(d) for d in tqdm(df_sentences_list)]

corpus = df_sentences_list

# Embeddings are loaded from disk instead of being recomputed on start-up.
# Presumably they are index-aligned with ``corpus`` -- TODO confirm.
# corpus_embeddings = embedder.encode(corpus,show_progress_bar=True)
corpus_embeddings = np.load('embeddings.npy')

bi_encoder.max_seq_length = 512  # Truncate long passages to 512 tokens
top_k = 32  # Number of passages we want to retrieve with the bi-encoder

# The bi-encoder retrieves ``top_k`` documents; a cross-encoder is then used
# to re-score that list to improve the quality of the ranking.

# corpus_embeddings_h = np.load('embeddings_h_r.npy')

# NOTE: pickle.load can execute arbitrary code; only ship pickle files that
# were produced by this project.
# NOTE(review): ``doc_embedding`` is loaded but not referenced in the visible
# part of this file -- confirm whether the search should be using it.
with open('corpus_embeddings_bi_encoder.pickle', 'rb') as pkl:
    doc_embedding = pickle.load(pkl)

with open('tokenized_corpus.pickle', 'rb') as pkl:
    tokenized_corpus = pickle.load(pkl)

# Lexical index over the pre-tokenized corpus.
bm25 = BM25Okapi(tokenized_corpus)
passages = corpus
110
+
111
+
112
+
113
+
114
def bm25_tokenizer(text):
    """Split ``text`` into lower-cased tokens for BM25 scoring.

    The text is lower-cased and split on whitespace; each piece has leading
    and trailing punctuation stripped.  Pieces that end up empty, or that are
    in scikit-learn's English stop-word list, are dropped.

    Returns:
        The remaining tokens as a list, in their original order.
    """
    stripped_words = (
        word.strip(string.punctuation) for word in text.lower().split()
    )
    return [
        word
        for word in stripped_words
        if word and word not in _stop_words.ENGLISH_STOP_WORDS
    ]
123
+
124
+
125
def _hotel_details(corpus_id):
    """Return ``(name, description, price_per_night)`` for ``corpus[corpus_id]``.

    The hotel is found by matching the document text against
    ``df['all_review']``; an ``IndexError`` is raised if no row matches.
    """
    matching_rows = df.loc[df['all_review'] == corpus[corpus_id]]
    name = matching_rows['Hotel'].values[0]
    basics = df_basic.loc[df_basic.Hotel == name]
    return name, basics.description.values[0], basics.price_per_night.values[0]


def _print_hit(score, name, description, price):
    """Print one search hit to stdout: score and name, price, description."""
    print("\t{:.3f}\t".format(score), name)
    print(f'\tPrice Per night: {price}')
    print(description)


def search(query):
    """Search the hotel corpus for ``query`` and log three rankings.

    Three rankings are printed to stdout: the best BM25 (lexical) hits, the
    best bi-encoder hits, and the best hits after cross-encoder re-scoring.
    Only the BM25 ranking is returned.

    Args:
        query: Free-text search string.

    Returns:
        A list of at most five dicts with keys ``name``, ``score``, ``desc``
        and ``price``, ordered by descending BM25 score.
    """
    print("Input question:", query)
    print("\n-------------------------\n")

    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    # np.argpartition raises ValueError when kth is out of bounds, so never
    # ask for more hits than there are documents.
    n_lexical = min(5, len(bm25_scores))
    if n_lexical:
        top_n = np.argpartition(bm25_scores, -n_lexical)[-n_lexical:]
    else:
        top_n = []
    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    bm25_hits.sort(key=lambda x: x['score'], reverse=True)

    bm25list = []
    print("Top-5 lexical search (BM25) hits")
    for hit in bm25_hits:
        name, description, price = _hotel_details(hit['corpus_id'])
        _print_hit(hit['score'], name, description, price)
        bm25list.append(
            {
                "name": name,
                "score": hit['score'],
                "desc": description,
                "price": price,
            }
        )

    ##### Semantic Search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    # NOTE(review): the query is encoded with ``bi_encoder`` but compared with
    # ``corpus_embeddings``; a commented-out line elsewhere in this file
    # suggests those were produced by ``embedder`` (a different model), while
    # ``doc_embedding`` is loaded and unused.  Confirm which embeddings match.
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first (and only) query

    ##### Re-Ranking #####
    # Score every retrieved passage against the query with the cross-encoder
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    # Attach the cross-encoder score to each hit; sorting happens below
    for hit, cross_score in zip(hits, cross_scores):
        hit['cross-score'] = cross_score

    # Output of top-5 hits from bi-encoder
    print("\n-------------------------\n")
    print("Top-5 Bi-Encoder Retrieval hits")
    hits = sorted(hits, key=lambda x: x['score'], reverse=True)
    for hit in hits[0:5]:
        name, description, price = _hotel_details(hit['corpus_id'])
        _print_hit(hit['score'], name, description, price)

    # Output of top-5 hits from re-ranker
    print("\n-------------------------\n")
    print("Top-5 Cross-Encoder Re-ranker hits")
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    for hit in hits[0:5]:
        name, description, price = _hotel_details(hit['corpus_id'])
        _print_hit(hit['cross-score'], name, description, price)

    return bm25list
200
+
201
+
202
+
203
+
204
def greet(query):
    """Gradio callback: run ``search`` on ``query`` and return its result."""
    lexical_results = search(query)
    return lexical_results


# Single text box in, text out; ``greet`` handles each submission.
demo = gr.Interface(fn=greet, inputs="text", outputs="text")

demo.launch(share=True)
214