File size: 4,989 Bytes
d87698c
 
84cebb3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88fd94a
 
84cebb3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e07db8e
84cebb3
 
 
 
 
 
 
 
 
 
 
 
 
 
07dc812
84cebb3
9e7108f
d87698c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
import gradio as gr

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest
import os
nlp = spacy.load("en_core_web_sm")
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import datetime

from spacy import displacy

import matplotlib.pyplot as plt
from wordcloud import WordCloud
from matplotlib import pyplot as plt

import nltk
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm

import pandas as pd

import scipy.spatial
import pickle
from sentence_transformers import SentenceTransformer, util
import torch
import time
import torch
import transformers
from transformers import BartTokenizer, BartForConditionalGeneration
from string import punctuation
# tr = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

import numpy as np
from sentence_transformers import SentenceTransformer
import scipy.spatial






    #import os
    
def load_model():
    """Instantiate the three pretrained models used by this app.

    Returns:
        tuple: ``(embedder, bi_encoder, cross_encoder)`` built, in that
        order, from ``SentenceTransformer('all-MiniLM-L6-v2')``,
        ``SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')`` and
        ``CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')``.
    """
    sentence_embedder = SentenceTransformer('all-MiniLM-L6-v2')
    query_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
    reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    return sentence_embedder, query_encoder, reranker

# Build the models once at import time; they are shared module-wide.
embedder, bi_encoder, cross_encoder = load_model()




def lower_case(input_str):
    """Return ``input_str`` converted to lower case via ``str.lower``."""
    return input_str.lower()

# Review-level table. Columns read below: 'Hotel', 'text', 'description'
# and 'price_per_night'.
df_all = pd.read_csv('paris_clean_newer.csv')


# One row per hotel: all of its 'text' values joined into a single string.
df_combined = df_all.sort_values(['Hotel']).groupby('Hotel', sort=False).text.apply(''.join).reset_index(name='all_review')
# Summary table; only the 'Hotel' and 'summary' columns are kept.
df_combined_paris_summary = pd.read_csv('df_combined_paris.csv')
df_combined_paris_summary = df_combined_paris_summary[['Hotel','summary']]

import re

# df_combined = pd.read_csv('df_combined.csv')

# Raw string: '\s' inside a normal string literal is an invalid escape
# sequence (SyntaxWarning on recent Python versions). The pattern handed to
# the regex engine is unchanged.
# NOTE(review): the range `A-z` also covers the ASCII characters between 'Z'
# and 'a' ([ \ ] ^ _ `), so those are NOT stripped. Left as-is because
# changing it would alter the corpus text -- confirm whether `A-Z` was meant.
df_combined['all_review'] = df_combined['all_review'].apply(lambda x: re.sub(r'[^a-zA-z0-9\s]','',x))


df_combined['all_review'] = df_combined['all_review'].apply(lower_case)
# Per-hotel metadata, enriched with the summary (left join on the shared
# 'Hotel' column, so hotels without a summary are kept).
df_basic = df_all[['Hotel','description','price_per_night']].drop_duplicates()
df_basic = df_basic.merge(df_combined_paris_summary,how='left')
# Inner join on 'Hotel': combined review text plus metadata.
df_combined_e = df_combined.merge(df_basic)
# Searchable text = description + cleaned reviews + price.
# NOTE(review): this concatenation assumes 'description' and
# 'price_per_night' hold strings; a missing value makes the result NaN.
df_combined_e['all_review'] =df_combined_e['description']+ df_combined_e['all_review'] + df_combined_e['price_per_night']

# Working copy used by search() to map corpus text back to a hotel row.
df = df_combined_e.copy()


# Map each combined text to its hotel name. Because the text is the dict key,
# rows with identical text collapse into a single entry.
df_sentences = df_combined_e.set_index("all_review")["Hotel"].to_dict()

# Stringify every key (with a progress bar) to form the searchable corpus.
df_sentences_list = [str(review_text) for review_text in tqdm(list(df_sentences))]
#
corpus = df_sentences_list
# corpus_embeddings = embedder.encode(corpus,show_progress_bar=True)
# Embeddings are read from disk instead of being recomputed at start-up.
corpus_embeddings = np.load('embeddings.npy')

bi_encoder.max_seq_length = 512     #Cap bi-encoder input length at 512 (presumably tokens, longer passages truncated -- confirm)
top_k = 32                          #Number of passages we want to retrieve with the bi-encoder (not referenced elsewhere in the visible code)

# Intended pipeline: the bi-encoder retrieves top_k candidates and a cross-encoder re-ranks them to improve quality.
# NOTE(review): the visible search() only performs the BM25 lexical step.

# corpus_embeddings_h = np.load('embeddings_h_r.npy')

# NOTE(security): pickle.load can execute arbitrary code contained in the
# file; only load artefacts that were produced by a trusted process.
# Presumably the bi-encoder embeddings of the corpus (per the file name);
# doc_embedding is not referenced elsewhere in the visible code.
with open('corpus_embeddings_bi_encoder.pickle', 'rb') as pkl:
    doc_embedding = pickle.load(pkl)

# Pre-tokenised corpus fed to BM25 below. search() maps BM25 indices back
# through `corpus`, so this list must be in the same order as `corpus`
# -- verify against the script that produced the pickle.
with open('tokenized_corpus.pickle', 'rb') as pkl:
    tokenized_corpus = pickle.load(pkl)

# Lexical index queried by search().
bm25 = BM25Okapi(tokenized_corpus)
# Alias of the corpus list.
passages = corpus




# We lower case our text and remove stop-words from indexing
def bm25_tokenizer(text):
    """Split ``text`` into lower-cased tokens suitable for BM25 scoring.

    Each whitespace-separated word has leading and trailing punctuation
    stripped; words that end up empty, or that are in scikit-learn's
    ``ENGLISH_STOP_WORDS``, are discarded.

    Returns:
        list[str]: the surviving tokens, in their original order.
    """
    stripped_words = (
        word.strip(string.punctuation) for word in text.lower().split()
    )
    return [
        word
        for word in stripped_words
        if word and word not in _stop_words.ENGLISH_STOP_WORDS
    ]


def search(query, top_n=5):
    """Run a BM25 lexical search over the hotel corpus.

    Args:
        query: Free-text search string.
        top_n: Maximum number of hits to return. Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        list[dict]: hits ordered by descending BM25 score, each with the
        keys ``name``, ``score``, ``desc`` and ``price``. The list is
        shorter than ``top_n`` when the corpus holds fewer documents or
        when a hit cannot be mapped back to a hotel row.
    """
    print("Input question:", query)
    print("\n-------------------------\n")

    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))

    # np.argpartition raises ValueError when kth is out of bounds, so never
    # ask for more hits than there are scored documents.
    k = min(int(top_n), len(bm25_scores))
    if k <= 0:
        return []

    top_ids = np.argpartition(bm25_scores, -k)[-k:]
    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_ids]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

    bm25list = []
    print(f"Top-{k} lexical search (BM25) hits")
    for hit in bm25_hits:
        # Map the corpus text back to its row to recover the hotel name.
        matches = df.loc[df['all_review'] == corpus[hit['corpus_id']]]
        if matches.empty:
            # No row carries this exact text (e.g. a missing value that was
            # stringified when the corpus was built); skip the hit instead
            # of failing on `.values[0]`.
            continue

        hotel = matches['Hotel'].values[0]
        de = df_basic.loc[df_basic.Hotel == hotel]
        price = de.price_per_night.values[0]
        description = de.description.values[0]

        print("\t{:.3f}\t".format(hit['score']), hotel)
        print(f'\tPrice Per night: {price}')
        print(description)

        bm25list.append(
            {
                "name": hotel,
                # Plain Python float so the hit serialises cleanly to JSON.
                "score": float(hit['score']),
                "desc": description,
                "price": price,
            }
        )

    return bm25list




def greet(query):
    """Gradio callback: return the BM25 search hits for ``query``."""
    return search(query)

# Expose greet() as a text-in / JSON-out Gradio app and start serving it.
demo = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="json",
)

demo.launch()