Spaces:
Runtime error
Runtime error
File size: 4,989 Bytes
d87698c 84cebb3 88fd94a 84cebb3 e07db8e 84cebb3 07dc812 84cebb3 9e7108f d87698c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 |
import gradio as gr
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest
import os
nlp = spacy.load("en_core_web_sm")
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import datetime
from spacy import displacy
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from matplotlib import pyplot as plt
import nltk
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import pandas as pd
import scipy.spatial
import pickle
from sentence_transformers import SentenceTransformer, util
import torch
import time
import torch
import transformers
from transformers import BartTokenizer, BartForConditionalGeneration
from string import punctuation
# tr = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
import numpy as np
from sentence_transformers import SentenceTransformer
import scipy.spatial
#import os
def load_model():
    """Instantiate the three pretrained models used by this app.

    Returns:
        A 3-tuple ``(embedder, bi_encoder, cross_encoder)``: a
        ``SentenceTransformer`` for 'all-MiniLM-L6-v2', a
        ``SentenceTransformer`` for 'multi-qa-MiniLM-L6-cos-v1', and a
        ``CrossEncoder`` for 'cross-encoder/ms-marco-MiniLM-L-6-v2'.
    """
    sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
    retrieval_model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
    rerank_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    return sentence_model, retrieval_model, rerank_model


# Built once at module import; the names below are module-level globals.
embedder, bi_encoder, cross_encoder = load_model()
def lower_case(input_str):
    """Return ``input_str`` converted to lower case.

    Args:
        input_str: The string to convert.

    Returns:
        A new string with every cased character lower-cased.
    """
    return input_str.lower()
# --- Data loading and corpus construction ---------------------------------
# Source table; the code below reads its Hotel, text, description and
# price_per_night columns.
df_all = pd.read_csv('paris_clean_newer.csv')

# Join every 'text' entry of a hotel into a single 'all_review' string.
df_combined = (
    df_all.sort_values(['Hotel'])
    .groupby('Hotel', sort=False)
    .text.apply(''.join)
    .reset_index(name='all_review')
)
df_combined_paris_summary = pd.read_csv('df_combined_paris.csv')
df_combined_paris_summary = df_combined_paris_summary[['Hotel', 'summary']]
import re
# df_combined = pd.read_csv('df_combined.csv')
# Keep only ASCII letters, digits and whitespace, then lower-case the text.
# The pattern is a raw string with an explicit A-Z range: the earlier 'A-z'
# range also covered the punctuation that sits between 'Z' and 'a' in ASCII
# ([ \ ] ^ _ `), so those characters were never stripped, and the non-raw
# '\s' was an invalid string escape.
df_combined['all_review'] = df_combined['all_review'].apply(
    lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x)
)
df_combined['all_review'] = df_combined['all_review'].apply(lower_case)

# One row per distinct (Hotel, description, price_per_night), plus its summary.
df_basic = df_all[['Hotel', 'description', 'price_per_night']].drop_duplicates()
df_basic = df_basic.merge(df_combined_paris_summary, how='left')
df_combined_e = df_combined.merge(df_basic)

# Searchable text per hotel: description + cleaned reviews + price, joined by
# plain string concatenation (no separator).
df_combined_e['all_review'] = (
    df_combined_e['description']
    + df_combined_e['all_review']
    + df_combined_e['price_per_night']
)
df = df_combined_e.copy()

# Map each searchable text to its hotel name; the dict keys form the corpus.
df_sentences = df_combined_e.set_index("all_review")
df_sentences = df_sentences["Hotel"].to_dict()
df_sentences_list = list(df_sentences.keys())
df_sentences_list = [str(d) for d in tqdm(df_sentences_list)]
#
corpus = df_sentences_list
# corpus_embeddings = embedder.encode(corpus,show_progress_bar=True)
corpus_embeddings = np.load('embeddings.npy')
bi_encoder.max_seq_length = 512  # Truncate long passages to 512 tokens
top_k = 32  # Number of passages we want to retrieve with the bi-encoder
# Intended pipeline: the bi-encoder retrieves top_k documents and a
# cross-encoder re-ranks that list to improve the quality.
# corpus_embeddings_h = np.load('embeddings_h_r.npy')
# NOTE(review): pickle.load can execute arbitrary code from the file being
# read; these artifacts must only ever come from a trusted source.
with open('corpus_embeddings_bi_encoder.pickle', 'rb') as pkl:
    doc_embedding = pickle.load(pkl)
with open('tokenized_corpus.pickle', 'rb') as pkl:
    tokenized_corpus = pickle.load(pkl)
bm25 = BM25Okapi(tokenized_corpus)
passages = corpus
# We lower case our text and remove stop-words from indexing
def bm25_tokenizer(text):
    """Split ``text`` into lower-cased tokens for BM25 scoring.

    The text is lower-cased and split on whitespace. Leading and trailing
    punctuation is stripped from each piece; pieces that end up empty and
    English stop words are dropped.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings in their original order.
    """
    stripped = (piece.strip(string.punctuation) for piece in text.lower().split())
    return [
        token
        for token in stripped
        if token and token not in _stop_words.ENGLISH_STOP_WORDS
    ]
def search(query, num_hits=5):
    """Run a BM25 lexical search over the hotel corpus.

    Args:
        query: Free-text search string.
        num_hits: Maximum number of hotels to return. Defaults to 5, the
            value that used to be hard-coded.

    Returns:
        A list of dicts ordered best score first, each with the keys
        ``name``, ``score``, ``desc`` and ``price``. The list can hold fewer
        than ``num_hits`` entries when fewer documents are scored or when a
        hit's text cannot be matched back to a row of ``df``.
    """
    print("Input question:", query)
    print("\n-------------------------\n")

    ##### BM25 search (lexical search) #####
    bm25_scores = np.asarray(bm25.get_scores(bm25_tokenizer(query)))

    # np.argpartition raises ValueError when kth is out of bounds, so never
    # ask for more hits than there are scored documents.
    k = min(num_hits, len(bm25_scores))
    if k <= 0:
        return []
    top_n = np.argpartition(bm25_scores, -k)[-k:]

    # Native int/float keep the returned structure JSON-friendly.
    bm25_hits = sorted(
        ({'corpus_id': int(idx), 'score': float(bm25_scores[idx])} for idx in top_n),
        key=lambda hit: hit['score'],
        reverse=True,
    )

    bm25list = []
    print(f"Top-{num_hits} lexical search (BM25) hits")
    for hit in bm25_hits:
        matches = df.loc[df['all_review'] == corpus[hit['corpus_id']]]
        if matches.empty:
            # No row carries exactly this text, so there is no hotel to
            # report; skip rather than raise IndexError on .values[0].
            continue
        hotel_name = matches['Hotel'].values[0]
        print("\t{:.3f}\t".format(hit['score']), hotel_name)

        details = df_basic.loc[df_basic.Hotel == hotel_name]
        price = details.price_per_night.values[0]
        description = details.description.values[0]
        print(f'\tPrice Per night: {price}')
        print(description)

        bm25list.append(
            {
                "name": hotel_name,
                "score": hit['score'],
                "desc": description,
                "price": price,
            }
        )
    return bm25list
def greet(query):
    """Gradio callback: return the BM25 search hits for ``query``.

    Args:
        query: The text entered by the user.

    Returns:
        Whatever ``search(query)`` returns (a list of result dicts).
    """
    # The result is returned directly instead of being bound to a local
    # called ``bm25``, which shadowed the module-level BM25 index name.
    return search(query)
# Wire greet() into a text-in / JSON-out Gradio interface and start it.
demo = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="json",
)
demo.launch()
|