Spaces:
Runtime error
Runtime error
import gradio as gr | |
import spacy | |
from spacy.lang.en.stop_words import STOP_WORDS | |
from string import punctuation | |
from collections import Counter | |
from heapq import nlargest | |
import os | |
nlp = spacy.load("en_core_web_sm") | |
from sentence_transformers import SentenceTransformer, CrossEncoder, util | |
import datetime | |
from spacy import displacy | |
import matplotlib.pyplot as plt | |
from wordcloud import WordCloud | |
from matplotlib import pyplot as plt | |
import nltk | |
from rank_bm25 import BM25Okapi | |
from sklearn.feature_extraction import _stop_words | |
import string | |
from tqdm.autonotebook import tqdm | |
import pandas as pd | |
import scipy.spatial | |
import pickle | |
from sentence_transformers import SentenceTransformer, util | |
import torch | |
import time | |
import torch | |
import transformers | |
from transformers import BartTokenizer, BartForConditionalGeneration | |
from string import punctuation | |
# tr = BartTokenizer.from_pretrained('facebook/bart-large-cnn') | |
import numpy as np | |
from sentence_transformers import SentenceTransformer | |
import scipy.spatial | |
#import os | |
def load_model():
    """Instantiate the three pretrained models used by this app.

    Returns:
        A 3-tuple ``(embedder, bi_encoder, cross_encoder)``: two
        ``SentenceTransformer`` instances ('all-MiniLM-L6-v2' and
        'multi-qa-MiniLM-L6-cos-v1') followed by a ``CrossEncoder``
        ('cross-encoder/ms-marco-MiniLM-L-6-v2').
    """
    sentence_embedder = SentenceTransformer('all-MiniLM-L6-v2')
    query_bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
    reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    return sentence_embedder, query_bi_encoder, reranker


# Build the models once, at import time.
embedder, bi_encoder, cross_encoder = load_model()
def lower_case(input_str):
    """Return ``input_str`` converted to lower case."""
    return input_str.lower()
import re

# Raw input table (presumably one row per review -- confirm).  The code below
# reads its 'Hotel', 'text', 'description' and 'price_per_night' columns.
df_all = pd.read_csv('paris_clean_newer.csv')

# One row per hotel, with all of that hotel's 'text' values concatenated
# (no separator) into 'all_review'.
df_combined = (
    df_all.sort_values(['Hotel'])
    .groupby('Hotel', sort=False)
    .text.apply(''.join)
    .reset_index(name='all_review')
)

# Per-hotel summaries; only the join key and the summary text are kept.
df_combined_paris_summary = pd.read_csv('df_combined_paris.csv')
df_combined_paris_summary = df_combined_paris_summary[['Hotel', 'summary']]

# Keep only ASCII letters, digits and whitespace, then lower-case.
# The class must be spelled 'A-Z': the range 'A-z' also covers the characters
# that sit between 'Z' and 'a' in ASCII ([ \ ] ^ _ `), which would therefore
# survive the clean-up.  A raw string avoids the invalid '\s' escape warning.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')
df_combined['all_review'] = df_combined['all_review'].apply(
    lambda review: _NON_ALNUM_RE.sub('', review)
)
df_combined['all_review'] = df_combined['all_review'].apply(lower_case)

# Hotel-level metadata (description, price) enriched with the summary.
df_basic = df_all[['Hotel', 'description', 'price_per_night']].drop_duplicates()
df_basic = df_basic.merge(df_combined_paris_summary, how='left')
df_combined_e = df_combined.merge(df_basic)

# The searchable text of a hotel is description + reviews + price.
# NOTE(review): this concatenation assumes 'price_per_night' holds strings;
# a numeric column would make the '+' fail -- confirm against the CSV.
df_combined_e['all_review'] = (
    df_combined_e['description']
    + df_combined_e['all_review']
    + df_combined_e['price_per_night']
)
df = df_combined_e.copy()

# Map each searchable text to its hotel; identical texts collapse to a single
# dict key, and the key order defines the corpus order.
df_sentences = df_combined_e.set_index("all_review")
df_sentences = df_sentences["Hotel"].to_dict()
df_sentences_list = list(df_sentences.keys())
df_sentences_list = [str(d) for d in tqdm(df_sentences_list)]

corpus = df_sentences_list
# Precomputed corpus embeddings are loaded from disk instead of being
# re-encoded at start-up (presumably produced with embedder.encode(corpus)).
corpus_embeddings = np.load('embeddings.npy')

bi_encoder.max_seq_length = 512  # Truncate long passages to 512 tokens
top_k = 32  # Number of passages we want to retrieve with the bi-encoder
# A cross-encoder can then re-rank those top_k bi-encoder hits to improve
# the quality of the result list.

# SECURITY: pickle.load can execute arbitrary code contained in the file.
# These artifacts are expected to be trusted files shipped with the app;
# never point these paths at user-supplied data.
# NOTE(review): search() uses BM25 positions as indices into `corpus`, so
# 'tokenized_corpus.pickle' is assumed to be in the same order as `corpus`.
with open('corpus_embeddings_bi_encoder.pickle', 'rb') as pkl:
    doc_embedding = pickle.load(pkl)
with open('tokenized_corpus.pickle', 'rb') as pkl:
    tokenized_corpus = pickle.load(pkl)
bm25 = BM25Okapi(tokenized_corpus)
passages = corpus
# We lower case our text and remove stop-words from indexing | |
def bm25_tokenizer(text):
    """Split ``text`` into lower-cased BM25 tokens.

    The text is lower-cased and split on whitespace; leading and trailing
    punctuation is stripped from every piece, and pieces that end up empty
    or that are English stop words are discarded.

    Returns:
        The list of remaining tokens, in their original order.
    """
    stripped_words = (
        word.strip(string.punctuation) for word in text.lower().split()
    )
    return [
        word
        for word in stripped_words
        if word and word not in _stop_words.ENGLISH_STOP_WORDS
    ]
def _as_builtin(value):
    """Return NumPy scalars as the matching built-in Python value."""
    return value.item() if isinstance(value, np.generic) else value


def search(query, n_hits=5):
    """Run a BM25 lexical search over the hotel corpus.

    Args:
        query: Free-text search string; it is tokenized with
            ``bm25_tokenizer`` before scoring.
        n_hits: Maximum number of hotels to return (default 5).  Fewer are
            returned when the corpus holds fewer documents.

    Returns:
        A list of dicts ordered by descending BM25 score, each with the keys
        ``name``, ``score``, ``desc`` and ``price``.  The list is empty when
        there is nothing to rank.
    """
    print("Input question:", query)
    print("\n-------------------------\n")

    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))

    # np.argpartition raises ValueError when asked for more elements than
    # the array holds, so clamp the request to the corpus size.
    n_hits = min(n_hits, len(bm25_scores))
    if n_hits <= 0:
        return []

    top_ids = np.argpartition(bm25_scores, -n_hits)[-n_hits:]
    ranked_hits = sorted(
        ((int(idx), float(bm25_scores[idx])) for idx in top_ids),
        key=lambda hit: hit[1],
        reverse=True,
    )

    print(f"Top-{n_hits} lexical search (BM25) hits")
    results = []
    for corpus_id, score in ranked_hits:
        # Find the hotel whose searchable text is this corpus entry.
        matches = df.loc[df['all_review'] == corpus[corpus_id]]
        if matches.empty:
            # No row carries this exact text (e.g. a missing value that was
            # stringified when the corpus was built); skip it instead of
            # failing the whole query with an IndexError.
            continue

        hotel = matches['Hotel'].values[0]
        details = df_basic.loc[df_basic.Hotel == hotel]
        description = details.description.values[0]
        # Plain Python values keep the result serializable by the standard
        # json module whatever dtype the price column has.
        price = _as_builtin(details.price_per_night.values[0])

        print("\t{:.3f}\t".format(score), hotel)
        print(f'\tPrice Per night: {price}')
        print(description)

        results.append(
            {
                "name": hotel,
                "score": score,
                "desc": description,
                "price": price,
            }
        )
    return results
def greet(query):
    """Gradio callback: return the BM25 search hits for ``query``."""
    return search(query)
# Expose greet() through a Gradio interface (text input, JSON output) and
# start it.
demo = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="json",
)
demo.launch()