from sentence_transformers import util
from nltk.tokenize import sent_tokenize
from nltk import word_tokenize, pos_tag
import torch
import numpy as np
def compute_sentencewise_scores(model, query_sents, candidate_sents):
    # input: lists of sentences from the query and the candidate
    q_v, c_v = get_embedding(model, query_sents, candidate_sents)
    return util.cos_sim(q_v, c_v)
def get_embedding(model, query_sents, candidate_sents):
    q_v = model.encode(query_sents)
    c_v = model.encode(candidate_sents)
    return q_v, c_v
def get_top_k(score_mat, K=3):
    """
    Pick top K sentences to show
    """
    idx = torch.argsort(-score_mat)
    picked_sent = idx[:, :K]
    picked_scores = torch.vstack(
        [score_mat[i, picked_sent[i]] for i in range(picked_sent.shape[0])]
    )
    return picked_sent, picked_scores
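# A quick sketch of get_top_k's behavior (input values made up for illustration):
# for each row it returns the column indices and scores of the K highest-scoring
# candidate sentences.
#   get_top_k(torch.tensor([[0.1, 0.9, 0.5]]), K=2)
#   -> (tensor([[1, 2]]), tensor([[0.9000, 0.5000]]))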
def get_words(sent):
    """
    Tokenize each sentence into words and record the word index at which
    each sentence starts within the flattened word list.
    """
    words = []
    sent_start_id = []  # keep track of the word index where each new sentence starts
    counter = 0
    for x in sent:
        w = word_tokenize(x)
        counter += len(w)
        words.append(w)
        sent_start_id.append(counter)
    all_words = [item for sublist in words for item in sublist]
    # shift the boundaries: drop the final end offset and prepend 0,
    # turning sentence end offsets into sentence start offsets
    sent_start_id.pop()
    sent_start_id = [0] + sent_start_id
    assert len(sent_start_id) == len(sent)
    return words, all_words, sent_start_id
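# A quick sketch of get_words's output (example sentences made up):
#   get_words(["A cat sat.", "It slept."])
#   -> words         = [['A', 'cat', 'sat', '.'], ['It', 'slept', '.']]
#      all_words     = ['A', 'cat', 'sat', '.', 'It', 'slept', '.']
#      sent_start_id = [0, 4]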
def get_match_phrase(w1, w2):
    """
    Given the query word list (w1) and candidate word list (w2), return a
    binary mask over w2 marking words that also appear in the query and
    carry a content-bearing POS tag.
    """
    # POS tags that should be considered for matching phrases
    include = [
        'JJ', 'JJR', 'JJS',           # adjectives
        'MD',                         # modal
        'NN', 'NNS', 'NNP', 'NNPS',   # nouns
        'RB', 'RBR', 'RBS',           # adverbs
        'SYM',                        # symbol
        'VB', 'VBD', 'VBG', 'VBN',    # verbs
        'FW'                          # foreign word
    ]
    w1_lower = {x.lower() for x in w1}  # compare case-insensitively
    mask2 = np.zeros(len(w2))
    for i, (w, p) in enumerate(pos_tag(w2)):
        if w.lower() in w1_lower and p in include:
            mask2[i] = 1
    return mask2
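# A quick sketch of get_match_phrase (example words made up; the exact mask
# depends on the NLTK tagger's output):
#   get_match_phrase(['the', 'cat'], ['a', 'cat', 'ran'])
#   -> array([0., 1., 0.])  # 'cat' is tagged NN and appears in the query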
def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scores):
    num_query_sent = sent_ids.shape[0]
    num_words = len(all_words)
    output = dict()
    output['all_words'] = all_words
    output['words_by_sentence'] = words
    # for each query sentence, mark the highlight information
    for i in range(num_query_sent):
        query_words = word_tokenize(query_sents[i])
        is_selected_sent = np.zeros(num_words)
        is_selected_phrase = np.zeros(num_words)
        word_scores = np.zeros(num_words)
        # for each selected sentence from the candidate, compile the information
        for sid, sscore in zip(sent_ids[i], sent_scores[i]):
            if sid + 1 < len(sent_start_id):
                sent_range = (sent_start_id[sid], sent_start_id[sid + 1])
                is_selected_sent[sent_range[0]:sent_range[1]] = 1
                word_scores[sent_range[0]:sent_range[1]] = sscore
                is_selected_phrase[sent_range[0]:sent_range[1]] = \
                    get_match_phrase(query_words, all_words[sent_range[0]:sent_range[1]])
            else:  # the last sentence runs to the end of the word list
                is_selected_sent[sent_start_id[sid]:] = 1
                word_scores[sent_start_id[sid]:] = sscore
                is_selected_phrase[sent_start_id[sid]:] = \
                    get_match_phrase(query_words, all_words[sent_start_id[sid]:])
        # mark matched-phrase scores (-1 renders as a different color in gradio)
        word_scores[(is_selected_sent + is_selected_phrase) == 2] = -1
        output[i] = {
            'is_selected_sent': is_selected_sent,
            'is_selected_phrase': is_selected_phrase,
            'scores': word_scores
        }
    return output
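# Shape of the returned dict (keys 0..num_query_sent-1 map each query sentence
# to per-word numpy arrays over the candidate's words):
#   {
#     'all_words': [...], 'words_by_sentence': [[...], ...],
#     0: {'is_selected_sent': ..., 'is_selected_phrase': ..., 'scores': ...},
#     1: {...}, ...
#   }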
def get_highlight_info(model, text1, text2, K=None):
    sent1 = sent_tokenize(text1)  # query sentences
    sent2 = sent_tokenize(text2)  # candidate sentences
    if K is None:  # if K is not set, pick based on the length of the candidate
        K = max(1, int(len(sent2) / 3))  # guard against K == 0 for short candidates
    score_mat = compute_sentencewise_scores(model, sent1, sent2)
    sent_ids, sent_scores = get_top_k(score_mat, K=K)
    words2, all_words2, sent_start_id2 = get_words(sent2)
    info = mark_words(sent1, words2, all_words2, sent_start_id2, sent_ids, sent_scores)
    return sent_ids, sent_scores, info
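# Minimal usage sketch for the sentence-level pipeline. The model name below
# is an assumption; any SentenceTransformer encoder should work here:
#
#   from sentence_transformers import SentenceTransformer
#   model = SentenceTransformer('all-MiniLM-L6-v2')
#   sent_ids, sent_scores, info = get_highlight_info(model, query_text, candidate_text)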
| ## Document-level operations | |
def predict_docscore(doc_model, tokenizer, query, titles, abstracts, batch=20):
    # concatenate title and abstract, skipping entries with missing fields
    title_abs = []
    for t, a in zip(titles, abstracts):
        if t is not None and a is not None:
            title_abs.append(t + ' [SEP] ' + a)
    num_docs = len(title_abs)
    num_iter = int(np.ceil(num_docs / batch))
    scores = []
    with torch.no_grad():
        # process the documents in batches, prepending the query to each batch
        for i in range(num_iter):
            inputs = tokenizer(
                [query] + title_abs[i * batch:(i + 1) * batch],
                padding=True,
                truncation=True,
                return_tensors="pt",
                max_length=512
            )
            inputs = inputs.to(doc_model.device)
            result = doc_model(**inputs)
            # take the first ([CLS]) token of each sequence as its embedding
            embeddings = result.last_hidden_state[:, 0, :].detach().cpu().numpy()
            # cosine similarity between the query and each document
            q_emb = embeddings[0, :]
            p_emb = embeddings[1:, :]
            nn = np.linalg.norm(q_emb) * np.linalg.norm(p_emb, axis=1)
            scores += list(np.dot(p_emb, q_emb) / nn)
    assert len(scores) == num_docs
    return scores
def compute_overall_score(doc_model, tokenizer, query, papers, batch=5):
    # note: assumes every paper has a non-None title and abstract, so that the
    # scores returned by predict_docscore stay aligned with titles/abstracts
    titles = [p['title'] for p in papers]
    abstracts = [p['abstract'] for p in papers]
    scores = predict_docscore(doc_model, tokenizer, query, titles, abstracts, batch=batch)
    # sort the papers by score, highest first
    idx_sorted = np.argsort(scores)[::-1]
    titles_sorted = [titles[x] for x in idx_sorted]
    abstracts_sorted = [abstracts[x] for x in idx_sorted]
    scores_sorted = [scores[x] for x in idx_sorted]
    return titles_sorted, abstracts_sorted, scores_sorted
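# Minimal usage sketch for the document-level pipeline. The checkpoint name is
# an assumption; any BERT-style encoder whose first ([CLS]) token serves as a
# document embedding fits predict_docscore, and the ' [SEP] ' concatenation
# above follows the SPECTER convention:
#
#   from transformers import AutoTokenizer, AutoModel
#   tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
#   doc_model = AutoModel.from_pretrained('allenai/specter')
#   papers = [{'title': ..., 'abstract': ...}, ...]
#   titles, abstracts, scores = compute_overall_score(doc_model, tokenizer, 'my query', papers)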