added phrase highlights
app.py
CHANGED
@@ -10,12 +10,16 @@ from input_format import *
 from score import *
 
 # load document scoring model
+torch.cuda.is_available = lambda : False
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 pretrained_model = 'allenai/specter'
 tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
 doc_model = AutoModel.from_pretrained(pretrained_model)
+doc_model.to(device)
 
 # load sentence model
 sent_model = SentenceTransformer('sentence-transformers/gtr-t5-base')
+sent_model.to(device)
 
 def get_similar_paper(
     abstract_text_input,
@@ -25,8 +29,6 @@ def get_similar_paper(
 ):
     input_sentences = sent_tokenize(abstract_text_input)
 
-    pickle.dump(input_sentences, open('tmp_input_sents.pkl', 'wb'))
-
     # TODO handle pdf file input
     if pdf_file_input is not None:
         name = None
@@ -42,7 +44,7 @@ def get_similar_paper(
         tokenizer,
         abstract_text_input,
         papers,
-        batch=
+        batch=50
     )
 
     tmp = {
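The two leading `+` lines pin the app to CPU: monkeypatching `torch.cuda.is_available` before the device is chosen makes every later "cuda if available" check resolve to CPU, and the `.to(device)` calls then place both models there. A minimal sketch of the same pattern, with `nn.Linear` standing in for `doc_model` / `sent_model`:

import torch
import torch.nn as nn

# report 'no GPU' from here on; every 'cuda if available' check now picks CPU
torch.cuda.is_available = lambda: False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = nn.Linear(4, 2).to(device)    # stand-in for doc_model / sent_model
x = torch.randn(1, 4, device=device)
print(device, model(x).shape)         # cpu torch.Size([1, 2])

The override only affects the current process; it is a common way to force CPU inference when CUDA is unwanted or, as on CPU-only Spaces hardware, unavailable.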
score.py
CHANGED
@@ -1,5 +1,6 @@
 from sentence_transformers import util
 from nltk.tokenize import sent_tokenize
+from nltk import word_tokenize, pos_tag
 import torch
 import numpy as np
 
@@ -33,19 +34,52 @@ def get_words(sent):
     sent_start_id = [] # keep track of the word index where the new sentence starts
     counter = 0
     for x in sent:
-        w = x.split()
+        #w = x.split()
+        w = word_tokenize(x)
         nw = len(w)
         counter += nw
         words.append(w)
         sent_start_id.append(counter)
-    words = [x.split() for x in sent]
+    words = [word_tokenize(x) for x in sent]
     all_words = [item for sublist in words for item in sublist]
     sent_start_id.pop()
     sent_start_id = [0] + sent_start_id
     assert(len(sent_start_id) == len(sent))
     return words, all_words, sent_start_id
 
-def mark_words(words, all_words, sent_start_id, sent_ids, sent_scores):
+def get_match_phrase(w1, w2):
+    # list of words for query and candidate as input
+    # return the word list and binary mask of matching phrases
+    # POS tags that should be considered for matching phrase
+    include = [
+        'JJ',
+        'JJR',
+        'JJS',
+        'MD',
+        'NN',
+        'NNS',
+        'NNP',
+        'NNPS',
+        'RB',
+        'RBR',
+        'RBS',
+        'SYM',
+        'VB',
+        'VBD',
+        'VBG',
+        'VBN',
+        'FW'
+    ]
+    mask1 = np.zeros(len(w1))
+    mask2 = np.zeros(len(w2))
+    pos1 = pos_tag(w1)
+    pos2 = pos_tag(w2)
+    for i, (w, p) in enumerate(pos2):
+        if w.lower() in w1 and p in include:
+            mask2[i] = 1
+    return mask2
+
+def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scores):
     num_query_sent = sent_ids.shape[0]
     num_words = len(all_words)
 
@@ -55,22 +89,29 @@ def mark_words(words, all_words, sent_start_id, sent_ids, sent_scores):
 
     # for each query sentence, mark the highlight information
     for i in range(num_query_sent):
+        query_words = word_tokenize(query_sents[i])
         is_selected_sent = np.zeros(num_words)
         is_selected_phrase = np.zeros(num_words)
-        word_scores = np.zeros(num_words)
+        word_scores = np.zeros(num_words)
 
-        #
+        # for each selected sentences from the candidate, compile information
         for sid, sscore in zip(sent_ids[i], sent_scores[i]):
             #print(len(sent_start_id), sid, sid+1)
             if sid+1 < len(sent_start_id):
                 sent_range = (sent_start_id[sid], sent_start_id[sid+1])
                 is_selected_sent[sent_range[0]:sent_range[1]] = 1
                 word_scores[sent_range[0]:sent_range[1]] = sscore
+                is_selected_phrase[sent_range[0]:sent_range[1]] = \
+                    get_match_phrase(query_words, all_words[sent_range[0]:sent_range[1]])
             else:
-                is_selected_sent[sent_start_id[sid]:] = 1
-                word_scores[sent_start_id[sid]:] = sscore
+                is_selected_sent[sent_start_id[sid]:] = 1
+                word_scores[sent_start_id[sid]:] = sscore
+                is_selected_phrase[sent_start_id[sid]:] = \
+                    get_match_phrase(query_words, all_words[sent_start_id[sid]:])
+
+        # update selected phrase scores (-1 meaning a different color in gradio)
+        word_scores[is_selected_sent+is_selected_phrase==2] = -1
 
-        # TODO get phrase selection information
         output[i] = {
             'is_selected_sent': is_selected_sent,
             'is_selected_phrase': is_selected_phrase,
@@ -79,16 +120,18 @@ def mark_words(words, all_words, sent_start_id, sent_ids, sent_scores):
 
     return output
 
-def get_highlight_info(model, text1, text2, K=
+def get_highlight_info(model, text1, text2, K=None):
     sent1 = sent_tokenize(text1) # query
     sent2 = sent_tokenize(text2) # candidate
+    if K is None: # if K is not set, select based on the length of the candidate
+        K = int(len(sent2) / 3)
     score_mat = compute_sentencewise_scores(model, sent1, sent2)
 
     sent_ids, sent_scores = get_top_k(score_mat, K=K)
     #print(sent_ids, sent_scores)
-    words1, all_words1, sent_start_id1 = get_words(sent1)
+    words2, all_words2, sent_start_id2 = get_words(sent2)
     #print(all_words1, sent_start_id1)
-    info = mark_words(
+    info = mark_words(sent1, words2, all_words2, sent_start_id2, sent_ids, sent_scores)
 
     return sent_ids, sent_scores, info
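The new `get_match_phrase` is what drives the phrase highlights: candidate words are POS-tagged with NLTK's `pos_tag`, and a candidate word is marked when its lowercased form also appears among the query tokens and its tag is in the content-word whitelist (adjectives, nouns, adverbs, verbs, modals, symbols, foreign words); function words such as determiners never match. Note that only `mask2`, the candidate-side mask, is returned; `mask1` and `pos1` are computed but currently unused. A toy check, assuming `score.py` is importable and the NLTK `punkt` and perceptron-tagger data have been downloaded:

from nltk import word_tokenize
from score import get_match_phrase

query = word_tokenize("we propose a neural ranking model")
candidate = word_tokenize("A ranking model for neural retrieval.")

# shared content words ('ranking', 'model', 'neural') should come back as 1;
# 'A' also occurs in the query but its DT tag is not whitelisted, so it
# stays 0, as do unshared words and punctuation
mask = get_match_phrase(query, candidate)
print(list(zip(candidate, mask)))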
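With `K=None` now the default, `get_highlight_info` selects roughly a third of the candidate's sentences per query sentence, computes the sentence-to-sentence score matrix, and passes the query sentences into the extended `mark_words` so phrase matches can be marked inside each selected candidate sentence. A hypothetical end-to-end call, assuming the same GTR model app.py loads and that `compute_sentencewise_scores` / `get_top_k` in score.py resolve as before:

from sentence_transformers import SentenceTransformer
from score import get_highlight_info

sent_model = SentenceTransformer('sentence-transformers/gtr-t5-base')

query = "We match query sentences to candidate abstracts for reviewer assignment."
candidate = ("This paper studies sentence matching. "
             "A transformer model scores each sentence pair. "
             "Results improve reviewer-paper assignment.")

# K is omitted, so K = int(3 / 3) = 1 selected sentence per query sentence
sent_ids, sent_scores, info = get_highlight_info(sent_model, query, candidate)

# info[i]['is_selected_sent'] flags every word of the top-K candidate
# sentences for query sentence i; info[i]['is_selected_phrase'] flags the
# POS-matched words inside them, whose word scores are overwritten with -1
# so gradio renders them in a distinct highlight color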