File size: 3,673 Bytes
514343b
4fd42f1
 
 
 
 
 
 
 
 
514343b
59e519b
9983408
 
843aeb0
44264ed
 
418bd7c
 
88993fe
44264ed
e838b9b
 
 
 
 
b3c3404
1716434
b3c3404
418bd7c
b3c3404
20efea7
b3c3404
20efea7
b3c3404
 
 
 
 
56c44d6
15e7e9a
 
ea052a5
15e7e9a
 
b0f213e
 
 
15e7e9a
 
 
 
 
 
b1d589a
15e7e9a
 
 
 
 
 
56c44d6
 
 
 
 
 
 
08a6457
 
534b3f6
 
 
 
 
 
747ffcd
308457d
c7626f8
 
08a6457
fd7311e
08a6457
fc99537
b85f19f
82206e8
 
 
 
 
fc99537
82206e8
 
 
 
 
 
 
 
 
fd7311e
b0f213e
 
a1229f1
fd7311e
b0f213e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106

''' To-do

Create a sidebar to either compare two sentences or upload a CSV

In the second tab, allow users to compare all CSV files


'''

import streamlit as st
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

def _load_model():
    """Build the multilingual paraphrase sentence-embedding model."""
    return SentenceTransformer('paraphrase-xlm-r-multilingual-v1')


# Streamlit re-executes this script on every widget interaction, so an
# uncached module-level constructor would reload the model on each rerun.
# Prefer st.cache_resource; fall back to the legacy st.cache on Streamlit
# releases that predate it.
_cache_model = getattr(st, "cache_resource", None)
if _cache_model is None:
    # allow_output_mutation=True stops the legacy cache from hashing the
    # returned model object on every access.
    _cache_model = st.cache(allow_output_mutation=True)

model = _cache_model(_load_model)()

# Streamlit interface: page title plus a sidebar switch between the two modes

st.title("Sentence Similarity")

with st.sidebar:
    # The selected label decides which section of the page is rendered.
    sidebar_selectbox = st.selectbox(
        label="What would you like to work with?",
        options=("Compare two sentences", "Bulk upload and mark"),
    )

# Streamlit form elements (default to "Compare two sentences")

if sidebar_selectbox == "Compare two sentences":

    st.subheader("Compare the similarity between two sentences")

    # Both inputs live in one form so they are submitted together when the
    # button is pressed.
    with st.form("submission_form", clear_on_submit=False):
        sentence_1 = st.text_input("Sentence 1 input")
        sentence_2 = st.text_input("Sentence 2 input")
        submit_button_compare = st.form_submit_button("Compare Sentences")

    # If submit_button_compare clicked
    if submit_button_compare:
        if not sentence_1.strip() or not sentence_2.strip():
            # Scoring blank text is not a meaningful comparison, so ask for
            # input instead of reporting a percentage.
            st.warning("Please enter both sentences before comparing.")
        else:
            # Encode both sentences in one call; row 0 is sentence 1 and
            # row 1 is sentence 2.
            sentence_embeddings = model.encode([sentence_1, sentence_2])

            cos_sim = cosine_similarity(
                sentence_embeddings[:1], sentence_embeddings[1:]
            )[0][0]
            # Convert to a whole-number percentage; int() keeps the value a
            # plain integer regardless of the NumPy scalar type returned.
            cos_sim = int(round(cos_sim * 100))

            st.write(
                f'Similarity between {sentence_1} and {sentence_2} is {cos_sim}%'
            )



def _similarity_percentages(reference, candidates):
    """Return each candidate's cosine similarity to *reference* as a whole percent.

    The reference and all candidates are encoded in a single ``model.encode``
    call, so the reference sentence is embedded once instead of once per row.

    Args:
        reference: The sentence every candidate is compared against.
        candidates: List of sentences to score.

    Returns:
        A list of ints, one per candidate, in the same order.
    """
    if not candidates:
        # Nothing to score; skip the model call entirely.
        return []

    embeddings = model.encode([reference, *candidates])
    # Row 0 is the reference; the remaining rows line up with `candidates`.
    scores = cosine_similarity(embeddings[:1], embeddings[1:])[0]
    return [int(round(score * 100)) for score in scores]


if sidebar_selectbox == "Bulk upload and mark":

    st.subheader("Bulk compare similarity of sentences")

    sentence_reference = st.text_input("Reference sentence input")

    # Only allow user to upload CSV files
    data_file = st.file_uploader("Upload CSV", type=["csv"])

    if data_file is not None:
        file_details = {
            "filename": data_file.name,
            "filetype": data_file.type,
            "filesize": data_file.size,
        }
        st.write(file_details)

        try:
            df = pd.read_csv(data_file)
        except (
            pd.errors.EmptyDataError,
            pd.errors.ParserError,
            UnicodeDecodeError,
        ) as exc:
            # Report an unreadable upload in the page instead of crashing.
            st.error(f"Could not read the uploaded CSV: {exc}")
        else:
            if 'Sentences' not in df.columns:
                # The comparison reads this column by name, so fail with a
                # clear message rather than a KeyError.
                st.error("The uploaded CSV must contain a 'Sentences' column.")
            elif not sentence_reference.strip():
                st.warning(
                    "Enter a reference sentence to score the uploaded sentences."
                )
            else:
                # Blank cells become empty strings and non-text cells are
                # stringified so every row can be passed to the encoder.
                candidates = df['Sentences'].fillna('').astype(str).tolist()

                # Append new column to dataframe
                df['Similarity'] = _similarity_percentages(
                    sentence_reference, candidates
                )

                st.dataframe(df)