File size: 3,288 Bytes
985d3f5
9d22f91
 
985d3f5
 
 
2a2a619
 
 
 
 
 
 
 
985d3f5
 
 
 
 
 
 
 
be2864a
 
985d3f5
 
be2864a
 
 
 
 
985d3f5
 
be2864a
 
 
985d3f5
2a2a619
985d3f5
76d621c
985d3f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a2a619
 
 
 
 
 
 
 
 
 
 
 
 
 
985d3f5
2a2a619
 
 
 
985d3f5
 
2a2a619
985d3f5
 
 
2a2a619
985d3f5
2a2a619
 
 
 
9d22f91
985d3f5
2a2a619
b6b6380
2a2a619
985d3f5
 
 
2a2a619
b6b6380
2a2a619
 
 
 
985d3f5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# let's import the libraries we need
# stdlib
import io
import netrc
import pickle
import sys

# third-party
import numpy as np
import pandas as pd
import spacy
import streamlit as st
import torch
from datasets import load_dataset
# BUG FIX: CrossEncoder is used below (XpathFinder) but its import was
# commented out, which made the whole script fail with NameError.
from sentence_transformers import CrossEncoder
#from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

tqdm.pandas()

# Load the English STSB dataset
# NOTE(review): downloads from the Hugging Face hub on first run — needs network.
stsb_dataset = load_dataset('stsb_multi_mt', 'en')
stsb_train = pd.DataFrame(stsb_dataset['train'])  # train split as a DataFrame
stsb_test = pd.DataFrame(stsb_dataset['test'])  # test split as a DataFrame

# let's create helper functions
# spaCy pipeline used by text_processing(); the 'en_core_web_sm' model
# must be installed separately (python -m spacy download en_core_web_sm).
nlp = spacy.load("en_core_web_sm")


def text_processing(sentence):
    """Tokenize *sentence* with spaCy and return its cleaned lemmas.

    Keeps only alphabetic, non-stopword tokens; each surviving token is
    replaced by its lowercased lemma.
    """
    lemmas = []
    for tok in nlp(sentence):
        if not tok.is_alpha or tok.is_stop:
            continue
        lemmas.append(tok.lemma_.lower())
    return lemmas


def cos_sim(sentence1_emb, sentence2_emb):
    """Return the element-wise cosine similarity of two embedding batches.

    Parameters
    ----------
    sentence1_emb, sentence2_emb : array-like of shape (n, d)
        Row i of each array is compared against row i of the other.

    Returns
    -------
    np.ndarray of shape (n,)
        Cosine similarity per row pair; rows with a zero norm yield 0.0
        (matching sklearn's cosine_similarity convention).
    """
    a = np.asarray(sentence1_emb, dtype=float)
    b = np.asarray(sentence2_emb, dtype=float)
    # Row-wise dot products — the original built the full n x n
    # cosine_similarity matrix and kept only its diagonal (O(n^2) work
    # for an O(n) result).
    dots = np.einsum('ij,ij->i', a, b)
    norms = np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1)
    # Avoid division by zero for all-zero rows; their dot product is 0,
    # so the similarity comes out as 0.0 either way.
    safe_norms = np.where(norms == 0.0, 1.0, norms)
    return dots / safe_norms


# let's read the csv file; drop the index column left over from the export
data = pd.read_csv("SBERT_data.csv").drop(['Unnamed: 0'], axis=1)

# pair every target sentence with one fixed prompt string
prompt = "charles"
data['prompt'] = prompt
data.rename(columns={'target_text': 'sentence2',
            'prompt': 'sentence1'}, inplace=True)
data['sentence2'] = data['sentence2'].astype('str')
data['sentence1'] = data['sentence1'].astype('str')

# Cross-encoder scores a (sentence1, sentence2) pair directly.
# NOTE(review): downloads the model from the Hugging Face hub on first run.
XpathFinder = CrossEncoder("cross-encoder/stsb-roberta-base")
sentence_pairs = [[s1, s2]
                  for s1, s2 in zip(data['sentence1'], data['sentence2'])]

data['SBERT CrossEncoder_Score'] = XpathFinder.predict(
    sentence_pairs, show_progress_bar=True)

# sorting the values
# BUG FIX: sort_values returns a new DataFrame; the result was previously
# discarded, so the frame was never actually sorted.
data = data.sort_values(by=['SBERT CrossEncoder_Score'], ascending=False)

# reuse the already-constructed model in the Streamlit section below
loaded_model = XpathFinder

# Containers
# Streamlit layout containers: one for the page header, one for the
# model input/output section filled in below.
header_container = st.container()
mod_container = st.container()

# Header
with header_container:

    # different levels of text you can include in your app
    st.title("Xpath Finder App")


# model container
with mod_container:

    # collecting input from user
    prompt = st.text_input("Enter your description below ...")

    # Loading the data; drop the index column left over from the export
    data = (pd.read_csv("/content/SBERT_data.csv")
            ).drop(['Unnamed: 0'], axis=1)

    # use the user's prompt as sentence1 against every candidate sentence2
    data['prompt'] = prompt
    data.rename(columns={'target_text': 'sentence2',
                'prompt': 'sentence1'}, inplace=True)
    data['sentence2'] = data['sentence2'].astype('str')
    data['sentence1'] = data['sentence1'].astype('str')

    # let's pass the input to the loaded_model
    if prompt:
        # Score every (prompt, candidate) pair with the cross-encoder.
        # BUG FIX: the model was previously called with [prompt] alone,
        # which is not a sentence *pair* and fails in CrossEncoder.predict;
        # it was also needlessly re-imported and re-instantiated here.
        sentence_pairs = [[s1, s2]
                          for s1, s2 in zip(data['sentence1'],
                                            data['sentence2'])]
        data['SBERT CrossEncoder_Score'] = loaded_model.predict(sentence_pairs)

        # BUG FIX: the frame was never sorted, so head(5) returned five
        # arbitrary rows instead of the top-scoring candidates.
        data = data.sort_values(
            by=['SBERT CrossEncoder_Score'], ascending=False)
        most_acc = data.head(5)
        simscore = data['SBERT CrossEncoder_Score'].max()

        # predictions
        st.write("Highest Similarity score: ", simscore)
        st.text("Is this one of these the Xpath you're looking for?")
        # BUG FIX: st.write(st.write(...)) rendered the frame and then
        # wrote the inner call's None return value to the page.
        st.write(most_acc["input_text"])