import streamlit as st
import pandas as pd
import tensorflow as tf
from datasets import load_dataset
from sklearn.model_selection import train_test_split
# NOTE: TFTrainer and TFTrainingArguments are deprecated in recent
# transformers releases; this app assumes a version that still ships them.
from transformers import (
    DistilBertTokenizerFast,
    TFDistilBertForSequenceClassification,
    TFTrainer,
    TFTrainingArguments,
)

st.title("CS634 - milestone3/4 - Tedi Pano")


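# Load a small slice of the Harvard USPTO Patent Dataset (HUPD).
# st.cache_resource keeps the result across Streamlit reruns, so the
# download and parsing happen only once per session.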
@st.cache_resource
def load_data():
    dataset_dict = load_dataset('HUPD/hupd',
        name='sample',
        data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather", 
        icpr_label=None,
        train_filing_start_date='2016-01-01',
        train_filing_end_date='2016-01-21',
        val_filing_start_date='2016-01-22',
        val_filing_end_date='2016-01-31',
    )

    st.write('Loading is done!')
    return dataset_dict

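# Fine-tune DistilBERT to predict ACCEPTED vs. REJECTED from patent abstracts.
# The leading underscore in _dataset_dict tells Streamlit's cache not to try
# hashing the (unhashable) dataset argument.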
@st.cache_resource
def training_computation(_dataset_dict):
    df = pd.DataFrame(_dataset_dict['train'])
    vf = pd.DataFrame(_dataset_dict['validation'])
    
    # Keep only decided applications and binarize the decision label.
    accepted_rejected = ['ACCEPTED', 'REJECTED']
    df = df[df['decision'].isin(accepted_rejected)].copy()
    df['patentability_score'] = df['decision'].map({'ACCEPTED': 1, 'REJECTED': 0})
    vf = vf[vf['decision'].isin(accepted_rejected)].copy()
    vf['patentability_score'] = vf['decision'].map({'ACCEPTED': 1, 'REJECTED': 0})
    
    st.write("Processed the data")
    
    
    
    # test_size=0.99 keeps only 1% of each frame for fine-tuning, so the
    # demo trains quickly.
    dftrain, dftest = train_test_split(df, test_size=0.99, random_state=None)
    vftrain, vftest = train_test_split(vf, test_size=0.99, random_state=None)
    
    #st.write(dftrain.shape[0])
    #st.write(vftrain.shape[0])
    
    
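    # Tokenize the abstracts; padding/truncation gives every example the same
    # length so the encodings can be batched into tensors.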
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    
    X_dtrain = dftrain['abstract'].tolist()
    y_dtrain = dftrain['patentability_score'].tolist()
    
    X_vtrain = vftrain['abstract'].tolist()
    y_vtrain = vftrain['patentability_score'].tolist()
    
    X_dtest = dftest['abstract'].tolist()
    y_dtest = dftest['patentability_score'].tolist()
    
    train_encodings = tokenizer(X_dtrain, truncation=True, padding=True)
    val_encodings = tokenizer(X_vtrain, truncation=True, padding=True)
    test_encodings = tokenizer(X_dtest, truncation=True, padding=True)
    
    st.write("tokenizing completed!")
    
    
    
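    # Wrap the encodings and labels in tf.data.Dataset objects, the input
    # format TFTrainer expects. (test_dataset is built here but never used
    # by the trainer below.)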
    train_dataset = tf.data.Dataset.from_tensor_slices((
        dict(train_encodings),
        y_dtrain
    ))
    
    val_dataset = tf.data.Dataset.from_tensor_slices((
        dict(val_encodings),
        y_vtrain
    ))
    
    test_dataset = tf.data.Dataset.from_tensor_slices((
        dict(test_encodings),
        y_dtest
    ))
    
    #st.write("back to dataset!")
    
    
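    # A single epoch with small batches keeps the demo responsive; these are
    # not tuned hyperparameters.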
    training_args = TFTrainingArguments(
        output_dir='./results',          
        num_train_epochs=1,              
        per_device_train_batch_size=8, 
        per_device_eval_batch_size=16,   
        warmup_steps=5,
        eval_steps=5              
    )
    
    
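    # Create the model inside the distribution-strategy scope so its variables
    # are placed on the device(s) the strategy manages.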
    with training_args.strategy.scope():
        model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
    
    trainer = TFTrainer(
        model=model,                         
        args=training_args,                  
        train_dataset=train_dataset,         
        eval_dataset=val_dataset             
    )
    st.write("training in progress.....")
    trainer.train()

    st.write("training completed")
    return trainer
    

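# Main script: load the data and fine-tune the model (both cached across reruns).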
dataset_dict = load_data()
trainer = training_computation(dataset_dict)


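# Rebuild the labeled patents table for the UI, mirroring the training filter.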
patents = pd.DataFrame(dataset_dict['train'])
accepted_rejected = ['ACCEPTED', 'REJECTED']
patents = patents[patents['decision'].isin(accepted_rejected)].copy()
patents['patentability_score'] = patents['decision'].map({'ACCEPTED': 1, 'REJECTED': 0})

patent_selection = st.selectbox("Select Patent", patents['patent_number'])

patent = patents.loc[patents['patent_number'] == patent_selection]
#st.write(patent.shape[0])
st.write(patent['abstract'])
st.write(patent['claims'])


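# Score the selected patent. Wrapping the button in a form means the model
# only runs after the user clicks Submit, not on every widget interaction.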
with st.form("my_form"):
    submitted = st.form_submit_button("Submit")
    if submitted:
        pat_abstract = patent['abstract'].tolist()
        pat_score = patent['patentability_score'].tolist()
        tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
        test_encodings = tokenizer(pat_abstract, truncation=True, padding=True)
        test_dataset = tf.data.Dataset.from_tensor_slices((
            dict(test_encodings),
            pat_score
        ))
        # TFTrainer.predict returns (predictions, label_ids, metrics), where
        # predictions holds the raw logits for each example.
        predictions = trainer.predict(test_dataset)
        logits = predictions[0][0]
        probs = tf.nn.softmax(logits).numpy()
        pred_label = int(tf.argmax(logits).numpy())
        if pred_label == 1:
            st.write("Patent is ACCEPTED")
        else:
            st.write("Patent is REJECTED")
        st.write("with a certainty of " + str(probs[pred_label]))