File size: 3,630 Bytes
1ca0686
d042411
1ca0686
d042411
 
 
 
9f020c6
1ca0686
d042411
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ca0686
d042411
1ca0686
d042411
 
3112e6e
d042411
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import streamlit as st
import pandas as pd
from transformers import pipeline

from pprint import pprint
from datasets import load_dataset

# Page title for the Streamlit app.
st.title("CS634 - milestone3/4 - Tedi Pano")

@st.cache_resource
def load_data():
    """Load the 'sample' configuration of the HUPD/hupd dataset.

    The train split is restricted to filings dated 2016-01-01 through
    2016-01-21 and the validation split to 2016-01-22 through 2016-01-31.
    A status line is written to the Streamlit page once loading finishes.

    Returns:
        Whatever ``load_dataset`` returns for these options; callers index
        it with ``'train'`` and ``'validation'``.
    """
    # Options forwarded verbatim to the HUPD loader.
    hupd_options = dict(
        name='sample',
        data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
        icpr_label=None,
        train_filing_start_date='2016-01-01',
        train_filing_end_date='2016-01-21',
        val_filing_start_date='2016-01-22',
        val_filing_end_date='2016-01-31',
    )
    loaded = load_dataset('HUPD/hupd', **hupd_options)

    st.write('Loading is done!')
    return loaded

def _label_decisions(frame):
    """Keep ACCEPTED/REJECTED rows of *frame* and add a 0/1 ``patentability_score`` column."""
    score_by_decision = {'ACCEPTED': 1, 'REJECTED': 0}
    # .copy() gives an independent frame, so adding the column below does not
    # write into a filtered slice of the input (pandas SettingWithCopyWarning).
    labelled = frame[frame['decision'].isin(list(score_by_decision))].copy()
    labelled['patentability_score'] = labelled['decision'].map(score_by_decision)
    return labelled


@st.cache_resource
def training_computation(_dataset_dict):
    """Fine-tune DistilBERT to predict patent acceptance from abstracts.

    Steps: build DataFrames from the ``'train'`` and ``'validation'`` splits,
    keep only ACCEPTED/REJECTED rows and label them 1/0, tokenise the
    abstracts, wrap them in ``tf.data`` datasets and train with ``TFTrainer``.
    Progress messages are written to the Streamlit page along the way.

    Args:
        _dataset_dict: Mapping with ``'train'`` and ``'validation'`` entries
            that ``pd.DataFrame`` can consume, each providing ``decision``
            and ``abstract`` columns. The leading underscore excludes the
            argument from Streamlit's cache-key hashing.

    Returns:
        The ``TFTrainer`` instance after ``train()`` has completed.
    """
    # Function-scope imports only load when training runs. They are grouped
    # here so a missing or incompatible package fails before any
    # preprocessing work is done.
    import tensorflow as tf
    from sklearn.model_selection import train_test_split
    from transformers import (
        DistilBertTokenizerFast,
        TFDistilBertForSequenceClassification,
        TFTrainer,
        TFTrainingArguments,
    )

    train_frame = _label_decisions(pd.DataFrame(_dataset_dict['train']))
    val_frame = _label_decisions(pd.DataFrame(_dataset_dict['validation']))

    st.write("Processed the data")

    # Only 10% of the labelled training rows are used for fine-tuning. The
    # remaining 90% was never consumed by the trainer, so it is no longer
    # tokenised or converted to a dataset.
    train_subset, _held_out = train_test_split(
        train_frame, test_size=0.90, random_state=0
    )

    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    def encode(frame):
        """Return (tokenised abstracts, label list) for *frame*."""
        encodings = tokenizer(frame['abstract'].tolist(), truncation=True, padding=True)
        return encodings, frame['patentability_score'].tolist()

    train_encodings, train_labels = encode(train_subset)
    val_encodings, val_labels = encode(val_frame)

    st.write("tokenizing completed!")

    train_dataset = tf.data.Dataset.from_tensor_slices(
        (dict(train_encodings), train_labels)
    )
    val_dataset = tf.data.Dataset.from_tensor_slices(
        (dict(val_encodings), val_labels)
    )

    st.write("back to dataset!")

    training_args = TFTrainingArguments(
        output_dir='./results',
        num_train_epochs=2,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        eval_steps=500,
        weight_decay=0.01,
    )

    # The model is created under the distribution strategy supplied by the
    # training arguments.
    with training_args.strategy.scope():
        model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

    trainer = TFTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    trainer.train()

    st.write("training completed")
    return trainer
    

# Load the dataset, then fine-tune the classifier on it.
dataset_dict = load_data()
trainer = training_computation(dataset_dict)

# Let the user pick one entry of the training split by its patent number.
patents = pd.DataFrame(dataset_dict['train'])
patent_selection = st.selectbox("Select Patent", patents['patent_number'])

# Show the abstract and the claims of the row(s) matching the selection.
is_selected = patents['patent_number'] == patent_selection
patent = patents[is_selected]
for shown_column in ('abstract', 'claims'):
    st.write(patent[shown_column])