panotedi committed on
Commit
a451187
·
unverified ·
1 Parent(s): 6153e60

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -20
app.py CHANGED
@@ -1,9 +1,12 @@
1
  import streamlit as st
2
  import pandas as pd
3
  from transformers import pipeline
4
-
 
 
5
  from pprint import pprint
6
  from datasets import load_dataset
 
7
 
8
  st.title("CS634 - milestone3/4 - Tedi Pano")
9
 
@@ -36,11 +39,14 @@ def training_computation(_dataset_dict):
36
  st.write("Processed the data")
37
 
38
 
39
- from sklearn.model_selection import train_test_split
40
- dftrain, dftest = train_test_split(df, test_size = 0.90, random_state = 0)
41
- vftrain, vftest = train_test_split(df, test_size = 0.90, random_state = 0)
42
 
43
- from transformers import DistilBertTokenizerFast
 
 
 
 
 
 
44
  tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
45
 
46
  X_dtrain = dftrain['abstract'].tolist()
@@ -58,7 +64,7 @@ def training_computation(_dataset_dict):
58
 
59
  st.write("tokenizing completed!")
60
 
61
- import tensorflow as tf
62
 
63
  train_dataset = tf.data.Dataset.from_tensor_slices((
64
  dict(train_encodings),
@@ -75,18 +81,16 @@ def training_computation(_dataset_dict):
75
  y_dtest
76
  ))
77
 
78
- st.write("back to dataset!")
79
 
80
- from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments
81
 
82
  training_args = TFTrainingArguments(
83
  output_dir='./results',
84
- num_train_epochs=2,
85
- per_device_train_batch_size=128,
86
- per_device_eval_batch_size=256,
87
  warmup_steps=5,
88
- eval_steps=5,
89
- weight_decay=0.01
90
  )
91
 
92
 
@@ -99,7 +103,7 @@ def training_computation(_dataset_dict):
99
  train_dataset=train_dataset,
100
  eval_dataset=val_dataset
101
  )
102
-
103
  trainer.train()
104
 
105
  st.write("training completed")
@@ -111,20 +115,34 @@ trainer = training_computation(dataset_dict)
111
 
112
 
113
  patents = pd.DataFrame(dataset_dict['train'])
 
 
 
 
114
  patent_selection = st.selectbox("Select Patent",patents['patent_number'])
115
 
116
  patent = patents.loc[patents['patent_number'] == patent_selection]
 
117
  st.write(patent['abstract'])
118
  st.write(patent['claims'])
119
 
120
- submitted = st.form_submit_button("Submit")
121
 
122
- if submitted:
 
123
  pat_abstract = patent['abstract'].tolist()
124
- #pat_score = patent['patentability_score'].tolist()
 
125
  test_encodings = tokenizer(pat_abstract, truncation=True, padding=True)
126
  test_dataset = tf.data.Dataset.from_tensor_slices((
127
- dict(test_encodings)
 
128
  ))
129
- predictions = trainer.predict(test_dataset)[1]
130
- st.write(predictions)
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import pandas as pd
3
  from transformers import pipeline
4
+ from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments
5
+ from sklearn.model_selection import train_test_split
6
+ from transformers import DistilBertTokenizerFast
7
  from pprint import pprint
8
  from datasets import load_dataset
9
+ import tensorflow as tf
10
 
11
  st.title("CS634 - milestone3/4 - Tedi Pano")
12
 
 
39
  st.write("Processed the data")
40
 
41
 
 
 
 
42
 
43
+ dftrain, dftest = train_test_split(df, test_size = 0.99, random_state = None)
44
+ vftrain, vftest = train_test_split(df, test_size = 0.99, random_state = None)
45
+
46
+ #st.write(dftrain.shape[0])
47
+ #st.write(vftrain.shape[0])
48
+
49
+
50
  tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
51
 
52
  X_dtrain = dftrain['abstract'].tolist()
 
64
 
65
  st.write("tokenizing completed!")
66
 
67
+
68
 
69
  train_dataset = tf.data.Dataset.from_tensor_slices((
70
  dict(train_encodings),
 
81
  y_dtest
82
  ))
83
 
84
+ #st.write("back to dataset!")
85
 
 
86
 
87
  training_args = TFTrainingArguments(
88
  output_dir='./results',
89
+ num_train_epochs=1,
90
+ per_device_train_batch_size=8,
91
+ per_device_eval_batch_size=16,
92
  warmup_steps=5,
93
+ eval_steps=5
 
94
  )
95
 
96
 
 
103
  train_dataset=train_dataset,
104
  eval_dataset=val_dataset
105
  )
106
+ st.write("training in progress.....")
107
  trainer.train()
108
 
109
  st.write("training completed")
 
115
 
116
 
117
  patents = pd.DataFrame(dataset_dict['train'])
118
+ accepted_rejected = ['ACCEPTED', 'REJECTED']
119
+ patents = patents[patents['decision'].isin(accepted_rejected)]
120
+ patents['patentability_score'] = patents['decision'].map({'ACCEPTED': 1, 'REJECTED': 0})
121
+
122
  patent_selection = st.selectbox("Select Patent",patents['patent_number'])
123
 
124
  patent = patents.loc[patents['patent_number'] == patent_selection]
125
+ #st.write(patent.shape[0])
126
  st.write(patent['abstract'])
127
  st.write(patent['claims'])
128
 
 
129
 
130
+ with st.form("my_form"):
131
+ submitted = st.form_submit_button("Submit")
132
  pat_abstract = patent['abstract'].tolist()
133
+ pat_score = patent['patentability_score'].tolist()
134
+ tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
135
  test_encodings = tokenizer(pat_abstract, truncation=True, padding=True)
136
  test_dataset = tf.data.Dataset.from_tensor_slices((
137
+ dict(test_encodings),
138
+ pat_score
139
  ))
140
+ predictions = trainer.predict(test_dataset)
141
+
142
+ if submitted:
143
+ if(predictions[1][0] == 1):
144
+ st.write("Patent is ACCEPTED")
145
+ st.write("with a certainty of " + str(predictions[0][0][1]))
146
+ else:
147
+ st.write("Patent is REJECTED")
148
+ st.write("with a certainty of " + str(predictions[0][0][0]))