Spaces:
Sleeping
Sleeping
Remove large file train_info.txt
Browse files- app.py +76 -22
- new_test_saved_finetuned_model.py +2 -2
- plot.png +0 -0
- result.txt +7 -7
- roc_data.pkl +3 -0
- school_grduation_rate.pkl +3 -0
- selected_rows.txt +0 -0
- train.txt +0 -0
- train_info.txt +0 -1
- train_label.txt +0 -0
app.py
CHANGED
|
@@ -7,9 +7,10 @@ import subprocess
|
|
| 7 |
import shutil
|
| 8 |
import matplotlib.pyplot as plt
|
| 9 |
from sklearn.metrics import roc_curve, auc
|
|
|
|
| 10 |
# Define the function to process the input file and model selection
|
| 11 |
|
| 12 |
-
def process_file(file,label,info,
|
| 13 |
# progress = gr.Progress(track_tqdm=True)
|
| 14 |
progress(0, desc="Starting the processing")
|
| 15 |
with open(file.name, 'r') as f:
|
|
@@ -21,27 +22,66 @@ def process_file(file,label,info,inc_val,progress=Progress(track_tqdm=True)):
|
|
| 21 |
shutil.copyfile(file.name, saved_test_dataset)
|
| 22 |
shutil.copyfile(label.name, saved_test_label)
|
| 23 |
shutil.copyfile(info.name, saved_train_info)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
# For demonstration purposes, we'll just return the content with the selected model name
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
# else:
|
| 32 |
-
# checkpoint=None
|
| 33 |
-
# print(checkpoint)
|
| 34 |
-
if (inc_val<5):
|
| 35 |
-
model_name="highGRschool10"
|
| 36 |
-
elif(inc_val>=5 & inc_val<10):
|
| 37 |
-
model_name="highGRschool10"
|
| 38 |
else:
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
subprocess.run([
|
| 41 |
"python", "new_test_saved_finetuned_model.py",
|
| 42 |
"-workspace_name", "ratio_proportion_change3_2223/sch_largest_100-coded",
|
| 43 |
-
"-finetune_task",
|
| 44 |
-
|
| 45 |
# "-test_label_path","../../../../train_label.txt",
|
| 46 |
"-finetuned_bert_classifier_checkpoint",
|
| 47 |
"ratio_proportion_change3_2223/sch_largest_100-coded/output/highGRschool10/bert_fine_tuned.model.ep42",
|
|
@@ -77,12 +117,26 @@ def process_file(file,label,info,inc_val,progress=Progress(track_tqdm=True)):
|
|
| 77 |
progress(1.0)
|
| 78 |
# Prepare text output
|
| 79 |
text_output = f"Model: {model_name}\nResult:\n{result}"
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
return text_output,plot_path
|
| 82 |
|
| 83 |
# List of models for the dropdown menu
|
| 84 |
|
| 85 |
-
models = ["
|
| 86 |
|
| 87 |
# Create the Gradio interface
|
| 88 |
with gr.Blocks(css="""
|
|
@@ -275,10 +329,10 @@ tbody.svelte-18wv37q>tr.svelte-18wv37q:nth-child(odd) {
|
|
| 275 |
|
| 276 |
info_input = gr.File(label="Upload test info", file_types=['.txt'], elem_classes="file-box")
|
| 277 |
|
| 278 |
-
|
| 279 |
|
| 280 |
|
| 281 |
-
increment_slider = gr.Slider(minimum=
|
| 282 |
|
| 283 |
with gr.Row():
|
| 284 |
output_text = gr.Textbox(label="Output Text")
|
|
@@ -286,7 +340,7 @@ tbody.svelte-18wv37q>tr.svelte-18wv37q:nth-child(odd) {
|
|
| 286 |
|
| 287 |
btn = gr.Button("Submit")
|
| 288 |
|
| 289 |
-
btn.click(fn=process_file, inputs=[file_input,label_input,info_input,increment_slider], outputs=[output_text,output_image])
|
| 290 |
|
| 291 |
|
| 292 |
# Launch the app
|
|
|
|
| 7 |
import shutil
|
| 8 |
import matplotlib.pyplot as plt
|
| 9 |
from sklearn.metrics import roc_curve, auc
|
| 10 |
+
import pandas as pd
|
| 11 |
# Define the function to process the input file and model selection
|
| 12 |
|
| 13 |
+
def process_file(file,label,info,model_name,inc_slider,progress=Progress(track_tqdm=True)):
|
| 14 |
# progress = gr.Progress(track_tqdm=True)
|
| 15 |
progress(0, desc="Starting the processing")
|
| 16 |
with open(file.name, 'r') as f:
|
|
|
|
| 22 |
shutil.copyfile(file.name, saved_test_dataset)
|
| 23 |
shutil.copyfile(label.name, saved_test_label)
|
| 24 |
shutil.copyfile(info.name, saved_train_info)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# Load the test_info file and the graduation rate file
|
| 28 |
+
test_info = pd.read_csv('train_info.txt', sep=',', header=None, engine='python')
|
| 29 |
+
grad_rate_data = pd.DataFrame(pd.read_pickle('school_grduation_rate.pkl'),columns=['school_number','grad_rate']) # Load the grad_rate data
|
| 30 |
+
|
| 31 |
+
# Step 1: Extract unique school numbers from test_info
|
| 32 |
+
unique_schools = test_info[0].unique()
|
| 33 |
+
|
| 34 |
+
# Step 2: Filter the grad_rate_data using the unique school numbers
|
| 35 |
+
schools = grad_rate_data[grad_rate_data['school_number'].isin(unique_schools)]
|
| 36 |
+
|
| 37 |
+
# Step 3: Define a threshold for high and low graduation rates (adjust as needed)
|
| 38 |
+
grad_rate_threshold = 0.9
|
| 39 |
+
|
| 40 |
+
# Step 4: Divide schools into high and low graduation rate groups
|
| 41 |
+
high_grad_schools = schools[schools['grad_rate'] >= grad_rate_threshold]['school_number'].unique()
|
| 42 |
+
low_grad_schools = schools[schools['grad_rate'] < grad_rate_threshold]['school_number'].unique()
|
| 43 |
+
|
| 44 |
+
# Step 5: Sample percentage of schools from each group
|
| 45 |
+
high_sample = pd.Series(high_grad_schools).sample(frac=inc_slider/100, random_state=1).tolist()
|
| 46 |
+
low_sample = pd.Series(low_grad_schools).sample(frac=inc_slider/100, random_state=1).tolist()
|
| 47 |
+
|
| 48 |
+
# Step 6: Combine the sampled schools
|
| 49 |
+
random_schools = high_sample + low_sample
|
| 50 |
+
|
| 51 |
+
# Step 7: Get indices for the sampled schools
|
| 52 |
+
indices = test_info[test_info[0].isin(random_schools)].index.tolist()
|
| 53 |
+
|
| 54 |
+
# Load the test file and select rows based on indices
|
| 55 |
+
test = pd.read_csv('train.txt', sep=',', header=None, engine='python')
|
| 56 |
+
selected_rows_df2 = test.loc[indices]
|
| 57 |
+
|
| 58 |
+
# Save the selected rows to a file
|
| 59 |
+
selected_rows_df2.to_csv('selected_rows.txt', sep='\t', index=False, header=False, quoting=3, escapechar=' ')
|
| 60 |
+
|
| 61 |
+
|
| 62 |
# Map the model selected in the dropdown to its fine-tune task name
|
| 63 |
+
if(model_name=="High Graduated Schools"):
|
| 64 |
+
finetune_task="highGRschool10"
|
| 65 |
+
elif(model_name== "Low Graduated Schools" ):
|
| 66 |
+
finetune_task="highGRschool10"
|
| 67 |
+
elif(model_name=="Full Set"):
|
| 68 |
+
finetune_task="highGRschool10"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
else:
|
| 70 |
+
finetune_task=None
|
| 71 |
+
# print(checkpoint)
|
| 72 |
+
progress(0.1, desc="Files created and saved")
|
| 73 |
+
# if (inc_val<5):
|
| 74 |
+
# model_name="highGRschool10"
|
| 75 |
+
# elif(inc_val>=5 & inc_val<10):
|
| 76 |
+
# model_name="highGRschool10"
|
| 77 |
+
# else:
|
| 78 |
+
# model_name="highGRschool10"
|
| 79 |
+
progress(0.2, desc="Executing models")
|
| 80 |
subprocess.run([
|
| 81 |
"python", "new_test_saved_finetuned_model.py",
|
| 82 |
"-workspace_name", "ratio_proportion_change3_2223/sch_largest_100-coded",
|
| 83 |
+
"-finetune_task", "highGRschool10",
|
| 84 |
+
"-test_dataset_path","../../../../selected_rows.txt",
|
| 85 |
# "-test_label_path","../../../../train_label.txt",
|
| 86 |
"-finetuned_bert_classifier_checkpoint",
|
| 87 |
"ratio_proportion_change3_2223/sch_largest_100-coded/output/highGRschool10/bert_fine_tuned.model.ep42",
|
|
|
|
| 117 |
progress(1.0)
|
| 118 |
# Prepare text output
|
| 119 |
text_output = f"Model: {model_name}\nResult:\n{result}"
|
| 120 |
+
# Prepare text output with HTML formatting
|
| 121 |
+
text_output = f"""
|
| 122 |
+
Model: {model_name}\n
|
| 123 |
+
Result Summary:\n
|
| 124 |
+
-----------------\n
|
| 125 |
+
Average Loss: {result['avg_loss']:.4f}\n
|
| 126 |
+
Total Accuracy: {result['total_acc']:.2f}%\n
|
| 127 |
+
Precision: {result['precisions']:.2f}\n
|
| 128 |
+
Recall: {result['recalls']:.2f}\n
|
| 129 |
+
F1-Score: {result['f1_scores']:.2f}\n
|
| 130 |
+
Time Taken: {result['time_taken_from_start']:.2f} seconds\n
|
| 131 |
+
AUC Score: {result['auc_score']:.4f}\n
|
| 132 |
+
-----------------\n
|
| 133 |
+
Note: The ROC Curve is also displayed for the evaluation.
|
| 134 |
+
"""
|
| 135 |
return text_output,plot_path
|
| 136 |
|
| 137 |
# List of models for the dropdown menu
|
| 138 |
|
| 139 |
+
models = ["High Graduated Schools", "Low Graduated Schools", "Full Set"]
|
| 140 |
|
| 141 |
# Create the Gradio interface
|
| 142 |
with gr.Blocks(css="""
|
|
|
|
| 329 |
|
| 330 |
info_input = gr.File(label="Upload test info", file_types=['.txt'], elem_classes="file-box")
|
| 331 |
|
| 332 |
+
model_dropdown = gr.Dropdown(choices=models, label="Select Finetune Task", elem_classes="dropdown-menu")
|
| 333 |
|
| 334 |
|
| 335 |
+
increment_slider = gr.Slider(minimum=5, maximum=100, step=5, label="Schools Percentage", value=5)
|
| 336 |
|
| 337 |
with gr.Row():
|
| 338 |
output_text = gr.Textbox(label="Output Text")
|
|
|
|
| 340 |
|
| 341 |
btn = gr.Button("Submit")
|
| 342 |
|
| 343 |
+
btn.click(fn=process_file, inputs=[file_input,label_input,info_input,model_dropdown,increment_slider], outputs=[output_text,output_image])
|
| 344 |
|
| 345 |
|
| 346 |
# Launch the app
|
new_test_saved_finetuned_model.py
CHANGED
|
@@ -495,7 +495,7 @@ def train():
|
|
| 495 |
parser.add_argument("-hs", "--hidden", type=int, default=64, help="hidden size of transformer model") #64
|
| 496 |
parser.add_argument("-l", "--layers", type=int, default=4, help="number of layers") #4
|
| 497 |
parser.add_argument("-a", "--attn_heads", type=int, default=4, help="number of attention heads") #8
|
| 498 |
-
parser.add_argument("-s", "--seq_len", type=int, default=
|
| 499 |
|
| 500 |
parser.add_argument("-b", "--batch_size", type=int, default=500, help="number of batch_size") #64
|
| 501 |
parser.add_argument("-e", "--epochs", type=int, default=1)#1501, help="number of epochs") #501
|
|
@@ -508,7 +508,7 @@ def train():
|
|
| 508 |
# parser.add_argument("--corpus_lines", type=int, default=None, help="total number of lines in corpus")
|
| 509 |
parser.add_argument("--cuda_devices", type=int, nargs='+', default=None, help="CUDA device ids")
|
| 510 |
# parser.add_argument("--on_memory", type=bool, default=False, help="Loading on memory: true or false")
|
| 511 |
-
|
| 512 |
parser.add_argument("--dropout", type=float, default=0.1, help="dropout of network")
|
| 513 |
parser.add_argument("--lr", type=float, default=1e-05, help="learning rate of adam") #1e-3
|
| 514 |
parser.add_argument("--adam_weight_decay", type=float, default=0.01, help="weight_decay of adam")
|
|
|
|
| 495 |
parser.add_argument("-hs", "--hidden", type=int, default=64, help="hidden size of transformer model") #64
|
| 496 |
parser.add_argument("-l", "--layers", type=int, default=4, help="number of layers") #4
|
| 497 |
parser.add_argument("-a", "--attn_heads", type=int, default=4, help="number of attention heads") #8
|
| 498 |
+
parser.add_argument("-s", "--seq_len", type=int, default=128, help="maximum sequence length")
|
| 499 |
|
| 500 |
parser.add_argument("-b", "--batch_size", type=int, default=500, help="number of batch_size") #64
|
| 501 |
parser.add_argument("-e", "--epochs", type=int, default=1)#1501, help="number of epochs") #501
|
|
|
|
| 508 |
# parser.add_argument("--corpus_lines", type=int, default=None, help="total number of lines in corpus")
|
| 509 |
parser.add_argument("--cuda_devices", type=int, nargs='+', default=None, help="CUDA device ids")
|
| 510 |
# parser.add_argument("--on_memory", type=bool, default=False, help="Loading on memory: true or false")
|
| 511 |
+
|
| 512 |
parser.add_argument("--dropout", type=float, default=0.1, help="dropout of network")
|
| 513 |
parser.add_argument("--lr", type=float, default=1e-05, help="learning rate of adam") #1e-3
|
| 514 |
parser.add_argument("--adam_weight_decay", type=float, default=0.01, help="weight_decay of adam")
|
plot.png
CHANGED
|
|
result.txt
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
-
avg_loss: 0.
|
| 2 |
-
total_acc:
|
| 3 |
-
precisions: 0.
|
| 4 |
-
recalls: 0.
|
| 5 |
-
f1_scores: 0.
|
| 6 |
-
time_taken_from_start:
|
| 7 |
-
auc_score: 0.
|
|
|
|
| 1 |
+
avg_loss: 0.5631513595581055
|
| 2 |
+
total_acc: 69.7320542507443
|
| 3 |
+
precisions: 0.7236992960620143
|
| 4 |
+
recalls: 0.6973205425074429
|
| 5 |
+
f1_scores: 0.6879225873063946
|
| 6 |
+
time_taken_from_start: 73.04951095581055
|
| 7 |
+
auc_score: 0.7452296224317393
|
roc_data.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f4beb5de79dfb3592402832ced8db0c87f3264e46c0813553c40728c7ddafed5
|
| 3 |
+
size 29285
|
school_grduation_rate.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6c0c99dd8fc601de1fc8f4af5880bf71b7198c09bf0d016a880b02043e0b3d03
|
| 3 |
+
size 18356
|
selected_rows.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
train.txt
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
train_info.txt
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
test
|
|
|
|
|
|
train_label.txt
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|