Spaces:

suryadev1
/

astra

Sleeping

App Files Files Community

suryadev1 committed on Nov 25, 2024

Commit

2180f54

1 Parent(s): 09bc963

Remove large file train_info.txt

Browse files

Files changed (10) hide show

app.py +76 -22
new_test_saved_finetuned_model.py +2 -2
plot.png +0 -0
result.txt +7 -7
roc_data.pkl +3 -0
school_grduation_rate.pkl +3 -0
selected_rows.txt +0 -0
train.txt +0 -0
train_info.txt +0 -1
train_label.txt +0 -0

app.py CHANGED Viewed

@@ -7,9 +7,10 @@ import subprocess
 import shutil
 import matplotlib.pyplot as plt
 from sklearn.metrics import roc_curve, auc
 # Define the function to process the input file and model selection
-def process_file(file,label,info,inc_val,progress=Progress(track_tqdm=True)):
     # progress = gr.Progress(track_tqdm=True)
     progress(0, desc="Starting the processing")
     with open(file.name, 'r') as f:
@@ -21,27 +22,66 @@ def process_file(file,label,info,inc_val,progress=Progress(track_tqdm=True)):
     shutil.copyfile(file.name, saved_test_dataset)
     shutil.copyfile(label.name, saved_test_label)
     shutil.copyfile(info.name, saved_train_info)
     # For demonstration purposes, we'll just return the content with the selected model name
-    # if(model_name=="highGRschool10"):
-    #     checkpoint="ratio_proportion_change3/output/FS/bert_fine_tuned.model.ep32"
-    # elif(model_name=="lowGRschoolAll"):
-    #     checkpoint="ratio_proportion_change3/output/IS/bert_fine_tuned.model.ep14"
-    # elif(model_name=="fullTest"):
-    #     checkpoint="ratio_proportion_change3/output/correctness/bert_fine_tuned.model.ep48"
-    # else:
-    #     checkpoint=None
-    # print(checkpoint)
-    if (inc_val<5):
-        model_name="highGRschool10"
-    elif(inc_val>=5 & inc_val<10):
-        model_name="highGRschool10"
     else:
-        model_name="highGRschool10"
     subprocess.run([
         "python", "new_test_saved_finetuned_model.py",
         "-workspace_name", "ratio_proportion_change3_2223/sch_largest_100-coded",
-        "-finetune_task", model_name,
-        # "-test_dataset_path","../../../../train.txt",
         # "-test_label_path","../../../../train_label.txt",
         "-finetuned_bert_classifier_checkpoint",
         "ratio_proportion_change3_2223/sch_largest_100-coded/output/highGRschool10/bert_fine_tuned.model.ep42",
@@ -77,12 +117,26 @@ def process_file(file,label,info,inc_val,progress=Progress(track_tqdm=True)):
     progress(1.0)
     # Prepare text output
     text_output = f"Model: {model_name}\nResult:\n{result}"
     return text_output,plot_path
 # List of models for the dropdown menu
-models = ["highGRschool10", "lowGRschoolAll", "fullTest"]
 # Create the Gradio interface
 with gr.Blocks(css="""
@@ -275,10 +329,10 @@ tbody.svelte-18wv37q>tr.svelte-18wv37q:nth-child(odd) {
         info_input = gr.File(label="Upload test info", file_types=['.txt'], elem_classes="file-box")
-    # model_dropdown = gr.Dropdown(choices=models, label="Select Finetune Task", elem_classes="dropdown-menu")
-    increment_slider = gr.Slider(minimum=1, maximum=50, step=5, label="Schools number", value=1)
     with gr.Row():
         output_text = gr.Textbox(label="Output Text")
@@ -286,7 +340,7 @@ tbody.svelte-18wv37q>tr.svelte-18wv37q:nth-child(odd) {
     btn = gr.Button("Submit")
-    btn.click(fn=process_file, inputs=[file_input,label_input,info_input,increment_slider], outputs=[output_text,output_image])
 # Launch the app

 import shutil
 import matplotlib.pyplot as plt
 from sklearn.metrics import roc_curve, auc
+import pandas as pd
 # Define the function to process the input file and model selection
+def process_file(file,label,info,model_name,inc_slider,progress=Progress(track_tqdm=True)):
     # progress = gr.Progress(track_tqdm=True)
     progress(0, desc="Starting the processing")
     with open(file.name, 'r') as f:
     shutil.copyfile(file.name, saved_test_dataset)
     shutil.copyfile(label.name, saved_test_label)
     shutil.copyfile(info.name, saved_train_info)
+    # Load the test_info file and the graduation rate file
+    test_info = pd.read_csv('train_info.txt', sep=',', header=None, engine='python')
+    grad_rate_data = pd.DataFrame(pd.read_pickle('school_grduation_rate.pkl'),columns=['school_number','grad_rate'])  # Load the grad_rate data
+    # Step 1: Extract unique school numbers from test_info
+    unique_schools = test_info[0].unique()
+    # Step 2: Filter the grad_rate_data using the unique school numbers
+    schools = grad_rate_data[grad_rate_data['school_number'].isin(unique_schools)]
+    # Define a threshold for high and low graduation rates (adjust as needed)
+    grad_rate_threshold = 0.9
+    # Step 4: Divide schools into high and low graduation rate groups
+    high_grad_schools = schools[schools['grad_rate'] >= grad_rate_threshold]['school_number'].unique()
+    low_grad_schools = schools[schools['grad_rate'] < grad_rate_threshold]['school_number'].unique()
+    # Step 5: Sample percentage of schools from each group
+    high_sample = pd.Series(high_grad_schools).sample(frac=inc_slider/100, random_state=1).tolist()
+    low_sample = pd.Series(low_grad_schools).sample(frac=inc_slider/100, random_state=1).tolist()
+    # Step 6: Combine the sampled schools
+    random_schools = high_sample + low_sample
+    # Step 7: Get indices for the sampled schools
+    indices = test_info[test_info[0].isin(random_schools)].index.tolist()
+    # Load the test file and select rows based on indices
+    test = pd.read_csv('train.txt', sep=',', header=None, engine='python')
+    selected_rows_df2 = test.loc[indices]
+    # Save the selected rows to a file
+    selected_rows_df2.to_csv('selected_rows.txt', sep='\t', index=False, header=False, quoting=3, escapechar=' ')
     # For demonstration purposes, we'll just return the content with the selected model name
+    if(model_name=="High Graduated Schools"):
+        finetune_task="highGRschool10"
+    elif(model_name== "Low Graduated Schools" ):
+        finetune_task="highGRschool10"
+    elif(model_name=="Full Set"):
+        finetune_task="highGRschool10"
     else:
+        finetune_task=None
+    # print(checkpoint)
+    progress(0.1, desc="Files created and saved")
+    # if (inc_val<5):
+    #     model_name="highGRschool10"
+    # elif(inc_val>=5 & inc_val<10):
+    #     model_name="highGRschool10"
+    # else:
+    #     model_name="highGRschool10"
+    progress(0.2, desc="Executing models")
     subprocess.run([
         "python", "new_test_saved_finetuned_model.py",
         "-workspace_name", "ratio_proportion_change3_2223/sch_largest_100-coded",
+        "-finetune_task", "highGRschool10",
+        "-test_dataset_path","../../../../selected_rows.txt",
         # "-test_label_path","../../../../train_label.txt",
         "-finetuned_bert_classifier_checkpoint",
         "ratio_proportion_change3_2223/sch_largest_100-coded/output/highGRschool10/bert_fine_tuned.model.ep42",
     progress(1.0)
     # Prepare text output
     text_output = f"Model: {model_name}\nResult:\n{result}"
+    # Prepare text output with HTML formatting
+    text_output = f"""
+    Model: {model_name}\n
+    Result Summary:\n
+    -----------------\n
+    Average Loss: {result['avg_loss']:.4f}\n
+    Total Accuracy: {result['total_acc']:.2f}%\n
+    Precision: {result['precisions']:.2f}\n
+    Recall: {result['recalls']:.2f}\n
+    F1-Score: {result['f1_scores']:.2f}\n
+    Time Taken: {result['time_taken_from_start']:.2f} seconds\n
+    AUC Score: {result['auc_score']:.4f}\n
+    -----------------\n
+    Note: The ROC Curve is also displayed for the evaluation.
+    """
     return text_output,plot_path
 # List of models for the dropdown menu
+models = ["High Graduated Schools", "Low Graduated Schools", "Full Set"]
 # Create the Gradio interface
 with gr.Blocks(css="""
         info_input = gr.File(label="Upload test info", file_types=['.txt'], elem_classes="file-box")
+    model_dropdown = gr.Dropdown(choices=models, label="Select Finetune Task", elem_classes="dropdown-menu")
+    increment_slider = gr.Slider(minimum=5, maximum=100, step=5, label="Schools Percentage", value=5)
     with gr.Row():
         output_text = gr.Textbox(label="Output Text")
     btn = gr.Button("Submit")
+    btn.click(fn=process_file, inputs=[file_input,label_input,info_input,model_dropdown,increment_slider], outputs=[output_text,output_image])
 # Launch the app

new_test_saved_finetuned_model.py CHANGED Viewed

@@ -495,7 +495,7 @@ def train():
     parser.add_argument("-hs", "--hidden", type=int, default=64, help="hidden size of transformer model") #64
     parser.add_argument("-l", "--layers", type=int, default=4, help="number of layers") #4
     parser.add_argument("-a", "--attn_heads", type=int, default=4, help="number of attention heads") #8
-    parser.add_argument("-s", "--seq_len", type=int, default=5, help="maximum sequence length")
     parser.add_argument("-b", "--batch_size", type=int, default=500, help="number of batch_size") #64
     parser.add_argument("-e", "--epochs", type=int, default=1)#1501, help="number of epochs") #501
@@ -508,7 +508,7 @@ def train():
     # parser.add_argument("--corpus_lines", type=int, default=None, help="total number of lines in corpus")
     parser.add_argument("--cuda_devices", type=int, nargs='+', default=None, help="CUDA device ids")
     # parser.add_argument("--on_memory", type=bool, default=False, help="Loading on memory: true or false")
     parser.add_argument("--dropout", type=float, default=0.1, help="dropout of network")
     parser.add_argument("--lr", type=float, default=1e-05, help="learning rate of adam") #1e-3
     parser.add_argument("--adam_weight_decay", type=float, default=0.01, help="weight_decay of adam")

     parser.add_argument("-hs", "--hidden", type=int, default=64, help="hidden size of transformer model") #64
     parser.add_argument("-l", "--layers", type=int, default=4, help="number of layers") #4
     parser.add_argument("-a", "--attn_heads", type=int, default=4, help="number of attention heads") #8
+    parser.add_argument("-s", "--seq_len", type=int, default=128, help="maximum sequence length")
     parser.add_argument("-b", "--batch_size", type=int, default=500, help="number of batch_size") #64
     parser.add_argument("-e", "--epochs", type=int, default=1)#1501, help="number of epochs") #501
     # parser.add_argument("--corpus_lines", type=int, default=None, help="total number of lines in corpus")
     parser.add_argument("--cuda_devices", type=int, nargs='+', default=None, help="CUDA device ids")
     # parser.add_argument("--on_memory", type=bool, default=False, help="Loading on memory: true or false")
     parser.add_argument("--dropout", type=float, default=0.1, help="dropout of network")
     parser.add_argument("--lr", type=float, default=1e-05, help="learning rate of adam") #1e-3
     parser.add_argument("--adam_weight_decay", type=float, default=0.01, help="weight_decay of adam")

plot.png CHANGED Viewed

result.txt CHANGED Viewed

@@ -1,7 +1,7 @@
-avg_loss: 0.8249401861713046
-total_acc: 50.0
-precisions: 0.25
-recalls: 0.5
-f1_scores: 0.3333333333333333
-time_taken_from_start: 30.98168659210205
-auc_score: 0.7724651292107545

+avg_loss: 0.5631513595581055
+total_acc: 69.7320542507443
+precisions: 0.7236992960620143
+recalls: 0.6973205425074429
+f1_scores: 0.6879225873063946
+time_taken_from_start: 73.04951095581055
+auc_score: 0.7452296224317393

roc_data.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f4beb5de79dfb3592402832ced8db0c87f3264e46c0813553c40728c7ddafed5
+size 29285

school_grduation_rate.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6c0c99dd8fc601de1fc8f4af5880bf71b7198c09bf0d016a880b02043e0b3d03
+size 18356

selected_rows.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

train.txt CHANGED Viewed

The diff for this file is too large to render. See raw diff

train_info.txt DELETED Viewed

	@@ -1 +0,0 @@
1	- test

train_label.txt CHANGED Viewed

The diff for this file is too large to render. See raw diff