Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	ref model and tuple error fix
Browse files
    	
        app.py
    CHANGED
    
    | @@ -34,7 +34,6 @@ COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default an | |
| 34 | 
             
            TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
         | 
| 35 |  | 
| 36 | 
             
            # CONFIGURATION:
         | 
| 37 | 
            -
            ref_model = "mistralai/Mistral-7B-v0.1"
         | 
| 38 | 
             
            test_datasets = ["truthful_qa","cais/mmlu","ai2_arc","gsm8k","Rowan/hellaswag","winogrande"]
         | 
| 39 | 
             
            modelQueue = (pd.read_csv('data/queue.csv')).values.tolist()
         | 
| 40 | 
             
            print(modelQueue)
         | 
| @@ -48,11 +47,11 @@ def formatr(result): | |
| 48 | 
             
                result = result.replace(" ","")
         | 
| 49 | 
             
                return result
         | 
| 50 |  | 
| 51 | 
            -
            def save_to_txt(model, results, model_type):
         | 
| 52 | 
             
                file_path = "data/code_eval_board.csv"
         | 
| 53 |  | 
| 54 | 
             
                with open(file_path, "a") as f:
         | 
| 55 | 
            -
                    f.write(f"\n{model_type},{model}," + str(formatr(results["arc"])) + "," + str(formatr(results["hellaswag"])) + "," + str(formatr(results["mmlu"])) + "," + str(formatr(results["truthfulQA"])) + "," + str(formatr(results["winogrande"])) + "," + str(formatr(results["gsm8k"])))
         | 
| 56 | 
             
                    f.close()
         | 
| 57 |  | 
| 58 | 
             
            def run_test(model,ref_model,data):
         | 
| @@ -67,8 +66,7 @@ def run_test(model,ref_model,data): | |
| 67 | 
             
                            ratio_gen=0.4
         | 
| 68 | 
             
                        ) # Call the main function in detect-pretrain-code-contamination/src/run.py
         | 
| 69 |  | 
| 70 | 
            -
            def evaluate(model,model_type):
         | 
| 71 | 
            -
                global ref_model
         | 
| 72 | 
             
                print(f"|| EVALUATING {model} ||")
         | 
| 73 | 
             
                results = {
         | 
| 74 | 
             
                    "arc": run_test(model, ref_model, test_datasets[2]),
         | 
| @@ -81,14 +79,14 @@ def evaluate(model,model_type): | |
| 81 | 
             
                }
         | 
| 82 |  | 
| 83 | 
             
                # Save to .txt file in /Evaluations/{model}
         | 
| 84 | 
            -
                save_to_txt(model, results, model_type)
         | 
| 85 | 
             
                return "\n".join([f"{k}:{results[k]}" for k in results])
         | 
| 86 |  | 
| 87 | 
             
            def worker_thread():
         | 
| 88 | 
             
                global modelQueue, server
         | 
| 89 | 
             
                while True:
         | 
| 90 | 
             
                    for submission in modelQueue:
         | 
| 91 | 
            -
                        #evaluate(submission[1],submission[0].split(" ")[0])
         | 
| 92 | 
             
                        #modelQueue.pop(modelQueue.index(submission))
         | 
| 93 |  | 
| 94 | 
             
                        # Uncomment those lines in order to begin testing, I test these models outside of this space and later commit the results back.
         | 
| @@ -110,6 +108,12 @@ def queue(model,model_type,ref_model): | |
| 110 | 
             
                    f.write(f"\n{model_type},{model},{ref_model}")
         | 
| 111 | 
             
                    f.close()
         | 
| 112 | 
             
                print(f"QUEUE:\n{modelQueue}")
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 113 |  | 
| 114 |  | 
| 115 | 
             
            ### bigcode/bigcode-models-leaderboard
         | 
|  | |
| 34 | 
             
            TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
         | 
| 35 |  | 
| 36 | 
             
            # CONFIGURATION:
         | 
|  | |
| 37 | 
             
            test_datasets = ["truthful_qa","cais/mmlu","ai2_arc","gsm8k","Rowan/hellaswag","winogrande"]
         | 
| 38 | 
             
            modelQueue = (pd.read_csv('data/queue.csv')).values.tolist()
         | 
| 39 | 
             
            print(modelQueue)
         | 
|  | |
| 47 | 
             
                result = result.replace(" ","")
         | 
| 48 | 
             
                return result
         | 
| 49 |  | 
| 50 | 
            +
            def save_to_txt(model, results, model_type,ref_model):
         | 
| 51 | 
             
                file_path = "data/code_eval_board.csv"
         | 
| 52 |  | 
| 53 | 
             
                with open(file_path, "a") as f:
         | 
| 54 | 
            +
                    f.write(f"\n{model_type},{model}," + str(formatr(results["arc"])) + "," + str(formatr(results["hellaswag"])) + "," + str(formatr(results["mmlu"])) + "," + str(formatr(results["truthfulQA"])) + "," + str(formatr(results["winogrande"])) + "," + str(formatr(results["gsm8k"])) + f",{ref_model}")
         | 
| 55 | 
             
                    f.close()
         | 
| 56 |  | 
| 57 | 
             
            def run_test(model,ref_model,data):
         | 
|  | |
| 66 | 
             
                            ratio_gen=0.4
         | 
| 67 | 
             
                        ) # Call the main function in detect-pretrain-code-contamination/src/run.py
         | 
| 68 |  | 
| 69 | 
            +
            def evaluate(model,model_type,ref_model):
         | 
|  | |
| 70 | 
             
                print(f"|| EVALUATING {model} ||")
         | 
| 71 | 
             
                results = {
         | 
| 72 | 
             
                    "arc": run_test(model, ref_model, test_datasets[2]),
         | 
|  | |
| 79 | 
             
                }
         | 
| 80 |  | 
| 81 | 
             
                # Save to .txt file in /Evaluations/{model}
         | 
| 82 | 
            +
                save_to_txt(model, results, model_type,ref_model)
         | 
| 83 | 
             
                return "\n".join([f"{k}:{results[k]}" for k in results])
         | 
| 84 |  | 
| 85 | 
             
            def worker_thread():
         | 
| 86 | 
             
                global modelQueue, server
         | 
| 87 | 
             
                while True:
         | 
| 88 | 
             
                    for submission in modelQueue:
         | 
| 89 | 
            +
                        #evaluate(submission[1],submission[0].split(" ")[0],submission[2])
         | 
| 90 | 
             
                        #modelQueue.pop(modelQueue.index(submission))
         | 
| 91 |  | 
| 92 | 
             
                        # Uncomment those lines in order to begin testing, I test these models outside of this space and later commit the results back.
         | 
|  | |
| 108 | 
             
                    f.write(f"\n{model_type},{model},{ref_model}")
         | 
| 109 | 
             
                    f.close()
         | 
| 110 | 
             
                print(f"QUEUE:\n{modelQueue}")
         | 
| 111 | 
            +
                
         | 
| 112 | 
            +
                eval_entry = {
         | 
| 113 | 
            +
                    "model": model,
         | 
| 114 | 
            +
                    "model_type": model_type,
         | 
| 115 | 
            +
                    "ref_model": ref_model,
         | 
| 116 | 
            +
                }
         | 
| 117 |  | 
| 118 |  | 
| 119 | 
             
            ### bigcode/bigcode-models-leaderboard
         | 
