Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
update dockerfile
Browse files
- Dockerfile +2 -5
- backend/data/lighteval_results/lighteval_results.json +10 -10
- backend/data/lighteval_results/results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/results_2025-03-28T12-11-27.855994.json +121 -0
- backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-11-45.632754.json +121 -0
- backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-14-14.765643.json +121 -0
- backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-17-34.971563.json +121 -0
- backend/tasks/evaluationTask.py +2 -2
- backend/tasks/get_model_providers.py +1 -1
Dockerfile
CHANGED
|
@@ -32,17 +32,14 @@ RUN useradd -m -u 1000 user
|
|
| 32 |
RUN mkdir -p /app/.cache && \
|
| 33 |
chown -R user:user /app
|
| 34 |
|
| 35 |
-
# Copy backend
|
| 36 |
-
COPY backend/
|
| 37 |
|
| 38 |
# Install all dependencies explicitly
|
| 39 |
RUN pip install fastapi uvicorn
|
| 40 |
# Install project dependencies
|
| 41 |
RUN uv pip install -e . --system
|
| 42 |
|
| 43 |
-
# Copy backend code
|
| 44 |
-
COPY backend/ .
|
| 45 |
-
|
| 46 |
# Copy frontend server and build
|
| 47 |
COPY --from=frontend-build /app/build ./frontend/build
|
| 48 |
COPY --from=frontend-build /app/package*.json ./frontend/
|
|
|
|
| 32 |
RUN mkdir -p /app/.cache && \
|
| 33 |
chown -R user:user /app
|
| 34 |
|
| 35 |
+
# Copy all backend code first
|
| 36 |
+
COPY backend/ .
|
| 37 |
|
| 38 |
# Install all dependencies explicitly
|
| 39 |
RUN pip install fastapi uvicorn
|
| 40 |
# Install project dependencies
|
| 41 |
RUN uv pip install -e . --system
|
| 42 |
|
|
|
|
|
|
|
|
|
|
| 43 |
# Copy frontend server and build
|
| 44 |
COPY --from=frontend-build /app/build ./frontend/build
|
| 45 |
COPY --from=frontend-build /app/package*.json ./frontend/
|
backend/data/lighteval_results/lighteval_results.json
CHANGED
|
@@ -1,27 +1,27 @@
|
|
| 1 |
[
|
| 2 |
-
{
|
| 3 |
-
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
|
| 4 |
-
"provider": "sambanova",
|
| 5 |
-
"accuracy": 1.0,
|
| 6 |
-
"execution_time": 18.800472021102905,
|
| 7 |
-
"status": "success"
|
| 8 |
-
},
|
| 9 |
{
|
| 10 |
"model": "deepseek-ai/DeepSeek-V3-0324",
|
| 11 |
"provider": "novita",
|
| 12 |
"accuracy": 1.0,
|
| 13 |
-
"execution_time":
|
| 14 |
"status": "success"
|
| 15 |
},
|
| 16 |
{
|
| 17 |
-
"model": "Qwen/
|
| 18 |
"provider": "sambanova",
|
| 19 |
"accuracy": 0.0,
|
| 20 |
"execution_time": 60.0,
|
| 21 |
"status": "timeout"
|
| 22 |
},
|
| 23 |
{
|
| 24 |
-
"model": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
"provider": "sambanova",
|
| 26 |
"accuracy": 0.0,
|
| 27 |
"execution_time": 60.0,
|
|
|
|
| 1 |
[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
{
|
| 3 |
"model": "deepseek-ai/DeepSeek-V3-0324",
|
| 4 |
"provider": "novita",
|
| 5 |
"accuracy": 1.0,
|
| 6 |
+
"execution_time": 54.32098197937012,
|
| 7 |
"status": "success"
|
| 8 |
},
|
| 9 |
{
|
| 10 |
+
"model": "Qwen/QwQ-32B",
|
| 11 |
"provider": "sambanova",
|
| 12 |
"accuracy": 0.0,
|
| 13 |
"execution_time": 60.0,
|
| 14 |
"status": "timeout"
|
| 15 |
},
|
| 16 |
{
|
| 17 |
+
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
|
| 18 |
+
"provider": "sambanova",
|
| 19 |
+
"accuracy": 0.0,
|
| 20 |
+
"execution_time": 60.0,
|
| 21 |
+
"status": "timeout"
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"model": "Qwen/Qwen2.5-72B-Instruct",
|
| 25 |
"provider": "sambanova",
|
| 26 |
"accuracy": 0.0,
|
| 27 |
"execution_time": 60.0,
|
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/results_2025-03-28T12-11-27.855994.json
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "?",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"override_batch_size": null,
|
| 6 |
+
"max_samples": 15,
|
| 7 |
+
"job_id": 0,
|
| 8 |
+
"start_time": 190861.972782125,
|
| 9 |
+
"end_time": 190876.962226916,
|
| 10 |
+
"total_evaluation_time_secondes": "14.989444790990092",
|
| 11 |
+
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
|
| 12 |
+
"model_sha": "",
|
| 13 |
+
"model_dtype": null,
|
| 14 |
+
"model_size": "",
|
| 15 |
+
"generation_parameters": {
|
| 16 |
+
"early_stopping": null,
|
| 17 |
+
"repetition_penalty": null,
|
| 18 |
+
"frequency_penalty": null,
|
| 19 |
+
"length_penalty": null,
|
| 20 |
+
"presence_penalty": null,
|
| 21 |
+
"max_new_tokens": null,
|
| 22 |
+
"min_new_tokens": null,
|
| 23 |
+
"seed": null,
|
| 24 |
+
"stop_tokens": null,
|
| 25 |
+
"temperature": null,
|
| 26 |
+
"top_k": null,
|
| 27 |
+
"min_p": null,
|
| 28 |
+
"top_p": null,
|
| 29 |
+
"truncate_prompt": null,
|
| 30 |
+
"response_format": null
|
| 31 |
+
}
|
| 32 |
+
},
|
| 33 |
+
"results": {
|
| 34 |
+
"custom|yourbench|0": {
|
| 35 |
+
"accuracy": 1.0,
|
| 36 |
+
"accuracy_stderr": 0.0
|
| 37 |
+
},
|
| 38 |
+
"all": {
|
| 39 |
+
"accuracy": 1.0,
|
| 40 |
+
"accuracy_stderr": 0.0
|
| 41 |
+
}
|
| 42 |
+
},
|
| 43 |
+
"versions": {
|
| 44 |
+
"custom|yourbench|0": 0
|
| 45 |
+
},
|
| 46 |
+
"config_tasks": {
|
| 47 |
+
"custom|yourbench": {
|
| 48 |
+
"name": "yourbench",
|
| 49 |
+
"prompt_function": "yourbench_prompt",
|
| 50 |
+
"hf_repo": "yourbench/yourbench_d61a6289-9f2e-4138-b01a-63c43c5daf0b",
|
| 51 |
+
"hf_subset": "multi_hop_questions",
|
| 52 |
+
"metric": [
|
| 53 |
+
{
|
| 54 |
+
"metric_name": [
|
| 55 |
+
"accuracy"
|
| 56 |
+
],
|
| 57 |
+
"higher_is_better": {
|
| 58 |
+
"accuracy": true
|
| 59 |
+
},
|
| 60 |
+
"category": "7",
|
| 61 |
+
"use_case": "1",
|
| 62 |
+
"sample_level_fn": "compute",
|
| 63 |
+
"corpus_level_fn": {
|
| 64 |
+
"accuracy": "mean"
|
| 65 |
+
}
|
| 66 |
+
}
|
| 67 |
+
],
|
| 68 |
+
"hf_revision": null,
|
| 69 |
+
"hf_filter": null,
|
| 70 |
+
"hf_avail_splits": [
|
| 71 |
+
"train"
|
| 72 |
+
],
|
| 73 |
+
"trust_dataset": true,
|
| 74 |
+
"evaluation_splits": [
|
| 75 |
+
"train"
|
| 76 |
+
],
|
| 77 |
+
"few_shots_split": null,
|
| 78 |
+
"few_shots_select": null,
|
| 79 |
+
"generation_size": 8192,
|
| 80 |
+
"generation_grammar": null,
|
| 81 |
+
"stop_sequence": [],
|
| 82 |
+
"num_samples": null,
|
| 83 |
+
"suite": [
|
| 84 |
+
"custom"
|
| 85 |
+
],
|
| 86 |
+
"original_num_docs": 34,
|
| 87 |
+
"effective_num_docs": 15,
|
| 88 |
+
"must_remove_duplicate_docs": false,
|
| 89 |
+
"version": 0
|
| 90 |
+
}
|
| 91 |
+
},
|
| 92 |
+
"summary_tasks": {
|
| 93 |
+
"custom|yourbench|0": {
|
| 94 |
+
"hashes": {
|
| 95 |
+
"hash_examples": "97803694d4430d2d",
|
| 96 |
+
"hash_full_prompts": "3125bcda69618d2b",
|
| 97 |
+
"hash_input_tokens": "58ec870775e406f3",
|
| 98 |
+
"hash_cont_tokens": "58ec870775e406f3"
|
| 99 |
+
},
|
| 100 |
+
"truncated": 0,
|
| 101 |
+
"non_truncated": 15,
|
| 102 |
+
"padded": 0,
|
| 103 |
+
"non_padded": 15,
|
| 104 |
+
"effective_few_shots": 0.0,
|
| 105 |
+
"num_truncated_few_shots": 0
|
| 106 |
+
}
|
| 107 |
+
},
|
| 108 |
+
"summary_general": {
|
| 109 |
+
"hashes": {
|
| 110 |
+
"hash_examples": "13a4051f728a0e87",
|
| 111 |
+
"hash_full_prompts": "e18b288370ab6ae2",
|
| 112 |
+
"hash_input_tokens": "544d800a25dfd777",
|
| 113 |
+
"hash_cont_tokens": "544d800a25dfd777"
|
| 114 |
+
},
|
| 115 |
+
"truncated": 0,
|
| 116 |
+
"non_truncated": 15,
|
| 117 |
+
"padded": 0,
|
| 118 |
+
"non_padded": 15,
|
| 119 |
+
"num_truncated_few_shots": 0
|
| 120 |
+
}
|
| 121 |
+
}
|
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-11-45.632754.json
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "?",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"override_batch_size": null,
|
| 6 |
+
"max_samples": 15,
|
| 7 |
+
"job_id": 0,
|
| 8 |
+
"start_time": 190861.972804458,
|
| 9 |
+
"end_time": 190894.739973125,
|
| 10 |
+
"total_evaluation_time_secondes": "32.7671686669928",
|
| 11 |
+
"model_name": "deepseek-ai/DeepSeek-V3-0324",
|
| 12 |
+
"model_sha": "",
|
| 13 |
+
"model_dtype": null,
|
| 14 |
+
"model_size": "",
|
| 15 |
+
"generation_parameters": {
|
| 16 |
+
"early_stopping": null,
|
| 17 |
+
"repetition_penalty": null,
|
| 18 |
+
"frequency_penalty": null,
|
| 19 |
+
"length_penalty": null,
|
| 20 |
+
"presence_penalty": null,
|
| 21 |
+
"max_new_tokens": null,
|
| 22 |
+
"min_new_tokens": null,
|
| 23 |
+
"seed": null,
|
| 24 |
+
"stop_tokens": null,
|
| 25 |
+
"temperature": null,
|
| 26 |
+
"top_k": null,
|
| 27 |
+
"min_p": null,
|
| 28 |
+
"top_p": null,
|
| 29 |
+
"truncate_prompt": null,
|
| 30 |
+
"response_format": null
|
| 31 |
+
}
|
| 32 |
+
},
|
| 33 |
+
"results": {
|
| 34 |
+
"custom|yourbench|0": {
|
| 35 |
+
"accuracy": 1.0,
|
| 36 |
+
"accuracy_stderr": 0.0
|
| 37 |
+
},
|
| 38 |
+
"all": {
|
| 39 |
+
"accuracy": 1.0,
|
| 40 |
+
"accuracy_stderr": 0.0
|
| 41 |
+
}
|
| 42 |
+
},
|
| 43 |
+
"versions": {
|
| 44 |
+
"custom|yourbench|0": 0
|
| 45 |
+
},
|
| 46 |
+
"config_tasks": {
|
| 47 |
+
"custom|yourbench": {
|
| 48 |
+
"name": "yourbench",
|
| 49 |
+
"prompt_function": "yourbench_prompt",
|
| 50 |
+
"hf_repo": "yourbench/yourbench_d61a6289-9f2e-4138-b01a-63c43c5daf0b",
|
| 51 |
+
"hf_subset": "multi_hop_questions",
|
| 52 |
+
"metric": [
|
| 53 |
+
{
|
| 54 |
+
"metric_name": [
|
| 55 |
+
"accuracy"
|
| 56 |
+
],
|
| 57 |
+
"higher_is_better": {
|
| 58 |
+
"accuracy": true
|
| 59 |
+
},
|
| 60 |
+
"category": "7",
|
| 61 |
+
"use_case": "1",
|
| 62 |
+
"sample_level_fn": "compute",
|
| 63 |
+
"corpus_level_fn": {
|
| 64 |
+
"accuracy": "mean"
|
| 65 |
+
}
|
| 66 |
+
}
|
| 67 |
+
],
|
| 68 |
+
"hf_revision": null,
|
| 69 |
+
"hf_filter": null,
|
| 70 |
+
"hf_avail_splits": [
|
| 71 |
+
"train"
|
| 72 |
+
],
|
| 73 |
+
"trust_dataset": true,
|
| 74 |
+
"evaluation_splits": [
|
| 75 |
+
"train"
|
| 76 |
+
],
|
| 77 |
+
"few_shots_split": null,
|
| 78 |
+
"few_shots_select": null,
|
| 79 |
+
"generation_size": 8192,
|
| 80 |
+
"generation_grammar": null,
|
| 81 |
+
"stop_sequence": [],
|
| 82 |
+
"num_samples": null,
|
| 83 |
+
"suite": [
|
| 84 |
+
"custom"
|
| 85 |
+
],
|
| 86 |
+
"original_num_docs": 34,
|
| 87 |
+
"effective_num_docs": 15,
|
| 88 |
+
"must_remove_duplicate_docs": false,
|
| 89 |
+
"version": 0
|
| 90 |
+
}
|
| 91 |
+
},
|
| 92 |
+
"summary_tasks": {
|
| 93 |
+
"custom|yourbench|0": {
|
| 94 |
+
"hashes": {
|
| 95 |
+
"hash_examples": "97803694d4430d2d",
|
| 96 |
+
"hash_full_prompts": "3125bcda69618d2b",
|
| 97 |
+
"hash_input_tokens": "58ec870775e406f3",
|
| 98 |
+
"hash_cont_tokens": "58ec870775e406f3"
|
| 99 |
+
},
|
| 100 |
+
"truncated": 0,
|
| 101 |
+
"non_truncated": 15,
|
| 102 |
+
"padded": 0,
|
| 103 |
+
"non_padded": 15,
|
| 104 |
+
"effective_few_shots": 0.0,
|
| 105 |
+
"num_truncated_few_shots": 0
|
| 106 |
+
}
|
| 107 |
+
},
|
| 108 |
+
"summary_general": {
|
| 109 |
+
"hashes": {
|
| 110 |
+
"hash_examples": "13a4051f728a0e87",
|
| 111 |
+
"hash_full_prompts": "e18b288370ab6ae2",
|
| 112 |
+
"hash_input_tokens": "544d800a25dfd777",
|
| 113 |
+
"hash_cont_tokens": "544d800a25dfd777"
|
| 114 |
+
},
|
| 115 |
+
"truncated": 0,
|
| 116 |
+
"non_truncated": 15,
|
| 117 |
+
"padded": 0,
|
| 118 |
+
"non_padded": 15,
|
| 119 |
+
"num_truncated_few_shots": 0
|
| 120 |
+
}
|
| 121 |
+
}
|
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-14-14.765643.json
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "?",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"override_batch_size": null,
|
| 6 |
+
"max_samples": 30,
|
| 7 |
+
"job_id": 0,
|
| 8 |
+
"start_time": 190994.241279791,
|
| 9 |
+
"end_time": 191043.871577458,
|
| 10 |
+
"total_evaluation_time_secondes": "49.63029766699765",
|
| 11 |
+
"model_name": "deepseek-ai/DeepSeek-V3-0324",
|
| 12 |
+
"model_sha": "",
|
| 13 |
+
"model_dtype": null,
|
| 14 |
+
"model_size": "",
|
| 15 |
+
"generation_parameters": {
|
| 16 |
+
"early_stopping": null,
|
| 17 |
+
"repetition_penalty": null,
|
| 18 |
+
"frequency_penalty": null,
|
| 19 |
+
"length_penalty": null,
|
| 20 |
+
"presence_penalty": null,
|
| 21 |
+
"max_new_tokens": null,
|
| 22 |
+
"min_new_tokens": null,
|
| 23 |
+
"seed": null,
|
| 24 |
+
"stop_tokens": null,
|
| 25 |
+
"temperature": null,
|
| 26 |
+
"top_k": null,
|
| 27 |
+
"min_p": null,
|
| 28 |
+
"top_p": null,
|
| 29 |
+
"truncate_prompt": null,
|
| 30 |
+
"response_format": null
|
| 31 |
+
}
|
| 32 |
+
},
|
| 33 |
+
"results": {
|
| 34 |
+
"custom|yourbench|0": {
|
| 35 |
+
"accuracy": 1.0,
|
| 36 |
+
"accuracy_stderr": 0.0
|
| 37 |
+
},
|
| 38 |
+
"all": {
|
| 39 |
+
"accuracy": 1.0,
|
| 40 |
+
"accuracy_stderr": 0.0
|
| 41 |
+
}
|
| 42 |
+
},
|
| 43 |
+
"versions": {
|
| 44 |
+
"custom|yourbench|0": 0
|
| 45 |
+
},
|
| 46 |
+
"config_tasks": {
|
| 47 |
+
"custom|yourbench": {
|
| 48 |
+
"name": "yourbench",
|
| 49 |
+
"prompt_function": "yourbench_prompt",
|
| 50 |
+
"hf_repo": "yourbench/yourbench_d61a6289-9f2e-4138-b01a-63c43c5daf0b",
|
| 51 |
+
"hf_subset": "multi_hop_questions",
|
| 52 |
+
"metric": [
|
| 53 |
+
{
|
| 54 |
+
"metric_name": [
|
| 55 |
+
"accuracy"
|
| 56 |
+
],
|
| 57 |
+
"higher_is_better": {
|
| 58 |
+
"accuracy": true
|
| 59 |
+
},
|
| 60 |
+
"category": "7",
|
| 61 |
+
"use_case": "1",
|
| 62 |
+
"sample_level_fn": "compute",
|
| 63 |
+
"corpus_level_fn": {
|
| 64 |
+
"accuracy": "mean"
|
| 65 |
+
}
|
| 66 |
+
}
|
| 67 |
+
],
|
| 68 |
+
"hf_revision": null,
|
| 69 |
+
"hf_filter": null,
|
| 70 |
+
"hf_avail_splits": [
|
| 71 |
+
"train"
|
| 72 |
+
],
|
| 73 |
+
"trust_dataset": true,
|
| 74 |
+
"evaluation_splits": [
|
| 75 |
+
"train"
|
| 76 |
+
],
|
| 77 |
+
"few_shots_split": null,
|
| 78 |
+
"few_shots_select": null,
|
| 79 |
+
"generation_size": 8192,
|
| 80 |
+
"generation_grammar": null,
|
| 81 |
+
"stop_sequence": [],
|
| 82 |
+
"num_samples": null,
|
| 83 |
+
"suite": [
|
| 84 |
+
"custom"
|
| 85 |
+
],
|
| 86 |
+
"original_num_docs": 34,
|
| 87 |
+
"effective_num_docs": 30,
|
| 88 |
+
"must_remove_duplicate_docs": false,
|
| 89 |
+
"version": 0
|
| 90 |
+
}
|
| 91 |
+
},
|
| 92 |
+
"summary_tasks": {
|
| 93 |
+
"custom|yourbench|0": {
|
| 94 |
+
"hashes": {
|
| 95 |
+
"hash_examples": "1b5afc5f13827f79",
|
| 96 |
+
"hash_full_prompts": "cd8c39c007643835",
|
| 97 |
+
"hash_input_tokens": "79ab129e9a18c6d6",
|
| 98 |
+
"hash_cont_tokens": "79ab129e9a18c6d6"
|
| 99 |
+
},
|
| 100 |
+
"truncated": 0,
|
| 101 |
+
"non_truncated": 30,
|
| 102 |
+
"padded": 0,
|
| 103 |
+
"non_padded": 30,
|
| 104 |
+
"effective_few_shots": 0.0,
|
| 105 |
+
"num_truncated_few_shots": 0
|
| 106 |
+
}
|
| 107 |
+
},
|
| 108 |
+
"summary_general": {
|
| 109 |
+
"hashes": {
|
| 110 |
+
"hash_examples": "b18e19e266a5bc51",
|
| 111 |
+
"hash_full_prompts": "1eaa15cbc4a17d04",
|
| 112 |
+
"hash_input_tokens": "05a66e44e190c178",
|
| 113 |
+
"hash_cont_tokens": "05a66e44e190c178"
|
| 114 |
+
},
|
| 115 |
+
"truncated": 0,
|
| 116 |
+
"non_truncated": 30,
|
| 117 |
+
"padded": 0,
|
| 118 |
+
"non_padded": 30,
|
| 119 |
+
"num_truncated_few_shots": 0
|
| 120 |
+
}
|
| 121 |
+
}
|
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-17-34.971563.json
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "?",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"override_batch_size": null,
|
| 6 |
+
"max_samples": 30,
|
| 7 |
+
"job_id": 0,
|
| 8 |
+
"start_time": 191195.945968041,
|
| 9 |
+
"end_time": 191244.057571,
|
| 10 |
+
"total_evaluation_time_secondes": "48.111602959019365",
|
| 11 |
+
"model_name": "deepseek-ai/DeepSeek-V3-0324",
|
| 12 |
+
"model_sha": "",
|
| 13 |
+
"model_dtype": null,
|
| 14 |
+
"model_size": "",
|
| 15 |
+
"generation_parameters": {
|
| 16 |
+
"early_stopping": null,
|
| 17 |
+
"repetition_penalty": null,
|
| 18 |
+
"frequency_penalty": null,
|
| 19 |
+
"length_penalty": null,
|
| 20 |
+
"presence_penalty": null,
|
| 21 |
+
"max_new_tokens": null,
|
| 22 |
+
"min_new_tokens": null,
|
| 23 |
+
"seed": null,
|
| 24 |
+
"stop_tokens": null,
|
| 25 |
+
"temperature": null,
|
| 26 |
+
"top_k": null,
|
| 27 |
+
"min_p": null,
|
| 28 |
+
"top_p": null,
|
| 29 |
+
"truncate_prompt": null,
|
| 30 |
+
"response_format": null
|
| 31 |
+
}
|
| 32 |
+
},
|
| 33 |
+
"results": {
|
| 34 |
+
"custom|yourbench|0": {
|
| 35 |
+
"accuracy": 1.0,
|
| 36 |
+
"accuracy_stderr": 0.0
|
| 37 |
+
},
|
| 38 |
+
"all": {
|
| 39 |
+
"accuracy": 1.0,
|
| 40 |
+
"accuracy_stderr": 0.0
|
| 41 |
+
}
|
| 42 |
+
},
|
| 43 |
+
"versions": {
|
| 44 |
+
"custom|yourbench|0": 0
|
| 45 |
+
},
|
| 46 |
+
"config_tasks": {
|
| 47 |
+
"custom|yourbench": {
|
| 48 |
+
"name": "yourbench",
|
| 49 |
+
"prompt_function": "yourbench_prompt",
|
| 50 |
+
"hf_repo": "yourbench/yourbench_d61a6289-9f2e-4138-b01a-63c43c5daf0b",
|
| 51 |
+
"hf_subset": "multi_hop_questions",
|
| 52 |
+
"metric": [
|
| 53 |
+
{
|
| 54 |
+
"metric_name": [
|
| 55 |
+
"accuracy"
|
| 56 |
+
],
|
| 57 |
+
"higher_is_better": {
|
| 58 |
+
"accuracy": true
|
| 59 |
+
},
|
| 60 |
+
"category": "7",
|
| 61 |
+
"use_case": "1",
|
| 62 |
+
"sample_level_fn": "compute",
|
| 63 |
+
"corpus_level_fn": {
|
| 64 |
+
"accuracy": "mean"
|
| 65 |
+
}
|
| 66 |
+
}
|
| 67 |
+
],
|
| 68 |
+
"hf_revision": null,
|
| 69 |
+
"hf_filter": null,
|
| 70 |
+
"hf_avail_splits": [
|
| 71 |
+
"train"
|
| 72 |
+
],
|
| 73 |
+
"trust_dataset": true,
|
| 74 |
+
"evaluation_splits": [
|
| 75 |
+
"train"
|
| 76 |
+
],
|
| 77 |
+
"few_shots_split": null,
|
| 78 |
+
"few_shots_select": null,
|
| 79 |
+
"generation_size": 8192,
|
| 80 |
+
"generation_grammar": null,
|
| 81 |
+
"stop_sequence": [],
|
| 82 |
+
"num_samples": null,
|
| 83 |
+
"suite": [
|
| 84 |
+
"custom"
|
| 85 |
+
],
|
| 86 |
+
"original_num_docs": 34,
|
| 87 |
+
"effective_num_docs": 30,
|
| 88 |
+
"must_remove_duplicate_docs": false,
|
| 89 |
+
"version": 0
|
| 90 |
+
}
|
| 91 |
+
},
|
| 92 |
+
"summary_tasks": {
|
| 93 |
+
"custom|yourbench|0": {
|
| 94 |
+
"hashes": {
|
| 95 |
+
"hash_examples": "1b5afc5f13827f79",
|
| 96 |
+
"hash_full_prompts": "cd8c39c007643835",
|
| 97 |
+
"hash_input_tokens": "79ab129e9a18c6d6",
|
| 98 |
+
"hash_cont_tokens": "79ab129e9a18c6d6"
|
| 99 |
+
},
|
| 100 |
+
"truncated": 0,
|
| 101 |
+
"non_truncated": 30,
|
| 102 |
+
"padded": 0,
|
| 103 |
+
"non_padded": 30,
|
| 104 |
+
"effective_few_shots": 0.0,
|
| 105 |
+
"num_truncated_few_shots": 0
|
| 106 |
+
}
|
| 107 |
+
},
|
| 108 |
+
"summary_general": {
|
| 109 |
+
"hashes": {
|
| 110 |
+
"hash_examples": "b18e19e266a5bc51",
|
| 111 |
+
"hash_full_prompts": "1eaa15cbc4a17d04",
|
| 112 |
+
"hash_input_tokens": "05a66e44e190c178",
|
| 113 |
+
"hash_cont_tokens": "05a66e44e190c178"
|
| 114 |
+
},
|
| 115 |
+
"truncated": 0,
|
| 116 |
+
"non_truncated": 30,
|
| 117 |
+
"padded": 0,
|
| 118 |
+
"non_padded": 30,
|
| 119 |
+
"num_truncated_few_shots": 0
|
| 120 |
+
}
|
| 121 |
+
}
|
backend/tasks/evaluationTask.py
CHANGED
|
@@ -71,7 +71,7 @@ class EvaluationTask:
|
|
| 71 |
from lighteval_task.lighteval_task import create_yourbench_task
|
| 72 |
|
| 73 |
# Create yourbench task
|
| 74 |
-
yourbench = create_yourbench_task("{dataset_name}", "
|
| 75 |
|
| 76 |
# Define TASKS_TABLE needed by lighteval
|
| 77 |
TASKS_TABLE = [yourbench]
|
|
@@ -86,7 +86,7 @@ TASKS_TABLE = [yourbench]
|
|
| 86 |
"custom|yourbench|0|0",
|
| 87 |
"--custom-tasks",
|
| 88 |
temp_file_path,
|
| 89 |
-
"--max-samples", "
|
| 90 |
"--output-dir", "data/lighteval_results",
|
| 91 |
# "--save-details",
|
| 92 |
"--no-push-to-hub"
|
|
|
|
| 71 |
from lighteval_task.lighteval_task import create_yourbench_task
|
| 72 |
|
| 73 |
# Create yourbench task
|
| 74 |
+
yourbench = create_yourbench_task("{dataset_name}", "multi_hop_questions")
|
| 75 |
|
| 76 |
# Define TASKS_TABLE needed by lighteval
|
| 77 |
TASKS_TABLE = [yourbench]
|
|
|
|
| 86 |
"custom|yourbench|0|0",
|
| 87 |
"--custom-tasks",
|
| 88 |
temp_file_path,
|
| 89 |
+
"--max-samples", "30",
|
| 90 |
"--output-dir", "data/lighteval_results",
|
| 91 |
# "--save-details",
|
| 92 |
"--no-push-to-hub"
|
backend/tasks/get_model_providers.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
from huggingface_hub import model_info
|
| 2 |
-
PREFERRED_PROVIDERS = ["
|
| 3 |
|
| 4 |
def filter_providers(providers):
|
| 5 |
return [provider for provider in providers if provider in PREFERRED_PROVIDERS]
|
|
|
|
| 1 |
from huggingface_hub import model_info
|
| 2 |
+
PREFERRED_PROVIDERS = ["novita","sambanova"]
|
| 3 |
|
| 4 |
def filter_providers(providers):
|
| 5 |
return [provider for provider in providers if provider in PREFERRED_PROVIDERS]
|