new feat: o4-mini supported
- varco_arena/calc_cost.py +1 -1
- varco_arena/main.py +21 -19
- varco_arena/varco_arena_core/eval_utils.py +30 -3
- varco_arena/varco_arena_core/league.py +1 -1
- varco_arena/varco_arena_core/prompts/base_prompt.py +1 -1
- varco_arena/varco_arena_core/prompts/llmbar.py +1 -1
- varco_arena/varco_arena_core/prompts/prompt_utils.py +12 -2
- varco_arena/varco_arena_core/prompts/prompts_README.md +4 -0
- varco_arena/varco_arena_core/prompts/translation_new.py +9 -1
- varco_arena/varco_arena_core/prompts/translation_new.yaml +3 -3
- varco_arena/varco_arena_core/tournament.py +1 -1
- varco_arena/varco_arena_core/tracking_utils.py +2 -1
varco_arena/calc_cost.py
CHANGED
@@ -52,7 +52,7 @@ def calculate(
             lambda row: eval_task_2_prm[
                 f"{evalprompt}_{row.task}"
             ].get_expected_max_tokens_w_room(
-                model_name, room=
+                model_name, room=2
             ),  # here, prompt_obj will define tokenizer with `model_name`
             axis=1,
         )
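Note: the change above fixes the `room` argument to 2, so the cost estimate budgets roughly twice the expected output tokens per judge call. A minimal sketch of what a room multiplier like this typically does; the function below is illustrative only, not the repository's `get_expected_max_tokens_w_room` implementation.

# Illustrative sketch (not the repo's code): a `room` factor scales an
# expected output-token count to leave headroom when estimating cost.
def expected_max_tokens_w_room(expected_tokens: int, room: float = 1.5) -> int:
    if room < 1:
        raise ValueError("room must be >= 1; it only adds headroom")
    return int(expected_tokens * room)

# e.g. expected_max_tokens_w_room(120, room=2) == 240 tokens budgeted per match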
varco_arena/main.py
CHANGED
@@ -49,25 +49,27 @@ def main(
     os.makedirs(output_dir, exist_ok=True)
 
     # cost estimation
-    total_matches, total_toks_in, total_toks_out, total_costs = calculate(
-        dataset_df=dataset_df,
-        model_name=evaluation_model,
-        matching_method=matching_method,
-        evalprompt=evalprompt,
-    )
+    if evaluation_model != "debug":
+
+        total_matches, total_toks_in, total_toks_out, total_costs = calculate(
+            dataset_df=dataset_df,
+            model_name=evaluation_model,
+            matching_method=matching_method,
+            evalprompt=evalprompt,
+        )
 
-    _doubleline = "=" * 50
-    message = f"""---------------------------------------
-Judge LLM: {evaluation_model}
-평가 프롬프트: {evalprompt}
-평가 리그 방법: {matching_method}
-예상 평가 횟수: {total_matches:,}
-예상 입력 토큰 : {total_toks_in:,}
-예상 출력 토큰 : {total_toks_out:,} (with x1.01 additional room)
----------------------------------------
-예상 발생 비용 : ${total_costs:.3f}
-{_doubleline}"""
-
+        _doubleline = "=" * 50
+        message = f"""---------------------------------------
+Judge LLM: {evaluation_model}
+평가 프롬프트: {evalprompt}
+평가 리그 방법: {matching_method}
+예상 평가 횟수: {total_matches:,}
+예상 입력 토큰 : {total_toks_in:,}
+예상 출력 토큰 : {total_toks_out:,} (with x1.01 additional room)
+---------------------------------------
+예상 발생 비용 : ${total_costs:.3f}
+{_doubleline}"""
+        print(message)
 
     if args.calc_price_run:
         return

@@ -95,7 +97,7 @@ if __name__ == "__main__":
     parser.add_argument("-i", "--input", help="input file")
     parser.add_argument("-o", "--output_dir", help="output file")
     parser.add_argument(
-        "-e", "--evaluation_model", default="
+        "-e", "--evaluation_model", default="gpt-4.1-mini", help="evaluation model specifier"
     )
     parser.add_argument(
         "-c",
varco_arena/varco_arena_core/eval_utils.py
CHANGED
@@ -7,7 +7,7 @@ import openai
 import pandas as pd
 import requests
 from openlimit import ChatRateLimiter
-from varco_arena_core.prompts import TranslationPairPrompt
+from varco_arena_core.prompts import TranslationPairPrompt, TranslationNewPrompt
 from varco_arena_core.prompts.base_prompt import ComparisonPromptBase
 
 from .tracking_utils import CountTokens

@@ -72,7 +72,29 @@ async def async_query_openai(
         messages=completed_prompts,
         **prompt_obj.sampling_parameters,
     )
-
+
+    # o-series of models (reasoning models)
+    if model in [
+        "o4-mini",
+    ]:
+        # does not provide logprobs
+        kwargs.pop("logprobs")
+        kwargs.pop("top_logprobs")
+        # does not allow temperature
+        kwargs.pop("temperature")
+        # does not allow stop
+        kwargs.pop("stop")
+        # max_completion_tokens is different from what I expect... does it count reasoning path too?
+        kwargs.pop("max_tokens")
+
+        # max_tokens = kwargs.pop("max_tokens")
+        # kwargs["max_completion_tokens"] = max_tokens
+
+        # prefer developer role than system
+        if kwargs["messages"][0]["role"] == "system":
+            kwargs["messages"][0]["role"] = "developer"
+        # do not support max_tokens --> max_completion_tokens
+
     isopenai: bool = os.getenv("OPENAI_BASE_URL") == "https://api.openai.com/v1"
 
     # defining client here?...

@@ -139,6 +161,10 @@ async def async_query_openai(
     increase_match_count()  # you're hacky Jumin...
 
     normalized_result["api_call_kwargs"] = kwargs
+    try:
+        normalized_result["actual_response_text"] = resp.choices[0].message.content
+    except Exception as e:
+        normalized_result["actual_response_text"] = None
     return normalized_result, resp
 
 

@@ -158,7 +184,8 @@ async def async_eval_w_prompt(
         task=position_1st.task,
     )
 
-    if isinstance(prompt_obj, TranslationPairPrompt):
+    if isinstance(prompt_obj, TranslationPairPrompt) or \
+        isinstance(prompt_obj, TranslationNewPrompt):
         kwargs["source_lang"] = position_1st.source_lang
         kwargs["target_lang"] = position_1st.target_lang
 
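Note: the kwargs stripping above is the core of the o4-mini support: reasoning models reject `logprobs`, `top_logprobs`, `temperature`, `stop`, and `max_tokens`, and take the `developer` role instead of `system`. Below is a self-contained sketch of the same idea, written defensively with `pop(key, None)` so absent keys do not raise; the helper name and the `REASONING_MODELS` set are mine, not the repository's.

# Sketch of the sanitization step, assuming `kwargs` is shaped like an
# OpenAI chat.completions.create(...) call. pop(key, None) tolerates missing keys.
REASONING_MODELS = {"o4-mini"}  # extend with other o-series models as needed

def sanitize_for_reasoning_models(model: str, kwargs: dict) -> dict:
    if model in REASONING_MODELS:
        for key in ("logprobs", "top_logprobs", "temperature", "stop", "max_tokens"):
            kwargs.pop(key, None)  # sampling parameters the o-series rejects
        messages = kwargs.get("messages", [])
        if messages and messages[0].get("role") == "system":
            messages[0]["role"] = "developer"  # o-series prefers the developer role
    return kwargs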
varco_arena/varco_arena_core/league.py
CHANGED
@@ -60,7 +60,7 @@ class League:
                     "match_order_in_round": "league",
                     "tstamp": now_time,
                     "api_call_kwargs": match_result[0]["api_call_kwargs"],
-
+                    "actual_response_text": match_result[0]["actual_response_text"],
                 },
             ]
         else:
varco_arena/varco_arena_core/prompts/base_prompt.py
CHANGED
@@ -110,7 +110,7 @@ class ComparisonPromptBase:
     def get_expected_max_tokens_w_room(
         self,
         eval_model_name: str = None,
-        room: float = 1.
+        room: float = 1.5,
     ):
         if room < 1:
             raise ValueError(
varco_arena/varco_arena_core/prompts/llmbar.py
CHANGED
@@ -47,7 +47,6 @@ class LLMBarPrompt(ComparisonPromptBase):
         vllm response object (logprob struct differs)
 
         """
-        logprobs = response.choices[0].logprobs.content
 
         # focus to the token of interest
         # NOTE: res_tok is not guaranteed to follow the tokenization of the model, it just checks whether our output follows the expected format

@@ -59,6 +58,7 @@ class LLMBarPrompt(ComparisonPromptBase):
         found_tokens: list = []
         if isopenai:
             try:
+                logprobs = response.choices[0].logprobs.content
                 top_logprob_list = find_logprob_of_a_token_openai(
                     logprobs=logprobs, token=res_tok
                 ).top_logprobs
varco_arena/varco_arena_core/prompts/prompt_utils.py
CHANGED
@@ -6,6 +6,8 @@ from typing import Any, Callable, Dict, List, Optional, Union
 import tiktoken
 from transformers import AutoTokenizer
 
+
+OPENAI_MSGS = List[Dict[str, str]]
 FMTR = Formatter()
 
 

@@ -33,6 +35,7 @@ def fill_template_over_messsages(prompt_template: List[Dict], **kwargs):
         else:
             msg1 = msg
         new_msgs.append(msg1)
+
     return new_msgs
 
 

@@ -53,7 +56,11 @@ def get_tokenizer_from_model_name(
     # load tokenizer
     if model_name:
         try:  # assume openai model
-            if model_name
+            if model_name in [
+                "gpt-4.1",
+                "gpt-4.1-mini",
+                "o4-mini",
+            ]:
                 tokenizer = tiktoken.encoding_for_model("gpt-4o")
             else:
                 tokenizer = tiktoken.encoding_for_model(model_name)

@@ -108,8 +115,11 @@ def is_model_from_openai(response: Any = None, model: str = None) -> bool:
     else:
         pass
     openai_prefixes = [
-        "gpt-3.5-",
+        # "gpt-3.5-",
         "gpt-4",
+        "o4",
+        "o3",
+        "o1",
     ]
 
     model_from_openai: bool = False
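Note: the tokenizer change hard-codes which new model names fall back to the `gpt-4o` encoding for token counting. A hedged alternative sketch that gets the same effect by catching the `KeyError` that `tiktoken.encoding_for_model` raises for model names it does not know yet; this is my variation, not the code in the diff.

import tiktoken

def get_encoding_with_fallback(model_name: str):
    try:
        return tiktoken.encoding_for_model(model_name)
    except KeyError:
        # newer models (gpt-4.1, gpt-4.1-mini, o4-mini, ...) may be unknown to
        # older tiktoken releases; the gpt-4o encoding is close enough for
        # token-count estimation.
        return tiktoken.encoding_for_model("gpt-4o")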
varco_arena/varco_arena_core/prompts/prompts_README.md
ADDED
@@ -0,0 +1,4 @@
+# Guide for custom prompt
+- your content of the prompt will be in [PROMPTNAME].yaml
+- your pre/post processing logics for prompt will be in [PROMPTNAME].py
+- presets are implemented with logprob decision which is... inconvenient. You don't need to do that (you could just parse text and decide like as done in translation_pair, translation_new)
varco_arena/varco_arena_core/prompts/translation_new.py
CHANGED
@@ -19,10 +19,18 @@ class TranslationNewPrompt(LLMBarPrompt):
             res_tok = "(A)"
         elif "(B)" in input_string and "(A)" not in input_string:
             res_tok = "(B)"
+        elif "A" in input_string and "B" not in input_string:
+            res_tok = "(A)"
+        elif "B" in input_string and "A" not in input_string:
+            res_tok = "(B)"
         else:  # both exists or nothing exists
             # fallback for ambiguous or malformed model output
             res_tok = random.choice(['(A)', '(B)'])
-            print("
+            print("="*100)
+            print(f"actual_response={input_string}")
+            print(f"{res_tok=}")
+            print("Response format Error (model side, not code side): Fails to output in expected format. Fallback to random choice: ", res_tok)
+            print("="*100)
 
         return res_tok
 
varco_arena/varco_arena_core/prompts/translation_new.yaml
CHANGED
@@ -1,13 +1,13 @@
 sampling_parameters:
   stop: []
   temperature: 0.7
-  logprobs: true
+  logprobs: true # will strip off if models aren't compatible
   top_logprobs: 20
 
 
 decision_tokens:
-  prefer_1st:
-  prefer_2nd:
+  prefer_1st: (A)
+  prefer_2nd: (B)
 
 expected_generation_str: (A)
 
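Note: this YAML carries both the sampling parameters sent to the judge and the decision tokens the parser looks for. A minimal loading sketch under the assumption that the file is read with PyYAML; the variable names and the loading code are illustrative, not taken from the repository.

import yaml

with open("translation_new.yaml") as f:
    cfg = yaml.safe_load(f)

sampling_parameters = cfg["sampling_parameters"]   # forwarded to the chat API call
prefer_1st = cfg["decision_tokens"]["prefer_1st"]  # "(A)" -> first position wins
prefer_2nd = cfg["decision_tokens"]["prefer_2nd"]  # "(B)" -> second position wins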
varco_arena/varco_arena_core/tournament.py
CHANGED
@@ -107,7 +107,7 @@ class Tournament:
                     "match_order_in_round": match_order,
                     "tstamp": now_time,
                     "api_call_kwargs": match_result[0]["api_call_kwargs"],
-
+                    "actual_response_text": match_result[0]["actual_response_text"],
                 },
             ]
         try:
varco_arena/varco_arena_core/tracking_utils.py
CHANGED
@@ -19,13 +19,14 @@ from openai import OpenAI
 # return wrapper
 
 pricing = {
-    "gpt-4o": {"input": 5.00, "output": 15.00},
+    "gpt-4o": {"input": 5.00, "output": 15.00},
     "gpt-4o-2024-05-13": {"input": 2.50, "output": 10.00},
     "gpt-4o-2024-08-06": {"input": 5.00, "output": 15.00},
     "gpt-4o-mini": {"input": 0.15, "output": 0.600},
     "gpt-4o-mini-2024-07-18": {"input": 0.15, "output": 0.600},
     "gpt-4.1-mini": {"input": 0.4, "output": 1.600},
     "gpt-4.1": {"input": 2., "output": 8.},
+    "o4-mini": {"input": 1.1, "output": 4.4},
 }
 
 
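Note: the new `o4-mini` entry follows the same unit convention as the rest of the table, which appears to be USD per one million tokens (consistent with OpenAI's published per-1M pricing; treat that unit as my reading, not something stated in the diff). A small sketch of how such a table turns token counts into a dollar estimate; the function name is illustrative.

# Illustrative cost estimate from a pricing table in USD per 1M tokens.
pricing = {
    "gpt-4.1-mini": {"input": 0.4, "output": 1.600},
    "o4-mini": {"input": 1.1, "output": 4.4},
}

def estimate_cost_usd(model: str, toks_in: int, toks_out: int) -> float:
    rate = pricing[model]
    return (toks_in * rate["input"] + toks_out * rate["output"]) / 1_000_000

# e.g. estimate_cost_usd("o4-mini", 2_000_000, 500_000) ≈ 4.4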