new feat: o4-mini supported
- varco_arena/calc_cost.py +1 -1
- varco_arena/main.py +21 -19
- varco_arena/varco_arena_core/eval_utils.py +30 -3
- varco_arena/varco_arena_core/league.py +1 -1
- varco_arena/varco_arena_core/prompts/base_prompt.py +1 -1
- varco_arena/varco_arena_core/prompts/llmbar.py +1 -1
- varco_arena/varco_arena_core/prompts/prompt_utils.py +12 -2
- varco_arena/varco_arena_core/prompts/prompts_README.md +4 -0
- varco_arena/varco_arena_core/prompts/translation_new.py +9 -1
- varco_arena/varco_arena_core/prompts/translation_new.yaml +3 -3
- varco_arena/varco_arena_core/tournament.py +1 -1
- varco_arena/varco_arena_core/tracking_utils.py +2 -1
varco_arena/calc_cost.py
CHANGED
@@ -52,7 +52,7 @@ def calculate(
             lambda row: eval_task_2_prm[
                 f"{evalprompt}_{row.task}"
             ].get_expected_max_tokens_w_room(
-                model_name, room=
+                model_name, room=2
             ),  # here, prompt_obj will define tokenizer with `model_name`
             axis=1,
         )
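Note: the change above fixes the `room` argument to 2, so the cost estimate budgets roughly twice the expected output tokens per judge call. A minimal sketch of what a room multiplier like this typically does; the function below is illustrative only, not the repository's `get_expected_max_tokens_w_room` implementation.

# Illustrative sketch (not the repo's code): a `room` factor scales an
# expected output-token count to leave headroom when estimating cost.
def expected_max_tokens_w_room(expected_tokens: int, room: float = 1.5) -> int:
    if room < 1:
        raise ValueError("room must be >= 1; it only adds headroom")
    return int(expected_tokens * room)

# e.g. expected_max_tokens_w_room(120, room=2) == 240 tokens budgeted per match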
varco_arena/main.py
CHANGED
@@ -49,25 +49,27 @@ def main(
     os.makedirs(output_dir, exist_ok=True)
 
     # cost estimation
-    total_matches, total_toks_in, total_toks_out, total_costs = calculate(
-        dataset_df=dataset_df,
-        model_name=evaluation_model,
-        matching_method=matching_method,
-        evalprompt=evalprompt,
-    )
+    if evaluation_model != "debug":
+
+        total_matches, total_toks_in, total_toks_out, total_costs = calculate(
+            dataset_df=dataset_df,
+            model_name=evaluation_model,
+            matching_method=matching_method,
+            evalprompt=evalprompt,
+        )
 
-    _doubleline = "=" * 50
-    message = f"""---------------------------------------
-Judge LLM: {evaluation_model}
-평가 프롬프트: {evalprompt}
-평가 리그 방법: {matching_method}
-예상 평가 횟수: {total_matches:,}
-예상 입력 토큰 : {total_toks_in:,}
-예상 출력 토큰 : {total_toks_out:,} (with x1.01 additional room)
----------------------------------------
-예상 발생 비용 : ${total_costs:.3f}
-{_doubleline}"""
-
+        _doubleline = "=" * 50
+        message = f"""---------------------------------------
+Judge LLM: {evaluation_model}
+평가 프롬프트: {evalprompt}
+평가 리그 방법: {matching_method}
+예상 평가 횟수: {total_matches:,}
+예상 입력 토큰 : {total_toks_in:,}
+예상 출력 토큰 : {total_toks_out:,} (with x1.01 additional room)
+---------------------------------------
+예상 발생 비용 : ${total_costs:.3f}
+{_doubleline}"""
+        print(message)
 
     if args.calc_price_run:
         return

@@ -95,7 +97,7 @@ if __name__ == "__main__":
     parser.add_argument("-i", "--input", help="input file")
     parser.add_argument("-o", "--output_dir", help="output file")
     parser.add_argument(
-        "-e", "--evaluation_model", default="
+        "-e", "--evaluation_model", default="gpt-4.1-mini", help="evaluation model specifier"
     )
     parser.add_argument(
         "-c",
varco_arena/varco_arena_core/eval_utils.py
CHANGED
@@ -7,7 +7,7 @@ import openai
 import pandas as pd
 import requests
 from openlimit import ChatRateLimiter
-from varco_arena_core.prompts import TranslationPairPrompt
+from varco_arena_core.prompts import TranslationPairPrompt, TranslationNewPrompt
 from varco_arena_core.prompts.base_prompt import ComparisonPromptBase
 
 from .tracking_utils import CountTokens

@@ -72,7 +72,29 @@ async def async_query_openai(
         messages=completed_prompts,
         **prompt_obj.sampling_parameters,
     )
-
+
+    # o-series of models (reasoning models)
+    if model in [
+        "o4-mini",
+    ]:
+        # does not provide logprobs
+        kwargs.pop("logprobs")
+        kwargs.pop("top_logprobs")
+        # does not allow temperature
+        kwargs.pop("temperature")
+        # does not allow stop
+        kwargs.pop("stop")
+        # max_completion_tokens is different from what I expect... does it count reasoning path too?
+        kwargs.pop("max_tokens")
+
+        # max_tokens = kwargs.pop("max_tokens")
+        # kwargs["max_completion_tokens"] = max_tokens
+
+        # prefer developer role than system
+        if kwargs["messages"][0]["role"] == "system":
+            kwargs["messages"][0]["role"] = "developer"
+        # do not support max_tokens --> max_completion_tokens
+
     isopenai: bool = os.getenv("OPENAI_BASE_URL") == "https://api.openai.com/v1"
 
     # defining client here?...

@@ -139,6 +161,10 @@ async def async_query_openai(
     increase_match_count()  # you're hacky Jumin...
 
     normalized_result["api_call_kwargs"] = kwargs
+    try:
+        normalized_result["actual_response_text"] = resp.choices[0].message.content
+    except Exception as e:
+        normalized_result["actual_response_text"] = None
     return normalized_result, resp
 
 

@@ -158,7 +184,8 @@ async def async_eval_w_prompt(
         task=position_1st.task,
     )
 
-    if isinstance(prompt_obj, TranslationPairPrompt):
+    if isinstance(prompt_obj, TranslationPairPrompt) or \
+        isinstance(prompt_obj, TranslationNewPrompt):
         kwargs["source_lang"] = position_1st.source_lang
         kwargs["target_lang"] = position_1st.target_lang
 
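Note: the kwargs stripping above is the core of the o4-mini support: reasoning models reject `logprobs`, `top_logprobs`, `temperature`, `stop`, and `max_tokens`, and take the `developer` role instead of `system`. Below is a self-contained sketch of the same idea, written defensively with `pop(key, None)` so absent keys do not raise; the helper name and the `REASONING_MODELS` set are mine, not the repository's.

# Sketch of the sanitization step, assuming `kwargs` is shaped like an
# OpenAI chat.completions.create(...) call. pop(key, None) tolerates missing keys.
REASONING_MODELS = {"o4-mini"}  # extend with other o-series models as needed

def sanitize_for_reasoning_models(model: str, kwargs: dict) -> dict:
    if model in REASONING_MODELS:
        for key in ("logprobs", "top_logprobs", "temperature", "stop", "max_tokens"):
            kwargs.pop(key, None)  # sampling parameters the o-series rejects
        messages = kwargs.get("messages", [])
        if messages and messages[0].get("role") == "system":
            messages[0]["role"] = "developer"  # o-series prefers the developer role
    return kwargs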
varco_arena/varco_arena_core/league.py
CHANGED
@@ -60,7 +60,7 @@ class League:
                     "match_order_in_round": "league",
                     "tstamp": now_time,
                     "api_call_kwargs": match_result[0]["api_call_kwargs"],
-
+                    "actual_response_text": match_result[0]["actual_response_text"],
                 },
             ]
         else:
varco_arena/varco_arena_core/prompts/base_prompt.py
CHANGED
@@ -110,7 +110,7 @@ class ComparisonPromptBase:
     def get_expected_max_tokens_w_room(
         self,
         eval_model_name: str = None,
-        room: float = 1.
+        room: float = 1.5,
     ):
         if room < 1:
             raise ValueError(
varco_arena/varco_arena_core/prompts/llmbar.py
CHANGED
@@ -47,7 +47,6 @@ class LLMBarPrompt(ComparisonPromptBase):
         vllm response object (logprob struct differs)
 
         """
-        logprobs = response.choices[0].logprobs.content
 
         # focus to the token of interest
         # NOTE: res_tok is not guaranteed to follow the tokenization of the model, it just checks whether our output follows the expected format

@@ -59,6 +58,7 @@ class LLMBarPrompt(ComparisonPromptBase):
         found_tokens: list = []
         if isopenai:
             try:
+                logprobs = response.choices[0].logprobs.content
                 top_logprob_list = find_logprob_of_a_token_openai(
                     logprobs=logprobs, token=res_tok
                 ).top_logprobs
varco_arena/varco_arena_core/prompts/prompt_utils.py
CHANGED
@@ -6,6 +6,8 @@ from typing import Any, Callable, Dict, List, Optional, Union
 import tiktoken
 from transformers import AutoTokenizer
 
+
+OPENAI_MSGS = List[Dict[str, str]]
 FMTR = Formatter()
 
 

@@ -33,6 +35,7 @@ def fill_template_over_messsages(prompt_template: List[Dict], **kwargs):
         else:
             msg1 = msg
         new_msgs.append(msg1)
+
     return new_msgs
 
 

@@ -53,7 +56,11 @@ def get_tokenizer_from_model_name(
     # load tokenizer
     if model_name:
         try:  # assume openai model
-            if model_name
+            if model_name in [
+                "gpt-4.1",
+                "gpt-4.1-mini",
+                "o4-mini",
+            ]:
                 tokenizer = tiktoken.encoding_for_model("gpt-4o")
             else:
                 tokenizer = tiktoken.encoding_for_model(model_name)

@@ -108,8 +115,11 @@ def is_model_from_openai(response: Any = None, model: str = None) -> bool:
     else:
         pass
     openai_prefixes = [
-        "gpt-3.5-",
+        # "gpt-3.5-",
         "gpt-4",
+        "o4",
+        "o3",
+        "o1",
     ]
 
     model_from_openai: bool = False
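Note: the tokenizer change hard-codes which new model names fall back to the `gpt-4o` encoding for token counting. A hedged alternative sketch that gets the same effect by catching the `KeyError` that `tiktoken.encoding_for_model` raises for model names it does not know yet; this is my variation, not the code in the diff.

import tiktoken

def get_encoding_with_fallback(model_name: str):
    try:
        return tiktoken.encoding_for_model(model_name)
    except KeyError:
        # newer models (gpt-4.1, gpt-4.1-mini, o4-mini, ...) may be unknown to
        # older tiktoken releases; the gpt-4o encoding is close enough for
        # token-count estimation.
        return tiktoken.encoding_for_model("gpt-4o")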
varco_arena/varco_arena_core/prompts/prompts_README.md
ADDED
@@ -0,0 +1,4 @@
+# Guide for custom prompt
+- your content of the prompt will be in [PROMPTNAME].yaml
+- your pre/post processing logics for prompt will be in [PROMPTNAME].py
+- presets are implemented with logprob decision which is... inconvenient. You don't need to do that (you could just parse text and decide like as done in translation_pair, translation_new)
varco_arena/varco_arena_core/prompts/translation_new.py
CHANGED
@@ -19,10 +19,18 @@ class TranslationNewPrompt(LLMBarPrompt):
             res_tok = "(A)"
         elif "(B)" in input_string and "(A)" not in input_string:
             res_tok = "(B)"
+        elif "A" in input_string and "B" not in input_string:
+            res_tok = "(A)"
+        elif "B" in input_string and "A" not in input_string:
+            res_tok = "(B)"
         else:  # both exists or nothing exists
             # fallback for ambiguous or malformed model output
             res_tok = random.choice(['(A)', '(B)'])
-            print("
+            print("="*100)
+            print(f"actual_response={input_string}")
+            print(f"{res_tok=}")
+            print("Response format Error (model side, not code side): Fails to output in expected format. Fallback to random choice: ", res_tok)
+            print("="*100)
 
         return res_tok
 
varco_arena/varco_arena_core/prompts/translation_new.yaml
CHANGED
@@ -1,13 +1,13 @@
 sampling_parameters:
   stop: []
   temperature: 0.7
-  logprobs: true
+  logprobs: true # will strip off if models aren't compatible
   top_logprobs: 20
 
 
 decision_tokens:
-  prefer_1st:
-  prefer_2nd:
+  prefer_1st: (A)
+  prefer_2nd: (B)
 
 expected_generation_str: (A)
 
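Note: this YAML carries both the sampling parameters sent to the judge and the decision tokens the parser looks for. A minimal loading sketch under the assumption that the file is read with PyYAML; the variable names and the loading code are illustrative, not taken from the repository.

import yaml

with open("translation_new.yaml") as f:
    cfg = yaml.safe_load(f)

sampling_parameters = cfg["sampling_parameters"]   # forwarded to the chat API call
prefer_1st = cfg["decision_tokens"]["prefer_1st"]  # "(A)" -> first position wins
prefer_2nd = cfg["decision_tokens"]["prefer_2nd"]  # "(B)" -> second position wins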
varco_arena/varco_arena_core/tournament.py
CHANGED
@@ -107,7 +107,7 @@ class Tournament:
                     "match_order_in_round": match_order,
                     "tstamp": now_time,
                     "api_call_kwargs": match_result[0]["api_call_kwargs"],
-
+                    "actual_response_text": match_result[0]["actual_response_text"],
                 },
             ]
         try:
varco_arena/varco_arena_core/tracking_utils.py
CHANGED
@@ -19,13 +19,14 @@ from openai import OpenAI
 # return wrapper
 
 pricing = {
-    "gpt-4o": {"input": 5.00, "output": 15.00},
+    "gpt-4o": {"input": 5.00, "output": 15.00},
     "gpt-4o-2024-05-13": {"input": 2.50, "output": 10.00},
     "gpt-4o-2024-08-06": {"input": 5.00, "output": 15.00},
     "gpt-4o-mini": {"input": 0.15, "output": 0.600},
     "gpt-4o-mini-2024-07-18": {"input": 0.15, "output": 0.600},
     "gpt-4.1-mini": {"input": 0.4, "output": 1.600},
     "gpt-4.1": {"input": 2., "output": 8.},
+    "o4-mini": {"input": 1.1, "output": 4.4},
 }
 
 
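Note: the new `o4-mini` entry follows the same unit convention as the rest of the table, which appears to be USD per one million tokens (consistent with OpenAI's published per-1M pricing; treat that unit as my reading, not something stated in the diff). A small sketch of how such a table turns token counts into a dollar estimate; the function name is illustrative.

# Illustrative cost estimate from a pricing table in USD per 1M tokens.
pricing = {
    "gpt-4.1-mini": {"input": 0.4, "output": 1.600},
    "o4-mini": {"input": 1.1, "output": 4.4},
}

def estimate_cost_usd(model: str, toks_in: int, toks_out: int) -> float:
    rate = pricing[model]
    return (toks_in * rate["input"] + toks_out * rate["output"]) / 1_000_000

# e.g. estimate_cost_usd("o4-mini", 2_000_000, 500_000) ≈ 4.4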