Spaces:

DontPlanToEnd
/

UGI-Leaderboard

Running

App Files Community

437

DontPlanToEnd committed on Feb 3

Commit

799f51a

verified ·

1 Parent(s): 892c34b

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -9

app.py CHANGED Viewed

@@ -552,6 +552,17 @@ ugi_category_columns = [
     create_numeric_column(col, width=120) for col in UGI_CATEGORY_COLS
 ]
 political_columns = [
     {
         "headerName": "Ideology",
@@ -810,12 +821,13 @@ app.layout = html.Div([
             id='additional-columns-filter',
             options=[
                 {'label': 'UGI Categories', 'value': 'ugi_categories'},
                 {'label': 'Political Test Axes', 'value': 'political_axes'}
             ],
             value=[],
             inline=True,
             style={'display': 'inline-block'},
-            labelStyle={'fontWeight': 'normal', 'marginRight': '15px'}  # Add consistent spacing
         )
     ], style={'marginBottom': '13px', 'padding': '0 20px', 'overflow': 'hidden'}),
@@ -865,15 +877,53 @@ app.layout = html.Div([
     # Description
     html.Div([
-        html.H3("About"),
-        html.P([html.Strong("UGI:"), " Uncensored General Intelligence. A benchmark measuring both willingness to answer and accuracy in fact-based contentious questions. The test set is made of roughly 100 questions/tasks, covering topics that are commonly difficult to get LLMs to answer. The leaderboard's questions are kept private in order to avoid the common problem of not knowing if a model is intelligent or if it was just trained on the test questions."]),
-        html.P([html.Strong("W/10:"), " Willingness/10. A more narrow subset of the UGI questions, solely focused on measuring how far a model can be pushed before going against its instructions or refusing to answer."]),
-        html.P("A high UGI but low W/10 could mean for example that the model can provide a lot of accurate sensitive information, but will refuse to form the information into something it sees as offensive or against its rules."),
-        html.P([html.Strong("NatInt:"), " Natural Intelligence. A general knowledge quiz covering real-world subjects that llms are not commonly benchmarked on, such as pop culture trivia. This measures if the model understands a diverse range of topics, as opposed to over-training on textbook information and the types of questions commonly tested on benchmarks."]),
         html.P([html.Strong("Coding:"), " A simple 50 question quiz measuring how vast a model's programming knowledge is. Each question is worth 2 points."]),
@@ -1002,10 +1052,17 @@ def update_columns(additional_columns):
     # Add UGI category columns if selected
     if 'ugi_categories' in additional_columns:
-        current_columns.extend(ugi_category_columns)  # Use the pre-defined ugi_category_columns
-    # Add remaining base columns (W/10, NatInt, Coding, Political Lean)
-    current_columns.extend(columnDefs[7:11])
     # Add political columns if selected
     if 'political_axes' in additional_columns:

     create_numeric_column(col, width=120) for col in UGI_CATEGORY_COLS
 ]
+w10_type_columns = [
+    create_numeric_column("W/10-Direct", width=120, filterParams={
+        "defaultOption": "greaterThanOrEqual",
+        "filterOptions": ['equals', 'notEqual', 'greaterThan', 'greaterThanOrEqual', 'lessThan', 'lessThanOrEqual', 'inRange']
+    }),
+    create_numeric_column("W/10-Adherence", width=120, filterParams={
+        "defaultOption": "greaterThanOrEqual",
+        "filterOptions": ['equals', 'notEqual', 'greaterThan', 'greaterThanOrEqual', 'lessThan', 'lessThanOrEqual', 'inRange']
+    })
+]
 political_columns = [
     {
         "headerName": "Ideology",
             id='additional-columns-filter',
             options=[
                 {'label': 'UGI Categories', 'value': 'ugi_categories'},
+                {'label': 'W/10 Types', 'value': 'w10_types'},
                 {'label': 'Political Test Axes', 'value': 'political_axes'}
             ],
             value=[],
             inline=True,
             style={'display': 'inline-block'},
+            labelStyle={'fontWeight': 'normal', 'marginRight': '15px'}
         )
     ], style={'marginBottom': '13px', 'padding': '0 20px', 'overflow': 'hidden'}),
     # Description
     html.Div([
+        html.H3("About", style={'fontSize': '22px', 'marginBottom': '0px'}),
+        html.P([html.Strong("UGI:"), " Uncensored General Intelligence. A benchmark measuring both willingness to answer and accuracy in fact-based contentious questions. The test set is made of roughly 100 questions/tasks, covering topics that are commonly difficult to get LLMs to answer. The leaderboard's questions are kept private in order to avoid the common problem of not knowing if a model is intelligent or if it was just trained on the test questions."],
+            style={'marginTop': '7px', 'marginBottom': '4px'}),
+        html.Details([
+            html.Summary("Categories",
+                style={
+                    'fontWeight': 'normal',
+                    'fontSize': '1em',
+                    'marginLeft': '20px',
+                    'cursor': 'pointer'
+                }),
+            html.Ul([
+                html.Li("Unruly: Taboo Underground Knowledge"),
+                html.Li("Internet: Knowledge of controversial/explicit web content"),
+                html.Li("Societal/Political: Awareness of contentious socio-political issues")
+            ], style={'marginTop': '0px', 'marginBottom': '16px', 'marginLeft': '40px'})
+        ], style={'marginBottom': '16px'}),
+        html.P([html.Strong("W/10:"), " Willingness/10. A more narrow subset of the UGI questions, solely focused on measuring how far a model can be pushed before going against its instructions or refusing to answer."], style={'marginBottom': '4px'}),
+        html.Details([
+            html.Summary("Types",
+                style={
+                    'fontWeight': 'normal',
+                    'fontSize': '1em',
+                    'marginLeft': '20px',
+                    'cursor': 'pointer'
+                }),
+            html.Ul([
+                html.Li("Direct: Measures if the model directly refuses to respond to certain prompts"),
+                html.Li("Adherence: Some models might not explicitly refuse to do something, though will still deviate from the instructions as a way to get out of doing it, or simply due to lack of instruction following capabilities")
+            ], style={'marginTop': '0px', 'marginBottom': '16px', 'marginLeft': '40px'})
+        ], style={'marginBottom': '16px'}),
+        html.P([
+            "A high UGI but low W/10 could mean for example that the model can provide a lot of accurate sensitive information, but will refuse to form the information into something it sees as offensive or against its rules.",
+            html.Br(),
+            html.Br()
+        ]),
+        html.P([
+            html.Strong("Benchmarks not focused on censorship:"),
+            html.Div(style={'margin': '6px 0'}),
+            html.Strong("NatInt:"), " Natural Intelligence. A general knowledge quiz covering real-world subjects that llms are not commonly benchmarked on, such as pop culture trivia. This measures if the model understands a diverse range of topics, as opposed to over-training on textbook information and the types of questions commonly tested on benchmarks."
+        ]),
         html.P([html.Strong("Coding:"), " A simple 50 question quiz measuring how vast a model's programming knowledge is. Each question is worth 2 points."]),
     # Add UGI category columns if selected
     if 'ugi_categories' in additional_columns:
+        current_columns.extend(ugi_category_columns)
+    # Add W/10 column
+    current_columns.extend(columnDefs[7:8])  # Add just the W/10 column
+    # Add W/10 type columns if selected
+    if 'w10_types' in additional_columns:
+        current_columns.extend(w10_type_columns)
+    # Add remaining base columns (NatInt, Coding, Political Lean)
+    current_columns.extend(columnDefs[8:11])
     # Add political columns if selected
     if 'political_axes' in additional_columns: