Update app.py
Browse files
app.py
CHANGED
|
@@ -552,6 +552,17 @@ ugi_category_columns = [
|
|
| 552 |
create_numeric_column(col, width=120) for col in UGI_CATEGORY_COLS
|
| 553 |
]
|
| 554 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 555 |
political_columns = [
|
| 556 |
{
|
| 557 |
"headerName": "Ideology",
|
|
@@ -810,12 +821,13 @@ app.layout = html.Div([
|
|
| 810 |
id='additional-columns-filter',
|
| 811 |
options=[
|
| 812 |
{'label': 'UGI Categories', 'value': 'ugi_categories'},
|
|
|
|
| 813 |
{'label': 'Political Test Axes', 'value': 'political_axes'}
|
| 814 |
],
|
| 815 |
value=[],
|
| 816 |
inline=True,
|
| 817 |
style={'display': 'inline-block'},
|
| 818 |
-
labelStyle={'fontWeight': 'normal', 'marginRight': '15px'}
|
| 819 |
)
|
| 820 |
], style={'marginBottom': '13px', 'padding': '0 20px', 'overflow': 'hidden'}),
|
| 821 |
|
|
@@ -865,15 +877,53 @@ app.layout = html.Div([
|
|
| 865 |
|
| 866 |
# Description
|
| 867 |
html.Div([
|
| 868 |
-
html.H3("About"),
|
|
|
|
|
|
|
|
|
|
| 869 |
|
| 870 |
-
html.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 871 |
|
| 872 |
-
html.P([html.Strong("W/10:"), " Willingness/10. A more narrow subset of the UGI questions, solely focused on measuring how far a model can be pushed before going against its instructions or refusing to answer."]),
|
| 873 |
|
| 874 |
-
html.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 875 |
|
| 876 |
-
html.P([
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 877 |
|
| 878 |
html.P([html.Strong("Coding:"), " A simple 50 question quiz measuring how vast a model's programming knowledge is. Each question is worth 2 points."]),
|
| 879 |
|
|
@@ -1002,10 +1052,17 @@ def update_columns(additional_columns):
|
|
| 1002 |
|
| 1003 |
# Add UGI category columns if selected
|
| 1004 |
if 'ugi_categories' in additional_columns:
|
| 1005 |
-
current_columns.extend(ugi_category_columns)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1006 |
|
| 1007 |
-
# Add remaining base columns (
|
| 1008 |
-
current_columns.extend(columnDefs[
|
| 1009 |
|
| 1010 |
# Add political columns if selected
|
| 1011 |
if 'political_axes' in additional_columns:
|
|
|
|
| 552 |
create_numeric_column(col, width=120) for col in UGI_CATEGORY_COLS
|
| 553 |
]
|
| 554 |
|
| 555 |
+
# Column definitions for the two W/10 sub-score columns ("W/10-Direct" and
# "W/10-Adherence"). Both are built with create_numeric_column at a width of
# 120 and share the same numeric filter setup: the filter defaults to
# "greaterThanOrEqual" and offers the comparison operators listed below.
# A fresh filterParams dict is built for each column so the two definitions
# never share a mutable object.
w10_type_columns = [
    create_numeric_column(
        w10_field,
        width=120,
        filterParams={
            "defaultOption": "greaterThanOrEqual",
            "filterOptions": [
                'equals',
                'notEqual',
                'greaterThan',
                'greaterThanOrEqual',
                'lessThan',
                'lessThanOrEqual',
                'inRange',
            ],
        },
    )
    for w10_field in ("W/10-Direct", "W/10-Adherence")
]
|
| 565 |
+
|
| 566 |
political_columns = [
|
| 567 |
{
|
| 568 |
"headerName": "Ideology",
|
|
|
|
| 821 |
id='additional-columns-filter',
|
| 822 |
options=[
|
| 823 |
{'label': 'UGI Categories', 'value': 'ugi_categories'},
|
| 824 |
+
{'label': 'W/10 Types', 'value': 'w10_types'},
|
| 825 |
{'label': 'Political Test Axes', 'value': 'political_axes'}
|
| 826 |
],
|
| 827 |
value=[],
|
| 828 |
inline=True,
|
| 829 |
style={'display': 'inline-block'},
|
| 830 |
+
labelStyle={'fontWeight': 'normal', 'marginRight': '15px'}
|
| 831 |
)
|
| 832 |
], style={'marginBottom': '13px', 'padding': '0 20px', 'overflow': 'hidden'}),
|
| 833 |
|
|
|
|
| 877 |
|
| 878 |
# Description
|
| 879 |
html.Div([
|
| 880 |
+
html.H3("About", style={'fontSize': '22px', 'marginBottom': '0px'}),
|
| 881 |
+
|
| 882 |
+
html.P([html.Strong("UGI:"), " Uncensored General Intelligence. A benchmark measuring both willingness to answer and accuracy in fact-based contentious questions. The test set is made of roughly 100 questions/tasks, covering topics that are commonly difficult to get LLMs to answer. The leaderboard's questions are kept private in order to avoid the common problem of not knowing if a model is intelligent or if it was just trained on the test questions."],
|
| 883 |
+
style={'marginTop': '7px', 'marginBottom': '4px'}),
|
| 884 |
|
| 885 |
+
html.Details([
|
| 886 |
+
html.Summary("Categories",
|
| 887 |
+
style={
|
| 888 |
+
'fontWeight': 'normal',
|
| 889 |
+
'fontSize': '1em',
|
| 890 |
+
'marginLeft': '20px',
|
| 891 |
+
'cursor': 'pointer'
|
| 892 |
+
}),
|
| 893 |
+
html.Ul([
|
| 894 |
+
html.Li("Unruly: Taboo Underground Knowledge"),
|
| 895 |
+
html.Li("Internet: Knowledge of controversial/explicit web content"),
|
| 896 |
+
html.Li("Societal/Political: Awareness of contentious socio-political issues")
|
| 897 |
+
], style={'marginTop': '0px', 'marginBottom': '16px', 'marginLeft': '40px'})
|
| 898 |
+
], style={'marginBottom': '16px'}),
|
| 899 |
|
| 900 |
+
html.P([html.Strong("W/10:"), " Willingness/10. A more narrow subset of the UGI questions, solely focused on measuring how far a model can be pushed before going against its instructions or refusing to answer."], style={'marginBottom': '4px'}),
|
| 901 |
|
| 902 |
+
html.Details([
|
| 903 |
+
html.Summary("Types",
|
| 904 |
+
style={
|
| 905 |
+
'fontWeight': 'normal',
|
| 906 |
+
'fontSize': '1em',
|
| 907 |
+
'marginLeft': '20px',
|
| 908 |
+
'cursor': 'pointer'
|
| 909 |
+
}),
|
| 910 |
+
html.Ul([
|
| 911 |
+
html.Li("Direct: Measures if the model directly refuses to respond to certain prompts"),
|
| 912 |
+
html.Li("Adherence: Some models might not explicitly refuse to do something, though will still deviate from the instructions as a way to get out of doing it, or simply due to lack of instruction following capabilities")
|
| 913 |
+
], style={'marginTop': '0px', 'marginBottom': '16px', 'marginLeft': '40px'})
|
| 914 |
+
], style={'marginBottom': '16px'}),
|
| 915 |
|
| 916 |
+
html.P([
|
| 917 |
+
"A high UGI but low W/10 could mean for example that the model can provide a lot of accurate sensitive information, but will refuse to form the information into something it sees as offensive or against its rules.",
|
| 918 |
+
html.Br(),
|
| 919 |
+
html.Br()
|
| 920 |
+
]),
|
| 921 |
+
|
| 922 |
+
html.P([
|
| 923 |
+
html.Strong("Benchmarks not focused on censorship:"),
|
| 924 |
+
html.Div(style={'margin': '6px 0'}),
|
| 925 |
+
html.Strong("NatInt:"), " Natural Intelligence. A general knowledge quiz covering real-world subjects that llms are not commonly benchmarked on, such as pop culture trivia. This measures if the model understands a diverse range of topics, as opposed to over-training on textbook information and the types of questions commonly tested on benchmarks."
|
| 926 |
+
]),
|
| 927 |
|
| 928 |
html.P([html.Strong("Coding:"), " A simple 50 question quiz measuring how vast a model's programming knowledge is. Each question is worth 2 points."]),
|
| 929 |
|
|
|
|
| 1052 |
|
| 1053 |
# Add UGI category columns if selected
|
| 1054 |
if 'ugi_categories' in additional_columns:
|
| 1055 |
+
current_columns.extend(ugi_category_columns)
|
| 1056 |
+
|
| 1057 |
+
# Add W/10 column
|
| 1058 |
+
current_columns.extend(columnDefs[7:8]) # Add just the W/10 column
|
| 1059 |
+
|
| 1060 |
+
# Add W/10 type columns if selected
|
| 1061 |
+
if 'w10_types' in additional_columns:
|
| 1062 |
+
current_columns.extend(w10_type_columns)
|
| 1063 |
|
| 1064 |
+
# Add remaining base columns (NatInt, Coding, Political Lean)
|
| 1065 |
+
current_columns.extend(columnDefs[8:11])
|
| 1066 |
|
| 1067 |
# Add political columns if selected
|
| 1068 |
if 'political_axes' in additional_columns:
|