import pandas as pd
from datasets import load_dataset
import gradio as gr
import hashlib
from typing import Iterable, Union
from constants import RESULTS_REPO, ASSAY_RENAME, LEADERBOARD_RESULTS_COLUMNS
pd.set_option('display.max_columns', None)

def show_output_box(message):
    # Reveal the (initially hidden) output box and fill it with the given message
    return gr.update(value=message, visible=True)

def anonymize_user(username: str) -> str:
    # Anonymize using a hash of the username
    return hashlib.sha256(username.encode()).hexdigest()[:8]
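
# Illustrative usage (hypothetical username): anonymize_user("some-user") always
# returns the same 8-character hex prefix of the username's SHA-256 digest, so
# equal usernames map to equal anonymized tokens.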

def fetch_hf_results():
    # For debugging
    # # Print current time in EST
    # EST = timezone(timedelta(hours=-4))
    # print(f"tmp: Fetching results from HF at {datetime.now(EST)}")
    # load_dataset caches by default unless force_redownload is requested
    df = load_dataset(
        RESULTS_REPO, data_files="auto_submissions/metrics_all.csv",
    )["train"].to_pandas()
    assert all(col in df.columns for col in LEADERBOARD_RESULTS_COLUMNS), (
        f"Expected columns {LEADERBOARD_RESULTS_COLUMNS} not found in {df.columns}. "
        f"Missing columns: {set(LEADERBOARD_RESULTS_COLUMNS) - set(df.columns)}"
    )
    # Keep only the latest submission per (model, assay, user)
    df = df.sort_values("submission_time", ascending=False).drop_duplicates(
        subset=["model", "assay", "user"], keep="first"
    )
    df["property"] = df["assay"].map(ASSAY_RENAME)
    # Anonymize the user column unless the submission is explicitly marked non-anonymous
    df.loc[df["anonymous"] != False, "user"] = "anon-" + df.loc[df["anonymous"] != False, "user"].apply(readable_hash)
    return df
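
# Minimal usage sketch (an assumption, not the leaderboard's actual UI wiring):
# one way to surface the fetched results in a Gradio Blocks interface, reusing
# show_output_box for a status message. Component names here are illustrative.
def build_results_demo() -> gr.Blocks:
    with gr.Blocks() as demo:
        gr.Markdown("## Latest leaderboard results")
        table = gr.Dataframe(interactive=False)
        status = gr.Textbox(visible=False, label="Status")
        refresh = gr.Button("Refresh")
        # Re-fetch the results repo on click, then reveal a confirmation message
        refresh.click(fetch_hf_results, outputs=table).then(
            lambda: show_output_box("Results refreshed."), outputs=status
        )
    return demo
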
# Readable hashing function similar to coolname or codenamize
ADJECTIVES = [
"ancient","brave","calm","clever","crimson","curious","dapper","eager",
"fuzzy","gentle","glowing","golden","happy","icy","jolly","lucky",
"magical","mellow","nimble","peachy","quick","royal","shiny","silent",
"sly","sparkly","spicy","spry","sturdy","sunny","swift","tiny","vivid",
"witty"
]
ANIMALS = [
"ant","bat","bear","bee","bison","boar","bug","cat","crab","crow",
"deer","dog","duck","eel","elk","fox","frog","goat","gull","hare",
"hawk","hen","horse","ibis","kid","kiwi","koala","lamb","lark","lemur",
"lion","llama","loon","lynx","mole","moose","mouse","newt","otter","owl",
"ox","panda","pig","prawn","puma","quail","quokka","rabbit","rat","ray",
"robin","seal","shark","sheep","shrew","skunk","slug","snail","snake",
"swan","toad","trout","turtle","vole","walrus","wasp","whale","wolf",
"worm","yak","zebra"
]
NOUNS = [
"rock","sand","star","tree","leaf","seed","stone","cloud","rain","snow",
"wind","fire","ash","dirt","mud","ice","wave","shell","dust","sun",
"moon","hill","lake","pond","reef","root","twig","wood"
]
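
# With the default pairing used by readable_hash below (adjective + animal-or-noun),
# the name space is len(ADJECTIVES) * (len(ANIMALS) + len(NOUNS)) = 34 * 99 = 3,366
# phrases, so the optional checksum suffix does the heavy lifting for uniqueness.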
def readable_hash(
    data: Union[str, bytes, Iterable[int]],
    *,
    salt: Union[str, bytes, None] = None,
    words: tuple[list[str], list[str]] = (ADJECTIVES, ANIMALS + NOUNS),
    sep: str = "-",
    checksum_len: int = 2,  # 0 to disable; 2-3 is plenty
    case: str = "lower",    # "lower" | "title" | "upper"
) -> str:
"""
Deterministically map input data to 'adjective-animal[-checksum]'. Generated using ChatGPT.
Examples
--------
>>> readable_hash("hello world")
'magical-panda-6h'
>>> readable_hash("hello world", salt="my-app-v1", checksum_len=3)
'royal-otter-1pz'
>>> readable_hash(b"\x00\x01\x02\x03", case="title", checksum_len=0)
'Fuzzy-Tiger'
Vocabulary
----------
ADJECTIVES: ~160 safe, descriptive words (e.g. "ancient", "brave", "silent", "swift")
ANIMALS: ~80 short, common animals (e.g. "dog", "owl", "whale", "tiger")
NOUNS: optional set of ~30 neutral nouns (e.g. "rock", "star", "tree", "cloud")
Combinations
------------
- adjective + animal: ~13,000 unique names
- adjective + noun: ~5,000 unique names
- adjective + animal + noun: ~390,000 unique names
Checksum
--------
An optional short base-36 suffix (e.g. "-6h" or "-1pz"). The checksum
acts as a disambiguator in case two different inputs map to the same
word combination. With 2-3 characters, collisions become vanishingly rare.
If you only need fun, human-readable names, you can disable it by setting
``checksum_len=0``. If you need unique, stable identifiers, keep it enabled.
"""
    if isinstance(data, str):
        data = data.encode()
    elif isinstance(data, Iterable) and not isinstance(data, (bytes, bytearray)):
        data = bytes(data)

    h = hashlib.blake2b(digest_size=8)  # fast, stable, short digest
    if salt:
        h.update(salt.encode() if isinstance(salt, str) else salt)
        h.update(b"\x00")  # domain-separate salt from data
    h.update(data)
    digest = h.digest()

    # Use the first 6 bytes to index words; last bytes for checksum
    n1 = int.from_bytes(digest[0:3], "big")
    n2 = int.from_bytes(digest[3:6], "big")
    adj = words[0][n1 % len(words[0])]
    noun = words[1][n2 % len(words[1])]
    phrase = f"{adj}{sep}{noun}"

    if checksum_len > 0:
        # Short base36 checksum for collision visibility
        cs = int.from_bytes(digest[6:], "big")
        base36 = ""
        alphabet = "0123456789abcdefghijklmnopqrstuvwxyz"
        while cs:
            cs, r = divmod(cs, 36)
            base36 = alphabet[r] + base36
        base36 = (base36 or "0")[:checksum_len]
        phrase = f"{phrase}{sep}{base36}"

    if case == "title":
        phrase = sep.join(p.capitalize() for p in phrase.split(sep))
    elif case == "upper":
        phrase = phrase.upper()
    return phrase
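
if __name__ == "__main__":
    # Quick, self-contained sanity check (illustrative; not part of the app flow):
    # the mapping is deterministic, and a salt selects a different phrase space.
    a = readable_hash("example-user")
    b = readable_hash("example-user")
    c = readable_hash("example-user", salt="leaderboard-v1")
    assert a == b, "readable_hash should be deterministic for the same input"
    print(a, c, readable_hash("example-user", case="title", checksum_len=0))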