mshamrai committed on
Commit
4a784da
·
1 Parent(s): f9b063b

chore: use datasets

Browse files
Files changed (1) hide show
  1. app.py +11 -10
app.py CHANGED
@@ -14,24 +14,25 @@ from utils import (plot_distances_tsne,
14
  cluster_languages_by_subfamilies,
15
  filter_languages_by_families)
16
  from functools import partial
 
17
 
18
 
19
- with open("../../results/languages_list.pkl", "rb") as f:
20
- languages = pickle.load(f)
21
 
22
- DATASETS = ["wikimedia/wikipedia", "uonlp/CulturaX", "HuggingFaceFW/fineweb-2"]
23
- MODELS = ["mistralai/Mistral-7B-v0.1", "google/gemma-3-4b-pt", "meta-llama/Llama-3.2-1B"]
 
 
 
24
 
25
  distance_matrices = {
26
- dataset: {
27
- model: np.load(os.path.join("../../results", dataset, model, "distances_matrix.npy"))
28
- for model in MODELS
29
  }
30
- for dataset in DATASETS
31
  }
32
 
33
- average_distances_matrix = np.load("../../results/average_distances_matrix.npy")
34
-
35
 
36
  def filter_languages_nan(model, dataset, use_average):
37
  if use_average:
 
14
  cluster_languages_by_subfamilies,
15
  filter_languages_by_families)
16
  from functools import partial
17
+ import datasets
18
 
19
 
20
+ dataset = datasets.load_dataset("mshamrai/language-metric-data", split="train", trust_remote_code=True)
 
21
 
22
+ languages = dataset["languages_list"][0]
23
+ average_distances_matrix = np.array(dataset["average_distances_matrix"][0])
24
+
25
+ DATASETS = dataset["distances_matrices"][0]["dataset_name"]
26
+ MODELS = dataset["distances_matrices"][0]["models"][0]["model_name"]
27
 
28
  distance_matrices = {
29
+ DATASETS[i]: {
30
+ MODELS[j]: np.array(dataset["distances_matrices"][0]["models"][i]["matrix"][j])
31
+ for j in range(len(MODELS))
32
  }
33
+ for i in range(len(DATASETS))
34
  }
35
 
 
 
36
 
37
  def filter_languages_nan(model, dataset, use_average):
38
  if use_average: