Commit: bcadbe0
Parent(s): 2e5b810

Add seqlen

app.py CHANGED
@@ -288,6 +288,59 @@ EXTERNAL_MODEL_TO_DIM = {
     "unsup-simcse-bert-base-uncased": 768,
 }
 
+
+EXTERNAL_MODEL_TO_SEQLEN = {
+    "xlm-roberta-large": 514,
+    "use-cmlm-multilingual": 512,
+    "gottbert-base": 512,
+    "cross-en-de-roberta-sentence-transformer": 514,
+    "gbert-base": 512,
+    "gbert-large": 512,
+    "gelectra-base": 512,
+    "gelectra-large": 512,
+    "gottbert-base": 512,
+
+    "LASER2": "N/A",
+    "LaBSE": 512,
+    "all-MiniLM-L12-v2": 512,
+    "all-MiniLM-L6-v2": 512,
+    "all-mpnet-base-v2": 514,
+    "allenai-specter": 512,
+    "bert-base-uncased": 512,
+    "contriever-base-msmarco": 512,
+    "glove.6B.300d": "N/A",
+    "gtr-t5-base": 512,
+    "gtr-t5-large": 512,
+    "gtr-t5-xl": 512,
+    "gtr-t5-xxl": 512,
+    "komninos": "N/A",
+    "msmarco-bert-co-condensor": 512,
+    "paraphrase-multilingual-MiniLM-L12-v2": 512,
+    "paraphrase-multilingual-mpnet-base-v2": 514,
+    "sentence-t5-base": 512,
+    "sentence-t5-large": 512,
+    "sentence-t5-xl": 512,
+    "sentence-t5-xxl": 512,
+    "sup-simcse-bert-base-uncased": 512,
+
+    "text-embedding-ada-002": 8191,
+
+    "text-similarity-ada-001": 2046,
+    "text-similarity-babbage-001": 2046,
+    "text-similarity-curie-001": 2046,
+    "text-similarity-davinci-001": 2046,
+
+    "text-search-ada-doc-001": 2046,
+    "text-search-ada-query-001": 2046,
+    "text-search-ada-001": 2046,
+    "text-search-babbage-001": 2046,
+    "text-search-curie-001": 2046,
+    "text-search-davinci-001": 2046,
+
+    "unsup-simcse-bert-base-uncased": 512,
+}
+
+
 MODELS_TO_SKIP = {
     "baseplate/instructor-large-1", # Duplicate
     "radames/e5-large", # Duplicate
@@ -341,26 +394,22 @@ for model in EXTERNAL_MODELS:
         ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
         EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})
 
-def get_dim(model):
+def get_dim_seq(model):
     filenames = [sib.rfilename for sib in model.siblings]
-    dim = ""
+    dim, seq = "", ""
     if "1_Pooling/config.json" in filenames:
         st_config_path = hf_hub_download(model.modelId, filename="1_Pooling/config.json")
         dim = json.load(open(st_config_path)).get("word_embedding_dimension", "")
     elif "2_Pooling/config.json" in filenames:
         st_config_path = hf_hub_download(model.modelId, filename="2_Pooling/config.json")
         dim = json.load(open(st_config_path)).get("word_embedding_dimension", "")
-    elif "config.json" in filenames:
+    if "config.json" in filenames:
         config_path = hf_hub_download(model.modelId, filename="config.json")
         config = json.load(open(config_path))
-        if "hidden_dim" in config:
-            dim = config["hidden_dim"]
-        elif "hidden_size" in config:
-            dim = config["hidden_size"]
-        elif "d_model" in config:
-            dim = config["d_model"]
-    return dim
-
+        if not dim:
+            dim = config.get("hidden_dim", config.get("hidden_size", config.get("d_model", "")))
+        seq = config.get("n_positions", config.get("max_position_embeddings", config.get("n_ctx", config.get("seq_length", ""))))
+    return dim, seq
 
 def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=False, task_to_metric=TASK_TO_METRIC):
     api = HfApi()
@@ -381,6 +430,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=False, task_to_metric=TASK_TO_METRIC):
         if len(res) > 1:
             if add_emb_dim:
                 res["Embedding Dimensions"] = EXTERNAL_MODEL_TO_DIM.get(model, "")
+                res["Sequence Length"] = EXTERNAL_MODEL_TO_SEQLEN.get(model, "")
             df_list.append(res)
 
     for model in models:
@@ -414,7 +464,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=False, task_to_metric=TASK_TO_METRIC):
         # Model & at least one result
         if len(out) > 1:
             if add_emb_dim:
-                out["Embedding Dimensions"] = get_dim(model)
+                out["Embedding Dimensions"], out["Sequence Length"] = get_dim_seq(model)
             df_list.append(out)
     df = pd.DataFrame(df_list)
     # Put 'Model' column first
@@ -472,7 +522,7 @@ def get_mteb_average():
     DATA_STS_EN = DATA_OVERALL[["Model"] + TASK_LIST_STS]
     DATA_SUMMARIZATION = DATA_OVERALL[["Model"] + TASK_LIST_SUMMARIZATION]
 
-    DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Embedding Dimensions", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
+    DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Embedding Dimensions", "Sequence Length", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
 
     return DATA_OVERALL
 
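Note: the substantive logic in this commit is the chained dict.get fallback in get_dim_seq, which reads the embedding dimension and the maximum sequence length from whichever key a given checkpoint's config.json happens to use (BERT-style configs expose hidden_size and max_position_embeddings, T5-style ones d_model and n_positions, GPT-style ones n_ctx). A minimal offline sketch of that lookup follows; the two sample config dicts are made up for illustration and stand in for files that app.py downloads from the Hub.

# Sketch of the fallback lookups from get_dim_seq above, run on
# illustrative in-memory configs instead of downloaded config.json files.

def lookup_dim(config):
    # Same precedence as the diff: hidden_dim, then hidden_size, then d_model.
    return config.get("hidden_dim", config.get("hidden_size", config.get("d_model", "")))

def lookup_seq(config):
    # Same precedence as the diff: n_positions, then max_position_embeddings,
    # then n_ctx, then seq_length.
    return config.get("n_positions", config.get("max_position_embeddings",
                      config.get("n_ctx", config.get("seq_length", ""))))

bert_like = {"hidden_size": 768, "max_position_embeddings": 512}  # invented sample
t5_like = {"d_model": 1024, "n_positions": 512}                   # invented sample

print(lookup_dim(bert_like), lookup_seq(bert_like))  # -> 768 512
print(lookup_dim(t5_like), lookup_seq(t5_like))      # -> 1024 512

One quirk of nesting get calls this way is that every inner default is evaluated eagerly, which is fine for plain dict reads like these but would be wasteful if each fallback were an expensive call.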
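For context on how get_dim_seq is fed: it operates on the model objects returned by the huggingface_hub API, reading model.siblings for the repo's file list and model.modelId as the download target. A hedged driver sketch, assuming get_dim_seq from the diff above is in scope (e.g. imported from app.py), that the Hub is reachable, and a huggingface_hub version that, like this code, still exposes ModelInfo.modelId; the commented values are what this repo's configs should yield, not guaranteed output.

from huggingface_hub import HfApi

api = HfApi()
# model_info() returns a ModelInfo whose .siblings lists the repo's files,
# which is what get_dim_seq inspects before downloading any config.
info = api.model_info("sentence-transformers/all-MiniLM-L6-v2")
dim, seq = get_dim_seq(info)
print(dim, seq)  # expected 384 512: word_embedding_dimension from
                 # 1_Pooling/config.json, max_position_embeddings from config.json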