| | import json |
| | import os |
| |
|
| | import filelock |
| | import huggingface_hub |
| | import pandas as pd |
| |
|
| | from utils import ( |
| | build_datasets_urls, |
| | build_models_urls, |
| | build_text_icon, |
| | download_favicons, |
| | get_base_url, |
| | get_domain_name, |
| | ) |
| |
|
| |
|
| | HF_ICON = "https://huggingface.co/front/assets/huggingface_logo.svg" |
| | CROSS_ICON = "https://upload.wikimedia.org/wikipedia/commons/4/4e/Cross.png" |
| |
|
| | DISABLE_ONLINE_CACHE = False |
| | ONLINE_CACHE = "CONDA-Workshop/RequestCache" |
| |
|
| |
|
def save_cache(cache_data, cache_file, initial_timestamp):
    """Persist *cache_data* to *cache_file* as JSON, merging concurrent edits.

    If the file on disk was created or modified after *initial_timestamp*
    (i.e. another process wrote to it since we first read it), the on-disk
    entries are merged into *cache_data* before writing, so concurrent
    additions are not lost.

    NOTE(review): the merge uses ``cache_data.update(cache_dict)``, so for
    duplicate keys the on-disk values win over the in-memory ones — confirm
    this precedence is intended.

    Args:
        cache_data (dict): cache contents to save.
        cache_file (str): path of the JSON cache file.
        initial_timestamp (float | None): mtime of *cache_file* when it was
            first read, or None if it did not exist at that time.

    Returns:
        dict: the (possibly merged) cache data that was written.
    """
    print(f"Saving cache to {cache_file}")

    # Serialize writers on the same machine; the lock file lives next to
    # the cache file itself.
    with filelock.FileLock(f"{cache_file}.lock"):

        current_timestamp = (
            os.path.getmtime(cache_file) if os.path.exists(cache_file) else None
        )
        # File appeared or changed since we first read it: merge its
        # contents so another process's entries are not clobbered.
        if current_timestamp is None or initial_timestamp != current_timestamp:

            try:
                with open(cache_file, "r", encoding="utf8") as f:

                    cache_dict = json.load(f)

                if cache_dict != cache_data:
                    cache_data.update(cache_dict)

            except FileNotFoundError:
                # Nothing on disk yet; just write our data below.
                pass

        with open(cache_file, "w", encoding="utf8") as f:
            json.dump(cache_data, f, ensure_ascii=False, indent=4)

        # Best-effort mirror of the cache to the Hugging Face Hub dataset
        # repo; failures are logged but never abort the local save.
        if not DISABLE_ONLINE_CACHE:
            try:
                huggingface_hub.upload_file(
                    repo_id=ONLINE_CACHE,
                    repo_type="dataset",
                    # `or True` falls back to the locally stored HF token
                    # when the TOKEN env var is unset/empty.
                    token=os.environ.get("TOKEN") or True,
                    path_in_repo=cache_file,
                    path_or_fileobj=cache_file,
                )
            except Exception as e:
                print(f"Unable to upload {cache_file}: {e}")

    return cache_data
| |
|
| |
|
def update_favicon_cache(sources):
    """Return a mapping of base URL -> favicon URL for every entry in *sources*.

    The cache is stored in ``favicons.json`` (optionally refreshed from the
    online mirror first); favicons not yet cached are resolved via
    ``download_favicons`` and persisted through ``save_cache``.
    """
    cache_path = "favicons.json"
    favicons = {}
    first_seen_mtime = None

    # Best-effort refresh of the local cache file from the online mirror.
    if not DISABLE_ONLINE_CACHE:
        try:
            huggingface_hub.hf_hub_download(
                repo_id=ONLINE_CACHE,
                repo_type="dataset",
                token=os.environ.get("TOKEN") or True,
                filename=cache_path,
                local_dir=os.getcwd(),
            )
        except Exception as e:
            print(f"Unable to download favicons.json: {e}")

    # Load whatever is on disk, remembering its mtime so save_cache can
    # detect concurrent modifications later.
    if os.path.exists(cache_path):
        first_seen_mtime = os.path.getmtime(cache_path)
        try:
            with open(cache_path, "r", encoding="utf8") as f:
                favicons = json.load(f)
        except FileNotFoundError:
            pass

    # Resolve and persist only the domains we have not seen before.
    pending = [domain for domain in sources if domain not in favicons]
    if pending:
        favicons.update(download_favicons(pending))
        favicons = save_cache(
            cache_data=favicons,
            cache_file=cache_path,
            initial_timestamp=first_seen_mtime,
        )

    return favicons
| |
|
| |
|
def update_model_url_cache(models):
    """Return a mapping of model name -> model URL for every entry in *models*.

    ``None`` entries and duplicates are dropped. The cache is stored in
    ``model_urls.json`` (optionally refreshed from the online mirror first);
    URLs not yet cached are resolved via ``build_models_urls`` and persisted
    through ``save_cache``.
    """
    unique_models = list({m for m in models if m is not None})

    cache_path = "model_urls.json"
    model_urls = {}
    first_seen_mtime = None

    # Best-effort refresh of the local cache file from the online mirror.
    if not DISABLE_ONLINE_CACHE:
        try:
            huggingface_hub.hf_hub_download(
                repo_id=ONLINE_CACHE,
                repo_type="dataset",
                token=os.environ.get("TOKEN") or True,
                filename=cache_path,
                local_dir=os.getcwd(),
            )
        except Exception as e:
            print(f"Unable to download model_urls.json: {e}")

    # Load whatever is on disk, remembering its mtime so save_cache can
    # detect concurrent modifications later.
    if os.path.exists(cache_path):
        first_seen_mtime = os.path.getmtime(cache_path)
        try:
            with open(cache_path, "r", encoding="utf8") as f:
                model_urls = json.load(f)
        except FileNotFoundError:
            pass

    # Resolve and persist only the models we have not seen before.
    pending = [model for model in unique_models if model not in model_urls]
    if pending:
        model_urls.update(build_models_urls(pending))
        model_urls = save_cache(
            cache_data=model_urls,
            cache_file=cache_path,
            initial_timestamp=first_seen_mtime,
        )

    return model_urls
| |
|
| |
|
def update_dataset_url_cache(datasets):
    """Return a mapping of dataset name -> dataset URL for every entry in *datasets*.

    ``None`` entries and duplicates are dropped. The cache is stored in
    ``dataset_urls.json`` (optionally refreshed from the online mirror first);
    URLs not yet cached are resolved via ``build_datasets_urls`` and persisted
    through ``save_cache``.
    """
    unique_datasets = list({d for d in datasets if d is not None})

    cache_path = "dataset_urls.json"
    dataset_urls = {}
    first_seen_mtime = None

    # Best-effort refresh of the local cache file from the online mirror.
    if not DISABLE_ONLINE_CACHE:
        try:
            huggingface_hub.hf_hub_download(
                repo_id=ONLINE_CACHE,
                repo_type="dataset",
                token=os.environ.get("TOKEN") or True,
                filename=cache_path,
                local_dir=os.getcwd(),
            )
        except Exception as e:
            print(f"Unable to download dataset_urls.json: {e}")

    # Load whatever is on disk, remembering its mtime so save_cache can
    # detect concurrent modifications later.
    if os.path.exists(cache_path):
        first_seen_mtime = os.path.getmtime(cache_path)
        try:
            with open(cache_path, "r", encoding="utf8") as f:
                dataset_urls = json.load(f)
        except FileNotFoundError:
            pass

    # Resolve and persist only the datasets we have not seen before.
    pending = [dataset for dataset in unique_datasets if dataset not in dataset_urls]
    if pending:
        dataset_urls.update(build_datasets_urls(pending))
        dataset_urls = save_cache(
            cache_data=dataset_urls,
            cache_file=cache_path,
            initial_timestamp=first_seen_mtime,
        )

    return dataset_urls
| |
|
| |
|
def get_dataframe():
    """Load ``contamination_report.csv`` and format it for display.

    Refreshes the favicon / model-URL / dataset-URL caches (downloading any
    missing entries), turns the Reference, PR, Evaluation Dataset and
    Contaminated Source columns into icon+link markup, folds the Subset and
    Version columns into their parent columns, and converts the three split
    percentages to fractions.

    Returns:
        pd.DataFrame: the formatted contamination report table.
    """
    data = pd.read_csv("contamination_report.csv", delimiter=";", header=0)

    # Update the caches before formatting; each returns a name -> URL dict.
    favicon_dict = update_favicon_cache([get_base_url(x) for x in data["Reference"]])

    model_url_dict = update_model_url_cache(
        data[data["Model or corpus"] == "model"]["Contaminated Source"]
    )

    dataset_url_dict = update_dataset_url_cache(
        list(data["Evaluation Dataset"])
        + list(data[data["Model or corpus"] == "corpus"]["Contaminated Source"])
    )

    data["Reference"] = data["Reference"].apply(
        lambda x: build_text_icon(
            text=get_domain_name(x),
            url=x,
            icon_url=favicon_dict.get(get_base_url(x), ""),
        )
    )

    PR_URL_FORMAT = "https://huggingface.co/spaces/CONDA-Workshop/Data-Contamination-Report/discussions/{}"
    # A NaN PR number means the row has no associated pull request; show a
    # cross icon with no link instead of the HF logo.
    data["PR"] = data["PR"].apply(
        lambda x: build_text_icon(
            text="",
            url=PR_URL_FORMAT.format(int(x)) if pd.notna(x) else "no link",
            icon_url=HF_ICON if pd.notna(x) else CROSS_ICON,
        )
    )

    data["Evaluation Dataset"] = data["Evaluation Dataset"].apply(
        lambda x: build_text_icon(
            text=x,
            url=dataset_url_dict.get(x, ""),
            icon_url=HF_ICON,
        )
    )

    # Fold the optional Subset column into the dataset label, then drop it.
    data["Evaluation Dataset"] = data.apply(
        lambda x: x["Evaluation Dataset"] + f" ({x['Subset']})"
        if pd.notna(x["Subset"])
        else x["Evaluation Dataset"],
        axis=1,
    )
    del data["Subset"]

    # The contaminated source is either a model or a corpus; pick the URL
    # from the matching cache. Fold the optional Version into the label.
    data["Contaminated Source"] = data.apply(
        lambda x: build_text_icon(
            text=x["Contaminated Source"] + f" ({x['Version']})"
            if pd.notna(x["Version"])
            else x["Contaminated Source"],
            url=dataset_url_dict.get(x["Contaminated Source"], "")
            if x["Model or corpus"] == "corpus"
            else model_url_dict.get(x["Contaminated Source"], ""),
            icon_url=HF_ICON,
        ),
        axis=1,
    )
    del data["Version"]

    def _percent_to_fraction(x):
        # Leave falsy values (0) untouched; NaN is truthy, and NaN/100 is
        # still NaN, so missing values pass through unchanged.
        return x / 100 if x else x

    for column in ("Train Split", "Development Split", "Test Split"):
        data[column] = data[column].apply(_percent_to_fraction)

    return data
| |
|