File fixes and cleaning (#17)
- Add changes (23add198b9aed3461771ec64c740e7c2f6789dd1)
- Add info about the changes in the markdown. (4a1e5cc01386ce466b5172d77f8d97e0792609f9)
- contamination_report.csv +0 -0
- dataset.py +2 -1
- markdown.py +2 -1
- postprocessing.py +43 -0
contamination_report.csv
CHANGED
The diff for this file is too large to render.
dataset.py
CHANGED
@@ -256,7 +256,7 @@ def get_dataframe():
     # For "Contaminated Source" use build_dataset_url if "Model or corpus" is "corpus" and build_model_url if "Model or corpus" is "model"
     data["Contaminated Source"] = data.apply(
         lambda x: build_text_icon(
-            text=x["Contaminated Source"],
+            text=x["Contaminated Source"] + f" ({x['Version']})" if pd.notna(x["Version"]) else x["Contaminated Source"],
             url=dataset_url_dict.get(x["Contaminated Source"], "")
             if x["Model or corpus"] == "corpus"
             else model_url_dict.get(x["Contaminated Source"], ""),
@@ -264,6 +264,7 @@ def get_dataframe():
         ),
         axis=1,
     )
+    del data["Version"]
 
     data["Train Split"] = data["Train Split"].apply(lambda x: x/100 if x else x)
     data["Development Split"] = data["Development Split"].apply(lambda x: x/100 if x else x)
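For context, here is a minimal, self-contained sketch of what this change does. `build_text_icon`, both URL dicts, and all values below are simplified stand-ins for the real objects in dataset.py, and the version string is hypothetical:

import pandas as pd

# Simplified stand-in for dataset.py's build_text_icon helper.
def build_text_icon(text, url):
    return f"[{text}]({url})" if url else text

dataset_url_dict = {"glue": "https://huggingface.co/datasets/glue"}
model_url_dict = {"allenai/OLMo-7B": "https://huggingface.co/allenai/OLMo-7B"}

data = pd.DataFrame({
    "Contaminated Source": ["glue", "allenai/OLMo-7B"],
    "Model or corpus": ["corpus", "model"],
    "Version": [None, "v1.0"],  # hypothetical version value
})

# Same expression as the diff: append " (<Version>)" to the displayed text
# only when a Version is present; the URL lookup still uses the raw name.
data["Contaminated Source"] = data.apply(
    lambda x: build_text_icon(
        text=x["Contaminated Source"] + f" ({x['Version']})" if pd.notna(x["Version"]) else x["Contaminated Source"],
        url=dataset_url_dict.get(x["Contaminated Source"], "")
        if x["Model or corpus"] == "corpus"
        else model_url_dict.get(x["Contaminated Source"], ""),
    ),
    axis=1,
)
del data["Version"]  # Version is only displayed, not kept as its own column

print(data["Contaminated Source"].tolist())
# ['[glue](https://huggingface.co/datasets/glue)',
#  '[allenai/OLMo-7B (v1.0)](https://huggingface.co/allenai/OLMo-7B)']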
markdown.py
CHANGED
@@ -60,8 +60,9 @@ Citation: `@inproceedings{...`
 
 The [contamination_report.csv](https://huggingface.co/spaces/CONDA-Workshop/Data-Contamination-Database/blob/main/contamination_report.csv) file is a csv file with `;` delimiters. You will need to update the following columns:
 - **Evaluation Dataset**: Name of the evaluation dataset that has (not) been compromised. If available on the HuggingFace Hub, please write the path (e.g. `uonlp/CulturaX`); otherwise, provide the name of the dataset.
-- **Subset**: Many HuggingFace datasets have different subsets or splits within a single dataset. This field defines a particular subset of a given dataset. For example, the `qnli` subset of `glue`.
+- **Subset**: (Optional) Many HuggingFace datasets have different subsets or splits within a single dataset. This field defines a particular subset of a given dataset. For example, the `qnli` subset of `glue`.
 - **Contaminated Source**: Name of the model that has been trained with the evaluation dataset, or name of the pre-training corpora that contains the evaluation dataset. If available on the HuggingFace Hub, please write the path (e.g. `allenai/OLMo-7B`); otherwise, provide the name of the model/dataset.
+- **Version**: (Optional) Any information relevant to identify the version of the model or dataset. This information will be shown between parentheses in the Contaminated Source column.
 - **Train split**: Percentage of the train split contaminated. 0 means no contamination. 100 means that the dataset has been fully compromised. If the dataset doesn't have splits, you can consider the full dataset to be a train or test split.
 - **Development split**: Percentage of the development split contaminated. 0 means no contamination. 100 means that the dataset has been fully compromised.
 - **Test split**: Percentage of the test split contaminated. 0 means no contamination. 100 means that the dataset has been fully compromised. If the dataset doesn't have splits, you can consider the full dataset to be a train or test split.
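To make the documented column order concrete, here is a hypothetical example row (all values invented for illustration; trailing columns after Test split are elided):

Evaluation Dataset;Subset;Contaminated Source;Version;Train split;Development split;Test split;...
glue;qnli;allenai/OLMo-7B;v1.0;0;0;5;...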
postprocessing.py
ADDED
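The new script below normalizes contamination_report.csv in place: it sorts the rows, drops duplicate entries, rewrites arXiv PDF links to abstract links, and inserts an empty value for the new Version column into every existing row, which is presumably why the csv diff above is too large to render.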
def load_file(filename):
    # Read the ;-delimited report: the first line is the header, the rest are
    # rows; blank lines (used as visual group separators) are skipped.
    with open(filename, 'r') as f:
        header = f.readline().strip().split(";")
        return header, [line.strip().split(";") for line in f if line.strip()]

def remove_duplicates(data):
    # Drop duplicate rows, keyed on the first four columns plus the last one,
    # keeping the first occurrence.
    keys = set()
    _data = []
    for item in data:
        key = tuple((item[0], item[1], item[2], item[3], item[-1]))
        if key in keys:
            continue
        _data += [item]
        keys.add(key)
    return _data

def fix_arxiv_links(data):
    # Point arXiv links in the second-to-last column at the abstract page
    # instead of the PDF.
    return [[*item[:-2], item[-2].replace("arxiv.org/pdf", "arxiv.org/abs"), item[-1]] for item in data]

def sort_data(data):
    return sorted(data, key=lambda x: (x[0], x[1], x[2], x[3], x[-1]))

def main():
    header, data = load_file("contamination_report.csv")
    data = sort_data(data)
    data = remove_duplicates(data)
    data = fix_arxiv_links(data)
    print("Total datapoints:", len(data))

    with open("contamination_report.csv", 'w') as f:
        f.write(";".join(header) + "\n")
        past_key = None
        for line in data:
            # Separate (Evaluation Dataset, Subset) groups with a blank line.
            key = tuple((line[0], line[1]))
            if key != past_key:
                f.write("\n")
                past_key = key
            # Insert an empty value for the new "Version" column (index 3,
            # right after "Contaminated Source") into every existing row.
            line = line[:3] + [""] + line[3:]
            f.write(";".join(line) + "\n")


if __name__ == "__main__":
    main()
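The script takes no arguments and needs only the standard library; run it from the directory containing contamination_report.csv:

python postprocessing.py

Note that the empty Version field is inserted unconditionally, so this reads as a one-shot migration: running it a second time would insert a second empty column into every row.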