Spaces:
Runtime error
Runtime error
""" | |
This code was inspired by https://huggingface.co/spaces/HugoLaurencon/examples_before_after_pii/
and https://huggingface.co/spaces/SaulLu/diff-visualizer
""" | |
import streamlit as st | |
from datasets import load_dataset | |
import diff_viewer | |
import os | |
# Page-wide configuration and the visible title of the app.
st.set_page_config(layout="wide", page_title="PII Visualization")
st.title("PII Visualization")

# Credential handed to `load_dataset`: the value of the "data-pii" environment
# variable when it is set and non-empty, otherwise the literal True.
_env_token = os.environ.get("data-pii")
auth_token = _env_token if _env_token else True
def load_data(language="python"):
    """Load the ``train`` split of the PII-check dataset for one language.

    Reads ``bigcode/pii_checks_python_java_js`` from ``data/<language>`` using
    the module-level ``auth_token``, then drops the ``has_secrets``,
    ``number_secrets`` and ``path`` columns.

    Args:
        language: Name of the sub-directory under ``data/`` to load.

    Returns:
        The loaded dataset without the three dropped columns.
    """
    # NOTE(review): newer `datasets` releases appear to replace
    # `use_auth_token` with `token` -- confirm against the pinned version.
    unused_columns = ['has_secrets', 'number_secrets', 'path']
    raw = load_dataset(
        "bigcode/pii_checks_python_java_js",
        data_dir=f"data/{language}",
        split="train",
        use_auth_token=auth_token,
    )
    return raw.remove_columns(unused_columns)
def get_samples_tag(dataset, tag):
    """Return the row indices of samples whose references mention a PII tag.

    A sample matches when the marker ``"PI:" + tag.upper()`` occurs as a
    substring of its ``references`` value.

    Args:
        dataset: Object for which ``dataset["references"]`` yields one
            references string per sample, in row order.
        tag: PII tag name such as ``"KEY"`` or ``"IP_ADDRESS"``; it is
            upper-cased before matching.

    Returns:
        A list of zero-based row positions of the matching samples, in
        ascending order.
    """
    # Build the marker once instead of once per row.
    marker = "PI:" + tag.upper()
    # Enumerating the column gives the row position directly, so no temporary
    # "index" column or filtered copy of the dataset is needed.
    return [row for row, refs in enumerate(dataset["references"]) if marker in refs]
# Layout: both input widgets live in the narrow left-hand column.
col1, col2 = st.columns([2, 4])
with col1:
    lang = st.selectbox("Select a programming language", ["Python", "Java", "JavaScript"])

# The data sub-directory is the lower-cased language name.
samples = load_data(language=lang.lower())
max_docs = len(samples)
if max_docs == 0:
    # With no rows, max_value below would be -1 (less than min_value) and
    # samples[0] would not exist, so stop this run with a message instead.
    st.warning(f"No modified {lang} files are available to display.")
    st.stop()

with col1:
    index_example = st.number_input(
        f"Index of the chosen example from the existing {max_docs}",
        min_value=0,
        max_value=max_docs - 1,
        value=0,
        step=1,
    )

# Row indices of the samples containing key / IP-address references, listed
# at the bottom of the page.
keys = get_samples_tag(samples, "KEY")
ips = get_samples_tag(samples, "IP_ADDRESS")

st.write("Here we highlight the difference in code before and after the PII redaction on the Python, Java and Javascript subsets of the-stack-smol. We only show files that were modified.")

example = samples[index_example]
# Every occurrence of "PI:" in the references string is counted as one
# redacted secret.
delimiter = "PI:"
count = example["references"].count(delimiter)
secrets = "secret" if count == 1 else "secrets"
st.subheader(f"{lang} example {index_example} has {count} redacted {secrets}:")

# Diff of the file content before and after redaction.
diff_viewer.diff_viewer(old_text=example["content"], new_text=example["new_content"], lang="none")

st.markdown("Email redaction examples are very common unlike **IP addresses** and **keys**. To visualize them you can check these samples:")
st.text(f"IP addresses:\n{ips}\nKeys:\n{keys}")