# pii-public-demo / app.py
# loubnabnl's picture
# loubnabnl HF Staff
# Duplicate from bigcode/pii-visualization
# 4c12510
# raw
# history blame
# 2.31 kB
"""
This code was inspired by https://huggingface.co/spaces/HugoLaurencon/examples_before_after_pii/
and https://huggingface.co/spaces/SaulLu/diff-visualizer
"""
import streamlit as st
from datasets import load_dataset
import diff_viewer
import os
# Page chrome: wide layout so the side-by-side diff has room.
st.set_page_config(layout="wide", page_title="PII Visualization")
st.title("PII Visualization")

# Token read from the "data-pii" environment variable; falls back to True
# when the variable is unset or empty.
# NOTE(review): presumably True makes `datasets` use the locally stored
# Hugging Face credentials — confirm against the datasets documentation.
auth_token = os.getenv("data-pii") or True
@st.cache()
def load_data(language="python"):
    """Load the PII-check ``train`` split for one programming language.

    Args:
        language: Name of the sub-directory under ``data/`` in the
            ``bigcode/pii_checks_python_java_js`` dataset repository.
            The page script passes the lower-cased selectbox choice
            ("python", "java" or "javascript").

    Returns:
        The loaded split with the ``has_secrets``, ``number_secrets`` and
        ``path`` columns removed. The page script reads the ``content``,
        ``new_content`` and ``references`` columns from the result.
    """
    # NOTE(review): newer Streamlit / datasets releases deprecate `st.cache`
    # and `use_auth_token` — confirm the pinned versions before upgrading.
    dataset = load_dataset(
        "bigcode/pii_checks_python_java_js",
        data_dir=f"data/{language}",
        split="train",
        use_auth_token=auth_token,
    )
    # Drop the bookkeeping columns that the visualization never displays.
    return dataset.remove_columns(["has_secrets", "number_secrets", "path"])
def get_samples_tag(dataset, tag):
    """Return the row positions whose references mention a given PII tag.

    Args:
        dataset: Dataset-like object exposing ``__len__``, ``add_column``,
            ``filter`` and column access by name, with a ``references``
            column of strings.
        tag: PII tag name such as "KEY" or "IP_ADDRESS"; it is upper-cased
            before matching.

    Returns:
        The values of the temporary ``index`` column (row positions in
        ``dataset``) for every row whose ``references`` contains the
        substring ``"PI:" + tag.upper()``.
    """
    # Build the search marker once instead of on every row.
    marker = "PI:" + tag.upper()
    # Add an id column so matching rows can be traced back to their original
    # position; the range is materialized because add_column is documented
    # as taking a list or array.
    indexed = dataset.add_column("index", list(range(len(dataset))))
    matching = indexed.filter(lambda row: marker in row["references"])
    return matching["index"]
# Layout: the input widgets live in the narrow left column.
col1, col2 = st.columns([2, 4])
with col1:
    lang = st.selectbox("Select a programming language", ["Python", "Java", "JavaScript"])

# The dataset sub-directory is keyed by the lower-cased language name.
samples = load_data(language=lang.lower())
max_docs = len(samples)

with col1:
    # The valid index range depends on the size of the split loaded above.
    index_example = st.number_input(
        f"Index of the chosen example from the existing {max_docs}",
        min_value=0,
        max_value=max_docs - 1,
        value=0,
        step=1,
    )

# Row positions of the rarer redaction kinds, listed at the bottom of the page.
keys = get_samples_tag(samples, "KEY")
ips = get_samples_tag(samples, "IP_ADDRESS")

st.write("Here we highlight the difference in code before and after the PII redaction on the Python, Java and Javascript subsets of the-stack-smol. We only show files that were modified.")

example = samples[index_example]
# Each occurrence of "PI:" in the references string is counted as one redaction.
delimiter = "PI:"
count = example["references"].count(delimiter)
secrets = "secret" if count == 1 else "secrets"
st.subheader(f"{lang} example {index_example} has {count} redacted {secrets}:")

# Diff of the original file against its redacted version.
# To inspect the annotated references instead, pass example["references"]
# as new_text.
diff_viewer.diff_viewer(old_text=example["content"], new_text=example["new_content"], lang="none")

st.markdown("Email redaction examples are very common unlike **IP addresses** and **keys**. To visualize them you can check these samples:")
st.text(f"IP addresses:\n{ips}\nKeys:\n{keys}")