File size: 2,305 Bytes
4c12510
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
"""
This code was inspired by https://huggingface.co/spaces/HugoLaurencon/examples_before_after_pii/
and https://huggingface.co/spaces/SaulLu/diff-visualizer
"""

import streamlit as st
from datasets import load_dataset
import diff_viewer
import os

# Page-level setup: wide layout and the same text for the browser tab title
# and the on-page heading.
st.set_page_config(page_title="PII Visualization", layout="wide")
st.title("PII Visualization")

# Token forwarded to load_dataset(use_auth_token=...). When the "data-pii"
# environment variable is missing or empty, fall back to True.
# NOTE(review): True presumably tells `datasets` to use locally stored
# Hugging Face credentials -- confirm against the installed version.
auth_token = os.environ.get("data-pii")
if not auth_token:
    auth_token = True

@st.cache()
def load_data(language="python"):
    """Fetch the train split of the PII-check dataset for one language.

    The data is read from the ``data/<language>`` directory of the
    ``bigcode/pii_checks_python_java_js`` dataset, authenticating with the
    module-level ``auth_token``.  The ``has_secrets``, ``number_secrets`` and
    ``path`` columns are dropped before the dataset is returned.

    Args:
        language: Name of the dataset sub-directory to load (default "python").

    Returns:
        The loaded dataset without the three dropped columns.
    """
    dropped_columns = ['has_secrets', 'number_secrets', 'path']
    full_split = load_dataset(
        "bigcode/pii_checks_python_java_js",
        data_dir=f"data/{language}",
        split="train",
        use_auth_token=auth_token,
    )
    return full_split.remove_columns(dropped_columns)


def get_samples_tag(dataset, tag):
    """Return the positions of the samples whose references mention ``tag``.

    A temporary ``"index"`` column holding each row's position is attached to
    the dataset so the positions survive filtering.  A row is kept when its
    ``references`` field contains ``"PI:"`` followed by the upper-cased tag.

    Args:
        dataset: Object supporting ``len()``, ``add_column`` and ``filter``,
            whose rows expose a ``references`` field.
        tag: Tag name to look for; it is upper-cased before matching.

    Returns:
        The ``"index"`` column of the filtered dataset.
    """
    marker = "PI:" + tag.upper()

    def mentions_tag(row):
        # Plain substring test against the row's references text.
        return marker in row['references']

    indexed = dataset.add_column("index", range(len(dataset)))
    return indexed.filter(mentions_tag)["index"]


# Controls live in the narrow left column; the wide right column stays unused.
controls_col, _spacer_col = st.columns([2, 4])
with controls_col:
    lang = st.selectbox(
        "Select a programming language",
        ["Python", "Java", "JavaScript"],
    )

# load_data is called with the lowercased language name.
samples = load_data(language=lang.lower())
max_docs = len(samples)

with controls_col:
    index_example = st.number_input(
        f"Index of the chosen example from the existing {max_docs}",
        min_value=0,
        max_value=max_docs - 1,
        value=0,
        step=1,
    )

# Indices of the samples whose references mention a key or an IP address;
# they are listed at the bottom of the page.
key_indices = get_samples_tag(samples, "KEY")
ip_indices = get_samples_tag(samples, "IP_ADDRESS")

st.write("Here we highlight the difference in code before and after the PII redaction on the Python, Java and Javascript subsets of the-stack-smol. We only show files that were modified.")

# Each occurrence of "PI:" in the references text is counted as one redacted
# secret for the subheader.
example = samples[index_example]
redaction_count = example["references"].count("PI:")
noun = "secrets" if redaction_count != 1 else "secret"
st.subheader(f"{lang} example {index_example} has {redaction_count} redacted {noun}:")

# Diff of the file content against its "new_content" counterpart.
diff_viewer.diff_viewer(
    old_text=example["content"],
    new_text=example["new_content"],
    lang="none",
)

st.markdown("Email redaction examples are very common unlike **IP addresses** and **keys**. To visualize them you can check these samples:")
st.text(f"IP addresses:\n{ip_indices}\nKeys:\n{key_indices}")