Spaces:
Build error
Build error
Update app.py (#2)
Browse files- Update app.py (ed8d0baa6e642462e7e1933082b80838e20a2edb)
- Update analyze.py (3d1dbd3301a99d1e065bc479c52b169333a74702)
- analyze.py +5 -6
- app.py +44 -13
analyze.py
CHANGED
|
@@ -46,11 +46,10 @@ def batched(
|
|
| 46 |
|
| 47 |
|
| 48 |
def mask(text: str) -> str:
|
| 49 |
-
return
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
# )
|
| 54 |
|
| 55 |
|
| 56 |
def get_strings(row_content: Any) -> str:
|
|
@@ -101,7 +100,7 @@ def analyze(
|
|
| 101 |
]
|
| 102 |
return [
|
| 103 |
PresidioEntity(
|
| 104 |
-
text=
|
| 105 |
type=recognizer_result.entity_type,
|
| 106 |
row_idx=row_idx,
|
| 107 |
column_name=column_name,
|
|
|
|
| 46 |
|
| 47 |
|
| 48 |
def mask(text: str) -> str:
    """Partially mask ``text`` so a detected entity can be shown without exposing it.

    The text is split on single spaces and each word is masked independently:
    at most the first two characters are kept (and never the whole word), and
    every remaining letter or digit is replaced with ``*``. Punctuation,
    underscores and the original spacing are preserved.

    Args:
        text: The raw entity text to mask.

    Returns:
        The masked text, with the same length and word layout as ``text``.
    """
    masked_words = []
    for word in text.split(" "):
        # Keep up to two leading characters, but always hide at least one
        # character of a non-empty word (so a 1-char word is fully masked).
        keep = max(min(2, len(word) - 1), 0)
        # ``[^\W_]`` matches any Unicode letter or digit, so accented and
        # non-Latin characters are masked too (an ASCII-only class would leak them).
        masked_words.append(word[:keep] + re.sub(r"[^\W_]", "*", word[keep:]))
    return " ".join(masked_words)
|
|
|
|
| 53 |
|
| 54 |
|
| 55 |
def get_strings(row_content: Any) -> str:
|
|
|
|
| 100 |
]
|
| 101 |
return [
|
| 102 |
PresidioEntity(
|
| 103 |
+
text=texts[i * len(scanned_columns) + j][recognizer_result.start : recognizer_result.end],
|
| 104 |
type=recognizer_result.entity_type,
|
| 105 |
row_idx=row_idx,
|
| 106 |
column_name=column_name,
|
app.py
CHANGED
|
@@ -1,4 +1,6 @@
|
|
| 1 |
-
from
|
|
|
|
|
|
|
| 2 |
from typing import Any, Iterable, TypeVar
|
| 3 |
|
| 4 |
import gradio as gr
|
|
@@ -7,7 +9,7 @@ import pandas as pd
|
|
| 7 |
from datasets import Features
|
| 8 |
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
| 9 |
|
| 10 |
-
from analyze import analyzer, get_column_description, get_columns_with_strings, presidio_scan_entities
|
| 11 |
|
| 12 |
MAX_ROWS = 100
|
| 13 |
T = TypeVar("T")
|
|
@@ -24,6 +26,22 @@ DEFAULT_PRESIDIO_ENTITIES = sorted([
|
|
| 24 |
'IBAN_CODE',
|
| 25 |
'EMAIL',
|
| 26 |
])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any]]:
|
| 29 |
batch_size = 100
|
|
@@ -47,7 +65,16 @@ class track_iter:
|
|
| 47 |
self.next_idx += 1
|
| 48 |
yield item
|
| 49 |
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
info_resp = requests.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
|
| 52 |
if "error" in info_resp:
|
| 53 |
yield "❌ " + info_resp["error"], pd.DataFrame()
|
|
@@ -65,10 +92,12 @@ def analyze_dataset(dataset: str, enabled_presidio_entities: str) -> pd.DataFram
|
|
| 65 |
for presidio_entity in presidio_scan_entities(
|
| 66 |
rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
|
| 67 |
):
|
|
|
|
|
|
|
| 68 |
if presidio_entity["type"] in enabled_presidio_entities:
|
| 69 |
presidio_entities.append(presidio_entity)
|
| 70 |
-
yield
|
| 71 |
-
yield
|
| 72 |
|
| 73 |
with gr.Blocks() as demo:
|
| 74 |
gr.Markdown("# Scan datasets using Presidio")
|
|
@@ -85,26 +114,28 @@ with gr.Blocks() as demo:
|
|
| 85 |
value=DEFAULT_PRESIDIO_ENTITIES,
|
| 86 |
interactive=True,
|
| 87 |
),
|
|
|
|
| 88 |
]
|
| 89 |
button = gr.Button("Run Presidio Scan")
|
| 90 |
outputs = [
|
| 91 |
-
gr.
|
| 92 |
gr.DataFrame(),
|
| 93 |
]
|
| 94 |
button.click(analyze_dataset, inputs, outputs)
|
| 95 |
gr.Examples(
|
| 96 |
[
|
| 97 |
-
["microsoft/orca-math-word-problems-200k"
|
| 98 |
-
["tatsu-lab/alpaca"
|
| 99 |
-
["Anthropic/hh-rlhf"
|
| 100 |
-
["OpenAssistant/oasst1"
|
| 101 |
-
["sidhq/email-thread-summary"
|
| 102 |
-
["lhoestq/fake_name_and_ssn"
|
| 103 |
],
|
| 104 |
inputs,
|
| 105 |
outputs,
|
| 106 |
fn=analyze_dataset,
|
| 107 |
-
run_on_click=True
|
|
|
|
| 108 |
)
|
| 109 |
|
| 110 |
demo.launch()
|
|
|
|
| 1 |
+
from collections import Counter
|
| 2 |
+
from itertools import count, groupby, islice
|
| 3 |
+
from operator import itemgetter
|
| 4 |
from typing import Any, Iterable, TypeVar
|
| 5 |
|
| 6 |
import gradio as gr
|
|
|
|
| 9 |
from datasets import Features
|
| 10 |
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
| 11 |
|
| 12 |
+
from analyze import PresidioEntity, analyzer, get_column_description, get_columns_with_strings, mask, presidio_scan_entities
|
| 13 |
|
| 14 |
MAX_ROWS = 100
|
| 15 |
T = TypeVar("T")
|
|
|
|
| 26 |
'IBAN_CODE',
|
| 27 |
'EMAIL',
|
| 28 |
])
|
| 29 |
+
WARNING_PRESIDIO_ENTITIES = sorted([
|
| 30 |
+
'PHONE_NUMBER',
|
| 31 |
+
'US_PASSPORT',
|
| 32 |
+
'EMAIL_ADDRESS',
|
| 33 |
+
'IP_ADDRESS',
|
| 34 |
+
'US_BANK_NUMBER',
|
| 35 |
+
'IBAN_CODE',
|
| 36 |
+
'EMAIL',
|
| 37 |
+
])
|
| 38 |
+
ALERT_PRESIDIO_ENTITIES = sorted([
|
| 39 |
+
'CREDIT_CARD',
|
| 40 |
+
'US_SSN',
|
| 41 |
+
'US_PASSPORT',
|
| 42 |
+
'US_BANK_NUMBER',
|
| 43 |
+
'IBAN_CODE',
|
| 44 |
+
])
|
| 45 |
|
| 46 |
def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any]]:
|
| 47 |
batch_size = 100
|
|
|
|
| 65 |
self.next_idx += 1
|
| 66 |
yield item
|
| 67 |
|
| 68 |
+
|
| 69 |
+
def presidio_report(presidio_entities: "list[PresidioEntity]", next_row_idx: int, num_rows: int) -> dict[str, float]:
    """Summarise scan progress and the share of rows containing each entity type.

    Args:
        presidio_entities: Entities found so far; each item is read through its
            ``"row_idx"`` and ``"type"`` keys.
        next_row_idx: Number of rows consumed so far.
        num_rows: Total number of rows being scanned.

    Returns:
        A mapping from label to a fraction of ``num_rows``, ordered by
        decreasing value. The first kind of label is the progress title
        (``next_row_idx / num_rows``); the others are
        ``"% of rows with <TYPE>"`` entries. The mapping is empty when no row
        has been consumed and no entity was found.
    """
    if num_rows == next_row_idx:
        title = f"Scan finished: {len(presidio_entities)} entities found"
    else:
        title = "Scan in progress..."

    counter = Counter()
    if next_row_idx > 0:
        # The title's count doubles as the progress numerator.
        counter[title] = next_row_idx

    # Count each (row, entity type) pair once. Tracking seen pairs explicitly
    # works even when the entities of a row are not adjacent in the list
    # (itertools.groupby would split such a row and count it several times).
    seen_pairs = set()
    for presidio_entity in presidio_entities:
        pair = (presidio_entity["row_idx"], presidio_entity["type"])
        if pair not in seen_pairs:
            seen_pairs.add(pair)
            counter["% of rows with " + presidio_entity["type"]] += 1

    if num_rows <= 0:
        # No meaningful denominator: report the labels without dividing by zero.
        return {label: 0.0 for label, _ in counter.most_common()}
    return {label: row_count / num_rows for label, row_count in counter.most_common()}
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def analyze_dataset(dataset: str, enabled_presidio_entities: list[str] = DEFAULT_PRESIDIO_ENTITIES, show_texts_without_masks: bool = False) -> pd.DataFrame:
|
| 78 |
info_resp = requests.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
|
| 79 |
if "error" in info_resp:
|
| 80 |
yield "❌ " + info_resp["error"], pd.DataFrame()
|
|
|
|
| 92 |
for presidio_entity in presidio_scan_entities(
|
| 93 |
rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
|
| 94 |
):
|
| 95 |
+
if not show_texts_without_masks:
|
| 96 |
+
presidio_entity["text"] = mask(presidio_entity["text"])
|
| 97 |
if presidio_entity["type"] in enabled_presidio_entities:
|
| 98 |
presidio_entities.append(presidio_entity)
|
| 99 |
+
yield presidio_report(presidio_entities, next_row_idx=rows.next_idx, num_rows=num_rows), pd.DataFrame(presidio_entities)
|
| 100 |
+
yield presidio_report(presidio_entities, next_row_idx=rows.next_idx, num_rows=num_rows), pd.DataFrame(presidio_entities)
|
| 101 |
|
| 102 |
with gr.Blocks() as demo:
|
| 103 |
gr.Markdown("# Scan datasets using Presidio")
|
|
|
|
| 114 |
value=DEFAULT_PRESIDIO_ENTITIES,
|
| 115 |
interactive=True,
|
| 116 |
),
|
| 117 |
+
gr.Checkbox(label="Show texts without masks", value=False),
|
| 118 |
]
|
| 119 |
button = gr.Button("Run Presidio Scan")
|
| 120 |
outputs = [
|
| 121 |
+
gr.Label(show_label=False),
|
| 122 |
gr.DataFrame(),
|
| 123 |
]
|
| 124 |
button.click(analyze_dataset, inputs, outputs)
|
| 125 |
gr.Examples(
|
| 126 |
[
|
| 127 |
+
["microsoft/orca-math-word-problems-200k"],
|
| 128 |
+
["tatsu-lab/alpaca"],
|
| 129 |
+
["Anthropic/hh-rlhf"],
|
| 130 |
+
["OpenAssistant/oasst1"],
|
| 131 |
+
["sidhq/email-thread-summary"],
|
| 132 |
+
["lhoestq/fake_name_and_ssn"]
|
| 133 |
],
|
| 134 |
inputs,
|
| 135 |
outputs,
|
| 136 |
fn=analyze_dataset,
|
| 137 |
+
run_on_click=True,
|
| 138 |
+
cache_examples=False,
|
| 139 |
)
|
| 140 |
|
| 141 |
demo.launch()
|