presidio-dataset-scanner

Build error

App Files Files Community

lhoestq HF Staff committed on May 2, 2024

Commit

d0ca54b

verified ·

1 Parent(s): 797cf6b

Update app.py (#2)

Browse files

- Update app.py (ed8d0baa6e642462e7e1933082b80838e20a2edb)
- Update analyze.py (3d1dbd3301a99d1e065bc479c52b169333a74702)

Files changed (2) hide show

analyze.py +5 -6
app.py +44 -13

analyze.py CHANGED Viewed

@@ -46,11 +46,10 @@ def batched(
 def mask(text: str) -> str:
-    return text  # don't apply mask for demo
-    # return " ".join(
-    #     word[: min(2, len(word) - 1)] + re.sub("[A-Za-z0-9]", "*", word[min(2, len(word) - 1) :])
-    #     for word in text.split(" ")
-    # )
 def get_strings(row_content: Any) -> str:
@@ -101,7 +100,7 @@ def analyze(
     ]
     return [
         PresidioEntity(
-            text=mask(texts[i * len(scanned_columns) + j][recognizer_result.start : recognizer_result.end]),
             type=recognizer_result.entity_type,
             row_idx=row_idx,
             column_name=column_name,

 def mask(text: str) -> str:
+    return " ".join(
+        word[: min(2, len(word) - 1)] + re.sub("[A-Za-z0-9]", "*", word[min(2, len(word) - 1) :])
+        for word in text.split(" ")
+    )
 def get_strings(row_content: Any) -> str:
     ]
     return [
         PresidioEntity(
+            text=texts[i * len(scanned_columns) + j][recognizer_result.start : recognizer_result.end],
             type=recognizer_result.entity_type,
             row_idx=row_idx,
             column_name=column_name,

app.py CHANGED Viewed

@@ -1,4 +1,6 @@
-from itertools import count, islice
 from typing import Any, Iterable, TypeVar
 import gradio as gr
@@ -7,7 +9,7 @@ import pandas as pd
 from datasets import Features
 from gradio_huggingfacehub_search import HuggingfaceHubSearch
-from analyze import analyzer, get_column_description, get_columns_with_strings, presidio_scan_entities
 MAX_ROWS = 100
 T = TypeVar("T")
@@ -24,6 +26,22 @@ DEFAULT_PRESIDIO_ENTITIES = sorted([
     'IBAN_CODE',
     'EMAIL',
 ])
 def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any]]:
     batch_size = 100
@@ -47,7 +65,16 @@ class track_iter:
             self.next_idx += 1
             yield item
-def analyze_dataset(dataset: str, enabled_presidio_entities: str) -> pd.DataFrame:
     info_resp = requests.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
     if "error" in info_resp:
         yield "❌ " + info_resp["error"], pd.DataFrame()
@@ -65,10 +92,12 @@ def analyze_dataset(dataset: str, enabled_presidio_entities: str) -> pd.DataFram
     for presidio_entity in presidio_scan_entities(
         rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
     ):
         if presidio_entity["type"] in enabled_presidio_entities:
             presidio_entities.append(presidio_entity)
-            yield f"⚙️ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
-    yield f"✅ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
 with gr.Blocks() as demo:
     gr.Markdown("# Scan datasets using Presidio")
@@ -85,26 +114,28 @@ with gr.Blocks() as demo:
             value=DEFAULT_PRESIDIO_ENTITIES,
             interactive=True,
         ),
     ]
     button = gr.Button("Run Presidio Scan")
     outputs = [
-        gr.Markdown(),
         gr.DataFrame(),
     ]
     button.click(analyze_dataset, inputs, outputs)
     gr.Examples(
         [
-            ["microsoft/orca-math-word-problems-200k", DEFAULT_PRESIDIO_ENTITIES],
-            ["tatsu-lab/alpaca", DEFAULT_PRESIDIO_ENTITIES],
-            ["Anthropic/hh-rlhf", DEFAULT_PRESIDIO_ENTITIES],
-            ["OpenAssistant/oasst1", DEFAULT_PRESIDIO_ENTITIES],
-            ["sidhq/email-thread-summary", DEFAULT_PRESIDIO_ENTITIES],
-            ["lhoestq/fake_name_and_ssn", DEFAULT_PRESIDIO_ENTITIES]
         ],
         inputs,
         outputs,
         fn=analyze_dataset,
-        run_on_click=True
     )
 demo.launch()

+from collections import Counter
+from itertools import count, groupby, islice
+from operator import itemgetter
 from typing import Any, Iterable, TypeVar
 import gradio as gr
 from datasets import Features
 from gradio_huggingfacehub_search import HuggingfaceHubSearch
+from analyze import PresidioEntity, analyzer, get_column_description, get_columns_with_strings, mask, presidio_scan_entities
 MAX_ROWS = 100
 T = TypeVar("T")
     'IBAN_CODE',
     'EMAIL',
 ])
+WARNING_PRESIDIO_ENTITIES = sorted([
+    'PHONE_NUMBER',
+    'US_PASSPORT',
+    'EMAIL_ADDRESS',
+    'IP_ADDRESS',
+    'US_BANK_NUMBER',
+    'IBAN_CODE',
+    'EMAIL',
+])
+ALERT_PRESIDIO_ENTITIES = sorted([
+    'CREDIT_CARD',
+    'US_SSN',
+    'US_PASSPORT',
+    'US_BANK_NUMBER',
+    'IBAN_CODE',
+])
 def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any]]:
     batch_size = 100
             self.next_idx += 1
             yield item
+def presidio_report(presidio_entities: list[PresidioEntity], next_row_idx: int, num_rows: int) -> dict[str, float]:
+    title = f"Scan finished: {len(presidio_entities)} entities found" if num_rows == next_row_idx else "Scan in progress..."
+    counter = Counter([title] * next_row_idx)
+    for row_idx, presidio_entities_per_row in groupby(presidio_entities, itemgetter("row_idx")):
+        counter.update(set("% of rows with " + presidio_entity["type"] for presidio_entity in presidio_entities_per_row))
+    return dict((presidio_entity_type, presidio_entity_type_row_count / num_rows) for presidio_entity_type, presidio_entity_type_row_count in counter.most_common())
+def analyze_dataset(dataset: str, enabled_presidio_entities: list[str] = DEFAULT_PRESIDIO_ENTITIES, show_texts_without_masks: bool = False) -> pd.DataFrame:
     info_resp = requests.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
     if "error" in info_resp:
         yield "❌ " + info_resp["error"], pd.DataFrame()
     for presidio_entity in presidio_scan_entities(
         rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
     ):
+        if not show_texts_without_masks:
+            presidio_entity["text"] = mask(presidio_entity["text"])
         if presidio_entity["type"] in enabled_presidio_entities:
             presidio_entities.append(presidio_entity)
+            yield presidio_report(presidio_entities, next_row_idx=rows.next_idx, num_rows=num_rows), pd.DataFrame(presidio_entities)
+    yield presidio_report(presidio_entities, next_row_idx=rows.next_idx, num_rows=num_rows), pd.DataFrame(presidio_entities)
 with gr.Blocks() as demo:
     gr.Markdown("# Scan datasets using Presidio")
             value=DEFAULT_PRESIDIO_ENTITIES,
             interactive=True,
         ),
+        gr.Checkbox(label="Show texts without masks", value=False),
     ]
     button = gr.Button("Run Presidio Scan")
     outputs = [
+        gr.Label(show_label=False),
         gr.DataFrame(),
     ]
     button.click(analyze_dataset, inputs, outputs)
     gr.Examples(
         [
+            ["microsoft/orca-math-word-problems-200k"],
+            ["tatsu-lab/alpaca"],
+            ["Anthropic/hh-rlhf"],
+            ["OpenAssistant/oasst1"],
+            ["sidhq/email-thread-summary"],
+            ["lhoestq/fake_name_and_ssn"]
         ],
         inputs,
         outputs,
         fn=analyze_dataset,
+        run_on_click=True,
+        cache_examples=False,
     )
 demo.launch()