Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- __init__.py +9 -0
- __pycache__/__init__.cpython-311.pyc +0 -0
- __pycache__/masking.cpython-311.pyc +0 -0
- __pycache__/utils.cpython-311.pyc +0 -0
- cli.py +57 -0
- masking.py +52 -0
- utils.py +25 -0
__init__.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# AnonySpark: Lightweight PySpark data anonymization
|
2 |
+
from .masking import (
|
3 |
+
mask_email, mask_name, mask_date,
|
4 |
+
mask_ssn, mask_itin, mask_phone,
|
5 |
+
mask_email_udf, mask_name_udf, mask_date_udf,
|
6 |
+
mask_ssn_udf, mask_itin_udf, mask_phone_udf
|
7 |
+
)
|
8 |
+
|
9 |
+
from .utils import apply_masking
|
__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (624 Bytes). View file
|
|
__pycache__/masking.cpython-311.pyc
ADDED
Binary file (2.49 kB). View file
|
|
__pycache__/utils.cpython-311.pyc
ADDED
Binary file (1.15 kB). View file
|
|
cli.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import argparse
|
3 |
+
import json
|
4 |
+
import os
|
5 |
+
from pyspark.sql import SparkSession
|
6 |
+
from anonyspark.masking import (
|
7 |
+
mask_email_udf, mask_name_udf, mask_date_udf,
|
8 |
+
mask_ssn_udf, mask_itin_udf, mask_phone_udf
|
9 |
+
)
|
10 |
+
|
11 |
+
def apply_masking(df, schema):
    """
    Add masked columns to ``df`` according to ``schema``.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Input DataFrame containing the columns named in ``schema``.
    schema : dict
        Mapping of column name -> mask type. Supported mask types are
        "email", "name", "dob", "ssn", "itin" and "phone"; any other
        value is silently ignored (the column is left unmasked).

    Returns
    -------
    pyspark.sql.DataFrame
        The DataFrame with one extra ``masked_<column>`` column per
        recognized schema entry; original columns are preserved.
    """
    # Dispatch table instead of an if/elif chain; keeps this function
    # structurally consistent with utils.apply_masking.
    udf_by_type = {
        "email": mask_email_udf,
        "name": mask_name_udf,
        "dob": mask_date_udf,
        "ssn": mask_ssn_udf,
        "itin": mask_itin_udf,
        "phone": mask_phone_udf,
    }
    for column, dtype in schema.items():
        mask_udf = udf_by_type.get(dtype)
        if mask_udf is not None:
            df = df.withColumn(f"masked_{column}", mask_udf(df[column]))
    return df
|
29 |
+
|
30 |
+
def main():
    """
    Command-line entry point: mask sensitive columns in a CSV file.

    Reads the ``--input`` CSV, applies the masking rules declared in the
    ``--schema`` JSON file ({"column": "mask_type"}), and writes the
    masked result as CSV into the ``--output`` directory.
    """
    parser = argparse.ArgumentParser(description="AnonySpark CLI for masking sensitive data.")
    parser.add_argument('--input', type=str, required=True, help='Path to input CSV file')
    parser.add_argument('--output', type=str, required=True, help='Directory to save masked output')
    parser.add_argument('--schema', type=str, required=True, help='Path to masking schema JSON file')
    args = parser.parse_args()

    # Create output directory if it doesn't exist
    os.makedirs(args.output, exist_ok=True)

    # Start Spark
    spark = SparkSession.builder.master("local[*]").appName("AnonysparkCLI").getOrCreate()
    try:
        # Load data and schema. header=True keeps the first CSV row as
        # column names (all columns are read as strings by default).
        df = spark.read.csv(args.input, header=True)
        with open(args.schema, 'r') as f:
            schema = json.load(f)

        # Apply masking
        masked_df = apply_masking(df, schema)

        # Save to output directory
        masked_df.write.mode("overwrite").csv(args.output, header=True)

        print(f"Masked file written to: {args.output}")
    finally:
        # The original never stopped the session, leaking the local
        # Spark context; always release it, even on failure.
        spark.stop()

if __name__ == "__main__":
    main()
|
masking.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
__all__ = [
|
2 |
+
"mask_email_udf", "mask_name_udf", "mask_date_udf",
|
3 |
+
"mask_ssn_udf", "mask_itin_udf", "mask_phone_udf"
|
4 |
+
]
|
5 |
+
|
6 |
+
from pyspark.sql.functions import udf
|
7 |
+
from pyspark.sql.types import StringType
|
8 |
+
import re
|
9 |
+
from datetime import datetime
|
10 |
+
|
11 |
+
# Masking functions
|
12 |
+
def mask_email(value):
    """
    Mask the local part of an email address, keeping the domain.

    "alice@example.com" -> "***@example.com". Returns None when the
    value is falsy or contains no "@".
    """
    if value and "@" in value:
        # rpartition instead of split("@"): the original unpack of
        # split() raised ValueError for inputs with more than one "@";
        # the domain is whatever follows the last "@".
        _, _, domain = value.rpartition("@")
        return "***@" + domain
    return None
|
17 |
+
|
18 |
+
def mask_name(value):
    """Keep only the first character of a name: "Alice" -> "A***".

    Returns None for falsy input (None or empty string).
    """
    if not value:
        return None
    return f"{value[0]}***"
|
22 |
+
|
23 |
+
def mask_date(value):
    """
    Mask the year and month of an ISO date, keeping the day.

    "1990-05-17" -> "***-**-17". Returns None for anything that is not
    a valid "%Y-%m-%d" string (including None).
    """
    try:
        dt = datetime.strptime(value, "%Y-%m-%d")
    except (ValueError, TypeError):
        # ValueError: malformed/invalid date; TypeError: non-string
        # input. The original bare except also swallowed
        # KeyboardInterrupt/SystemExit, which must propagate.
        return None
    return dt.strftime("***-**-%d")
|
29 |
+
|
30 |
+
def mask_ssn(value):
    """
    Mask a Social Security Number, keeping the last four digits.

    "123-45-6789" -> "***-**-6789". Returns None for falsy values or
    strings that are not exactly NNN-NN-NNNN.
    """
    # fullmatch instead of match: match() only anchors the start, so the
    # original accepted and mis-masked strings with trailing garbage
    # such as "123-45-67890".
    if value and re.fullmatch(r"\d{3}-\d{2}-\d{4}", value):
        return "***-**-" + value[-4:]
    return None
|
34 |
+
|
35 |
+
def mask_itin(value):
    """
    Mask an ITIN (Individual Taxpayer Identification Number), keeping
    the last four digits: "912-70-1234" -> "***-**-1234".

    Returns None for falsy values or strings not matching 9NN-7N-NNNN.

    NOTE(review): real ITIN group numbers span several ranges, not only
    70-79 — confirm whether the narrow "7\\d" group here is intentional
    before widening it.
    """
    # fullmatch anchors both ends; the original match() accepted
    # trailing garbage such as "912-70-12345".
    if value and re.fullmatch(r"9\d{2}-7\d-\d{4}", value):
        return "***-**-" + value[-4:]
    return None
|
39 |
+
|
40 |
+
def mask_phone(value):
    """
    Mask a 10-digit phone number, keeping the last four digits.

    Accepts formats such as "555-123-4567", "(555) 123-4567" and
    "555.123.4567"; returns "***-***-" plus the last four characters,
    or None when the value is falsy or does not match.
    """
    # fullmatch rejects trailing garbage that the original unanchored
    # match() accepted (e.g. "555-123-456789").
    if value and re.fullmatch(r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}", value):
        return "***-***-" + value[-4:]
    return None
|
44 |
+
|
45 |
+
# Spark UDF wrappers around the plain masking functions. All return a
# string column; a Python None result maps to SQL NULL.
_string_type = StringType()
mask_email_udf = udf(mask_email, _string_type)
mask_name_udf = udf(mask_name, _string_type)
mask_date_udf = udf(mask_date, _string_type)
mask_ssn_udf = udf(mask_ssn, _string_type)
mask_itin_udf = udf(mask_itin, _string_type)
mask_phone_udf = udf(mask_phone, _string_type)
|
52 |
+
|
utils.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pyspark.sql.functions import col
|
2 |
+
|
3 |
+
def apply_masking(df, schema):
    """
    Return *df* with a ``masked_<col>`` column added for every schema
    entry whose mask type is recognized.

    ``schema`` maps original column name -> mask type, one of "email",
    "name", "dob", "ssn", "itin", "phone". Entries with any other mask
    type are skipped; original columns are never modified.
    """
    # UDFs are imported lazily at call time, as in the original module.
    from .masking import (
        mask_email_udf, mask_name_udf, mask_date_udf,
        mask_ssn_udf, mask_itin_udf, mask_phone_udf
    )

    udf_for = {
        "email": mask_email_udf,
        "name": mask_name_udf,
        "dob": mask_date_udf,
        "ssn": mask_ssn_udf,
        "itin": mask_itin_udf,
        "phone": mask_phone_udf,
    }

    for col_name, mask_type in schema.items():
        mask_udf = udf_for.get(mask_type)
        if mask_udf is None:
            continue  # unknown mask type: leave the column untouched
        df = df.withColumn(f"masked_{col_name}", mask_udf(col(col_name)))
    return df
|