GenAIDevTOProd committed on

Commit 492deb9 · verified · 1 Parent(s): f159967

Upload folder using huggingface_hub
__init__.py ADDED
@@ -0,0 +1,9 @@
+ # AnonySpark: Lightweight PySpark data anonymization
+ from .masking import (
+     mask_email, mask_name, mask_date,
+     mask_ssn, mask_itin, mask_phone,
+     mask_email_udf, mask_name_udf, mask_date_udf,
+     mask_ssn_udf, mask_itin_udf, mask_phone_udf
+ )
+
+ from .utils import apply_masking
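With these re-exports, downstream code can import both the plain functions and their Spark UDF wrappers straight from the package root. A minimal sketch (assumes the anonyspark package is importable; the sample values are illustrative):

from anonyspark import mask_email, mask_ssn

# The plain functions run without a SparkSession, handy for quick checks
print(mask_email("jane.doe@example.com"))  # ***@example.com
print(mask_ssn("123-45-6789"))             # ***-**-6789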
__pycache__/__init__.cpython-311.pyc ADDED
Binary file (624 Bytes).

__pycache__/masking.cpython-311.pyc ADDED
Binary file (2.49 kB).

__pycache__/utils.cpython-311.pyc ADDED
Binary file (1.15 kB).
cli.py ADDED
@@ -0,0 +1,57 @@
+
+ import argparse
+ import json
+ import os
+ from pyspark.sql import SparkSession
+ from anonyspark.masking import (
+     mask_email_udf, mask_name_udf, mask_date_udf,
+     mask_ssn_udf, mask_itin_udf, mask_phone_udf
+ )
+
+ def apply_masking(df, schema):
+     """
+     Apply masking UDFs based on schema definitions.
+     """
+     for column, dtype in schema.items():
+         if dtype == "email":
+             df = df.withColumn(f"masked_{column}", mask_email_udf(df[column]))
+         elif dtype == "name":
+             df = df.withColumn(f"masked_{column}", mask_name_udf(df[column]))
+         elif dtype == "dob":
+             df = df.withColumn(f"masked_{column}", mask_date_udf(df[column]))
+         elif dtype == "ssn":
+             df = df.withColumn(f"masked_{column}", mask_ssn_udf(df[column]))
+         elif dtype == "itin":
+             df = df.withColumn(f"masked_{column}", mask_itin_udf(df[column]))
+         elif dtype == "phone":
+             df = df.withColumn(f"masked_{column}", mask_phone_udf(df[column]))
+     return df
+
+ def main():
+     parser = argparse.ArgumentParser(description="AnonySpark CLI for masking sensitive data.")
+     parser.add_argument('--input', type=str, required=True, help='Path to input CSV file')
+     parser.add_argument('--output', type=str, required=True, help='Directory to save masked output')
+     parser.add_argument('--schema', type=str, required=True, help='Path to masking schema JSON file')
+     args = parser.parse_args()
+
+     # Create output directory if it doesn't exist
+     os.makedirs(args.output, exist_ok=True)
+
+     # Start Spark
+     spark = SparkSession.builder.master("local[*]").appName("AnonysparkCLI").getOrCreate()
+
+     # Load data and schema
+     df = spark.read.csv(args.input, header=True)
+     with open(args.schema, 'r') as f:
+         schema = json.load(f)
+
+     # Apply masking
+     masked_df = apply_masking(df, schema)
+
+     # Save to output directory
+     masked_df.write.mode("overwrite").csv(args.output, header=True)
+
+     print(f"Masked file written to: {args.output}")
+
+ if __name__ == "__main__":
+     main()
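For context, the CLI reads a JSON schema mapping input column names to mask types ("email", "name", "dob", "ssn", "itin", "phone"). A hypothetical run (file and column names here are illustrative, not part of this commit):

# schema.json
{
  "email": "email",
  "full_name": "name",
  "ssn": "ssn"
}

# Shell (requires pyspark installed and a local Java runtime)
python cli.py --input customers.csv --output masked_out/ --schema schema.json

Note that Spark writes CSV output as a directory of part files, which is why --output is a directory rather than a single file path.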
masking.py ADDED
@@ -0,0 +1,52 @@
+ __all__ = [
+     "mask_email_udf", "mask_name_udf", "mask_date_udf",
+     "mask_ssn_udf", "mask_itin_udf", "mask_phone_udf"
+ ]
+
+ from pyspark.sql.functions import udf
+ from pyspark.sql.types import StringType
+ import re
+ from datetime import datetime
+
+ # Masking functions
+ def mask_email(value):
+     if value and "@" in value:
+         user, domain = value.split("@", 1)
+         return "***@" + domain
+     return None
+
+ def mask_name(value):
+     if value:
+         return value[0] + "***"
+     return None
+
+ def mask_date(value):
+     try:
+         dt = datetime.strptime(value, "%Y-%m-%d")
+         return dt.strftime("***-**-%d")
+     except (TypeError, ValueError):
+         return None
+
+ def mask_ssn(value):
+     if value and re.match(r"\d{3}-\d{2}-\d{4}", value):
+         return "***-**-" + value[-4:]
+     return None
+
+ def mask_itin(value):
+     if value and re.match(r"9\d{2}-7\d-\d{4}", value):
+         return "***-**-" + value[-4:]
+     return None
+
+ def mask_phone(value):
+     if value and re.match(r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}", value):
+         return "***-***-" + value[-4:]
+     return None
+
+ # UDFs for Spark
+ mask_email_udf = udf(mask_email, StringType())
+ mask_name_udf = udf(mask_name, StringType())
+ mask_date_udf = udf(mask_date, StringType())
+ mask_ssn_udf = udf(mask_ssn, StringType())
+ mask_itin_udf = udf(mask_itin, StringType())
+ mask_phone_udf = udf(mask_phone, StringType())
+
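Since each masker is a plain Python function wrapped with udf(...), the masking logic can be exercised without starting Spark. A minimal sketch of expected outputs under the formats the regexes accept:

from anonyspark.masking import mask_email, mask_name, mask_date, mask_phone

assert mask_email("john@example.com") == "***@example.com"
assert mask_name("Alice") == "A***"
assert mask_date("1990-06-15") == "***-**-15"    # only %Y-%m-%d parses
assert mask_phone("(415) 555-2671") == "***-***-2671"
assert mask_email("not-an-email") is None        # unmatched inputs return None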
utils.py ADDED
@@ -0,0 +1,25 @@
+ from pyspark.sql.functions import col
+
+ def apply_masking(df, schema):
+     """
+     Apply masking UDFs to specified columns based on schema.
+     Schema = { "original_col": "mask_type" }
+     """
+     from .masking import (
+         mask_email_udf, mask_name_udf, mask_date_udf,
+         mask_ssn_udf, mask_itin_udf, mask_phone_udf
+     )
+
+     masking_map = {
+         "email": mask_email_udf,
+         "name": mask_name_udf,
+         "dob": mask_date_udf,
+         "ssn": mask_ssn_udf,
+         "itin": mask_itin_udf,
+         "phone": mask_phone_udf,
+     }
+
+     for col_name, mask_type in schema.items():
+         if mask_type in masking_map:
+             df = df.withColumn(f"masked_{col_name}", masking_map[mask_type](col(col_name)))
+     return df
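An end-to-end sketch of utils.apply_masking on an in-memory DataFrame (session config and sample values are illustrative):

from pyspark.sql import SparkSession
from anonyspark.utils import apply_masking

spark = SparkSession.builder.master("local[*]").appName("demo").getOrCreate()
df = spark.createDataFrame(
    [("jane@example.com", "123-45-6789")],
    ["email", "ssn"],
)

# Adds masked_email and masked_ssn next to the original columns;
# mask types not in masking_map are silently skipped
masked = apply_masking(df, {"email": "email", "ssn": "ssn"})
masked.select("masked_email", "masked_ssn").show(truncate=False)
# |***@example.com|***-**-6789|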