GenAIDevTOProd committed on
Commit
e0c264d
·
verified ·
1 Parent(s): 36b06cf

Upload folder using huggingface_hub

Browse files
__init__.py CHANGED
@@ -1,9 +1 @@
1
- # AnonySpark: Lightweight PySpark data anonymization
2
- from .masking import (
3
- mask_email, mask_name, mask_date,
4
- mask_ssn, mask_itin, mask_phone,
5
- mask_email_udf, mask_name_udf, mask_date_udf,
6
- mask_ssn_udf, mask_itin_udf, mask_phone_udf
7
- )
8
-
9
- from .utils import apply_masking
 
1
+ # Init for tests
 
 
 
 
 
 
 
 
__pycache__/__init__.cpython-311.pyc ADDED
Binary file (135 Bytes). View file
 
__pycache__/test_masking.cpython-311-pytest-8.4.1.pyc ADDED
Binary file (13.8 kB). View file
 
__pycache__/test_schema_masking.cpython-311-pytest-8.4.1.pyc ADDED
Binary file (4.46 kB). View file
 
test_masking.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from anonyspark.masking import (
2
+ mask_email, mask_name, mask_date,
3
+ mask_ssn, mask_itin, mask_phone
4
+ )
5
+
6
def test_mask_email():
    """Check mask_email on a well-formed address and on empty/None input."""
    # NOTE(review): the original literal was lost to email obfuscation and
    # read "[email protected]". The domain is fixed by the expected value; the
    # local part used here is a stand-in -- confirm against the upstream test.
    assert mask_email("john@example.com") == "***@example.com"

    # Both the empty string and None are expected to produce None.
    assert mask_email("") is None
    assert mask_email(None) is None
10
+
11
def test_mask_name():
    """Check mask_name on a sample name and on empty/None input."""
    masked = mask_name("John")
    assert masked == "J***"

    # Empty string first, then None -- both are expected to yield None.
    for blank in ("", None):
        assert mask_name(blank) is None
15
+
16
def test_mask_date():
    """Check mask_date on an ISO-style date string and on unusable input."""
    masked = mask_date("1991-08-14")
    assert masked == "***-**-14"

    # A non-date string and None are both expected to yield None.
    for unusable in ("invalid", None):
        assert mask_date(unusable) is None
20
+
21
def test_mask_ssn():
    """Check mask_ssn on a dashed nine-digit value and on a malformed one."""
    masked = mask_ssn("123-45-6789")
    assert masked == "***-**-6789"

    rejected = mask_ssn("invalid")
    assert rejected is None
24
+
25
def test_mask_itin():
    """Check mask_itin on one accepted value and one rejected value."""
    masked = mask_itin("912-73-1234")
    assert masked == "***-**-1234"

    # This SSN-looking value is expected to be rejected by mask_itin.
    rejected = mask_itin("123-45-6789")
    assert rejected is None
28
+
29
def test_mask_phone():
    """Check mask_phone on two phone formats and on a malformed value."""
    expected = "***-***-7890"

    # The dashed and the parenthesised spellings must mask to the same text.
    for raw in ("123-456-7890", "(123) 456-7890"):
        assert mask_phone(raw) == expected

    assert mask_phone("invalid") is None
test_schema_masking.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # tests/test_schema_masking.py
2
+
3
+ import sys
4
+ import os
5
+
6
+ sys.path.append("/content/anonyspark")
7
+
8
+ from pyspark.sql import SparkSession
9
+ from anonyspark.utils import apply_masking
10
+
11
def test_schema_masking():
    """Run apply_masking over a one-row DataFrame and check the masked columns.

    A local single-threaded Spark session is created for the test and is
    stopped again before the assertions run, whether or not masking succeeds.
    The results are read from columns named ``masked_<column>``.
    """
    spark = SparkSession.builder.master("local[1]").appName("Test").getOrCreate()
    try:
        df = spark.createDataFrame([{
            # NOTE(review): the original literal was lost to email obfuscation
            # and read "[email protected]". The domain is fixed by the expected
            # value below; the local part is a stand-in -- confirm upstream.
            "email": "john@example.com",
            "name": "John",
            "dob": "1991-08-14",
            "ssn": "123-45-6789",
            "itin": "912-73-1234",
            "phone": "123-456-7890",
        }])

        # Keys and values coincide here; presumably column name -> masking
        # type -- verify against apply_masking.
        schema = {
            "email": "email",
            "name": "name",
            "dob": "dob",
            "ssn": "ssn",
            "itin": "itin",
            "phone": "phone",
        }

        masked_df = apply_masking(df, schema)
        # Materialise the single row as a plain dict while the session is
        # still alive; the assertions below need no Spark access.
        result = masked_df.collect()[0].asDict()
    finally:
        # Always release the session so a failure above does not leave it
        # running for the rest of the test process.
        spark.stop()

    assert result["masked_email"] == "***@example.com"
    assert result["masked_name"] == "J***"
    assert result["masked_dob"] == "***-**-14"
    assert result["masked_ssn"] == "***-**-6789"
    assert result["masked_itin"] == "***-**-1234"
    assert result["masked_phone"] == "***-***-7890"