Spaces:
Build error
Build error
Commit
·
d0b1031
1
Parent(s):
1dfccc3
update anonymize file in clear with roberta +update uuid map with query id
Browse files
- anonymize_file_clear.py +13 -7
- app.py +1 -1
- fhe_anonymizer.py +5 -5
anonymize_file_clear.py
CHANGED
|
@@ -5,15 +5,21 @@ import uuid
|
|
| 5 |
from pathlib import Path
|
| 6 |
import gensim
|
| 7 |
from concrete.ml.common.serialization.loaders import load
|
|
|
|
|
|
|
| 8 |
|
| 9 |
def load_models():
|
| 10 |
base_dir = Path(__file__).parent / "models"
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
fhe_ner_detection = load(file=model_file)
|
| 14 |
-
return embeddings_model, fhe_ner_detection
|
| 15 |
|
| 16 |
-
def anonymize_text(text, embeddings_model, fhe_ner_detection):
|
| 17 |
token_pattern = r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)"
|
| 18 |
tokens = re.findall(token_pattern, text)
|
| 19 |
uuid_map = {}
|
|
@@ -21,7 +27,7 @@ def anonymize_text(text, embeddings_model, fhe_ner_detection):
|
|
| 21 |
|
| 22 |
for token in tokens:
|
| 23 |
if token.strip() and re.match(r"\w+", token): # If the token is a word
|
| 24 |
-
x =
|
| 25 |
prediction_proba = fhe_ner_detection.predict_proba(x)
|
| 26 |
probability = prediction_proba[0][1]
|
| 27 |
prediction = probability >= 0.5
|
|
@@ -42,7 +48,7 @@ def main():
|
|
| 42 |
parser.add_argument("file_path", type=str, help="The path to the file to be processed.")
|
| 43 |
args = parser.parse_args()
|
| 44 |
|
| 45 |
-
embeddings_model, fhe_ner_detection = load_models()
|
| 46 |
|
| 47 |
# Read the input file
|
| 48 |
with open(args.file_path, 'r', encoding='utf-8') as file:
|
|
@@ -54,7 +60,7 @@ def main():
|
|
| 54 |
original_file.write(text)
|
| 55 |
|
| 56 |
# Anonymize the text
|
| 57 |
-
anonymized_text, uuid_map = anonymize_text(text, embeddings_model, fhe_ner_detection)
|
| 58 |
|
| 59 |
# Save the anonymized text to its specified file
|
| 60 |
anonymized_file_path = Path(__file__).parent / "files" / "anonymized_document.txt"
|
|
|
|
| 5 |
from pathlib import Path
|
| 6 |
import gensim
|
| 7 |
from concrete.ml.common.serialization.loaders import load
|
| 8 |
+
from transformers import AutoTokenizer, AutoModel
|
| 9 |
+
from utils_demo import get_batch_text_representation
|
| 10 |
|
| 11 |
def load_models():
|
| 12 |
base_dir = Path(__file__).parent / "models"
|
| 13 |
+
|
| 14 |
+
# Load tokenizer and model
|
| 15 |
+
tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
|
| 16 |
+
embeddings_model = AutoModel.from_pretrained("obi/deid_roberta_i2b2")
|
| 17 |
+
|
| 18 |
+
with open(base_dir / "cml_logreg.model", "r") as model_file:
|
| 19 |
fhe_ner_detection = load(file=model_file)
|
| 20 |
+
return embeddings_model, tokenizer, fhe_ner_detection
|
| 21 |
|
| 22 |
+
def anonymize_text(text, embeddings_model, tokenizer, fhe_ner_detection):
|
| 23 |
token_pattern = r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)"
|
| 24 |
tokens = re.findall(token_pattern, text)
|
| 25 |
uuid_map = {}
|
|
|
|
| 27 |
|
| 28 |
for token in tokens:
|
| 29 |
if token.strip() and re.match(r"\w+", token): # If the token is a word
|
| 30 |
+
x = get_batch_text_representation([token], embeddings_model, tokenizer)
|
| 31 |
prediction_proba = fhe_ner_detection.predict_proba(x)
|
| 32 |
probability = prediction_proba[0][1]
|
| 33 |
prediction = probability >= 0.5
|
|
|
|
| 48 |
parser.add_argument("file_path", type=str, help="The path to the file to be processed.")
|
| 49 |
args = parser.parse_args()
|
| 50 |
|
| 51 |
+
embeddings_model, tokenizer, fhe_ner_detection = load_models()
|
| 52 |
|
| 53 |
# Read the input file
|
| 54 |
with open(args.file_path, 'r', encoding='utf-8') as file:
|
|
|
|
| 60 |
original_file.write(text)
|
| 61 |
|
| 62 |
# Anonymize the text
|
| 63 |
+
anonymized_text, uuid_map = anonymize_text(text, embeddings_model, tokenizer, fhe_ner_detection)
|
| 64 |
|
| 65 |
# Save the anonymized text to its specified file
|
| 66 |
anonymized_file_path = Path(__file__).parent / "files" / "anonymized_document.txt"
|
app.py
CHANGED
|
@@ -142,7 +142,7 @@ with demo:
|
|
| 142 |
|
| 143 |
examples_radio.change(lambda example_query: example_query, inputs=[examples_radio], outputs=[input_text])
|
| 144 |
|
| 145 |
-
anonymized_text_output = gr.Textbox(label="Anonymized Text with FHE", lines=1)
|
| 146 |
|
| 147 |
identified_words_output = gr.Dataframe(label="Identified Words", visible=False)
|
| 148 |
|
|
|
|
| 142 |
|
| 143 |
examples_radio.change(lambda example_query: example_query, inputs=[examples_radio], outputs=[input_text])
|
| 144 |
|
| 145 |
+
anonymized_text_output = gr.Textbox(label="Anonymized Text with FHE", lines=1, interactive=True)
|
| 146 |
|
| 147 |
identified_words_output = gr.Dataframe(label="Identified Words", visible=False)
|
| 148 |
|
fhe_anonymizer.py
CHANGED
|
@@ -14,13 +14,11 @@ base_dir = Path(__file__).parent
|
|
| 14 |
class FHEAnonymizer:
|
| 15 |
def __init__(self, punctuation_list=".,!?:;"):
|
| 16 |
|
| 17 |
-
# Load tokenizer and model
|
| 18 |
self.tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
|
| 19 |
self.embeddings_model = AutoModel.from_pretrained("obi/deid_roberta_i2b2")
|
| 20 |
|
| 21 |
self.punctuation_list = punctuation_list
|
| 22 |
-
with open(base_dir / "models/without_pronoun_cml_xgboost.model", "r") as model_file:
|
| 23 |
-
self.fhe_ner_detection = load(file=model_file)
|
| 24 |
|
| 25 |
with open(base_dir / "original_document_uuid_mapping.json", 'r') as file:
|
| 26 |
self.uuid_map = json.load(file)
|
|
@@ -44,7 +42,6 @@ class FHEAnonymizer:
|
|
| 44 |
identified_words_with_prob = []
|
| 45 |
processed_tokens = []
|
| 46 |
|
| 47 |
-
print(tokens)
|
| 48 |
for token in tokens:
|
| 49 |
# Directly append non-word tokens or whitespace to processed_tokens
|
| 50 |
if not token.strip() or not re.match(r"\w+", token):
|
|
@@ -54,7 +51,6 @@ class FHEAnonymizer:
|
|
| 54 |
# Prediction for each word
|
| 55 |
x = get_batch_text_representation([token], self.embeddings_model, self.tokenizer)
|
| 56 |
|
| 57 |
-
# prediction_proba = self.fhe_ner_detection.predict_proba(x)
|
| 58 |
prediction_proba = self.fhe_inference(x)
|
| 59 |
probability = prediction_proba[0][1]
|
| 60 |
|
|
@@ -68,6 +64,10 @@ class FHEAnonymizer:
|
|
| 68 |
else:
|
| 69 |
processed_tokens.append(token)
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
# Reconstruct the sentence
|
| 72 |
reconstructed_sentence = ''.join(processed_tokens)
|
| 73 |
return reconstructed_sentence, identified_words_with_prob
|
|
|
|
| 14 |
class FHEAnonymizer:
|
| 15 |
def __init__(self, punctuation_list=".,!?:;"):
|
| 16 |
|
| 17 |
+
# Load tokenizer and model
|
| 18 |
self.tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
|
| 19 |
self.embeddings_model = AutoModel.from_pretrained("obi/deid_roberta_i2b2")
|
| 20 |
|
| 21 |
self.punctuation_list = punctuation_list
|
|
|
|
|
|
|
| 22 |
|
| 23 |
with open(base_dir / "original_document_uuid_mapping.json", 'r') as file:
|
| 24 |
self.uuid_map = json.load(file)
|
|
|
|
| 42 |
identified_words_with_prob = []
|
| 43 |
processed_tokens = []
|
| 44 |
|
|
|
|
| 45 |
for token in tokens:
|
| 46 |
# Directly append non-word tokens or whitespace to processed_tokens
|
| 47 |
if not token.strip() or not re.match(r"\w+", token):
|
|
|
|
| 51 |
# Prediction for each word
|
| 52 |
x = get_batch_text_representation([token], self.embeddings_model, self.tokenizer)
|
| 53 |
|
|
|
|
| 54 |
prediction_proba = self.fhe_inference(x)
|
| 55 |
probability = prediction_proba[0][1]
|
| 56 |
|
|
|
|
| 64 |
else:
|
| 65 |
processed_tokens.append(token)
|
| 66 |
|
| 67 |
+
# Persist the UUID map, updated with the tokens from this query.
|
| 68 |
+
with open(base_dir / "original_document_uuid_mapping.json", 'w') as file:
|
| 69 |
+
json.dump(self.uuid_map, file)
|
| 70 |
+
|
| 71 |
# Reconstruct the sentence
|
| 72 |
reconstructed_sentence = ''.join(processed_tokens)
|
| 73 |
return reconstructed_sentence, identified_words_with_prob
|