Spaces:
Build error
Build error
Commit
·
d0b1031
1
Parent(s):
1dfccc3
update anonymize file in clear with roberta +update uuid map with query id
Browse files
- anonymize_file_clear.py +13 -7
- app.py +1 -1
- fhe_anonymizer.py +5 -5
anonymize_file_clear.py
CHANGED
|
@@ -5,15 +5,21 @@ import uuid
|
|
| 5 |
from pathlib import Path
|
| 6 |
import gensim
|
| 7 |
from concrete.ml.common.serialization.loaders import load
|
|
|
|
|
|
|
| 8 |
|
| 9 |
def load_models():
|
| 10 |
base_dir = Path(__file__).parent / "models"
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
fhe_ner_detection = load(file=model_file)
|
| 14 |
-
return embeddings_model, fhe_ner_detection
|
| 15 |
|
| 16 |
-
def anonymize_text(text, embeddings_model, fhe_ner_detection):
|
| 17 |
token_pattern = r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)"
|
| 18 |
tokens = re.findall(token_pattern, text)
|
| 19 |
uuid_map = {}
|
|
@@ -21,7 +27,7 @@ def anonymize_text(text, embeddings_model, fhe_ner_detection):
|
|
| 21 |
|
| 22 |
for token in tokens:
|
| 23 |
if token.strip() and re.match(r"\w+", token): # If the token is a word
|
| 24 |
-
x =
|
| 25 |
prediction_proba = fhe_ner_detection.predict_proba(x)
|
| 26 |
probability = prediction_proba[0][1]
|
| 27 |
prediction = probability >= 0.5
|
|
@@ -42,7 +48,7 @@ def main():
|
|
| 42 |
parser.add_argument("file_path", type=str, help="The path to the file to be processed.")
|
| 43 |
args = parser.parse_args()
|
| 44 |
|
| 45 |
-
embeddings_model, fhe_ner_detection = load_models()
|
| 46 |
|
| 47 |
# Read the input file
|
| 48 |
with open(args.file_path, 'r', encoding='utf-8') as file:
|
|
@@ -54,7 +60,7 @@ def main():
|
|
| 54 |
original_file.write(text)
|
| 55 |
|
| 56 |
# Anonymize the text
|
| 57 |
-
anonymized_text, uuid_map = anonymize_text(text, embeddings_model, fhe_ner_detection)
|
| 58 |
|
| 59 |
# Save the anonymized text to its specified file
|
| 60 |
anonymized_file_path = Path(__file__).parent / "files" / "anonymized_document.txt"
|
|
|
|
| 5 |
from pathlib import Path
|
| 6 |
import gensim
|
| 7 |
from concrete.ml.common.serialization.loaders import load
|
| 8 |
+
from transformers import AutoTokenizer, AutoModel
|
| 9 |
+
from utils_demo import get_batch_text_representation
|
| 10 |
|
| 11 |
def load_models():
|
| 12 |
base_dir = Path(__file__).parent / "models"
|
| 13 |
+
|
| 14 |
+
# Load tokenizer and model
|
| 15 |
+
tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
|
| 16 |
+
embeddings_model = AutoModel.from_pretrained("obi/deid_roberta_i2b2")
|
| 17 |
+
|
| 18 |
+
with open(base_dir / "cml_logreg.model", "r") as model_file:
|
| 19 |
fhe_ner_detection = load(file=model_file)
|
| 20 |
+
return embeddings_model, tokenizer, fhe_ner_detection
|
| 21 |
|
| 22 |
+
def anonymize_text(text, embeddings_model, tokenizer, fhe_ner_detection):
|
| 23 |
token_pattern = r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)"
|
| 24 |
tokens = re.findall(token_pattern, text)
|
| 25 |
uuid_map = {}
|
|
|
|
| 27 |
|
| 28 |
for token in tokens:
|
| 29 |
if token.strip() and re.match(r"\w+", token): # If the token is a word
|
| 30 |
+
x = get_batch_text_representation([token], embeddings_model, tokenizer)
|
| 31 |
prediction_proba = fhe_ner_detection.predict_proba(x)
|
| 32 |
probability = prediction_proba[0][1]
|
| 33 |
prediction = probability >= 0.5
|
|
|
|
| 48 |
parser.add_argument("file_path", type=str, help="The path to the file to be processed.")
|
| 49 |
args = parser.parse_args()
|
| 50 |
|
| 51 |
+
embeddings_model, tokenizer, fhe_ner_detection = load_models()
|
| 52 |
|
| 53 |
# Read the input file
|
| 54 |
with open(args.file_path, 'r', encoding='utf-8') as file:
|
|
|
|
| 60 |
original_file.write(text)
|
| 61 |
|
| 62 |
# Anonymize the text
|
| 63 |
+
anonymized_text, uuid_map = anonymize_text(text, embeddings_model, tokenizer, fhe_ner_detection)
|
| 64 |
|
| 65 |
# Save the anonymized text to its specified file
|
| 66 |
anonymized_file_path = Path(__file__).parent / "files" / "anonymized_document.txt"
|
app.py
CHANGED
|
@@ -142,7 +142,7 @@ with demo:
|
|
| 142 |
|
| 143 |
examples_radio.change(lambda example_query: example_query, inputs=[examples_radio], outputs=[input_text])
|
| 144 |
|
| 145 |
-
anonymized_text_output = gr.Textbox(label="Anonymized Text with FHE", lines=1)
|
| 146 |
|
| 147 |
identified_words_output = gr.Dataframe(label="Identified Words", visible=False)
|
| 148 |
|
|
|
|
| 142 |
|
| 143 |
examples_radio.change(lambda example_query: example_query, inputs=[examples_radio], outputs=[input_text])
|
| 144 |
|
| 145 |
+
anonymized_text_output = gr.Textbox(label="Anonymized Text with FHE", lines=1, interactive=True)
|
| 146 |
|
| 147 |
identified_words_output = gr.Dataframe(label="Identified Words", visible=False)
|
| 148 |
|
fhe_anonymizer.py
CHANGED
|
@@ -14,13 +14,11 @@ base_dir = Path(__file__).parent
|
|
| 14 |
class FHEAnonymizer:
|
| 15 |
def __init__(self, punctuation_list=".,!?:;"):
|
| 16 |
|
| 17 |
-
# Load tokenizer and model
|
| 18 |
self.tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
|
| 19 |
self.embeddings_model = AutoModel.from_pretrained("obi/deid_roberta_i2b2")
|
| 20 |
|
| 21 |
self.punctuation_list = punctuation_list
|
| 22 |
-
with open(base_dir / "models/without_pronoun_cml_xgboost.model", "r") as model_file:
|
| 23 |
-
self.fhe_ner_detection = load(file=model_file)
|
| 24 |
|
| 25 |
with open(base_dir / "original_document_uuid_mapping.json", 'r') as file:
|
| 26 |
self.uuid_map = json.load(file)
|
|
@@ -44,7 +42,6 @@ class FHEAnonymizer:
|
|
| 44 |
identified_words_with_prob = []
|
| 45 |
processed_tokens = []
|
| 46 |
|
| 47 |
-
print(tokens)
|
| 48 |
for token in tokens:
|
| 49 |
# Directly append non-word tokens or whitespace to processed_tokens
|
| 50 |
if not token.strip() or not re.match(r"\w+", token):
|
|
@@ -54,7 +51,6 @@ class FHEAnonymizer:
|
|
| 54 |
# Prediction for each word
|
| 55 |
x = get_batch_text_representation([token], self.embeddings_model, self.tokenizer)
|
| 56 |
|
| 57 |
-
# prediction_proba = self.fhe_ner_detection.predict_proba(x)
|
| 58 |
prediction_proba = self.fhe_inference(x)
|
| 59 |
probability = prediction_proba[0][1]
|
| 60 |
|
|
@@ -68,6 +64,10 @@ class FHEAnonymizer:
|
|
| 68 |
else:
|
| 69 |
processed_tokens.append(token)
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
# Reconstruct the sentence
|
| 72 |
reconstructed_sentence = ''.join(processed_tokens)
|
| 73 |
return reconstructed_sentence, identified_words_with_prob
|
|
|
|
| 14 |
class FHEAnonymizer:
|
| 15 |
def __init__(self, punctuation_list=".,!?:;"):
|
| 16 |
|
| 17 |
+
# Load tokenizer and model
|
| 18 |
self.tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
|
| 19 |
self.embeddings_model = AutoModel.from_pretrained("obi/deid_roberta_i2b2")
|
| 20 |
|
| 21 |
self.punctuation_list = punctuation_list
|
|
|
|
|
|
|
| 22 |
|
| 23 |
with open(base_dir / "original_document_uuid_mapping.json", 'r') as file:
|
| 24 |
self.uuid_map = json.load(file)
|
|
|
|
| 42 |
identified_words_with_prob = []
|
| 43 |
processed_tokens = []
|
| 44 |
|
|
|
|
| 45 |
for token in tokens:
|
| 46 |
# Directly append non-word tokens or whitespace to processed_tokens
|
| 47 |
if not token.strip() or not re.match(r"\w+", token):
|
|
|
|
| 51 |
# Prediction for each word
|
| 52 |
x = get_batch_text_representation([token], self.embeddings_model, self.tokenizer)
|
| 53 |
|
|
|
|
| 54 |
prediction_proba = self.fhe_inference(x)
|
| 55 |
probability = prediction_proba[0][1]
|
| 56 |
|
|
|
|
| 64 |
else:
|
| 65 |
processed_tokens.append(token)
|
| 66 |
|
| 67 |
+
# Persist the UUID map, updated with the tokens from this query.
|
| 68 |
+
with open(base_dir / "original_document_uuid_mapping.json", 'w') as file:
|
| 69 |
+
json.dump(self.uuid_map, file)
|
| 70 |
+
|
| 71 |
# Reconstruct the sentence
|
| 72 |
reconstructed_sentence = ''.join(processed_tokens)
|
| 73 |
return reconstructed_sentence, identified_words_with_prob
|