Spaces:
Sleeping
Sleeping
""" | |
Anonymization system based on NER and regex.
""" | |
import re | |
from typing import Dict, Tuple | |
from transformers import pipeline | |
import streamlit as st | |
from config import Config, REGEX_PATTERNS | |
class NERAnonimizer:
    """Anonymize text by masking spans found with regex patterns and a NER model.

    Every masked span is replaced by a ``[LABEL_n]`` placeholder. Each masking
    method returns the masked text together with a dict that maps every
    placeholder back to the original text it replaced.
    """

    # An entity is masked only when its NER score is strictly above this value.
    _MIN_NER_SCORE = 0.5

    def __init__(self):
        self.regex_patterns = REGEX_PATTERNS
        # Filled in on first access of the ``ner_pipe`` property.
        self._ner_pipe = None

    @property
    def ner_pipe(self):
        """Return the NER pipeline, creating it on first access.

        This is a property because the rest of the class uses it as an
        attribute (``self.ner_pipe(text)``); as a plain method that call
        would pass ``text`` to the loader itself and raise ``TypeError``.

        Returns:
            The pipeline object, or ``None`` when creating it failed. In that
            case the error is reported with ``st.error`` and loading is
            attempted again on the next access.
        """
        if self._ner_pipe is None:
            with st.spinner("Caricamento modello NER..."):
                try:
                    self._ner_pipe = pipeline(
                        "ner",
                        model=Config.NER_MODEL,
                        aggregation_strategy="simple",
                    )
                except Exception as e:
                    st.error(f"Errore caricamento NER: {e}")
                    return None
        return self._ner_pipe

    def mask_with_regex(self, text: str) -> Tuple[str, Dict[str, str]]:
        """Mask every match of the configured regex patterns.

        Args:
            text: The text to mask.

        Returns:
            A ``(masked_text, entities)`` tuple where ``entities`` maps each
            inserted placeholder to the text it replaced. Placeholder numbers
            are shared across all labels, so they are unique within one call.
        """
        masked_text = text
        found_entities: Dict[str, str] = {}

        # Apply the patterns with the longest pattern string first.
        sorted_patterns = sorted(
            self.regex_patterns.items(),
            key=lambda item: len(item[1]),
            reverse=True,
        )

        for label, pattern in sorted_patterns:
            matches = list(re.finditer(pattern, masked_text, flags=re.IGNORECASE))
            # Replace right to left so the offsets of the remaining matches
            # stay valid while the string changes length.
            for match in reversed(matches):
                original = match.group()
                # Skip a match that is itself a bracketed token, e.g. a
                # placeholder inserted by an earlier pattern.
                if original.startswith('[') and original.endswith(']'):
                    continue
                placeholder = f"[{label}_{len(found_entities)}]"
                found_entities[placeholder] = original
                masked_text = (
                    masked_text[:match.start()] + placeholder + masked_text[match.end():]
                )

        return masked_text, found_entities

    def mask_with_ner(self, text: str, start_index: int = 0) -> Tuple[str, Dict[str, str]]:
        """Mask the entities detected by the NER pipeline.

        Args:
            text: The text to mask.
            start_index: Number assigned to the first placeholder. Pass the
                count of placeholders already present in ``text`` so that the
                new ones cannot clash with them. Defaults to 0.

        Returns:
            A ``(masked_text, entities)`` tuple. When the pipeline is not
            available, or an error occurs while masking, the input text is
            returned unchanged with an empty dict so the caller never receives
            placeholders that are missing from the mapping.
        """
        ner = self.ner_pipe
        if ner is None:
            return text, {}

        try:
            entities = ner(text)
            masked = text
            entity_map: Dict[str, str] = {}
            # Replace right to left so the offsets reported by the pipeline
            # stay valid for the entities that are still to be processed.
            for ent in sorted(entities, key=lambda e: e['start'], reverse=True):
                if ent['score'] <= self._MIN_NER_SCORE:
                    continue
                start, end = ent['start'], ent['end']
                original_text = text[start:end]
                # Skip spans that are already a bracketed token.
                if original_text.startswith('[') and original_text.endswith(']'):
                    continue
                placeholder = f"[{ent['entity_group']}_{start_index + len(entity_map)}]"
                entity_map[placeholder] = original_text
                masked = masked[:start] + placeholder + masked[end:]
            return masked, entity_map
        except Exception as e:
            st.error(f"Errore NER: {e}")
            return text, {}

    def anonymize(self, text: str) -> Tuple[str, Dict[str, str]]:
        """Run the full anonymization pipeline: regex masking, then NER masking.

        Args:
            text: The text to anonymize.

        Returns:
            A ``(masked_text, entities)`` tuple with the placeholders of both
            stages merged into one mapping. Empty or whitespace-only input is
            returned unchanged with an empty dict.
        """
        if not text or not text.strip():
            return text, {}

        masked_text, regex_entities = self.mask_with_regex(text)
        # Continue the regex numbering so a NER placeholder can never equal a
        # regex placeholder and overwrite it in the merged mapping.
        final_text, ner_entities = self.mask_with_ner(
            masked_text, start_index=len(regex_entities)
        )

        return final_text, {**regex_entities, **ner_entities}