File size: 3,600 Bytes
9c8c4f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""

Sistema di anonimizzazione con NER e regex.

"""

import re
from typing import Dict, Tuple
from transformers import pipeline
import streamlit as st
from config import Config, REGEX_PATTERNS

class NERAnonimizer:
    """Anonimizzatore con NER e regex"""
    
    def __init__(self):
        self.regex_patterns = REGEX_PATTERNS
        self._ner_pipe = None
    
    @property
    def ner_pipe(self):
        """Lazy loading del modello NER"""
        if self._ner_pipe is None:
            with st.spinner("Caricamento modello NER..."):
                try:
                    self._ner_pipe = pipeline(
                        "ner",
                        model=Config.NER_MODEL,
                        aggregation_strategy="simple"
                    )
                except Exception as e:
                    st.error(f"Errore caricamento NER: {e}")
                    return None
        return self._ner_pipe
    
    def mask_with_regex(self, text: str) -> Tuple[str, Dict]:
        """Applica mascheramento con regex"""
        masked_text = text
        found_entities = {}
        
        # Ordina pattern per lunghezza (più lunghi prima)
        sorted_patterns = sorted(
            self.regex_patterns.items(), 
            key=lambda item: len(item[1]), 
            reverse=True
        )

        for label, pattern in sorted_patterns:
            matches = list(re.finditer(pattern, masked_text, flags=re.IGNORECASE))
            for match in reversed(matches):
                original = match.group()
                if original.startswith('[') and original.endswith(']'):
                    continue

                placeholder = f"[{label}_{len(found_entities)}]"
                found_entities[placeholder] = original
                masked_text = masked_text[:match.start()] + placeholder + masked_text[match.end():]
        
        return masked_text, found_entities
    
    def mask_with_ner(self, text: str) -> Tuple[str, Dict]:
        """Applica mascheramento con NER"""
        if not self.ner_pipe:
            return text, {}
            
        try:
            entities = self.ner_pipe(text)
            entity_map = {}
            
            sorted_entities = sorted(entities, key=lambda x: x['start'], reverse=True)
            
            for ent in sorted_entities:
                if ent['score'] > 0.5:
                    label = ent['entity_group']
                    original_text = text[ent['start']:ent['end']]
                    
                    if original_text.startswith('[') and original_text.endswith(']'):
                        continue

                    placeholder = f"[{label}_{len(entity_map)}]"
                    entity_map[placeholder] = original_text
                    
                    text = text[:ent['start']] + placeholder + text[ent['end']:]
            
            return text, entity_map
            
        except Exception as e:
            st.error(f"Errore NER: {e}")
            return text, {}
    
    def anonymize(self, text: str) -> Tuple[str, Dict]:
        """Pipeline completa di anonimizzazione"""
        if not text or not text.strip():
            return text, {}
        
        # Regex prima, poi NER
        masked_text, regex_entities = self.mask_with_regex(text)
        final_text, ner_entities = self.mask_with_ner(masked_text)
        
        # Combina entità
        all_entities = {**regex_entities, **ner_entities}
        
        return final_text, all_entities