asdc committed on
Commit
6e9b058
·
verified ·
1 Parent(s): 65003ad

Upload streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +21 -28
src/streamlit_app.py CHANGED
@@ -9,15 +9,15 @@ from transformers import AutoTokenizer, AutoModelForTokenClassification
9
 
10
  # Mapping of label to color
11
  LABEL_COLORS = {
12
- 'LABEL-0': '#cccccc', # NONE
13
- 'LABEL-1': '#ffadad', # B-DATE
14
- 'LABEL-2': '#ffd6a5', # I-DATE
15
- 'LABEL-3': '#fdffb6', # B-TIME
16
- 'LABEL-4': '#caffbf', # I-TIME
17
- 'LABEL-5': '#9bf6ff', # B-DURATION
18
- 'LABEL-6': '#a0c4ff', # I-DURATION
19
- 'LABEL-7': '#bdb2ff', # B-SET
20
- 'LABEL-8': '#ffc6ff', # I-SET
21
  }
22
 
23
  LABEL_MEANINGS = {
@@ -40,41 +40,33 @@ def load_model():
40
 
41
  def ner_with_robertime(text: str) -> List[Tuple[str, str]]:
42
  tokenizer, model = load_model()
43
- # Tokenize and get input tensors
44
  tokens = tokenizer(text, return_tensors="pt", truncation=True, is_split_into_words=False)
45
  with torch.no_grad():
46
  outputs = model(**tokens)
47
  predictions = torch.argmax(outputs.logits, dim=2)[0].tolist()
48
- # Map ids to labels
49
  labels = [model.config.id2label[pred] for pred in predictions]
50
- # Get tokens (handling subwords)
51
  word_ids = tokens.word_ids(batch_index=0)
52
- token_list = tokenizer.convert_ids_to_tokens(tokens["input_ids"][0])
53
- # Merge subwords and assign entity labels
54
  entities = []
55
- current_word = ''
56
  current_label = None
57
  last_word_id = None
58
  for idx, word_id in enumerate(word_ids):
59
  if word_id is None:
60
  continue
61
- token = token_list[idx]
62
  label = labels[idx]
63
- if token.startswith('▁') or token.startswith('##') or token.startswith('Ġ'):
64
- token = token.lstrip('▁#Ġ')
65
- if word_id != last_word_id and current_word:
66
- entities.append((current_word, current_label))
67
- current_word = token
68
  current_label = label
69
  else:
70
- if current_word:
71
- current_word += token if token.startswith("'") else f' {token}'
72
- else:
73
- current_word = token
74
  current_label = label
75
  last_word_id = word_id
76
- if current_word:
77
- entities.append((current_word, current_label))
 
78
  return entities
79
 
80
  def colorize_entities(ner_result: List[Tuple[str, str]]) -> str:
@@ -83,7 +75,8 @@ def colorize_entities(ner_result: List[Tuple[str, str]]) -> str:
83
  norm_label = label.replace('_', '-')
84
  if norm_label != 'LABEL-0':
85
  color = LABEL_COLORS.get(norm_label, '#eeeeee')
86
- html += f'<span style="background-color:{color};padding:2px 4px;border-radius:4px;margin:1px;">{token}</span> '
 
87
  else:
88
  html += f'{token} '
89
  return html
 
9
 
10
# Background highlight color for each token-classification label.
# Index i of the palette corresponds to the model label 'LABEL-i'
# (LABEL-0 = NONE, then B-/I- pairs for DATE, TIME, DURATION, SET).
LABEL_COLORS = {
    f"LABEL-{index}": hex_color
    for index, hex_color in enumerate([
        "#ffffff",  # NONE (no color)
        "#fff4e6",  # B-DATE (creamy orange)
        "#ffe9ec",  # I-DATE (creamy pink)
        "#f3ffe3",  # B-TIME (creamy green)
        "#e6f7ff",  # I-TIME (creamy blue)
        "#f9f7e8",  # B-DURATION (creamy yellow)
        "#f6eaff",  # I-DURATION (creamy purple)
        "#fdf6ec",  # B-SET (creamy beige)
        "#f6fff8",  # I-SET (creamy mint)
    ])
}
22
 
23
  LABEL_MEANINGS = {
 
40
 
41
def ner_with_robertime(text: str) -> List[Tuple[str, str]]:
    """Run token classification on *text* and return (word, label) pairs.

    Subword tokens that belong to the same word (per the fast tokenizer's
    word ids) are decoded back into a single surface word; the word's label
    is the prediction of its final subword token.
    """
    tokenizer, model = load_model()
    encoding = tokenizer(text, return_tensors="pt", truncation=True, is_split_into_words=False)
    with torch.no_grad():
        logits = model(**encoding).logits
    pred_ids = torch.argmax(logits, dim=2)[0].tolist()
    pred_labels = [model.config.id2label[p] for p in pred_ids]

    word_ids = encoding.word_ids(batch_index=0)
    ids = encoding["input_ids"][0]

    def flush(positions, label, out):
        # Decode the accumulated subword pieces back into one surface word.
        word = tokenizer.decode([ids[i] for i in positions], skip_special_tokens=True)
        out.append((word, label))

    entities: List[Tuple[str, str]] = []
    pending: List[int] = []      # token positions of the word being built
    pending_label = None         # label carried by the word being built
    prev_word_id = None
    for pos, wid in enumerate(word_ids):
        if wid is None:
            # Special tokens (CLS/SEP/pad) carry no word id — skip them.
            continue
        if wid != prev_word_id and pending:
            # Word boundary: emit the finished word, start a new one.
            flush(pending, pending_label, entities)
            pending = [pos]
        else:
            pending.append(pos)
        pending_label = pred_labels[pos]
        prev_word_id = wid
    if pending:
        flush(pending, pending_label, entities)
    return entities
71
 
72
  def colorize_entities(ner_result: List[Tuple[str, str]]) -> str:
 
75
  norm_label = label.replace('_', '-')
76
  if norm_label != 'LABEL-0':
77
  color = LABEL_COLORS.get(norm_label, '#eeeeee')
78
+ label_meaning = LABEL_MEANINGS.get(norm_label, norm_label)
79
+ html += f'<span style="background-color:{color};padding:2px 4px;border-radius:4px;margin:1px;" title="{label_meaning}">{token}</span> '
80
  else:
81
  html += f'{token} '
82
  return html