Upload span_lableing_operators.py with huggingface_hub
Browse files- span_lableing_operators.py +80 -0
span_lableing_operators.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, Dict, List, Optional
|
| 2 |
+
|
| 3 |
+
from .operator import StreamInstanceOperator
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class IobExtractor(StreamInstanceOperator):
    """Extract labeled character spans from IOB-tagged token sequences.

    The operator follows the Inside-Outside-Beginning (IOB) tagging convention:
    a begin label opens a new entity span, and an inside label that directly
    follows a token of the same entity extends that span. Any other label ends
    the current span.

    Attributes:
        labels (List[str]): A list of entity type labels, e.g., ["Person", "Organization", "Location"].
        begin_labels (List[str]): A list of labels indicating the beginning of an entity, e.g., ["B-PER", "B-ORG", "B-LOC"].
            Entry ``i`` maps to ``labels[i]``.
        inside_labels (List[str]): A list of labels indicating the continuation of an entity, e.g., ["I-PER", "I-ORG", "I-LOC"].
            Assumed to be index-aligned with ``begin_labels`` (as in the example below).
        outside_label (str): The label indicating tokens outside of any entity, typically "O".

    Each processed instance must provide three fields: ``"text"`` (the raw
    string), ``"tokens"`` (the tokens, in the order they appear in the text)
    and ``"labels"`` (one IOB tag per token). The extracted spans are written
    to ``instance["spans"]``; each span holds the ``start`` and ``end``
    character offsets into the text, the entity ``label`` and the span ``text``.

    Example of instantiation and usage:
    ```python
    operator = IobExtractor(
        labels=["Person", "Organization", "Location"],
        begin_labels=["B-PER", "B-ORG", "B-LOC"],
        inside_labels=["I-PER", "I-ORG", "I-LOC"],
        outside_label="O",
    )

    instance = {
        "text": "John Doe works at OpenAI",
        "tokens": ["John", "Doe", "works", "at", "OpenAI"],
        "labels": ["B-PER", "I-PER", "O", "O", "B-ORG"],
    }
    processed_instance = operator.process(instance)
    print(processed_instance["spans"])
    # Output: [{'start': 0, 'label': 'Person', 'end': 8, 'text': 'John Doe'},
    #          {'start': 18, 'label': 'Organization', 'end': 24, 'text': 'OpenAI'}]
    ```

    For more details on the IOB tagging convention, see: https://en.wikipedia.org/wiki/Inside-outside-beginning_(tagging)
    """

    labels: List[str]
    begin_labels: List[str]
    inside_labels: List[str]
    outside_label: str

    def process(
        self, instance: Dict[str, Any], stream_name: Optional[str] = None
    ) -> Dict[str, Any]:
        """Add a ``"spans"`` field with the entities found in the instance.

        Args:
            instance: Mapping with the keys ``"labels"``, ``"tokens"`` and
                ``"text"``. It is modified in place.
            stream_name: Unused by this method.

        Returns:
            The same ``instance`` object, with ``instance["spans"]`` set to a
            list of dicts holding ``start``, ``label``, ``end`` and ``text``.

        Raises:
            ValueError: If ``labels`` and ``tokens`` differ in length, or if a
                token cannot be located in the text after the previous token.
        """
        labels = instance["labels"]
        tokens = instance["tokens"]
        text = instance["text"]

        # zip() would silently drop the tail of the longer sequence.
        if len(labels) != len(tokens):
            raise ValueError(
                f"Number of labels ({len(labels)}) does not match number of tokens ({len(tokens)})"
            )

        spans = []
        open_span = None  # span that the next inside label may extend
        open_entity = -1  # index of the open span's entity type
        current_pos = 0

        for label, token in zip(labels, tokens):
            # Search forward only, so repeated tokens map to successive offsets.
            token_pos = text.find(token, current_pos)
            if token_pos == -1:
                raise ValueError(
                    f"Token '{token}' not found in text '{text}' starting from position {current_pos}"
                )

            end_pos = token_pos + len(token)

            if label in self.begin_labels:
                open_entity = self.begin_labels.index(label)
                open_span = {
                    "start": token_pos,
                    "label": self.labels[open_entity],
                    "end": end_pos,
                }
                spans.append(open_span)
            elif (
                open_span is not None
                and label in self.inside_labels
                and self.inside_labels.index(label) == open_entity
            ):
                open_span["end"] = end_pos
            else:
                # An outside label, an unknown label, or an inside label that
                # does not continue the open entity: close the span so a later
                # inside label cannot stretch it across this token.
                open_span = None

            current_pos = end_pos

        for span in spans:
            span["text"] = text[span["start"] : span["end"]]

        instance["spans"] = spans
        return instance
|