Upload processors.py with huggingface_hub
Browse files- processors.py +38 -10
processors.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
import json
|
| 2 |
import re
|
| 3 |
-
from typing import Any
|
| 4 |
|
| 5 |
from .operator import BaseFieldOperator
|
| 6 |
|
|
@@ -17,23 +16,21 @@ class ToStringStripped(BaseFieldOperator):
|
|
| 17 |
|
| 18 |
class ToListByComma(BaseFieldOperator):
|
| 19 |
def process(self, instance):
|
| 20 |
-
|
| 21 |
-
return output
|
| 22 |
|
| 23 |
|
| 24 |
class RegexParser(BaseFieldOperator):
|
| 25 |
-
"""
|
| 26 |
-
A processor that uses regex in order to parse a string.
|
| 27 |
-
"""
|
| 28 |
|
| 29 |
regex: str
|
| 30 |
termination_regex: str = None
|
| 31 |
|
| 32 |
def process(self, text):
|
| 33 |
-
if self.termination_regex is not None and re.fullmatch(
|
|
|
|
|
|
|
| 34 |
return []
|
| 35 |
-
|
| 36 |
-
return matches
|
| 37 |
|
| 38 |
|
| 39 |
class LoadJson(BaseFieldOperator):
|
|
@@ -61,7 +58,9 @@ class DictOfListsToPairs(BaseFieldOperator):
|
|
| 61 |
for key, values in obj.items():
|
| 62 |
for value in values:
|
| 63 |
assert isinstance(value, str)
|
| 64 |
-
pair = (
|
|
|
|
|
|
|
| 65 |
result.append(pair)
|
| 66 |
return result
|
| 67 |
except:
|
|
@@ -74,3 +73,32 @@ class TakeFirstNonEmptyLine(BaseFieldOperator):
|
|
| 74 |
if len(splitted) == 0:
|
| 75 |
return ""
|
| 76 |
return splitted[0].strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import json
|
| 2 |
import re
|
|
|
|
| 3 |
|
| 4 |
from .operator import BaseFieldOperator
|
| 5 |
|
|
|
|
| 16 |
|
| 17 |
class ToListByComma(BaseFieldOperator):
    """Split a comma-separated string into a list of stripped items."""

    def process(self, instance):
        """Return the comma-delimited pieces of ``instance``, each stripped.

        Surrounding whitespace is removed from every piece. A string that
        contains no comma yields a single-element list.
        """
        pieces = instance.split(",")
        return list(map(str.strip, pieces))
|
|
|
|
| 20 |
|
| 21 |
|
| 22 |
class RegexParser(BaseFieldOperator):
    """A processor that uses regex in order to parse a string."""

    # Pattern passed to ``re.findall`` to extract the results.
    regex: str
    # Optional pattern; when it matches the entire text, nothing is extracted.
    termination_regex: str = None

    def process(self, text):
        """Return every match of ``regex`` in ``text``.

        If ``termination_regex`` is set and matches the whole of ``text``,
        an empty list is returned instead.
        """
        stop_pattern = self.termination_regex
        if stop_pattern is None or not re.fullmatch(stop_pattern, text):
            return re.findall(self.regex, text)
        return []
|
|
|
|
| 34 |
|
| 35 |
|
| 36 |
class LoadJson(BaseFieldOperator):
|
|
|
|
| 58 |
for key, values in obj.items():
|
| 59 |
for value in values:
|
| 60 |
assert isinstance(value, str)
|
| 61 |
+
pair = (
|
| 62 |
+
(key, value) if self.position_key_before_value else (value, key)
|
| 63 |
+
)
|
| 64 |
result.append(pair)
|
| 65 |
return result
|
| 66 |
except:
|
|
|
|
| 73 |
if len(splitted) == 0:
|
| 74 |
return ""
|
| 75 |
return splitted[0].strip()
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
class LowerCaseTillPunc(BaseFieldOperator):
    """Lower-case a string and cut it at its first punctuation mark."""

    def process(self, instance):
        """Return ``instance`` lower-cased, truncated before the first of ``.,!?;``.

        When none of those characters occurs, the whole lower-cased string
        is returned.
        """
        lowered = instance.lower()
        punctuation = re.search(r"[.,!?;]", lowered)
        if punctuation is None:
            return lowered
        # Keep only the text that precedes the first punctuation character.
        return lowered[: punctuation.start()]
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
class FirstCharacter(BaseFieldOperator):
    """Extract the first word character found in a string."""

    def process(self, instance):
        """Return the first ``\\w`` character in ``instance``, or ``""`` if none.

        ``re.search`` scans the whole string, so leading whitespace and any
        other non-word characters before the first word character are skipped.
        """
        match = re.search(r"\s*(\w)", instance)
        if match is None:
            return ""
        # Group 1 always participates in a successful match, so read it
        # directly rather than via ``groups(default)[0]``.
        return match.group(1)
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
class StringOrNotString(BaseFieldOperator):
    """Map free text onto ``string`` or ``"not " + string`` when either occurs."""

    # Phrase to look for; the comparison is case-insensitive.
    string: str

    def process(self, instance):
        """Normalize ``instance`` to the lower-cased phrase or its negation.

        The negated form (``"not "`` followed by the phrase) is checked first,
        because any text containing it also contains the bare phrase. If
        neither occurs, ``instance`` is returned unchanged.
        """
        target = self.string.lower()
        negated = "not " + target
        haystack = instance.lower()
        for candidate in (negated, target):
            if candidate in haystack:
                return candidate
        return instance
|