Hobson committed
Commit d5d83ae · 1 Parent(s): 9aec672

nlutils from app.py

Files changed (2)
  1. app.py +4 -193
  2. nlutils.py +327 -0
app.py CHANGED
@@ -2,196 +2,7 @@ import gradio as gr
 import spacy  # noqa
 from transformers import pipeline

-# import os
-# os.environ['KMP_DUPLICATE_LIB_OK']='True'
-# import spacy
-
-# Change this according to what words should be corrected to
-SPELL_CORRECT_MIN_CHAR_DIFF = 2
-
-TOKENS2INT_ERROR_INT = 32202
-
-ONES = [
-    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
-    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
-    "sixteen", "seventeen", "eighteen", "nineteen",
-]
-
-CHAR_MAPPING = {
-    "-": " ",
-    "_": " ",
-    "and": " ",
-}
-# CHAR_MAPPING.update((str(i), word) for i, word in enumerate([" " + s + " " for s in ONES]))
-TOKEN_MAPPING = {
-    "and": " ",
-    "oh": "0",
-}
-
-
-def find_char_diff(a, b):
-    # Finds the character difference between two str objects by counting the occurrences of every character. Not edit distance.
-    char_counts_a = {}
-    char_counts_b = {}
-    for char in a:
-        if char in char_counts_a.keys():
-            char_counts_a[char] += 1
-        else:
-            char_counts_a[char] = 1
-    for char in b:
-        if char in char_counts_b.keys():
-            char_counts_b[char] += 1
-        else:
-            char_counts_b[char] = 1
-    char_diff = 0
-    for i in char_counts_a:
-        if i in char_counts_b.keys():
-            char_diff += abs(char_counts_a[i] - char_counts_b[i])
-        else:
-            char_diff += char_counts_a[i]
-    return char_diff
-
-
-def tokenize(text):
-    text = text.lower()
-    # print(text)
-    text = replace_tokens(''.join(i for i in replace_chars(text)).split())
-    # print(text)
-    text = [i for i in text if i != ' ']
-    # print(text)
-    output = []
-    for word in text:
-        # print(word)
-        output.append(convert_word_to_int(word))
-    output = [i for i in output if i != ' ']
-    # print(output)
-    return output
-
-
-def detokenize(tokens):
-    return ' '.join(tokens)
-
-
-def replace_tokens(tokens, token_mapping=TOKEN_MAPPING):
-    return [token_mapping.get(tok, tok) for tok in tokens]
-
-
-def replace_chars(text, char_mapping=CHAR_MAPPING):
-    return [char_mapping.get(c, c) for c in text]
-
-
-def convert_word_to_int(in_word, numwords={}):
-    # Converts a single word/str into a single int
-    tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
-    scales = ["hundred", "thousand", "million", "billion", "trillion"]
-    if not numwords:
-        for idx, word in enumerate(ONES):
-            numwords[word] = idx
-        for idx, word in enumerate(tens):
-            numwords[word] = idx * 10
-        for idx, word in enumerate(scales):
-            numwords[word] = 10 ** (idx * 3 or 2)
-    if in_word in numwords:
-        # print(in_word)
-        # print(numwords[in_word])
-        return numwords[in_word]
-    try:
-        int(in_word)
-        return int(in_word)
-    except ValueError:
-        pass
-    # Spell correction using find_char_diff
-    all_words = ONES + tens + scales
-    char_diffs = [find_char_diff(in_word, word) for word in all_words]
-    min_char_diff = min(char_diffs)
-    if min_char_diff <= SPELL_CORRECT_MIN_CHAR_DIFF:
-        # Look up the numeric value of the closest word (the bare list index is wrong for tens/scales)
-        return numwords[all_words[char_diffs.index(min_char_diff)]]
-
-
-def tokens2int(tokens):
-    # Takes a list of tokens and returns an int representation of them
-    types = []
-    for i in tokens:
-        if i <= 9:
-            types.append(1)
-
-        elif i <= 90:
-            types.append(2)
-
-        else:
-            types.append(3)
-    # print(tokens)
-    if len(tokens) <= 3:
-        current = 0
-        for i, number in enumerate(tokens):
-            if i != 0 and types[i] < types[i - 1] and current != tokens[i - 1] and types[i - 1] != 3:
-                current += tokens[i] + tokens[i - 1]
-            elif current <= tokens[i] and current != 0:
-                current *= tokens[i]
-            elif 3 not in types and 1 not in types:
-                current = int(''.join(str(i) for i in tokens))
-                break
-            elif '111' in ''.join(str(i) for i in types) and 2 not in types and 3 not in types:
-                current = int(''.join(str(i) for i in tokens))
-                break
-            else:
-                current += number
-
-    elif 3 not in types and 2 not in types:
-        current = int(''.join(str(i) for i in tokens))
-
-    else:
-        """
-        double_list = []
-        current_double = []
-        double_type_list = []
-        for i in tokens:
-            if len(current_double) < 2:
-                current_double.append(i)
-            else:
-                double_list.append(current_double)
-                current_double = []
-        current_double = []
-        for i in types:
-            if len(current_double) < 2:
-                current_double.append(i)
-            else:
-                double_type_list.append(current_double)
-                current_double = []
-        print(double_type_list)
-        print(double_list)
-        current = 0
-        for i, type_double in enumerate(double_type_list):
-            if len(type_double) == 1:
-                current += double_list[i][0]
-            elif type_double[0] == type_double[1]:
-                current += int(str(double_list[i][0]) + str(double_list[i][1]))
-            elif type_double[0] > type_double[1]:
-                current += sum(double_list[i])
-            elif type_double[0] < type_double[1]:
-                current += double_list[i][0] * double_list[i][1]
-        # print(current)
-        """
-        count = 0
-        current = 0
-        for i, token in enumerate(tokens):
-            count += 1
-            if count == 2:
-                if types[i - 1] == types[i]:
-                    current += int(str(token) + str(tokens[i - 1]))
-                elif types[i - 1] > types[i]:
-                    current += tokens[i - 1] + token
-                else:
-                    current += tokens[i - 1] * token
-                count = 0
-            elif i == len(tokens) - 1:
-                current += token
-
-    return current
-
-
-def text2int(text):
-    # Wraps all of the functions up into one
-    return tokens2int(tokenize(text))
+from mathtext.nlutils import text2int


 sentiment = pipeline(task="sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
@@ -206,9 +17,9 @@ with gr.Blocks() as html_block:

     with gr.Tab("Text to integer"):
         inputs_text2int = [gr.Text(
-            placeholder="Type a number as text or a sentence",
-            label="Text to process",
-            value="forty two")]
+            placeholder="Type a number as text or a sentence",
+            label="Text to process",
+            value="forty two")]

         outputs_text2int = gr.Textbox(label="Output integer")

 
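For context, a minimal sketch of how app.py can call the extracted function after this commit. The Button, click wiring, and launch() call are illustrative assumptions, not part of the diff, which only shows the import and the widgets:

import gradio as gr
from mathtext.nlutils import text2int  # the import this commit adds

with gr.Blocks() as html_block:
    with gr.Tab("Text to integer"):
        inputs_text2int = [gr.Text(
            placeholder="Type a number as text or a sentence",
            label="Text to process",
            value="forty two")]
        outputs_text2int = gr.Textbox(label="Output integer")
        # Hypothetical wiring, not shown in the diff:
        convert_button = gr.Button("Convert")
        convert_button.click(
            fn=lambda text: str(text2int(text)),
            inputs=inputs_text2int,
            outputs=outputs_text2int,
        )

html_block.launch()
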
nlutils.py ADDED
@@ -0,0 +1,327 @@
+import spacy  # noqa
+import time
+from editdistance import eval as edit_dist
+
+# import os
+# os.environ['KMP_DUPLICATE_LIB_OK']='True'
+# import spacy
+
+# Change this according to what words should be corrected to
+SPELL_CORRECT_MIN_CHAR_DIFF = 2
+
+TOKENS2INT_ERROR_INT = 32202
+
+ONES = [
+    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
+    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
+    "sixteen", "seventeen", "eighteen", "nineteen",
+]
+
+CHAR_MAPPING = {
+    "-": " ",
+    "_": " ",
+    "and": " ",
+}
+# CHAR_MAPPING.update((str(i), word) for i, word in enumerate([" " + s + " " for s in ONES]))
+TOKEN_MAPPING = {
+    "and": " ",
+    "oh": "0",
+}
+
+
+def find_char_diff(a, b):
+    # Finds the character difference between two str objects by counting the occurrences of every character. Not edit distance.
+    char_counts_a = {}
+    char_counts_b = {}
+    for char in a:
+        if char in char_counts_a.keys():
+            char_counts_a[char] += 1
+        else:
+            char_counts_a[char] = 1
+    for char in b:
+        if char in char_counts_b.keys():
+            char_counts_b[char] += 1
+        else:
+            char_counts_b[char] = 1
+    char_diff = 0
+    for i in char_counts_a:
+        if i in char_counts_b.keys():
+            char_diff += abs(char_counts_a[i] - char_counts_b[i])
+        else:
+            char_diff += char_counts_a[i]
+    return char_diff
+
+
+def tokenize(text):
+    text = text.lower()
+    # print(text)
+    text = replace_tokens(''.join(i for i in replace_chars(text)).split())
+    # print(text)
+    text = [i for i in text if i != ' ']
+    # print(text)
+    output = []
+    for word in text:
+        # print(word)
+        output.append(convert_word_to_int(word))
+    output = [i for i in output if i != ' ']
+    # print(output)
+    return output
+
+
+def detokenize(tokens):
+    return ' '.join(tokens)
+
+
+def replace_tokens(tokens, token_mapping=TOKEN_MAPPING):
+    return [token_mapping.get(tok, tok) for tok in tokens]
+
+
+def replace_chars(text, char_mapping=CHAR_MAPPING):
+    return [char_mapping.get(c, c) for c in text]
+
+
+def convert_word_to_int(in_word, numwords={}):
+    # Converts a single word/str into a single int
+    tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
+    scales = ["hundred", "thousand", "million", "billion", "trillion"]
+    if not numwords:
+        for idx, word in enumerate(ONES):
+            numwords[word] = idx
+        for idx, word in enumerate(tens):
+            numwords[word] = idx * 10
+        for idx, word in enumerate(scales):
+            numwords[word] = 10 ** (idx * 3 or 2)
+    if in_word in numwords:
+        # print(in_word)
+        # print(numwords[in_word])
+        return numwords[in_word]
+    try:
+        int(in_word)
+        return int(in_word)
+    except ValueError:
+        pass
+    # Spell correction using find_char_diff
+    all_words = ONES + tens + scales
+    char_diffs = [find_char_diff(in_word, word) for word in all_words]
+    min_char_diff = min(char_diffs)
+    if min_char_diff <= SPELL_CORRECT_MIN_CHAR_DIFF:
+        # Look up the numeric value of the closest word (the bare list index is wrong for tens/scales)
+        return numwords[all_words[char_diffs.index(min_char_diff)]]
+
+
+def tokens2int(tokens):
+    # Takes a list of tokens and returns an int representation of them
+    types = []
+    for i in tokens:
+        if i <= 9:
+            types.append(1)
+
+        elif i <= 90:
+            types.append(2)
+
+        else:
+            types.append(3)
+    # print(tokens)
+    if len(tokens) <= 3:
+        current = 0
+        for i, number in enumerate(tokens):
+            if i != 0 and types[i] < types[i - 1] and current != tokens[i - 1] and types[i - 1] != 3:
+                current += tokens[i] + tokens[i - 1]
+            elif current <= tokens[i] and current != 0:
+                current *= tokens[i]
+            elif 3 not in types and 1 not in types:
+                current = int(''.join(str(i) for i in tokens))
+                break
+            elif '111' in ''.join(str(i) for i in types) and 2 not in types and 3 not in types:
+                current = int(''.join(str(i) for i in tokens))
+                break
+            else:
+                current += number
+
+    elif 3 not in types and 2 not in types:
+        current = int(''.join(str(i) for i in tokens))
+
+    else:
+        """
+        double_list = []
+        current_double = []
+        double_type_list = []
+        for i in tokens:
+            if len(current_double) < 2:
+                current_double.append(i)
+            else:
+                double_list.append(current_double)
+                current_double = []
+        current_double = []
+        for i in types:
+            if len(current_double) < 2:
+                current_double.append(i)
+            else:
+                double_type_list.append(current_double)
+                current_double = []
+        print(double_type_list)
+        print(double_list)
+        current = 0
+        for i, type_double in enumerate(double_type_list):
+            if len(type_double) == 1:
+                current += double_list[i][0]
+            elif type_double[0] == type_double[1]:
+                current += int(str(double_list[i][0]) + str(double_list[i][1]))
+            elif type_double[0] > type_double[1]:
+                current += sum(double_list[i])
+            elif type_double[0] < type_double[1]:
+                current += double_list[i][0] * double_list[i][1]
+        # print(current)
+        """
+        count = 0
+        current = 0
+        for i, token in enumerate(tokens):
+            count += 1
+            if count == 2:
+                if types[i - 1] == types[i]:
+                    current += int(str(token) + str(tokens[i - 1]))
+                elif types[i - 1] > types[i]:
+                    current += tokens[i - 1] + token
+                else:
+                    current += tokens[i - 1] * token
+                count = 0
+            elif i == len(tokens) - 1:
+                current += token
+
+    return current
+
+
+def text2int(text):
+    # Wraps all of the functions up into one
+    return tokens2int(tokenize(text))
+
+###############################################
+# Vish editdistance approach doesn't halt
+# (a memoized variant is sketched after this diff)
+
+
+def lev_dist(a, b):
+    '''
+    This function will calculate the levenshtein distance between two input
+    strings a and b
+
+    params:
+        a (String) : The first string you want to compare
+        b (String) : The second string you want to compare
+
+    returns:
+        This function will return the distance between string a and b.
+
+    example:
+        a = 'stamp'
+        b = 'stomp'
+        lev_dist(a, b)
+        >> 1
+    '''
+    if not (isinstance(a, str) and isinstance(b, str)):
+        raise ValueError(f"lev_dist() requires 2 strings not lev_dist({repr(a)}, {repr(b)})")
+    if a == b:
+        return 0
+
+    def min_dist(s1, s2):
+        if s1 >= len(a) or s2 >= len(b):
+            return len(a) - s1 + len(b) - s2
+        # debug print moved below the bounds check so a[s1]/b[s2] can't raise IndexError
+        print(f"{a[s1]}s1{b[s2]}s2 ", end='')
+
+        # no change required
+        if a[s1] == b[s2]:
+            return min_dist(s1 + 1, s2 + 1)
+
+        return 1 + min(
+            min_dist(s1, s2 + 1),      # insert character
+            min_dist(s1 + 1, s2),      # delete character
+            min_dist(s1 + 1, s2 + 1),  # replace character
+        )
+
+    dist = min_dist(0, 0)
+    print(f"\n lev_dist({a}, {b}) => {dist}")
+    return dist
+
+
+def correct_number_text(text):
+    """ Correct the spelling of number words in an English str so that text2int can parse it
+
+    Used by robust_text2int:
+    >>> robust_text2int("too")
+    2
+    >>> robust_text2int("fore")
+    4
+    >>> robust_text2int("1 2 tree")
+    123
+    """
+    words = {
+        "zero": 0,
+        "one": 1,
+        "two": 2,
+        "three": 3,
+        "four": 4,
+        "five": 5,
+        "six": 6,
+        "seven": 7,
+        "eight": 8,
+        "nine": 9,
+        "ten": 10,
+        "eleven": 11,
+        "twelve": 12,
+        "thirteen": 13,
+        "fourteen": 14,
+        "fifteen": 15,
+        "sixteen": 16,
+        "seventeen": 17,
+        "eighteen": 18,
+        "nineteen": 19,
+        "score": 20,
+        "twenty": 20,
+        "thirty": 30,
+        "forty": 40,
+        "fifty": 50,
+        "sixty": 60,
+        "seventy": 70,
+        "eighty": 80,
+        "ninety": 90,
+        "hundred": 100,
+        "thousand": 1000,
+        "million": 1000000,
+        "billion": 1000000000,
+    }
+
+    text = text.lower()
+    text_words = text.replace("-", " ").split()
+    corrected_words = []
+    for text_word in text_words:
+        if text_word not in words:
+            print(f"{text_word} not in words")
+            if not isinstance(text_word, str):
+                return TOKENS2INT_ERROR_INT
+            t0 = time.time()
+            min_dist = len(text_word)
+            # fall back to the original token so digits like "1" survive uncorrected
+            correct_spelling = text_word
+            for word in words:
+                dist = edit_dist(word, text_word)
+                if dist < min_dist:
+                    correct_spelling = word
+                    min_dist = dist
+            corrected_words.append(correct_spelling)
+            t1 = time.time()
+            print(f"{text_word} dt:{t1 - t0}")
+        else:
+            corrected_words.append(text_word)
+
+    corrected_text = " ".join(corrected_words)
+
+    print(corrected_text)
+    return corrected_text
+
+# From here on, we can use text2int
+# TODO
+
+
+def robust_text2int(text):
+    """ Correct spelling of number words in text before using text2int """
+    try:
+        return tokens2int(tokenize(correct_number_text(text)))
+    except Exception as e:
+        print(e)
+        return TOKENS2INT_ERROR_INT
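
The commit adds nlutils.py without tests, so here is a hedged smoke test (assuming nlutils.py is importable from the working directory and the editdistance package is installed); the expected values follow the doctests in correct_number_text:

from nlutils import text2int, robust_text2int, TOKENS2INT_ERROR_INT

print(text2int("forty two"))        # 42
print(robust_text2int("too"))       # 2   ("too" corrected to "two")
print(robust_text2int("fore"))      # 4   ("fore" corrected to "four")
print(robust_text2int("1 2 tree"))  # 123 ("tree" corrected to "three")
print(robust_text2int(None))        # 32202, i.e. TOKENS2INT_ERROR_INT

And the memoized variant promised above: the recursive min_dist inside lev_dist recomputes overlapping subproblems, so its running time grows exponentially with string length, which is why the "editdistance approach doesn't halt". A sketch (an assumption, not part of this commit) that caches subproblems with functools.lru_cache:

from functools import lru_cache

def lev_dist_memo(a, b):
    # Same recurrence as lev_dist's min_dist, but cached, so each (s1, s2)
    # pair is computed once: O(len(a) * len(b)) instead of exponential time.
    @lru_cache(maxsize=None)
    def min_dist(s1, s2):
        if s1 >= len(a) or s2 >= len(b):
            return len(a) - s1 + len(b) - s2
        if a[s1] == b[s2]:
            return min_dist(s1 + 1, s2 + 1)
        return 1 + min(
            min_dist(s1, s2 + 1),      # insert character
            min_dist(s1 + 1, s2),      # delete character
            min_dist(s1 + 1, s2 + 1),  # replace character
        )
    return min_dist(0, 0)

print(lev_dist_memo("stamp", "stomp"))  # 1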