Baishali committed
Commit 6fe6caf · 1 Parent(s): 8b626ea

Added Project for Hosting
Files changed (7)
  1. .gitattributes +2 -27
  2. README.md +4 -16
  3. comments_toxicity.h5 +3 -0
  4. data_cleaning.py +291 -0
  5. requirements.txt +6 -0
  6. tokenizer.pickle +3 -0
  7. web_app.py +71 -0
.gitattributes CHANGED
@@ -1,27 +1,2 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bin.* filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zstandard filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ comments_toxicity.h5 filter=lfs diff=lfs merge=lfs -text
+ tokenizer.pickle filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,37 +1,25 @@
  ---
  title: Comments Toxicity Detection
- emoji: 😻
+ emoji: 📈
  colorFrom: indigo
- colorTo: pink
+ colorTo: indigo
  sdk: gradio
- app_file: app.py
+ app_file: web_app.py
  pinned: false
  ---
-
  # Configuration
-
  `title`: _string_
  Display title for the Space
-
  `emoji`: _string_
  Space emoji (emoji-only character allowed)
-
  `colorFrom`: _string_
  Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
-
  `colorTo`: _string_
  Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
-
  `sdk`: _string_
  Can be either `gradio` or `streamlit`
-
- `sdk_version`: _string_
- Only applicable for `streamlit` SDK.
- See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
-
  `app_file`: _string_
  Path to your main application file (which contains either `gradio` or `streamlit` Python code).
  Path is relative to the root of the repository.
-
  `pinned`: _boolean_
- Whether the Space stays on top of your list.
+ Whether the Space stays on top of your list.
comments_toxicity.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4a96667eeb86a77902701b6992185a73e59e27f42277e42d1906d42254ff1d3a
+ size 9172104
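
The two binary artifacts are committed as Git LFS pointer files like the one above; the pointer records only the object's sha256 and byte size. As a quick integrity check after cloning (a minimal sketch, assuming the real file has already been fetched, e.g. via `git lfs pull`; the helper name is ours), the local copy can be compared against the pointer metadata:

```python
import hashlib

def matches_lfs_pointer(path, expected_oid, expected_size):
    """Check a locally materialized file against its LFS pointer's oid/size."""
    digest = hashlib.sha256()
    size = 0
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
            size += len(chunk)
    return digest.hexdigest() == expected_oid and size == expected_size

# Values copied from the pointer file above
print(matches_lfs_pointer(
    "comments_toxicity.h5",
    "4a96667eeb86a77902701b6992185a73e59e27f42277e42d1906d42254ff1d3a",
    9172104,
))
```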
data_cleaning.py ADDED
@@ -0,0 +1,291 @@
+ __author__ = "Baishali Dutta"
+ __copyright__ = "Copyright (C) 2021 Baishali Dutta"
+ __license__ = "Apache License 2.0"
+ __version__ = "0.1"
+
+ # -------------------------------------------------------------------------
+ # Import Libraries
+ # -------------------------------------------------------------------------
+ import re
+
+ import nltk
+ from nltk.corpus import stopwords
+ from textblob import TextBlob, Word
+
+ # -------------------------------------------------------------------------
+ # One-shot Instance Creation
+ # -------------------------------------------------------------------------
+ nltk.download('stopwords')
+ nltk.download('wordnet')
+ stop_words = stopwords.words('english')
+
+
+ # -------------------------------------------------------------------------
+ # Data Cleaning
+ # -------------------------------------------------------------------------
+
+ def convert_to_lower_case_on_string(text):
+     """
+     Converts the specified text to lower case
+     :param text: the text to convert
+     :return: the lower-cased text
+     """
+     return " ".join(word.lower() for word in text.split())
+
+
+ def convert_to_lower_case(text_column):
+     """
+     Converts the text in the specified column to lower case
+     :param text_column: the text column whose content needs to be converted
+     :return: the text column containing the lower-cased text
+     """
+     return text_column.apply(convert_to_lower_case_on_string)
+
+
+ def apply_contraction_mapping_on_string(text):
+     """
+     Applies the contraction mapping to the specified text
+     :param text: the text on which the contraction will be mapped
+     :return: the text after the application of contraction mapping
+     """
+     contraction_mapping = {
+         "ain't": "is not", "aren't": "are not", "can't": "cannot", "'cause": "because",
+         "could've": "could have", "couldn't": "could not", "didn't": "did not",
+         "doesn't": "does not", "don't": "do not", "hadn't": "had not",
+         "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'll": "he will",
+         "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
+         "how's": "how is", "I'd": "I would", "I'd've": "I would have", "I'll": "I will",
+         "I'll've": "I will have", "I'm": "I am", "I've": "I have", "i'd": "i would",
+         "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have", "i'm": "i am",
+         "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have",
+         "it'll": "it will", "it'll've": "it will have", "it's": "it is", "let's": "let us",
+         "ma'am": "madam", "mayn't": "may not", "might've": "might have",
+         "mightn't": "might not", "mightn't've": "might not have", "must've": "must have",
+         "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not",
+         "needn't've": "need not have", "o'clock": "of the clock", "oughtn't": "ought not",
+         "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not",
+         "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have",
+         "she'll": "she will", "she'll've": "she will have", "she's": "she is",
+         "should've": "should have", "shouldn't": "should not",
+         "shouldn't've": "should not have", "so've": "so have", "so's": "so as",
+         "this's": "this is", "that'd": "that would", "that'd've": "that would have",
+         "that's": "that is", "there'd": "there would", "there'd've": "there would have",
+         "there's": "there is", "here's": "here is", "they'd": "they would",
+         "they'd've": "they would have", "they'll": "they will",
+         "they'll've": "they will have", "they're": "they are", "they've": "they have",
+         "to've": "to have", "wasn't": "was not", "we'd": "we would",
+         "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
+         "we're": "we are", "we've": "we have", "weren't": "were not",
+         "what'll": "what will", "what'll've": "what will have", "what're": "what are",
+         "what's": "what is", "what've": "what have", "when's": "when is",
+         "when've": "when have", "where'd": "where did", "where's": "where is",
+         "where've": "where have", "who'll": "who will", "who'll've": "who will have",
+         "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have",
+         "will've": "will have", "won't": "will not", "won't've": "will not have",
+         "would've": "would have", "wouldn't": "would not",
+         "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
+         "y'all'd've": "you all would have", "y'all're": "you all are",
+         "y'all've": "you all have", "you'd": "you would", "you'd've": "you would have",
+         "you'll": "you will", "you'll've": "you will have", "you're": "you are",
+         "you've": "you have"}
+     specials = ["’", "‘", "´", "`"]
+     for s in specials:
+         text = text.replace(s, "'")
+     return ' '.join(contraction_mapping.get(t, t) for t in text.split(" "))
+
+
+ def apply_contraction_mapping(text_column):
+     """
+     Applies the contraction mapping to the text in the specified column
+     :param text_column: the text column on which the contraction will be mapped
+     :return: the text column after the application of contraction mapping
+     """
+     return text_column.apply(apply_contraction_mapping_on_string)
+
+
+ def fix_misspelled_words_on_string2(text):
+     """
+     Fixes misspelled words in the specified text (uses a predefined misspelling dictionary)
+     :param text: the text to be fixed
+     :return: the fixed text
+     """
+     misspelled_dict = {
+         'colour': 'color', 'centre': 'center', 'favourite': 'favorite',
+         'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater',
+         'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization',
+         'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ',
+         'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist',
+         'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can',
+         'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I',
+         'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation',
+         'mastrubate': 'masturbate', 'mastrubating': 'masturbating', 'pennis': 'penis',
+         'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data',
+         '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend',
+         'airhostess': 'air hostess', 'whst': 'what', 'watsapp': 'whatsapp',
+         'demonitisation': 'demonetization', 'demonitization': 'demonetization',
+         'demonetisation': 'demonetization',
+         # keep the surrounding spaces so neighbouring words stay separated
+         ' ur ': ' your ', ' u r ': ' you are '}
+     for word, replacement in misspelled_dict.items():
+         text = text.replace(word, replacement)
+     return text
+
+
+ def fix_misspelled_words_on_string(text):
+     """
+     Fixes misspelled words in the specified text (uses the TextBlob spelling model)
+     :param text: the text to be fixed
+     :return: the fixed text
+     """
+     return str(TextBlob(text).correct())
+
+
+ def fix_misspelled_words(text_column):
+     """
+     Fixes misspelled words in the specified text column (dictionary-based variant)
+     :param text_column: the text column to be fixed
+     :return: the text column containing the fixed text
+     """
+     return text_column.apply(fix_misspelled_words_on_string2)
+
+
+ def remove_punctuations_on_string(text):
+     """
+     Removes all punctuation from the specified text
+     :param text: the text whose punctuation is to be removed
+     :return: the text after removing the punctuation
+     """
+     # str.replace does not interpret regular expressions; use re.sub instead
+     return re.sub(r'[^\w\s]', '', text)
+
+
+ def remove_punctuations(text_column):
+     """
+     Removes all punctuation from the text of the specified text column
+     :param text_column: the text column whose punctuation is to be removed
+     :return: the text column after removing the punctuation
+     """
+     return text_column.apply(remove_punctuations_on_string)
+
+
+ def remove_emojis_on_string(text):
+     """
+     Removes emojis from the specified text
+     :param text: the text whose emojis need to be removed
+     :return: the text after removing the emojis
+     """
+     emoji_pattern = re.compile("["
+                                u"\U0001F600-\U0001F64F"  # emoticons
+                                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+                                u"\U0001F680-\U0001F6FF"  # transport & map symbols
+                                u"\U0001F1E0-\U0001F1FF"  # flags
+                                u"\U00002702-\U000027B0"
+                                u"\U000024C2-\U0001F251"
+                                "]+", flags=re.UNICODE)
+     return emoji_pattern.sub(r'', text)
+
+
+ def remove_emojis(text_column):
+     """
+     Removes emojis from the text of the specified column
+     :param text_column: the text column whose emojis need to be removed
+     :return: the text column after removing the emojis
+     """
+     return text_column.apply(remove_emojis_on_string)
+
+
+ def remove_stopwords_on_string(text):
+     """
+     Removes all stop words from the specified text
+     :param text: the text whose stop words need to be removed
+     :return: the text after removing the stop words
+     """
+     return " ".join(word for word in text.split() if word not in stop_words)
+
+
+ def remove_stopwords(text_column):
+     """
+     Removes all stop words from the text of the specified column
+     :param text_column: the text column whose stop words need to be removed
+     :return: the text column after removing the stop words
+     """
+     return text_column.apply(remove_stopwords_on_string)
+
+
+ def lemmatize_on_string(text):
+     """
+     Lemmatizes the specified text
+     :param text: the text which needs to be lemmatized
+     :return: the lemmatized text
+     """
+     result = []
+     for word in TextBlob(text).split():
+         result.append(Word(word).lemmatize())
+     return " ".join(result)
+
+
+ def lemmatize(text_column):
+     """
+     Lemmatizes the text of the specified text column
+     :param text_column: the text column which needs to be lemmatized
+     :return: the lemmatized text column
+     """
+     return text_column.apply(lemmatize_on_string)
+
+
+ def clean_text_column(text_column):
+     """
+     Cleans the data in the specified text column
+     The cleaning procedure is as follows:
+     1. Convert the content to lower case
+     2. Apply contraction mapping, which expands shortened English forms
+     3. Fix misspelled words
+     4. Remove all punctuation
+     5. Remove all emojis
+     6. Remove all stop words
+     7. Apply lemmatization
+     :return: the text column with the cleaned data
+     """
+     text_column = convert_to_lower_case(text_column)
+     text_column = apply_contraction_mapping(text_column)
+     text_column = fix_misspelled_words(text_column)
+     text_column = remove_punctuations(text_column)
+     text_column = remove_emojis(text_column)
+     text_column = remove_stopwords(text_column)
+     text_column = lemmatize(text_column)
+
+     return text_column
+
+
+ def clean_text(text):
+     """
+     Cleans the specified text
+     The cleaning procedure is as follows:
+     1. Convert the content to lower case
+     2. Apply contraction mapping, which expands shortened English forms
+     3. Fix misspelled words
+     4. Remove all punctuation
+     5. Remove all emojis
+     6. Remove all stop words
+     7. Apply lemmatization
+     :return: the cleaned text
+     """
+     text = convert_to_lower_case_on_string(text)
+     text = apply_contraction_mapping_on_string(text)
+     text = fix_misspelled_words_on_string(text)
+     text = remove_punctuations_on_string(text)
+     text = remove_emojis_on_string(text)
+     text = remove_stopwords_on_string(text)
+     text = lemmatize_on_string(text)
+
+     return text
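
To illustrate the pipeline end to end, here is a minimal usage sketch; the sample comment is hypothetical, and the exact output depends on the downloaded NLTK corpora and the installed TextBlob version:

```python
from data_cleaning import clean_text

# Hypothetical raw comment; the pipeline lower-cases it, expands
# contractions, corrects spelling, strips punctuation, emojis and
# stop words, and lemmatizes what remains.
print(clean_text("You're the WORST!!! I can't believe this 😡"))
```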
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ Keras>=2.4.3
+ gradio>=1.5.3
+ numpy>=1.19.5
+ nltk~=3.5
+ textblob~=0.15.3
+ tensorflow>=2.4.1
tokenizer.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:945510a17bae3b1875889ac54f16c214eca7dcaf2fd46f9190cb5692e238093f
+ size 11039542
web_app.py ADDED
@@ -0,0 +1,71 @@
+ __author__ = "Baishali Dutta"
+ __copyright__ = "Copyright (C) 2021 Baishali Dutta"
+ __license__ = "Apache License 2.0"
+ __version__ = "0.1"
+
+ # -------------------------------------------------------------------------
+ # Import Libraries
+ # -------------------------------------------------------------------------
+ import pickle
+
+ import gradio as gr
+ from tensorflow.keras.models import load_model
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
+
+ from data_cleaning import clean_text
+
+ # -------------------------------------------------------------------------
+ # Load Existing Model and Tokenizer
+ # -------------------------------------------------------------------------
+
+ # load the trained model
+ rnn_model = load_model("comments_toxicity.h5")
+
+ # load the tokenizer
+ with open("tokenizer.pickle", 'rb') as handle:
+     tokenizer = pickle.load(handle)
+
+
+ # -------------------------------------------------------------------------
+ # Main Application
+ # -------------------------------------------------------------------------
+
+ def make_prediction(input_comment):
+     """
+     Predicts the toxicity of the specified comment
+     :param input_comment: the comment to be verified
+     :return: a dictionary mapping each toxicity label to its predicted probability
+     """
+     input_comment = clean_text(input_comment)
+     input_comment = input_comment.split(" ")
+
+     # tokenize word by word, then flatten into a single sequence
+     sequences = tokenizer.texts_to_sequences(input_comment)
+     sequences = [[item for sublist in sequences for item in sublist]]
+
+     padded_data = pad_sequences(sequences, maxlen=100)
+     result = rnn_model.predict(padded_data, batch_size=len(padded_data), verbose=1)
+
+     return {
+         "Toxic": str(result[0][0]),
+         "Very Toxic": str(result[0][1]),
+         "Obscene": str(result[0][2]),
+         "Threat": str(result[0][3]),
+         "Insult": str(result[0][4]),
+         "Hate": str(result[0][5]),
+         "Neutral": str(result[0][6])
+     }
+
+
+ comment = gr.inputs.Textbox(lines=17, placeholder="Enter your comment here")
+
+ title = "Comments Toxicity Detection"
+ description = "This application uses a Bidirectional Long Short-Term Memory (LSTM) Recurrent Neural Network (RNN) " \
+               "model to predict the inappropriateness of a comment"
+
+ gr.Interface(fn=make_prediction,
+              inputs=comment,
+              outputs="label",
+              title=title,
+              description=description,
+              article="http://raw.githubusercontent.com/baishalidutta/Comments-Toxicity-Detection/gradio/README.md") \
+     .launch(share=True)
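
Before launching the Space, the prediction function can be smoke-tested directly. A hedged sketch: it assumes `comments_toxicity.h5` and `tokenizer.pickle` sit in the working directory and that the seven labels follow the order hard-coded above; the sample comment is hypothetical:

```python
# Hypothetical local check, run from the repository root
scores = make_prediction("I hope you have a wonderful day")
for label, probability in scores.items():
    print(f"{label}: {probability}")
```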