Jayesh13 committed on
Commit 474a30c · 1 Parent(s): aecf28c

Update app.py

Files changed (1)
app.py +6 -87
app.py CHANGED
@@ -1,98 +1,17 @@
 import streamlit as st
 import pickle
 from tensorflow.keras.models import load_model
-import re
-import string
-
-import nltk
-from nltk.stem.porter import PorterStemmer
-from nltk.stem import WordNetLemmatizer
-from nltk.tokenize import word_tokenize
-
-from nltk.corpus import stopwords
-nltk.download('stopwords')
-stop_words = stopwords.words('english')
-import html
-import unicodedata
-
-from tensorflow.keras.preprocessing.text import text_to_word_sequence
-from tensorflow.keras.preprocessing.text import Tokenizer
+from transformers import AutoTokenizer
 from tensorflow.keras.preprocessing.sequence import pad_sequences
-from tensorflow.keras import models
-from tensorflow.keras import layers
-from tensorflow.keras import losses
-from tensorflow.keras import metrics
-from tensorflow.keras import optimizers
-from tensorflow.keras.utils import plot_model
-
-def remove_special_chars(text):
-    re1 = re.compile(r' +')
-    x1 = text.lower().replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
-        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
-        '<br />', "\n").replace('\\"', '"').replace('<unk>', 'u_n').replace(' @.@ ', '.').replace(
-        ' @-@ ', '-').replace('\\', ' \\ ')
-    return re1.sub(' ', html.unescape(x1))
-
-def to_lowercase(text):
-    return text.lower()
-
-def remove_punctuation(text):
-    """Remove punctuation from the text"""
-    translator = str.maketrans('', '', string.punctuation)
-    return text.translate(translator)
-
-def replace_numbers(text):
-    """Remove all integer occurrences from the text"""
-    return re.sub(r'\d+', '', text)
-
-def remove_whitespaces(text):
-    return text.strip()
-
-def remove_stopwords(words, stop_words):
-    return [word for word in words if word not in stop_words]
-
-def stem_words(words):
-    """Stem words in text"""
-    stemmer = PorterStemmer()
-    return [stemmer.stem(word) for word in words]
-
-def lemmatize_words(words):
-    """Lemmatize words in text"""
-
-    lemmatizer = WordNetLemmatizer()
-    return [lemmatizer.lemmatize(word) for word in words]
-
-def lemmatize_verbs(words):
-    """Lemmatize verbs in text"""
-
-    lemmatizer = WordNetLemmatizer()
-    return ' '.join([lemmatizer.lemmatize(word, pos='v') for word in words])
-
-def text2words(text):
-    return word_tokenize(text)
-
-def clean_text(text):
-    text = remove_special_chars(text)
-    text = remove_punctuation(text)
-    text = to_lowercase(text)
-    text = replace_numbers(text)
-    words = text2words(text)
-    words = remove_stopwords(words, stop_words)
-    # words = stem_words(words)  # Either stem or lemmatize
-    words = lemmatize_words(words)
-    words = lemmatize_verbs(words)
-
-    return ''.join(words)
-
 
 model = load_model('tox_model.h5')
+tokenizer = AutoTokenizer.from_pretrained('model')
 text = st.text_area('Enter some text')
 
-comment = []
-comment_input = comment.append(text)
-comment_input = comment_input.apply(lambda x: clean_text(x))
-comment_input = tok.texts_to_sequences(comment_input)
-test = pad_sequences(comment_input,
+input_ids = tokenizer.encode(text, return_tensors='pt')
+
+
+test = pad_sequences(input_ids,
                      maxlen=50,
                      truncating='post',
                      padding='post'
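
The hunk is truncated after padding='post', so the closing parenthesis and whatever follows are not part of this diff. Below is a minimal sketch of how the new inference path could be completed, assuming 'model' is a local directory holding a saved tokenizer and that tox_model.h5 produces per-label toxicity scores; none of this is confirmed by the commit, and the predict/write lines are illustrative additions.

import streamlit as st
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import AutoTokenizer

model = load_model('tox_model.h5')                  # Keras model shipped in the repo
tokenizer = AutoTokenizer.from_pretrained('model')  # assumption: local tokenizer directory

text = st.text_area('Enter some text')

if text:
    # encode() without return_tensors yields a plain list of token ids;
    # pad_sequences expects a list of such sequences rather than a torch
    # tensor, which is why this sketch drops the commit's return_tensors='pt'.
    input_ids = tokenizer.encode(text)
    test = pad_sequences([input_ids],
                         maxlen=50,
                         truncating='post',
                         padding='post')
    prediction = model.predict(test)  # hypothetical: output shape depends on tox_model.h5
    st.write(prediction)

The if text: guard simply avoids tokenizing an empty string on first render; the commit itself ends mid-call, so how the padded batch is actually consumed is not visible here.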