Commit
·
5178166
1
Parent(s):
ed8b14c
Update tokenizer.py
Browse files- tokenizer.py +34 -0
tokenizer.py
CHANGED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from data_processing import load_data, spotify_data, path
|
| 2 |
+
import pandas
|
| 3 |
+
import nltk
|
| 4 |
+
from nltk.corpus import stopwords
|
| 5 |
+
from nltk.tokenize import word_tokenize
|
| 6 |
+
import string
|
| 7 |
+
|
| 8 |
+
#---------------------------Download the required NLTK resources--------------------------------
|
| 9 |
+
|
| 10 |
+
#nltk.download('punkt')
|
| 11 |
+
#nltk.download('stopwords')
|
| 12 |
+
|
| 13 |
+
# Translation table that deletes every ASCII punctuation character; built once
# because it never changes between calls.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily filled cache so the stop-word corpus is loaded a single time instead
# of once per lyric.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def clean_lyrics(lyrics):
    """Tokenize a lyric and return its cleaned content words.

    The text is split with NLTK's ``word_tokenize``; each token is then
    lower-cased, stripped of ASCII punctuation and dropped if it is an English
    stop word. The NLTK ``punkt`` and ``stopwords`` resources must be
    installed for non-blank input.

    Args:
        lyrics: Raw lyric text. A value that is not a string (for example the
            NaN pandas uses for a missing cell) or that is blank is treated
            as containing no words.

    Returns:
        A list of tokens in their original order. Tokens that become empty
        once punctuation is removed are omitted.
    """
    # Guard first so missing cells do not crash the tokenizer.
    if not isinstance(lyrics, str) or not lyrics.strip():
        return []

    stop_words = _english_stop_words()
    normalized = (
        token.lower().translate(_PUNCTUATION_TABLE)
        for token in word_tokenize(lyrics)
    )
    # Pure-punctuation tokens (",", "!", "...") collapse to '' above and must
    # not be emitted as words, hence the truthiness check.
    return [word for word in normalized if word and word not in stop_words]
|
| 29 |
+
|
| 30 |
+
# Clean every lyric in the 'text' column, store the token lists alongside the
# original data, then write the enriched table to disk without the index.
_cleaned_lyrics = spotify_data['text'].apply(clean_lyrics)
spotify_data['cleaned_text'] = _cleaned_lyrics
spotify_data.to_csv('spotify_data_processed.csv', index=False)
|
| 33 |
+
|
| 34 |
+
#print(spotify_data['cleaned_text'].head())
|