Spaces:

Saturdays
/

Harmonize

Sleeping

Diego-0121 committed on Jan 5, 2024

Commit

5178166

1 Parent(s): ed8b14c

Update tokenizer.py

Files changed (1) hide show

tokenizer.py CHANGED Viewed

+from data_processing import load_data, spotify_data, path
+import pandas
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+import string
+#---------------------------Download the required NLTK resources--------------------------------
+#nltk.download('punkt')
+#nltk.download('stopwords')
+def clean_lyrics(lyrics):
+    # Tokenización
+    tokens = word_tokenize(lyrics)
+    # To lower case
+    tokens = [word.lower() for word in tokens]
+    # Delete signs
+    table = str.maketrans('', '', string.punctuation)
+    stripped_tokens = [word.translate(table) for word in tokens]
+    # Stop Words
+    stop_words = set(stopwords.words('english'))
+    tokens_without_sw = [word for word in stripped_tokens if word not in stop_words]
+    return tokens_without_sw
+# Apply clean
+spotify_data['cleaned_text'] = spotify_data['text'].apply(clean_lyrics)
+spotify_data.to_csv('spotify_data_processed.csv', index=False)
+#print(spotify_data['cleaned_text'].head())