Spaces:
Build error
Build error
Upload app.py
Browse files
app.py
CHANGED
|
@@ -1,18 +1,29 @@
|
|
| 1 |
-
|
| 2 |
import streamlit as st
|
| 3 |
import pandas as pd
|
| 4 |
import re
|
| 5 |
import nltk
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
from wordcloud import WordCloud, STOPWORDS
|
| 7 |
from nltk.corpus import stopwords
|
| 8 |
-
|
| 9 |
-
|
|
|
|
|
|
|
| 10 |
|
| 11 |
-
|
|
|
|
| 12 |
df = pd.DataFrame.from_dict(dataset["train"])
|
| 13 |
|
| 14 |
-
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
def standardize(text, remove_digits=True):
|
| 18 |
text=re.sub('[^a-zA-Z\d\s]', '',text)
|
|
@@ -20,23 +31,41 @@ def standardize(text, remove_digits=True):
|
|
| 20 |
|
| 21 |
return text
|
| 22 |
|
| 23 |
-
|
| 24 |
-
df.
|
|
|
|
| 25 |
st.dataframe(df)
|
| 26 |
|
| 27 |
-
words
|
| 28 |
-
st.
|
|
|
|
|
|
|
| 29 |
st.set_option('deprecation.showPyplotGlobalUse', False)
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
import matplotlib.pyplot as plt
|
| 32 |
def word_cloud(content, title):
|
| 33 |
-
wc = WordCloud(background_color=
|
| 34 |
-
stopwords=STOPWORDS, max_font_size=50)
|
| 35 |
wc.generate(" ".join(content.index.values))
|
| 36 |
-
fig = plt.figure(figsize=(
|
| 37 |
plt.title(title, fontsize=20)
|
| 38 |
-
plt.imshow(wc.recolor(colormap='
|
| 39 |
plt.axis('off')
|
| 40 |
st.pyplot()
|
| 41 |
|
|
|
|
| 42 |
word_cloud(words, "Word Cloud")
|
|
|
|
| 1 |
+
|
| 2 |
import streamlit as st
|
| 3 |
import pandas as pd
|
| 4 |
import re
|
| 5 |
import nltk
|
| 6 |
+
from PIL import Image
|
| 7 |
+
import os
|
| 8 |
+
import numpy as np
|
| 9 |
+
import seaborn as sns
|
| 10 |
from wordcloud import WordCloud, STOPWORDS
|
| 11 |
from nltk.corpus import stopwords
|
| 12 |
+
import datasets
|
| 13 |
+
from datasets import load_dataset
|
| 14 |
+
import sklearn
|
| 15 |
+
from sklearn.preprocessing import LabelEncoder
|
| 16 |
|
| 17 |
+
# --- Data + NLP resources -------------------------------------------------
# Pull the poetry dataset from the Hugging Face Hub and materialise its
# training split as a pandas DataFrame for the rest of the app.
dataset = load_dataset("merve/poetry", streaming=True)
df = pd.DataFrame.from_dict(dataset["train"])

# Fetch NLTK's English stopword list (downloads once, then cached).
nltk.download("stopwords")
stop = stopwords.words('english')

# Resolve the app's directory; ``__file__`` is absent in some runtimes
# (e.g. interactive sessions), so fall back to the working directory.
if "__file__" in locals():
    d = os.path.dirname(__file__)
else:
    d = os.getcwd()
|
| 25 |
+
|
| 26 |
+
# standardizing dataset by removing special characters and lowercasing
|
| 27 |
|
| 28 |
def standardize(text, remove_digits=True):
|
| 29 |
text=re.sub('[^a-zA-Z\d\s]', '',text)
|
|
|
|
| 31 |
|
| 32 |
return text
|
| 33 |
|
| 34 |
+
# Clean the poem text and display it: drop English stopwords from each poem,
# then strip special characters via standardize().
st.write("Poetry dataset, content column cleaned from special characters and lowercased")
# A set makes each per-word membership test O(1); the raw stopword list
# would be O(len(stop)) for every word of every poem.
stop_set = set(stop)
df.content = df.content.apply(lambda x: ' '.join([word for word in x.split() if word not in stop_set]))
df.content = df.content.apply(standardize)
st.dataframe(df)

# Token frequencies over the (already cleaned) content column.
# NOTE(review): stopwords were stripped *before* this point, so despite the
# label below the chart largely excludes them; also, capitalised stopwords
# (e.g. "The") survive if lowercasing only happens inside standardize() —
# confirm the intended order of the cleaning steps.
st.write("Most appearing words including stopwords")
words = df.content.str.split(expand=True).unstack().value_counts()
st.bar_chart(words[0:50])
# Silence the warning for st.pyplot() calls that use global pyplot state.
st.set_option('deprecation.showPyplotGlobalUse', False)
|
| 44 |
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# Mask image for the word cloud; poet.png must ship alongside the app.
mask = np.array(Image.open(os.path.join(d, "poet.png")))

# Distribution of poem types by age and (integer-encoded) author.
st.write("Distributions of poem types according to ages and authors")
le = LabelEncoder()

# Encode author names as integers so they can serve as a numeric y axis.
df.author = le.fit_transform(df.author)
# Capture the FacetGrid and hand its figure to Streamlit explicitly:
# the no-argument st.pyplot() form is deprecated (global pyplot state).
grid = sns.catplot(x="age", y="author", hue="type", data=df)
st.pyplot(grid.fig)
|
| 56 |
+
|
| 57 |
+
# most appearing words other than stop words
|
| 58 |
+
|
| 59 |
import matplotlib.pyplot as plt
|
| 60 |
def word_cloud(content, title):
    """Render a word cloud of *content*'s index values into Streamlit.

    content: pandas Series whose index holds the words to draw (e.g. the
        output of value_counts()); the Series' values themselves are unused.
    title: figure title shown above the cloud.
    """
    # NOTE(review): relies on the module-level ``mask`` array (poet.png).
    wc = WordCloud(background_color="white", max_words=200, contour_width=3,
                   stopwords=STOPWORDS, mask=mask, max_font_size=50)
    # The words live in the index (value_counts() output), not the values.
    wc.generate(" ".join(content.index.values))
    fig = plt.figure(figsize=(10, 10))
    plt.title(title, fontsize=20)
    plt.imshow(wc.recolor(colormap='magma', random_state=42),
               cmap=plt.cm.gray, interpolation="bilinear", alpha=0.98)
    plt.axis('off')
    # Bug fix: ``fig`` was created but never used while st.pyplot() fell
    # back to deprecated global pyplot state — pass the figure explicitly.
    st.pyplot(fig)
|
| 69 |
|
| 70 |
+
# Final visual: word cloud of the frequency Series built above; WordCloud's
# own STOPWORDS filter removes any remaining stopwords at render time.
st.write("Most appearing words excluding stopwords")
word_cloud(words, "Word Cloud")
|