# Page header captured along with this file: Spaces status "Build error";
# file size 2,218 bytes. The per-line commit-hash gutter and the 1-71
# line-number gutter were viewer residue and are not part of the script.
import streamlit as st
import pandas as pd
import re
import nltk
from PIL import Image
import os
import numpy as np
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords
import datasets
from datasets import load_dataset
import sklearn
from sklearn.preprocessing import LabelEncoder
# Stream the "merve/poetry" dataset and build a DataFrame from its train split.
dataset = load_dataset("merve/poetry", streaming=True)
df = pd.DataFrame.from_dict(dataset["train"])

# Directory used to locate bundled assets: the script's own directory when
# __file__ is defined, otherwise the current working directory.
if "__file__" in locals():
    d = os.path.dirname(__file__)
else:
    d = os.getcwd()

# Fetch the NLTK stopword corpus and keep the English list for later filtering.
nltk.download("stopwords")
stop = stopwords.words("english")
def standardize(text, remove_digits=True):
    """Return *text* lowercased with special characters stripped.

    Every character that is not a letter ``a-z``/``A-Z``, a digit (``\\d``)
    or whitespace (``\\s``) is removed, then the result is lowercased.

    Args:
        text: The string to clean.
        remove_digits: Accepted for backward compatibility but not used;
            digits are always kept.

    Returns:
        The cleaned, lowercased string.
    """
    # Raw string: "\d" and "\s" are invalid escape sequences in a plain
    # string literal and trigger warnings on recent Python versions.
    text = re.sub(r"[^a-zA-Z\d\s]", "", text)
    return text.lower()
st.write("Poetry dataset, content column cleaned from special characters and lowercased")
# Stopwords are dropped first (case-sensitive match on the raw tokens), then
# special characters are stripped and the text is lowercased.
# A set gives O(1) membership tests instead of scanning the list per word.
stop_words = set(stop)
df["content"] = df["content"].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_words)
)
df["content"] = df["content"].apply(standardize)
st.dataframe(df)

# most appearing words including stopwords
st.write("Most appearing words including stopwords")
# One column per token position, flattened, then counted per distinct word.
words = df["content"].str.split(expand=True).unstack().value_counts()
st.bar_chart(words[0:50])

# Older Streamlit releases warn when st.pyplot() is called without a figure;
# newer releases no longer know this option and reject it, so tolerate that.
try:
    st.set_option("deprecation.showPyplotGlobalUse", False)
except st.errors.StreamlitAPIException:
    pass

# Load the word-cloud mask image and close the file handle once it is copied
# into an array.
with Image.open(os.path.join(d, "poet.png")) as mask_image:
    mask = np.array(mask_image)

# distributions of poem types according to ages and authors
st.write("Distributions of poem types according to ages and authors")
le = LabelEncoder()
# Replace author names with integer codes so they can be plotted on an axis.
df["author"] = le.fit_transform(df["author"])
grid = sns.catplot(x="age", y="author", hue="type", data=df)
# Pass the figure explicitly instead of relying on pyplot's global figure.
st.pyplot(grid.fig)
# most appearing words other than stop words
import matplotlib.pyplot as plt
def word_cloud(content, title):
    """Draw a word cloud of the labels in ``content.index`` on the Streamlit page.

    Args:
        content: Object whose ``index.values`` holds the words to draw (the
            caller passes the word-count series).
        title: Title shown above the cloud.

    NOTE: each index label is joined into the text exactly once; the values
    stored in ``content`` are not read here.
    """
    wc = WordCloud(
        background_color="white",
        max_words=200,
        contour_width=3,
        stopwords=STOPWORDS,
        mask=mask,
        max_font_size=50,
    )
    wc.generate(" ".join(content.index.values))

    fig = plt.figure(figsize=(10, 10))
    plt.title(title, fontsize=20)
    plt.imshow(
        wc.recolor(colormap='magma', random_state=42),
        cmap=plt.cm.gray,
        interpolation="bilinear",
        alpha=0.98,
    )
    plt.axis('off')
    # Hand the figure to Streamlit explicitly; calling st.pyplot() with no
    # figure relies on pyplot's global state, which Streamlit has deprecated.
    st.pyplot(fig)
# Render the word cloud; WordCloud's built-in STOPWORDS are filtered out inside word_cloud().
st.write("Most appearing words excluding stopwords")
word_cloud(words, "Word Cloud")