File size: 2,218 Bytes
143d008
79ffe5b
 
 
 
143d008
 
 
 
79ffe5b
 
143d008
 
 
 
79ffe5b
143d008
 
79ffe5b
 
143d008
 
 
 
 
 
a4b69f2
79ffe5b
 
 
7dc792f
79ffe5b
7dc792f
143d008
 
 
9774795
79ffe5b
143d008
 
 
 
79ffe5b
 
143d008
 
 
 
 
 
 
 
 
 
 
 
 
 
79ffe5b
 
143d008
 
79ffe5b
143d008
79ffe5b
143d008
79ffe5b
 
 
143d008
79ffe5b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71

import streamlit as st
import pandas as pd
import re
import nltk
from PIL import Image
import os
import numpy as np
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords
import datasets 
from datasets import load_dataset
import sklearn
from sklearn.preprocessing import LabelEncoder

# Load the "merve/poetry" dataset in streaming mode and materialise its
# train split as a pandas DataFrame.
dataset = load_dataset("merve/poetry", streaming=True)
df = pd.DataFrame.from_dict(dataset["train"])


# Directory used to locate bundled assets: the script's own folder when
# __file__ is defined, otherwise the current working directory.
if "__file__" in locals():
    d = os.path.dirname(__file__)
else:
    d = os.getcwd()

# Fetch the NLTK stop-word corpus and keep the English word list.
nltk.download("stopwords")
stop = stopwords.words('english')

# standardizing dataset by removing special characters and lowercasing

def standardize(text, remove_digits=True):
    """Strip special characters from *text* and lowercase it.

    Every character that is not an ASCII letter, a digit or whitespace is
    removed; the remaining text is returned in lowercase.

    Args:
        text: The string to clean.
        remove_digits: Accepted for backward compatibility. It currently has
            no effect: digits are always kept, whatever value is passed.

    Returns:
        The cleaned, lowercased string.
    """
    # Raw string: "\d" and "\s" are regex classes, not string escapes. In a
    # plain literal they trigger an invalid-escape warning on modern Python.
    text = re.sub(r'[^a-zA-Z\d\s]', '', text)
    return text.lower()

st.write("Poetry dataset, content column cleaned from special characters and lowercased")
# Drop English stop words from the raw text, then standardize it.
# The stop-word list is lowercase, so each token is lowercased for the lookup;
# a case-sensitive comparison would let capitalised stop words ("The", "And")
# through. Filtering happens before standardize() so contractions such as
# "don't" still match the list while their apostrophe is intact.
# A set gives O(1) membership tests instead of scanning the list per token.
stop_set = set(stop)
df.content = df.content.apply(
    lambda text: ' '.join(word for word in text.split() if word.lower() not in stop_set)
)
df.content = df.content.apply(standardize)
st.dataframe(df)

# Count how often each token occurs in the cleaned content column and chart
# the 50 most frequent ones.
# NOTE(review): stop words were already filtered out of df.content earlier in
# the script, so the "including stopwords" label may not match the data.
st.write("Most appearing words including stopwords")
words = (
    df.content.str.split(expand=True)  # one column per token position
    .unstack()                         # flatten into a single Series of tokens
    .value_counts()                    # occurrences per token, most frequent first
)
st.bar_chart(words.iloc[:50])
# Silence Streamlit's warning about calling st.pyplot() without a figure.
st.set_option('deprecation.showPyplotGlobalUse', False)



# Load poet.png from the asset directory as a NumPy array; it is passed to
# WordCloud as the mask.
mask_path = os.path.join(d, "poet.png")
mask = np.array(Image.open(mask_path))

# distributions of poem types according to ages and authors
st.write("Distributions of poem types according to ages and authors")
le = LabelEncoder()

# Encode author names as integer labels; the encoded column is the y variable
# of the categorical plot below.
df.author = le.fit_transform(df.author)
grid = sns.catplot(x="age", y="author", hue="type", data=df)
# Pass the plot's own figure to Streamlit instead of relying on the global
# pyplot figure, which st.pyplot() only supports as a deprecated fallback.
# Older seaborn releases expose the figure as `.fig`, newer ones as `.figure`.
catplot_fig = grid.figure if hasattr(grid, "figure") else grid.fig
# clear_figure=True keeps the previous behaviour of clearing after rendering.
st.pyplot(catplot_fig, clear_figure=True)

# most appearing words other than stop words
 
import matplotlib.pyplot as plt
def word_cloud(content, title):
    """Render a masked word cloud in the Streamlit app.

    Args:
        content: A pandas object whose index holds the words to draw; the
            index values are joined with spaces and fed to WordCloud.
        title: Title displayed above the word cloud.

    The cloud uses the module-level ``mask`` image and the wordcloud
    package's ``STOPWORDS`` set, and is recoloured with the "magma" colormap.
    """
    wc = WordCloud(background_color="white", max_words=200, contour_width=3,
                   stopwords=STOPWORDS, mask=mask, max_font_size=50)
    # NOTE(review): joining the index feeds each distinct word once, so any
    # counts stored in `content` do not influence word sizes - confirm intent.
    wc.generate(" ".join(content.index.values))

    # Draw on an explicit figure/axes and hand that figure to Streamlit;
    # calling st.pyplot() with no figure relies on the deprecated global
    # pyplot state.
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_title(title, fontsize=20)
    ax.imshow(wc.recolor(colormap='magma', random_state=42), cmap=plt.cm.gray,
              interpolation="bilinear", alpha=0.98)
    ax.axis('off')
    st.pyplot(fig)
    # Release the figure so repeated script reruns do not accumulate open
    # figures in pyplot's registry.
    plt.close(fig)

# Final section: draw the word cloud from the word counts computed earlier.
st.write("Most appearing words excluding stopwords")
word_cloud(content=words, title="Word Cloud")