# Streamlit app: Wikipedia keyword extraction and clustering demo.
import streamlit as st
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from keras.preprocessing.text import Tokenizer
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Set up the Streamlit app.
# NOTE(review): Streamlit requires set_page_config to be the first st.* call
# in the script — keep it immediately after the imports.
st.set_page_config(page_title='Keyword Extraction and Clustering')

# Load data from Wikipedia
# Load data from Wikipedia
def load_wiki_data(pages, timeout=10):
    """Fetch the plain text of one or more English Wikipedia pages.

    Parameters
    ----------
    pages : iterable of str
        Page titles as they appear in Wikipedia URLs
        (e.g. 'Data_science').
    timeout : float, optional
        Per-request timeout in seconds (default 10). New keyword with a
        default value, so existing callers are unaffected.

    Returns
    -------
    pandas.DataFrame
        Single column 'text' with one row per page, in input order.

    Raises
    ------
    requests.HTTPError
        If any page responds with a non-2xx status.
    """
    data = []
    for page in pages:
        url = f'https://en.wikipedia.org/wiki/{page}'
        # Without a timeout, a stalled connection hangs the Streamlit app
        # forever; raise_for_status() surfaces 404/5xx instead of silently
        # scraping the text of an error page.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        data.append(soup.get_text())
    return pd.DataFrame({'text': data})

# Create a bar chart of word frequency, colored by word casing
def plot_word_frequency(text):
    """Tokenize a list of documents and render a Plotly bar chart of
    per-word counts in Streamlit.

    Bars are colored by casing: all-uppercase (red), titlecase (green),
    lowercase (blue), non-alphabetic tokens (gray).

    Parameters
    ----------
    text : list of str
        One string per document.
    """
    # BUG FIX: Tokenizer() defaults to lower=True, which lowercases every
    # token before counting — so the 'uppercase'/'titlecase' branches below
    # could never fire and every bar was blue/gray. lower=False preserves
    # the original casing so the intended color coding actually works.
    # (Counts are now case-sensitive: 'The' and 'the' are separate bars.)
    tokenizer = Tokenizer(lower=False)
    tokenizer.fit_on_texts(text)
    word_counts = tokenizer.word_counts
    words = list(word_counts.keys())
    counts = list(word_counts.values())

    # Categorize words by casing and assign a color per category.
    word_types = {}
    for word in words:
        if word.isalpha():
            if word.isupper():
                word_types[word] = 'uppercase'
            elif word.istitle():
                word_types[word] = 'titlecase'
            else:
                word_types[word] = 'lowercase'
        else:
            word_types[word] = 'other'

    colors = {'uppercase': 'red', 'titlecase': 'green', 'lowercase': 'blue', 'other': 'gray'}
    color_list = [colors[word_types[word]] for word in words]

    fig = go.Figure([go.Bar(x=words, y=counts, marker={'color': color_list})])
    fig.update_layout(title='Word Frequency')
    st.plotly_chart(fig)

# Create a scatter plot of clustered keywords
def plot_keyword_clusters(keywords, clusters):
    """Render a 2-D scatter plot of keyword coordinates in Streamlit.

    Parameters
    ----------
    keywords : numpy.ndarray
        Array of shape (n, 2); column 0 supplies x, column 1 supplies y.
    clusters : array-like
        Length-n values used to color the points.
    """
    figure, axis = plt.subplots()
    xs = keywords[:, 0]
    ys = keywords[:, 1]
    axis.scatter(xs, ys, c=clusters)
    st.pyplot(figure)

# Main Streamlit app: load three Wikipedia pages on demand, then extract
# and cluster TF-IDF keywords and plot the results.
pages = ['Python_(programming_language)', 'Data_science', 'Machine_learning']
if st.button('Load Wikipedia Data'):
    df = load_wiki_data(pages)
    st.write('Data loaded')
else:
    df = pd.DataFrame({'text': []})
    st.write('Click "Load Wikipedia Data" to load data')

st.write(df)
text = df['text'].tolist()
if text:
    # Keyword extraction: TF-IDF over the loaded page texts.
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(text)
    # BUG FIX: get_feature_names() was removed in scikit-learn 1.2 and
    # raises AttributeError on modern installs; prefer the current API
    # and fall back only on very old versions.
    try:
        feature_names = vectorizer.get_feature_names_out()
    except AttributeError:
        feature_names = vectorizer.get_feature_names()

    # Cap n_clusters at the number of documents so KMeans cannot raise
    # when fewer pages than clusters are available.
    n_clusters = min(3, len(text))
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    # First two TF-IDF dimensions of each centroid serve as crude 2-D
    # plotting coordinates (kept from the original implementation).
    keywords = kmeans.cluster_centers_[:, :2]

    # Plot word frequency and keyword clusters.
    plot_word_frequency(text)
    # BUG FIX: the original passed kmeans.labels_ (one label per DOCUMENT)
    # as colors for the CENTROID points; the lengths only matched by
    # coincidence because 3 documents == 3 clusters. Color each centroid
    # by its own cluster index so the arrays always align.
    plot_keyword_clusters(keywords, np.arange(len(keywords)))