Spaces:
Runtime error
Runtime error
File size: 2,722 Bytes
7b848a4 35e0fca 7b848a4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
import streamlit as st
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from keras.preprocessing.text import Tokenizer
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
# Set up the Streamlit app: give the page its title.
st.set_page_config(page_title='Keyword Extraction and Clustering')
# Load data from Wikipedia
def load_wiki_data(pages, timeout=10):
    """Download Wikipedia articles and return their text in a DataFrame.

    Args:
        pages: Iterable of article titles exactly as they appear in the URL
            path after ``/wiki/`` (e.g. ``'Data_science'``).
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A DataFrame with a single ``text`` column holding, for each page in
        order, the text BeautifulSoup extracts from the response body.

    Raises:
        requests.RequestException: If a request times out, cannot be
            completed, or the server answers with an HTTP error status.
    """
    data = []
    for page in pages:
        url = f'https://en.wikipedia.org/wiki/{page}'
        # Without a timeout a stalled connection would hang the app forever.
        response = requests.get(url, timeout=timeout)
        # Fail loudly rather than analysing the text of an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        data.append(soup.get_text())
    return pd.DataFrame({'text': data})
def _word_type(word):
    """Return the case category used to colour *word* in the bar chart."""
    if not word.isalpha():
        return 'other'
    if word.isupper():
        return 'uppercase'
    if word.istitle():
        return 'titlecase'
    return 'lowercase'


# Create a bar chart of word frequency
def plot_word_frequency(text):
    """Render a Plotly bar chart of word counts, coloured by letter case.

    Args:
        text: List of strings to tokenize with the Keras ``Tokenizer``.

    Each distinct token becomes one bar. Purely alphabetic tokens are
    coloured red (all uppercase), green (title case) or blue (anything
    else); tokens containing non-letters are gray. The figure is drawn with
    ``st.plotly_chart`` and nothing is returned.
    """
    # Tokenizer lowercases its input by default, which would push every
    # alphabetic token into the 'lowercase' bucket and make the uppercase and
    # titlecase colours unreachable. Keep the original casing instead.
    tokenizer = Tokenizer(lower=False)
    tokenizer.fit_on_texts(text)
    word_counts = tokenizer.word_counts
    words = list(word_counts.keys())
    counts = list(word_counts.values())

    # Categorize words by type and assign color based on type
    colors = {'uppercase': 'red', 'titlecase': 'green', 'lowercase': 'blue', 'other': 'gray'}
    color_list = [colors[_word_type(word)] for word in words]

    fig = go.Figure([go.Bar(x=words, y=counts, marker={'color': color_list})])
    fig.update_layout(title='Word Frequency')
    st.plotly_chart(fig)
# Create a scatter plot of clustered keywords
def plot_keyword_clusters(keywords, clusters):
    """Draw a matplotlib scatter plot of *keywords* and show it in Streamlit.

    Args:
        keywords: 2-D array indexed as ``[:, 0]`` (x values) and ``[:, 1]``
            (y values).
        clusters: Values passed to ``scatter`` as ``c`` to colour the points.
    """
    figure, axes = plt.subplots()
    x_values = keywords[:, 0]
    y_values = keywords[:, 1]
    axes.scatter(x_values, y_values, c=clusters)
    st.pyplot(figure)
# Main Streamlit app
pages = ['Python_(programming_language)', 'Data_science', 'Machine_learning']
if st.button('Load Wikipedia Data'):
    df = load_wiki_data(pages)
    st.write('Data loaded')
else:
    df = pd.DataFrame({'text': []})
    st.write('Click "Load Wikipedia Data" to load data')
st.write(df)
text = df['text'].tolist()
if text:
    # Perform keyword extraction
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(text)
    # Perform clustering of keywords. KMeans rejects a cluster count larger
    # than the number of documents, so never ask for more than we have.
    n_clusters = min(3, len(text))
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    # The first two TF-IDF feature columns of each centroid are used as the
    # x/y coordinates of the scatter plot.
    keywords = kmeans.cluster_centers_[:, :2]
    # Plot word frequency and keyword clusters
    plot_word_frequency(text)
    # Colour each centroid by its own index. kmeans.labels_ has one entry per
    # document rather than per centroid, so it is the wrong colour vector.
    plot_keyword_clusters(keywords, np.arange(len(keywords)))
|