Spaces:
Runtime error
Runtime error
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
import plotly.graph_objs as go
|
5 |
+
from keras.preprocessing.text import Tokenizer
|
6 |
+
import requests
|
7 |
+
from bs4 import BeautifulSoup
|
8 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
9 |
+
from sklearn.cluster import KMeans
|
10 |
+
import matplotlib.pyplot as plt
|
11 |
+
|
12 |
+
# Set up the Streamlit app
|
13 |
+
# Configure page-level settings; only the page title is set here.
st.set_page_config(page_title='Keyword Extraction and Clustering')
|
14 |
+
|
15 |
+
# Load data from Wikipedia
|
16 |
+
def load_wiki_data(pages):
    """Download English Wikipedia articles and return their extracted text.

    Args:
        pages: Iterable of article titles as they appear in the URL path
            after ``/wiki/`` (e.g. ``'Data_science'``).

    Returns:
        A DataFrame with a single ``text`` column holding the text extracted
        from each page's HTML, one row per page in input order. An empty
        ``pages`` yields an empty DataFrame that still has the ``text``
        column.

    Raises:
        requests.RequestException: If a request times out, cannot connect,
            or the server answers with an HTTP error status.
    """
    data = []
    for page in pages:
        url = f'https://en.wikipedia.org/wiki/{page}'
        # Bound the wait so a stalled connection cannot hang the app forever.
        response = requests.get(url, timeout=10)
        # Fail loudly instead of analysing the text of an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        data.append(soup.get_text())
    return pd.DataFrame({'text': data})
|
26 |
+
|
27 |
+
# Create a bar chart of word frequency
|
28 |
+
def _word_case_category(word):
    """Return the colour category of *word* based on its letter casing."""
    if not word.isalpha():
        return 'other'
    if word.isupper():
        return 'uppercase'
    if word.istitle():
        return 'titlecase'
    # Everything else alphabetic, including mixed-case words, lands here.
    return 'lowercase'


def plot_word_frequency(text):
    """Render a Plotly bar chart of word counts in the Streamlit app.

    Each bar is coloured by the casing of its word: red for all-uppercase,
    green for title case, blue for other alphabetic words and gray for
    tokens that contain non-alphabetic characters.

    Args:
        text: The texts handed to ``Tokenizer.fit_on_texts``; the app passes
            a list of page strings.
    """
    # Keep the original casing: the Tokenizer lowercases its input by
    # default, which would make the uppercase/titlecase categories
    # unreachable and colour every alphabetic word as 'lowercase'.
    tokenizer = Tokenizer(lower=False)
    tokenizer.fit_on_texts(text)
    word_counts = tokenizer.word_counts
    words = list(word_counts.keys())
    counts = list(word_counts.values())

    colors = {'uppercase': 'red', 'titlecase': 'green', 'lowercase': 'blue', 'other': 'gray'}
    color_list = [colors[_word_case_category(word)] for word in words]

    fig = go.Figure([go.Bar(x=words, y=counts, marker={'color': color_list})])
    fig.update_layout(title='Word Frequency')
    st.plotly_chart(fig)
|
54 |
+
|
55 |
+
# Create a scatter plot of clustered keywords
|
56 |
+
def plot_keyword_clusters(keywords, clusters):
    """Draw a 2-D scatter plot of points coloured by cluster in Streamlit.

    Args:
        keywords: 2-D array-like supporting ``[:, i]`` indexing; column 0 is
            plotted on the x axis and column 1 on the y axis.
        clusters: Values passed to matplotlib's ``c`` argument to colour the
            points.
    """
    figure, axes = plt.subplots()
    x_values = keywords[:, 0]
    y_values = keywords[:, 1]
    axes.scatter(x_values, y_values, c=clusters)
    st.pyplot(figure)
|
60 |
+
|
61 |
+
# Main Streamlit app
|
62 |
+
pages = ['Python_(programming_language)', 'Data_science', 'Machine_learning']
if st.button('Load Wikipedia Data'):
    df = load_wiki_data(pages)
    st.write('Data loaded')
else:
    # Start from an empty frame so the rest of the script runs unchanged
    # before the button has been pressed.
    df = pd.DataFrame({'text': []})
    st.write('Click "Load Wikipedia Data" to load data')

st.write(df)
text = df['text'].tolist()
if text:
    # Perform keyword extraction
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(text)
    # get_feature_names() was removed in scikit-learn 1.2; the replacement
    # get_feature_names_out() is available from scikit-learn 1.0 onwards.
    feature_names = vectorizer.get_feature_names_out()

    # Perform clustering of the documents; KMeans rejects a cluster count
    # larger than the number of samples, so cap it at the document count.
    n_clusters = min(3, len(text))
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    # Use the first two TF-IDF dimensions of each centre as plot coordinates.
    keywords = kmeans.cluster_centers_[:, :2]

    # Plot word frequency and keyword clusters
    plot_word_frequency(text)
    # Colour each centre by its own cluster index. kmeans.labels_ holds one
    # label per document, not per centre, so its length and order only
    # matched the centres by coincidence.
    plot_keyword_clusters(keywords, list(range(n_clusters)))
|