awacke1 committed on
Commit
7b848a4
·
1 Parent(s): 35ae446

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -0
app.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import plotly.graph_objs as go
5
+ from keras.preprocessing.text import Tokenizer
6
+ import requests
7
+ from bs4 import BeautifulSoup
8
+ from sklearn.feature_extraction.text import TfidfVectorizer
9
+ from sklearn.cluster import KMeans
10
+ import matplotlib.pyplot as plt
11
+
12
# Configure the Streamlit page metadata (the page title).
st.set_page_config(
    page_title='Keyword Extraction and Clustering',
)
14
+
15
def load_wiki_data(pages, timeout=10):
    """Download English Wikipedia articles and return their text.

    Args:
        pages: Iterable of article titles as they appear in the URL path
            after ``/wiki/`` (for example ``'Data_science'``).
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A DataFrame with a single ``text`` column containing one row per
        requested page, in the order given. An empty ``pages`` yields an
        empty DataFrame and performs no network access.

    Raises:
        requests.HTTPError: If a page request returns an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    data = []
    for page in pages:
        url = f'https://en.wikipedia.org/wiki/{page}'
        # Without a timeout a stalled connection would block the app forever.
        response = requests.get(url, timeout=timeout)
        # Fail loudly instead of feeding an HTTP error page into the analysis.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        data.append(soup.get_text())
    return pd.DataFrame({'text': data})
26
+
27
def _word_type(word):
    """Classify a token as 'uppercase', 'titlecase', 'lowercase' or 'other'."""
    if not word.isalpha():
        return 'other'
    if word.isupper():
        return 'uppercase'
    if word.istitle():
        return 'titlecase'
    return 'lowercase'


def plot_word_frequency(text):
    """Render a Plotly bar chart of word frequencies in the Streamlit app.

    Each bar is one distinct token and its height is the token's count.
    Bars are coloured by the token's casing: red for all-uppercase, green
    for titlecase, blue for other alphabetic tokens and gray for tokens
    containing non-alphabetic characters.

    Args:
        text: List of document strings to tokenize and count.
    """
    # Keras lowercases tokens by default, which made the uppercase and
    # titlecase categories unreachable; keep the original casing instead.
    tokenizer = Tokenizer(lower=False)
    tokenizer.fit_on_texts(text)
    word_counts = tokenizer.word_counts
    words = list(word_counts.keys())
    counts = list(word_counts.values())

    colors = {'uppercase': 'red', 'titlecase': 'green', 'lowercase': 'blue', 'other': 'gray'}
    color_list = [colors[_word_type(word)] for word in words]

    fig = go.Figure([go.Bar(x=words, y=counts, marker={'color': color_list})])
    fig.update_layout(title='Word Frequency')
    st.plotly_chart(fig)
54
+
55
def plot_keyword_clusters(keywords, clusters):
    """Render a 2-D scatter plot of points coloured by cluster.

    Args:
        keywords: 2-D array; column 0 is plotted on the x axis and column 1
            on the y axis.
        clusters: Colour values passed to Matplotlib, one per row of
            ``keywords``.
    """
    fig, ax = plt.subplots()
    ax.scatter(keywords[:, 0], keywords[:, 1], c=clusters)
    st.pyplot(fig)
    # pyplot keeps every figure alive until it is closed, and Streamlit
    # reruns this script on each interaction, so release it once rendered.
    plt.close(fig)
60
+
61
# Main Streamlit app: fetch the pages on demand, then analyse and plot them.
pages = ['Python_(programming_language)', 'Data_science', 'Machine_learning']
if st.button('Load Wikipedia Data'):
    df = load_wiki_data(pages)
    st.write('Data loaded')
else:
    # Nothing loaded yet: show an empty table and skip the analysis below.
    df = pd.DataFrame({'text': []})
    st.write('Click "Load Wikipedia Data" to load data')

st.write(df)
text = df['text'].tolist()
if text:
    # Perform keyword extraction: one TF-IDF row per loaded page.
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(text)

    # Perform clustering of keywords. KMeans rejects more clusters than
    # samples, so cap the cluster count at the number of documents.
    n_clusters = min(3, len(text))
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    # Keep only the first two TF-IDF feature columns of each centre so the
    # centres can be drawn on a 2-D scatter plot.
    keywords = kmeans.cluster_centers_[:, :2]

    # Plot word frequency and keyword clusters
    plot_word_frequency(text)
    # Row i of cluster_centers_ is the centre of cluster i, so colour each
    # centre by its own index. kmeans.labels_ has one entry per document,
    # not per centre, and only lines up when both counts happen to match.
    plot_keyword_clusters(keywords, np.arange(n_clusters))