# NOTE(review): scraped Hugging Face Spaces status header ("Spaces: Runtime error")
# preserved here as a comment so the file parses. The runtime error was most
# likely the `keras.preprocessing.text` import below, which no longer exists
# in Keras 3 — TODO confirm against the Space's build log.
# Standard library
import re
from collections import Counter

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import requests
import streamlit as st
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
# Configure the Streamlit page; must run before any other st.* call.
APP_TITLE = 'Keyword Extraction and Clustering'
st.set_page_config(page_title=APP_TITLE)
# Load data from Wikipedia
def load_wiki_data(pages):
    """Fetch the visible text of one or more English Wikipedia articles.

    Parameters
    ----------
    pages : iterable of str
        Wikipedia page slugs, e.g. 'Data_science'.

    Returns
    -------
    pandas.DataFrame
        One row per page with a single 'text' column. Note that
        ``soup.get_text()`` returns *all* page text, including navigation
        and script remnants, not just the article body.

    Raises
    ------
    requests.HTTPError
        If any page responds with a 4xx/5xx status.
    requests.Timeout
        If a request takes longer than 10 seconds.
    """
    texts = []
    for page in pages:
        url = f'https://en.wikipedia.org/wiki/{page}'
        # Timeout so the app can't hang forever; raise_for_status so we
        # don't silently scrape a 404/500 error page as article text
        # (the original did both).
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        texts.append(soup.get_text())
    return pd.DataFrame({'text': texts})
# Create a bar chart of word frequency
def plot_word_frequency(text):
    """Render a word-frequency bar chart into the Streamlit app.

    Parameters
    ----------
    text : list of str
        Documents whose words are counted (aggregated across all docs).

    Notes
    -----
    The original used ``keras.preprocessing.text.Tokenizer``, which
    (a) was removed in Keras 3, crashing the app at import time, and
    (b) lower-cases input by default, so the uppercase/titlecase colour
    branches below could never fire. A case-preserving regex tokenizer
    fixes both and drops the deep-learning dependency.
    """
    # Count case-preserving word tokens across all documents.
    word_counts = Counter()
    for doc in text:
        word_counts.update(re.findall(r"[A-Za-z0-9']+", doc))
    words = list(word_counts.keys())
    counts = list(word_counts.values())

    def _word_type(word):
        # Classify each word by its "case class" for bar colouring.
        if not word.isalpha():
            return 'other'  # contains digits/apostrophes
        if word.isupper():
            return 'uppercase'
        if word.istitle():
            return 'titlecase'
        return 'lowercase'

    colors = {'uppercase': 'red', 'titlecase': 'green', 'lowercase': 'blue', 'other': 'gray'}
    color_list = [colors[_word_type(word)] for word in words]
    fig = go.Figure([go.Bar(x=words, y=counts, marker={'color': color_list})])
    fig.update_layout(title='Word Frequency')
    st.plotly_chart(fig)
# Create a scatter plot of clustered keywords
def plot_keyword_clusters(keywords, clusters):
    """Scatter-plot 2-D keyword coordinates, coloured by cluster label.

    ``keywords`` is expected to be an (n, 2) array; ``clusters`` is a
    length-n sequence of labels used as the colour value per point.
    """
    figure, axes = plt.subplots()
    xs = keywords[:, 0]
    ys = keywords[:, 1]
    axes.scatter(xs, ys, c=clusters)
    st.pyplot(figure)
# ---- Main Streamlit app -------------------------------------------------
PAGES = ['Python_(programming_language)', 'Data_science', 'Machine_learning']

# Streamlit re-runs this script on every interaction; the button gates
# the (slow) network fetch, otherwise we show an empty frame.
if st.button('Load Wikipedia Data'):
    df = load_wiki_data(PAGES)
    st.write('Data loaded')
else:
    df = pd.DataFrame({'text': []})
    st.write('Click "Load Wikipedia Data" to load data')

st.write(df)

docs = df['text'].tolist()
if docs:
    # Keyword extraction: TF-IDF weights per document.
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(docs)
    # Cluster the documents into three groups.
    kmeans = KMeans(n_clusters=3, random_state=0).fit(tfidf_matrix)
    # First two TF-IDF dimensions of each centroid serve as plot coords.
    centroid_xy = kmeans.cluster_centers_[:, :2]
    # Plot word frequency and keyword clusters
    plot_word_frequency(docs)
    plot_keyword_clusters(centroid_xy, kmeans.labels_)