# NOTE(review): scraped Hugging Face Spaces status header ("Spaces: Runtime error")
# preserved here as a comment so the file parses. The runtime error was most
# likely the `keras.preprocessing.text` import below, which no longer exists
# in Keras 3 — TODO confirm against the Space's build log.
# Standard library
import re
from collections import Counter

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import requests
import streamlit as st
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
# Configure the Streamlit page; must run before any other st.* call.
APP_TITLE = 'Keyword Extraction and Clustering'
st.set_page_config(page_title=APP_TITLE)
# Load data from Wikipedia
def load_wiki_data(pages):
    """Fetch the visible text of one or more English Wikipedia articles.

    Parameters
    ----------
    pages : iterable of str
        Wikipedia page slugs, e.g. 'Data_science'.

    Returns
    -------
    pandas.DataFrame
        One row per page with a single 'text' column. Note that
        ``soup.get_text()`` returns *all* page text, including navigation
        and script remnants, not just the article body.

    Raises
    ------
    requests.HTTPError
        If any page responds with a 4xx/5xx status.
    requests.Timeout
        If a request takes longer than 10 seconds.
    """
    texts = []
    for page in pages:
        url = f'https://en.wikipedia.org/wiki/{page}'
        # Timeout so the app can't hang forever; raise_for_status so we
        # don't silently scrape a 404/500 error page as article text
        # (the original did both).
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        texts.append(soup.get_text())
    return pd.DataFrame({'text': texts})
# Create a bar chart of word frequency
def plot_word_frequency(text):
    """Render a word-frequency bar chart into the Streamlit app.

    Parameters
    ----------
    text : list of str
        Documents whose words are counted (aggregated across all docs).

    Notes
    -----
    The original used ``keras.preprocessing.text.Tokenizer``, which
    (a) was removed in Keras 3, crashing the app at import time, and
    (b) lower-cases input by default, so the uppercase/titlecase colour
    branches below could never fire. A case-preserving regex tokenizer
    fixes both and drops the deep-learning dependency.
    """
    # Count case-preserving word tokens across all documents.
    word_counts = Counter()
    for doc in text:
        word_counts.update(re.findall(r"[A-Za-z0-9']+", doc))
    words = list(word_counts.keys())
    counts = list(word_counts.values())

    def _word_type(word):
        # Classify each word by its "case class" for bar colouring.
        if not word.isalpha():
            return 'other'  # contains digits/apostrophes
        if word.isupper():
            return 'uppercase'
        if word.istitle():
            return 'titlecase'
        return 'lowercase'

    colors = {'uppercase': 'red', 'titlecase': 'green', 'lowercase': 'blue', 'other': 'gray'}
    color_list = [colors[_word_type(word)] for word in words]
    fig = go.Figure([go.Bar(x=words, y=counts, marker={'color': color_list})])
    fig.update_layout(title='Word Frequency')
    st.plotly_chart(fig)
# Create a scatter plot of clustered keywords
def plot_keyword_clusters(keywords, clusters):
    """Scatter-plot 2-D keyword coordinates, coloured by cluster label.

    ``keywords`` is expected to be an (n, 2) array; ``clusters`` is a
    length-n sequence of labels used as the colour value per point.
    """
    figure, axes = plt.subplots()
    xs = keywords[:, 0]
    ys = keywords[:, 1]
    axes.scatter(xs, ys, c=clusters)
    st.pyplot(figure)
# ---- Main Streamlit app -------------------------------------------------
PAGES = ['Python_(programming_language)', 'Data_science', 'Machine_learning']

# Streamlit re-runs this script on every interaction; the button gates
# the (slow) network fetch, otherwise we show an empty frame.
if st.button('Load Wikipedia Data'):
    df = load_wiki_data(PAGES)
    st.write('Data loaded')
else:
    df = pd.DataFrame({'text': []})
    st.write('Click "Load Wikipedia Data" to load data')

st.write(df)

docs = df['text'].tolist()
if docs:
    # Keyword extraction: TF-IDF weights per document.
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(docs)
    # Cluster the documents into three groups.
    kmeans = KMeans(n_clusters=3, random_state=0).fit(tfidf_matrix)
    # First two TF-IDF dimensions of each centroid serve as plot coords.
    centroid_xy = kmeans.cluster_centers_[:, :2]
    # Plot word frequency and keyword clusters
    plot_word_frequency(docs)
    plot_keyword_clusters(centroid_xy, kmeans.labels_)