import streamlit as st
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import matplotlib.pyplot as plt

# Download the NLTK resources used for sentiment analysis, tokenization, and stopword removal.
nltk.download('vader_lexicon', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

def main():
    st.title("AI in Data Science Demo")

    page = st.sidebar.selectbox("Choose a demo", ["Feature Engineering", "Anomaly Detection", "NLP Analysis"])

    if page == "Feature Engineering":
        feature_engineering_demo()
    elif page == "Anomaly Detection":
        anomaly_detection_demo()
    else:
        nlp_demo()

def feature_engineering_demo():
    st.header("Automated Feature Engineering and Selection")

    # Synthetic dataset: 100 samples, 5 random features, and a binary target.
    X = np.random.rand(100, 5)
    y = np.random.randint(0, 2, 100)

    # Univariate feature selection: keep the 3 features with the highest ANOVA F-scores.
    selector = SelectKBest(f_classif, k=3)
    X_new = selector.fit_transform(X, y)

    # Standardize the features, then project them onto the first 2 principal components.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)

    st.subheader("Original Features")
    st.write(pd.DataFrame(X, columns=[f"Feature {i+1}" for i in range(5)]).head())

    st.subheader("Selected Top 3 Features")
    st.write(pd.DataFrame(X_new, columns=[f"Selected Feature {i+1}" for i in range(3)]).head())

    st.subheader("PCA Transformation")
    st.write(pd.DataFrame(X_pca, columns=["PC1", "PC2"]).head())

    # Scatter plot of the two principal components, colored by class label.
    fig, ax = plt.subplots()
    ax.scatter(X_pca[:, 0], X_pca[:, 1], c=y)
    ax.set_xlabel("First Principal Component")
    ax.set_ylabel("Second Principal Component")
    ax.set_title("PCA of Dataset")
    st.pyplot(fig)

def anomaly_detection_demo():
    st.header("Anomaly Detection")

    # 100 2-D points from a standard normal; shift the last 5 so they become clear outliers.
    np.random.seed(42)
    X = np.random.randn(100, 2)
    X[-5:] = X[-5:] + [4, 4]

    # Isolation Forest with contamination=0.1 expects roughly 10% of points to be anomalies;
    # fit_predict returns -1 for anomalies and 1 for inliers.
    clf = IsolationForest(contamination=0.1, random_state=42)
    y_pred = clf.fit_predict(X)

    fig, ax = plt.subplots()
    ax.scatter(X[:, 0], X[:, 1], c=y_pred, cmap='viridis')
    ax.set_title("Anomaly Detection using Isolation Forest")
    ax.set_xlabel("Feature 1")
    ax.set_ylabel("Feature 2")
    st.pyplot(fig)

    st.write("Points labeled -1 (dark purple) are detected as anomalies; inliers (label 1) appear in yellow.")

def nlp_demo():
    st.header("NLP Analysis")

    text = st.text_area("Enter text for analysis", "I love using AI for data analysis. It's exciting and powerful!")

    if text:
        # VADER sentiment scores: pos, neu, and neg are proportions of the text that sum to 1.
        sia = SentimentIntensityAnalyzer()
        sentiment = sia.polarity_scores(text)

        st.subheader("Sentiment Analysis")
        st.write(f"Positive: {sentiment['pos']:.2f}")
        st.write(f"Neutral: {sentiment['neu']:.2f}")
        st.write(f"Negative: {sentiment['neg']:.2f}")

        # Tokenize, drop punctuation and English stopwords, then count the five most frequent words.
        tokens = word_tokenize(text.lower())
        stop_words = set(stopwords.words('english'))
        keywords = [word for word in tokens if word.isalnum() and word not in stop_words]
        keyword_freq = Counter(keywords).most_common(5)

        st.subheader("Top Keywords")
        st.write(pd.DataFrame(keyword_freq, columns=["Keyword", "Frequency"]))

if __name__ == "__main__":
    main()
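
# Usage sketch: Streamlit apps are started from the command line rather than with `python`.
# Assuming this script is saved as app.py (the filename is not specified in the source),
# the demo can be launched with:
#   streamlit run app.py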