import streamlit as st
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import matplotlib.pyplot as plt

# Download the NLTK resources used for sentiment analysis, tokenization, and stopword removal.
nltk.download('vader_lexicon', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

def main():
    st.title("AI in Data Science Demo")

    page = st.sidebar.selectbox("Choose a demo", ["Feature Engineering", "Anomaly Detection", "NLP Analysis"])

    if page == "Feature Engineering":
        feature_engineering_demo()
    elif page == "Anomaly Detection":
        anomaly_detection_demo()
    else:
        nlp_demo()

def feature_engineering_demo():
    st.header("Automated Feature Engineering and Selection")

    # Synthetic dataset: 100 samples, 5 random features, and a binary target.
    X = np.random.rand(100, 5)
    y = np.random.randint(0, 2, 100)

    # Univariate feature selection: keep the 3 features with the highest ANOVA F-scores.
    selector = SelectKBest(f_classif, k=3)
    X_new = selector.fit_transform(X, y)

    # Standardize the features, then project them onto the first 2 principal components.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)

    st.subheader("Original Features")
    st.write(pd.DataFrame(X, columns=[f"Feature {i+1}" for i in range(5)]).head())

    st.subheader("Selected Top 3 Features")
    st.write(pd.DataFrame(X_new, columns=[f"Selected Feature {i+1}" for i in range(3)]).head())

    st.subheader("PCA Transformation")
    st.write(pd.DataFrame(X_pca, columns=["PC1", "PC2"]).head())

    # Scatter plot of the two principal components, colored by class label.
    fig, ax = plt.subplots()
    ax.scatter(X_pca[:, 0], X_pca[:, 1], c=y)
    ax.set_xlabel("First Principal Component")
    ax.set_ylabel("Second Principal Component")
    ax.set_title("PCA of Dataset")
    st.pyplot(fig)

def anomaly_detection_demo():
    st.header("Anomaly Detection")

    # 100 2-D points from a standard normal; shift the last 5 so they become clear outliers.
    np.random.seed(42)
    X = np.random.randn(100, 2)
    X[-5:] = X[-5:] + [4, 4]

    # Isolation Forest with contamination=0.1 expects roughly 10% of points to be anomalies;
    # fit_predict returns -1 for anomalies and 1 for inliers.
    clf = IsolationForest(contamination=0.1, random_state=42)
    y_pred = clf.fit_predict(X)

    fig, ax = plt.subplots()
    ax.scatter(X[:, 0], X[:, 1], c=y_pred, cmap='viridis')
    ax.set_title("Anomaly Detection using Isolation Forest")
    ax.set_xlabel("Feature 1")
    ax.set_ylabel("Feature 2")
    st.pyplot(fig)

    st.write("Points labeled -1 (dark purple) are detected as anomalies; inliers (label 1) appear in yellow.")

def nlp_demo():
    st.header("NLP Analysis")

    text = st.text_area("Enter text for analysis", "I love using AI for data analysis. It's exciting and powerful!")

    if text:
        # VADER sentiment scores: pos, neu, and neg are proportions of the text that sum to 1.
        sia = SentimentIntensityAnalyzer()
        sentiment = sia.polarity_scores(text)

        st.subheader("Sentiment Analysis")
        st.write(f"Positive: {sentiment['pos']:.2f}")
        st.write(f"Neutral: {sentiment['neu']:.2f}")
        st.write(f"Negative: {sentiment['neg']:.2f}")

        # Tokenize, drop punctuation and English stopwords, then count the five most frequent words.
        tokens = word_tokenize(text.lower())
        stop_words = set(stopwords.words('english'))
        keywords = [word for word in tokens if word.isalnum() and word not in stop_words]
        keyword_freq = Counter(keywords).most_common(5)

        st.subheader("Top Keywords")
        st.write(pd.DataFrame(keyword_freq, columns=["Keyword", "Frequency"]))

if __name__ == "__main__":
    main()
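
# Usage sketch: Streamlit apps are started from the command line rather than with `python`.
# Assuming this script is saved as app.py (the filename is not specified in the source),
# the demo can be launched with:
#   streamlit run app.py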