# AI in Data Science demo app (Streamlit).
# Run with: streamlit run <this_file>.py
# Dependencies: streamlit, pandas, numpy, scikit-learn, nltk, matplotlib

import streamlit as st
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import matplotlib.pyplot as plt

# Download necessary NLTK data
nltk.download('vader_lexicon', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)  # newer NLTK releases load 'punkt_tab' for word_tokenize
nltk.download('stopwords', quiet=True)


def main():
    st.title("AI in Data Science Demo")

    # Sidebar for navigation
    page = st.sidebar.selectbox(
        "Choose a demo",
        ["Feature Engineering", "Anomaly Detection", "NLP Analysis"],
    )

    if page == "Feature Engineering":
        feature_engineering_demo()
    elif page == "Anomaly Detection":
        anomaly_detection_demo()
    else:
        nlp_demo()


def feature_engineering_demo():
    st.header("Automated Feature Engineering and Selection")

    # Generate sample data: 100 rows, 5 random features, binary target
    X = np.random.rand(100, 5)
    y = np.random.randint(0, 2, 100)

    # Univariate feature selection: keep the 3 features with the highest ANOVA F-score
    selector = SelectKBest(f_classif, k=3)
    X_new = selector.fit_transform(X, y)

    # PCA on standardized features (PCA is scale-sensitive, so standardize first)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)

    # Display results
    st.subheader("Original Features")
    st.write(pd.DataFrame(X, columns=[f"Feature {i+1}" for i in range(5)]).head())

    st.subheader("Selected Top 3 Features")
    st.write(pd.DataFrame(X_new, columns=[f"Selected Feature {i+1}" for i in range(3)]).head())

    st.subheader("PCA Transformation")
    st.write(pd.DataFrame(X_pca, columns=["PC1", "PC2"]).head())

    # Visualization: project the data onto the first two principal components
    fig, ax = plt.subplots()
    ax.scatter(X_pca[:, 0], X_pca[:, 1], c=y)
    ax.set_xlabel("First Principal Component")
    ax.set_ylabel("Second Principal Component")
    ax.set_title("PCA of Dataset")
    st.pyplot(fig)


def anomaly_detection_demo():
    st.header("Anomaly Detection")

    # Generate sample data and shift the last 5 points to create anomalies
    np.random.seed(42)
    X = np.random.randn(100, 2)
    X[-5:] = X[-5:] + [4, 4]  # add some anomalies

    # Fit Isolation Forest; fit_predict returns -1 for anomalies, 1 for inliers
    clf = IsolationForest(contamination=0.1, random_state=42)
    y_pred = clf.fit_predict(X)

    # Visualization: with the viridis colormap, -1 (anomalies) renders dark purple
    # and 1 (inliers) renders yellow
    fig, ax = plt.subplots()
    ax.scatter(X[:, 0], X[:, 1], c=y_pred, cmap='viridis')
    ax.set_title("Anomaly Detection using Isolation Forest")
    ax.set_xlabel("Feature 1")
    ax.set_ylabel("Feature 2")
    st.pyplot(fig)

    st.write("Dark purple points are detected as anomalies; yellow points are inliers.")


def nlp_demo():
    st.header("NLP Analysis")

    # Sample text input
    text = st.text_area(
        "Enter text for analysis",
        "I love using AI for data analysis. It's exciting and powerful!",
    )

    if text:
        # Sentiment analysis with VADER
        sia = SentimentIntensityAnalyzer()
        sentiment = sia.polarity_scores(text)

        st.subheader("Sentiment Analysis")
        st.write(f"Positive: {sentiment['pos']:.2f}")
        st.write(f"Neutral: {sentiment['neu']:.2f}")
        st.write(f"Negative: {sentiment['neg']:.2f}")

        # Simple keyword extraction: tokenize, drop stopwords and punctuation,
        # then count the 5 most frequent remaining tokens
        tokens = word_tokenize(text.lower())
        stop_words = set(stopwords.words('english'))
        keywords = [word for word in tokens if word.isalnum() and word not in stop_words]
        keyword_freq = Counter(keywords).most_common(5)

        st.subheader("Top Keywords")
        st.write(pd.DataFrame(keyword_freq, columns=["Keyword", "Frequency"]))


if __name__ == "__main__":
    main()