import streamlit as st
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import matplotlib.pyplot as plt

# Download required NLTK data; newer NLTK releases (>= 3.8.2) also need
# 'punkt_tab' for word_tokenize (the extra download is harmless on older versions)
nltk.download('vader_lexicon', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

def main():
    st.title("AI in Data Science Demo")

    # Sidebar for navigation
    page = st.sidebar.selectbox("Choose a demo", ["Feature Engineering", "Anomaly Detection", "NLP Analysis"])

    if page == "Feature Engineering":
        feature_engineering_demo()
    elif page == "Anomaly Detection":
        anomaly_detection_demo()
    else:
        nlp_demo()

def feature_engineering_demo():
    st.header("Automated Feature Engineering and Selection")

    # Generate a reproducible toy dataset: 100 samples, 5 features, binary target
    np.random.seed(0)
    X = np.random.rand(100, 5)
    y = np.random.randint(0, 2, 100)

    # Keep the 3 features with the highest ANOVA F-scores against the target
    selector = SelectKBest(f_classif, k=3)
    X_new = selector.fit_transform(X, y)

    # Standardize the features, then project onto the first two principal components
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)

    # Display results
    st.subheader("Original Features")
    st.write(pd.DataFrame(X, columns=[f"Feature {i+1}" for i in range(5)]).head())

    st.subheader("Selected Top 3 Features")
    st.write(pd.DataFrame(X_new, columns=[f"Selected Feature {i+1}" for i in range(3)]).head())
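    # Which of the original columns survived selection; get_support(indices=True)
    # is scikit-learn's standard way to recover the kept column positions
    selected_idx = selector.get_support(indices=True)
    st.write(f"Indices of the selected original features: {selected_idx.tolist()}")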

    st.subheader("PCA Transformation")
    st.write(pd.DataFrame(X_pca, columns=["PC1", "PC2"]).head())
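    # explained_variance_ratio_ reports the share of total variance captured
    # by each principal component
    st.write(f"Explained variance ratio: {pca.explained_variance_ratio_.round(3).tolist()}")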

    # Visualization
    fig, ax = plt.subplots()
    ax.scatter(X_pca[:, 0], X_pca[:, 1], c=y)
    ax.set_xlabel("First Principal Component")
    ax.set_ylabel("Second Principal Component")
    ax.set_title("PCA of Dataset")
    st.pyplot(fig)

def anomaly_detection_demo():
    st.header("Anomaly Detection")

    # Generate sample data with anomalies
    np.random.seed(42)
    X = np.random.randn(100, 2)
    X[-5:] = X[-5:] + [4, 4]  # Add some anomalies

    # Fit Isolation Forest
    clf = IsolationForest(contamination=0.1, random_state=42)
    y_pred = clf.fit_predict(X)

    # Visualization
    fig, ax = plt.subplots()
    ax.scatter(X[:, 0], X[:, 1], c=y_pred, cmap='viridis')
    ax.set_title("Anomaly Detection using Isolation Forest")
    ax.set_xlabel("Feature 1")
    ax.set_ylabel("Feature 2")
    st.pyplot(fig)
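    # fit_predict labels anomalies as -1 and inliers as +1
    n_anomalies = int((y_pred == -1).sum())
    st.write(f"Flagged {n_anomalies} of {len(X)} points as anomalies.")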

    st.write("Points in yellow are detected as anomalies.")

def nlp_demo():
    st.header("NLP Analysis")

    # Sample text input
    text = st.text_area("Enter text for analysis", "I love using AI for data analysis. It's exciting and powerful!")

    if text:
        # Sentiment Analysis
        sia = SentimentIntensityAnalyzer()
        sentiment = sia.polarity_scores(text)

        st.subheader("Sentiment Analysis")
        st.write(f"Positive: {sentiment['pos']:.2f}")
        st.write(f"Neutral: {sentiment['neu']:.2f}")
        st.write(f"Negative: {sentiment['neg']:.2f}")

        # Simple keyword extraction: tokenize, then drop stopwords and non-alphanumeric tokens
        tokens = word_tokenize(text.lower())
        stop_words = set(stopwords.words('english'))
        keywords = [word for word in tokens if word.isalnum() and word not in stop_words]
        keyword_freq = Counter(keywords).most_common(5)

        st.subheader("Top Keywords")
        st.write(pd.DataFrame(keyword_freq, columns=["Keyword", "Frequency"]))
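        # A quick visual of the same frequencies; st.bar_chart uses the
        # DataFrame index as the x-axis, hence set_index
        st.bar_chart(pd.DataFrame(keyword_freq, columns=["Keyword", "Frequency"]).set_index("Keyword"))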

if __name__ == "__main__":
    main()