# NOTE: page-header residue captured with this listing ("Spaces: Build error"); it is not part of the program.
import os
import warnings
from io import StringIO

import matplotlib.font_manager as fm
import numpy as np
import openpyxl
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from scipy import stats
# Font setup for matplotlib.
def set_font(font_path="Pretendard-Bold.ttf"):
    """Register a font file with matplotlib and return matching rcParams.

    Args:
        font_path: Path to the font file. The default is a placeholder file
            name resolved against the working directory; point it at the real
            font file.

    Returns:
        A dict of matplotlib rcParams. ``axes.unicode_minus`` is always
        ``False``. ``font.family`` is present only when the font file exists
        and was registered.
    """
    # A missing font file used to raise while the module was being imported,
    # which stopped the whole app. Fall back to matplotlib's default font and
    # say so instead of crashing.
    if not os.path.isfile(font_path):
        warnings.warn(
            f"Font file {font_path!r} not found; using matplotlib's default font."
        )
        return {'axes.unicode_minus': False}

    fm.fontManager.addfont(font_path)
    # Read the family name from the font file itself rather than hard-coding
    # it, so the rcParams entry matches the name matplotlib registered.
    family_name = fm.FontProperties(fname=font_path).get_name()
    return {'font.family': family_name, 'axes.unicode_minus': False}


# Resolve the font settings once, at import time.
font_settings = set_font()
def load_data(file):
    """Read an uploaded file into a DataFrame, dispatching on its extension.

    Args:
        file: A file-like object with a ``name`` attribute. The text after the
            last dot of the name, lower-cased, selects the reader.

    Returns:
        The DataFrame from ``pd.read_csv`` for ``csv`` or ``pd.read_excel``
        for ``xls``/``xlsx``. For any other extension an error is shown with
        ``st.error`` and ``None`` is returned.
    """
    # NOTE(review): the non-ASCII message below looks mis-encoded in this copy
    # of the file; it is kept verbatim. Confirm against the original source.
    suffix = file.name.rsplit('.', 1)[-1].lower()
    if suffix == 'csv':
        return pd.read_csv(file)
    if suffix in ('xls', 'xlsx'):
        return pd.read_excel(file)
    st.error("์ง์๋์ง ์๋ ํ์ผ ํ์์ ๋๋ค. CSV, XLS, ๋๋ XLSX ํ์ผ์ ์ ๋ก๋ํด์ฃผ์ธ์.")
    return None
def manual_data_entry():
    """Let the user define columns and fill in a table by hand.

    The user types comma-separated column names; blank names are discarded.
    When at least one name remains, an empty table with the requested number
    of rows is shown in an editable grid.

    Returns:
        The value returned by ``st.data_editor`` for the blank table, or
        ``None`` when no column names were entered.
    """
    # NOTE(review): the non-ASCII UI labels look mis-encoded in this copy of
    # the file; they are kept verbatim. Confirm against the original source.
    st.subheader("์๋ ๋ฐ์ดํฐ ์ ๋ ฅ")
    raw_names = st.text_input("์ด ์ด๋ฆ์ ์ผํ๋ก ๊ตฌ๋ถํ์ฌ ์ ๋ ฅํ์ธ์:")
    trimmed = (piece.strip() for piece in raw_names.split(','))
    col_names = [label for label in trimmed if label]
    if not col_names:
        return None

    num_rows = st.number_input("์ด๊ธฐ ํ์ ์๋ฅผ ์ ๋ ฅํ์ธ์:", min_value=1, value=5)
    blank_table = pd.DataFrame(columns=col_names, index=range(num_rows))
    return st.data_editor(blank_table, num_rows="dynamic")
def preprocess_data(data):
    """Interactively clean a DataFrame: convert types and handle missing values.

    Text columns are converted to numeric where every value parses. Then, for
    each column that still has missing values, the user picks a strategy:
    drop the rows, or fill with the mean, median, or mode. Mean and median
    are offered only for numeric columns, since they are undefined for text.

    Args:
        data: The DataFrame to clean. It is not modified; a copy is processed.

    Returns:
        The cleaned copy of ``data``.
    """
    # NOTE(review): the non-ASCII UI labels look mis-encoded in this copy of
    # the file; they are kept verbatim. Confirm against the original source.
    opt_remove = "์ ๊ฑฐ"
    opt_mean = "ํ๊ท ์ผ๋ก ๋์ฒด"
    opt_median = "์ค์๊ฐ์ผ๋ก ๋์ฒด"
    opt_mode = "์ต๋น๊ฐ์ผ๋ก ๋์ฒด"

    st.subheader("๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ")

    # Work on a copy so the caller's frame is left untouched.
    data = data.copy()

    # Convert types before imputing, so numeric-looking text columns can use
    # mean/median. The status messages are collected here and written after
    # the missing-value section to keep the on-page order unchanged.
    conversion_notes = []
    for column in data.columns:
        series = data[column]
        is_text = (pd.api.types.is_object_dtype(series)
                   or pd.api.types.is_string_dtype(series))
        if not is_text:
            continue
        try:
            data[column] = pd.to_numeric(series)
        except (ValueError, TypeError):
            conversion_notes.append(f"{column} ์ด์ ๋ฒ์ฃผํ์ผ๋ก ์ ์ง๋ฉ๋๋ค.")
        else:
            conversion_notes.append(f"{column} ์ด์ ์ซ์ํ์ผ๋ก ๋ณํํ์ต๋๋ค.")

    # Missing-value handling.
    if data.isnull().values.any():
        st.write("๊ฒฐ์ธก์น ์ฒ๋ฆฌ:")
        for column in data.columns:
            # Re-check on every pass: dropping rows for an earlier column can
            # remove the gaps in this one.
            if not data[column].isnull().any():
                continue

            if pd.api.types.is_numeric_dtype(data[column]):
                choices = [opt_remove, opt_mean, opt_median, opt_mode]
            else:
                choices = [opt_remove, opt_mode]
            method = st.selectbox(f"{column} ์ด์ ์ฒ๋ฆฌ ๋ฐฉ๋ฒ ์ ํ:", choices)

            # Assign the filled column back instead of calling
            # fillna(inplace=True) on a selection, which may act on a
            # temporary object and leave the frame unchanged.
            if method == opt_remove:
                data = data.dropna(subset=[column])
            elif method == opt_mean:
                data[column] = data[column].fillna(data[column].mean())
            elif method == opt_median:
                data[column] = data[column].fillna(data[column].median())
            elif method == opt_mode:
                modes = data[column].mode()
                # A column with no non-null values has no mode to fill with.
                if not modes.empty:
                    data[column] = data[column].fillna(modes.iloc[0])

    for note in conversion_notes:
        st.write(note)
    return data
def _add_regression_line(fig, x, y, title):
    """Overlay a least-squares line on ``fig`` and report R-squared.

    When a fit is possible the line is added as a trace, and R-squared is
    appended to ``title`` and shown as an annotation above the plot. When it
    is not (fewer than two complete numeric pairs, or all x values equal),
    only the plain ``title`` is set.
    """
    xs = pd.to_numeric(x, errors='coerce').to_numpy(dtype=float)
    ys = pd.to_numeric(y, errors='coerce').to_numpy(dtype=float)
    complete = ~(np.isnan(xs) | np.isnan(ys))
    xs, ys = xs[complete], ys[complete]

    # linregress raises on empty input and on constant x, so skip the fit.
    if xs.size < 2 or xs.min() == xs.max():
        fig.update_layout(title=title)
        return

    fit = stats.linregress(xs, ys)
    line_x = np.array([xs.min(), xs.max()])
    line_y = fit.slope * line_x + fit.intercept
    fig.add_trace(go.Scatter(x=line_x, y=line_y, mode='lines', name='ํ๊ท์ '))
    r_squared = fit.rvalue ** 2
    fig.update_layout(
        title=f'{title} (R-squared: {r_squared:.4f})',
        annotations=[
            dict(
                x=0.5,
                y=1.05,
                xref='paper',
                yref='paper',
                text=f'R-squared: {r_squared:.4f}',
                showarrow=False,
            )
        ],
    )


def perform_analysis(data):
    """Render the exploratory analysis of ``data`` on the Streamlit page.

    Always shows summary statistics and a correlation heatmap of the numeric
    columns. The remaining charts are drawn only when the columns they need
    are present: a per-subject box plot, a monthly line chart, an effort vs.
    evaluation scatter with a regression line, the same scatter filtered by
    an effort range slider, and a per-subject drill-down.

    Args:
        data: The DataFrame to analyse.

    Returns:
        None. All output is written to the Streamlit page.
    """
    # NOTE(review): the non-ASCII column names and labels look mis-encoded in
    # this copy of the file; they are kept verbatim. Confirm against the
    # original source.
    subject_col = '๊ณผ๋ชฉ'
    score_col = 'ํ์ตํ๊ฐ'
    month_col = '๋ฌ'
    effort_col = '์๊ธฐ๋ ธ๋ ฅ๋'

    has_subject = subject_col in data.columns
    has_score = score_col in data.columns
    has_month = month_col in data.columns
    has_effort = effort_col in data.columns

    # Colour and hover columns are optional extras: pass them to plotly only
    # when they exist, otherwise plotly raises on the unknown column name.
    color_by = subject_col if has_subject else None
    hover_cols = [month_col] if has_month else None

    st.header("ํ์์ ๋ฐ์ดํฐ ๋ถ์")

    # Summary statistics.
    st.write("์์ฝ ํต๊ณ:")
    st.write(data.describe())

    # Correlation heatmap over every numeric dtype, not just float64/int64.
    st.write("์๊ด๊ด๊ณ ํํธ๋งต:")
    numeric_data = data.select_dtypes(include='number')
    if not numeric_data.empty:
        fig = px.imshow(numeric_data.corr(), color_continuous_scale='RdBu_r', zmin=-1, zmax=1)
        fig.update_layout(title='์๊ด๊ด๊ณ ํํธ๋งต')
        st.plotly_chart(fig)
    else:
        st.write("์๊ด๊ด๊ณ ํํธ๋งต์ ๊ทธ๋ฆด ์ ์๋ ์ซ์ํ ์ด์ด ์์ต๋๋ค.")

    # Score distribution per subject.
    if has_subject and has_score:
        st.write("๊ณผ๋ชฉ๋ณ ์ ์ ๋ถํฌ:")
        fig = px.box(data, x=subject_col, y=score_col, points="all")
        fig.update_layout(title='๊ณผ๋ชฉ๋ณ ํ์ตํ๊ฐ ์ ์ ๋ถํฌ')
        st.plotly_chart(fig)

    # Score trend by month.
    if has_month and has_score:
        st.write("์๋ณ ์ ์ ์ถ์ด:")
        fig = px.line(data, x=month_col, y=score_col, color=color_by, markers=True)
        fig.update_layout(title='์๋ณ ํ์ตํ๊ฐ ์ ์ ์ถ์ด')
        st.plotly_chart(fig)

    # Effort vs. evaluation, with a regression line over all rows.
    if has_effort and has_score:
        st.write("์๊ธฐ๋ ธ๋ ฅ๋์ ํ์ตํ๊ฐ ๊ด๊ณ:")
        fig = px.scatter(data, x=effort_col, y=score_col, color=color_by, hover_data=hover_cols)
        _add_regression_line(fig, data[effort_col], data[score_col],
                             '์๊ธฐ๋ ธ๋ ฅ๋์ ํ์ตํ๊ฐ ๊ด๊ณ')
        st.plotly_chart(fig)

    # Interactive filtering by effort range.
    st.write("์ธํฐ๋ํฐ๋ธ ํํฐ๋ง:")
    if has_effort:
        effort = pd.to_numeric(data[effort_col], errors='coerce')
        if effort.notna().any():
            # Round outwards so fractional extremes stay inside the range;
            # plain int() truncation could exclude the largest values.
            min_effort = int(np.floor(effort.min()))
            max_effort = int(np.ceil(effort.max()))
            if min_effort < max_effort:
                effort_range = st.slider("์๊ธฐ๋ ธ๋ ฅ๋ ๋ฒ์ ์ ํ", min_effort, max_effort,
                                         (min_effort, max_effort))
            else:
                # A single-value range leaves nothing to select, so skip the
                # slider. NOTE(review): Streamlit may also reject equal
                # slider bounds; confirm for the pinned version.
                effort_range = (min_effort, max_effort)
            in_range = (effort >= effort_range[0]) & (effort <= effort_range[1])
            filtered_data = data[in_range]
            if has_subject and has_score:
                fig = px.scatter(filtered_data, x=effort_col, y=score_col,
                                 color=subject_col, hover_data=hover_cols)
                _add_regression_line(
                    fig, filtered_data[effort_col], filtered_data[score_col],
                    f'์๊ธฐ๋ ธ๋ ฅ๋ {effort_range[0]}-{effort_range[1]} ๋ฒ์์ ํ์ตํ๊ฐ ๊ด๊ณ')
                st.plotly_chart(fig)

    # Per-subject drill-down.
    if has_subject:
        st.write("๊ณผ๋ชฉ๋ณ ์์ธ ๋ถ์:")
        selected_subject = st.selectbox("๋ถ์ํ ๊ณผ๋ชฉ ์ ํ", data[subject_col].unique())
        subject_data = data[data[subject_col] == selected_subject]
        if has_month and has_score:
            fig = px.line(subject_data, x=month_col, y=score_col, markers=True)
            fig.update_layout(title=f'{selected_subject} ์๋ณ ํ์ตํ๊ฐ ์ ์ ์ถ์ด')
            st.plotly_chart(fig)
        if has_effort and has_score:
            fig = px.scatter(subject_data, x=effort_col, y=score_col, hover_data=hover_cols)
            _add_regression_line(
                fig, subject_data[effort_col], subject_data[score_col],
                f'{selected_subject} ์๊ธฐ๋ ธ๋ ฅ๋์ ํ์ตํ๊ฐ ๊ด๊ณ')
            st.plotly_chart(fig)
def main():
    """Run the Streamlit app: collect data, let the user edit it, analyse it.

    The user either uploads a file or enters a table by hand. Once data is
    available it is shown in an editable grid. Pressing the start button
    runs preprocessing and analysis on the edited data.
    """
    # NOTE(review): the non-ASCII UI labels look mis-encoded in this copy of
    # the file; they are kept verbatim. Confirm against the original source.
    upload_option = "ํ์ผ ์ ๋ก๋"
    started_key = "analysis_started"

    st.title("์ธํฐ๋ํฐ๋ธ EDA ํดํท")
    data_input_method = st.radio("๋ฐ์ดํฐ ์ ๋ ฅ ๋ฐฉ๋ฒ ์ ํ:", (upload_option, "์๋ ์ ๋ ฅ"))
    if data_input_method == upload_option:
        uploaded_file = st.file_uploader("CSV, XLS, ๋๋ XLSX ํ์ผ์ ์ ํํ์ธ์", type=["csv", "xls", "xlsx"])
        data = load_data(uploaded_file) if uploaded_file is not None else None
    else:
        data = manual_data_entry()

    if data is None:
        # No data on screen: forget any earlier start so a later dataset does
        # not begin analysing before the user asks for it.
        st.session_state[started_key] = False
        return

    st.subheader("๋ฐ์ดํฐ ๋ฏธ๋ฆฌ๋ณด๊ธฐ ๋ฐ ์์ ")
    st.write("๋ฐ์ดํฐ๋ฅผ ํ์ธํ๊ณ ํ์ํ ๊ฒฝ์ฐ ์์ ํ์ธ์:")
    edited_data = st.data_editor(data, num_rows="dynamic")

    # st.button is True only on the rerun triggered by the click. The
    # preprocessing and analysis steps render their own widgets, and touching
    # any of them reruns the script with the button False again, which would
    # make the analysis disappear. Persist the click in session state.
    if st.button("๋ฐ์ดํฐ ๋ถ์ ์์"):
        st.session_state[started_key] = True

    if st.session_state.get(started_key, False):
        processed_data = preprocess_data(edited_data)
        perform_analysis(processed_data)


if __name__ == "__main__":
    main()