Spaces:

mgbam
/

Healthapp

Sleeping

App Files Files Community

mgbam committed on Jan 29

Commit

55ef016

verified ·

1 Parent(s): 211e3a6

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -19

app.py CHANGED Viewed

@@ -55,7 +55,7 @@ if not OPENAI_API_KEY:
 # Instantiate the OpenAI client
 try:
-   client = OpenAI(api_key=OPENAI_API_KEY) # Instantiating the client right here
 except Exception as e:
     st.error(f"Failed to initialize OpenAI client: {e}")
     logger.error(f"Failed to initialize OpenAI client: {e}")
@@ -239,35 +239,48 @@ class HypothesisTester(DataAnalyzer):
             return "No significant evidence against H0"
 from sklearn.impute import SimpleImputer
 class LogisticRegressionTrainer(DataAnalyzer):
-    """Logistic Regression Model Trainer with Missing Value Handling."""
     def invoke(self, data: pd.DataFrame, target_col: str, columns: List[str], **kwargs) -> Dict[str, Any]:
         try:
-            X = data[columns]
-            y = data[target_col]
-            # Check for missing values in X
             if X.isnull().values.any():
                 logger.info("Missing values detected in feature variables. Applying imputation.")
-                imputer = SimpleImputer(strategy='mean')  # You can choose 'median', 'most_frequent', etc.
                 X_imputed = imputer.fit_transform(X)
                 X = pd.DataFrame(X_imputed, columns=columns)
                 logger.info("Imputation completed for feature variables.")
             else:
                 logger.info("No missing values detected in feature variables.")
-            # Check for missing values in y
             if y.isnull().values.any():
-                logger.info("Missing values detected in target variable. Applying imputation.")
-                # For classification, it's common to impute with the mode
-                y_imputer = SimpleImputer(strategy='most_frequent')
-                y_imputed = y_imputer.fit_transform(y.values.reshape(-1, 1))
-                y = pd.Series(y_imputer.ravel())
-                logger.info("Imputation completed for target variable.")
             else:
                 logger.info("No missing values detected in target variable.")
             # Split the data
             X_train, X_test, y_train, y_test = train_test_split(
                 X, y, test_size=0.2, random_state=42
@@ -275,7 +288,7 @@ class LogisticRegressionTrainer(DataAnalyzer):
             logger.info("Data split into training and testing sets.")
             # Initialize and train the model
-            model = LogisticRegression(max_iter=1000)
             model.fit(X_train, y_train)
             logger.info("Logistic Regression model training completed.")
@@ -293,7 +306,6 @@ class LogisticRegressionTrainer(DataAnalyzer):
             logger.error(f"Logistic Regression Model Error: {str(e)}")
             return {"error": f"Logistic Regression Model Error: {str(e)}"}
 # ---------------------- Business Logic Layer ---------------------------
 class ClinicalRule(BaseModel):
@@ -544,7 +556,7 @@ class SimpleMedicalKnowledge(MedicalKnowledgeBase):
             )
             # Extract the answer from the response
-            answer = response.choices[0].message.content.strip() # Corrected access
             logger.info("Successfully retrieved data from OpenAI GPT-4.")
@@ -800,7 +812,7 @@ def initialize_session_state():
     if 'openai_client' not in st.session_state:
         # Instantiate the OpenAI client only if it doesn't exist in session state
-        st.session_state.openai_client = client # The one created earlier
     if 'data' not in st.session_state:
         st.session_state.data = {}  # Store pd.DataFrame under a name
@@ -826,7 +838,7 @@ def initialize_session_state():
     if 'knowledge_base' not in st.session_state:
         st.session_state.knowledge_base = SimpleMedicalKnowledge(nlp_model=nlp, client=st.session_state.openai_client)
     if 'pub_email' not in st.session_state:
-                st.session_state.pub_email = PUB_EMAIL  # Load PUB_EMAIL from environment variables
     if 'treatment_recommendation' not in st.session_state:
         st.session_state.treatment_recommendation = BasicTreatmentRecommendation()
@@ -1209,4 +1221,4 @@ def medical_knowledge_section():
             st.error("Please enter a medical question to search.")
 if __name__ == "__main__":
-    main()

 # Instantiate the OpenAI client
 try:
+    client = OpenAI(api_key=OPENAI_API_KEY)  # Instantiating the client right here
 except Exception as e:
     st.error(f"Failed to initialize OpenAI client: {e}")
     logger.error(f"Failed to initialize OpenAI client: {e}")
             return "No significant evidence against H0"
 from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import LabelEncoder
 class LogisticRegressionTrainer(DataAnalyzer):
+    """Logistic Regression Model Trainer with Missing Value Handling and Target Encoding."""
     def invoke(self, data: pd.DataFrame, target_col: str, columns: List[str], **kwargs) -> Dict[str, Any]:
         try:
+            # Prevent data leakage by removing target_col from features if present
+            if target_col in columns:
+                columns.remove(target_col)
+                logger.warning(f"Removed target column '{target_col}' from feature list to prevent data leakage.")
+            X = data[columns].copy()
+            y = data[target_col].copy()
+            # Handle missing values in X
             if X.isnull().values.any():
                 logger.info("Missing values detected in feature variables. Applying imputation.")
+                imputer = SimpleImputer(strategy='mean')  # Choose strategy as needed
                 X_imputed = imputer.fit_transform(X)
                 X = pd.DataFrame(X_imputed, columns=columns)
                 logger.info("Imputation completed for feature variables.")
             else:
                 logger.info("No missing values detected in feature variables.")
+            # Handle missing values in y
             if y.isnull().values.any():
+                logger.info("Missing values detected in target variable. Dropping missing targets.")
+                # For classification, it's common to impute with the mode or drop missing targets
+                data = data.dropna(subset=[target_col])
+                y = data[target_col]
+                X = data[columns]
+                logger.info("Dropped rows with missing target values.")
             else:
                 logger.info("No missing values detected in target variable.")
+            # Encode target if it's categorical and not numeric
+            if y.dtype == 'object' or y.dtype.name == 'category':
+                logger.info("Encoding categorical target variable.")
+                label_encoder = LabelEncoder()
+                y = label_encoder.fit_transform(y)
+                logger.info("Encoding completed.")
             # Split the data
             X_train, X_test, y_train, y_test = train_test_split(
                 X, y, test_size=0.2, random_state=42
             logger.info("Data split into training and testing sets.")
             # Initialize and train the model
+            model = LogisticRegression(max_iter=1000, multi_class='auto', solver='lbfgs')
             model.fit(X_train, y_train)
             logger.info("Logistic Regression model training completed.")
             logger.error(f"Logistic Regression Model Error: {str(e)}")
             return {"error": f"Logistic Regression Model Error: {str(e)}"}
 # ---------------------- Business Logic Layer ---------------------------
 class ClinicalRule(BaseModel):
             )
             # Extract the answer from the response
+            answer = response.choices[0].message.content.strip()  # Corrected access
             logger.info("Successfully retrieved data from OpenAI GPT-4.")
     if 'openai_client' not in st.session_state:
         # Instantiate the OpenAI client only if it doesn't exist in session state
+        st.session_state.openai_client = client  # The one created earlier
     if 'data' not in st.session_state:
         st.session_state.data = {}  # Store pd.DataFrame under a name
     if 'knowledge_base' not in st.session_state:
         st.session_state.knowledge_base = SimpleMedicalKnowledge(nlp_model=nlp, client=st.session_state.openai_client)
     if 'pub_email' not in st.session_state:
+        st.session_state.pub_email = PUB_EMAIL  # Load PUB_EMAIL from environment variables
     if 'treatment_recommendation' not in st.session_state:
         st.session_state.treatment_recommendation = BasicTreatmentRecommendation()
             st.error("Please enter a medical question to search.")
 if __name__ == "__main__":
+    main()