# -*- coding: utf-8 -*- """2_preprocessing_test.ipynb Automatically generated by Colab. Original file is located at https://colab.research.google.com/drive/10c3x9G9z70J73l0LJDA8_VDZphQmHEZB """ import pandas as pd import numpy as np import matplotlib.pyplot as plt from sklearn.preprocessing import LabelEncoder import os from sklearn.model_selection import train_test_split import pickle import warnings warnings.filterwarnings('ignore') df1 = pd.read_csv("/content/drive/MyDrive/Google Colab/disease-symptom-prediction/data/dataset.csv") print(df1.shape) df1.head() df1.sort_values(by='Disease', inplace=True) df1.head() df1.drop_duplicates(inplace=True) df1.shape df1['Disease'].value_counts() df1[df1['Disease']=="Fungal infection"] df1.fillna("none", inplace=True) df1[df1['Disease']=="Fungal infection"] df1.columns = df1.columns.str.strip().str.lower() for col in df1.columns: df1[col] = df1[col].astype(str).str.strip().str.lower() symptom_cols = [col for col in df1.columns if col.startswith('symptom')] print(symptom_cols) all_symptoms = set() for col in symptom_cols: for val in df1[col].unique(): if val != 'none': all_symptoms.add(val) print(f"Unique symptoms: {len(all_symptoms)}") print(all_symptoms) df1.head() df1_num = pd.DataFrame(df1['disease']) for symptom in all_symptoms: df1_num[symptom] = df1[symptom_cols].apply(lambda row: int(symptom in row.values), axis=1) df1_num X = df1_num.drop('disease', axis=1) y = df1_num['disease'] X.shape, y.shape X.sum(axis=1) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y) print(np.unique(y_train, return_counts=True)) print(np.unique(y_test, return_counts=True)) from sklearn.ensemble import RandomForestClassifier model = RandomForestClassifier(n_estimators=100,random_state=42) model.fit(X_train, y_train) model.fit(X_train, y_train) import pickle # Save model with open("disease_model.pkl", "wb") as f: pickle.dump(model, f) # Save symptom list (to use in the app later) with open("symptoms.pkl", "wb") as f: pickle.dump(list(all_symptoms), f) # Original symptoms (keys) all_symptoms = sorted(all_symptoms) # Create display labels by replacing '_' with ' ' and capitalizing each word display_symptoms = [symptom.replace('_', ' ').title() for symptom in all_symptoms] # Create a mapping from display label back to original symptom key label_to_symptom = dict(zip(display_symptoms, all_symptoms)) from sklearn.metrics import accuracy_score, f1_score y_train_pred = model.predict(X_train) train_accuracy = accuracy_score(y_train, y_train_pred) train_f1_score = f1_score(y_train, y_train_pred,average="weighted") print("Train Accuracy:", train_accuracy) print("Train f1 score:", train_f1_score) y_test_pred = model.predict(X_test) test_accuracy = accuracy_score(y_test, y_test_pred) test_f1_score = f1_score(y_test, y_test_pred, average="weighted") print("Train Accuracy:", test_accuracy) print("Train f1 score:", test_f1_score) import numpy as np # Example user symptoms user_symptoms = ['nausea', 'vomiting', 'abdominal_pain', 'diarrhoea'] # Tip for the user if len(user_symptoms) < 4: print("Tip: The model performs better if you enter at least 4 symptoms.\n") # Convert symptoms to input vector input_vector = [1 if symptom in user_symptoms else 0 for symptom in all_symptoms] input_vector = np.array([input_vector]) # Make prediction and get probabilities probas = model.predict_proba(input_vector)[0] max_proba = np.max(probas) predicted = model.classes_[np.argmax(probas)] # Confidence threshold threshold = 0.5 # Print predicted disease and confidence if max_proba < threshold: print("Warning: The model is not confident about this prediction.") print(f"Predicted disease: {predicted} (Confidence: {max_proba * 100:.1f}%)") else: print(f"Predicted disease: {predicted} (Confidence: {max_proba * 100:.1f}%)") # Function to print top N diseases def print_top_diseases(probas, model, top_n=5): classes = model.classes_ sorted_indices = np.argsort(probas)[::-1] print(f"\nTop {top_n} possible diseases:") for i in range(min(top_n, len(classes))): disease = classes[sorted_indices[i]] probability = probas[sorted_indices[i]] print(f"{i+1}. {disease}: {probability:.4f}") # Show top 5 possible diseases print_top_diseases(probas, model, top_n=5) import gradio as gr import pickle import numpy as np # --- 1. Load Disease Prediction Model --- with open("disease_model.pkl", "rb") as f: model = pickle.load(f) with open("symptoms.pkl", "rb") as f: all_symptoms = pickle.load(f) # Preprocess symptoms all_symptoms = sorted(all_symptoms) display_symptoms = [s.replace('_', ' ').title() for s in all_symptoms] label_to_symptom = dict(zip(display_symptoms, all_symptoms)) # --- 2. Medical Knowledge Base --- MEDICAL_KNOWLEDGE = { "migraine": [ "For migraines: (1) Rest in dark room (2) OTC pain relievers (ibuprofen/acetaminophen) (3) Apply cold compress (4) Consult neurologist if frequent", "Migraine treatment options include triptans (prescription) and caffeine. Avoid triggers like bright lights or strong smells." ], "allergy": [ "Allergy management: (1) Antihistamines (cetirizine/loratadine) (2) Nasal sprays (3) Allergy shots (immunotherapy) for severe cases", "For food allergies: Strict avoidance, carry epinephrine auto-injector (EpiPen), read food labels carefully" ], "cold": [ "Treat colds with rest, fluids, and OTC pain relievers. See doctor if fever lasts >3 days", "Most colds resolve in 7-10 days. Use decongestants for nasal congestion" ], "headache": [ "For headaches: Hydrate, rest, and use OTC pain relievers sparingly", "Persistent headaches require medical evaluation - consult your doctor" ], "fever": [ "For fever: Rest, fluids, and acetaminophen/ibuprofen. Seek help if >39°C or lasts >3 days", "High fever warning: Seek emergency care if fever >40°C or with stiff neck" ] } SPECIAL_RESPONSES = { "general approaches": "I can provide specific guidance for: allergies, migraines, colds, fever, back pain, rashes. What condition are you asking about?", "consult a doctor": "For these symptoms, seek medical care: severe pain, difficulty breathing, sudden weakness, high fever (>103°F), or symptoms lasting >7 days" } def get_medical_response(user_query): user_query = user_query.lower() # First check for special cases for phrase, response in SPECIAL_RESPONSES.items(): if phrase in user_query: return response # Then check medical conditions for condition, responses in MEDICAL_KNOWLEDGE.items(): if condition in user_query: return np.random.choice(responses) # Final improvement - suggest related conditions related = [cond for cond in MEDICAL_KNOWLEDGE.keys() if cond in user_query] if related: return f"Are you asking about {', '.join(related)}? {np.random.choice(MEDICAL_KNOWLEDGE[related[0]])}" return "I can advise on: " + ", ".join(MEDICAL_KNOWLEDGE.keys()) + ". Please be more specific." # --- 3. Disease Prediction Function --- def predict_disease(selected_labels): if not selected_labels or len(selected_labels) < 4: return "⚠️ Please select at least 4 symptoms for accurate results." user_symptoms = [label_to_symptom[label] for label in selected_labels] input_vector = [1 if symptom in user_symptoms else 0 for symptom in all_symptoms] input_vector = np.array([input_vector]) probas = model.predict_proba(input_vector)[0] max_proba = np.max(probas) predicted = model.classes_[np.argmax(probas)] sorted_indices = np.argsort(probas)[::-1] top_diseases = [ f"{i+1}. {model.classes_[idx]} — {probas[idx]*100:.1f}%" for i, idx in enumerate(sorted_indices[:3]) ] prediction_result = ( f"
{predicted} ({max_proba*100:.1f}% confidence)
" "Select symptoms for diagnosis and get medical advice