Spaces:

gabrielchua
/

refactored-guacamole

Sleeping

App Files Files Community

gabrielchua committed on Apr 14

Commit

4c7a460

verified ·

1 Parent(s): db5b988

Update app.py

Browse files

Files changed (1) hide show

app.py +142 -50

app.py CHANGED Viewed

@@ -1,74 +1,166 @@
 import os
 import gradio as gr
 import joblib
-import logfire
 import numpy as np
 import pandas as pd
 from openai import OpenAI
-from pydantic import BaseModel
-# Configure logging
-logfire.configure(token=os.getenv("LOGFIRE_API_KEY"))
-logfire.instrument_pydantic()
-# Load pre-trained model and label names
-model_data = joblib.load("model.joblib")
-model = model_data["model"]
-label_names = model_data["label_names"]
-class Results(BaseModel):
-    text: str
-    hateful: float
-    insults: float
-    sexual: float
-    violence: float
-    self_harm: float
-    aom: float
-# Initialize OpenAI client
-client = OpenAI()
-def get_embedding(text: str, embedding_model: str = "text-embedding-3-large") -> np.ndarray:
     """
-    Get embedding for the input text from OpenAI.
-    Replaces newlines with spaces before calling the API.
     """
-    text = text.replace("\n", " ")
-    response = client.embeddings.create(input=[text], model=embedding_model)
-    embedding = response.data[0].embedding
-    return np.array(embedding)
 def classify_text(text: str):
     """
-    Get the OpenAI embedding for the provided text, classify it using your model,
-    and return a DataFrame with the rounded probabilities and binary predictions.
     """
-    embedding = get_embedding(text)
-    X = embedding.reshape(1, -1)
-    probabilities = model.predict(X)
-    rounded_probs = np.round(probabilities[0], 4)
-    # Optionally log the results (this doesn't affect the output)
-    Results(
-        text=text,
-        hateful=rounded_probs[0],
-        insults=rounded_probs[1],
-        sexual=rounded_probs[2],
-        violence=rounded_probs[3],
-        self_harm=rounded_probs[4],
-        aom=rounded_probs[5],
-    )
-    # Create DataFrame with rounded probabilities and binary predictions
-    df = pd.DataFrame({
-        "Label": label_names,
-        "Probability": rounded_probs,
-        "Prediction": (rounded_probs > 0.5).astype(int)
-    })
     return gr.update(value=df, visible=True)
-with gr.Blocks(title="Zoo Entry 001") as iface:
     input_text = gr.Textbox(lines=5, label="Input Text")
     submit_btn = gr.Button("Submit")
     output_table = gr.DataFrame(label="Classification Results", visible=False)

 import os
 import gradio as gr
 import joblib
 import numpy as np
 import pandas as pd
 from openai import OpenAI
+from typing import List, Dict, Any
+# --- New Inference Code Components ---
+# Define categories with sub-level information
+CATEGORIES = {  # maps each top-level category to the output label name(s) reported for it
+    'hateful': ['level_1_discriminatory', 'level_2_hate_speech'],  # two sub-level labels
+    'insults': ['insults'],  # single label, no sub-levels
+    'sexual': ['level_1_not_appropriate_for_minors', 'level_2_not_appropriate_for_all_ages'],  # two sub-level labels
+    'physical_violence': ['physical_violence'],  # single label, no sub-levels
+    'self_harm': ['level_1_self_harm_intent', 'level_2_self_harm_action'],  # two sub-level labels
+    'all_other_misconduct': ['level_1_not_socially_accepted', 'level_2_illegal_activities']  # two sub-level labels
+}
def get_embeddings(texts: List[str], model: str = "text-embedding-3-large") -> np.ndarray:
    """
    Generate embeddings for a list of texts using the OpenAI API synchronously.

    Args:
        texts: List of strings to embed.
        model: The OpenAI embedding model to use.

    Returns:
        A 2-D numpy array built from the embeddings in the API response, one
        row per returned item. An empty ``texts`` list yields an array of
        shape (0, 0) and no API call is made.
    """
    if not texts:
        # Nothing to embed: skip the network round trip entirely.
        return np.empty((0, 0))

    # NOTE: this cap is applied per *character*, not per token. The value
    # 8191 was documented as the embedding model's maximum token count, so
    # character slicing is only a rough proxy for that limit.
    max_chars = 8191
    truncated_texts = [text[:max_chars] for text in texts]

    # The API key is read from the environment at call time.
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    response = client.embeddings.create(
        input=truncated_texts,
        model=model,
    )
    return np.array([item.embedding for item in response.data])
# Loaded model bundles keyed by file path, so the joblib file is deserialized
# once per process instead of on every request.
_MODEL_CACHE: Dict[str, Any] = {}

# Categories that report only the overall ("any positive") output.
_SINGLE_OUTPUT_CATEGORIES = frozenset({'binary', 'insults', 'physical_violence'})


def _load_model_data(model_file: str) -> Any:
    """Return the joblib bundle stored at ``model_file``, loading it at most once."""
    if model_file not in _MODEL_CACHE:
        print("Loading model...")
        _MODEL_CACHE[model_file] = joblib.load(model_file)
    return _MODEL_CACHE[model_file]


def run_model(model_file: str, embeddings: np.ndarray):
    """
    Run the model on the embeddings.

    The bundle at ``model_file`` must provide a ``'model'`` entry (an object
    with a ``predict`` method) and a ``'label_names'`` entry. ``predict`` is
    expected to return, per category, an array of shape (n_samples, 2) holding
    the cumulative scores (P(y>0), P(y>1)).

    Args:
        model_file: Path to the model file.
        embeddings: Numpy array of embeddings.

    Returns:
        expanded_predictions, expanded_probabilities, expanded_label_names.
        The two arrays have shape (n_samples, n_outputs); the name list has
        one entry per output column.

    Raises:
        KeyError: If a label name is neither a single-output category nor a
            key of ``CATEGORIES``.
    """
    model_data = _load_model_data(model_file)
    model = model_data['model']
    label_names = model_data['label_names']

    print("Predicting...")
    raw_predictions = model.predict(embeddings)

    print("Processing predictions...")
    predictions = []
    probabilities = []
    for pred in raw_predictions:
        p_above_0 = pred[:, 0]
        p_above_1 = pred[:, 1]

        # Ordinal class in {0, 1, 2}: count how many thresholds are exceeded.
        pred_class = (p_above_0 > 0.5).astype(int) + (p_above_1 > 0.5).astype(int)
        predictions.append(pred_class)

        # P(y=0) = 1 - P(y>0), P(y=1) = P(y>0) - P(y>1), P(y=2) = P(y>1).
        # The middle term is clamped at 0 because the two cumulative scores
        # are not guaranteed to be monotone, and a negative value would be
        # reported as a negative score.
        prob = np.column_stack([
            1 - p_above_0,
            np.maximum(p_above_0 - p_above_1, 0.0),
            p_above_1,
        ])
        probabilities.append(prob)

    # (n_categories, n_samples) -> (n_samples, n_categories)
    predictions = np.array(predictions).T
    # (n_categories, n_samples, 3) -> (n_samples, n_categories, 3)
    probabilities = np.array(probabilities).transpose(1, 0, 2)

    # Expand predictions to sub-levels.
    expanded_predictions = []
    expanded_probabilities = []
    expanded_label_names = []
    for i, cat in enumerate(label_names):
        # Level 1: binary indicator for class 1 only.
        y_pred_l1 = (predictions[:, i] == 1).astype(int)
        y_proba_l1 = probabilities[:, i, 1]
        # Level 2: binary indicator for any positivity (class 1 or 2).
        # NOTE(review): the level-2 label is paired with P(y > 0) rather than
        # P(y > 1); confirm this matches how the CATEGORIES labels are meant
        # to be read.
        y_pred_l2 = (predictions[:, i] > 0).astype(int)
        y_proba_l2 = 1 - probabilities[:, i, 0]

        if cat in _SINGLE_OUTPUT_CATEGORIES:
            # Only the overall (level 2) output is reported for these.
            expanded_predictions.append(y_pred_l2)
            expanded_probabilities.append(y_proba_l2)
            expanded_label_names.append(cat)
            continue

        try:
            sub_labels = CATEGORIES[cat]
        except KeyError:
            raise KeyError(
                f"Model label {cat!r} has no entry in CATEGORIES"
            ) from None

        expanded_predictions.append(y_pred_l1)
        expanded_probabilities.append(y_proba_l1)
        expanded_label_names.append(sub_labels[0])
        expanded_predictions.append(y_pred_l2)
        expanded_probabilities.append(y_proba_l2)
        expanded_label_names.append(sub_labels[1])

    expanded_predictions = np.array(expanded_predictions).T
    expanded_probabilities = np.array(expanded_probabilities).T
    return expanded_predictions, expanded_probabilities, expanded_label_names
def format_output(predictions: np.ndarray, probabilities: np.ndarray, label_names: List[str]) -> pd.DataFrame:
    """
    Format the output predictions into a DataFrame.

    Only the first sample (row 0) is reported, because the Gradio interface
    processes one text at a time.

    Args:
        predictions: Binary predictions, one row per sample.
        probabilities: Associated prediction scores, one row per sample.
        label_names: List of label names, one per column.

    Returns:
        DataFrame with columns "Label", "Prediction", and "Score". Scores are
        rounded to 4 decimal places. If either input holds no samples, an
        empty DataFrame with the same three columns is returned.
    """
    # Accept any array-like so callers are not forced to pre-convert.
    predictions = np.asarray(predictions)
    probabilities = np.asarray(probabilities)

    if predictions.size == 0 or probabilities.size == 0:
        # No sample to report; indexing row 0 would raise IndexError.
        return pd.DataFrame({"Label": [], "Prediction": [], "Score": []})

    data = {
        "Label": label_names,
        "Prediction": predictions[0].tolist(),
        "Score": np.round(probabilities[0], 4).tolist(),
    }
    return pd.DataFrame(data)
# --- Gradio App Integration ---
# Relative path to the serialized model bundle (adjust as necessary).
MODEL_FILE = "model.joblib"


def classify_text(text: str):
    """
    Given an input text, generates embeddings, runs the model inference,
    and returns a DataFrame of classification results.

    Args:
        text: The raw text to classify. ``None``, an empty string, or a
            whitespace-only string is treated as "no input".

    Returns:
        A ``gr.update`` payload that makes the results table visible and sets
        its value to a DataFrame with columns "Label", "Prediction", "Score"
        (empty when no input was provided).
    """
    # Guard against None as well as blank strings: calling .strip() on None
    # would raise AttributeError.
    if not text or not text.strip():
        empty_df = pd.DataFrame({"Label": [], "Prediction": [], "Score": []})
        return gr.update(value=empty_df, visible=True)

    # The embedding helper takes a list, so wrap the single text.
    embeddings = get_embeddings([text])
    # Run inference on the embeddings using the configured model file.
    predictions, probabilities, label_names = run_model(MODEL_FILE, embeddings)
    # Format the results to a DataFrame that Gradio can display.
    df = format_output(predictions, probabilities, label_names)
    return gr.update(value=df, visible=True)
+with gr.Blocks(title="Zoo Entry 001 - Updated Inference") as iface:
     input_text = gr.Textbox(lines=5, label="Input Text")
     submit_btn = gr.Button("Submit")
     output_table = gr.DataFrame(label="Classification Results", visible=False)