gabrielchua committed on
Commit
4c7a460
·
verified ·
1 Parent(s): db5b988

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +142 -50
app.py CHANGED
@@ -1,74 +1,166 @@
1
  import os
2
  import gradio as gr
3
  import joblib
4
- import logfire
5
  import numpy as np
6
  import pandas as pd
7
  from openai import OpenAI
8
- from pydantic import BaseModel
9
 
10
- # Configure logging
11
- logfire.configure(token=os.getenv("LOGFIRE_API_KEY"))
12
- logfire.instrument_pydantic()
13
 
14
- # Load pre-trained model and label names
15
- model_data = joblib.load("model.joblib")
16
- model = model_data["model"]
17
- label_names = model_data["label_names"]
 
 
 
 
 
18
 
19
- class Results(BaseModel):
20
- text: str
21
- hateful: float
22
- insults: float
23
- sexual: float
24
- violence: float
25
- self_harm: float
26
- aom: float
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
- # Initialize OpenAI client
29
- client = OpenAI()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
- def get_embedding(text: str, embedding_model: str = "text-embedding-3-large") -> np.ndarray:
32
  """
33
- Get embedding for the input text from OpenAI.
34
- Replaces newlines with spaces before calling the API.
 
 
 
 
 
 
 
35
  """
36
- text = text.replace("\n", " ")
37
- response = client.embeddings.create(input=[text], model=embedding_model)
38
- embedding = response.data[0].embedding
39
- return np.array(embedding)
 
 
 
 
 
 
 
 
40
 
41
  def classify_text(text: str):
42
  """
43
- Get the OpenAI embedding for the provided text, classify it using your model,
44
- and return a DataFrame with the rounded probabilities and binary predictions.
45
  """
46
- embedding = get_embedding(text)
47
- X = embedding.reshape(1, -1)
48
- probabilities = model.predict(X)
49
- rounded_probs = np.round(probabilities[0], 4)
50
-
51
- # Optionally log the results (this doesn't affect the output)
52
- Results(
53
- text=text,
54
- hateful=rounded_probs[0],
55
- insults=rounded_probs[1],
56
- sexual=rounded_probs[2],
57
- violence=rounded_probs[3],
58
- self_harm=rounded_probs[4],
59
- aom=rounded_probs[5],
60
- )
61
 
62
- # Create DataFrame with rounded probabilities and binary predictions
63
- df = pd.DataFrame({
64
- "Label": label_names,
65
- "Probability": rounded_probs,
66
- "Prediction": (rounded_probs > 0.5).astype(int)
67
- })
68
 
 
 
69
  return gr.update(value=df, visible=True)
70
 
71
- with gr.Blocks(title="Zoo Entry 001") as iface:
72
  input_text = gr.Textbox(lines=5, label="Input Text")
73
  submit_btn = gr.Button("Submit")
74
  output_table = gr.DataFrame(label="Classification Results", visible=False)
 
1
  import os
2
  import gradio as gr
3
  import joblib
 
4
  import numpy as np
5
  import pandas as pd
6
  from openai import OpenAI
7
+ from typing import List, Dict, Any
8
 
9
+ # --- New Inference Code Components ---
 
 
10
 
11
# Harm categories mapped to the names of their output sub-labels.
# A one-element list means the category is reported under a single label;
# a two-element list holds the level-1 name followed by the level-2 name.
CATEGORIES: Dict[str, List[str]] = {
    "hateful": [
        "level_1_discriminatory",
        "level_2_hate_speech",
    ],
    "insults": [
        "insults",
    ],
    "sexual": [
        "level_1_not_appropriate_for_minors",
        "level_2_not_appropriate_for_all_ages",
    ],
    "physical_violence": [
        "physical_violence",
    ],
    "self_harm": [
        "level_1_self_harm_intent",
        "level_2_self_harm_action",
    ],
    "all_other_misconduct": [
        "level_1_not_socially_accepted",
        "level_2_illegal_activities",
    ],
}
20
 
21
def get_embeddings(texts: List[str], model: str = "text-embedding-3-large") -> np.ndarray:
    """
    Generate embeddings for a list of texts using the OpenAI API synchronously.

    Args:
        texts: List of strings to embed.
        model: The OpenAI embedding model to use.

    Returns:
        A numpy array with one embedding per input text, in the order the API
        returned them. An empty `texts` list yields an array of shape (0, 0)
        and no API request is made.
    """
    if not texts:
        # Nothing to embed: skip the network round trip entirely.
        return np.empty((0, 0))

    # NOTE: this is a *character* cap, not a token cap. Slicing a string by
    # 8191 characters is only a cheap heuristic for staying near the model's
    # input limit; text that tokenizes to more than one token per character
    # can still exceed it. Exact truncation would need a tokenizer.
    max_chars = 8191
    truncated_texts = [text[:max_chars] for text in texts]

    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    response = client.embeddings.create(
        input=truncated_texts,
        model=model,
    )

    return np.array([item.embedding for item in response.data])
43
 
44
# Loaded model bundles keyed by file path, so repeated requests do not
# re-read and re-deserialize the same file.
_MODEL_CACHE: Dict[str, Any] = {}


def _load_model_data(model_file: str) -> Any:
    """Return the joblib bundle stored at `model_file`, loading it at most once."""
    if model_file not in _MODEL_CACHE:
        print("Loading model...")
        # NOTE(security): joblib.load unpickles arbitrary Python objects.
        # Only point this at model files from a trusted source.
        _MODEL_CACHE[model_file] = joblib.load(model_file)
    return _MODEL_CACHE[model_file]


def run_model(model_file: str, embeddings: np.ndarray):
    """
    Run the model on the embeddings.

    Args:
        model_file: Path to the model file. The file must contain a mapping
            with the keys 'model' and 'label_names'.
        embeddings: Numpy array of embeddings.

    Returns:
        expanded_predictions, expanded_probabilities, expanded_label_names

        The two arrays have one row per sample and one column per entry in
        expanded_label_names.

    Raises:
        KeyError: If a two-level label from the model file has no entry in
            CATEGORIES.
    """
    model_data = _load_model_data(model_file)
    model = model_data['model']
    label_names = model_data['label_names']

    print("Predicting...")
    # raw_predictions is a list of arrays with shape (n_samples, 2)
    raw_predictions = model.predict(embeddings)

    print("Processing predictions...")
    predictions = []
    probabilities = []
    # Process each category's raw predictions
    for pred in raw_predictions:
        p_gt0 = pred[:, 0]  # P(y > 0)
        p_gt1 = pred[:, 1]  # P(y > 1)

        # Convert (P(y>0), P(y>1)) into a class from {0, 1, 2} by counting
        # how many of the two thresholds are crossed.
        pred_class = (p_gt0 > 0.5).astype(int) + (p_gt1 > 0.5).astype(int)
        predictions.append(pred_class)

        # P(y=0) = 1 - P(y>0), P(y=1) = P(y>0) - P(y>1), P(y=2) = P(y>1)
        prob = np.zeros((len(pred), 3))
        prob[:, 0] = 1 - p_gt0
        prob[:, 1] = p_gt0 - p_gt1
        prob[:, 2] = p_gt1
        probabilities.append(prob)

    # Reorder to sample-major layout: (n_samples, n_categories[, 3]).
    predictions = np.array(predictions).T
    probabilities = np.array(probabilities).transpose(1, 0, 2)

    # Expand predictions to sub-levels
    expanded_predictions = []
    expanded_probabilities = []
    expanded_label_names = []
    for i, cat in enumerate(label_names):
        # NOTE(review): level-1 outputs fire only for class 1, while level-2
        # outputs fire for class 1 or 2. Confirm this matches how the
        # level_1_* / level_2_* labels were defined when the model was trained.

        # Level 1: binary indicator for class 1 only.
        y_pred_l1 = (predictions[:, i] == 1).astype(int)
        y_proba_l1 = probabilities[:, i, 1]  # Probability for class 1

        # Level 2: binary indicator for any positivity (class 1 or 2)
        y_pred_l2 = (predictions[:, i] > 0).astype(int)
        y_proba_l2 = 1 - probabilities[:, i, 0]  # 1 - probability for class 0

        # For certain categories, only the overall (level 2) output is used
        if cat in ('binary', 'insults', 'physical_violence'):
            expanded_predictions.append(y_pred_l2)
            expanded_probabilities.append(y_proba_l2)
            expanded_label_names.append(cat)
            continue

        if cat not in CATEGORIES:
            raise KeyError(
                f"Label {cat!r} from {model_file!r} has no entry in CATEGORIES"
            )
        level_1_name, level_2_name = CATEGORIES[cat][0], CATEGORIES[cat][1]

        expanded_predictions.append(y_pred_l1)
        expanded_probabilities.append(y_proba_l1)
        expanded_label_names.append(level_1_name)

        expanded_predictions.append(y_pred_l2)
        expanded_probabilities.append(y_proba_l2)
        expanded_label_names.append(level_2_name)

    expanded_predictions = np.array(expanded_predictions).T
    expanded_probabilities = np.array(expanded_probabilities).T

    return expanded_predictions, expanded_probabilities, expanded_label_names
117
 
118
def format_output(predictions: np.ndarray, probabilities: np.ndarray, label_names: List[str]) -> pd.DataFrame:
    """
    Format the output predictions into a DataFrame.

    Only the first sample (row 0) is formatted, because the Gradio interface
    processes one text at a time.

    Args:
        predictions: Binary predictions, one row per sample.
        probabilities: Associated prediction scores, one row per sample.
        label_names: List of label names, one per column of the arrays.

    Returns:
        DataFrame with columns "Label", "Prediction", and "Score". Scores are
        rounded to 4 decimal places. If either array is empty, an empty
        DataFrame with the same three columns is returned.
    """
    predictions = np.asarray(predictions)
    probabilities = np.asarray(probabilities)

    if predictions.size == 0 or probabilities.size == 0:
        # No sample to show: return an empty table instead of failing on [0].
        return pd.DataFrame({"Label": [], "Prediction": [], "Score": []})

    return pd.DataFrame({
        "Label": list(label_names),
        "Prediction": predictions[0].tolist(),
        "Score": np.round(probabilities[0], 4).tolist(),
    })
137
+
138
+ # --- Gradio App Integration ---
139
+
140
# Define model file path (adjust as necessary)
MODEL_FILE = "model.joblib"


def classify_text(text: str):
    """
    Given an input text, generates embeddings, runs the model inference,
    and returns a Gradio update carrying a DataFrame of classification results.

    Args:
        text: The text to classify. None, empty, or whitespace-only input
            produces an empty results table and skips inference.

    Returns:
        The result of gr.update(...) with the results DataFrame as its value
        and visible=True.
    """
    # Guard against None as well as blank strings; calling .strip() on None
    # would raise AttributeError.
    if not text or not text.strip():
        # Return an empty DataFrame if no text provided
        empty_df = pd.DataFrame({"Label": [], "Prediction": [], "Score": []})
        return gr.update(value=empty_df, visible=True)

    # Obtain embeddings (input must be a list)
    embeddings = get_embeddings([text])

    # Run inference on the embeddings using the new model file
    predictions, probabilities, label_names = run_model(MODEL_FILE, embeddings)

    # Format the results to a DataFrame that Gradio can display
    df = format_output(predictions, probabilities, label_names)
    return gr.update(value=df, visible=True)
162
 
163
+ with gr.Blocks(title="Zoo Entry 001 - Updated Inference") as iface:
164
  input_text = gr.Textbox(lines=5, label="Input Text")
165
  submit_btn = gr.Button("Submit")
166
  output_table = gr.DataFrame(label="Classification Results", visible=False)