Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,74 +1,166 @@
|
|
1 |
import os
|
2 |
import gradio as gr
|
3 |
import joblib
|
4 |
-
import logfire
|
5 |
import numpy as np
|
6 |
import pandas as pd
|
7 |
from openai import OpenAI
|
8 |
-
from
|
9 |
|
10 |
-
#
|
11 |
-
logfire.configure(token=os.getenv("LOGFIRE_API_KEY"))
|
12 |
-
logfire.instrument_pydantic()
|
13 |
|
14 |
-
#
|
15 |
-
|
16 |
-
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
-
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
-
def
|
32 |
"""
|
33 |
-
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
"""
|
36 |
-
text
|
37 |
-
|
38 |
-
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
def classify_text(text: str):
|
42 |
"""
|
43 |
-
|
44 |
-
and
|
45 |
"""
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
# Optionally log the results (this doesn't affect the output)
|
52 |
-
Results(
|
53 |
-
text=text,
|
54 |
-
hateful=rounded_probs[0],
|
55 |
-
insults=rounded_probs[1],
|
56 |
-
sexual=rounded_probs[2],
|
57 |
-
violence=rounded_probs[3],
|
58 |
-
self_harm=rounded_probs[4],
|
59 |
-
aom=rounded_probs[5],
|
60 |
-
)
|
61 |
|
62 |
-
#
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
})
|
68 |
|
|
|
|
|
69 |
return gr.update(value=df, visible=True)
|
70 |
|
71 |
-
with gr.Blocks(title="Zoo Entry 001") as iface:
|
72 |
input_text = gr.Textbox(lines=5, label="Input Text")
|
73 |
submit_btn = gr.Button("Submit")
|
74 |
output_table = gr.DataFrame(label="Classification Results", visible=False)
|
|
|
1 |
import os
|
2 |
import gradio as gr
|
3 |
import joblib
|
|
|
4 |
import numpy as np
|
5 |
import pandas as pd
|
6 |
from openai import OpenAI
|
7 |
+
from typing import List, Dict, Any
|
8 |
|
9 |
+
# --- New Inference Code Components ---
|
|
|
|
|
10 |
|
11 |
+
# Mapping from each top-level category to the label names reported for it.
# Categories with two entries list the level-1 label first and the level-2
# label second; single-entry categories expose one overall label.
CATEGORIES = {
    "hateful": [
        "level_1_discriminatory",
        "level_2_hate_speech",
    ],
    "insults": [
        "insults",
    ],
    "sexual": [
        "level_1_not_appropriate_for_minors",
        "level_2_not_appropriate_for_all_ages",
    ],
    "physical_violence": [
        "physical_violence",
    ],
    "self_harm": [
        "level_1_self_harm_intent",
        "level_2_self_harm_action",
    ],
    "all_other_misconduct": [
        "level_1_not_socially_accepted",
        "level_2_illegal_activities",
    ],
}
|
20 |
|
21 |
+
def get_embeddings(texts: List[str], model: str = "text-embedding-3-large") -> np.ndarray:
    """
    Generate embeddings for a list of texts using the OpenAI API synchronously.

    Args:
        texts: List of strings to embed.
        model: The OpenAI embedding model to use.

    Returns:
        A numpy array with one embedding per input text, in the order the
        API returns them. An empty ``texts`` list yields an empty array of
        shape (0, 0) without contacting the API.
    """
    if not texts:
        # Nothing to embed: skip the network round trip entirely.
        return np.empty((0, 0))

    # The cap is applied per *character*, not per token; no tokenizer is
    # used here, so this is only a cheap approximation of the limit.
    # NOTE(review): 8191 was labelled as the model's token limit; character
    # slicing does not strictly guarantee it -- confirm whether token-level
    # truncation is required.
    max_chars = 8191
    truncated_texts = [text[:max_chars] for text in texts]

    # The API key is read from the OPENAI_API_KEY environment variable.
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    response = client.embeddings.create(
        input=truncated_texts,
        model=model,
    )

    return np.array([item.embedding for item in response.data])
|
43 |
|
44 |
+
# In-process cache of loaded model bundles, keyed by file path, so the bundle
# is deserialized once instead of on every inference call.
_MODEL_CACHE: Dict[str, Any] = {}


def _load_model_data(model_file: str) -> Dict[str, Any]:
    """Return the joblib bundle stored at *model_file*, loading it at most once."""
    if model_file not in _MODEL_CACHE:
        print("Loading model...")
        _MODEL_CACHE[model_file] = joblib.load(model_file)
    return _MODEL_CACHE[model_file]


def _ordinal_to_classes(pred: np.ndarray):
    """Convert one category's (P(y>0), P(y>1)) columns into classes and class probabilities."""
    above_0 = pred[:, 0]
    above_1 = pred[:, 1]

    # Each threshold that is exceeded bumps the class by one: {0, 1, 2}.
    pred_class = (above_0 > 0.5).astype(int) + (above_1 > 0.5).astype(int)

    # P(y=0) = 1 - P(y>0), P(y=1) = P(y>0) - P(y>1), P(y=2) = P(y>1)
    prob = np.zeros((len(pred), 3))
    prob[:, 0] = 1 - above_0
    prob[:, 1] = above_0 - above_1
    prob[:, 2] = above_1
    return pred_class, prob


def run_model(model_file: str, embeddings: np.ndarray):
    """
    Run the model on the embeddings.

    The bundle at ``model_file`` must provide a ``'model'`` entry (an object
    with ``predict``) and a ``'label_names'`` entry. The bundle is cached in
    memory after the first load, so later calls with the same path do not
    touch the disk again.

    Args:
        model_file: Path to the model file.
        embeddings: Numpy array of embeddings.

    Returns:
        expanded_predictions, expanded_probabilities, expanded_label_names.
        The two arrays have one row per sample and one column per entry in
        ``expanded_label_names``.

    Raises:
        KeyError: If a label name is neither one of the single-output
            categories nor a key of ``CATEGORIES``.
    """
    model_data = _load_model_data(model_file)
    model = model_data['model']
    label_names = model_data['label_names']

    print("Predicting...")
    # raw_predictions is a list of arrays with shape (n_samples, 2)
    raw_predictions = model.predict(embeddings)

    print("Processing predictions...")
    predictions = []
    probabilities = []
    for pred in raw_predictions:
        pred_class, prob = _ordinal_to_classes(pred)
        predictions.append(pred_class)
        probabilities.append(prob)

    # Reorient to sample-major: (n_samples, n_categories[, 3]).
    predictions = np.array(predictions).T
    probabilities = np.array(probabilities).transpose(1, 0, 2)

    # Expand predictions to sub-levels
    expanded_predictions = []
    expanded_probabilities = []
    expanded_label_names = []
    for i, cat in enumerate(label_names):
        # Level 1: binary indicator for class 1 only.
        y_pred_l1 = (predictions[:, i] == 1).astype(int)
        y_proba_l1 = probabilities[:, i, 1]

        # Level 2: binary indicator for any positivity (class 1 or 2).
        y_pred_l2 = (predictions[:, i] > 0).astype(int)
        y_proba_l2 = 1 - probabilities[:, i, 0]

        # For certain categories, only the overall (level 2) output is used.
        if cat in ['binary', 'insults', 'physical_violence']:
            expanded_predictions.append(y_pred_l2)
            expanded_probabilities.append(y_proba_l2)
            expanded_label_names.append(cat)
        else:
            level_1_name, level_2_name = CATEGORIES[cat][0], CATEGORIES[cat][1]

            expanded_predictions.append(y_pred_l1)
            expanded_probabilities.append(y_proba_l1)
            expanded_label_names.append(level_1_name)

            expanded_predictions.append(y_pred_l2)
            expanded_probabilities.append(y_proba_l2)
            expanded_label_names.append(level_2_name)

    expanded_predictions = np.array(expanded_predictions).T
    expanded_probabilities = np.array(expanded_probabilities).T

    return expanded_predictions, expanded_probabilities, expanded_label_names
|
117 |
|
118 |
+
def format_output(predictions: np.ndarray, probabilities: np.ndarray, label_names: List[str]) -> pd.DataFrame:
    """
    Format the output predictions into a DataFrame.

    Only the first sample (row 0) of ``predictions`` and ``probabilities``
    is reported, because the Gradio interface processes one text at a time.

    Args:
        predictions: Binary predictions, one row per sample.
        probabilities: Associated prediction scores, one row per sample.
        label_names: List of label names, one per column.

    Returns:
        DataFrame with columns "Label", "Prediction", and "Score"; scores are
        rounded to 4 decimal places. If there are no samples, an empty frame
        with those same columns is returned instead of raising IndexError.
    """
    if len(predictions) == 0 or len(probabilities) == 0:
        # No sample to report: give the caller a well-formed, empty table.
        return pd.DataFrame({"Label": [], "Prediction": [], "Score": []})

    first_predictions = predictions[0]
    first_scores = np.round(probabilities[0], 4)
    return pd.DataFrame(
        {
            "Label": label_names,
            "Prediction": first_predictions.tolist(),
            "Score": first_scores.tolist(),
        }
    )
|
137 |
+
|
138 |
+
# --- Gradio App Integration ---
|
139 |
+
|
140 |
+
# Define model file path (adjust as necessary)
MODEL_FILE = "model.joblib"


def classify_text(text: str):
    """
    Given an input text, generates embeddings, runs the model inference,
    and returns a Gradio update carrying a DataFrame of classification results.

    Args:
        text: The text to classify. ``None``, an empty string, or a
            whitespace-only string is treated as "no input".

    Returns:
        The result of ``gr.update(value=<DataFrame>, visible=True)``. For
        missing input the DataFrame is empty but keeps the "Label",
        "Prediction" and "Score" columns.
    """
    # Guard against None as well as blank text so .strip() cannot raise.
    if text is None or not text.strip():
        empty_df = pd.DataFrame({"Label": [], "Prediction": [], "Score": []})
        return gr.update(value=empty_df, visible=True)

    # Obtain embeddings (input must be a list)
    embeddings = get_embeddings([text])

    # Run inference on the embeddings using the configured model file
    predictions, probabilities, label_names = run_model(MODEL_FILE, embeddings)

    # Format the results to a DataFrame that Gradio can display
    df = format_output(predictions, probabilities, label_names)
    return gr.update(value=df, visible=True)
|
162 |
|
163 |
+
with gr.Blocks(title="Zoo Entry 001 - Updated Inference") as iface:
|
164 |
input_text = gr.Textbox(lines=5, label="Input Text")
|
165 |
submit_btn = gr.Button("Submit")
|
166 |
output_table = gr.DataFrame(label="Classification Results", visible=False)
|