Update app.py
app.py CHANGED
@@ -74,18 +74,18 @@ def train_ui_tars(file):
         # Step 1: Load and preprocess the uploaded JSON file
         with open(file.name, "r", encoding="utf-8") as f:
             raw_data = json.load(f)
-
+
         # Extract training pairs or use flat structure
         training_data = raw_data.get("training_pairs", raw_data)
-
+
         # Save fixed JSON to avoid issues
         fixed_json_path = "fixed_fraud_data.json"
         with open(fixed_json_path, "w", encoding="utf-8") as f:
             json.dump(training_data, f, indent=4)
-
+
         # Load dataset
         dataset = datasets.load_dataset("json", data_files=fixed_json_path)
-
+
         # Step 2: Tokenize dataset with Llama-compatible context length
         def tokenize_data(example):
             # Format input for Llama (instruction-following style)
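
For context on the hunk above: train_ui_tars reads the upload through raw_data.get("training_pairs", raw_data), and the UI later in this file describes the records as 'input'/'output' pairs. A minimal sketch of a compatible upload, built from Python; the record values are invented placeholders, and any schema beyond those two keys is an assumption:

import json

# Hypothetical upload for the fine-tuning UI. The "training_pairs" wrapper is
# optional, since the loader falls back to the flat structure; the records
# themselves are made-up placeholders, not real fraud data.
example = {
    "training_pairs": [
        {"input": "Transaction: $9,800 wire transfer to a new overseas account at 3 AM.",
         "output": "fraud"},
        {"input": "Transaction: $42 grocery purchase at the cardholder's usual store.",
         "output": "legitimate"},
    ]
}

with open("fraud_data.json", "w", encoding="utf-8") as f:
    json.dump(example, f, indent=4)
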
@@ -99,9 +99,9 @@ def train_ui_tars(file):
             )
             inputs["labels"] = inputs["input_ids"].clone()
             return {k: v.squeeze(0) for k, v in inputs.items()}
-
+
         tokenized_dataset = dataset["train"].map(tokenize_data, batched=True, remove_columns=dataset["train"].column_names)
-
+
         # Step 3: Training setup
         training_args = TrainingArguments(
             output_dir="./fine_tuned_llama",
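
The prompt-building lines of tokenize_data (original lines 92-98) fall between the hunks and are not part of this diff, so the template, context length, and tokenizer arguments below are assumptions rather than the actual code. This is only a sketch of what an instruction-style Llama tokenization step commonly looks like, kept consistent with the visible tail (labels cloned from input_ids, batch dimension squeezed):

# Sketch only; assumes `tokenizer` is the Llama tokenizer loaded earlier in app.py.
def tokenize_data(example):
    # Hypothetical instruction-style prompt; the real template is in the lines
    # elided from the diff above.
    prompt = f"### Instruction:\n{example['input']}\n\n### Response:\n{example['output']}"
    inputs = tokenizer(
        prompt,
        max_length=2048,        # assumed context length
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    inputs["labels"] = inputs["input_ids"].clone()
    return {k: v.squeeze(0) for k, v in inputs.items()}
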
@@ -120,7 +120,7 @@ def train_ui_tars(file):
             optim="adamw_torch",
             warmup_steps=100,
         )
-
+
         # Custom data collator for Llama
         def custom_data_collator(features):
             batch = {
@@ -136,16 +136,16 @@ def train_ui_tars(file):
             train_dataset=tokenized_dataset,
             data_collator=custom_data_collator,
         )
-
+
         # Step 4: Start training
         trainer.train()
-
+
         # Step 5: Save the model
         model.save_pretrained("./fine_tuned_llama")
         tokenizer.save_pretrained("./fine_tuned_llama")
-
+
         return "Training completed successfully! Model saved to ./fine_tuned_llama"
-
+
     except Exception as e:
         return f"Error: {str(e)}"
 
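
The body of custom_data_collator (original lines 127-135) and the opening keywords of the Trainer(...) call sit between the two hunks above and are not shown. As an illustration only, a collator of this shape usually stacks the per-example tensors into a batch, and the Trainer is assembled from the pieces the diff already defines; the stacked keys and the model=/args= keywords below are assumptions:

import torch
from transformers import Trainer

# Hypothetical collator body: stack per-example tensors into batch tensors.
def custom_data_collator(features):
    batch = {
        "input_ids": torch.stack([torch.as_tensor(f["input_ids"]) for f in features]),
        "attention_mask": torch.stack([torch.as_tensor(f["attention_mask"]) for f in features]),
        "labels": torch.stack([torch.as_tensor(f["labels"]) for f in features]),
    }
    return batch

# Hypothetical Trainer assembly; `model` is assumed to be the Llama model
# loaded earlier in app.py.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=custom_data_collator,
)
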
@@ -153,11 +153,11 @@ def train_ui_tars(file):
 with gr.Blocks(title="Model Fine-Tuning Interface") as demo:
     gr.Markdown("# Llama Fraud Detection Fine-Tuning UI")
     gr.Markdown("Upload a JSON file with 'input' and 'output' pairs to fine-tune the Llama model on your fraud dataset.")
-
+
     file_input = gr.File(label="Upload Fraud Dataset (JSON)")
     train_button = gr.Button("Start Fine-Tuning")
     output = gr.Textbox(label="Training Status")
-
+
     train_button.click(fn=train_ui_tars, inputs=file_input, outputs=output)
 
 demo.launch()