Cylanoid committed on
Commit cffe234 · verified · 1 Parent(s): ba6c89d

Update app.py

Files changed (1)
  1. app.py +13 -13
app.py CHANGED
@@ -74,18 +74,18 @@ def train_ui_tars(file):
        # Step 1: Load and preprocess the uploaded JSON file
        with open(file.name, "r", encoding="utf-8") as f:
            raw_data = json.load(f)
-
+
        # Extract training pairs or use flat structure
        training_data = raw_data.get("training_pairs", raw_data)
-
+
        # Save fixed JSON to avoid issues
        fixed_json_path = "fixed_fraud_data.json"
        with open(fixed_json_path, "w", encoding="utf-8") as f:
            json.dump(training_data, f, indent=4)
-
+
        # Load dataset
        dataset = datasets.load_dataset("json", data_files=fixed_json_path)
-
+
        # Step 2: Tokenize dataset with Llama-compatible context length
        def tokenize_data(example):
            # Format input for Llama (instruction-following style)
@@ -99,9 +99,9 @@ def train_ui_tars(file):
            )
            inputs["labels"] = inputs["input_ids"].clone()
            return {k: v.squeeze(0) for k, v in inputs.items()}
-
+
        tokenized_dataset = dataset["train"].map(tokenize_data, batched=True, remove_columns=dataset["train"].column_names)
-
+
        # Step 3: Training setup
        training_args = TrainingArguments(
            output_dir="./fine_tuned_llama",
@@ -120,7 +120,7 @@ def train_ui_tars(file):
            optim="adamw_torch",
            warmup_steps=100,
        )
-
+
        # Custom data collator for Llama
        def custom_data_collator(features):
            batch = {
@@ -136,16 +136,16 @@ def train_ui_tars(file):
            train_dataset=tokenized_dataset,
            data_collator=custom_data_collator,
        )
-
+
        # Step 4: Start training
        trainer.train()
-
+
        # Step 5: Save the model
        model.save_pretrained("./fine_tuned_llama")
        tokenizer.save_pretrained("./fine_tuned_llama")
-
+
        return "Training completed successfully! Model saved to ./fine_tuned_llama"
-
+
    except Exception as e:
        return f"Error: {str(e)}"

@@ -153,11 +153,11 @@ def train_ui_tars(file):
with gr.Blocks(title="Model Fine-Tuning Interface") as demo:
    gr.Markdown("# Llama Fraud Detection Fine-Tuning UI")
    gr.Markdown("Upload a JSON file with 'input' and 'output' pairs to fine-tune the Llama model on your fraud dataset.")
-
+
    file_input = gr.File(label="Upload Fraud Dataset (JSON)")
    train_button = gr.Button("Start Fine-Tuning")
    output = gr.Textbox(label="Training Status")
-
+
    train_button.click(fn=train_ui_tars, inputs=file_input, outputs=output)

demo.launch()
 
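For reference, train_ui_tars above accepts the uploaded JSON either as a flat list of records or wrapped in a top-level "training_pairs" key, with "input" and "output" fields per pair. A minimal sketch of such a file, generated from Python; the example values are purely illustrative and only the key names come from app.py:

# Sketch of the upload format assumed by train_ui_tars (illustrative values only;
# the "training_pairs"/"input"/"output" keys are the ones app.py reads).
import json

example = {
    "training_pairs": [
        {
            "input": "Transaction record or case description to classify",
            "output": "Expected fraud / not-fraud label or explanation",
        }
    ]
}

# Written with the same encoding and indent conventions used in app.py.
with open("fraud_data.json", "w", encoding="utf-8") as f:
    json.dump(example, f, indent=4)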