Cylanoid committed on
Commit 9773d48 · 2 parents: 8a0527c, c1c1cb3

Merge branch 'main' of https://huggingface.co/spaces/Cylanoid/Nursing-Home-Fraud-Detection-using-Llama

Files changed (2):
  1. requirements.txt +2 -1
  2. train_llama.py +17 -11
requirements.txt CHANGED
@@ -7,4 +7,5 @@ peft==0.14.0
 bitsandbytes
 sentencepiece
 huggingface_hub>=0.19
-accelerate
+accelerate
+flash-attn
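
The substantive change to requirements.txt is the new flash-attn dependency; accelerate is removed and re-added, most likely because the old file lacked a trailing newline. flash-attn generally needs a CUDA toolchain to install, so a guarded check keeps the script usable when the wheel is missing. A minimal sketch of such a guard, not part of this commit; the "sdpa" fallback and the MODEL_ID value are assumptions for illustration:

import importlib.util

import torch
from transformers import BitsAndBytesConfig, LlamaForCausalLM

MODEL_ID = "meta-llama/Llama-2-7b-hf"  # hypothetical; train_llama.py defines its own MODEL_ID

# Use FlashAttention 2 only if the flash_attn package is importable,
# otherwise fall back to PyTorch's scaled-dot-product attention.
attn_impl = "flash_attention_2" if importlib.util.find_spec("flash_attn") else "sdpa"

model = LlamaForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    attn_implementation=attn_impl,
)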
train_llama.py CHANGED
@@ -22,12 +22,13 @@ if tokenizer.pad_token is None:
 # Quantization config
 quantization_config = BitsAndBytesConfig(load_in_8bit=True)
 
-# Load model without FlashAttention
+# Load model with FlashAttention 2
 model = LlamaForCausalLM.from_pretrained(
     MODEL_ID,
     torch_dtype=torch.bfloat16,
     device_map="auto",
-    quantization_config=quantization_config
+    quantization_config=quantization_config,
+    attn_implementation="flash_attention_2"
 )
 
 # Prepare for LoRA
@@ -43,16 +44,15 @@ model.print_trainable_parameters()
 dataset = datasets.load_dataset("json", data_files="final_combined_fraud_data.json", field="training_pairs")
 print("First example from dataset:", dataset["train"][0])
 
-# Tokenization with fixed length
+# Tokenization with lists (no tensors)
 def tokenize_data(example):
     formatted_text = f"{example['input']} {example['output']}"
-    inputs = tokenizer(formatted_text, truncation=True, max_length=512, padding="max_length", return_tensors="pt")
-    input_ids = inputs["input_ids"].squeeze(0).tolist()
-    attention_mask = inputs["attention_mask"].squeeze(0).tolist()
+    inputs = tokenizer(formatted_text, truncation=True, max_length=2048)
+    input_ids = inputs["input_ids"]
+    attention_mask = inputs["attention_mask"]
     labels = input_ids.copy()
     input_len = len(tokenizer(example['input'])["input_ids"])
     labels[:input_len] = [-100] * input_len
-    print(f"Debug: input_ids[:5] = {input_ids[:5]}, labels[:5] = {labels[:5]}, attention_mask[:5] = {attention_mask[:5]}")
     return {
         "input_ids": input_ids,
         "labels": labels,
@@ -60,18 +60,24 @@ def tokenize_data(example):
     }
 
 tokenized_dataset = dataset["train"].map(tokenize_data, batched=False, remove_columns=dataset["train"].column_names)
+# Print first example (lists with lengths)
 first_example = tokenized_dataset[0]
 print("First tokenized example:", {k: (type(v), len(v)) for k, v in first_example.items()})
 
-# Data collator with tensor stacking
+# Data collator: convert lists to tensors and pad
 def custom_data_collator(features):
     input_ids = [torch.tensor(f["input_ids"]) for f in features]
     attention_mask = [torch.tensor(f["attention_mask"]) for f in features]
     labels = [torch.tensor(f["labels"]) for f in features]
+
+    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
+    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
+    labels = pad_sequence(labels, batch_first=True, padding_value=-100)
+
     return {
-        "input_ids": torch.stack(input_ids),
-        "attention_mask": torch.stack(attention_mask),
-        "labels": torch.stack(labels)
+        "input_ids": input_ids,
+        "attention_mask": attention_mask,
+        "labels": labels
     }
 
 # Accelerator and training
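
The rewritten collator pads variable-length examples to the longest sequence in each batch, matching the tokenizer change above (no more fixed max_length=512 padding), and the debug print is dropped from tokenize_data. Note that pad_sequence does not appear in the imports shown in this diff, so from torch.nn.utils.rnn import pad_sequence presumably already exists (or still needs to be added) elsewhere in train_llama.py. A self-contained sketch of the same collator logic, with a hypothetical pad token id standing in for tokenizer.pad_token_id:

import torch
from torch.nn.utils.rnn import pad_sequence

PAD_TOKEN_ID = 0  # hypothetical stand-in for tokenizer.pad_token_id

def custom_data_collator(features):
    # Convert the per-example lists produced by tokenize_data into 1-D tensors.
    input_ids = [torch.tensor(f["input_ids"]) for f in features]
    attention_mask = [torch.tensor(f["attention_mask"]) for f in features]
    labels = [torch.tensor(f["labels"]) for f in features]

    # Right-pad every sequence to the longest one in the batch.
    return {
        "input_ids": pad_sequence(input_ids, batch_first=True, padding_value=PAD_TOKEN_ID),
        "attention_mask": pad_sequence(attention_mask, batch_first=True, padding_value=0),
        "labels": pad_sequence(labels, batch_first=True, padding_value=-100),
    }

# Two examples of different lengths collate into a (2, 5) batch.
batch = custom_data_collator([
    {"input_ids": [5, 6, 7], "attention_mask": [1, 1, 1], "labels": [-100, 6, 7]},
    {"input_ids": [5, 6, 7, 8, 9], "attention_mask": [1, 1, 1, 1, 1], "labels": [-100, -100, 7, 8, 9]},
])
print({k: tuple(v.shape) for k, v in batch.items()})  # every value has shape (2, 5)

Padding labels with -100 keeps the padded positions out of the loss; it is the same ignore index tokenize_data uses to mask the prompt tokens.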