In [43]:
from datasets import load_dataset

data_set = load_dataset("json", data_files="data3.json", split="train")
# convert data_set details field to string
#
# data_set = data_set.map(lambda x: {"details": str(x["details"])})
data_set = data_set.train_test_split(test_size=0.1)
# print(data_set.data["train"][0])
# print(type(data_set.data["train"]))

In [44]:
from transformers import AutoTokenizer
checkpoint = "flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
prefix = """extract the calendar event details from the following message. The details should be specified in the following json format:
{
    "datetime": "2024-03-12T12:00:00",
    "description": "Lunch meeting",
    "location": "Italian restaurant on Main Street"
}
"""
prefix = ""

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["message"]]
    target = [doc for doc in examples["details"]]
    model_inputs = tokenizer(inputs, text_target=target, max_length=1024, truncation=True, padding="max_length") 

    # labels = tokenizer(text_target=examples["details"], max_length=128, truncation=True, padding="max_length")

    # model_inputs["labels"] = labels["input_ids"]
    return model_inputs


OSError: flan-t5-small is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
tokenized_data_set = data_set.map(preprocess_function, batched=True)

Map:   0%|          | 0/69 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorForSeq2Seq

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [None]:
import evaluate

In [None]:
rouge = evaluate.load("rouge")

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred.predictions, eval_pred.label_ids
    predicted_strings = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    actual_strings = tokenizer.batch_decode(labels, skip_special_tokens=True)

    token_diffs = []
    for predicted, actual in zip(predicted_strings, actual_strings):
        predicted_tokens = tokenizer(predicted)["input_ids"]
        actual_tokens = tokenizer(actual)["input_ids"]
        token_diff = abs(len(predicted_tokens) - len(actual_tokens))
        token_diffs.append(token_diff)

    avg_token_diff = sum(token_diffs) / len(token_diffs)
    return {"average_token_difference": avg_token_diff}



In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
import torch

# Check that MPS is available
if not torch.backends.mps.is_available():
    if not torch.backends.mps.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")

else:
    mps_device = torch.device("mps")
    model.to(mps_device)
    print("Model moved to MPS device")

Model moved to MPS device


In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="calendar_model",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    use_mps_device=True,
    # fp16=True,
    # push_to_hub=True,
)



In [None]:
print(data_set)

DatasetDict({
    train: Dataset({
        features: ['details', 'message'],
        num_rows: 69
    })
    test: Dataset({
        features: ['details', 'message'],
        num_rows: 8
    })
})


In [None]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data_set["train"],
    eval_dataset=tokenized_data_set["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

  0%|          | 0/9 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 9.526269912719727, 'eval_average_token_difference': 9.875, 'eval_runtime': 3.2711, 'eval_samples_per_second': 2.446, 'eval_steps_per_second': 0.306, 'epoch': 1.0}
{'train_runtime': 270.5513, 'train_samples_per_second': 0.255, 'train_steps_per_second': 0.033, 'train_loss': 10.85148451063368, 'epoch': 1.0}


TrainOutput(global_step=9, training_loss=10.85148451063368, metrics={'train_runtime': 270.5513, 'train_samples_per_second': 0.255, 'train_steps_per_second': 0.033, 'train_loss': 10.85148451063368, 'epoch': 1.0})

In [None]:
# push to hub
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.98k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/joshcarp/calendar_model/commit/edfcfa8cc6e1ae5fb389894f56f0fb2a6885828a', commit_message='End of training', commit_description='', oid='edfcfa8cc6e1ae5fb389894f56f0fb2a6885828a', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from transformers import pipeline
hub_model_id = "joshcarp/calendar_model"
summarizer = pipeline("text2text-generation", model=hub_model_id)


prefix = """extract the calendar event details from a message. The details should be specified in the following json format:
{
    "datetime": "<inferred start time from input text>",
    "description": "<description of event from input text>",
    "location": "<location of event from input text>"
}

Here is an example: "Reminder: Team meeting on Friday at 10 AM in the conference room."

For this example the output should be:

{
    "datetime": "2024-03-15T10:00:00",
    "description": "Team meeting",
    "location": "Conference room"
}


Here is the input text: """

text = prefix+"Doctor's appointment on Friday at 9:00 AM."


summary = summarizer(text, max_length=60, min_length=6, truncation=True)
print(text)
print(summary)

extract the calendar event details from a message. The details should be specified in the following json format:
{
    "datetime": "<inferred start time from input text>",
    "description": "<description of event from input text>",
    "location": "<location of event from input text>"
}

Here is an example: "Reminder: Team meeting on Friday at 10 AM in the conference room."

For this example the output should be:

{
    "datetime": "2024-03-15T10:00:00",
    "description": "Team meeting",
    "location": "Conference room"
}


Here is the input text: Doctor's appointment on Friday at 9:00 AM.
[{'generated_text': 'calendar event details from a message. The details should be specified in json format:  "datetime": "inferred start time from input text>", "description": "description of event from input text>", "location":'}]
