import glob
import json
import os
import uuid
from datetime import datetime
from pathlib import Path

import gradio as gr
import spaces
import torch
import transformers
from huggingface_hub import CommitScheduler, login, snapshot_download
from transformers import AutoTokenizer

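# Authenticate with the Hugging Face Hub so this Space can push to the dataset repo.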
HF_TOKEN = os.getenv("HF_TOKEN")
login(token=HF_TOKEN)

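# Load the Llama 3 tokenizer and a bfloat16 text-generation pipeline on the GPU.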
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id, add_special_tokens=True)

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device="cuda",
)

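# Read the per-model prompt configuration shipped alongside this Space.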
with open("model_configs.json", "r") as f:
    model_configs = json.load(f)
model_config = model_configs[model_id]

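# The Magpie pre-query template: the model's continuation of it becomes the user instruction.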
extract_input = model_config["extract_input"]
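# Stop generation at either the model's EOS token or Llama 3's <|eot_id|> end-of-turn token.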
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

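# Local folder where preference records accumulate before being committed to the Hub.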
dataset_folder = Path("dataset")
dataset_folder.mkdir(exist_ok=True)

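# Find the newest local dataset file so restarts keep appending to a single file.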
def get_latest_dataset_file():
    if files := glob.glob(str(dataset_folder / "data_*.jsonl")):
        return max(files, key=os.path.getctime)
    return None

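# Append to the most recent dataset file if one exists; otherwise start a new one.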
if latest_file := get_latest_dataset_file():
    dataset_file = Path(latest_file)
    print(f"Appending to existing dataset file: {dataset_file}")
else:
    dataset_file = dataset_folder / f"data_{uuid.uuid4()}.jsonl"
    print(f"Creating new dataset file: {dataset_file}")

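# Commit the dataset folder to the Hub once a minute (CommitScheduler's `every` is in minutes).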
repo_id = "davanstrien/magpie-preference"
scheduler = CommitScheduler(
    repo_id=repo_id,
    repo_type="dataset",
    folder_path=dataset_folder,
    path_in_repo="data",
    every=1,
)

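# Sync any .jsonl files already uploaded to the repo into the local folder on startup.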
def download_existing_dataset():
    try:
        # hf_hub_download fetches single files only (and has no recursive option),
        # so snapshot_download is used to pull every .jsonl under the repo's "data" folder.
        snapshot_path = snapshot_download(
            repo_id=repo_id, repo_type="dataset", allow_patterns="data/*.jsonl"
        )
        for file in glob.glob(os.path.join(snapshot_path, "data", "*.jsonl")):
            dest_file = dataset_folder / os.path.basename(file)
            if not dest_file.exists():
                dest_file.write_bytes(Path(file).read_bytes())
                print(f"Downloaded existing dataset file: {dest_file}")
    except Exception as e:
        print(f"Error downloading existing dataset: {e}")

download_existing_dataset()

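# A random session ID groups feedback from one visit; no personal information is stored.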
def generate_session_id():
    return str(uuid.uuid4())

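# Append one labelled prompt/completion record; the CommitScheduler uploads it later.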
def save_data(generated_input, generated_response, vote, session_id):
    data = {
        "timestamp": datetime.now().isoformat(),
        "prompt": generated_input,
        "completion": generated_response,
        "label": vote,
        "session_id": session_id,
    }
    # Hold the scheduler's lock so a scheduled commit never sees a half-written line.
    with scheduler.lock:
        with dataset_file.open("a") as f:
            f.write(json.dumps(data) + "\n")
    return "Data saved and will be uploaded to the dataset repository."

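# The spaces.GPU decorator requests a ZeroGPU device for the duration of each call.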
@spaces.GPU
def generate_instruction_response():
    prompt_info = f"""### Generating user prompt using the template:

```
{extract_input}
```
"""
    # First update: show the template and disable all buttons while generating.
    yield (
        prompt_info,
        "",
        "",
        gr.update(interactive=False),
        gr.update(interactive=False),
        "",
        gr.update(interactive=False),
    )
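    # Sample an instruction by letting the model continue the bare pre-query template.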
    instruction = pipeline(
        extract_input,
        max_new_tokens=2048,
        eos_token_id=terminators,
        do_sample=True,
        temperature=1,
        top_p=1,
    )

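    # Strip the template prefix and keep only the first line as the user instruction.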
    sanitized_instruction = instruction[0]["generated_text"][
        len(extract_input) :
    ].split("\n")[0]

    first_step = (
        f"{prompt_info}### LLM generated instruction:\n\n{sanitized_instruction}"
    )
    yield (
        first_step + "\n\n### Generating LLM response...",
        sanitized_instruction,
        "",
        gr.update(interactive=False),
        gr.update(interactive=False),
        "",
        gr.update(interactive=False),
    )

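    # Wrap the generated instruction in Llama 3's chat template to prompt the assistant turn.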
    response_template = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{sanitized_instruction}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"""

    response = pipeline(
        response_template,
        max_new_tokens=2048,
        eos_token_id=terminators,
        do_sample=True,
        temperature=1,
        top_p=1,
    )

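    # Drop the prompt prefix, keeping only the assistant's reply.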
    assistant_response = response[0]["generated_text"][len(response_template) :]

    final_output = f"""### Template used for generating instruction:

```
{extract_input}
```

### LLM Generated Instruction:

{sanitized_instruction}

### LLM Generated Response:

{assistant_response}
"""
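    # Final update: show the full result and re-enable the voting and generate buttons.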
    yield (
        final_output,
        sanitized_instruction,
        assistant_response,
        gr.update(interactive=True),
        gr.update(interactive=True),
        "",
        gr.update(interactive=True),
    )

title = """ |
|
# π¦ββ¬ Magpie Preference |
|
""" |
|
|
|
description = """
This demo showcases **Magpie**, an innovative approach to generating high-quality data by prompting aligned LLMs with their pre-query templates.

Unlike traditional methods, Magpie doesn't rely on prompt engineering or seed questions to generate synthetic data. Instead, it uses the prompt template of an aligned LLM to generate both a user query and an LLM response.

As well as demonstrating Magpie generations, this Space lets you submit a preference rating for the generated data, contributing to a crowdsourced dataset.

## 🔍 How it works

1. **📝 Instruction Generation:** The model generates a user instruction.
2. **💬 Response Generation:** The model generates a response to this instruction.
3. **👍👎 User Feedback (optional):** Rate the quality of the generated content.
4. **💾 Dataset Creation:** Feedback and generated data are saved to a Hugging Face dataset.

🔗 Find the crowd-generated dataset [here](https://huggingface.co/datasets/davanstrien/magpie-preference). It's updated every minute!

📚 Learn more about Magpie in the [paper](https://huggingface.co/papers/2406.08464).

> **Note:** A random session ID groups your feedback. No personal information is collected.
"""

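# Build the UI: a generate button, the streamed markdown output, and vote buttons.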
with gr.Blocks() as iface:
    gr.Markdown(title)
    gr.Markdown(description)

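    # Passing a callable to gr.State gives each new user session a fresh session ID.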
    session_id = gr.State(generate_session_id)

    generated_input = gr.State("")
    generated_response = gr.State("")

    generate_btn = gr.Button("🪄 Generate Instruction-Response Pair")

    output = gr.Markdown(label="Generated Data")

    with gr.Row():
        thumbs_up = gr.Button("👍 Thumbs Up", interactive=False)
        thumbs_down = gr.Button("👎 Thumbs Down", interactive=False)

    feedback_output = gr.Markdown(label="Feedback Status")

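    # Record a vote for the current pair, then lock the vote buttons until the next generation.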
    def vote_and_submit(vote, input_text, response_text, session_id):
        if input_text and response_text:
            feedback = save_data(
                input_text, response_text, vote == "👍 Thumbs Up", session_id
            )
            return (
                feedback,
                gr.update(interactive=False),
                gr.update(interactive=False),
                gr.update(interactive=True),
            )
        else:
            return (
                "Please generate data before submitting feedback.",
                gr.update(interactive=True),
                gr.update(interactive=True),
                gr.update(interactive=True),
            )

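    # Wire up the events; the generator's successive yields stream updates to the UI.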
    generate_btn.click(
        generate_instruction_response,
        inputs=[],
        outputs=[
            output,
            generated_input,
            generated_response,
            thumbs_up,
            thumbs_down,
            feedback_output,
            generate_btn,
        ],
    )
    thumbs_up.click(
        vote_and_submit,
        inputs=[
            gr.State("👍 Thumbs Up"),
            generated_input,
            generated_response,
            session_id,
        ],
        outputs=[feedback_output, thumbs_up, thumbs_down, generate_btn],
    )
    thumbs_down.click(
        vote_and_submit,
        inputs=[
            gr.State("👎 Thumbs Down"),
            generated_input,
            generated_response,
            session_id,
        ],
        outputs=[feedback_output, thumbs_up, thumbs_down, generate_btn],
    )

iface.launch(debug=True)