Training a Self-Improving LLM with Self-Rewarding
A notable research direction is to have LLMs improve themselves by judging their own outputs and then training on that self-generated feedback. This is similar to RLHF, except the feedback comes from the model rather than from humans, a setup known as RLAIF (Reinforcement Learning from AI Feedback). These kinds of workflows are simple to implement with DataDreamer.
We demonstrate a simplified version of the Self-Rewarding Language Models paper (Yuan et al., 2024) below. In each round, the model samples two candidate responses per prompt, judges which candidate is better, converts its own judgements into a DPO preference dataset, and trains a LoRA adapter that is then applied to the model in the next round.
from datadreamer import DataDreamer
from datadreamer.steps import HFHubDataSource, Prompt, JudgeGenerationPairsWithPrompt
from datadreamer.trainers import TrainHFDPO
from datadreamer.llms import HFTransformers
from peft import LoraConfig
with DataDreamer("./output"):
    # Get a dataset of prompts
    prompts_dataset = HFHubDataSource(
        "Get Prompts Dataset", "Intel/orca_dpo_pairs", split="train"
    ).select_columns(["question"])

    # Keep only 3000 examples as a quick demo
    prompts_dataset = prompts_dataset.take(3000)

    # Define how many rounds of self-reward training
    rounds = 3

    # For each round of self-reward training
    adapter_to_apply = None
    for r in range(rounds):
        # Use a partial set of the prompts for each round
        prompts_for_round = prompts_dataset.shard(
            num_shards=rounds, index=r, name=f"Round #{r+1}: Get Prompts"
        )

        # Load the LLM
        llm = HFTransformers(
            "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            adapter_name=adapter_to_apply,
            device_map="auto",
            dtype="bfloat16",
        )
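        # NOTE: In round 1, adapter_to_apply is None, so the base chat model is
        # used as-is; in later rounds, the LoRA adapter trained in the previous
        # round is applied before generating and judging.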

        # Sample 2 candidate responses from the LLM
        candidate_responses = []
        for candidate_idx in range(2):
            candidate_responses.append(
                Prompt(
                    f"Round #{r+1}: Sample Candidate Response #{candidate_idx}",
                    inputs={"prompts": prompts_for_round.output["question"]},
                    args={
                        "llm": llm,
                        "batch_size": 2,
                        "top_p": 1.0,
                        "seed": candidate_idx,
                    },
                )
            )
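        # A different seed per candidate (with top_p=1.0 sampling) is what makes
        # the two candidate responses differ from one another.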

        # Have the LLM judge its own responses
        judgements = JudgeGenerationPairsWithPrompt(
            f"Round #{r+1}: Judge Candidate Responses",
            args={
                "llm": llm,
                "batch_size": 1,
                "max_new_tokens": 5,
            },
            inputs={
                "prompts": prompts_for_round.output["question"],
                "a": candidate_responses[0].output["generations"],
                "b": candidate_responses[1].output["generations"],
            },
        )
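        # The judge is asked to reply with which response is better ("Response A"
        # or "Response B"), so only a few new tokens are needed; the map step
        # below turns that verdict into "chosen"/"rejected" columns.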

        # Unload the LLM
        llm.unload_model()

        # Process the judgements into a preference dataset
        dpo_dataset = judgements.map(
            lambda row: {
                "question": row["prompts"],
                "chosen": row["a"] if row["judgements"] == "Response A" else row["b"],
                "rejected": row["b"] if row["judgements"] == "Response A" else row["a"],
            },
            lazy=False,
            name=f"Round #{r+1}: Create Self-Reward Preference Dataset",
        )

        # Create training data splits
        splits = dpo_dataset.splits(train_size=0.90, validation_size=0.10)

        # Align the TinyLlama chat model with its own preferences
        trainer = TrainHFDPO(
            f"Round #{r+1}: Self-Reward Align TinyLlama-Chat",
            model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            peft_config=LoraConfig(),
            device=["cuda:0", "cuda:1"],
            dtype="bfloat16",
        )
        trainer.train(
            train_prompts=splits["train"].output["question"],
            train_chosen=splits["train"].output["chosen"],
            train_rejected=splits["train"].output["rejected"],
            validation_prompts=splits["validation"].output["question"],
            validation_chosen=splits["validation"].output["chosen"],
            validation_rejected=splits["validation"].output["rejected"],
            epochs=3,
            batch_size=1,
            gradient_accumulation_steps=32,
        )
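        # Gradients are accumulated over 32 steps, giving an effective batch
        # size of 32 despite batch_size=1.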

        # Unload the trained model from memory
        trainer.unload_model()

        # Use the newly trained adapter for the next round of self-reward
        adapter_to_apply = trainer.model_path
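
After the final round, adapter_to_apply points to the LoRA adapter trained in the last round. As a minimal sketch of how you might use the self-improved model, the snippet below could be appended at the end of the with block (after the loop). It assumes DataDreamer's HFTransformers.run() generation method, and the prompt and generation arguments shown (max_new_tokens, top_p, batch_size) are illustrative rather than part of the original example.

    # Load the base chat model with the final self-rewarded adapter applied
    final_llm = HFTransformers(
        "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        adapter_name=adapter_to_apply,
        device_map="auto",
        dtype="bfloat16",
    )

    # Generate with the self-improved model (prompt and arguments are illustrative)
    generations = final_llm.run(
        ["Explain what a self-rewarding language model is."],
        max_new_tokens=256,
        top_p=1.0,
        batch_size=1,
    )
    print(generations[0])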