Augmenting an Existing Dataset
DataDreamer can help augment existing datasets using LLMs. We demonstrate this below by augmenting questions from HotpotQA with a decomposition of the intermediate steps a user would need to take to solve each complex question.
See the resulting synthetic dataset, published as datadreamer-dev/hotpot_qa_augmented on the Hugging Face Hub.

```python
from datadreamer import DataDreamer
from datadreamer.llms import OpenAI
from datadreamer.steps import ProcessWithPrompt, HFHubDataSource

with DataDreamer("./output"):
    # Load GPT-4
    gpt_4 = OpenAI(model_name="gpt-4")
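    # (Assumes an OpenAI API key is available, e.g. via the OPENAI_API_KEY
    # environment variable.)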

    # Get HotPot QA questions
    hotpot_qa_dataset = HFHubDataSource(
        "Get Hotpot QA Questions",
        "hotpot_qa",
        config_name="distractor",
        split="train",
    ).select_columns(["question"])

    # Keep only 1000 questions as a quick demo
    hotpot_qa_dataset = hotpot_qa_dataset.take(1000)

    # Ask GPT-4 to decompose the question
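    # (`hotpot_qa_dataset.output["question"]` feeds the previous step's
    # "question" column into this step; DataDreamer caches each step's results
    # under ./output, so interrupted runs can resume.)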
    questions_and_decompositions = ProcessWithPrompt(
        "Generate Decompositions",
        inputs={"inputs": hotpot_qa_dataset.output["question"]},
        args={
            "llm": gpt_4,
            "instruction": (
                "Given the question which requires multiple steps to solve, give a numbered list of intermediate questions required to solve the question."
                " Return only the list, nothing else."
            ),
        },
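        # "inputs" and "generations" are the step's default output columns;
        # this mapping renames them to "questions" and "decompositions".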
        outputs={"inputs": "questions", "generations": "decompositions"},
    ).select_columns(["questions", "decompositions"])

    # Publish and share the synthetic dataset
    questions_and_decompositions.publish_to_hf_hub(
        "datadreamer-dev/hotpot_qa_augmented",
    )
```
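
Once published, the augmented dataset can be loaded back like any other Hugging Face dataset. Below is a minimal sketch using the `datasets` library; it assumes the repository above is public and exposes a single `train` split (the split name may differ depending on how `publish_to_hf_hub` was configured).

```python
# Minimal sketch: load the published synthetic dataset for downstream use.
# Assumes the `datasets` library is installed and the repo is publicly readable.
from datasets import load_dataset

augmented = load_dataset("datadreamer-dev/hotpot_qa_augmented", split="train")

# Column names follow the `outputs` mapping used above.
print(augmented.column_names)          # ["questions", "decompositions"]
print(augmented[0]["questions"])       # an original HotpotQA question
print(augmented[0]["decompositions"])  # GPT-4's numbered list of sub-questions
```

These question-to-decomposition pairs can then be inspected or used as training data downstream.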