Bootstrapping Synthetic Few-Shot Examples
One technique for performing a task with no examples is to bootstrap synthetic examples that can eventually be used as examples in a few-shot prompt. Low-resource machine translation (i.e., translation into languages for which little to no training data exists) is a good motivating example for this kind of task.
We demonstrate bootstrapping following two papers (Patel et al., 2022; Han et al., 2021) to translate from English to Tamil with no paired training examples. Instead, we generate synthetic examples over 4 rounds of bootstrapping.
See the resulting translations.

from datadreamer import DataDreamer
from datadreamer.llms import OpenAI
from datadreamer.steps import (
FewShotPrompt,
ProcessWithPrompt,
HFHubDataSource,
CosineSimilarity,
)
from datadreamer.embedders import SentenceTransformersEmbedder
# Bootstrap synthetic English -> Tamil translation pairs over several rounds,
# then use the final pairs as few-shot examples to translate the FLORES-101
# "devtest" split and publish the result.
with DataDreamer("./output"):
    # Load GPT-4
    gpt_4 = OpenAI(model_name="gpt-4")

    # Get English sentences
    english_dataset = HFHubDataSource(
        "Get FLORES-101 English Sentences",
        "gsarti/flores_101",
        config_name="eng",
        split="dev",
    ).select_columns(["sentence"])

    # Keep only 400 examples as a quick demo
    english_dataset = english_dataset.take(400)

    # Define how many rounds of bootstrapping
    rounds = 4

    # The embedder used to score translation pairs does not depend on the
    # round, so build it once instead of on every loop iteration.
    embedder = SentenceTransformersEmbedder("google/mt5-small")

    # For each round of bootstrapping; `best_translation_pairs` carries the
    # selected pairs from one round into the next (and out of the loop).
    best_translation_pairs = None
    for r in range(rounds):
        # Use a partial set of the sentences for each round
        sentences_for_round = english_dataset.shard(
            num_shards=rounds, index=r, name=f"Round #{r+1}: Get Sentences"
        )

        # Create synthetic pairs
        if r == 0:
            # On the first round, ask GPT-4 to zero-shot translate the English
            # sentences to Tamil to create synthetic translation pairs
            english_to_tamil = ProcessWithPrompt(
                f"Round #{r+1}: Zero-shot Translate from English To Tamil",
                inputs={"inputs": sentences_for_round.output["sentence"]},
                args={
                    "llm": gpt_4,
                    "input_label": "Sentence:",
                    "instruction": "Translate the sentence to Tamil.",
                    "max_new_tokens": 1000,
                },
                outputs={"inputs": "english", "generations": "tamil"},
            ).select_columns(["english", "tamil"])
        else:
            # On subsequent rounds, use the best synthetic translation pairs
            # from the previous round as few-shot examples to translate more
            # English sentences to create even better synthetic pairs
            english_to_tamil = FewShotPrompt(
                f"Round #{r+1}: Few-shot Translate from English To Tamil",
                inputs={
                    "input_examples": best_translation_pairs.output["english"],
                    "output_examples": best_translation_pairs.output["tamil"],
                    "inputs": sentences_for_round.output["sentence"],
                },
                args={
                    "llm": gpt_4,
                    "input_label": "English:",
                    "output_label": "Tamil:",
                    "instruction": "Translate the sentence to Tamil.",
                    "max_new_tokens": 1000,
                },
                outputs={"inputs": "english", "generations": "tamil"},
            ).select_columns(["english", "tamil"])

        # Automatically filter the best synthetic translation pairs through
        # cosine similarity: score each (English, Tamil) pair, sort on the
        # "similarities" column with reverse=True, and keep the first two rows.
        best_translation_pairs = (
            CosineSimilarity(
                f"Round #{r+1}: Compute Similarities between the Source and Translated Sentences",
                args={"embedder": embedder, "truncate": True},
                inputs={
                    "a": english_to_tamil.output["english"],
                    "b": english_to_tamil.output["tamil"],
                },
                outputs={"a": "english", "b": "tamil"},
            )
            .sort(
                ["similarities"],
                reverse=True,
                name=f"Round #{r+1}: Rank by Similarities",
            )
            .take(2, name=f"Round #{r+1}: Get Top-2 Translation Pairs")
        )

    # Load the test set of English sentences
    english_test_dataset = HFHubDataSource(
        "Get FLORES-101 English Sentences (Test Set)",
        "gsarti/flores_101",
        config_name="eng",
        split="devtest",
    ).select_columns(["sentence"])

    # Finally translate the test set with the final bootstrapped synthetic
    # few-shot examples (the pairs selected in the last round)
    english_test_to_tamil = FewShotPrompt(
        "Few-shot Translate from English To Tamil (Test Set)",
        inputs={
            "input_examples": best_translation_pairs.output["english"],
            "output_examples": best_translation_pairs.output["tamil"],
            "inputs": english_test_dataset.output["sentence"],
        },
        args={
            "llm": gpt_4,
            "input_label": "English:",
            "output_label": "Tamil:",
            "instruction": "Translate the sentence to Tamil.",
            "max_new_tokens": 1000,
        },
        outputs={"inputs": "english", "generations": "tamil"},
    ).select_columns(["english", "tamil"])

    # Publish and share the synthetic dataset
    english_test_to_tamil.publish_to_hf_hub(
        "datadreamer-dev/english_to_tamil",
    )