Cleaning an Existing Dataset#

DataDreamer can help clean or filter existing datasets using LLMs. We demonstrate this below by filtering a dataset of news articles to only include those that are about sports.

See the resulting synthetic dataset.
from datadreamer import DataDreamer
from datadreamer.llms import OpenAI
from datadreamer.steps import FilterWithPrompt, HFHubDataSource

with DataDreamer("./output"):
    # Load GPT-4
    gpt_4 = OpenAI(model_name="gpt-4")

    # Get news articles
    news_dataset = HFHubDataSource(
        "Get CNN & Daily Mail News Articles",
        "cnn_dailymail",
        config_name="3.0.0",
        split="test",
    )

    # Keep only 1000 articles as a quick demo
    news_dataset = news_dataset.take(1000)

    # Ask GPT-4 to filter the dataset
    sports_news_dataset = FilterWithPrompt(
        "Filter to only keep sports articles",
        inputs={"inputs": news_dataset.output["article"]},
        args={
            "llm": gpt_4,
            "instruction": "Is the article about sports? Answer 'Yes' or 'No'.",
        },
    )

    # Publish and share the synthetic dataset
    sports_news_dataset.publish_to_hf_hub(
        "datadreamer-dev/cnn_dailymail_sports",
    )