datadreamer.steps — API reference for the steps module

class datadreamer.steps.Step(name, args=None, inputs=None, outputs=None, progress_interval=60, force=False, verbose=None, log_level=None, save_num_proc=None, save_num_shards=None, background=False)[source]#

Bases: object

register_arg(arg_name, required=True, default=None, help=None)[source]#
register_input(input_column_name, required=True, help=None)[source]#
register_output(output_column_name, help=None)[source]#
register_data_card(data_card_type, data_card)[source]#
property args: dict[str, Any][source]#
property inputs: dict[str, OutputDatasetColumn | OutputIterableDatasetColumn][source]#
get_run_output_folder_path()[source]#
Return type:

str

pickle(value, *args, **kwargs)[source]#
Return type:

bytes

unpickle(value)[source]#
Return type:

Any

property progress: None | float[source]#
property output: OutputDataset | OutputIterableDataset[source]#

The output dataset of the step.

property dataset_path: str[source]#
head(n=5, shuffle=False, seed=None, buffer_size=1000)[source]#
Return type:

DataFrame

data_card()[source]#
Return type:

None

select(indices, name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
Return type:

Step

select_columns(column_names, name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
Return type:

Step

take(n, name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
Return type:

Step

skip(n, name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
Return type:

Step

shuffle(seed=None, buffer_size=1000, name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
Return type:

Step

sort(column_names, reverse=False, null_placement='at_end', name=None, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
Return type:

Step

add_item(item, name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
Return type:

Step

map(function, with_indices=False, input_columns=None, batched=False, batch_size=1000, remove_columns=None, total_num_rows=None, auto_progress=True, name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
Return type:

Step

filter(function, with_indices=False, input_columns=None, batched=False, batch_size=1000, total_num_rows=None, auto_progress=True, name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
Return type:

Step

rename_column(original_column_name, new_column_name, name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
Return type:

Step

rename_columns(column_mapping, name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
Return type:

Step

remove_columns(column_names, name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
Return type:

Step

splits(train_size=None, validation_size=None, test_size=None, stratify_by_column=None, name=None, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
Return type:

dict[str, Step]

shard(num_shards, index, contiguous=False, name=None, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
Return type:

Step

reverse(name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
Return type:

Step

save(name=None, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
Return type:

Step

copy(name=None, lazy=None, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
Return type:

Step

export_to_dict(train_size=None, validation_size=None, test_size=None, stratify_by_column=None, writer_batch_size=1000, save_num_proc=None, save_num_shards=None)[source]#
Return type:

dict

export_to_list(train_size=None, validation_size=None, test_size=None, stratify_by_column=None, writer_batch_size=1000, save_num_proc=None, save_num_shards=None)[source]#
Return type:

list | dict

export_to_json(path, train_size=None, validation_size=None, test_size=None, stratify_by_column=None, writer_batch_size=1000, save_num_proc=None, save_num_shards=None, **to_json_kwargs)[source]#
Return type:

str | dict

export_to_csv(path, sep=',', train_size=None, validation_size=None, test_size=None, stratify_by_column=None, writer_batch_size=1000, save_num_proc=None, save_num_shards=None, **to_csv_kwargs)[source]#
Return type:

str | dict

export_to_hf_dataset(path, train_size=None, validation_size=None, test_size=None, stratify_by_column=None, writer_batch_size=1000, save_num_proc=None, save_num_shards=None)[source]#
Return type:

Dataset | DatasetDict

publish_to_hf_hub(repo_id, branch=None, private=False, token=None, train_size=None, validation_size=None, test_size=None, stratify_by_column=None, writer_batch_size=1000, save_num_proc=None, save_num_shards=None, is_synthetic=True, **kwargs)[source]#
Return type:

str

class datadreamer.steps.DataSource(name, data, total_num_rows=None, auto_progress=True, progress_interval=None, force=False, verbose=None, log_level=None, save_num_proc=None, save_num_shards=None, background=False, **kwargs)[source]#

Bases: Step

property output: OutputDataset | OutputIterableDataset[source]#

The output dataset of the step.

class datadreamer.steps.HFHubDataSource(name, path, config_name=None, split=None, revision=None, streaming=False, progress_interval=None, force=False, verbose=None, log_level=None, save_num_proc=None, save_num_shards=None, background=False, **config_kwargs)[source]#

Bases: DataSource

property output: OutputDataset | OutputIterableDataset[source]#

The output dataset of the step.

class datadreamer.steps.HFDatasetDataSource(name, dataset_path, progress_interval=None, force=False, verbose=None, log_level=None, save_num_proc=None, save_num_shards=None, background=False)[source]#

Bases: DataSource

property output: OutputDataset | OutputIterableDataset[source]#

The output dataset of the step.

class datadreamer.steps.JSONDataSource(name, data_folder=None, data_files=None, progress_interval=None, force=False, verbose=None, log_level=None, save_num_proc=None, save_num_shards=None, background=False, **config_kwargs)[source]#

Bases: DataSource

property output: OutputDataset | OutputIterableDataset[source]#

The output dataset of the step.

class datadreamer.steps.CSVDataSource(name, data_folder=None, data_files=None, sep=',', progress_interval=None, force=False, verbose=None, log_level=None, save_num_proc=None, save_num_shards=None, background=False, **config_kwargs)[source]#

Bases: DataSource

property output: OutputDataset | OutputIterableDataset[source]#

The output dataset of the step.

class datadreamer.steps.TextDataSource(name, data_folder=None, data_files=None, progress_interval=None, force=False, verbose=None, log_level=None, save_num_proc=None, save_num_shards=None, background=False, **config_kwargs)[source]#

Bases: DataSource

property output: OutputDataset | OutputIterableDataset[source]#

The output dataset of the step.

class datadreamer.steps.Prompt(name, args=None, inputs=None, outputs=None, progress_interval=60, force=False, verbose=None, log_level=None, save_num_proc=None, save_num_shards=None, background=False)[source]#

Bases: _PromptBase

Prompt.help#
Prompt(
	name = 'The name of the step.',
	args = {
		'llm': 'The LLM to use.',
		'post_process': 'A function to post-process the generations. (optional)',
		'lazy': 'Whether to run lazily or not. (optional, defaults to False)',
		'**kwargs': 'Any other arguments you want to pass to the .run() method of the LLM. (optional)'
	},
	inputs = {
		'prompts': 'The prompts to process with the LLM.'
	},
	outputs = {
		'prompts': 'The prompts processed with the LLM.',
		'generations': 'The generations by the LLM.'
	},
)
property output: OutputDataset | OutputIterableDataset[source]#

The output dataset of the step.

class datadreamer.steps.FewShotPrompt(name, args=None, inputs=None, outputs=None, progress_interval=60, force=False, verbose=None, log_level=None, save_num_proc=None, save_num_shards=None, background=False)[source]#

Bases: _PromptBase

FewShotPrompt.help#
FewShotPrompt(
	name = 'The name of the step.',
	args = {
		'llm': 'The LLM to use.',
		'input_label': "The label to use for inputs. (optional, defaults to 'Input:')",
		'output_label': "The label to use for outputs. (optional, defaults to 'Output:')",
		'max_new_tokens': 'The maximum number of tokens to generate. (optional)',
		'instruction': 'An instruction to include in the prompt. (optional)',
		'sep': "The separator to use between instructions and in-context examples. (optional, defaults to '\\n')",
		'min_in_context_examples': 'The minimum number of in-context examples to include. (optional)',
		'max_in_context_examples': 'The maximum number of in-context examples to include. (optional)',
		'post_process': 'A function to post-process the generations. (optional)',
		'lazy': 'Whether to run lazily or not. (optional, defaults to False)',
		'**kwargs': 'Any other arguments you want to pass to the .run() method of the LLM. (optional)'
	},
	inputs = {
		'input_examples': 'The in-context example inputs to include in the prompt.',
		'output_examples': 'The in-context example outputs to include in the prompt.',
		'inputs': 'The inputs to process with the LLM.'
	},
	outputs = {
		'prompts': 'The prompts processed with the LLM.',
		'generations': 'The generations by the LLM.'
	},
)
property output: OutputDataset | OutputIterableDataset[source]#

The output dataset of the step.

class datadreamer.steps.RunTaskModel(name, args=None, inputs=None, outputs=None, progress_interval=60, force=False, verbose=None, log_level=None, save_num_proc=None, save_num_shards=None, background=False)[source]#

Bases: Step

RunTaskModel.help#
RunTaskModel(
	name = 'The name of the step.',
	args = {
		'model': 'The TaskModel to use.',
		'truncate': 'Whether or not to truncate inputs. (optional, defaults to False)',
		'lazy': 'Whether to run lazily or not. (optional, defaults to False)',
		'**kwargs': 'Any other arguments you want to pass to the .run() method of the TaskModel. (optional)'
	},
	inputs = {
		'texts': 'The texts to process with the TaskModel.'
	},
	outputs = {
		'texts': 'The texts processed with the TaskModel.',
		'results': 'The results from the TaskModel.'
	},
)
property output: OutputDataset | OutputIterableDataset[source]#

The output dataset of the step.

class datadreamer.steps.Embed(name, args=None, inputs=None, outputs=None, progress_interval=60, force=False, verbose=None, log_level=None, save_num_proc=None, save_num_shards=None, background=False)[source]#

Bases: Step

Embed.help#
Embed(
	name = 'The name of the step.',
	args = {
		'embedder': 'The Embedder to use.',
		'truncate': 'Whether or not to truncate inputs. (optional, defaults to False)',
		'instruction': 'The instruction to prefix inputs to the embedding model with. (optional)',
		'lazy': 'Whether to run lazily or not. (optional, defaults to False)',
		'**kwargs': 'Any other arguments you want to pass to the .run() method of the Embedder. (optional)'
	},
	inputs = {
		'texts': 'The texts to embed.'
	},
	outputs = {
		'texts': 'The texts that were embedded.',
		'embeddings': 'The embeddings by the Embedder.'
	},
)
property output: OutputDataset | OutputIterableDataset[source]#

The output dataset of the step.

class datadreamer.steps.Retrieve(name, args=None, inputs=None, outputs=None, progress_interval=60, force=False, verbose=None, log_level=None, save_num_proc=None, save_num_shards=None, background=False)[source]#

Bases: Step

Retrieve.help#
Retrieve(
	name = 'The name of the step.',
	args = {
		'retriever': 'The Retriever to use.',
		'k': 'How many results to retrieve. (optional, defaults to 5)',
		'lazy': 'Whether to run lazily or not. (optional, defaults to False)',
		'**kwargs': 'Any other arguments you want to pass to the .run() method of the Retriever. (optional)'
	},
	inputs = {
		'queries': 'The queries to retrieve results for.'
	},
	outputs = {
		'queries': 'The queries used to retrieve results.',
		'results': 'The results from the Retriever.'
	},
)
property output: OutputDataset | OutputIterableDataset[source]#

The output dataset of the step.

datadreamer.steps.concat(*steps, name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
Return type:

Step

datadreamer.steps.zipped(*steps, name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
Return type:

Step

datadreamer.steps.wait(*steps, poll_interval=1.0)[source]#
datadreamer.steps.concurrent(*funcs)[source]#
class datadreamer.steps.DataCardType[source]#

Bases: object

MODEL_NAME = 'Model Name'#
DATASET_NAME = 'Dataset Name'#
LICENSE = 'License Information'#
CITATION = 'Citation Information'#
DATASET_CARD = 'Dataset Card'#
MODEL_CARD = 'Model Card'#
URL = 'URL'#
class datadreamer.steps.LazyRows(value, total_num_rows=None, auto_progress=True, save=False, save_writer_batch_size=None)[source]#

Bases: object

class datadreamer.steps.LazyRowBatches(value, total_num_rows=None, auto_progress=True, save=False, save_writer_batch_size=None)[source]#

Bases: object

class datadreamer.steps.SuperStep(name, args=None, inputs=None, outputs=None, progress_interval=60, force=False, verbose=None, log_level=None, save_num_proc=None, save_num_shards=None, background=False)[source]#

Bases: Step