steps#
- class datadreamer.steps.Step(name, args=None, inputs=None, outputs=None, progress_interval=60, force=False, verbose=None, log_level=None, save_num_proc=None, save_num_shards=None, background=False)[source]#
Bases: object
- property inputs: dict[str, OutputDatasetColumn | OutputIterableDatasetColumn][source]#
- property output: OutputDataset | OutputIterableDataset[source]#
The output dataset of the step.
- select(indices, name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
- Return type:
- select_columns(column_names, name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
- Return type:
- take(n, name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
- Return type:
- skip(n, name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
- Return type:
- shuffle(seed=None, buffer_size=1000, name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
- Return type:
- sort(column_names, reverse=False, null_placement='at_end', name=None, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
- Return type:
- add_item(item, name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
- Return type:
- map(function, with_indices=False, input_columns=None, batched=False, batch_size=1000, remove_columns=None, total_num_rows=None, auto_progress=True, name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
- Return type:
- filter(function, with_indices=False, input_columns=None, batched=False, batch_size=1000, total_num_rows=None, auto_progress=True, name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
- Return type:
- rename_column(original_column_name, new_column_name, name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
- Return type:
- rename_columns(column_mapping, name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
- Return type:
- remove_columns(column_names, name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
- Return type:
- splits(train_size=None, validation_size=None, test_size=None, stratify_by_column=None, name=None, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
- shard(num_shards, index, contiguous=False, name=None, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
- Return type:
- reverse(name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
- Return type:
- save(name=None, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
- Return type:
- copy(name=None, lazy=None, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
- Return type:
- export_to_dict(train_size=None, validation_size=None, test_size=None, stratify_by_column=None, writer_batch_size=1000, save_num_proc=None, save_num_shards=None)[source]#
- Return type:
- export_to_list(train_size=None, validation_size=None, test_size=None, stratify_by_column=None, writer_batch_size=1000, save_num_proc=None, save_num_shards=None)[source]#
- export_to_json(path, train_size=None, validation_size=None, test_size=None, stratify_by_column=None, writer_batch_size=1000, save_num_proc=None, save_num_shards=None, **to_json_kwargs)[source]#
- export_to_csv(path, sep=',', train_size=None, validation_size=None, test_size=None, stratify_by_column=None, writer_batch_size=1000, save_num_proc=None, save_num_shards=None, **to_csv_kwargs)[source]#
- class datadreamer.steps.DataSource(name, data, total_num_rows=None, auto_progress=True, progress_interval=None, force=False, verbose=None, log_level=None, save_num_proc=None, save_num_shards=None, background=False, **kwargs)[source]#
Bases: Step
- property output: OutputDataset | OutputIterableDataset[source]#
The output dataset of the step.
- class datadreamer.steps.HFHubDataSource(name, path, config_name=None, split=None, revision=None, streaming=False, progress_interval=None, force=False, verbose=None, log_level=None, save_num_proc=None, save_num_shards=None, background=False, **config_kwargs)[source]#
Bases: DataSource
- property output: OutputDataset | OutputIterableDataset[source]#
The output dataset of the step.
- class datadreamer.steps.HFDatasetDataSource(name, dataset_path, progress_interval=None, force=False, verbose=None, log_level=None, save_num_proc=None, save_num_shards=None, background=False)[source]#
Bases: DataSource
- property output: OutputDataset | OutputIterableDataset[source]#
The output dataset of the step.
- class datadreamer.steps.JSONDataSource(name, data_folder=None, data_files=None, progress_interval=None, force=False, verbose=None, log_level=None, save_num_proc=None, save_num_shards=None, background=False, **config_kwargs)[source]#
Bases: DataSource
- property output: OutputDataset | OutputIterableDataset[source]#
The output dataset of the step.
- class datadreamer.steps.CSVDataSource(name, data_folder=None, data_files=None, sep=',', progress_interval=None, force=False, verbose=None, log_level=None, save_num_proc=None, save_num_shards=None, background=False, **config_kwargs)[source]#
Bases: DataSource
- property output: OutputDataset | OutputIterableDataset[source]#
The output dataset of the step.
- class datadreamer.steps.TextDataSource(name, data_folder=None, data_files=None, progress_interval=None, force=False, verbose=None, log_level=None, save_num_proc=None, save_num_shards=None, background=False, **config_kwargs)[source]#
Bases: DataSource
- property output: OutputDataset | OutputIterableDataset[source]#
The output dataset of the step.
- class datadreamer.steps.Prompt(name, args=None, inputs=None, outputs=None, progress_interval=60, force=False, verbose=None, log_level=None, save_num_proc=None, save_num_shards=None, background=False)[source]#
Bases: _PromptBase
Prompt( name = 'The name of the step.', args = { 'llm': 'The LLM to use.', 'post_process': 'A function to post-process the generations. (optional)', 'lazy': 'Whether to run lazily or not. (optional, defaults to False)', '**kwargs': 'Any other arguments you want to pass to the .run() method of the LLM. (optional)' }, inputs = { 'prompts': 'The prompts to process with the LLM.' }, outputs = { 'prompts': 'The prompts processed with the LLM.', 'generations': 'The generations by the LLM.' }, )
- property output: OutputDataset | OutputIterableDataset[source]#
The output dataset of the step.
- class datadreamer.steps.FewShotPrompt(name, args=None, inputs=None, outputs=None, progress_interval=60, force=False, verbose=None, log_level=None, save_num_proc=None, save_num_shards=None, background=False)[source]#
Bases: _PromptBase
FewShotPrompt( name = 'The name of the step.', args = { 'llm': 'The LLM to use.', 'input_label': "The label to use for inputs. (optional, defaults to 'Input:')", 'output_label': "The label to use for outputs. (optional, defaults to 'Output:')", 'max_new_tokens': 'The maximum number of tokens to generate. (optional)', 'instruction': 'An instruction to include in the prompt. (optional)', 'sep': "The separator to use between instructions and in-context examples. (optional, defaults to '\\n')", 'min_in_context_examples': 'The minimum number of in-context examples to include. (optional)', 'max_in_context_examples': 'The maximum number of in-context examples to include. (optional)', 'post_process': 'A function to post-process the generations. (optional)', 'lazy': 'Whether to run lazily or not. (optional, defaults to False)', '**kwargs': 'Any other arguments you want to pass to the .run() method of the LLM. (optional)' }, inputs = { 'input_examples': 'The in-context example inputs to include in the prompt.', 'output_examples': 'The in-context example outputs to include in the prompt.', 'inputs': 'The inputs to process with the LLM.' }, outputs = { 'prompts': 'The prompts processed with the LLM.', 'generations': 'The generations by the LLM.' }, )
- property output: OutputDataset | OutputIterableDataset[source]#
The output dataset of the step.
- class datadreamer.steps.RunTaskModel(name, args=None, inputs=None, outputs=None, progress_interval=60, force=False, verbose=None, log_level=None, save_num_proc=None, save_num_shards=None, background=False)[source]#
Bases: Step
RunTaskModel( name = 'The name of the step.', args = { 'model': 'The TaskModel to use.', 'truncate': 'Whether or not to truncate inputs. (optional, defaults to False)', 'lazy': 'Whether to run lazily or not. (optional, defaults to False)', '**kwargs': 'Any other arguments you want to pass to the .run() method of the TaskModel. (optional)' }, inputs = { 'texts': 'The texts to process with the TaskModel.' }, outputs = { 'texts': 'The texts processed with the TaskModel.', 'results': 'The results from the TaskModel.' }, )
- property output: OutputDataset | OutputIterableDataset[source]#
The output dataset of the step.
- class datadreamer.steps.Embed(name, args=None, inputs=None, outputs=None, progress_interval=60, force=False, verbose=None, log_level=None, save_num_proc=None, save_num_shards=None, background=False)[source]#
Bases: Step
Embed( name = 'The name of the step.', args = { 'embedder': 'The Embedder to use.', 'truncate': 'Whether or not to truncate inputs. (optional, defaults to False)', 'instruction': 'The instruction to prefix inputs to the embedding model with. (optional)', 'lazy': 'Whether to run lazily or not. (optional, defaults to False)', '**kwargs': 'Any other arguments you want to pass to the .run() method of the Embedder. (optional)' }, inputs = { 'texts': 'The texts to embed.' }, outputs = { 'texts': 'The texts that were embedded.', 'embeddings': 'The embeddings by the Embedder.' }, )
- property output: OutputDataset | OutputIterableDataset[source]#
The output dataset of the step.
- class datadreamer.steps.Retrieve(name, args=None, inputs=None, outputs=None, progress_interval=60, force=False, verbose=None, log_level=None, save_num_proc=None, save_num_shards=None, background=False)[source]#
Bases: Step
Retrieve( name = 'The name of the step.', args = { 'retriever': 'The Retriever to use.', 'k': 'How many results to retrieve. (optional, defaults to 5)', 'lazy': 'Whether to run lazily or not. (optional, defaults to False)', '**kwargs': 'Any other arguments you want to pass to the .run() method of the Retriever. (optional)' }, inputs = { 'queries': 'The queries to retrieve results for.' }, outputs = { 'queries': 'The queries used to retrieve results.', 'results': 'The results from the Retriever.' }, )
- property output: OutputDataset | OutputIterableDataset[source]#
The output dataset of the step.
- datadreamer.steps.concat(*steps, name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
- Return type:
- datadreamer.steps.zipped(*steps, name=None, lazy=True, progress_interval=DEFAULT, force=False, writer_batch_size=1000, save_num_proc=DEFAULT, save_num_shards=DEFAULT, background=False)[source]#
- Return type:
- class datadreamer.steps.DataCardType[source]#
Bases: object
- MODEL_NAME = 'Model Name'#
- DATASET_NAME = 'Dataset Name'#
- LICENSE = 'License Information'#
- CITATION = 'Citation Information'#
- DATASET_CARD = 'Dataset Card'#
- MODEL_CARD = 'Model Card'#
- URL = 'URL'#
- class datadreamer.steps.LazyRows(value, total_num_rows=None, auto_progress=True, save=False, save_writer_batch_size=None)[source]#
Bases: object