llms#
LLM objects help run an LLM on prompts. All LLMs derive from the
LLM base class.
Tip
Instead of using run() directly, use a
step that takes an LLM as an args
argument, such as Prompt or
FewShotPrompt.
Efficient Generation Techniques#
Throughput#
DataDreamer provides efficient generation through a variety of techniques that can optimize throughput.
Adaptive Batch Sizing
For locally running LLMs, the maximum batch size that the LLM can handle before running out of memory depends on the amount of memory available on your system, but also on the maximum sequence length of the batch of inputs passed to the LLM, since longer inputs require more memory. Over many iterations, DataDreamer will automatically learn the maximum batch size that the LLM can handle for a given sequence length and will adaptively adjust the batch size to maximize throughput.
The maximum batch size that will ever be used is the batch_size argument passed
to the run() method. DataDreamer will try to find the largest batch
size that the LLM can handle that is less than or equal to batch_size. If
a batch size is too large, DataDreamer will automatically catch the out-of-memory
error, reduce the batch size, and remember the reduction for future iterations.
To disable adaptive batch sizing, you can pass adaptive_batch_size=False to the
run() method.
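The adaptive strategy can be sketched in plain Python. This is an illustrative toy, not DataDreamer's implementation: `generate_batch` is a hypothetical stand-in for an LLM call that runs out of memory on batches larger than 4, and the loop halves the batch size on failure and keeps the learned size for later batches.

```python
# Toy sketch of adaptive batch sizing (not DataDreamer's actual code).
class OutOfMemoryError(Exception):
    pass

def generate_batch(prompts):
    # Hypothetical LLM call: pretend batches larger than 4 exhaust memory.
    if len(prompts) > 4:
        raise OutOfMemoryError()
    return [f"output for: {p}" for p in prompts]

def run_with_adaptive_batch_size(prompts, batch_size):
    learned_batch_size = batch_size  # largest size known (so far) to fit
    outputs = []
    i = 0
    while i < len(prompts):
        batch = prompts[i : i + learned_batch_size]
        try:
            outputs.extend(generate_batch(batch))
            i += len(batch)
        except OutOfMemoryError:
            # Halve the batch size and retry; keep the reduction for
            # all future batches in this run.
            learned_batch_size = max(1, learned_batch_size // 2)
    return outputs, learned_batch_size

outputs, learned = run_with_adaptive_batch_size(
    [f"p{i}" for i in range(10)], batch_size=10
)
```

Here the run starts at the requested batch_size of 10, backs off twice after simulated out-of-memory errors, and settles on a batch size that fits.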
Batch Scheduling
To minimize the amount of padding the LLM has to process, DataDreamer will attempt to schedule batches so that all sequences in a batch have similar lengths.
To do this, DataDreamer reads a large buffer of prompts, sorts the prompts by
length, and then schedules batches of size batch_size from the sorted prompts.
To manually control the size of the buffer, you can pass a
batch_scheduler_buffer_size to the run() method.
To disable batch scheduling, you can set batch_scheduler_buffer_size equal to
batch_size.
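The scheduling idea can be sketched as follows. This is an illustrative sketch of the sort-by-length approach described above, not DataDreamer's actual scheduler:

```python
# Toy sketch of length-sorted batch scheduling: read a buffer of prompts,
# sort by length, then emit batches of batch_size from the sorted buffer.
def schedule_batches(prompts, batch_size, batch_scheduler_buffer_size):
    for start in range(0, len(prompts), batch_scheduler_buffer_size):
        buffer = sorted(prompts[start : start + batch_scheduler_buffer_size], key=len)
        for b in range(0, len(buffer), batch_size):
            yield buffer[b : b + batch_size]

batches = list(schedule_batches(
    ["a", "dddd", "bb", "ccc", "ffffff", "eeeee"],
    batch_size=2,
    batch_scheduler_buffer_size=6,
))
# Each batch now groups prompts of similar length, so little padding is needed.
```

Note that when the buffer size equals batch_size, each buffer is exactly one batch, so sorting within a buffer has no cross-batch effect; this is why setting batch_scheduler_buffer_size equal to batch_size disables scheduling.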
Robustness & Retries
For API-based LLMs, DataDreamer will attempt to retry failed requests. This can
be disabled via retry_on_fail=False.
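A retry loop of this kind typically looks like the sketch below. This is a generic exponential-backoff pattern, not DataDreamer's internals; `flaky_request` is a hypothetical API call that fails twice before succeeding.

```python
import time

# Generic retry helper with exponential backoff (illustrative only).
# Disabling retries (cf. retry_on_fail=False) corresponds to calling
# request_fn once and letting any error propagate.
def call_with_retries(request_fn, max_retries=3, base_delay=1.0):
    for attempt in range(max_retries + 1):
        try:
            return request_fn()
        except Exception:
            if attempt == max_retries:
                raise  # retries exhausted: surface the original error
            time.sleep(base_delay * 2 ** attempt)  # wait longer each attempt

attempts = {"n": 0}
def flaky_request():
    attempts["n"] += 1
    if attempts["n"] < 3:
        raise RuntimeError("transient API error")
    return "ok"

result = call_with_retries(flaky_request, base_delay=0.01)
```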
Running on Multiple GPUs#
See the Running Models on Multiple GPUs page.
Quantization#
See the Quantization page.
Caching#
LLMs internally perform caching to disk, so if you run the same prompt with the same generation settings multiple times, the LLM will only run the prompt once and then cache the results for future runs.
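The caching behavior can be pictured as a disk-backed lookup keyed on the prompt plus its generation settings. The sketch below is illustrative; DataDreamer's actual on-disk cache format may differ. `fake_llm` is a hypothetical stand-in for a real generation call.

```python
import hashlib
import json
import sqlite3

# Illustrative result cache keyed on (prompt, generation settings).
def cache_key(prompt, **generation_settings):
    payload = json.dumps({"prompt": prompt, **generation_settings}, sort_keys=True)
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()

class DiskCache:
    def __init__(self, path=":memory:"):  # a real cache would use a file path
        self.db = sqlite3.connect(path)
        self.db.execute("CREATE TABLE IF NOT EXISTS cache (key TEXT PRIMARY KEY, value TEXT)")

    def get_or_generate(self, prompt, generate_fn, **settings):
        key = cache_key(prompt, **settings)
        row = self.db.execute("SELECT value FROM cache WHERE key = ?", (key,)).fetchone()
        if row:
            return row[0]  # cache hit: the LLM is not run again
        value = generate_fn(prompt)
        self.db.execute("INSERT INTO cache VALUES (?, ?)", (key, value))
        self.db.commit()
        return value

calls = {"n": 0}
def fake_llm(prompt):
    calls["n"] += 1
    return prompt.upper()

cache = DiskCache()
first = cache.get_or_generate("hello", fake_llm, temperature=1.0)
second = cache.get_or_generate("hello", fake_llm, temperature=1.0)  # cache hit
```

Because the settings are part of the key, re-running the same prompt with a different temperature (for example) would trigger a fresh generation rather than a cache hit.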
- class datadreamer.llms.LLM(cache_folder_path=None)[source]#
Bases: _Cachable
Base class for all LLMs.
- Parameters:
cache_folder_path (Optional[str], default: None) – The path to the cache folder. If None, the default cache folder for the DataDreamer session will be used.
- abstract get_max_context_length(max_new_tokens)[source]#
Gets the maximum context length for the model. When max_new_tokens is greater than 0, the maximum number of tokens that can be used for the prompt context is returned.
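The documented contract amounts to subtracting the reserved generation budget from the model's context window. A minimal sketch, assuming a hypothetical 4096-token context window (the real value depends on the concrete model):

```python
# Illustrative sketch of get_max_context_length's documented behavior:
# tokens available for the prompt = context window - tokens reserved
# for generation. The 4096-token window is a hypothetical example value.
def get_max_context_length(max_new_tokens, model_context_window=4096):
    return model_context_window - max_new_tokens
```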
- format_prompt(max_new_tokens=None, beg_instruction=None, in_context_examples=None, end_instruction=None, sep='\\n', min_in_context_examples=None, max_in_context_examples=None)[source]#
Formats a prompt for the LLM given instructions and in-context examples.
Prompt Format
The final prompt will be constructed as follows:
beg_instruction sep in_context_example_1 sep in_context_example_2 sep ... sep in_context_example_n sep end_instruction
If
beg_instruction,in_context_examples, andend_instructionareNone, they will not be included in the prompt.If all of the
in_context_exampleswill not fit in the prompt (accounting for the possiblemax_new_tokensthat may be generated) the prompt will be constructed with as many in-context examples that will fit.If
min_in_context_examplesandmax_in_context_examplesare set, those constraints will be enforced.- Parameters:
max_new_tokens (Optional[int], default: None) – The maximum number of tokens that can be generated.
beg_instruction (Optional[str], default: None) – The instruction at the beginning of the prompt.
in_context_examples (Optional[list[str]], default: None) – The in-context examples to include in the prompt.
end_instruction (Optional[str], default: None) – The instruction at the end of the prompt.
sep (default: '\\n') – The separator to use between the instructions and in-context examples.
min_in_context_examples (Optional[int], default: None) – The minimum number of in-context examples to include in the prompt.
max_in_context_examples (Optional[int], default: None) – The maximum number of in-context examples to include in the prompt.
- Return type:
str
- Returns:
The formatted prompt.
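The prompt layout described above can be sketched in a few lines. This is an illustrative reimplementation of only the concatenation logic; the real format_prompt additionally truncates in-context examples to fit the context window, which is omitted here for brevity.

```python
# Illustrative sketch of the documented prompt layout:
# beg_instruction sep example_1 sep ... sep example_n sep end_instruction
def format_prompt(beg_instruction=None, in_context_examples=None,
                  end_instruction=None, sep="\n"):
    parts = []
    if beg_instruction is not None:
        parts.append(beg_instruction)
    parts.extend(in_context_examples or [])  # None sections are simply skipped
    if end_instruction is not None:
        parts.append(end_instruction)
    return sep.join(parts)

prompt = format_prompt(
    beg_instruction="Classify the sentiment:",
    in_context_examples=["I loved it => positive", "Awful. => negative"],
    end_instruction="It was fine. =>",
)
```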
- abstract run(prompts, max_new_tokens=None, temperature=1.0, top_p=0.0, n=1, stop=None, repetition_penalty=None, logit_bias=None, batch_size=10, batch_scheduler_buffer_size=None, adaptive_batch_size=False, seed=None, progress_interval=60, force=False, cache_only=False, verbose=None, log_level=None, total_num_prompts=None, return_generator=False, **kwargs)[source]#
- class datadreamer.llms.OpenAI(model_name, system_prompt=None, organization=None, api_key=None, base_url=None, api_version=None, retry_on_fail=True, cache_folder_path=None, **kwargs)[source]#
Bases: LLM
Base class for all LLMs.
- Parameters:
cache_folder_path (Optional[str], default: None) – The path to the cache folder. If None, the default cache folder for the DataDreamer session will be used.
- run(prompts, max_new_tokens=None, temperature=1.0, top_p=0.0, n=1, stop=None, repetition_penalty=None, logit_bias=None, batch_size=10, batch_scheduler_buffer_size=None, adaptive_batch_size=False, seed=None, progress_interval=60, force=False, cache_only=False, verbose=None, log_level=None, total_num_prompts=None, return_generator=False, **kwargs)[source]#
- class datadreamer.llms.OpenAIAssistant(model_name, system_prompt=None, tools=None, organization=None, api_key=None, base_url=None, api_version=None, retry_on_fail=True, cache_folder_path=None, **kwargs)[source]#
Bases: OpenAI
Base class for all LLMs.
- Parameters:
cache_folder_path (Optional[str], default: None) – The path to the cache folder. If None, the default cache folder for the DataDreamer session will be used.
- class datadreamer.llms.HFTransformers(model_name, chat_prompt_template=AUTO, system_prompt=AUTO, revision=None, trust_remote_code=False, device=None, device_map=None, dtype=None, quantization_config=None, adapter_name=None, adapter_kwargs=None, cache_folder_path=None, **kwargs)[source]#
Bases: LLM
Base class for all LLMs.
- Parameters:
cache_folder_path (Optional[str], default: None) – The path to the cache folder. If None, the default cache folder for the DataDreamer session will be used.
- run(prompts, max_new_tokens=None, temperature=1.0, top_p=0.0, n=1, stop=None, repetition_penalty=None, logit_bias=None, batch_size=10, batch_scheduler_buffer_size=None, adaptive_batch_size=True, seed=None, progress_interval=60, force=False, cache_only=False, verbose=None, log_level=None, total_num_prompts=None, return_generator=False, **kwargs)[source]#
- class datadreamer.llms.CTransformers(model_name, model_type=None, model_file=None, max_context_length=None, chat_prompt_template=AUTO, system_prompt=AUTO, revision=None, threads=None, gpu_layers=0, cache_folder_path=None, **kwargs)[source]#
Bases: HFTransformers
Base class for all LLMs.
- Parameters:
cache_folder_path (Optional[str], default: None) – The path to the cache folder. If None, the default cache folder for the DataDreamer session will be used.
- run(prompts, max_new_tokens=None, temperature=1.0, top_p=0.0, n=1, stop=None, repetition_penalty=None, logit_bias=None, batch_size=10, batch_scheduler_buffer_size=None, adaptive_batch_size=False, seed=None, progress_interval=60, force=False, cache_only=False, verbose=None, log_level=None, total_num_prompts=None, return_generator=False, **kwargs)[source]#
- class datadreamer.llms.VLLM(model_name, chat_prompt_template=AUTO, system_prompt=AUTO, revision=None, trust_remote_code=False, device=None, dtype=None, quantization=None, swap_space=1, cache_folder_path=None, **kwargs)[source]#
Bases: HFTransformers
Base class for all LLMs.
- Parameters:
cache_folder_path (Optional[str], default: None) – The path to the cache folder. If None, the default cache folder for the DataDreamer session will be used.
- run(prompts, max_new_tokens=None, temperature=1.0, top_p=0.0, n=1, stop=None, repetition_penalty=None, logit_bias=None, batch_size=10, batch_scheduler_buffer_size=None, adaptive_batch_size=False, seed=None, progress_interval=60, force=False, cache_only=False, verbose=None, log_level=None, total_num_prompts=None, return_generator=False, **kwargs)[source]#
- class datadreamer.llms.Petals(model_name, chat_prompt_template=AUTO, system_prompt=AUTO, revision=None, trust_remote_code=False, device=None, dtype=None, adapter_name=None, cache_folder_path=None, **kwargs)[source]#
Bases: HFTransformers
Base class for all LLMs.
- Parameters:
cache_folder_path (Optional[str], default: None) – The path to the cache folder. If None, the default cache folder for the DataDreamer session will be used.
- run(prompts, max_new_tokens=None, temperature=1.0, top_p=0.0, n=1, stop=None, repetition_penalty=None, logit_bias=None, batch_size=10, batch_scheduler_buffer_size=None, adaptive_batch_size=True, seed=None, progress_interval=60, force=False, cache_only=False, verbose=None, log_level=None, total_num_prompts=None, return_generator=False, **kwargs)[source]#
- class datadreamer.llms.HFAPIEndpoint(endpoint, model_name, chat_prompt_template=AUTO, system_prompt=AUTO, token=None, revision=None, trust_remote_code=False, retry_on_fail=True, cache_folder_path=None, **kwargs)[source]#
Bases: HFTransformers
Base class for all LLMs.
- Parameters:
cache_folder_path (Optional[str], default: None) – The path to the cache folder. If None, the default cache folder for the DataDreamer session will be used.
- run(prompts, max_new_tokens=None, temperature=1.0, top_p=0.0, n=1, stop=None, repetition_penalty=None, logit_bias=None, batch_size=10, batch_scheduler_buffer_size=None, adaptive_batch_size=False, seed=None, progress_interval=60, force=False, cache_only=False, verbose=None, log_level=None, total_num_prompts=None, return_generator=False, **kwargs)[source]#
- class datadreamer.llms.Together(model_name, chat_prompt_template=AUTO, system_prompt=AUTO, api_key=None, max_context_length=None, tokenizer_model_name=None, tokenizer_revision=None, tokenizer_trust_remote_code=False, retry_on_fail=True, cache_folder_path=None, **kwargs)[source]#
Bases: LLMAPI
Base class for all LLMs.
- Parameters:
cache_folder_path (Optional[str], default: None) – The path to the cache folder. If None, the default cache folder for the DataDreamer session will be used.
- run(prompts, max_new_tokens=None, temperature=1.0, top_p=0.0, n=1, stop=None, repetition_penalty=None, logit_bias=None, batch_size=10, batch_scheduler_buffer_size=None, adaptive_batch_size=False, seed=None, progress_interval=60, force=False, cache_only=False, verbose=None, log_level=None, total_num_prompts=None, return_generator=False, **kwargs)[source]#
- class datadreamer.llms.MistralAI(model_name, api_key=None, retry_on_fail=True, cache_folder_path=None, **kwargs)[source]#
Bases: LLMAPI
Base class for all LLMs.
- Parameters:
cache_folder_path (Optional[str], default: None) – The path to the cache folder. If None, the default cache folder for the DataDreamer session will be used.
- run(prompts, max_new_tokens=None, temperature=1.0, top_p=0.0, n=1, stop=None, repetition_penalty=None, logit_bias=None, batch_size=10, batch_scheduler_buffer_size=None, adaptive_batch_size=False, seed=None, progress_interval=60, force=False, cache_only=False, verbose=None, log_level=None, total_num_prompts=None, return_generator=False, **kwargs)[source]#
- class datadreamer.llms.Anthropic(model_name, api_key=None, retry_on_fail=True, cache_folder_path=None, **kwargs)[source]#
Bases: LiteLLM
Base class for all LLMs.
- Parameters:
cache_folder_path (Optional[str], default: None) – The path to the cache folder. If None, the default cache folder for the DataDreamer session will be used.
- run(prompts, max_new_tokens=None, temperature=1.0, top_p=0.0, n=1, stop=None, repetition_penalty=None, logit_bias=None, batch_size=10, batch_scheduler_buffer_size=None, adaptive_batch_size=False, seed=None, progress_interval=60, force=False, cache_only=False, verbose=None, log_level=None, total_num_prompts=None, return_generator=False, **kwargs)[source]#
- class datadreamer.llms.Cohere(model_name, api_key=None, retry_on_fail=True, cache_folder_path=None, **kwargs)[source]#
Bases: LiteLLM
Base class for all LLMs.
- Parameters:
cache_folder_path (Optional[str], default: None) – The path to the cache folder. If None, the default cache folder for the DataDreamer session will be used.
- run(prompts, max_new_tokens=None, temperature=1.0, top_p=0.0, n=1, stop=None, repetition_penalty=None, logit_bias=None, batch_size=10, batch_scheduler_buffer_size=None, adaptive_batch_size=False, seed=None, progress_interval=60, force=False, cache_only=False, verbose=None, log_level=None, total_num_prompts=None, return_generator=False, **kwargs)[source]#
- class datadreamer.llms.AI21(model_name, api_key=None, retry_on_fail=True, cache_folder_path=None, **kwargs)[source]#
Bases: LiteLLM
Base class for all LLMs.
- Parameters:
cache_folder_path (Optional[str], default: None) – The path to the cache folder. If None, the default cache folder for the DataDreamer session will be used.
- run(prompts, max_new_tokens=None, temperature=1.0, top_p=0.0, n=1, stop=None, repetition_penalty=None, logit_bias=None, batch_size=10, batch_scheduler_buffer_size=None, adaptive_batch_size=False, seed=None, progress_interval=60, force=False, cache_only=False, verbose=None, log_level=None, total_num_prompts=None, return_generator=False, **kwargs)[source]#
- class datadreamer.llms.Bedrock(model_name, aws_access_key_id=None, aws_secret_access_key=None, aws_region_name=None, retry_on_fail=True, cache_folder_path=None, **kwargs)[source]#
Bases: LiteLLM
Base class for all LLMs.
- Parameters:
cache_folder_path (Optional[str], default: None) – The path to the cache folder. If None, the default cache folder for the DataDreamer session will be used.
- run(prompts, max_new_tokens=None, temperature=1.0, top_p=0.0, n=1, stop=None, repetition_penalty=None, logit_bias=None, batch_size=10, batch_scheduler_buffer_size=None, adaptive_batch_size=False, seed=None, progress_interval=60, force=False, cache_only=False, verbose=None, log_level=None, total_num_prompts=None, return_generator=False, **kwargs)[source]#
- class datadreamer.llms.GoogleAIStudio(model_name, api_key=None, retry_on_fail=True, cache_folder_path=None, **kwargs)[source]#
Bases: LiteLLM
Base class for all LLMs.
- Parameters:
cache_folder_path (Optional[str], default: None) – The path to the cache folder. If None, the default cache folder for the DataDreamer session will be used.
- run(prompts, max_new_tokens=None, temperature=1.0, top_p=0.0, n=1, stop=None, repetition_penalty=None, logit_bias=None, batch_size=10, batch_scheduler_buffer_size=None, adaptive_batch_size=False, seed=None, progress_interval=60, force=False, cache_only=False, verbose=None, log_level=None, total_num_prompts=None, return_generator=False, **kwargs)[source]#
- class datadreamer.llms.VertexAI(model_name, vertex_project=None, vertex_location=None, retry_on_fail=True, cache_folder_path=None, **kwargs)[source]#
Bases: LiteLLM
Base class for all LLMs.
- Parameters:
cache_folder_path (Optional[str], default: None) – The path to the cache folder. If None, the default cache folder for the DataDreamer session will be used.
- run(prompts, max_new_tokens=None, temperature=1.0, top_p=0.0, n=1, stop=None, repetition_penalty=None, logit_bias=None, batch_size=10, batch_scheduler_buffer_size=None, adaptive_batch_size=False, seed=None, progress_interval=60, force=False, cache_only=False, verbose=None, log_level=None, total_num_prompts=None, return_generator=False, **kwargs)[source]#
- class datadreamer.llms.ParallelLLM(*llms)[source]#
Bases: _ParallelCachable, LLM
Creates an LLM that will run multiple LLMs in parallel. See running models in parallel for more details.
- Parameters:
*llms (LLM) – The LLMs to run in parallel.