embedders#

Embedder objects help convert texts to embeddings. All embedders derive from the Embedder base class.

Tip

Instead of calling run() directly, either use a step that takes an Embedder in its args (such as Embed), or construct a Retriever with the embedder and then use a retrieval step (such as Retrieve).

class datadreamer.embedders.Embedder(model_name, cache_folder_path=None)[source]#

Bases: TaskModel

abstract property dims: int[source]#
abstract run(texts, truncate=False, instruction=None, batch_size=10, batch_scheduler_buffer_size=None, adaptive_batch_size=True, progress_interval=60, force=False, cache_only=False, verbose=None, log_level=None, total_num_texts=None, return_generator=False, **kwargs)[source]#
Return type:

Union[Generator[Any, None, None], list[Any]]

abstract count_tokens(value)[source]#
Return type:

int

abstract property model_max_length: int[source]#
unload_model()[source]#
class datadreamer.embedders.SentenceTransformersEmbedder(model_name, device=None, dtype=None, cache_folder_path=None, **kwargs)[source]#

Bases: Embedder

property model: SentenceTransformer[source]#
property tokenizer: Any[source]#
property model_max_length: int[source]#
property dims: int[source]#
run(texts, truncate=False, instruction=None, batch_size=10, batch_scheduler_buffer_size=None, adaptive_batch_size=True, progress_interval=60, force=False, cache_only=False, verbose=None, log_level=None, total_num_texts=None, return_generator=False, **kwargs)[source]#
Return type:

Union[Generator[ndarray, None, None], list[ndarray]]

class datadreamer.embedders.ParallelEmbedder(*embedders)[source]#

Bases: ParallelTaskModel, Embedder

property dims: int[source]#
property model_max_length: int[source]#
run(texts, *args, **kwargs)[source]#
Return type:

Union[Generator[Any, None, None], list[Any]]