Skip to content

TensorProcessor

Processor interface for converting between different tensor formats.

TensorProcessor

Tensor processor class.

Source code in src/tensorshare/serialization/processor.py
class TensorProcessor:
    """Tensor processor class.

    Groups two static helpers: `serialize` turns a dictionary of tensors into base64 encoded
    bytes, and `deserialize` turns those bytes back into a dictionary of tensors. The actual
    conversion is delegated to the backend-specific method returned by `_get_backend_method`.
    """

    @staticmethod
    def serialize(
        tensors: Dict[
            str,
            Union["Array", "np.ndarray", "paddle.Tensor", "tf.Tensor", "torch.Tensor"],
        ],
        metadata: Optional[Dict[str, str]] = None,
        backend: Optional[Union[str, Backend]] = None,
    ) -> Tuple[bytes, ByteSize]:
        """Serialize a dictionary of tensors to base64 encoded bytes and report their size.

        The backend is used if provided (as a `Backend` member or as its case-insensitive
        name); otherwise it is inferred from the tensors with `_infer_backend`.

        Args:
            tensors (Dict[str, Union[Array, np.ndarray, paddle.Tensor, tf.Tensor, torch.Tensor]]):
                Tensors stored in a dictionary with their name as key.
            metadata (Optional[Dict[str, str]], optional):
                Metadata to add to the safetensors file. Defaults to None.
            backend (Optional[Union[str, Backend]], optional):
                Backend to use for the conversion. Defaults to None.
                If None, the backend will be inferred from the tensors format.
                Backend can be one of the following:
                    - Backend.FLAX or 'flax'
                    - Backend.NUMPY or 'numpy'
                    - Backend.PADDLEPADDLE or 'paddlepaddle'
                    - Backend.TENSORFLOW or 'tensorflow'
                    - Backend.TORCH or 'torch'

        Raises:
            TypeError: If tensors is not a dictionary.
            TypeError: If backend is not a string or an instance of Backend enum.
            ValueError: If tensors is empty.
            KeyError: If backend is not one of the supported backends.

        Returns:
            Tuple[bytes, ByteSize]:
                A tuple containing the base64 encoded serialized tensors and the size of the
                serialized tensors. The size is measured on the serialized bytes before the
                base64 encoding is applied.
        """
        if not isinstance(tensors, dict):
            message = (
                f"Tensors should be a dictionary, got `{type(tensors)}` instead."
                " Consider using the `prepare_tensors_to_dict` to lazy format "
                "your tensors. Check"
                " https://chainyo.github.io/tensorshare/usage/tensorshare/#lazy-tensors-formatting"
            )
            logger.warning(message)
            # Carry the explanation in the exception too, so callers that do not
            # capture logs still learn why the call was rejected.
            raise TypeError(message)
        if not tensors:
            raise ValueError("Tensors dictionary cannot be empty.")

        if backend is None:
            logger.warning(
                "No backend specified. The backend will be inferred from the tensors"
                " format."
                " If you want to specify the backend, use the `backend` argument. Check"
                " https://chainyo.github.io/tensorshare/usage/tensorshare/#with-a-specific-backend"
            )
            _backend = _infer_backend(tensors)
        elif isinstance(backend, Backend):
            # Checked before `str` so that an enum member always binds `_backend`,
            # whether or not Backend also derives from str.
            _backend = backend
        elif isinstance(backend, str):
            try:
                _backend = Backend[backend.upper()]
            except KeyError as e:
                raise KeyError(
                    f"Invalid backend `{backend}`. Must be one of {list(Backend)}."
                ) from e
        else:
            raise TypeError(
                "Backend must be a string or an instance of Backend enum, got"
                f" `{type(backend)}` instead. Use"
                " `tensorshare.serialization.Backend` to access the Backend enum."
                " If you don't specify a backend, it will be inferred from the"
                " tensors format."
            )

        _tensors = _get_backend_method(_backend, "serialize")(
            tensors, metadata=metadata
        )

        return base64.b64encode(_tensors), ByteSize(len(_tensors))

    @staticmethod
    def deserialize(
        data: bytes,
        backend: Union[str, Backend],
    ) -> Dict[
        str, Union["Array", "np.ndarray", "paddle.Tensor", "tf.Tensor", "torch.Tensor"]
    ]:
        """Deserialize base64 encoded serialized tensors to a dictionary of tensors.

        The data is base64 decoded and handed to the `deserialize` method of the requested
        backend. The backend must be specified in order to deserialize the data.

        Args:
            data (bytes):
                The base64 encoded serialized tensors to deserialize.
            backend (Union[str, Backend]):
                The backend to use for the conversion, given as a `Backend` member or as its
                case-insensitive name. Must be one of the following:
                    - Backend.FLAX or 'flax'
                    - Backend.NUMPY or 'numpy'
                    - Backend.PADDLEPADDLE or 'paddlepaddle'
                    - Backend.TENSORFLOW or 'tensorflow'
                    - Backend.TORCH or 'torch'

        Raises:
            TypeError: If data is not bytes.
            TypeError: If backend is not a string or an instance of Backend enum.
            KeyError: If backend is not one of the supported backends.

        Returns:
            Dict[str, Union[Array, np.ndarray, paddle.Tensor, tf.Tensor, torch.Tensor]]:
                A dictionary of tensors in the specified backend with their name as key.
        """
        if not isinstance(data, bytes):
            raise TypeError(f"Data must be bytes, got `{type(data)}` instead.")

        if isinstance(backend, Backend):
            # Checked before `str` so that an enum member always binds `_backend`,
            # whether or not Backend also derives from str.
            _backend = backend
        elif isinstance(backend, str):
            try:
                _backend = Backend[backend.upper()]
            except KeyError as e:
                raise KeyError(
                    f"Invalid backend `{backend}`. Must be one of {list(Backend)}."
                ) from e
        else:
            raise TypeError(
                "Backend must be a string or an instance of Backend enum, got"
                f" `{type(backend)}` instead. Use `tensorshare.serialization.Backend`"
                " to access the Backend enum."
            )

        tensors = _get_backend_method(_backend, "deserialize")(base64.b64decode(data))

        return tensors  # type: ignore

deserialize(data, backend) staticmethod

Deserialize base64 encoded serialized tensors to a dictionary of tensors.

This method will convert TensorShare.tensors to a dictionary of tensors with their name as key. The backend must be specified in order to deserialize the data.

Parameters:

Name Type Description Default
data bytes

The base64 encoded serialized tensors to deserialize.

required
backend Union[str, Backend]

The backend to use for the conversion. Must be one of the following: Backend.FLAX or 'flax'; Backend.NUMPY or 'numpy'; Backend.PADDLEPADDLE or 'paddlepaddle'; Backend.TENSORFLOW or 'tensorflow'; Backend.TORCH or 'torch'.

required

Raises:

Type Description
TypeError

If data is not bytes.

TypeError

If backend is not a string or an instance of Backend enum.

KeyError

If backend is not one of the supported backends.

Returns:

Type Description
Dict[str, Union[Array, ndarray, Tensor, Tensor, Tensor]]

Dict[str, Union[Array, np.ndarray, paddle.Tensor, tf.Tensor, torch.Tensor]]: A dictionary of tensors in the specified backend with their name as key.

Source code in src/tensorshare/serialization/processor.py
@staticmethod
def deserialize(
    data: bytes,
    backend: Union[str, Backend],
) -> Dict[
    str, Union["Array", "np.ndarray", "paddle.Tensor", "tf.Tensor", "torch.Tensor"]
]:
    """Deserialize base64 encoded serialized tensors to a dictionary of tensors.

    The data is base64 decoded and handed to the `deserialize` method of the requested
    backend. The backend must be specified in order to deserialize the data.

    Args:
        data (bytes):
            The base64 encoded serialized tensors to deserialize.
        backend (Union[str, Backend]):
            The backend to use for the conversion, given as a `Backend` member or as its
            case-insensitive name. Must be one of the following:
                - Backend.FLAX or 'flax'
                - Backend.NUMPY or 'numpy'
                - Backend.PADDLEPADDLE or 'paddlepaddle'
                - Backend.TENSORFLOW or 'tensorflow'
                - Backend.TORCH or 'torch'

    Raises:
        TypeError: If data is not bytes.
        TypeError: If backend is not a string or an instance of Backend enum.
        KeyError: If backend is not one of the supported backends.

    Returns:
        Dict[str, Union[Array, np.ndarray, paddle.Tensor, tf.Tensor, torch.Tensor]]:
            A dictionary of tensors in the specified backend with their name as key.
    """
    if not isinstance(data, bytes):
        raise TypeError(f"Data must be bytes, got `{type(data)}` instead.")

    if isinstance(backend, Backend):
        # Checked before `str` so that an enum member always binds `_backend`,
        # whether or not Backend also derives from str.
        _backend = backend
    elif isinstance(backend, str):
        try:
            _backend = Backend[backend.upper()]
        except KeyError as e:
            raise KeyError(
                f"Invalid backend `{backend}`. Must be one of {list(Backend)}."
            ) from e
    else:
        raise TypeError(
            "Backend must be a string or an instance of Backend enum, got"
            f" `{type(backend)}` instead. Use `tensorshare.serialization.Backend`"
            " to access the Backend enum."
        )

    tensors = _get_backend_method(_backend, "deserialize")(base64.b64decode(data))

    return tensors  # type: ignore

serialize(tensors, metadata=None, backend=None) staticmethod

Serialize a dictionary of tensors to a tuple containing the serialized tensors and their size.

This method will convert a dictionary of tensors to a tuple containing the base64 encoded serialized tensors and the size of the serialized tensors. It will use the backend if provided, otherwise it will try to infer the backend from the tensors format.

Parameters:

Name Type Description Default
tensors Dict[str, Union[Array, ndarray, Tensor, Tensor, Tensor]]

Tensors stored in a dictionary with their name as key.

required
metadata Optional[Dict[str, str]]

Metadata to add to the safetensors file. Defaults to None.

None
backend Optional[Union[str, Backend]]

Backend to use for the conversion. Defaults to None. If None, the backend will be inferred from the tensors format. Backend can be one of the following: Backend.FLAX or 'flax'; Backend.NUMPY or 'numpy'; Backend.PADDLEPADDLE or 'paddlepaddle'; Backend.TENSORFLOW or 'tensorflow'; Backend.TORCH or 'torch'.

None

Raises:

Type Description
TypeError

If tensors is not a dictionary.

TypeError

If backend is not a string or an instance of Backend enum.

ValueError

If tensors is empty.

KeyError

If backend is not one of the supported backends.

Returns:

Type Description
Tuple[bytes, ByteSize]

Tuple[bytes, ByteSize]: A tuple containing the base64 encoded serialized tensors and the size of the serialized tensors.

Source code in src/tensorshare/serialization/processor.py
@staticmethod
def serialize(
    tensors: Dict[
        str,
        Union["Array", "np.ndarray", "paddle.Tensor", "tf.Tensor", "torch.Tensor"],
    ],
    metadata: Optional[Dict[str, str]] = None,
    backend: Optional[Union[str, Backend]] = None,
) -> Tuple[bytes, ByteSize]:
    """Serialize a dictionary of tensors to base64 encoded bytes and report their size.

    The backend is used if provided (as a `Backend` member or as its case-insensitive
    name); otherwise it is inferred from the tensors with `_infer_backend`.

    Args:
        tensors (Dict[str, Union[Array, np.ndarray, paddle.Tensor, tf.Tensor, torch.Tensor]]):
            Tensors stored in a dictionary with their name as key.
        metadata (Optional[Dict[str, str]], optional):
            Metadata to add to the safetensors file. Defaults to None.
        backend (Optional[Union[str, Backend]], optional):
            Backend to use for the conversion. Defaults to None.
            If None, the backend will be inferred from the tensors format.
            Backend can be one of the following:
                - Backend.FLAX or 'flax'
                - Backend.NUMPY or 'numpy'
                - Backend.PADDLEPADDLE or 'paddlepaddle'
                - Backend.TENSORFLOW or 'tensorflow'
                - Backend.TORCH or 'torch'

    Raises:
        TypeError: If tensors is not a dictionary.
        TypeError: If backend is not a string or an instance of Backend enum.
        ValueError: If tensors is empty.
        KeyError: If backend is not one of the supported backends.

    Returns:
        Tuple[bytes, ByteSize]:
            A tuple containing the base64 encoded serialized tensors and the size of the
            serialized tensors. The size is measured on the serialized bytes before the
            base64 encoding is applied.
    """
    if not isinstance(tensors, dict):
        message = (
            f"Tensors should be a dictionary, got `{type(tensors)}` instead."
            " Consider using the `prepare_tensors_to_dict` to lazy format "
            "your tensors. Check"
            " https://chainyo.github.io/tensorshare/usage/tensorshare/#lazy-tensors-formatting"
        )
        logger.warning(message)
        # Carry the explanation in the exception too, so callers that do not
        # capture logs still learn why the call was rejected.
        raise TypeError(message)
    if not tensors:
        raise ValueError("Tensors dictionary cannot be empty.")

    if backend is None:
        logger.warning(
            "No backend specified. The backend will be inferred from the tensors"
            " format."
            " If you want to specify the backend, use the `backend` argument. Check"
            " https://chainyo.github.io/tensorshare/usage/tensorshare/#with-a-specific-backend"
        )
        _backend = _infer_backend(tensors)
    elif isinstance(backend, Backend):
        # Checked before `str` so that an enum member always binds `_backend`,
        # whether or not Backend also derives from str.
        _backend = backend
    elif isinstance(backend, str):
        try:
            _backend = Backend[backend.upper()]
        except KeyError as e:
            raise KeyError(
                f"Invalid backend `{backend}`. Must be one of {list(Backend)}."
            ) from e
    else:
        raise TypeError(
            "Backend must be a string or an instance of Backend enum, got"
            f" `{type(backend)}` instead. Use"
            " `tensorshare.serialization.Backend` to access the Backend enum."
            " If you don't specify a backend, it will be inferred from the"
            " tensors format."
        )

    _tensors = _get_backend_method(_backend, "serialize")(
        tensors, metadata=metadata
    )

    return base64.b64encode(_tensors), ByteSize(len(_tensors))

Last update: 2023-08-20
Created: 2023-08-20