Binary data with lazy loading

Author: Aureliana Barghini (B-Open)

If you want your backend to work efficiently with big datasets, you should support lazy loading. To do that, you need to:

  • implement _raw_indexing_method for reading blocks from disk

  • implement some glue code to make it work with Xarray:

    • put your _raw_indexing_method in a subclass of BackendArray

    • replace the numpy.ndarray inside your dataset with your BackendArray subclass


Create sample files

import os

import dask
import numpy as np
import xarray as xr

# Write 30 million int64 values as raw binary
arr = np.arange(30000000, dtype=np.int64)
with open("foo.bin", "wb") as f:
    arr.tofile(f)

# The same values as float64, for exercising the dtype parameter later
arr = np.arange(30000000, dtype=np.float64)
with open("foo_float.bin", "wb") as f:
    arr.tofile(f)
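As a quick, optional sanity check (not part of the recipe itself), you can read a few values back with plain numpy to confirm the raw layout the backend will be reading:

# Illustrative check: the first five int64 values of the sample file
check = np.fromfile("foo.bin", dtype=np.int64, count=5)
print(check)  # [0 1 2 3 4]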

BinaryBackendArray

The BackendArray subclass must implement the following methods and attributes:

  • a _raw_indexing_method that supports item selection and slicing

  • a __getitem__ that wraps _raw_indexing_method in the Xarray helper function explicit_indexing_adapter (thread-safe)

  • a shape attribute

  • a dtype attribute

class BinaryBackendArray(xr.backends.BackendArray):
    def __init__(
        self,
        filename_or_obj,
        shape,
        dtype,
        lock,
    ):
        self.filename_or_obj = filename_or_obj
        self.shape = shape
        self.dtype = dtype
        self.lock = lock

    def __getitem__(self, key: tuple):
        return xr.core.indexing.explicit_indexing_adapter(
            key,
            self.shape,
            xr.core.indexing.IndexingSupport.BASIC,
            self._raw_indexing_method,
        )

    def _raw_indexing_method(self, key: tuple):
        # key is a tuple with one integer or slice per dimension; this
        # simple example handles a single dimension and assumes
        # non-negative indices and contiguous slices (step 1).
        key0 = key[0]
        size = np.dtype(self.dtype).itemsize

        if isinstance(key0, slice):
            start = key0.start or 0
            stop = key0.stop or self.shape[0]
            offset = size * start
            count = stop - start
        else:
            offset = size * key0
            count = 1

        # The lock serializes file access so the array can be read safely
        # from multiple dask threads; the file must be opened in binary
        # mode for np.fromfile's offset argument to work.
        with self.lock, open(self.filename_or_obj, "rb") as f:
            arr = np.fromfile(f, self.dtype, offset=offset, count=count)

        # For a scalar index, drop the length-1 dimension
        if isinstance(key0, int):
            arr = arr.squeeze()

        return arr
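Before wiring the class into an entrypoint, you can exercise _raw_indexing_method directly. This is just an illustrative check, with the shape and lock chosen to match the sample file created above:

lock = dask.utils.SerializableLock()
backend_array = BinaryBackendArray(
    "foo.bin", shape=(30000000,), dtype=np.int64, lock=lock
)

# Read one contiguous block of five values starting at element 10
backend_array._raw_indexing_method((slice(10, 15),))  # array([10, 11, 12, 13, 14])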

BinaryBackend Entrypoint

class BinaryBackend(xr.backends.BackendEntrypoint):
    def open_dataset(self, filename_or_obj, *, drop_variables=None, dtype=np.int64):
        size = np.dtype(dtype).itemsize
        # Number of elements in the file, inferred from the file size
        shape = os.stat(filename_or_obj).st_size // size

        backend_array = BinaryBackendArray(
            filename_or_obj=filename_or_obj,
            shape=(shape,),
            dtype=dtype,
            lock=dask.utils.SerializableLock(),
        )
        # LazilyIndexedArray defers reading until the data is actually indexed
        data = xr.core.indexing.LazilyIndexedArray(backend_array)

        var = xr.Variable(dims=("x",), data=data)
        return xr.Dataset({"foo": var})
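With the entrypoint in place, the file can be opened through the standard Xarray API by passing the class as engine, even without dask. A minimal check, assuming the sample files created above: indexing the result reads only the requested block from disk.

ds = xr.open_dataset("foo.bin", engine=BinaryBackend)
ds["foo"][:10].values  # reads only the first ten values from disk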

Reduced memory usage with dask

Passing chunks to open_dataarray wraps the lazy BackendArray in a dask array, so only the blocks that a computation actually touches are read from disk:

arr = xr.open_dataarray("foo.bin", engine=BinaryBackend, chunks=10000)
arr

<xarray.DataArray 'foo' (x: 30000000)>
dask.array<open_dataset-foo, shape=(30000000,), dtype=int64, chunksize=(10000,), chunktype=numpy.ndarray>
Dimensions without coordinates: x

arr.mean()

<xarray.DataArray 'foo' ()>
dask.array<mean_agg-aggregate, shape=(), dtype=float64, chunksize=(), chunktype=numpy.ndarray>

arr.sel(x=slice(0, 10))

<xarray.DataArray 'foo' (x: 10)>
dask.array<getitem, shape=(10,), dtype=int64, chunksize=(10,), chunktype=numpy.ndarray>
Dimensions without coordinates: x

arr.sel(x=slice(0, 10)).compute()

<xarray.DataArray 'foo' (x: 10)>
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
Dimensions without coordinates: x

arr.load()

<xarray.DataArray 'foo' (x: 30000000)>
array([       0,        1,        2, ..., 29999997, 29999998, 29999999])
Dimensions without coordinates: x
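The dtype parameter added to open_dataset lets the same engine read the float file created earlier; Xarray forwards extra keyword arguments to the backend. For example:

arr_float = xr.open_dataarray(
    "foo_float.bin", engine=BinaryBackend, dtype=np.float64, chunks=10000
)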