Source code for pynbody.util.hdf_vds

"""Toolset for creating a virtual dataset from multiple HDF5 files."""

from __future__ import annotations

import os
import tempfile
from typing import Iterable

import h5py


class HdfVdsMaker:
    """Tool for creating a virtual dataset from multiple HDF5 files.

    Input files given as paths are opened read-only by this object and can be
    released with :meth:`close` (or by using the object as a context manager).
    Input files given as already-open ``h5py.File`` objects remain the
    caller's responsibility and are never closed here.
    """

    def __init__(self, hdf_files: list[h5py.File | str]):
        """Collect the input files, opening any that are given as paths.

        Parameters
        ----------
        hdf_files
            The files to combine. Each entry is either an open ``h5py.File``
            (used as-is) or a path (opened here in mode ``'r'``).
        """
        self._files = []
        # Only handles opened here are recorded as owned, so that close()
        # never closes a file the caller passed in already open.
        self._owned_files = []
        try:
            for f in hdf_files:
                if not isinstance(f, h5py.File):
                    f = h5py.File(f, 'r')
                    self._owned_files.append(f)
                self._files.append(f)
        except Exception:
            # Opening one of the later files failed: do not leak the handles
            # that were already opened on the way here.
            self.close()
            raise

    def close(self) -> None:
        """Close the input files that this object opened itself.

        Files that were passed to the constructor as open ``h5py.File``
        objects are left untouched. After calling this, the maker should not
        be used to write further virtual datasets.
        """
        owned, self._owned_files = self._owned_files, []
        for f in owned:
            f.close()

    def __enter__(self) -> HdfVdsMaker:
        """Return the maker itself, so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the files opened by this object when leaving a ``with`` block."""
        self.close()

    def concatenation_keys(self) -> Iterable[str]:
        """Returns all keys to concatenate as VDS"""
        # TODO - make this general
        return ['Subhalos', 'SubhaloParticles', 'NestedSubhalos']

    def copy_keys(self) -> Iterable[str]:
        """Returns all keys to copy from the first file into the VDS.

        Examples of copy keys are headers or one-off arrays"""
        # TODO - make this general
        return ['SnapshotId']

    def make_hdf_vfile(self, filepath: str) -> h5py.File:
        """Create an HDF file with virtual datasets combining the datasets in the input files.

        Every key from :meth:`concatenation_keys` is written as a virtual
        dataset spanning all input files; every key from :meth:`copy_keys` is
        written as a virtual dataset mapping the first input file only.

        Parameters
        ----------
        filepath
            Path of the file to create; it is opened in mode ``'w'``.

        Returns
        -------
        h5py.File
            The handle that was used for writing. It has already been closed
            by the time it is returned, so re-open ``filepath`` to read the
            virtual datasets.
        """
        with h5py.File(name=filepath, mode='w') as hdf_vfile:
            for k in self.concatenation_keys():
                self.write_single_vds(k, hdf_vfile)
            for k in self.copy_keys():
                self.write_single_vds(k, hdf_vfile, first_only=True)
        return hdf_vfile

    def write_single_vds(self, key: str, target_hdf_file: h5py.File, first_only: bool=False):
        """Write a single virtual dataset to the target HDF file.

        The datasets named ``key`` in the input files are stacked along their
        first axis, in the order the files were given to the constructor.

        Parameters
        ----------
        key
            Name of the dataset, both in the input files and in the target.
        target_hdf_file
            Writable file in which the virtual dataset is created.
        first_only
            If True, map only the dataset from the first input file.

        Raises
        ------
        ValueError
            If there are no input files, or if the dtype or the trailing
            dimensions of the dataset differ between input files.
        """
        files = self._files[:1] if first_only else self._files
        if not files:
            raise ValueError(f"Cannot create virtual dataset {key}: no input HDF files were provided")

        shape = None
        dtype = None
        sources = []
        slices = []
        offset = 0

        for f in files:
            source_dataset: h5py.Dataset = f[key]
            if shape is None:
                # The first file defines the dtype and the trailing dimensions.
                dtype = source_dataset.dtype
                shape = source_dataset.shape
            else:
                if dtype != source_dataset.dtype:
                    raise ValueError(f"The dtypes of array {key} are inconsistent between files")
                if source_dataset.shape[1:] != shape[1:]:
                    raise ValueError(f"The shapes of array {key} are inconsistent between files")
                # Grow the leading dimension to make room for this file's rows.
                shape = (source_dataset.shape[0] + shape[0],) + shape[1:]

            length = len(source_dataset)
            sources.append(h5py.VirtualSource(source_dataset))
            slices.append(slice(offset, offset + length))
            offset += length

        layout = h5py.VirtualLayout(shape=shape, dtype=dtype)
        for slice_, vsource in zip(slices, sources):
            layout[slice_] = vsource

        target_hdf_file.create_virtual_dataset(key, layout)

    def get_temporary_hdf_vfile(self) -> h5py.File:
        """Create the HDF file with virtual datasets in a temporary directory, such that it is deleted on closure

        Returns
        -------
        h5py.File
            The virtual-dataset file, opened in mode ``'r'``.
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            filepath = os.path.join(tmpdirname, "nofile.hdf5")
            # ideally one would simply use backing_store=False, to File but then there doesn't seem to be a way
            # to actually use the file (the VDS views just returns zeros).
            # Instead we write then re-read it, which presumably carries minimal overhead but is a bit ugly.
            self.make_hdf_vfile(filepath)
            hdf_vfile = h5py.File(name=filepath, mode='r')
            # on exiting the with block, the temporary directory and file are unlinked, but the file won't actually be
            # erased until it's closed.
            # NOTE(review): this relies on being able to unlink a file that is still open - confirm on Windows.
        return hdf_vfile