"""Support for the SubFind halo finder, in the HDF5 format used by Gadget3, 4 and Arepo."""
from __future__ import annotations
import os.path
import warnings
import h5py
import numpy as np
from .. import array, config_parser, snapshot, units
from ..snapshot import gadgethdf
from . import Halo, HaloCatalogue
from .details import number_mapping, particle_indices
from .subhalo_catalogue import SubhaloCatalogue
[docs]
class SubFindHDFHaloCatalogue(HaloCatalogue) :
"""Handles catalogues produced by the SubFind halo finder, in the HDF5 format used by Gadget3, 4 and Arepo.
Since the internal format differs quite substantially between these versions, child classes are provided for
Gadget 4, Arepo and TNG. The base class is able to handle the common elements of the format, and is also used
for Gadget 3 SubFind outputs.
See :class:`Gadget4SubfindHDFCatalogue`, :class:`ArepoSubfindHDFCatalogue` and :class:`TNGSubfindHDFCatalogue`.
.. warning::
At present, the Gadget 4, Arepo and TNG subclasses of this class are not tested against multi-file
outputs. If you encounter issues with these, please report them to the pynbody developers.
"""
# Names of various groups and attributes in the hdf file (which vary in different versions of SubFind)
_fof_name = 'FOF'
_header_name = 'FOF'
_subfind_name = 'SUBFIND'
_subfind_grnr_name = 'GrNr'
_subfind_first_gr_name = 'SUBFIND/FirstSubOfHalo'
_numgrps_name = 'Total_Number_of_groups'
_numsubs_name = 'Total_Number_of_subgroups'
_grp_offset_name = 'Offset'
_grp_len_name = 'Length'
_sub_offset_name = 'SUB_Offset'
_sub_len_name = 'SUB_Length'
[docs]
def __init__(self, sim, filename=None, subs=None, subhalos=False, _inherit_data_from=None) :
"""Initialise a SubFindHDF catalogue.
By default, the FoF groups are imported, and subhalos are available via the 'subhalos' attribute of each
halo object, e.g.
>>> snap = pynbody.load('path/to/snapshot.hdf5')
>>> halos = snap.halos()
>>> halos[1].subhalos[2] # returns the third subhalo of FoF group 1
However by setting ``subhalos=True``, the FoF groups are ignored and the catalogue is of all subhalos.
.. note::
Note that this constructor is common between :class:`SubFindHDFHaloCatalogue` and its subclasses
:class:`Gadget4SubfindHDFCatalogue`, :class:`ArepoSubfindHDFCatalogue` and
:class:`TNGSubfindHDFCatalogue`.
For Gadget 3 outputs, the SubFind data is stored internally to the snapshot itself. Therefore
passing a *filename* to the constructor for :class:`SubFindHDFHaloCatalogue` will result in an exception.
For Gadget 4, Arepo and TNG SubFind outputs, a filename can be passed that points to the
``fof_subhalo_tab_XXX.hdf5`` file (see below).
Parameters
----------
sim : ~pynbody.snapshot.simsnap.SimSnap
The simulation snapshot to which this catalogue applies.
filename : str, optional
The filename of the HDF5 file containing the SubFind catalogue. This is only used for Gadget 4, Arepo and
TNG subclasses and **must** be None when used with the Gadget 3 base class. See the note above.
subhalos : bool, optional
If False (default), catalogue represents the FoF groups and subhalos are available through the
:meth:`~pynbody.halo.Halo.subhalos` attribute of each group (see note above). If True, the catalogue
represents the subhalos directly and FoF groups are not available.
subs : bool, optional
Deprecated alias for ``subhalos``.
_inherit_data_from : SubfindCatalogue, optional
For internal use only; allows subhalo catalogue to share data with its parent FOF catalogue
"""
if subs is not None:
warnings.warn("The 'subs' argument to SubFindHDFHaloCatalogue is deprecated. Use 'subhalos' instead.",
DeprecationWarning)
subhalos = subs
self._sub_mode = subhalos
self._hdf_files = self._get_catalogue_multifile(sim, user_provided_filename=filename)
self.__count_halos_and_subhalos()
super().__init__(sim, number_mapping.SimpleHaloNumberMapper(0, self.__get_length()))
if _inherit_data_from:
self.__inherit_data(_inherit_data_from)
else:
self.__init_halo_offset_data()
self.__init_subhalo_relationships()
self.__init_subhalo_offset_data()
self.__init_halo_properties()
self.__reshape_multidimensional_properties()
self.__reassign_properties_from_sub_to_fof()
if not subhalos:
self._subhalo_catalogue = type(self)(sim, subhalos=True, filename=filename, _inherit_data_from=self)
def __inherit_data(self, parent):
attrs_to_share = ["_fof_properties", "_sub_properties", "_fof_group_offsets", "_fof_group_lengths",
"_subfind_halo_offsets", "_subfind_halo_lengths",
"fof_ignore", "sub_ignore","_subfind_halo_parent_groups", "_ngroups", "_nsubhalos"]
for attr in attrs_to_share:
setattr(self, attr, getattr(parent, attr))
def _get_catalogue_multifile(self, sim, user_provided_filename):
"""Some variants of Subfind put all the particle data in the catalogue files, in which case the catalogue
HDF files are already present in the base simulation. In other cases, notably Gadget4/Arepo, this must be
overridden to locate the actual HDF5 files for the catalogue"""
if user_provided_filename is not None:
raise ValueError("Filename must be None for loading a Gadget3-style SubFindHDFCatalogue")
if not isinstance(sim, gadgethdf.SubFindHDFSnap):
raise ValueError("SubFindHDFHaloCatalogue can only work with a SubFindHDFSnap simulation")
return sim._hdf_files
def __init_ignorable_keys(self):
self.fof_ignore = list(map(str.strip,config_parser.get("SubfindHDF","FoF-ignore").split(",")))
self.sub_ignore = list(map(str.strip,config_parser.get("SubfindHDF","Sub-ignore").split(",")))
for t in list(self.base._family_to_group_map.values()):
# Don't add SubFind particles ever as this list is actually spherical overdensity
self.sub_ignore.append(t[0])
self.fof_ignore.append(t[0])
def __init_halo_properties(self):
self.__init_ignorable_keys()
self._fof_properties = self.__get_property_dictionary_from_hdf(self._fof_name)
self._sub_properties = self.__get_property_dictionary_from_hdf(self._subfind_name)
def __get_property_dictionary_from_hdf(self, hdf_key):
props = self._get_properties_from_multifile(self._hdf_files, hdf_key)
for property_key in list(props.keys()):
arr_units = self._get_units(hdf_key, property_key)
if property_key in props:
props[property_key] = props[property_key].view(array.SimArray)
props[property_key].units = arr_units
props[property_key].sim = self.base
return props
def _get_properties_from_multifile(self, multifile, hdf_key):
hdf0 = multifile.get_file0_root()
props = {}
for h in multifile.iterroot():
for property_key in hdf0[hdf_key].keys():
if property_key in self.fof_ignore or property_key not in h[hdf_key]:
continue
if property_key in props:
props[property_key] = np.append(props[property_key], h[hdf_key][property_key][()])
else:
props[property_key] = np.asarray(h[hdf_key][property_key])
return props
def _get_units(self, hdf_key, property_key):
hdf0 = self._hdf_files.get_file0_root()
return self.base._get_units_from_hdf_attr(hdf0[hdf_key][property_key].attrs)
def __reshape_multidimensional_properties(self):
self.__reshape_multidimensional_properties_one_dictionary(self._sub_properties, self._nsubhalos)
self.__reshape_multidimensional_properties_one_dictionary(self._fof_properties, self._ngroups)
def __reshape_multidimensional_properties_one_dictionary(self, properties_dict, expected_array_length):
for key in list(properties_dict.keys()):
# Test if there are no remainders, i.e. array is multiple of halo length
# then solve for the case where this is 1, 2 or 3 dimension
if len(properties_dict[key]) % expected_array_length == 0:
ndim = len(properties_dict[key]) // expected_array_length
if ndim > 1:
properties_dict[key] = properties_dict[key].reshape(expected_array_length, ndim)
def __reassign_properties_from_sub_to_fof(self):
reassign = []
for k,v in self._sub_properties.items():
if v.shape[0]==self._ngroups:
reassign.append(k)
for reassign_i in reassign:
self._fof_properties[reassign_i] = self._sub_properties[reassign_i]
del self._sub_properties[reassign_i]
def __init_subhalo_relationships(self):
nsub = 0
nfof = 0
self._subfind_halo_parent_groups = np.empty(self._nsubhalos, dtype=int)
self._fof_group_first_subhalo = np.empty(self._ngroups, dtype=int)
for h in self._hdf_files.iterroot():
parent_groups = h[self._subfind_name].get(self._subfind_grnr_name, np.array([]))
# .astype(int)[:] stopgap until h5py support numpy 2.0
self._subfind_halo_parent_groups[nsub:nsub + len(parent_groups)] = parent_groups.astype(int)[:]
nsub += len(parent_groups)
first_groups = h.get(self._subfind_first_gr_name, np.array([]))
# .astype(int)[:] stopgap until h5py support numpy 2.0
self._fof_group_first_subhalo[nfof:nfof + len(first_groups)] = first_groups.astype(int)[:]
nfof += len(first_groups)
def __get_length(self):
if self._sub_mode:
return self._nsubhalos
else:
return self._ngroups
def __count_halos_and_subhalos(self):
hdf0 = self._hdf_files.get_file0_root()
self._ngroups = int(hdf0[self._header_name].attrs[self._numgrps_name])
self._nsubhalos = int(hdf0[self._header_name].attrs[self._numsubs_name])
def __init_halo_offset_data(self):
self._fof_group_offsets = {}
self._fof_group_lengths = {}
# Process FOF groups first
for fam in self.base._families_ordered():
ptypes = self.base._family_to_group_map[fam]
ptype_group_offset = 0
for ptype in ptypes:
self._fof_group_offsets[ptype] = np.empty(self._ngroups, dtype='int64')
self._fof_group_lengths[ptype] = np.empty(self._ngroups, dtype='int64')
curr_groups = 0
current_offset = 0
for h in self._hdf_files:
length = self._get_halodata_array_with_default(h, self._grp_len_name, self._fof_name, ptype, np.array([]))
if len(length) == 0:
# Nothing to process in this file
continue
offset = self._get_halodata_array_with_default(h, self._grp_offset_name, self._fof_name, ptype,
None)
if offset is None:
# Arepo doesn't store offsets, so we need to calculate them. Note these are relative to
# the specific PartType we are looking at, but across all files (ordered sequentially).
lengths_cumulative = np.cumsum(length)
offset = np.concatenate(([current_offset], current_offset+lengths_cumulative[:-1]))
current_offset += lengths_cumulative[-1]
self._fof_group_offsets[ptype][curr_groups:curr_groups + len(offset)] = offset.astype('int64')[:]
self._fof_group_lengths[ptype][curr_groups:curr_groups + len(offset)] = length.astype('int64')[:]
curr_groups += len(offset)
if curr_groups != self._ngroups:
warnings.warn(
f"Incorrect number of groups recovered from HDF files. Expected {self._ngroups}, found {curr_groups}")
self._ngroups = curr_groups
self._fof_group_offsets[ptype] = self._fof_group_offsets[ptype][:curr_groups]
self._fof_group_lengths[ptype] = self._fof_group_lengths[ptype][:curr_groups]
def __init_subhalo_offset_data(self):
self._subfind_halo_offsets = {}
self._subfind_halo_lengths = {}
first_subs_in_groups = self._fof_group_first_subhalo.copy()
first_subs_in_groups = first_subs_in_groups[(first_subs_in_groups > -1) & (
first_subs_in_groups < self._nsubhalos)] # Just need actual first subhalo numbers, don't need to worry about haloes with no subhaloes. Accounts for formats where *no* first subhalo is recorded as i) '-1' and ii) 'total number of subhaloes'
last_subs_in_groups = np.concatenate((first_subs_in_groups[1:], [self._nsubhalos])) - 1
for fam in self.base._families_ordered():
ptypes = self.base._family_to_group_map[fam]
for ptype in ptypes:
self._subfind_halo_offsets[ptype] = np.empty(self._nsubhalos, dtype='int64')
self._subfind_halo_lengths[ptype] = np.empty(self._nsubhalos, dtype='int64')
curr_subhalos = 0
# Only get lengths
for h in self._hdf_files:
length = self._get_halodata_array_with_default(h, self._sub_len_name, self._subfind_name, ptype, np.array([]))
self._subfind_halo_lengths[ptype][curr_subhalos:curr_subhalos + len(length)] = length.astype('int64')[:]
curr_subhalos += len(length)
# Add offsets in blocks for all subhalos of a single halo
for first_sub, last_sub in zip(first_subs_in_groups, last_subs_in_groups):
length = self._subfind_halo_lengths[ptype][first_sub:last_sub + 1]
offset = np.concatenate(([0], np.cumsum(length)[:-1]))
parent_fof_offset = self._fof_group_offsets[ptype][self._subfind_halo_parent_groups[first_sub]]
self._subfind_halo_offsets[ptype][first_sub:last_sub + 1] = offset + parent_fof_offset
if curr_subhalos != self._nsubhalos:
warnings.warn(
f"Incorrect number of subhalos recovered from HDF files. Expected {self._nsubhalos}, found {curr_subhalos}")
self._nsubhalos = curr_subhalos
self._subfind_halo_offsets[ptype] = self._subfind_halo_offsets[ptype][:curr_groups]
self._subfind_halo_lengths[ptype] = self._subfind_halo_lengths[ptype][:curr_groups]
def _get_halodata_array(self, hdf_file, array_name, halo_or_group, particle_type):
# In gadget3 implementation, halo_or_group is not needed. In Gadget4 implementation (below), it is.
return hdf_file[particle_type][array_name]
def _get_halodata_array_with_default(self, hdf_file, array_name, halo_or_group, particle_type, default):
try:
return self._get_halodata_array(hdf_file, array_name, halo_or_group, particle_type)
except KeyError:
return default
[docs]
def get_properties_one_halo(self, i):
def extract(arr, i):
if np.issubdtype(arr.dtype, np.integer):
return arr[i]
else:
return units.get_item_with_unit(arr, i)
properties = {}
if self._sub_mode:
for key in self._sub_properties:
properties[key] = extract(self._sub_properties[key], i)
else:
for key in self._fof_properties:
properties[key] = extract(self._fof_properties[key], i)
properties['children'], = np.where(self._subfind_halo_parent_groups == i)
return properties
[docs]
def get_properties_all_halos(self, with_units=True) -> dict:
if self._sub_mode:
result = {'parent': self._subfind_halo_parent_groups}
result.update(self._sub_properties)
else:
children = [[] for _ in range(self._ngroups)]
for i, parent in enumerate(self._subfind_halo_parent_groups):
children[parent].append(i)
result = {'children': children}
result.update(self._fof_properties)
if with_units:
return result
else:
result_nounits = {}
for k, v in result.items():
if hasattr(v, 'view'):
result_nounits[k] = v.view(np.ndarray)
else:
result_nounits[k] = v
return result_nounits
def _get_particle_indices_one_halo(self, number):
if self.base is None :
raise RuntimeError("Parent SimSnap has been deleted")
if number > len(self)-1 :
description = "Subhalo" if self._sub_mode else "Group"
raise ValueError(f"{description} {number} does not exist")
type_map = self.base._family_to_group_map
if self._sub_mode:
lengths = self._subfind_halo_lengths
offsets = self._subfind_halo_offsets
else:
lengths = self._fof_group_lengths
offsets = self._fof_group_offsets
# create the particle lists
tot_len = 0
for g_ptypes in list(type_map.values()) :
for g_ptype in g_ptypes:
tot_len += lengths[g_ptype][number]
plist = np.empty(tot_len,dtype='int64')
self._write_pynbody_index_list_into_array(plist, offsets, lengths, number, type_map)
return plist
def _write_pynbody_index_list_into_array(self, particle_indices, halo_or_group_offsets,
halo_or_group_lengths, halo_or_group_index, type_map):
npart = 0
for ptype in self.base._families_ordered():
# family slice in the SubFindHDFSnap
for g_ptype in type_map[ptype]:
sl = self.base._gadget_ptype_slice[g_ptype]
# add the particle indices to the particle list
offset = halo_or_group_offsets[g_ptype][halo_or_group_index]
length = halo_or_group_lengths[g_ptype][halo_or_group_index]
ind = np.arange(sl.start + offset, sl.start + offset + length)
particle_indices[npart:npart + length] = ind
npart += length
return npart
def _get_all_particle_indices(self):
if self.base is None:
raise RuntimeError("Parent SimSnap has been deleted")
type_map = self.base._family_to_group_map
if self._sub_mode:
lengths = self._subfind_halo_lengths
offsets = self._subfind_halo_offsets
else:
lengths = self._fof_group_lengths
offsets = self._fof_group_offsets
# create the particle lists
tot_len = 0
for g_ptypes in list(type_map.values()):
for g_ptype in g_ptypes:
tot_len += lengths[g_ptype].sum()
plist = np.empty(tot_len, dtype='int64')
boundaries = np.empty((len(self), 2), dtype='int64')
location = 0
for i in range(len(self)):
npart = self._write_pynbody_index_list_into_array(plist[location:], offsets, lengths, i, type_map)
boundaries[i] = (location, location := location+npart)
assert location == tot_len
return particle_indices.HaloParticleIndices(plist, boundaries)
def _group_and_halo_from_halo_index(self, i):
"""Return the group, halo pair of indices from the overall halo index i
Subfind stores a long list of halos, gathered together into groups. This maps from the index in the long
list to the particular group and subhalo indices."""
if i>=self._nsubhalos:
raise ValueError("Subhalo out of range")
group = np.argmax(self._fof_group_first_subhalo>i)-1
halo = i - self._fof_group_first_subhalo[group]
return group, halo
def _get_subhalo_catalogue(self, parent_halo_number):
if not self._sub_mode:
props = self.get_properties_one_halo(parent_halo_number)
return SubhaloCatalogue(self._subhalo_catalogue, props['children'])
else:
return SubhaloCatalogue(self._subhalo_catalogue, [])
@classmethod
def _can_load(cls, sim, **kwargs):
if isinstance(sim, gadgethdf.SubFindHDFSnap):
return True
else:
return False
[docs]
class Gadget4SubfindHDFCatalogue(SubFindHDFHaloCatalogue):
"""Handles catalogues produced by the SubFind halo finder, in the HDF5 format used by Gadget 4
.. warning::
At present, this is not tested against multi-file outputs. If you encounter issues with these, please report
them to the pynbody developers.
"""
_fof_name = 'Group'
_subfind_name = 'Subhalo'
_header_name = 'Header'
_subfind_grnr_name = 'SubhaloGroupNr'
_subfind_first_gr_name = 'Group/GroupFirstSub'
_numgrps_name = 'Ngroups_Total'
_numsubs_name = 'Nsubhalos_Total'
_grp_offset_name = 'GroupOffsetType'
_grp_len_name = 'GroupLenType'
_sub_offset_name = 'SubhaloOffsetType'
_sub_len_name = 'SubhaloLenType'
[docs]
def __init__(self, sim, filename=None, **kwargs):
super().__init__(sim, filename, **kwargs)
i = 0
for prog_or_desc in "prog", "desc":
try:
files = self._get_progenitor_or_descendant_multifile(sim, prog_or_desc, filename)
except FileNotFoundError:
continue
props = self._get_properties_from_multifile(files, 'Subhalo')
self._sub_properties.update(props)
def _get_halodata_array(self, hdf_file, array_name, halo_or_group, particle_type):
return hdf_file[halo_or_group][array_name][:,int(particle_type[-1])]
def _get_units(self, hdf_key, property_key):
# Gadget4 doesn't seem to store unit information, so have a good guess
if property_key == 'SubhaloVmax':
dimensions = 'm s^-1'
elif 'Mass' in property_key or '_M_' in property_key:
dimensions = 'kg'
elif 'Pos' in property_key or '_R_' in property_key or 'CM' in property_key or 'VmaxRad' in property_key:
dimensions = 'm'
elif 'Vel' in property_key:
dimensions = 'm s^-1'
else:
return None
return self.base.infer_original_units(dimensions)
@classmethod
def _get_catalogue_multifile(cls, sim, user_provided_filename):
class Gadget4SubfindHdfMultiFileManager(gadgethdf._SubfindHdfMultiFileManager):
_nfiles_groupname = cls._fof_name
_nfiles_attrname = "NTask"
_subgroup_name = None
return Gadget4SubfindHdfMultiFileManager(cls._catalogue_filename(sim, user_provided_filename))
@classmethod
def _get_progenitor_or_descendant_multifile(cls, sim, prog_or_desc, user_provided_filename):
class Gadget4SubfindHdfProgenitorsMultiFileManager(gadgethdf._SubfindHdfMultiFileManager):
_nfiles_groupname = "Header"
_nfiles_attrname = "NumFiles"
_subgroup_name = None
return Gadget4SubfindHdfProgenitorsMultiFileManager(cls._catalogue_filename(sim, user_provided_filename,
"subhalo_"+prog_or_desc+"_"))
@classmethod
def _catalogue_filename(cls, sim, user_provided_filename=None, namestem ="fof_subhalo_tab_"):
if user_provided_filename is not None:
user_provided_filename = str(user_provided_filename)
if "fof_subhalo_tab_" not in user_provided_filename:
raise ValueError("Filename must contain 'fof_subhalo_tab_'")
return user_provided_filename.replace("fof_subhalo_tab_", namestem)
snapnum = os.path.basename(sim.filename).split("_")[-1]
parent_dir = os.path.dirname(os.path.abspath(sim.filename))
return os.path.join(parent_dir, namestem + snapnum)
@classmethod
def _can_load(cls, sim, filename=None, **kwargs):
try:
file = cls._catalogue_filename(sim, user_provided_filename=filename)
except ValueError:
return False
if not h5py.is_hdf5(file):
file = file + ".0.hdf5"
if not h5py.is_hdf5(file):
return False
with h5py.File(file, 'r') as f:
if cls._header_name not in f:
return False
if cls._numsubs_name not in f[cls._header_name].attrs:
return False
return True
[docs]
class ArepoSubfindHDFCatalogue(Gadget4SubfindHDFCatalogue):
"""Handles catalogues produced by the SubFind halo finder, in the HDF5 format used by Arepo
.. warning::
At present, this is not tested against multi-file outputs. If you encounter issues with these, please report
them to the pynbody developers.
"""
_numsubs_name = 'Nsubgroups_Total'
_subfind_grnr_name = 'SubhaloGrNr'
[docs]
class TNGSubfindHDFCatalogue(ArepoSubfindHDFCatalogue):
"""Handles catalogues produced by the SubFind halo finder, in the HDF5 format used by Arepo (TNG variant)
.. warning::
At present, this is not tested against multi-file outputs. If you encounter issues with these, please report
them to the pynbody developers.
"""
@classmethod
def _catalogue_filename(cls, sim, user_provided_filename, namestem ="fof_subhalo_tab_"):
if user_provided_filename is not None:
return super()._catalogue_filename(sim, user_provided_filename, namestem)
snapnum = os.path.basename(sim.filename).split("_")[-1]
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(sim.filename)))
f = os.path.join(parent_dir, "groups_"+snapnum, namestem + snapnum)
return f
@classmethod
def _get_catalogue_multifile(cls, sim, user_provided_filename):
class TNGSubfindHdfMultiFileManager(gadgethdf._SubfindHdfMultiFileManager):
_nfiles_groupname = "Header"
_nfiles_attrname = "NumFiles"
_subgroup_name = None
return TNGSubfindHdfMultiFileManager(cls._catalogue_filename(sim, user_provided_filename))