Source code for pynbody.halo.subfind

"""Support for the SubFind halo finder"""

from __future__ import annotations

import os.path
import warnings

import numpy as np

from .. import units
from ..array import SimArray
from . import HaloCatalogue
from .details import number_mapping, particle_indices
from .subhalo_catalogue import SubhaloCatalogue



[docs]
class SubfindCatalogue(HaloCatalogue):
    """Handles catalogues produced by the SubFind halo finder (old versions that do not use HDF5 outputs)."""


[docs]
    def __init__(self, sim, filename=None, subs=None, subhalos=False,  ordered=None, _inherit_data_from=None):
        """Initialise a SubFind catalogue

        By default, the FoF groups are imported, and subhalos are available via the 'subhalos' attribute of each
        halo object, e.g.

        >>> snap = pynbody.load('path/to/snapshot')
        >>> halos = snap.halos()
        >>> halos[1].subhalos[2] # returns the third subhalo of FoF group 1

        However by setting ``subs=True``, the FoF groups are ignored and the catalogue is of all subhalos.

        Parameters
        ----------

        sim : ~pynbody.snapshot.simsnap.SimSnap
            The simulation snapshot to which this catalogue applies.

        filename : str, optional
            The path to the SubFind output(s). This is expected to be a folder named ``groups_XXX`` where XXX is the
            snapshot number. The code extracts the snapshot number from the folder name and uses it to construct
            the filename of the catalogue files, for example ``groups_XXX/subhalo_tab_XXX.0``. If no filename is
            provided, the code will attempt to find a suitable catalogue starting from the simulation's directory.

        subhalos : bool, optional
            If False (default), catalogue represents the FoF groups and subhalos are available through the
            :meth:`~pynbody.halo.Halo.subhalos` attribute of each group (see note above). If True, the catalogue
            represents the subhalos directly and FoF groups are not available.

        ordered : bool, optional
            If True, the snapshot must have sequential iords. If False, the iords might be out-of-sequence, and
            therefore reordering will take place. If None (default), the iords are examined to check whether
            re-ordering is required or not. Note that re-ordering can be undesirable because it also destroys
            subfind's order-by-binding-energy

        _inherit_data_from : SubfindCatalogue, optional
            For internal use only; allows subhalo catalogue to share data with its parent FOF catalogue


        """

        if subs is not None:
            warnings.warn("The 'subs' argument to SubfindCatalogue is deprecated; use 'subhalos' instead",
                          DeprecationWarning)
            subhalos = subs

        self._subs=subhalos

        if ordered is not None:
            self._ordered = ordered
        else:
            self._ordered = bool((sim['iord']==np.arange(len(sim))).all())

        self._halos = {}

        self.dtype_int = sim['iord'].dtype
        self.dtype_flt = 'float32' #SUBFIND data is apparently always single precision???
        self._subfind_dir = filename or self._name_of_catalogue(sim)

        self.header = self._read_header()
        self._tasks = self.header[4]

        if _inherit_data_from:
            self._inherit_data(_inherit_data_from)
        else:
            self._read_data(sim)



        if subhalos:
            if self.header[6]==0:
                raise ValueError("This file does not contain subhalos")
            length = len(self._subhalodat['sub_off'])
        else:
            length = len(self._halodat['group_off'])

        super().__init__(sim, number_mapping.SimpleHaloNumberMapper(0, length))

        if not subhalos:
            self._subhalo_catalogue = SubfindCatalogue(sim, subhalos=True, filename=filename,
                                                       ordered=self._ordered, _inherit_data_from=self)


    def _inherit_data(self, from_):
        inherit = ['_halodat', '_subhalodat', '_keys_halo', '_keys_subhalo']
        for k in inherit:
            setattr(self, k, getattr(from_, k))

    def _get_all_particle_indices(self):
        ids = self._read_ids()
        boundaries = np.empty((len(self), 2), dtype=self.dtype_int)
        if self._subs:
            boundaries[:, 0] = self._subhalodat['sub_off']
            boundaries[:, 1] = self._subhalodat['sub_off'] + self._subhalodat['sub_len']
        else:
            boundaries[:, 0] = self._halodat['group_off']
            boundaries[:, 1] = self._halodat['group_off'] + self._halodat['group_len']

        if not self._ordered:
            self._init_iord_to_fpos()
            for a, b in boundaries:
                ids[a:b] = self._iord_to_fpos.map_ignoring_order(ids[a:b])
                # must be done segmented in case iord_to_fpos doesn't preserve input order

        return particle_indices.HaloParticleIndices(ids, boundaries)


[docs]
    def get_properties_one_halo(self, i):

        extract = units.get_item_with_unit

        properties = {}

        if self._subs is False:
            for key in self._keys_halo:
                properties[key] = extract(self._halodat[key], i)
            properties['children'] = self._get_children_of_group(i)
        else:
            for key in self._keys_subhalo:
                properties[key] = extract(self._subhalodat[key], i)
            properties['children'] = self._get_children_of_sub(i)
        return properties


    def _get_children_of_group(self, group_nr):
        if self.header[6] > 0:
            return np.where(self._subhalodat['sub_groupNr'] == group_nr)[0]
        else:
            return []

    def _get_children_of_sub(self, subhalo_nr):
        # this goes down one level in the hierarchy, i.e. a subhalo will have all its sub-subhalos listed,
        # but not its sub-sub-subhalos (those will be listed in each sub-subhalo)
        halo_number_within_group = (self._subhalodat['sub_groupNr'][:subhalo_nr]
                                    == self._subhalodat['sub_groupNr'][subhalo_nr]).sum()
        return np.where((self._subhalodat['sub_parent'] == halo_number_within_group) &
                        (self._subhalodat['sub_groupNr'] == self._subhalodat['sub_groupNr'][subhalo_nr]))[0]



[docs]
    def get_properties_all_halos(self, with_units=True) -> dict:
        properties = {}
        if self._subs:
            data = self._subhalodat
            properties['children'] = [self._get_children_of_sub(subhalo_nr)
                                      for subhalo_nr in self.number_mapper.all_numbers]
        else:
            data = self._halodat
            if self.header[6] > 0:
                properties['children'] = [self._get_children_of_group(group_nr)
                                          for group_nr in self.number_mapper.all_numbers]
            else:
                properties['children'] = []

        keys = self._keys_halo if not self._subs else self._keys_subhalo

        for key in keys:
            if with_units:
                properties[key] = data[key]
            else:
                properties[key] = data[key].view(np.ndarray)


        return properties


    def _read_header(self):
        iout = self._subfind_dir.split("_")[-1]
        filename = os.path.join(
            self._subfind_dir,
            f"subhalo_tab_{iout}.0"
        )
        with open(filename, "rb") as fd:
            # read header: this is strange but it works: there is an extra value in
            # header which we delete in the next step
            header = np.fromfile(fd, dtype='int32', sep="", count=8)

        header = np.delete(header, 4)

        return header

    def _get_subhalo_catalogue(self, parent_halo_number):
        if self._subs:
            return SubhaloCatalogue(self, self._get_children_of_sub(parent_halo_number))
        else:
            return SubhaloCatalogue(self._subhalo_catalogue,
                                    self._get_children_of_group(parent_halo_number))
    def _read_ids(self):
        data_ids = np.array([], dtype=self.dtype_int)
        iout = self._subfind_dir.split("_")[-1]
        for n in range(0, self._tasks):
            filename = os.path.join(
                self._subfind_dir,
                f"subhalo_ids_{iout}.{n}"
            )
            fd = open(filename, "rb")
            # for some reason there is an extra value in header which we delete
            # in the next step
            header1 = np.fromfile(fd, dtype='int32', sep="", count=7)
            header = np.delete(header1, 4)
            # TODO: include a check if both headers agree (they better)
            ids = np.fromfile(fd, dtype=self.dtype_int, sep="", count=-1)
            fd.close()
            data_ids = np.append(data_ids, ids)
        return data_ids

    def _read_data(self, sim):
        halodat={}
        keys_flt=['mass', 'pos', 'mmean_200', 'rmean_200', 'mcrit_200', 'rcrit_200', 'mtop_200', 'rtop_200', 'contmass']
        keys_int=['group_len', 'group_off',  'first_sub', 'Nsubs', 'cont_count', 'mostboundID']
        for key in keys_flt:
            halodat[key]=np.array([], dtype=self.dtype_flt)
        for key in keys_int:
            halodat[key]=np.array([], dtype='int32')

        subhalodat={}
        subkeys_int=['sub_len', 'sub_off', 'sub_parent', 'sub_mostboundID', 'sub_groupNr']
        subkeys_flt=['sub_pos', 'sub_vel', 'sub_CM', 'sub_mass', 'sub_spin', 'sub_veldisp', 'sub_VMax', 'sub_VMaxRad', 'sub_HalfMassRad', ]
        for key in subkeys_int:
            subhalodat[key]=np.array([], dtype='int32')
        subhalodat['sub_mostboundID']=np.array([], dtype=self.dtype_int)
        #subhalodat['sub_groupNr']=np.array([], dtype=self.dtype_int) #these are special
        for key in subkeys_flt:
            subhalodat[key]=np.array([], dtype=self.dtype_flt)

        self._keys_subhalo = subkeys_flt+subkeys_int
        self._keys_halo = keys_flt + keys_int

        for n in range(0,self._tasks):
            iout = self._subfind_dir.split("_")[-1]
            filename = os.path.join(
                self._subfind_dir,
                f"subhalo_tab_{iout}.{n}"
            )
            fd=open(filename, "rb")
            header1=np.fromfile(fd, dtype='int32', sep="", count=8)
            header=np.delete(header1,4)
            #read groups
            if header[0]>0:
                halodat['group_len']=np.append(halodat['group_len'], np.fromfile(fd, dtype='int32', sep="", count=header[0]))
                halodat['group_off']=np.append(halodat['group_off'], np.fromfile(fd, dtype='int32', sep="", count=header[0]))
                halodat['mass']=np.append(halodat['mass'], np.fromfile(fd, dtype=self.dtype_flt, sep="", count=header[0]))
                halodat['pos']=np.append(halodat['pos'], np.fromfile(fd, dtype=self.dtype_flt, sep="", count=3*header[0]) )
                halodat['mmean_200']=np.append(halodat['mmean_200'], np.fromfile(fd, dtype=self.dtype_flt, sep="", count=header[0]))
                halodat['rmean_200']=np.append(halodat['rmean_200'], np.fromfile(fd, dtype=self.dtype_flt, sep="", count=header[0]))
                halodat['mcrit_200']=np.append(halodat['mcrit_200'], np.fromfile(fd, dtype=self.dtype_flt, sep="", count=header[0]))
                halodat['rcrit_200']=np.append(halodat['rcrit_200'], np.fromfile(fd, dtype=self.dtype_flt, sep="", count=header[0]))
                halodat['mtop_200']=np.append(halodat['mtop_200'], np.fromfile(fd, dtype=self.dtype_flt, sep="", count=header[0]))
                halodat['rtop_200']=np.append(halodat['rtop_200'], np.fromfile(fd, dtype=self.dtype_flt, sep="", count=header[0]))
                halodat['cont_count']=np.append(halodat['cont_count'], np.fromfile(fd, dtype='int32', sep="", count=header[0]))
                halodat['contmass']=np.append(halodat['contmass'],np.fromfile(fd, dtype=self.dtype_flt, sep="", count=header[0]))
                halodat['Nsubs']=np.append(halodat['Nsubs'],np.fromfile(fd, dtype='int32', sep="", count=header[0]))
                halodat['first_sub']=np.append(halodat['first_sub'],np.fromfile(fd, dtype='int32', sep="", count=header[0]))
            #read subhalos only if expected to exist from header
            if header[5]>0:
                subhalodat['sub_len']=np.append(subhalodat['sub_len'], np.fromfile(fd, dtype='int32', sep="", count=header[5]))
                subhalodat['sub_off']=np.append(subhalodat['sub_off'], np.fromfile(fd, dtype='int32', sep="", count=header[5]))
                subhalodat['sub_parent']=np.append(subhalodat['sub_parent'], np.fromfile(fd, dtype='int32', sep="", count=header[5]))
                subhalodat['sub_mass']=np.append(subhalodat['sub_mass'], np.fromfile(fd, dtype=self.dtype_flt, sep="", count=header[5]))
                subhalodat['sub_pos']=np.append(subhalodat['sub_pos'],np.fromfile(fd, dtype=self.dtype_flt, sep="", count=3*header[5]))
                subhalodat['sub_vel']=np.append(subhalodat['sub_vel'], np.fromfile(fd, dtype=self.dtype_flt, sep="", count=3*header[5]))
                subhalodat['sub_CM']=np.append(subhalodat['sub_CM'], np.fromfile(fd, dtype=self.dtype_flt, sep="", count=3*header[5]))
                subhalodat['sub_spin']=np.append(subhalodat['sub_spin'], np.fromfile(fd, dtype=self.dtype_flt, sep="", count=3*header[5]))
                subhalodat['sub_veldisp']=np.append(subhalodat['sub_veldisp'], np.fromfile(fd, dtype=self.dtype_flt, sep="", count=header[5]))
                subhalodat['sub_VMax']=np.append(subhalodat['sub_VMax'], np.fromfile(fd, dtype=self.dtype_flt, sep="", count=header[5]))
                subhalodat['sub_VMaxRad']=np.append(subhalodat['sub_VMaxRad'],np.fromfile(fd, dtype=self.dtype_flt, sep="", count=header[5]))
                subhalodat['sub_HalfMassRad']=np.append(subhalodat['sub_HalfMassRad'], np.fromfile(fd, dtype=self.dtype_flt, sep="", count=header[5]))
                subhalodat['sub_mostboundID']=np.append(subhalodat['sub_mostboundID'], np.fromfile(fd, dtype=self.dtype_int, sep="", count=header[5]))
                subhalodat['sub_groupNr']=np.append(subhalodat['sub_groupNr'], np.fromfile(fd, dtype='int32', sep="", count=header[5]))
            fd.close()

        halodat['pos']=np.reshape(halodat['pos'], (header[1],3))
        if header[6]>0:
            #some voodoo because some SubFind files may have (at least?) one extra entry which is not really a subhalo
            real_ones=np.where(halodat['first_sub']<header[6])[0]
            fake_ones=np.where(halodat['first_sub']>=header[6])[0]
            halodat['mostboundID']=np.zeros(len(halodat['Nsubs']),dtype=self.dtype_int)-1
            halodat['mostboundID'][real_ones]=subhalodat['sub_mostboundID'][halodat['first_sub'][real_ones]]  #useful for the case of unordered snapshot IDs

            subhalodat['sub_pos']=np.reshape(subhalodat['sub_pos'], (header[6],3))
            subhalodat['sub_vel']=np.reshape(subhalodat['sub_vel'], (header[6],3))
            subhalodat['sub_CM']=np.reshape(subhalodat['sub_CM'], (header[6],3))
            subhalodat['sub_spin']=np.reshape(subhalodat['sub_spin'], (header[6],3))

        ar_names = 'mass', 'pos', 'mmean_200', 'rmean_200', 'mcrit_200', 'rcrit_200', 'mtop_200', 'rtop_200', \
                   'sub_mass', 'sub_pos', 'sub_vel', 'sub_CM', 'sub_veldisp', 'sub_VMax', 'sub_VMaxRad', 'sub_HalfMassRad'
        ar_dimensions = 'kg', 'm', 'kg', 'm', 'kg', 'm', 'kg', 'm', \
                    'kg', 'm', 'm s^-1', 'm', 'm s^-1', 'm s^-1', 'm', 'm'

        for name, dimension in zip(ar_names, ar_dimensions):
            if name in halodat:
                halodat[name] = SimArray(halodat[name], sim.infer_original_units(dimension))
                halodat[name].sim = sim
            if name in subhalodat:
                subhalodat[name] = SimArray(subhalodat[name], sim.infer_original_units(dimension))
                subhalodat[name].sim = sim

        self._halodat, self._subhalodat = halodat, subhalodat

    @staticmethod
    def _name_of_catalogue(sim):
        # standard path for multiple snapshot files
        snapnum = os.path.basename(
            os.path.dirname(sim.filename)).split("_")[-1]
        parent_dir = os.path.dirname(os.path.dirname(sim.filename))
        dir_path=os.path.join(parent_dir,"groups_" + snapnum)

        if os.path.exists(dir_path):
            return dir_path
        # alternative path if snapshot is single file
        else:
            snapnum = os.path.basename(sim.filename).split("_")[-1]
            parent_dir = os.path.dirname(sim.filename)
            return os.path.join(parent_dir,"groups_" + snapnum)

    @staticmethod
    def _can_load(sim, filename=None, **kwargs):
        if filename is not None:
            if str(filename)[:-3].endswith("groups_"):
                return True
        file = SubfindCatalogue._name_of_catalogue(sim)
        if os.path.exists(file):
            if os.path.exists(file):
                if file.endswith(".hdf5"):
                    return False
                elif os.path.isdir(file) and os.listdir(file)[0].endswith(".hdf5"):
                    return False
                else:
                    return True
        else:
            return False