Source code for pynbody.test_utils.pyread_gadget_hdf5

"""Routine for reading Gadget3 HDF5 output files."""

import fnmatch
import os
import warnings

import h5py
import numpy

with warnings.catch_warnings():
    # pandas raises really annoying warning about its own dependencies
    # that seems to be impossible to disable
    warnings.simplefilter("ignore", DeprecationWarning)
    import pandas as pd


__author__ = 'Alan Duffy'
__email__ = 'mail@alanrduffy.com'
__version__ = '0.1.0'



[docs]
def pyread_gadget_hdf5(filename, ptype, var_name, sub_dir=None,\
    smooth=None,cgsunits=None,physunits=None,floatunits=None,\
    silent=None,nopanda=None,noloop=None,leaveh=None):

    """
    +
     NAME:
           PYREAD_GADGET_HDF5

     PURPOSE:
           This function reads in HDF5 outputs (snapshots, FOF and SubFind files) of the OWLS Gadget3 code

     CATEGORY:
           I/O, HDF5, OWLS / PAISTE :)

     REQUIREMENTS:
             import pandas as pd
             import h5py
             import numpy
             import fnmatch
             import os

     CALLING SEQUENCE:
             from pyread_gadget_hdf5 import *
           Result = pyread_gadget_hdf5(filename, ptype, var_name
           [, sub_dir=sub_dir, smooth=True, cgsunits=True, physunits=True,
                floatunits=True, silent=True, nopanda=True, noloop=True,
                leaveh=True] )

     INPUTS:
           filename: Name of the file to read in. In case of a multiple files
                     output, any sub-file name can be given. The routine will
                     take care of reading data from each sub-file.
           ptype:    Defines either the particle type or the sub-set for
                     which the data has to be read in (see table below).
           sub_dir:  Define the group (/FOF or /SUBFIND) from which
                     particle and/or subhaloes properties are read-in
                     from (see table below).

     ptype | snapshot                     | Old FOF
     ------+------------------------------+-------------
         0 | gas particles                | FOF/
         1 | dark matter particles        | FOF/NSF
         2 | extra collisionless parts    | FOF/SF
         3 | extra collisionless parts    | FOF/Stars
         4 | star particles               | -
         5 | extra collisionless parts    | -
         6 | -                            | -
         7 | virtual particles (TRAPHIC)  | -
         8 | -                            | -
         9 | -                            | -
        10 | -                            | -

     ptype | SubFind / sub_dir='FOF'      | SubFind / sub_dir='SUBFIND'
     ------+------------------------------+-----------------------------
         0 | gas particles                | gas particles
         1 | dark matter particles        | dark matter particles
         2 | extra collisionless parts    | extra collisionless parts
         3 | extra collisionless parts    | extra collisionless parts
         4 | star particles               | star particles
         5 | extra collisionless parts    | extra collisionless parts
         6 | -                            | -
         7 | -                            | -
         8 | -                            | -
         9 | -                            | -
        10 | FOF                          | SUBFIND
        11 | FOF/NSF                      | SUBFIND/NSF
        12 | FOF/SF                       | SUBFIND/SF
        13 | FOF/Stars                    | SUBFIND/Stars

            var_name: Dataset to read in.

     OPTIONAL INPUTS:


           sub_dir:  Kind of data group to read:
                     'FOF' - friends-of-friends group file
                     'SUBFIND' - subfind group file

     KEYWORD PARAMETERS (set to True)
           floatunits:   returns quantities in single precision
           cgsunits:       Converts from Gadget code units to CGS
           physunits:      Converts from Gadget code units to Proper (with h-> h_73)
                           and a small change with 10^10 Msol to Msol, and ENTROPY!
           silent:       does not print any message on the screen
           smooth:       reads SPH smoothed quantities
           noloop:     limit the read in to only the file selected
           nopanda:    convert from PANDAS dataframe to Numpy array (you still need Pandas to run!)
           leaveh:    set to true to leave 'litte h' values as code 'h' dependency, will have no effect
                        unless cgsunits or physunits selected in which case it overrides them regarding h

     OUTPUTS:
            An array (either 1D or 2D depending on variable) in PANDAS dataframe format
            or Numpy style, if nopanda=True set... but you shouldn't as PANDAS is fast.

     RESTRICTIONS:


     PROCEDURE:


     EXAMPLE:

           Reading star positions from a snapshot file in CGS units:

           pos = pyread_gadget_hdf5('snap_035.0.hdf5', 4, 'Coordinates', cgsunits=True)

           and in phys units (proper h_73^-1 Mpc):

           pos = pyread_gadget_hdf5('snap_035.0.hdf5', 4, 'Coordinates', physunits=True)

           and in code units:

           pos = pyread_gadget_hdf5('snap_035.0.hdf5', 4, 'Coordinates')

           Reading of FOF data from a SubFind output:

           pos = pyread_gadget_hdf5('subhalo_035.0.hdf5', 10, 'CenterOfMass', sub_dir='fof')

           Reading of FOF gas particle data from a SubFind output:

           FeIa= pyread_gadget_hdf5('subhalo_035.0.hdf5', 0, 'IronFromSNIa', sub_dir='fof')

           Reading of SPH Smoothed FOF gas particle data from a SubFind output:

           SmoothFeIa = pyread_gadget_hdf5('subhalo_035.0.hdf5', 0, 'IronFromSNIa', sub_dir='fof', smooth=True)

           Reading of SUBFIND non-star-forming gas data from a SubFind output:

           pos = pyread_gadget_hdf5('subhalo_035.0.hdf5', 11, 'Temperature', sub_dir='subfind')



     MODIFICATION HISTORY (by Alan Duffy):

            1/09/13 Converted Claudio Dalla Vecchia's superlative IDL script nread_gadget_hdf5.pro
            5/09/13 Removed ' input ' as an option in the script
            5/09/13 Removed ' noconversion ', default is to leave in Code units, i.e. don't convert
            5/09/13 Added ' cgsunits ' as the option to convert variables entirely to CGS units
            (the default switch in the IDL version)
            5/09/13 Added ' physunits ' as an option to convert variables to proper units with h-> h_73
            as well as masses to Msol instead of 10^10, and MaximumEntropy in particular changes
            6/09/13 Added ' noloop ' as an option to force a read in from one file alone
            6/09/13 Changed header information to match flake8 standard
            25/09/13 Added ' leaveh ' as an option to make units remain in code 'h' dependency
            08/01/14 Fixed bug on looping over subfiles
            08/01/14 Force C-Contiguous array

            Any issues please contact Alan Duffy on mail@alanrduffy.com or (preferred) twitter @astroduff
    """

#########################################################
##
## Check user choices...
##
#########################################################

    if cgsunits is not None and physunits is not None:
        print("[ERROR] Can't convert units to both CGS and PHYS, as CGS includes PHYS, assuming CGS ")
        physunits is None

    if silent is None:
        print("User gave "+filename)

    ## Determine the file type, i.e. SubFind, Snapshot or old FOF.
    if filename.rfind('/subhalo') >= 0:
        inputtype = 'SubFind' ## Define File type
        inputname = 'subhalo'
    elif filename.rfind('/group') >= 0:
        inputtype = 'FoF' ## Define File type
        inputname = 'group'
    elif filename.rfind('/snap') >= 0:
        inputtype = 'Snapshot' ## Define File type
        inputname = 'snap'
    if silent is None:
        print("This is a "+inputtype+" output filetype")

    ## Check that the file given and the ptype used makes sense
    if ptype < 0 or \
       inputtype == 'Snapshot' and ptype > 7 or \
       inputtype == 'FoF' and ptype > 3 or \
       inputtype == 'SubFind' and ptype > 13:
        print('[ERROR] ptype = '+str(ptype)+' is not allowed with this input file type ('+input+')')
        return -1

    if ptype < 6:
        if inputtype == 'SubFind' and sub_dir is None:
            print('[ERROR] ptype = '+str(ptype)+' is not allowed without specifying sub_dir flag')
            return -2

#########################################################
##
## Determine if, and how many, subfiles we must loop over
##
#########################################################
    folder_index = filename.rfind(inputname)
    snapnum_index = filename[:folder_index-1].rfind('_')
    snapnum = int(filename[snapnum_index+1:folder_index-1])
    numfile = len(fnmatch.filter(os.listdir(filename[:folder_index]), '*.hdf5'))
    if noloop is not None:
        numfile = 1 ## Force the code to only consider the input file

    if silent is None:
        print("Considering "+str(numfile)+" subfiles")
#########################################################
##
##  Read in common header parameters
##
#########################################################

    with h5py.File(filename, "r") as fin:
        hubble = fin['Header'].attrs['HubbleParam']
        aexp = fin['Header'].attrs['ExpansionFactor']
        if numfile == 0: ## Stupid bug with one file subfind outputs, the total array isn't created.
            numpart_total = fin['Header'].attrs['NumPart_Total']
        else: ## instead must read the case for the single file 'ThisFile'
            numpart_total = fin['Header'].attrs['NumPart_ThisFile']
        if ptype < 6:
            if numpart_total[ptype] == 0:
                print("There are no particles of type "+str(ptype)+" quitting")
                return -3
        masstable = fin['Header'].attrs['MassTable']
#########################################################
##
##  Read in unit conversion
##
#########################################################
        if cgsunits is not None or physunits is not None:
            gamma = fin['Constants'].attrs['GAMMA']
            solar_mass = fin['Constants'].attrs['SOLAR_MASS'] ## [g]
            boltzmann = fin['Constants'].attrs['BOLTZMANN'] ## [erg/K]
            protonmass = fin['Constants'].attrs['PROTONMASS'] ## [g]
            sec_per_year = fin['Constants'].attrs['SEC_PER_YEAR']
        if silent is None:
            print("Note h="+("%.3f" % hubble)+" in this simulation"+\
            " and snapshot is at a="+("%.3f" % aexp)+\
            " (z="+("%.3f" % (1./aexp-1.))+")")

#########################################################
##
##  Read in Chemical Parameters, i.e. Element Names, if present
##
#########################################################
        e = 'Parameters/ChemicalElements' in fin
        if e is True:
            nelements = fin['Parameters/ChemicalElements'].attrs['BG_NELEMENTS']
            elementnames = fin['Parameters/ChemicalElements'].attrs['ElementNames']
            if silent is None:
                print("Number of Elements Tracked "+str(nelements)+", they are")
                for elementtype in elementnames:
                    print(elementtype)

#########################################################
##
##  Assuming the user has given us the correct information
##  let's work out what array to access
##  If particle type is less than 6 (more than 10) we want
##  the actual particle values, otherwise 10 is the group variables
##  and
##
##  Also checks the case for smooth_element variables!
##
#########################################################

    ## For the case of a Baryon run, check if the user is asking for an element
    ## AND if that's a smoothed quantity then change their request slightly...
    if numpart_total[0] > 0 or numpart_total[4] > 0 or numpart_total[5] > 0:
        if var_name in list(elementnames):
            var_name = 'ElementAbundance/'+var_name
        if smooth is not None:
            var_name = 'Smoothed'+var_name
    if ptype < 6:
        global_var_name = '/'+'PartType'+str(ptype)+'/'+var_name+'/'
    elif ptype == 10:
        global_var_name = '/'+var_name+'/'
    elif ptype == 11:
        global_var_name = '/NSF/'+var_name+'/'
    elif ptype == 12:
        global_var_name = '/SF/'+var_name+'/'
    elif ptype == 13:
        global_var_name = '/Stars/'+var_name+'/'

    if sub_dir is not None:
        global_var_name = sub_dir.upper()+global_var_name

    if silent is None:
        print("Now select the requested variable "+global_var_name)

#########################################################
##
##  Now loop over the subfiles and concatenate arrays
##
#########################################################
    prefix_index = filename.rfind('.hdf5') ## First HDF5
    for ifile in range(0,numfile):
        if noloop is not None:
            newinfile = filename
        else:
            newinfile = filename[0:folder_index] + inputname +'_'+str(snapnum).zfill(3)+'.'+str(ifile)+filename[prefix_index:]
        if silent is None:
            print("Reading from "+newinfile)

        with h5py.File(newinfile, "r") as fin:
## If the dataset is present in the file then read it into tmpset, and if dataset exists concatenate them
            e = global_var_name in fin
            if e is True:
                tmpset = pd.DataFrame(fin[global_var_name][()]) ## Let HDF5 metadata dictate array format

                try:
                    dataset
                except NameError:
                    dataset = tmpset
                    ## Grab the conversion factors while we find this dataset for the first time
                    if cgsunits is not None or physunits is not None:
                        cgsconvfactor = fin[global_var_name].attrs['CGSConversionFactor']
                        aexpscaleexponent = fin[global_var_name].attrs['aexp-scale-exponent']
                        hscaleexponent = fin[global_var_name].attrs['h-scale-exponent']
                else:
                    dataset = pd.concat([dataset,tmpset],ignore_index=True)

    try:
        dataset
    except NameError:
        if var_name == 'Mass' and inputtype == 'Snapshot':
            if ptype == 1 or ptype == 2 or ptype == 3:
            ## Have to hardwire in the case for collisionless particles, which all have the same mass,
            ## as given by the masstable, and a number of particles given by Header numppart_total
            ## Assume that the standard Gadget units are used.
                if cgsunits is not None or physunits is not None:
                    cgsconvfactor=1.989e43
                    aexpscaleexponent=0.0
                    hscaleexponent=-1.0
                dataset = pd.DataFrame(numpy.ones(numpart_total[ptype])*masstable[ptype], dtype='f8')
            else:
                print("Are you reading in a strange particle type mass? ")
                return -4
        else:
            print("There is no dataset... something went wrong reading this! ")
            return -5


#########################################################
##
##  Horribe Bug in the Code which has the FOF CoM wrong scaling
##
#########################################################
        if cgsunits is not None or physunits is not None:
            if global_var_name == 'FOF/CenterOfMassVelocity/' and abs(aexpscaleexponent-0.5) < 0.01:
                print("OWLS SubFind had a bug which stated FoF CenterOfMassVelocity had aexp of 0.5, in reality it was -1.0 ")
                aexpscaleexponent = -1.0

#########################################################
##
##  Convert the array to cgs units, including the factors
##  for 'a' and 'h' dependencies, else output in code units
##
#########################################################
    if cgsunits is not None:
        if leaveh is not None:
            convfactor = aexp**aexpscaleexponent * cgsconvfactor
        else:
            convfactor = aexp**aexpscaleexponent * hubble**hscaleexponent * cgsconvfactor

    if physunits is not None:
        if leaveh is not None:
            convfactor = aexp**aexpscaleexponent
        else:
            convfactor = aexp**aexpscaleexponent * hubble**hscaleexponent

        if  var_name == 'Mass' or var_name == 'MassType' or var_name == 'InitialMass' or var_name == 'Halo_M_TopHat200' or \
            var_name == 'Halo_M_Crit200' or var_name == 'Halo_M_Crit500' or var_name == 'Halo_M_Crit2500' or \
            var_name == 'Halo_M_Mean200' or var_name == 'Halo_M_Mean500' or var_name == 'Halo_M_Mean2500':
            convfactor *= cgsconvfactor/solar_mass
        if  var_name == 'StarFormationRate':
            convfactor *= cgsconvfactor/(solar_mass/sec_per_year)
        if  var_name == 'StellarAge':
            convfactor *= cgsconvfactor/sec_per_year
        if  var_name == 'MaximumEntropy':
            convfactor *= cgsconvfactor*protonmass**gamma / boltzmann

    if cgsunits is not None or physunits is not None:
        dataset *= convfactor
        if silent is None:
            if cgsunits is not None:
                print("Converting units to CGS")
            if physunits is not None:
                print("Converting units to Proper Normalised Units ")
            print("Scale Factor dependency "+str(aexpscaleexponent))
            if leaveh is not None:
                print("User asked to leave little h unchanged! ")
            else:
                print("little h dependency "+str(hscaleexponent))

            print("Overall conversion factor "+str(convfactor))
    else:
        if silent is None:
            print("No conversion - still in Code units")

#########################################################
##
##  Convert the 1D array (nadded*3) to 2D matrix (nadded,3)
##  Except for particle type arrays which are (nadded,6)
##
#########################################################
    ncols = 1
    if  var_name == 'Coordinates' or var_name == 'Velocity' or var_name == 'CenterOfMass' \
        or var_name == 'CenterOfMassVelocity' or var_name == 'Position' or var_name == 'SubSpin' \
        or var_name == 'GasSpin' or var_name == 'StarSpin' or var_name == 'SFSpin' \
        or var_name == 'NSFSpin':
        if inputtype != 'Snapshot':
            ncols = 3
    if  var_name == 'LengthType' or var_name == 'MassType' or var_name == 'OffsetType' \
        or var_name == 'SubHalfMassProj' or var_name == 'SubHalfMass':
        if inputtype != 'Snapshot':
            ncols = 6

    if ncols > 1:
        nadded = len(dataset.index) // ncols
        dataset = pd.DataFrame(dataset.values.reshape((nadded,ncols)))

    if floatunits is not None:
        dataset = dataset.astype('f4')## Make the units float32

    if silent is None:
        print("Done ")

    ## User wants to change from Dataframe to numpy format
    if nopanda is not None:
        dataset = numpy.ascontiguousarray(dataset)

    return dataset