Source code for fitting.likelihood

import astropy.constants
import astropy.units as apu
import numpy as np
import pandas as pd
import scipy.integrate
import sympy
import os
import warnings

from esr.fitting.sympy_symbols import (
    square, cube, sqrt, log, pow, x, a0, a1, a2, inv
)

from esr.generation.simplifier import time_limit
import esr.generation.simplifier



[docs]
class Likelihood:
    """Likelihood class used to fit a function directly

    Args:
        :data_file (str): Name of the file containing the data to use
        :cov_file (str): Name of the file containing the errors/covariance on the data
        :run_name (str): The name to be associated with this likelihood, e.g. 'my_esr_run'
        :data_dir (str, default=None): The path containing the data and cov files
        :fn_set (str, default='core_maths'): The name of the function set to use with the likelihood. Must match one of those defined in ``generation.duplicate_checker``

    """

    def __init__(self, data_file, cov_file, run_name, data_dir=None, fn_set='core_maths'):

        esr_dir = os.path.abspath(os.path.join(os.path.dirname(
            esr.generation.simplifier.__file__), '..', '')) + '/'
        if data_dir is None:
            self.data_dir = esr_dir + '/data/'
        else:
            self.data_dir = data_dir
        self.data_file = self.data_dir + '/' + data_file
        self.cov_file = self.data_dir + '/' + cov_file
        self.fn_dir = esr_dir + "function_library/" + fn_set + "/"
        if data_dir is None:
            self.like_dir = esr_dir + "/fitting/"
        else:
            self.like_dir = data_dir + "/fitting/"
        if not os.path.isdir(self.like_dir):
            os.makedirs(self.like_dir, exist_ok=True)
        self.fnprior_prefix = "aifeyn_"
        self.combineDL_prefix = "combine_DL_"
        self.final_prefix = "final_"

        self.base_out_dir = self.like_dir + "/output/"
        self.temp_dir = self.base_out_dir + "/partial_" + run_name
        self.out_dir = self.base_out_dir + "/output_" + run_name
        self.fig_dir = self.base_out_dir + "/figs_" + run_name

        #  Warning to not use MSE for DL
        self.is_mse = False


[docs]
    def get_pred(self, x, a, eq_numpy, **kwargs):
        """Return the predicted y(x)

        Args:
            :x (float or np.array): x value being used
            :a (list): parameters to subsitute into equation considered
            :eq_numpy (numpy function): function to use which gives H^2

        Returns:
            :y (float or np.array): the predicted y value at x supplied

        """
        try:
            return eq_numpy(x, *a)
        except Exception:
            return np.inf



[docs]
    def clear_data(self):
        """Clear data used for numerical integration (not required in most cases)"""
        pass



[docs]
    def run_sympify(self, fcn_i, **kwargs):
        """Sympify a function

        Args:
            :fcn_i (str): string representing function we wish to fit to data

        Returns:
            :fcn_i (str): string representing function we wish to fit to data (with superfluous characters removed)
            :eq (sympy object): sympy object representing function we wish to fit to data
            :integrated (bool, always False): whether we analytically integrated the function (True) or not (False)

        """

        fcn_i = fcn_i.replace('\n', '')
        fcn_i = fcn_i.replace('\'', '')

        eq = sympy.sympify(fcn_i,
                           locals={"inv": inv,
                                   "square": square,
                                   "cube": cube,
                                   "sqrt": sqrt,
                                   "log": log,
                                   "pow": pow,
                                   "x": x,
                                   "a0": a0,
                                   "a1": a1,
                                   "a2": a2})
        return fcn_i, eq, False





[docs]
class CCLikelihood(Likelihood):
    """Likelihood class used to fit cosmic chronometer data.
    Should be used as a template for other likelihoods as all functions in this class are required in fitting functions.

    """

    def __init__(self):

        super().__init__('CC_Hubble.dat', 'CC_Hubble.dat', 'cc_dimful')

        self.Hfid = 1.
        self.ylabel = r'$H \left( z \right) \ / \ H_{\rm fid}$'  # for plotting
        self.xvar, self.yvar, self.yerr = np.genfromtxt(
            self.data_file, unpack=True)
        self.xvar += 1
        self.yvar /= self.Hfid
        self.yerr /= self.Hfid
        self.inv_cov = 1 / self.yerr ** 2


[docs]
    def get_pred(self, zp1, a, eq_numpy, **kwargs):
        """Return the predicted H(z), which is the square root of the functions we are using.

        Args:
            :zp1 (float or np.array): 1 + z for redshift z
            :a (list): parameters to subsitute into equation considered
            :eq_numpy (numpy function): function to use which gives H^2

        Returns:
            :H (float or np.array): the predicted Hubble parameter at redshifts supplied

        """
        return np.sqrt(eq_numpy(zp1, *a))



[docs]
    def negloglike(self, a, eq_numpy, **kwargs):
        """Negative log-likelihood for a given function

        Args:
            :a (list): parameters to subsitute into equation considered
            :eq_numpy (numpy function): function to use which gives H^2

        Returns:
            :nll (float): - log(likelihood) for this function and parameters

        """

        H = self.get_pred(self.xvar, np.atleast_1d(a), eq_numpy)
        if not np.all(np.isreal(H)):
            return np.inf
        # inv_cov diagonal, so is vector here
        nll = np.sum(0.5 * (H - self.yvar) ** 2 * self.inv_cov)
        if np.isnan(nll):
            return np.inf
        return nll





[docs]
class PanthLikelihood(Likelihood):
    """Likelihood class used to fit Pantheon data"""

    def __init__(self):

        super().__init__(
            '/DataRelease/Pantheon+_Data/4_DISTANCES_AND_COVAR/Pantheon+SH0ES.dat',
            '/DataRelease/Pantheon+_Data/4_DISTANCES_AND_COVAR/Pantheon+SH0ES_STAT+SYS.cov',
            'panth_dimful',
        )

        self.Hfid = 1.0 * apu.km / apu.s / apu.Mpc
        # data = pd.read_csv(self.data_file, delim_whitespace=True)
        # origlen = len(data)
        # ww = (data['zHD'] > 0.01)
        # zCMB = data['zHD'][ww]
        # mu_obs = data['MU_SH0ES'][ww]
        # mu_err = data['MU_SH0ES_ERR_DIAG'][ww]  #  for plotting

        # with open(self.cov_file, 'r') as f:
        #     _ = f.readline()
        #     n = int(len(zCMB))
        #     C = np.zeros((n, n))
        #     ii = -1
        #     jj = -1
        #     for i in range(origlen):
        #         jj = -1
        #         if ww[i]:
        #             ii += 1
        #         for j in range(origlen):
        #             if ww[j]:
        #                 jj += 1
        #             val = float(f.readline())
        #             if ww[i]:
        #                 if ww[j]:
        #                     C[ii, jj] = val

        self.ylabel = r'$\mu \left( z \right)$'
        # self.xvar = zCMB.to_numpy() + 1
        # self.yvar = mu_obs.to_numpy()
        # self.inv_cov = np.linalg.inv(C)
        # self.yerr = mu_err.to_numpy()
        # np.savez(self.data_dir + '/pantheon_data.npz', 
        #          xvar=self.xvar, yvar=self.yvar, yerr=self.yerr,
        #          inv_cov=self.inv_cov)

        data = np.load(self.data_dir + '/pantheon_data.npz')
        self.xvar = data['xvar']
        self.yvar = data['yvar']
        self.yerr = data['yerr']
        self.inv_cov = data['inv_cov']

        self.mu_const = astropy.constants.c / self.Hfid / (10 * apu.pc)
        self.mu_const = 5 * np.log10(self.mu_const.to(''))

        self.delta_z = 0.02
        self.min_nz = 10
        self.data_x = None
        self.data_mask = None


[docs]
    def clear_data(self):
        """Clear data used for numerical integration"""
        self.data_x = None
        self.data_mask = None



[docs]
    def get_pred(self, zp1, a, eq_numpy, integrated=False):
        """Return the predicted distance modulus from the H^2 function supplied.

        Args:
            :zp1 (float or np.array): 1 + z for redshift z
            :a (list): parameters to subsitute into equation considered
            :eq_numpy (numpy function): function to use which gives H^2
            :integrated (bool, default=False): whether we previously analytically integrated the function (True) or not (False)

        Returns:
            :mu (float or np.array): the predicted distance modulus at redshifts supplied

        """

        if integrated:
            dL = eq_numpy(zp1, *a) - eq_numpy(1, *a)
        else:
            if self.data_x is None or self.data_mask is None:
                nx = int(np.ceil((zp1.max() - zp1.min()) / self.delta_z))
                self.data_x = np.concatenate(
                    (np.linspace(1, zp1.min(), self.min_nz),
                     np.linspace(zp1.min() + self.delta_z,
                                 zp1.max() + self.delta_z, nx),
                     zp1))
                self.data_x = np.sort(np.unique(self.data_x))
                self.data_mask = np.squeeze(
                    np.array([np.where(self.data_x == d)[0] for d in zp1]))

            if len(a) == 0:
                dL = 1 / np.sqrt(eq_numpy(self.data_x))
            else:
                dL = 1 / np.sqrt(eq_numpy(self.data_x, *a))

            # If prediction is a float, need to make it an array of length = len(self.data_x)
            if np.isscalar(dL):
                dL = np.full(len(self.data_x), dL)

            dL = scipy.integrate.cumulative_trapezoid(
                dL, x=self.data_x, initial=0)
            dL = dL[self.data_mask]

        dL *= zp1
        mu = 5 * np.log10(dL) + self.mu_const

        return mu



[docs]
    def negloglike(self, a, eq_numpy, integrated=False):
        """Negative log-likelihood for a given function

        Args:
            :a (list): parameters to subsitute into equation considered
            :eq_numpy (numpy function): function to use which gives H^2

        Returns:
            :nll (float): - log(likelihood) for this function and parameters

        """
        mu_pred = self.get_pred(self.xvar, np.atleast_1d(
            a), eq_numpy, integrated=integrated)
        if not np.all(np.isreal(mu_pred)):
            return np.inf
        nll = 0.5 * np.dot((mu_pred - self.yvar),
                           np.dot(self.inv_cov, (mu_pred - self.yvar)))
        if np.isnan(nll):
            return np.inf
        return nll



[docs]
    def run_sympify(self, fcn_i, tmax=5, try_integration=True):
        """Sympify a function

        Args:
            :fcn_i (str): string representing function we wish to fit to data
            :tmax (float): maximum time in seconds to attempt analytic integration
            :try_integration (bool, default=True): as the likelihood requires an integral, whether to try to analytically integrate (True) or not (False)

        Returns:
            :fcn_i (str): string representing function we wish to fit to data (with superfluous characters removed)
            :eq (sympy object): sympy object representing function we wish to fit to data
            :integrated (bool): whether we we able to analytically integrate the function (True) or not (False)

        """

        fcn_i = fcn_i.replace('\n', '')
        fcn_i = fcn_i.replace('\'', '')

        eq = sympy.sympify(fcn_i,
                           locals={"inv": inv,
                                   "square": square,
                                   "cube": cube,
                                   "sqrt": sqrt,
                                   "log": log,
                                   "pow": pow,
                                   "x": x,
                                   "a0": a0,
                                   "a1": a1,
                                   "a2": a2})

        if try_integration:
            try:
                with time_limit(tmax):
                    eq2 = sympy.integrate(1 / sqrt(eq), x)
                    if eq2.has(sympy.Integral):
                        raise ValueError
                    eq = eq2
                    integrated = True
            except Exception:
                integrated = False
        else:
            integrated = False

        return fcn_i, eq, integrated





[docs]
class MockLikelihood(Likelihood):
    """Likelihood class used to fit mock cosmic chronometer data

    Args:
        :nz (int): number of mock redshifts to use
        :yfracerr (float): the fractional uncertainty on the cosmic chronometer mock we are using
        :data_dir (str, default=None): The path containing the data and cov files

    """

    def __init__(self, nz, yfracerr, data_dir=None):
        super().__init__(
            '/mock/CC_Hubble_%i_' % nz + str(yfracerr) + '.dat',
            '/mock/CC_Hubble_%i_' % nz + str(yfracerr) + '.dat',
            'mock_%i_' % nz + str(yfracerr),
            data_dir=data_dir
        )

        self.Hfid = 1.
        self.ylabel = r'$H \left( z \right) \ / \ H_{\rm fid}$'  # for plotting
        self.xvar, self.yvar, self.yerr = np.genfromtxt(
            self.data_file, unpack=True)
        self.xvar += 1
        self.yvar /= self.Hfid
        self.yerr /= self.Hfid
        self.inv_cov = 1 / self.yerr ** 2


[docs]
    def get_pred(self, zp1, a, eq_numpy, **kwargs):
        """Return the predicted H(z), which is the square root of the functions we are using.

        Args:
            :zp1 (float or np.array): 1 + z for redshift z
            :a (list): parameters to subsitute into equation considered
            :eq_numpy (numpy function): function to use which gives H^2

        Returns:
            :H (float or np.array): the predicted Hubble parameter at redshifts supplied

        """
        return np.sqrt(eq_numpy(zp1, *a))



[docs]
    def negloglike(self, a, eq_numpy, **kwargs):
        """Negative log-likelihood for a given function

        Args:
            :a (list): parameters to subsitute into equation considered
            :eq_numpy (numpy function): function to use which gives H^2

        Returns:
            :nll (float): - log(likelihood) for this function and parameters

        """
        H = self.get_pred(self.xvar, np.atleast_1d(a), eq_numpy)
        if not np.all(np.isreal(H)):
            return np.inf
        # inv_cov diagonal, so is vector here
        nll = np.sum(0.5 * (H - self.yvar) ** 2 * self.inv_cov)
        if np.isnan(nll):
            return np.inf
        return nll





[docs]
class MSE(Likelihood):
    """Likelihood class used to fit a function directly using a MSE

    IMPORTANT - MSE is NOT a likelihood in the probabilistic sense.
    It should not be used for MDL calculations as the answer will
    be nonesense since an uncertainty is required for MDL to have meaning.

    Args:
        :data_file (str): Name of the file containing the data to use
        :run_name (str): The name to be associated with this likelihood, e.g. 'my_esr_run'
        :data_dir (str, default=None): The path containing the data and cov files
        :fn_set (str, default='core_maths'): The name of the function set to use with the likelihood. Must match one of those defined in ``generation.duplicate_checker``

    """

    def __init__(self, data_file, run_name, data_dir=None, fn_set='core_maths'):

        super().__init__(data_file, data_file, run_name, data_dir=data_dir, fn_set=fn_set)
        self.ylabel = r'$y$'    # for plotting
        self.xvar, self.yvar, self.yerr = np.loadtxt(
            self.data_file, unpack=True)
        self.yerr = 0.

        warnings.warn("You are using the MSE class. MSE is NOT a likelihood in the probabilistic sense. It should not be used for MDL calculations as the answer will be nonesense since an uncertainty is required for MDL to have meaning.")
        self.is_mse = True  #  Warning to not use MSE for DL


[docs]
    def negloglike(self, a, eq_numpy, **kwargs):
        """Negative log-likelihood for a given function. Here it is (y-ypred)^2
        Note that this is technically not a log-likelihood, but the function
        name is required to be accessed by other functions.

        Args:
            :a (list): parameters to subsitute into equation considered
            :eq_numpy (numpy function): function to use which gives y

        Returns:
            :nll (float): - log(likelihood) for this function and parameters

        """

        ypred = self.get_pred(self.xvar, np.atleast_1d(a), eq_numpy)
        if not np.all(np.isreal(ypred)):
            return np.inf
        nll = np.mean((ypred - self.yvar) ** 2)
        if np.isnan(nll):
            return np.inf
        return nll





[docs]
class GaussLikelihood(Likelihood):
    """Likelihood class used to fit a function directly using a Gaussian likelihood

    Args:
        :data_file (str): Name of the file containing the data to use
        :run_name (str): The name to be associated with this likelihood, e.g. 'my_esr_run'
        :data_dir (str, default=None): The path containing the data and cov files
        :fn_set (str, default='core_maths'): The name of the function set to use with the likelihood. Must match one of those defined in ``generation.duplicate_checker``

    """

    def __init__(self, data_file, run_name, data_dir=None, fn_set='core_maths'):

        super().__init__(data_file, data_file, run_name, data_dir=data_dir, fn_set=fn_set)
        self.ylabel = r'$y$'    # for plotting
        self.xvar, self.yvar, self.yerr = np.loadtxt(
            self.data_file, unpack=True)


[docs]
    def negloglike(self, a, eq_numpy, **kwargs):
        """Negative log-likelihood for a given function.

        Args:
            :a (list): parameters to subsitute into equation considered
            :eq_numpy (numpy function): function to use which gives y

        Returns:
            :nll (float): - log(likelihood) for this function and parameters


        """

        ypred = self.get_pred(self.xvar, np.atleast_1d(a), eq_numpy)
        if not np.all(np.isreal(ypred)):
            return np.inf
        nll = np.sum(0.5 * (ypred - self.yvar) ** 2 / self.yerr **
                     2 + 0.5 * np.log(2 * np.pi) + np.log(self.yerr))
        if np.isnan(nll):
            return np.inf
        return nll





[docs]
class PoissonLikelihood(Likelihood):
    """Likelihood class used to fit a function directly using a Poisson likelihood

    Args:
        :data_file (str): Name of the file containing the data to use
        :run_name (str): The name to be associated with this likelihood, e.g. 'my_esr_run'
        :data_dir (str, default=None): The path containing the data and cov files
        :fn_set (str, default='core_maths'): The name of the function set to use with the likelihood. Must match one of those defined in ``generation.duplicate_checker``

    """

    def __init__(self, data_file, run_name, data_dir=None, fn_set='core_maths'):

        super().__init__(data_file, data_file, run_name, data_dir=data_dir, fn_set=fn_set)
        self.ylabel = r'$y$'    # for plotting
        self.xvar, self.yvar = np.loadtxt(self.data_file, unpack=True)
        self.yerr = np.sqrt(self.yvar)


[docs]
    def negloglike(self, a, eq_numpy, **kwargs):
        """Negative log-likelihood for a given function. Here it is a Poisson

        Args:
            :a (list): parameters to subsitute into equation considered
            :eq_numpy (numpy function): function to use which gives y

        Returns:
            :nll (float): - log(likelihood) for this function and parameters

        """

        ypred = self.get_pred(self.xvar, np.atleast_1d(a), eq_numpy)
        if (not np.all(np.isreal(ypred))) or (not np.all(ypred > 0)):
            return np.inf
        nll = np.sum(ypred - self.yvar * np.log(ypred))
        if np.isnan(nll):
            return np.inf
        return nll