Source code for fitting.match

import numpy as np
import math
import sympy
from mpi4py import MPI
import warnings
import os
import itertools
import esr.fitting.test_all as test_all
import esr.fitting.test_all_Fisher as test_all_Fisher
from esr.fitting.sympy_symbols import x, a0
import esr.generation.simplifier as simplifier

warnings.filterwarnings("ignore")

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()



[docs]
def check_match_results(comp, likelihood, rtol=1e-5, atol=1e-8, tmax=5, try_integration=False, print_frequency=1000 ):
    """
    Check that the matches have been done correctly by re-evaluating the likelihoods of all functions
    from the output

    Args:
        :comp (int): complexity of functions to consider
        :likelihood (fitting.likelihood object): object containing data, likelihood functions and file paths
        :rtol (float, default=1e-5): relative tolerance for comparing likelihoods
        :atol (float, default=1e-8): absolute tolerance for comparing likelihood
        :tmax (float, default=5.): maximum time in seconds to run any one part of simplification procedure for a given function
        :try_integration (bool, default=False): when likelihood requires integral, whether to try to analytically integrate (True) or just numerically integrate (False)
        :print_frequency (int, default=1000): the status of the fits will be printed every ``print_frequency`` number of iterations
        
    Returns:
        :total_nbad (int): number of functions where the likelihood did not match
    """

    # Output was [negloglike_all, codelen, index_arr] + [params[:, i] for i in range(max_param)])]

    # Stream read through codelen_matches_comp*.dat file
    if rank == 0:
        with open(likelihood.out_dir + "/codelen_matches_comp" + str(comp) + ".dat", 'r') as f:
            num_lines = sum(1 for _ in f)  # Count total lines in the file
    else:
        num_lines = None

    # Get start and end indices for each rank based on number of lines
    num_lines = comm.bcast(num_lines, root=0)
    lines_per_rank = num_lines // size
    extra_lines = num_lines % size
    if rank < extra_lines:
        start_line = rank * (lines_per_rank + 1)
        end_line = start_line + lines_per_rank + 1
    else:
        start_line = rank * lines_per_rank + extra_lines
        end_line = start_line + lines_per_rank

    print(f"Rank {rank} processing lines {start_line} to {end_line-1} of {num_lines}", flush=True)
    comm.Barrier()

    allfn_file = likelihood.fn_dir + \
        "/compl_%i/all_equations_%i.txt" % (comp, comp)
    
    nbad = 0

    with open(likelihood.out_dir + "/codelen_matches_comp" + str(comp) + ".dat", 'r') as f, \
            open(allfn_file, 'r') as allfn_f:
        
        for i, (line, line_fcn) in enumerate(zip(f, allfn_f)):

            comm.Barrier()

            if rank == 0 and i % print_frequency == 0:
                print(f'{i+1} of {num_lines}', flush=True)

            if i < start_line or i >= end_line:
                continue  # Skip lines not assigned to this rank

            fcn_i = line_fcn.strip()
            d = line.strip().split()
            stored_negloglike = float(d[0])
            params = [float(p) for p in d[3:]]
            max_param = len(params)
            codelen = float(d[1])

            # Evaluate the function with the stored parameters
            k = simplifier.count_params([fcn_i], max_param)[0]
            measured = params[:k]

            if 'zoo' in fcn_i:
                # zoo functions can't be evaluated
                continue

            if np.any(np.isnan(measured)) or np.any(np.isinf(measured)):
                # skip functions with invalid parameters
                continue

            fcn_i, eq, integrated = likelihood.run_sympify(
                fcn_i, tmax=tmax, try_integration=try_integration)

            if k == 0:
                eq_numpy = sympy.lambdify([x], eq, modules=["numpy"])
            elif k > 1:
                all_a = ' '.join([f'a{j}' for j in range(k)])
                all_a = list(sympy.symbols(all_a, real=True))
                eq_numpy = sympy.lambdify([x] + all_a, eq, modules=["numpy"])
            else:
                eq_numpy = sympy.lambdify([x, a0], eq, modules=["numpy"])
            new_negloglike = likelihood.negloglike(
                measured, eq_numpy, integrated=integrated)

            if not np.isclose(stored_negloglike, new_negloglike, rtol=rtol, atol=atol):

                # We only care if the codelength is finite
                if np.isnan(codelen) or np.isinf(codelen):
                    continue
                print(f"Rank {rank}: Mismatch in negloglike for function {fcn_i}",
                       stored_negloglike, new_negloglike, measured)
                nbad += 1

    # Gather nbad from all ranks and sum them
    total_nbad = comm.reduce(nbad, op=MPI.SUM, root=0)

    if rank == 0:
        if total_nbad == 0:
            print("All likelihoods match!", flush=True)
        else:
            print(f"Total mismatches in likelihoods: {total_nbad}", flush=True)

    comm.Barrier()

    return total_nbad




[docs]
def main(comp, likelihood, tmax=5, print_frequency=1000, try_integration=False):
    """Apply results of fitting the unique functions to all functions and save to file

    Args:
        :comp (int): complexity of functions to consider
        :likelihood (fitting.likelihood object): object containing data, likelihood functions and file paths
        :tmax (float, default=5.): maximum time in seconds to run any one part of simplification procedure for a given function
        :print_frequency (int, default=1000): the status of the fits will be printed every ``print_frequency`` number of iterations
        :try_integration (bool, default=False): when likelihood requires integral, whether to try to analytically integrate (True) or just numerically integrate (False)

    Returns:
        None

    """

    if likelihood.is_mse:
        raise ValueError('Cannot use MSE with description length')

    def fop(x):
        return likelihood.negloglike(x, eq_numpy, integrated=integrated)

    def f1(x):
        return likelihood.negloglike([x], eq_numpy, integrated=integrated)

    if rank == 0:
        print('\nMatching', flush=True)

    invsubs_file = likelihood.fn_dir + \
        "/compl_%i/inv_subs_%i.txt" % (comp, comp)
    match_file = likelihood.fn_dir + "/compl_%i/matches_%i.txt" % (comp, comp)

    fcn_list_proc, data_start, data_end = test_all.get_functions(
        comp, likelihood, unique=False)

    negloglike, params_meas = test_all_Fisher.load_loglike(
        comp, likelihood, data_start, data_end, split=False)
    max_param = params_meas.shape[1]

    # all_inv_subs_proc = simplifier.load_subs(invsubs_file, max_param)[data_start:data_end]
    all_inv_subs_proc = simplifier.load_subs(
        invsubs_file, max_param, bcast_res=False)

    with open(match_file, 'r') as f:
        matches_proc = np.fromiter(
            (int(float(line.strip()))
             for line in itertools.islice(f, data_start, data_end)),
            dtype=int
        )

    # 2D array of shape (# unique fcns, 10)
    all_fish = np.loadtxt(likelihood.out_dir + '/derivs_comp'+str(comp)+'.dat')
    all_fish = np.atleast_2d(all_fish)

    # Both of these are also just for this proc
    codelen = np.zeros(len(fcn_list_proc))
    negloglike_all = np.zeros(len(fcn_list_proc))
    index_arr = np.zeros(len(fcn_list_proc))
    params = np.zeros([len(fcn_list_proc), max_param])

    # The part of all eqs analysed by this proc
    for i in range(len(fcn_list_proc)):
        if i % print_frequency == 0 and rank == 0:
            print(i, len(fcn_list_proc))

        fcn_i = fcn_list_proc[i].replace('\'', '')

        nparams = simplifier.count_params([fcn_i], max_param)[0]

        # Index in total unique eqs file, common to all procs
        index = matches_proc[i]
        index_arr[i] = index

        # Assign the likelihood of this variant to the that of the unique eq
        negloglike_all[i] = negloglike[index]

        # Element of the unique eqs file, common to all procs
        if np.isnan(negloglike[index]) or np.isinf(negloglike[index]):
            codelen[i] = np.nan
            continue

        if nparams == 0:
            continue
        else:
            k = nparams
            measured = params_meas[index, :nparams].copy()

        # Access from the unique eqs all_fish array, common to all procs
        fish_measured = all_fish[index, :]


        # if (i in all_inv_subs_proc) and not isinstance(all_inv_subs_proc[i], dict):
        #     codelen[i] = np.inf
        #     continue

        try:
            if i + data_start in all_inv_subs_proc:
                sub = all_inv_subs_proc[i + data_start]
            else:
                sub = {}
            p, fish = simplifier.convert_params(
                measured, fish_measured, sub, n=max_param)
            if isinstance(p, float):
                p = [p]
            p = np.atleast_1d(p)
        except Exception as e:
            print('\nError with function:', fcn_i.strip(), e)
            codelen[i] = np.inf
            continue

        if np.sum(fish <= 0) > 0:
            codelen[i] = np.inf
            continue

        try:
            Delta = np.zeros(len(fish))
            m = (fish != 0)
            Delta[m] = np.atleast_1d(np.sqrt(12./fish[m]))
            Delta[~m] = np.inf
            Nsteps = np.atleast_1d(np.abs(np.array(p)))
            m = (Delta != 0)
            Nsteps[m] /= Delta[m]
            Nsteps[~m] = np.nan
        except Exception as e:
            print('Error with function:', fcn_i, e)
            codelen[i] = np.inf
            continue

        negloglike_orig = np.copy(negloglike_all[i])
        ptrue = np.copy(p)

        # Should reevaluate -log(L) with the param(s) set to 0, but doesn't matter unless the fcn is a very good one
        if np.sum(Nsteps < 1) > 0:
            # Set any parameter to 0 that doesn't have at least one precision step, and recompute -log(L)
            try:
                p[Nsteps < 1] = 0.
            except (IndexError, TypeError):
                p = 0.

            # It's possible that after putting params to 0 the likelihood is botched, in which case give it nan
            try:
                fcn_i, eq, integrated = likelihood.run_sympify(
                    fcn_i, tmax=tmax, try_integration=try_integration)
                if k == 1:
                    eq_numpy = sympy.lambdify([x, a0], eq, modules=["numpy"])
                    # Modified here for this variant, but if this doesn't happen it stays the same as the unique eq
                    negloglike_all[i] = f1(p)
                else:
                    all_a = ' '.join([f'a{j}' for j in range(nparams)])
                    all_a = list(sympy.symbols(all_a, real=True))
                    eq_numpy = sympy.lambdify(
                        [x] + all_a, eq, modules=["numpy"])
                    negloglike_all[i] = fop(p)

            except NameError:
                if try_integration:
                    fcn_i, eq, integrated = likelihood.run_sympify(
                        fcn_i, tmax=tmax, try_integration=False)
                    if k == 1:
                        eq_numpy = sympy.lambdify(
                            [x, a0], eq, modules=["numpy"])
                        # Modified here for this variant, but if this doesn't happen it stays the same as the unique eq
                        negloglike_all[i] = f1(p)
                    else:
                        all_a = ' '.join([f'a{j}' for j in range(nparams)])
                        all_a = list(sympy.symbols(all_a, real=True))
                        eq_numpy = sympy.lambdify(
                            [x] + all_a, eq, modules=["numpy"])
                        negloglike_all[i] = fop(p)
                else:
                    negloglike_all[i] = np.nan

            except Exception:
                negloglike_all[i] = np.nan

            if np.isfinite(negloglike_all[i]):
                k -= np.sum(Nsteps < 1)
                kept_mask = Nsteps >= 1
            else:
                # Let's see if setting any of the parameters to zero is ok
                try_idx = np.arange(nparams)[Nsteps < 1]
                for r in reversed(range(1, len(try_idx))):
                    for idx in itertools.combinations(try_idx, r):
                        p = np.copy(ptrue)
                        for idx_ in idx:
                            p[idx_] = 0.
                        if k == 1:
                            # Modified here for this variant, but if this doesn't happen it stays the same as the unique eq
                            negloglike_all[i] = f1(p)
                        else:
                            negloglike_all[i] = fop(p)
                        if np.isfinite(negloglike_all[i]):
                            break
                kept_mask = np.ones(len(p), dtype=bool)
                if np.isfinite(negloglike_all[i]):
                    k -= len(idx)
                    kept_mask[idx] = 0
                # infinite nll
                elif not np.isfinite(negloglike_all[i]) and not np.isnan(negloglike_all[i]):
                    p = ptrue
                    # set uncertainty=parameter in this case
                    fish[Nsteps < 1] = 12./(p[Nsteps < 1]**2)
                    codelen[i] = -k/2.*math.log(3.) + np.sum(0.5 *
                                                             np.log(fish) + np.log(abs(np.array(p))))
                    negloglike_all[i] = negloglike_orig
                    # If p was an array, we can make a list out of it
                    try:
                        params[i, :] = np.pad(p, (0, max_param-len(p)))
                    except Exception:
                        # p is either a number or nothing
                        if p:
                            # p is a number
                            params[i, :] = 0
                            params[i, 0] = p
                        else:
                            params[i, :] = np.zeros(max_param)

                    assert len(params[i, :]) == max_param
                    continue

            if k < 0:
                print("This shouldn't have happened", flush=True)
                quit()
            elif k == 0:
                # If we have no parameters left then the parameter codelength is 0 so we can move on
                continue

            # Only consider these parameters in the codelen
            fish = fish[kept_mask]
            p = p[kept_mask]

        else:
            kept_mask = np.ones(len(p), dtype=bool)

        try:
            codelen[i] = -k/2.*math.log(3.) + np.sum(0.5 *
                                                     np.log(fish) + np.log(abs(np.array(p))))
        except Exception:
            codelen[i] = np.nan

        p = ptrue
        p[~kept_mask] = 0.

        try:        # If p was an array, we can make a list out of it
            params[i, :] = np.pad(p, (0, max_param-len(p)))
        except Exception:     # p is either a number or nothing
            if p:   # p is a number
                params[i, :] = 0
                params[i, 0] = p
            else:
                params[i, :] = np.zeros(max_param)

        assert len(params[i, :]) == max_param

    out_arr = np.transpose(np.vstack(
        [negloglike_all, codelen, index_arr] + [params[:, i] for i in range(max_param)]))

    np.savetxt(likelihood.temp_dir + '/codelen_matches_'+str(comp)+'_'+str(rank) +
               '.dat', out_arr, fmt='%.7e')        # Save the data for this proc in Partial

    comm.Barrier()

    if rank == 0:
        string = 'cat `find ' + likelihood.temp_dir + '/ -name "codelen_matches_' + \
            str(comp)+'_*.dat" | sort -V` > ' + likelihood.out_dir + \
            '/codelen_matches_comp'+str(comp)+'.dat'
        os.system(string)
        string = 'rm ' + likelihood.temp_dir + \
            '/codelen_matches_'+str(comp)+'_*.dat'
        os.system(string)

        print('Saved output to', likelihood.out_dir, flush=True)

    comm.Barrier()

    return