Source code for fitting.test_all

import numpy as np
import sympy
import warnings
import os
import sys
from mpi4py import MPI
from scipy.optimize import minimize
import itertools

from esr.fitting.sympy_symbols import x, a0
import esr.generation.simplifier as simplifier

warnings.filterwarnings("ignore")

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()


[docs] def chi2_fcn(x, likelihood, eq_numpy, integrated, signs): """Compute chi2 for a function Args: :x (list): parameters to use for function :likelihood (fitting.likelihood object): object containing data and likelihood function :eq_numpy (numpy function): function to pass to likelihood object to make prediction of y(x) :integrated (bool): whether eq_numpy has already been integrated :signs (list): each entry specifies whether than parameter should be optimised logarithmically. If None, then do nothing, if '+' then optimise 10**x[i] and if '-' then optimise -10**x[i] Returns: :negloglike (float): - log(likelihood) for this function and parameters """ if signs is None: p = x else: p = [None] * len(signs) for i in range(len(signs)): if signs[i] is None: p[i] = x[i] elif signs[i] == '+': p[i] = 10 ** x[i] elif signs[i] == '-': p[i] = - 10 ** x[i] else: raise ValueError return likelihood.negloglike(p, eq_numpy, integrated=integrated)
[docs] def get_functions(comp, likelihood, unique=True): """Load all functions for a given complexity to use and distribute among ranks Args: :comp (int): complexity of functions to consider :likelihood (fitting.likelihood object): object containing data, functions to convert SR expressions to variable of data and file path :unique (bool, default=True): whether to load just the unique functions (True) or all functions (False) Returns: :fcn_list (list): list of strings representing functions to be used by given rank :data_start (int): first index of function used by rank :data_end (int): last index of function used by rank """ if unique: unifn_file = likelihood.fn_dir + \ "/compl_%i/unique_equations_%i.txt" % (comp, comp) else: unifn_file = likelihood.fn_dir + \ "/compl_%i/all_equations_%i.txt" % (comp, comp) if comp >= 8: sys.setrecursionlimit(2000 + 500 * (comp - 8)) if rank == 0: for dirname in [likelihood.base_out_dir, likelihood.out_dir, likelihood.temp_dir]: if not os.path.isdir(dirname): print('Making dir:', dirname) os.mkdir(dirname) comm.Barrier() if rank == 0: print("Number of cores:", size, flush=True) # First, count total number of lines without loading into memory if rank == 0: with open(unifn_file, "r") as f: total_lines = sum(1 for _ in f) else: total_lines = None # Broadcast total number of lines to all processes total_lines = comm.bcast(total_lines, root=0) # Number of lines per file for given thread nLs = int(np.ceil(total_lines / float(size))) while nLs*(size-1) > total_lines: if rank == 0: print("Correcting for many cores.", flush=True) nLs -= 1 if rank == 0: print("Total number of functions: ", nLs, flush=True) print("Number of test points per proc: ", nLs, flush=True) data_start = rank*nLs data_end = (rank+1)*nLs if rank == size-1: data_end = total_lines # Load the functions for this rank with open(unifn_file, "r") as f: # Skip lines up to data_start quickly for _ in range(data_start): next(f) # Now read only lines from data_start to data_end-1 fcn_list = [] for i in range(data_end - data_start): line = next(f) fcn_list.append(line.strip()) return fcn_list, data_start, data_end
[docs] def optimise_fun(fcn_i, likelihood, tmax, pmin, pmax, comp=0, try_integration=False, log_opt=False, max_param=4, Niter_params=[40, 60], Nconv_params=[5, 20], test_success=False, ignore_previous_eqns=True): """Optimise the parameters of a function to fit data The list of parameters, P, passed as Niter_params and Nconv_params compute these values, N, to be N = P[0] + P[1] * nparam + P[2] * nparam ** 2 + ... where nparam is the number of parameters of the function. The order of the polynomial is determined by the length of P, so P can be arbirary in length. Args: :fcn_i (str): string representing function we wish to fit to data :likelihood (fitting.likelihood object): object containing data and likelihood function :tmax (float): maximum time in seconds to run any one part of simplification procedure for a given function :pmin (float): minimum value for each parameter to consider when generating initial guess :pmax (float): maximum value for each parameter to consider when generating initial guess :comp (float, default=0): Complexity. Deafault of 0 because it is not provided when fitting a single function :try_integration (bool, default=False): when likelihood requires integral, whether to try to analytically integrate (True) or just numerically integrate (False) :log_opt (bool, default=False): whether to optimise 1 and 2 parameter cases in log space :max_param (int, default=4): The maximum number of parameters considered. This sets the shapes of arrays used. :Niter_params (list, default=[40, 60]): Parameters determining maximum number of parameter optimisation iterations to attempt. :Nconv_params (list, default=[-5, 20]): If we find Nconv solutions for the parameters which are within a logL of 0.5 of the best, we say we have converged and stop optimising parameters. These parameters determine Nconv. :test_sucess (bool, default=False): Whether to test whether the optimisation was successful using scipy's criteria :ignore_previous_eqns (bool, default=True): If we have seen an equation at lower complexity, whether to ignore the equation in this routine. Returns: :chi2_i (float): the minimum value of -log(likelihood) (corresponding to the maximum likelihood) :params (list): the maximum likelihood values of the parameters """ xvar = getattr(likelihood, 'xvar', None) nparam = simplifier.count_params([fcn_i], max_param)[0] params = np.zeros(max_param) if comp > 1 and ignore_previous_eqns: previous_fns_file = likelihood.fn_dir + "/compl_" + \ str(comp)+"/previous_eqns_"+str(comp)+".txt" with open(previous_fns_file, "r") as f: previous_fns = f.readlines() # discard repeat of lower complexity (e.g. [inv, inv, ...]) if fcn_i in previous_fns: return np.inf, params Niter = int(np.sum(nparam ** np.arange(len(Niter_params)) * np.array(Niter_params))) Nconv = int(np.sum(nparam ** np.arange(len(Nconv_params)) * np.array(Nconv_params))) if (Nconv <= 0) or (Niter <= 0) or (Nconv > Niter): raise ValueError("Nconv and/or Niter have unacceptable values") try: fcn_i, eq, integrated = likelihood.run_sympify( fcn_i, tmax=tmax, try_integration=try_integration) if "a0" not in fcn_i: eq_numpy = sympy.lambdify(x, eq, modules=["numpy"]) chi2_i = likelihood.negloglike([], eq_numpy, integrated=integrated) return chi2_i, params flag_three = False mult_arr = np.ones(max_param) count_lowest = 0 inf_count = 0 if nparam > 1: all_a = ' '.join([f'a{i}' for i in range(nparam)]) all_a = list(sympy.symbols(all_a, real=True)) eq_numpy = sympy.lambdify([x] + all_a, eq, modules=["numpy"]) else: eq_numpy = sympy.lambdify([x, a0], eq, modules=["numpy"]) bad_fun = True if xvar is not None: for p in itertools.product([1, -1], repeat=nparam): if not (np.sum(np.isnan(eq_numpy(xvar, *p))) > 0): bad_fun = False break if bad_fun: # Don't bother trying to optimise bc this fcn is clearly really bad chi2_i = np.inf return chi2_i, params # Reset chi2 chi2_min = np.inf if nparam > 2: flag_three = True for j in range(Niter): if nparam > 2: inpt = [np.random.uniform(pmin, pmax) for _ in range(nparam)] res = minimize(chi2_fcn, inpt, args=(likelihood, eq_numpy, integrated, # Default=3000 None), method="BFGS", options={'maxiter': 7000}) elif nparam == 2: if log_opt: # These are now in log-space, so this is 1e-1 -- 1e1; was -10:10 inpt = [np.random.uniform( pmin, pmax), np.random.uniform(pmin, pmax)] res_pp = minimize(chi2_fcn, inpt, args=( likelihood, eq_numpy, integrated, ['+', '+']), method="BFGS") res_mp = minimize(chi2_fcn, inpt, args=( likelihood, eq_numpy, integrated, ['-', '+']), method="BFGS") res_pm = minimize(chi2_fcn, inpt, args=( likelihood, eq_numpy, integrated, ['+', '-']), method="BFGS") res_mm = minimize(chi2_fcn, inpt, args=( likelihood, eq_numpy, integrated, ['-', '-']), method="BFGS") choose = np.argmin( [res_pp['fun'], res_mp['fun'], res_pm['fun'], res_mm['fun']]) mult_arr = np.ones(max_param) if choose == 0: res = res_pp elif choose == 1: res = res_mp mult_arr[0] = -1 elif choose == 2: res = res_pm mult_arr[1] = -1 elif choose == 3: res = res_mm mult_arr[0] = -1 mult_arr[1] = -1 else: print("Some ambiguity in choose", eq, flush=True) res = res_pp else: flag_three = True inpt = [np.random.uniform( pmin, pmax), np.random.uniform(pmin, pmax)] res = minimize(chi2_fcn, inpt, args=(likelihood, eq_numpy, integrated, # Default=3000 None), method="BFGS", options={'maxiter': 5000}) else: if log_opt: inpt = np.random.uniform(pmin, pmax) res_p = minimize(chi2_fcn, inpt, args=( likelihood, eq_numpy, integrated, ['+']), method="BFGS") res_m = minimize(chi2_fcn, inpt, args=( likelihood, eq_numpy, integrated, ['-']), method="BFGS") mult_arr = np.ones(max_param) if res_p['fun'] < res_m['fun']: res = res_p elif res_p['fun'] > res_m['fun']: res = res_m mult_arr[0] = -1 else: # Both give same. Choose positive (arbitrarily) res = res_p else: flag_three = True inpt = np.random.uniform(pmin, pmax) res = minimize(chi2_fcn, inpt, args=( likelihood, eq_numpy, integrated, None), method="BFGS", options={'maxiter': 5000}) if test_success and (not res.success): continue if np.isinf(res['fun']): inf_count += 1 # Failure if first 50 all give inf if inf_count == 50 and np.isinf(chi2_min): break # Reset count if log-like improves by 2 if res['fun']-chi2_min < -2.: count_lowest = 0 #  If within 0.5 of lowest, say converged to that value if abs(res['fun']-chi2_min) < 0.5: count_lowest += 1 if res['fun'] < chi2_min: best = res mult_arr_best = mult_arr chi2_min = res['fun'] # Converged the required number of times, so a success if count_lowest == Nconv: break if chi2_min < 1.e100: # Optimisation happened. Print something if flag_three: params = np.pad(np.array(best.x), (0, max_param-len(best.x))) else: # Params put in linear space and sign added back in params = np.pad(10.**np.array(best.x), (0, max_param-len(best.x))) * mult_arr_best elif not np.isfinite(chi2_min): print('\tFailed to find parameters for function:', fcn_i) # This is after all the iterations, so it's the best we have; reduced chi2 chi2_i = chi2_min except NameError: print(NameError) # Occurs if function produced not implemented in numpy raise NameError except simplifier.TimeoutException: print('TIMED OUT:', fcn_i, flush=True) try: if chi2_min < 1.e100: if flag_three: params = np.pad(np.array(best.x), (0, max_param-len(best.x))) else: params = np.pad(10.**np.array(best.x), (0, max_param-len(best.x))) * mult_arr_best chi2_i = chi2_min else: chi2_i = np.nan params[:] = 0. except Exception: chi2_i = np.nan params[:] = 0. except Exception as e: print(e) return np.nan, params return chi2_i, params
[docs] def main(comp, likelihood, tmax=5, pmin=0, pmax=3, print_frequency=50, try_integration=False, log_opt=False, Niter_params=[40, 60], Nconv_params=[5, 20], ignore_previous_eqns=False): """Optimise all functions for a given complexity and save results to file. This can optimise in log-space, with separate +ve and -ve branch (except when there are >=3 params in which case it does it in linear) The list of parameters, P, passed as Niter_params and Nconv_params compute these values, N, to be N = P[0] + P[1] * nparam + P[2] * nparam ** 2 + ... where nparam is the number of parameters of the function. The order of the polynomial is determined by the length of P, so P can be arbirary in length. Args: :comp (int): complexity of functions to consider :likelihood (fitting.likelihood object): object containing data, likelihood functions and file paths :tmax (float, default=5.): maximum time in seconds to run any one part of simplification procedure for a given function :pmin (float, default=0.): minimum value for each parameter to considered when generating initial guess :pmax (float, default=3.): maximum value for each parameter to considered when generating initial guess :print_frequency (int, default=50): the status of the fits will be printed every ``print_frequency`` number of iterations :try_integration (bool, default=False): when likelihood requires integral, whether to try to analytically integrate (True) or just numerically integrate (False) :log_opt (bool, default=False): whether to optimise 1 and 2 parameter cases in log space :Niter_params (list, default=[40, 60]): Parameters determining maximum number of parameter optimisation iterations to attempt. :Nconv_params (list, default=[-5, 20]): If we find Nconv solutions for the parameters which are within a logL of 0.5 of the best, we say we have converged and stop optimising parameters. These parameters determine Nconv. :ignore_previous_eqns (bool, default=False): If we have seen an equation at lower complexity, whether to ignore the equation in this routine. Returns: None """ if rank == 0: print('\nRunning fits', flush=True) fcn_list_proc, _, _ = get_functions(comp, likelihood) if rank == 0 and ignore_previous_eqns: previous_unifn_list = [] if comp > 1: for compl in range(1, comp): unifn_file_i = likelihood.fn_dir + \ "/compl_%i/unique_equations_%i.txt" % (compl, compl) with open(unifn_file_i, "r") as f: fcn_list_i = f.readlines() previous_unifn_list += fcn_list_i previous_unifn_list = np.array(previous_unifn_list) np.savetxt(likelihood.fn_dir + "/compl_"+str(comp) + "/previous_eqns_"+str(comp)+".txt", previous_unifn_list, fmt='%s') comm.Barrier() # Set max param >=4 for backwards compatibility max_param = int(max(4, np.floor((comp - 1) / 2))) chi2 = np.zeros(len(fcn_list_proc)) # This is now only for this proc params = np.zeros([len(fcn_list_proc), max_param]) for i in range(len(fcn_list_proc)): # Consider all possible complexities if rank == 0 and ((i == 0) or ((i+1) % print_frequency == 0)): print(f'{i+1} of {len(fcn_list_proc)}', flush=True) try: with simplifier.time_limit(tmax): try: chi2[i], params[i, :] = optimise_fun(fcn_list_proc[i], likelihood, tmax, pmin, pmax, comp=comp, try_integration=try_integration, log_opt=log_opt, max_param=max_param, Niter_params=Niter_params, Nconv_params=Nconv_params, ignore_previous_eqns=ignore_previous_eqns) except NameError: if try_integration: chi2[i], params[i, :] = optimise_fun(fcn_list_proc[i], likelihood, tmax, pmin, pmax, comp=comp, try_integration=False, log_opt=log_opt, max_param=max_param, Niter_params=Niter_params, Nconv_params=Nconv_params, ignore_previous_eqns=ignore_previous_eqns) else: raise NameError except Exception as e: print(e) chi2[i] = np.nan params[i, :] = 0. out_arr = np.transpose( np.vstack([chi2] + [params[:, i] for i in range(max_param)])) # Save the data for this proc in Partial np.savetxt(likelihood.temp_dir + '/chi2_comp'+str(comp) + 'weights_'+str(rank)+'.dat', out_arr, fmt='%.7e') comm.Barrier() if rank == 0: string = 'cat `find ' + likelihood.temp_dir + '/ -name "chi2_comp' + \ str(comp)+'weights_*.dat" | sort -V` > ' + \ likelihood.out_dir + '/negloglike_comp'+str(comp)+'.dat' os.system(string) string = 'rm ' + likelihood.temp_dir + \ '/chi2_comp'+str(comp)+'weights_*.dat' os.system(string) comm.Barrier() return