import numpy as np
import math
import sympy
from mpi4py import MPI
import warnings
import os
import itertools
import esr.fitting.test_all as test_all
import esr.fitting.test_all_Fisher as test_all_Fisher
from esr.fitting.sympy_symbols import x, a0
import esr.generation.simplifier as simplifier
warnings.filterwarnings("ignore")
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
[docs]
def check_match_results(comp, likelihood, rtol=1e-5, atol=1e-8, tmax=5, try_integration=False, print_frequency=1000 ):
"""
Check that the matches have been done correctly by re-evaluating the likelihoods of all functions
from the output
Args:
:comp (int): complexity of functions to consider
:likelihood (fitting.likelihood object): object containing data, likelihood functions and file paths
:rtol (float, default=1e-5): relative tolerance for comparing likelihoods
:atol (float, default=1e-8): absolute tolerance for comparing likelihood
:tmax (float, default=5.): maximum time in seconds to run any one part of simplification procedure for a given function
:try_integration (bool, default=False): when likelihood requires integral, whether to try to analytically integrate (True) or just numerically integrate (False)
:print_frequency (int, default=1000): the status of the fits will be printed every ``print_frequency`` number of iterations
Returns:
:total_nbad (int): number of functions where the likelihood did not match
"""
# Output was [negloglike_all, codelen, index_arr] + [params[:, i] for i in range(max_param)])]
# Stream read through codelen_matches_comp*.dat file
if rank == 0:
with open(likelihood.out_dir + "/codelen_matches_comp" + str(comp) + ".dat", 'r') as f:
num_lines = sum(1 for _ in f) # Count total lines in the file
else:
num_lines = None
# Get start and end indices for each rank based on number of lines
num_lines = comm.bcast(num_lines, root=0)
lines_per_rank = num_lines // size
extra_lines = num_lines % size
if rank < extra_lines:
start_line = rank * (lines_per_rank + 1)
end_line = start_line + lines_per_rank + 1
else:
start_line = rank * lines_per_rank + extra_lines
end_line = start_line + lines_per_rank
print(f"Rank {rank} processing lines {start_line} to {end_line-1} of {num_lines}", flush=True)
comm.Barrier()
allfn_file = likelihood.fn_dir + \
"/compl_%i/all_equations_%i.txt" % (comp, comp)
nbad = 0
with open(likelihood.out_dir + "/codelen_matches_comp" + str(comp) + ".dat", 'r') as f, \
open(allfn_file, 'r') as allfn_f:
for i, (line, line_fcn) in enumerate(zip(f, allfn_f)):
comm.Barrier()
if rank == 0 and i % print_frequency == 0:
print(f'{i+1} of {num_lines}', flush=True)
if i < start_line or i >= end_line:
continue # Skip lines not assigned to this rank
fcn_i = line_fcn.strip()
d = line.strip().split()
stored_negloglike = float(d[0])
params = [float(p) for p in d[3:]]
max_param = len(params)
codelen = float(d[1])
# Evaluate the function with the stored parameters
k = simplifier.count_params([fcn_i], max_param)[0]
measured = params[:k]
if 'zoo' in fcn_i:
# zoo functions can't be evaluated
continue
if np.any(np.isnan(measured)) or np.any(np.isinf(measured)):
# skip functions with invalid parameters
continue
fcn_i, eq, integrated = likelihood.run_sympify(
fcn_i, tmax=tmax, try_integration=try_integration)
if k == 0:
eq_numpy = sympy.lambdify([x], eq, modules=["numpy"])
elif k > 1:
all_a = ' '.join([f'a{j}' for j in range(k)])
all_a = list(sympy.symbols(all_a, real=True))
eq_numpy = sympy.lambdify([x] + all_a, eq, modules=["numpy"])
else:
eq_numpy = sympy.lambdify([x, a0], eq, modules=["numpy"])
new_negloglike = likelihood.negloglike(
measured, eq_numpy, integrated=integrated)
if not np.isclose(stored_negloglike, new_negloglike, rtol=rtol, atol=atol):
# We only care if the codelength is finite
if np.isnan(codelen) or np.isinf(codelen):
continue
print(f"Rank {rank}: Mismatch in negloglike for function {fcn_i}",
stored_negloglike, new_negloglike, measured)
nbad += 1
# Gather nbad from all ranks and sum them
total_nbad = comm.reduce(nbad, op=MPI.SUM, root=0)
if rank == 0:
if total_nbad == 0:
print("All likelihoods match!", flush=True)
else:
print(f"Total mismatches in likelihoods: {total_nbad}", flush=True)
comm.Barrier()
return total_nbad
[docs]
def main(comp, likelihood, tmax=5, print_frequency=1000, try_integration=False):
"""Apply results of fitting the unique functions to all functions and save to file
Args:
:comp (int): complexity of functions to consider
:likelihood (fitting.likelihood object): object containing data, likelihood functions and file paths
:tmax (float, default=5.): maximum time in seconds to run any one part of simplification procedure for a given function
:print_frequency (int, default=1000): the status of the fits will be printed every ``print_frequency`` number of iterations
:try_integration (bool, default=False): when likelihood requires integral, whether to try to analytically integrate (True) or just numerically integrate (False)
Returns:
None
"""
if likelihood.is_mse:
raise ValueError('Cannot use MSE with description length')
def fop(x):
return likelihood.negloglike(x, eq_numpy, integrated=integrated)
def f1(x):
return likelihood.negloglike([x], eq_numpy, integrated=integrated)
if rank == 0:
print('\nMatching', flush=True)
invsubs_file = likelihood.fn_dir + \
"/compl_%i/inv_subs_%i.txt" % (comp, comp)
match_file = likelihood.fn_dir + "/compl_%i/matches_%i.txt" % (comp, comp)
fcn_list_proc, data_start, data_end = test_all.get_functions(
comp, likelihood, unique=False)
negloglike, params_meas = test_all_Fisher.load_loglike(
comp, likelihood, data_start, data_end, split=False)
max_param = params_meas.shape[1]
# all_inv_subs_proc = simplifier.load_subs(invsubs_file, max_param)[data_start:data_end]
all_inv_subs_proc = simplifier.load_subs(
invsubs_file, max_param, bcast_res=False)
with open(match_file, 'r') as f:
matches_proc = np.fromiter(
(int(float(line.strip()))
for line in itertools.islice(f, data_start, data_end)),
dtype=int
)
# 2D array of shape (# unique fcns, 10)
all_fish = np.loadtxt(likelihood.out_dir + '/derivs_comp'+str(comp)+'.dat')
all_fish = np.atleast_2d(all_fish)
# Both of these are also just for this proc
codelen = np.zeros(len(fcn_list_proc))
negloglike_all = np.zeros(len(fcn_list_proc))
index_arr = np.zeros(len(fcn_list_proc))
params = np.zeros([len(fcn_list_proc), max_param])
# The part of all eqs analysed by this proc
for i in range(len(fcn_list_proc)):
if i % print_frequency == 0 and rank == 0:
print(i, len(fcn_list_proc))
fcn_i = fcn_list_proc[i].replace('\'', '')
nparams = simplifier.count_params([fcn_i], max_param)[0]
# Index in total unique eqs file, common to all procs
index = matches_proc[i]
index_arr[i] = index
# Assign the likelihood of this variant to the that of the unique eq
negloglike_all[i] = negloglike[index]
# Element of the unique eqs file, common to all procs
if np.isnan(negloglike[index]) or np.isinf(negloglike[index]):
codelen[i] = np.nan
continue
if nparams == 0:
continue
else:
k = nparams
measured = params_meas[index, :nparams].copy()
# Access from the unique eqs all_fish array, common to all procs
fish_measured = all_fish[index, :]
# if (i in all_inv_subs_proc) and not isinstance(all_inv_subs_proc[i], dict):
# codelen[i] = np.inf
# continue
try:
if i + data_start in all_inv_subs_proc:
sub = all_inv_subs_proc[i + data_start]
else:
sub = {}
p, fish = simplifier.convert_params(
measured, fish_measured, sub, n=max_param)
if isinstance(p, float):
p = [p]
p = np.atleast_1d(p)
except Exception as e:
print('\nError with function:', fcn_i.strip(), e)
codelen[i] = np.inf
continue
if np.sum(fish <= 0) > 0:
codelen[i] = np.inf
continue
try:
Delta = np.zeros(len(fish))
m = (fish != 0)
Delta[m] = np.atleast_1d(np.sqrt(12./fish[m]))
Delta[~m] = np.inf
Nsteps = np.atleast_1d(np.abs(np.array(p)))
m = (Delta != 0)
Nsteps[m] /= Delta[m]
Nsteps[~m] = np.nan
except Exception as e:
print('Error with function:', fcn_i, e)
codelen[i] = np.inf
continue
negloglike_orig = np.copy(negloglike_all[i])
ptrue = np.copy(p)
# Should reevaluate -log(L) with the param(s) set to 0, but doesn't matter unless the fcn is a very good one
if np.sum(Nsteps < 1) > 0:
# Set any parameter to 0 that doesn't have at least one precision step, and recompute -log(L)
try:
p[Nsteps < 1] = 0.
except (IndexError, TypeError):
p = 0.
# It's possible that after putting params to 0 the likelihood is botched, in which case give it nan
try:
fcn_i, eq, integrated = likelihood.run_sympify(
fcn_i, tmax=tmax, try_integration=try_integration)
if k == 1:
eq_numpy = sympy.lambdify([x, a0], eq, modules=["numpy"])
# Modified here for this variant, but if this doesn't happen it stays the same as the unique eq
negloglike_all[i] = f1(p)
else:
all_a = ' '.join([f'a{j}' for j in range(nparams)])
all_a = list(sympy.symbols(all_a, real=True))
eq_numpy = sympy.lambdify(
[x] + all_a, eq, modules=["numpy"])
negloglike_all[i] = fop(p)
except NameError:
if try_integration:
fcn_i, eq, integrated = likelihood.run_sympify(
fcn_i, tmax=tmax, try_integration=False)
if k == 1:
eq_numpy = sympy.lambdify(
[x, a0], eq, modules=["numpy"])
# Modified here for this variant, but if this doesn't happen it stays the same as the unique eq
negloglike_all[i] = f1(p)
else:
all_a = ' '.join([f'a{j}' for j in range(nparams)])
all_a = list(sympy.symbols(all_a, real=True))
eq_numpy = sympy.lambdify(
[x] + all_a, eq, modules=["numpy"])
negloglike_all[i] = fop(p)
else:
negloglike_all[i] = np.nan
except Exception:
negloglike_all[i] = np.nan
if np.isfinite(negloglike_all[i]):
k -= np.sum(Nsteps < 1)
kept_mask = Nsteps >= 1
else:
# Let's see if setting any of the parameters to zero is ok
try_idx = np.arange(nparams)[Nsteps < 1]
for r in reversed(range(1, len(try_idx))):
for idx in itertools.combinations(try_idx, r):
p = np.copy(ptrue)
for idx_ in idx:
p[idx_] = 0.
if k == 1:
# Modified here for this variant, but if this doesn't happen it stays the same as the unique eq
negloglike_all[i] = f1(p)
else:
negloglike_all[i] = fop(p)
if np.isfinite(negloglike_all[i]):
break
kept_mask = np.ones(len(p), dtype=bool)
if np.isfinite(negloglike_all[i]):
k -= len(idx)
kept_mask[idx] = 0
# infinite nll
elif not np.isfinite(negloglike_all[i]) and not np.isnan(negloglike_all[i]):
p = ptrue
# set uncertainty=parameter in this case
fish[Nsteps < 1] = 12./(p[Nsteps < 1]**2)
codelen[i] = -k/2.*math.log(3.) + np.sum(0.5 *
np.log(fish) + np.log(abs(np.array(p))))
negloglike_all[i] = negloglike_orig
# If p was an array, we can make a list out of it
try:
params[i, :] = np.pad(p, (0, max_param-len(p)))
except Exception:
# p is either a number or nothing
if p:
# p is a number
params[i, :] = 0
params[i, 0] = p
else:
params[i, :] = np.zeros(max_param)
assert len(params[i, :]) == max_param
continue
if k < 0:
print("This shouldn't have happened", flush=True)
quit()
elif k == 0:
# If we have no parameters left then the parameter codelength is 0 so we can move on
continue
# Only consider these parameters in the codelen
fish = fish[kept_mask]
p = p[kept_mask]
else:
kept_mask = np.ones(len(p), dtype=bool)
try:
codelen[i] = -k/2.*math.log(3.) + np.sum(0.5 *
np.log(fish) + np.log(abs(np.array(p))))
except Exception:
codelen[i] = np.nan
p = ptrue
p[~kept_mask] = 0.
try: # If p was an array, we can make a list out of it
params[i, :] = np.pad(p, (0, max_param-len(p)))
except Exception: # p is either a number or nothing
if p: # p is a number
params[i, :] = 0
params[i, 0] = p
else:
params[i, :] = np.zeros(max_param)
assert len(params[i, :]) == max_param
out_arr = np.transpose(np.vstack(
[negloglike_all, codelen, index_arr] + [params[:, i] for i in range(max_param)]))
np.savetxt(likelihood.temp_dir + '/codelen_matches_'+str(comp)+'_'+str(rank) +
'.dat', out_arr, fmt='%.7e') # Save the data for this proc in Partial
comm.Barrier()
if rank == 0:
string = 'cat `find ' + likelihood.temp_dir + '/ -name "codelen_matches_' + \
str(comp)+'_*.dat" | sort -V` > ' + likelihood.out_dir + \
'/codelen_matches_comp'+str(comp)+'.dat'
os.system(string)
string = 'rm ' + likelihood.temp_dir + \
'/codelen_matches_'+str(comp)+'_*.dat'
os.system(string)
print('Saved output to', likelihood.out_dir, flush=True)
comm.Barrier()
return