created matt_dumont
on: 25/01/24
import time
import traceback
from pathlib import Path
import numpy as np
import pandas as pd
import logging
import multiprocessing
import os
import psutil
import sys
import warnings
from komanawa.gw_age_tools import binary_exp_piston_flow_cdf, predict_historical_source_conc, make_age_dist, \
class BaseDetectionCalculator:
Base class for detection power calculations, provides some general methods for power calculations
# class-level defaults; the trailing comments mark the ones a subclass is expected to override
# implemented_mrt_models: the allowed values that _multiprocess_checks tests mrt_model_vals against
implemented_mrt_models = None # override in subclass
_power_from_max = False # override in subclass
_power_from_min = False # override in subclass
implemented_significance_modes = None
# _auto_mode: when True _get_key_info adds the auto-mode kwarg table and _multiprocess_checks
# validates mrt_model_vals and the max_conc_lim limits
_auto_mode = False
# _counterfactual: selects the base/alt kwarg names in the multiprocess helpers
_counterfactual = False
# log_level: compared against logging.INFO before some progress messages are printed
log_level = logging.INFO
def __init__(self):
raise NotImplementedError('must be implemented in subclass')
def _round_kwarg_value(self, val, key):
helper function to round a kwarg value to the precision of the kwarg
:param val: value to round
:param key: kwarg name
use_key = key.replace('_vals', '')
if hasattr(self, f'{use_key}_per'):
float_percision = getattr(self, f'{use_key}_per')
float_percision = 9
if val is None:
return val
if pd.api.types.is_float(val):
return np.round(val, float_percision)
return val
def _get_id_str(self, val, name):
helper function to get a string for an idv used to reduce the number of runs for multiprocessing workload
:param val:
:param name:
:param float_percision:
name = name.replace('_vals', '')
if hasattr(self, f'{name}_per'):
float_percision = getattr(self, f'{name}_per')
float_percision = 9
if val is None:
return f'{name}=None'
if pd.api.types.is_float(val):
return f'{name}={val:.{float_percision}f}'
return f'{name}={val}'
def _get_key_info(self, key):
base_data = dict(
error_vals=(False, False, False),
error_base_vals=(False, False, False),
error_alt_vals=(True, False, False),
seed_alt_vals_vals=(True, True, False),
seed_base_vals_vals=(True, True, False),
seed=(True, True, False),
auto_data = dict(
samp_years_vals=(False, True, False),
samp_per_year_vals=(False, True, False),
implementation_time_vals=(False, True, False),
initial_conc_vals=(False, False, False),
target_conc_vals=(False, False, False),
prev_slope_vals=(False, False, False),
max_conc_lim_vals=(False, False, False),
min_conc_lim_vals=(False, False, False),
mrt_vals=(False, False, False),
mrt_p1_vals=(True, False, False),
frac_p1_vals=(True, False, False),
f_p1_vals=(True, False, False),
f_p2_vals=(True, False, False),
if self._auto_mode:
data = {**base_data, **auto_data}
data = base_data
none_allowed, is_int, is_any = data.get(key, (False, False, True))
return none_allowed, is_int, is_any
def _run_multiprocess_pass_conc(self, outpath, idv_vals, run, debug_mode, use_kwargs):
outpath, idv_vals, use_kwargs = self._multiprocess_checks(outpath, idv_vals, **use_kwargs)
runs = []
for i, idv in enumerate(idv_vals):
kwargs = {k.replace('_vals', ''): v[i] for k, v in use_kwargs.items()}
kwargs['idv'] = idv
if self.log_level <= logging.INFO:
print(f'running {len(runs)} runs')
if not run:
print(f'stopping as {run=}')
if debug_mode:
result_data = []
for run_kwargs in runs:
print(f'running power calc for: {run_kwargs["idv"]} with debug_mode=True (single process)')
result_data = _run_multiprocess(self._power_calc_mp, runs, num_cores=self.ncores,
result_data = pd.DataFrame(result_data)
result_data.set_index('idv', inplace=True)
if outpath is not None:
print(f'saving results to: {outpath}')
outpath.parent.mkdir(parents=True, exist_ok=True)
result_data.to_hdf(outpath, 'data')
return result_data
def _run_multiprocess_auto(self, outpath, idv_vals, run, debug_mode, use_kwargs):
outpath, idv_vals, use_kwargs = self._multiprocess_checks(outpath, idv_vals, **use_kwargs)
# make runs
runs = []
if self.condensed_mode:
if self._counterfactual:
identifiers = ['error_base', 'error_alt', 'samp_years', 'samp_per_year', 'implementation_time_base',
'implementation_time_alt', 'delay_years',
'target_conc_base', 'target_conc_alt', 'prev_slope', 'max_conc_lim', 'min_conc_lim',
'mrt', 'mrt_p1', 'frac_p1', 'f_p1', 'f_p2', 'seed_alt', 'seed_base']
identifiers = ['error', 'samp_years', 'samp_per_year', 'implementation_time', 'initial_conc',
'target_conc', 'prev_slope', 'max_conc_lim', 'min_conc_lim', 'mrt_model',
'mrt', 'mrt_p1', 'frac_p1', 'f_p1', 'f_p2', 'seed']
all_use_idv = []
run_list = []
print('creating and condensing runs')
use_kwargs = {k: self._round_kwarg_value(v, k) for k, v in use_kwargs.items()}
for i, idv in enumerate(idv_vals):
if i % 1000 == 0:
print(f'forming/condesing run {i} of {len(idv_vals)}')
use_idv = '_'.join([self._get_id_str(use_kwargs[e + '_vals'][i], e) for e in identifiers])
if use_idv in run_list:
kwargs = {k.replace('_vals', ''): v[i] for k, v in use_kwargs.items()}
kwargs['idv'] = use_idv
all_use_idv = idv_vals
for i, idv in enumerate(idv_vals):
kwargs = {k.replace('_vals', ''): v[i] for k, v in use_kwargs.items()}
kwargs['idv'] = idv
if self.condensed_mode:
print(f'running {len(runs)} runs, simplified from {len(idv_vals)} runs')
print(f'running {len(runs)} runs')
if not run:
print(f'stopping as {run=}')
if debug_mode:
result_data = []
for run_kwargs in runs:
print(f'running power calc for: {run_kwargs["idv"]} with debug_mode=True (single process)')
result_data = _run_multiprocess(self._power_calc_mp, runs, num_cores=self.ncores,
result_data = pd.DataFrame(result_data)
result_data.set_index('idv', inplace=True)
outdata = result_data.loc[all_use_idv]
outdata.loc[:, 'idv'] = idv_vals
outdata.set_index('idv', inplace=True, drop=True)
if outpath is not None:
outpath.parent.mkdir(parents=True, exist_ok=True)
outdata.to_hdf(outpath, 'data')
return outdata
def _adjust_shape(x, shape, none_allowed, is_int, idv, any_val=False):
helper function to adjust the shape of an input variable
:param x: input variable
:param shape: shape needed
:param none_allowed: Is None allowed as a value
:param is_int: is it an integer
:param idv: str name of the input variable for error messages
:param any_val: if True then any value is allowed
if any_val:
if hasattr(x, '__iter__') and not isinstance(x, str):
x = np.atleast_1d(x)
assert len(x) == shape[0], (f'wrong_shape for {idv} must have shape {shape} or not be iterable '
f'got: shp {x.shape} dtype {x.dtype}')
x = np.full(shape, x)
return x
if x is None and none_allowed:
x = np.full(shape, None)
return x
if is_int:
if pd.api.types.is_integer(x):
x = np.full(shape, x, dtype=int)
x = np.atleast_1d(x)
assert x.shape == shape, (f'wrong_shape for {idv} must have shape {shape} '
f'got: shp {x.shape} dtype {x.dtype}')
not_bad = [e is None or pd.api.types.is_integer(e) for e in x]
assert all(not_bad), (f'{idv} must be an integer or None got {x[~np.array(not_bad)]} '
f'at indices {np.where(~np.array(not_bad))[0]}')
if pd.api.types.is_number(x):
x = np.full(shape, x).astype(float)
x = np.atleast_1d(x)
assert x.shape == shape, (f'wrong_shape for {idv} must be a float or have shape {shape} '
f'got: shp {x.shape} dtype {x.dtype}')
not_bad = [e is None or pd.api.types.is_number(e) for e in x]
assert all(not_bad), (f'{idv} must be an number or None got {x[~np.array(not_bad)]} '
f'at indices {np.where(~np.array(not_bad))[0]}')
return x
# NOTE(review): the signature lines visible here stop after low_mem=False, yet 'precision' is used
# throughout the body - confirm it is declared on the final line of the signature.
# NOTE(review): the function takes no 'self'; if it is meant to be called on an instance it needs
# to be a staticmethod - confirm.
def truets_from_binary_exp_piston_flow(mrt, mrt_p1, frac_p1, f_p1, f_p2,
initial_conc, target_conc, prev_slope, max_conc, min_conc,
samp_per_year, samp_years, implementation_time, past_source_data=None,
return_extras=False, low_mem=False,
create a true concentration time series using binary piston flow model for the mean residence time note that this can be really slow for large runs and it may be better to create the source data first and then pass it to the power calcs via pass_true_conc
:param mrt: mean residence time years
:param mrt_p1: mean residence time of the first pathway years
:param frac_p1: fraction of the first pathway
:param f_p1: ratio of the exponential volume to the total volume pathway 1
:param f_p2: ratio of the exponential volume to the total volume pathway 2
:param initial_conc: initial concentration
:param target_conc: target concentration
:param prev_slope: previous slope of the concentration data
:param max_conc: maximum concentration limit user specified or None here the maximum concentration is specified as the maximum concentration of the source (before temporal mixing)
:param min_conc: minimum concentration limit user specified, the lowest concentration for the source
:param samp_per_year: samples per year
:param samp_years: number of years to sample
:param implementation_time: number of years to implement reductions
:param past_source_data: past source data, if None will use the initial concentration and the previous slope to estimate the past source data, this is only set as an option to allow users to preclude re-running the source data calculations if they have already been done so. Suggest that users only pass results from get_source_initial_conc_bepm with age_step = 0.01
:param return_extras: return extra variables for debugging
:return: true timeseries, max_conc, max_conc_time, frac_p2
# mrt_p2 is not an input (None is passed in); it is taken from what check_age_inputs returns
mrt, mrt_p2 = check_age_inputs(mrt=mrt, mrt_p1=mrt_p1, mrt_p2=None, frac_p1=frac_p1,
precision=precision, f_p1=f_p1, f_p2=f_p2)
# make cdf of age
# NOTE(review): ages and age_fractions returned by make_age_dist are recomputed on the next three
# lines, only age_step is kept as returned - confirm both steps are needed
age_step, ages, age_fractions = make_age_dist(mrt, mrt_p1, mrt_p2, frac_p1, precision, f_p1, f_p2, start=np.nan)
ages = np.arange(0, np.nanmax([mrt_p1, mrt_p2]) * 5, age_step).round(precision) # approximately monthly steps
age_cdf = binary_exp_piston_flow_cdf(ages, mrt_p1, mrt_p2, frac_p1, f_p1, f_p2)
# fraction of water in each age step = increment of the cdf
age_fractions = np.diff(age_cdf, prepend=0)
# make historical source concentrations from prev_slope, initial_conc, max_conc
# three sources for the past series: user supplied, constant (prev_slope == 0), or predicted
if past_source_data is not None:
source_conc_past = past_source_data.sort_index()
# after sorting the last entry is the latest past source value; it seeds the future series below
use_max_conc = source_conc_past.iloc[-1]
if prev_slope == 0:
# constant source at initial_conc, indexed by negative age (time before present)
hist_ages = np.arange(0., np.nanmax([mrt_p1, mrt_p2]) * 5 * 2 + age_step, age_step).round(precision)
source_conc_past = pd.Series(index=hist_ages * -1, data=np.ones(len(hist_ages)) * initial_conc)
use_max_conc = initial_conc
# make a historical source timeseries from the previous slope, initial conc, age pdf and max conc
source_conc_past = predict_historical_source_conc(init_conc=initial_conc,
mrt=mrt, mrt_p1=mrt_p1, mrt_p2=mrt_p2,
frac_p1=frac_p1, f_p1=f_p1, f_p2=f_p2,
prev_slope=prev_slope, max_conc=max_conc,
min_conc=min_conc, start_age=np.nan,
source_conc_past = source_conc_past.sort_index()
use_max_conc = source_conc_past.iloc[-1]
# make a future source timeseries from target_conc and implementation_time
# low_mem uses a 1 year step for the future index, otherwise the age_step resolution is used
if low_mem:
fut_idx = np.arange(0, max(implementation_time, samp_years) + 1, 1)
fut_idx = np.arange(0, max(implementation_time, samp_years) + age_step, age_step).round(precision)
future_conc = pd.Series(
# hold target_conc from implementation_time onwards, start at use_max_conc and
# linearly interpolate the values in between
future_conc[future_conc.index >= implementation_time] = target_conc
future_conc[0] = use_max_conc
future_conc = future_conc.interpolate(method='linear')
future_conc = future_conc.sort_index()
# index 0 is dropped from the past series so that time 0 is only supplied by future_conc
total_source_conc = pd.concat([source_conc_past.drop(index=0), future_conc]).sort_index()
# sample the source concentration onto the age pdf to return the true timeseries
out_years = np.arange(0, samp_years, 1 / samp_per_year).round(precision)
out_conc = np.full_like(out_years, np.nan)
if low_mem:
# one sample time at a time: cut a window of the source series, add the exact times needed
# as NaN rows, interpolate them and take the age-fraction weighted sum
for i, t in enumerate(out_years):
use_ages = (t - ages).round(precision)
temp_out = total_source_conc.loc[use_ages.min() - 1:use_ages.max() + 2]
temp_out = pd.concat((temp_out,
pd.Series(index=use_ages[~np.in1d(use_ages, temp_out.index)], data=np.nan)))
temp_out = temp_out.sort_index()
temp_out = temp_out.interpolate(method='linear')
out_conc[i] = (temp_out.loc[(t - ages).round(precision)] * age_fractions).sum()
# all sample times at once: rows are ages, columns are sample times; every (time - age) value is
# looked up directly in total_source_conc, weighted by its age fraction and summed over the ages
use_ages = np.repeat(ages[:, np.newaxis], len(out_years), axis=1)
ags_shp = use_ages.shape
use_ages = (out_years[np.newaxis] - use_ages).round(precision).flatten()
out_conc = total_source_conc.loc[use_ages].values.reshape(ags_shp) * age_fractions[:, np.newaxis]
out_conc = out_conc.sum(axis=0)
# peak of the mixed series and the sample time at which it occurs
max_conc_time = out_years[out_conc.argmax()]
conc_max = out_conc.max()
if return_extras:
# also build the mixed concentration for the period before time 0, at the sampling frequency
past_years = np.arange(ages.max() * -1, 0., 1 / samp_per_year)
past_conc = np.full_like(past_years, np.nan)
for i, t in enumerate(past_years):
past_conc[i] = (total_source_conc.loc[(t - ages).round(precision)] * age_fractions).sum()
past_conc = pd.Series(index=past_years, data=past_conc)
return out_conc, conc_max, max_conc_time, mrt_p2, total_source_conc, age_fractions, out_years, ages, past_conc
return out_conc, conc_max, max_conc_time, mrt_p2
def truets_from_piston_flow(mrt, initial_conc, target_conc, prev_slope, max_conc, samp_per_year, samp_years,
piston flow model for the mean residence time
:param mrt: mean residence time
:param initial_conc: initial concentration
:param target_conc: target concentration
:param prev_slope: previous slope of the concentration data mg/l/yr
:param max_conc: maximum concentration limit user specified or None
:param samp_per_year: samples per year
:param samp_years: number of years to sample
:param implementation_time: number of years to implement reductions
:return: true timeseries, max_conc, max_conc_time, frac_p2
# expand from
nsamples_imp = samp_per_year * implementation_time
nsamples_total = samp_per_year * samp_years
true_conc_ts = []
# lag period
if mrt >= 1:
nsamples_lag = int(round(mrt * samp_per_year))
temp = np.interp(np.arange(nsamples_lag),
[0, nsamples_lag - 1],
[initial_conc, initial_conc + prev_slope * mrt])
if max_conc is not None:
temp[temp > max_conc] = max_conc
max_conc_time = np.argmax(temp) / samp_per_year
max_conc = temp.max()
max_conc = initial_conc
max_conc_time = 0
nsamples_lag = 0
# reduction_period
np.interp(np.arange(nsamples_imp), [0, nsamples_imp - 1], [max_conc, target_conc]))
if nsamples_total > (nsamples_lag + nsamples_imp):
true_conc_ts.append(np.ones(nsamples_total - (nsamples_lag + nsamples_imp)) * target_conc)
true_conc_ts = np.concatenate(true_conc_ts)
true_conc_ts = true_conc_ts[:nsamples_total]
frac_p2 = None # dummy value
return true_conc_ts, max_conc, max_conc_time, frac_p2
def time_test_power_calc_itter(self, testnitter=10, **kwargs):
    """
    run a test power calc iteration to check for errors

    self.power_calc is called testnitter times with UserWarnings silenced and the mean time per
    call is printed together with an estimate for self.nsims iterations.  The warning filters are
    restored to exactly what they were on entry, also when power_calc raises.

    :param testnitter: number of iterations to run
    :param kwargs: kwargs for power_calc
    :return: None
    """
    t = time.time()
    # catch_warnings restores the previous filters on exit instead of appending a new one
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=UserWarning)
        for _ in range(testnitter):
            self.power_calc(testnitter=testnitter, **kwargs)

    temp = (time.time() - t) / testnitter
    # temp is in seconds, so the estimate divided by 60 is in minutes
    print(f'time per iteration: {temp} s. based on {testnitter} iterations\n'
          f'with set number of iterations: {self.nsims} it will take {temp * self.nsims / 60} min to run the power calc')
def _power_calc_mp(self, kwargs):
multiprocessing wrapper for power_calc
:param kwargs:
out = self.power_calc(**kwargs)
if self.return_true_conc or self.return_noisy_conc_itters < 0:
out = out['power']
except Exception:
# capture kwargs to make debugging easier
out = {
'idv': kwargs['idv'],
'python_error': traceback.format_exc(),
for k in kwargs:
if k not in ['true_conc_ts', 'true_conc_base', 'true_conc_alt', 'idv']:
out[k] = kwargs[k]
out = pd.Series(out)
return out
def _check_propogate_truets(x, shape):
if x is None:
return np.full(shape, None)
len_x = len(x)
assert len_x == shape[0], f'wrong_shape for true_conc_ts_vals must have len {shape[0]} got: shp {len_x}'
return x
def _multiprocess_checks(self, outpath, id_vals, **kwargs):
if self.return_true_conc or self.return_noisy_conc_itters > 0:
warnings.warn('return_true_conc and return_noisy_conc_itters are not supported for mulitprocess_power_calcs'
'only power results will be returned')
if isinstance(outpath, str):
outpath = Path(outpath)
id_vals = np.atleast_1d(id_vals)
expect_shape = id_vals.shape
# check other inputs
for key, value in kwargs.items():
if key in ['mrt_model_vals', 'true_conc_ts_vals', 'true_conc_base_vals',
none_allowed, is_int, is_any = self._get_key_info(key)
temp = self._adjust_shape(value, expect_shape, none_allowed=none_allowed, is_int=is_int, idv=key,
kwargs[key] = temp
if self._auto_mode:
mrt_model_vals = kwargs['mrt_model_vals']
if isinstance(mrt_model_vals, str):
mrt_model_vals = np.array([mrt_model_vals] * len(id_vals))
mrt_model_vals = np.atleast_1d(mrt_model_vals)
assert mrt_model_vals.shape == id_vals.shape, f'mrt_model_vals and mrt_vals must have the same shape'
assert np.in1d(mrt_model_vals, self.implemented_mrt_models).all(), (
f'mrt_model_vals must be one of {self.implemented_mrt_models} '
f'got {np.unique(mrt_model_vals)}')
kwargs['mrt_model_vals'] = mrt_model_vals
# check min/max values
assert 'initial_conc_vals' in kwargs, 'if max_conc_lim_vals is passed initial_conc_vals must be passed'
not_na_idx = pd.notna(kwargs['max_conc_lim_vals']) & pd.notna(kwargs['initial_conc_vals'])
assert (kwargs['max_conc_lim_vals'][not_na_idx] >= kwargs['initial_conc_vals'][not_na_idx]).all(), (
'max_conc_lim must be greater than or equal to initial_conc')
if self._counterfactual:
not_na_idx = pd.notna(kwargs['max_conc_lim_vals']) & pd.notna(kwargs['target_conc_base_vals'])
assert (kwargs['max_conc_lim_vals'][not_na_idx] >= kwargs['target_conc_base_vals'][not_na_idx]).all(), (
'max_conc_lim must be greater than or equal to target_conc_base')
not_na_idx = pd.notna(kwargs['max_conc_lim_vals']) & pd.notna(kwargs['target_conc_alt_vals'])
assert (kwargs['max_conc_lim_vals'][not_na_idx] >= kwargs['target_conc_alt_vals'][not_na_idx]).all(), (
'max_conc_lim must be greater than or equal to target_conc_alt')
not_na_idx = pd.notna(kwargs['max_conc_lim_vals']) & pd.notna(kwargs['target_conc_vals'])
assert (kwargs['max_conc_lim_vals'][not_na_idx] >= kwargs['target_conc_vals'][not_na_idx]).all(), (
'max_conc_lim must be greater than or equal to target_conc')
elif self._counterfactual:
kwargs['true_conc_base_vals'] = self._check_propogate_truets(kwargs['true_conc_base_vals'], expect_shape)
kwargs['true_conc_alt_vals'] = self._check_propogate_truets(kwargs['true_conc_alt_vals'], expect_shape)
kwargs['true_conc_ts_vals'] = self._check_propogate_truets(kwargs['true_conc_ts_vals'], expect_shape)
return outpath, id_vals, kwargs
def _run_multiprocess(func, runs, logical=True, num_cores=None, logging_level=logging.INFO):
    """
    count the number of processors and then run a function over the runs in a process pool

    :param func: function with one argument kwargs.
    :param runs: a list of runs to pass to the function the function is called via func(kwargs)
    :param num_cores: int or None, if None then use all cores (+-logical) if int, set pool size to number of cores
    :param logical: bool if True then add the logical processors to the count
    :param logging_level: logging level to use one of: logging.DEBUG, logging.INFO, logging.WARNING, logging.ERROR,
                          logging.CRITICAL more info https://docs.python.org/3/howto/logging.html
                          default is logging.INFO
    :return: list with the output of func for every entry of runs, in the same order
    """
    assert isinstance(num_cores, int) or num_cores is None
    # _start_process reads the level of the multiprocessing logger to decide whether to announce
    # each worker, so the requested level is applied to that logger
    multiprocessing.get_logger().setLevel(logging_level)

    if num_cores is None:
        pool_size = psutil.cpu_count(logical=logical)
    else:
        pool_size = num_cores

    # the context manager shuts the pool down even when a run raises; every task has finished
    # by the time .get() returns, so nothing is cut short on the normal path
    with multiprocessing.Pool(processes=pool_size, initializer=_start_process) as pool:
        results = pool.map_async(func, runs)
        pool_outputs = results.get()
    return pool_outputs
def _start_process():
    """
    function to run at the start of each multiprocess sets the priority lower

    :return: None
    :raises ValueError: if sys.platform is not linux, darwin or win32
    """
    logger = multiprocessing.get_logger()
    if logger.level <= logging.INFO:
        print('Starting', multiprocessing.current_process().name)
    p = psutil.Process(os.getpid())
    # set to lowest priority; Unix uses a niceness value, Windows a priority class
    if sys.platform in ("linux", "darwin"):
        # linux and OS X
        p.nice(19)
    elif sys.platform == "win32":
        # Windows...
        p.nice(psutil.BELOW_NORMAL_PRIORITY_CLASS)
    else:
        raise ValueError(f'unexpected platform: {sys.platform}')