Source code for emgfit.sample

################################################################################
##### Python module for creating simulated time-of-flight mass spectra with
##### Gaussian and hyper-exponentially-modified Gaussian lines shapes
##### Author: Stefan Paul

import numpy as np
import pandas as pd
from scipy.stats import exponnorm, uniform, poisson

################################################################################
##### Define Functions for drawing random variates from Hyper-EMG PDFs
norm_precision = 1e-09

[docs]def _h_m_i_rvs(mu,sigma,tau_m,N_i): """Helper function for definition of h_m_emg_rvs """ rvs = mu - exponnorm.rvs(loc=0,scale=sigma,K=tau_m/sigma,size=N_i) return rvs
[docs]def h_m_emg_rvs(mu, sigma, *t_args,N_samples=None): """Draw random samples from neg. skewed Hyper-EMG PDF Parameters ---------- mu : float [u] Nominal position of simulated peak (mean of underlying Gaussian). sigma : float [u] Nominal standard deviation (of the underlying Gaussian) of the simulated hyper-EMG peak. theta : float Mixing weight of pos. & neg. skewed EMG distributions. t_args : list of lists of float List containing lists of the EMG tail parameters with the signature: [[eta_m1, eta_m2, ...], [tau_m1, tau_m2, ...]] N_samples : int Number of random events to sample. Returns ------- :class:`numpy.ndarray` of floats Array with simulated events. """ if not isinstance(N_samples,int): raise TypeError("N_samples must be of type int") li_eta_m = t_args[0] li_tau_m = t_args[1] t_order_m = len(li_eta_m) # order of negative tail exponentials assert abs(sum(li_eta_m) - 1) < norm_precision, "eta_m's don't add up to 1." if len(li_tau_m) != t_order_m: # check if all arguments match tail order raise Exception("orders of eta_m and tau_m do not match!") # randomly distribute ions between tails according to eta_m weights tail_nos = np.random.choice(range(t_order_m),size=N_samples,p = li_eta_m) rvs = np.array([]) for i in range(t_order_m): N_i = np.count_nonzero(tail_nos == i) tau_m = li_tau_m[i] rvs_i = _h_m_i_rvs(mu,sigma,tau_m,N_i) rvs = np.append(rvs,rvs_i) return rvs
[docs]def _h_p_i_rvs(mu,sigma,tau_p,N_i): """Helper function for definition of h_p_emg_rvs """ rvs = exponnorm.rvs(loc=mu,scale=sigma,K=tau_p/sigma,size=N_i) return rvs
[docs]def h_p_emg_rvs(mu, sigma, *t_args,N_samples=None): """Draw random samples from pos. skewed Hyper-EMG PDF Parameters ---------- mu : float [u] Nominal position of simulated peak (mean of underlying Gaussian). sigma : float [u] Nominal standard deviation (of the underlying Gaussian) of the simulated hyper-EMG peak. t_args : list of lists of float List containing lists of the EMG tail parameters with the signature: [[eta_p1, eta_p2, ...], [tau_p1, tau_p2, ...]] N_samples : int Number of random events to sample. Returns ------- :class:`numpy.ndarray` of floats Array with simulated events. """ if not isinstance(N_samples,int): raise TypeError("N_samples must be of type int") li_eta_p = t_args[0] li_tau_p = t_args[1] t_order_p = len(li_eta_p) # order of negative tail exponentials assert abs(sum(li_eta_p) - 1) < norm_precision, "eta_p's don't add up to 1." if len(li_tau_p) != t_order_p: # check if all arguments match tail order raise Exception("orders of eta_p and tau_p do not match!") # randomly distribute ions between tails according to eta_p weights tail_nos = np.random.choice(range(t_order_p),size=N_samples,p = li_eta_p) rvs = np.array([]) for i in range(t_order_p): N_i = np.count_nonzero(tail_nos == i) tau_p = li_tau_p[i] rvs_i = _h_p_i_rvs(mu,sigma,tau_p,N_i) rvs = np.append(rvs,rvs_i) return rvs
[docs]def h_emg_rvs(mu, sigma , theta, *t_args, N_samples=None): """Draw random samples from Hyper-EMG PDF Parameters ---------- mu : float [u] Nominal position of simulated peak (mean of underlying Gaussian). sigma : float [u] Nominal standard deviation (of the underlying Gaussian) of the simulated hyper-EMG peak. theta : float Mixing weight of pos. & neg. skewed EMG distributions. t_args : list of lists of float List containing lists of the EMG tail parameters with the signature: [[eta_m1, eta_m2, ...], [tau_m1, tau_m2, ...], [eta_p1, eta_p2, ...], [tau_p1, tau_p2, ...]] N_samples : int Number of random events to sample. Returns ------- :class:`numpy.ndarray` of floats Array with simulated events. """ if not isinstance(N_samples,int): raise TypeError("N_samples must be of type int") li_eta_m = t_args[0] li_tau_m = t_args[1] li_eta_p = t_args[2] li_tau_p = t_args[3] if theta == 1: rvs = h_m_emg_rvs(mu, sigma, li_eta_m, li_tau_m, N_samples=N_samples) elif theta == 0: rvs = h_p_emg_rvs(mu, sigma, li_eta_p, li_tau_p, N_samples=N_samples) else: neg = np.random.choice([1,0],size=N_samples,p = [theta,1-theta]) # randomly distribute ions between h_m_emg and h_p_emg according to weight theta N_m = int(np.sum(neg)) #int(np.round(theta*N_samples)) # np.rint(theta*N_samples,dtype=int) N_p = N_samples - N_m # np.rint((1-theta)*N_samples,dtype=int) rvs_m = h_m_emg_rvs(mu, sigma, li_eta_m, li_tau_m, N_samples=N_m) rvs_p = h_p_emg_rvs(mu, sigma, li_eta_p, li_tau_p, N_samples=N_p) rvs = np.append(rvs_m,rvs_p) return rvs
################################################################################ ##### Define functions for creating simulated spectra
[docs]def simulate_events(shape_pars, mus, amps, bkg_c, N_events, x_min, x_max, out='hist', N_bins=None, bin_cens=None): """Create simulated ion events drawn from a user-defined probability distribution function (PDF) Events can either be output as a list of single events (mass stamps) or as a histogram. In histogram output mode, uniform binning is easily realized by specifying the `N_bins` argument. More control over the binning can be achieved by parsing the desired bin centers to the `bin_cens` argument (e.g. for non-uniform binning). Parameters ---------- shape_pars : dict Peak-shape parameters to use for sampling. The dictionary must follow the structure of the :attr:`~spectrum.shape_cal_pars` attribute of the :class:`~spectrum.spectrum` class. mus : float or list of float [u] Nominal peak positions of peaks in simulated spectrum. amps : float or list of float [counts per u] Nominal amplitudes of peaks in simulated spectrum. bkg_c : float [counts per bin], optional, default: 0.0 Nominal amplitude of uniform background in simulated spectrum. x_min : float [u] Beginning of sampling mass range. x_max : float [u] End of sampling mass range. N_events : int, optional, default: 1000 Total number of events to simulate (signal and background events). out : str, optional Output format of sampled data. Options: - ``'hist'`` for binned mass spectrum (default). The centres of the mass bins must be specified with the `bin_cens` argument. - ``'list'`` for unbinned list of single ion and background events. N_bins : int, optional Number of uniform bins to use in ``'hist'`` output mode. The **outer** edges of the first and last bin are fixed to the start and end of the sampling range respectively (i.e. `x_min` and `x_max`). In between, bins are distributed with a fixed spacing of (`x_max`-`x_min`)/`N_bins`. bin_cens : :class:`numpy.ndarray` [u] Centres of mass bins to use in ``'hist'`` output mode. This argument allows the realization of non-uniform binning. Bin edges are centred between neighboring bins. Note: Bins outside the sampling range defined with `x_min` and `x_max` will be empty. Returns ------- :class:`numpy.array` or :class:`pandas.Dataframe` If out='hist' a dataframe with a histogram of the format [bin centre [u], counts in bin] is returned. If out='list' an unbinned array with mass values [u] of single ion or background events is returned. Notes ----- Random events are created via custom Hyper-EMG extensions of Scipy's :meth:`scipy.stats.exponnorm.rvs` method. Currently, all simulated peaks have identical width and shape (no re-scaling of mass-dependent shape parameters to a peak's mass centroid). Routine requires tail arguments in shape_cal_pars dict to be ordered (eta_m1,eta_m2,...) etc.. **Mind the different units for peak amplitudes `amps` (counts per u) and the background level `bkg_c` (counts per bin)**. When spectrum data is simulated counts are randomly distributed between the different peaks and the background with probability weights `amps` * <bin width in> and `bkg_c` * <number of bins>, respectively. As a consequence, simply changing `N_events` (while keeping all other arguments constant), will cause `amps` and `bkg_c` to deviate from their nominal units. """ # TODO: Implement rescaling of peak-shape parameters with mass mus = np.atleast_1d(mus) amps = np.atleast_1d(amps) assert len(mus) == len(amps), "Lengths of `mus` and `amps` arrays must match." if (mus < x_min).any() or (mus > x_max).any(): import warnings msg = str("At least one peak centroid in `mus` is outside the sampling range.") warnings.warn(msg, UserWarning) sample_range = x_max - x_min if N_bins is not None and bin_cens is not None: msg = "Either specify the `N_bins` OR the `bin_cens` argument." raise Exception(msg) elif bin_cens is not None: # user-defined bins N_bins = len(bin_cens) bin_edges = np.empty((N_bins+1,)) spacings = bin_cens[1:] - bin_cens[:-1] inner_edges = bin_cens[:-1] + spacings/2 bin_edges[1:-1] = inner_edges # set inner bin edges # Get outer edges width_start = bin_cens[1] - bin_cens[0] bin_edges[0] = bin_cens[0] - width_start/2 # set first bin edge width_end = bin_cens[-1] - bin_cens[-2] bin_edges[-1] = bin_cens[-1] + width_end/2 # set last bin edge bin_width = (bin_edges[-1] - bin_edges[0])/N_bins # AVERAGE bin width elif N_bins is not None: # automatic uniform binning bin_edges = np.linspace(x_min, x_max, num=N_bins+1, endpoint=True) bin_width = sample_range/N_bins bin_cens = bin_edges[:-1] + bin_width/2 else: raise Exception("`N_bins` or `bin_cens` argument must be specified!") # Prepare shape parameters sigma = shape_pars['sigma'] theta = shape_pars['theta'] li_eta_m = [] li_tau_m = [] li_eta_p = [] li_tau_p = [] for key, val in shape_pars.items(): if key.startswith('eta_m'): li_eta_m.append(val) if key.startswith('tau_m'): li_tau_m.append(val) if key.startswith('eta_p'): li_eta_p.append(val) if key.startswith('tau_p'): li_tau_p.append(val) if len(li_eta_m) == 0 and len(li_tau_m) != 0: li_eta_m = [1] if len(li_eta_p) == 0 and len(li_tau_p) != 0: li_eta_p = [1] # Distribute counts over different peaks and background # randomly distribute ions using amps and c_bkg as prob. weights N_peaks = len(amps) counts = np.append(amps/bin_width,bkg_c*N_bins) # cts in each peak & background weights = counts/np.sum(counts) # normalized probability weights peak_dist = np.random.choice(range(N_peaks+1),size=N_events,p = weights) N_bkg = np.count_nonzero(peak_dist == N_peaks) # calc. number of background counts events = np.array([]) # Loop over peaks and create random samples from each peak for i in range(N_peaks): N_i = np.count_nonzero(peak_dist == i) # get no. of ions in peak mu = mus[i] events_i = h_emg_rvs(mu,sigma,theta,li_eta_m, li_tau_m, li_eta_p,li_tau_p, N_samples=N_i) events = np.append(events,events_i) # Create background events bkg = uniform.rvs(size=N_bkg,loc=x_min,scale=sample_range) events = np.append(events,bkg) if out == 'list': # return unbinned list of events return events elif out == 'hist': # return histogram y = np.histogram(events,bins=bin_edges)[0] df = pd.DataFrame(data=y,index=bin_cens,columns = ['Counts']) df.index.rename('Mass [u]',inplace=True) return df
[docs]def simulate_spectrum(spec, x_cen=None, x_range=None, mus=None, amps=None, bkg_c=None, N_events=None, copy_spec=False): """Create a simulated spectrum using the attributes of a reference spectrum The peak shape of the sampling probability distribution function (PDF) follows the shape calibration of the reference spectrum (`spec`). By default, all other parameters of the sampling PDF are identical to the best-fit parameters of the reference spectrum. If desired, the positions, amplitudes and number of peaks in the sampling PDF as well as the background level can be changed with the `mus`, `amps` and `bkg_c` arguments. Parameters ---------- spec : :class:`~emgfit.spectrum.spectrum` Reference spectrum object whose best-fit parameters will be used to sample from. mus : float or list of float [u], optional Nominal peak centres of peaks in simulated spectrum. Defaults to the mus of the reference spectrum fit. amps : float or list of float [counts per u], optional Nominal amplitudes of peaks in simulated spectrum. Defaults to the amplitudes of the reference spectrum fit. bkg_c : float [counts per bin], optional Nominal amplitude of uniform background in simulated spectrum. Defaults to the c_bkg obtained in the fit of the first peak in the reference spectrum. x_cen : float [u], optional Mass center of simulated spectrum. Defaults to `x_cen` of `spec`. x_range : float [u], optional Covered mass range of simulated spectrum. Defaults to `x_range` of `spectrum`. N_events : int, optional Number of ion events to simulate (including background events). Defaults to total number of events in `spec`. copy_spec : bool, optional, default: False If ``False`` (default), this function returns a fresh :class:`~spectrum.spectrum` object created from the simulated mass data. If ``True``, this function returns an exact copy of `spec` with only the :attr`data` attribute replaced by the new simulated mass data. Returns ------- :class:`spectrum.spectrum` If `copy_spec = False` (default) a fresh spectrum object holding the simulated mass data is returned. If `copy_spec = True`, a copy of the reference spectrum `spec` is returned with only the :attr:`data` attribute replaced by the new simulated mass data. Notes ----- Random events are created via custom Hyper-EMG extensions of Scipy's :meth:`scipy.stats.exponnorm.rvs` method. Currently, all simulated peaks have identical width and shape (no re-scaling of mass-dependent shape parameters to a peak's mass centroid). The returned spectrum follows the binning of the reference spectrum. Mind the different units for peak amplitudes `amps` (counts per u) and the background level `bkg_c` (counts per bin). When spectrum data is simulated counts are distributed between the different peaks and the background with probability weights `amps` * <bin width in> and `bkg_c` * <number of bins>, respectively. As a consequence, simply changing `N_events` (while keeping all other arguments constant), will render `amps` and `bkg_c` away from their nominal units. """ if spec.fit_results is [] or None: raise Exception("No fit results found in reference spectrum `spec`.") if x_cen is None and x_range is None: x_min = spec.data.index.values[0] x_max = spec.data.index.values[-1] indeces = range(len(spec.peaks)) # get peak indeces in sampling range else: x_min = x_cen - x_range x_max = x_cen + x_range # Get peak indeces in sampling range: peaks = spec.peaks indeces = [i for i in range(len(peaks)) if x_min <= peaks[i].x_pos <= x_max] if mus is None: if len(indeces) == 0: import warnings msg = str("No peaks in sampling range.") warnings.warn(msg, UserWarning) mus = [] for i in indeces: result = spec.fit_results[i] pref = 'p{0}_'.format(i) mus.append(result.best_values[pref+'mu']) if amps is None: amps = [] for i in indeces: result = spec.fit_results[i] pref = 'p{0}_'.format(i) amps.append(result.best_values[pref+'amp']) if bkg_c is None: assert len(indeces) != 0, "Zero background and no peaks in sampling range." result = spec.fit_results[indeces[0]] bkg_c = result.best_values['bkg_c'] if N_events is None: N_events = int(np.sum(spec.data['Counts'])) # total number of counts in spectrum # Create histogram with Monte Carlo events x = spec.data[x_min:x_max].index.values df = simulate_events(spec.shape_cal_pars,mus,amps,bkg_c,N_events,x_min, x_max,out='hist',N_bins=None,bin_cens=x) # Copy original spectrum and overwrite data # This copies all results such as peak assignments, PS calibration, # fit_results etc. if copy_spec: from copy import deepcopy new_spec = deepcopy(spec) new_spec.data = df else: # Define a fresh spectrum with sampled data from emgfit import spectrum new_spec = spectrum.spectrum(df=df) return new_spec