################################################################################
##### Python module for creating simulated time-of-flight mass spectra with
##### Gaussian and hyper-exponentially-modified Gaussian lines shapes
##### Author: Stefan Paul
import numpy as np
import pandas as pd
from scipy.stats import exponnorm, uniform, poisson
################################################################################
##### Define Functions for drawing random variates from Hyper-EMG PDFs
norm_precision = 1e-09
[docs]def _h_m_i_rvs(mu,sigma,tau_m,N_i):
"""Helper function for definition of h_m_emg_rvs """
rvs = mu - exponnorm.rvs(loc=0,scale=sigma,K=tau_m/sigma,size=N_i)
return rvs
[docs]def h_m_emg_rvs(mu, sigma, *t_args,N_samples=None):
"""Draw random samples from neg. skewed Hyper-EMG PDF
Parameters
----------
mu : float [u]
Nominal position of simulated peak (mean of underlying Gaussian).
sigma : float [u]
Nominal standard deviation (of the underlying Gaussian) of the simulated
hyper-EMG peak.
theta : float
Mixing weight of pos. & neg. skewed EMG distributions.
t_args : list of lists of float
List containing lists of the EMG tail parameters with the signature:
[[eta_m1, eta_m2, ...], [tau_m1, tau_m2, ...]]
N_samples : int
Number of random events to sample.
Returns
-------
:class:`numpy.ndarray` of floats
Array with simulated events.
"""
if not isinstance(N_samples,int):
raise TypeError("N_samples must be of type int")
li_eta_m = t_args[0]
li_tau_m = t_args[1]
t_order_m = len(li_eta_m) # order of negative tail exponentials
assert abs(sum(li_eta_m) - 1) < norm_precision, "eta_m's don't add up to 1."
if len(li_tau_m) != t_order_m: # check if all arguments match tail order
raise Exception("orders of eta_m and tau_m do not match!")
# randomly distribute ions between tails according to eta_m weights
tail_nos = np.random.choice(range(t_order_m),size=N_samples,p = li_eta_m)
rvs = np.array([])
for i in range(t_order_m):
N_i = np.count_nonzero(tail_nos == i)
tau_m = li_tau_m[i]
rvs_i = _h_m_i_rvs(mu,sigma,tau_m,N_i)
rvs = np.append(rvs,rvs_i)
return rvs
[docs]def _h_p_i_rvs(mu,sigma,tau_p,N_i):
"""Helper function for definition of h_p_emg_rvs """
rvs = exponnorm.rvs(loc=mu,scale=sigma,K=tau_p/sigma,size=N_i)
return rvs
[docs]def h_p_emg_rvs(mu, sigma, *t_args,N_samples=None):
"""Draw random samples from pos. skewed Hyper-EMG PDF
Parameters
----------
mu : float [u]
Nominal position of simulated peak (mean of underlying Gaussian).
sigma : float [u]
Nominal standard deviation (of the underlying Gaussian) of the simulated
hyper-EMG peak.
t_args : list of lists of float
List containing lists of the EMG tail parameters with the signature:
[[eta_p1, eta_p2, ...], [tau_p1, tau_p2, ...]]
N_samples : int
Number of random events to sample.
Returns
-------
:class:`numpy.ndarray` of floats
Array with simulated events.
"""
if not isinstance(N_samples,int):
raise TypeError("N_samples must be of type int")
li_eta_p = t_args[0]
li_tau_p = t_args[1]
t_order_p = len(li_eta_p) # order of negative tail exponentials
assert abs(sum(li_eta_p) - 1) < norm_precision, "eta_p's don't add up to 1."
if len(li_tau_p) != t_order_p: # check if all arguments match tail order
raise Exception("orders of eta_p and tau_p do not match!")
# randomly distribute ions between tails according to eta_p weights
tail_nos = np.random.choice(range(t_order_p),size=N_samples,p = li_eta_p)
rvs = np.array([])
for i in range(t_order_p):
N_i = np.count_nonzero(tail_nos == i)
tau_p = li_tau_p[i]
rvs_i = _h_p_i_rvs(mu,sigma,tau_p,N_i)
rvs = np.append(rvs,rvs_i)
return rvs
[docs]def h_emg_rvs(mu, sigma , theta, *t_args, N_samples=None):
"""Draw random samples from Hyper-EMG PDF
Parameters
----------
mu : float [u]
Nominal position of simulated peak (mean of underlying Gaussian).
sigma : float [u]
Nominal standard deviation (of the underlying Gaussian) of the simulated
hyper-EMG peak.
theta : float
Mixing weight of pos. & neg. skewed EMG distributions.
t_args : list of lists of float
List containing lists of the EMG tail parameters with the signature:
[[eta_m1, eta_m2, ...], [tau_m1, tau_m2, ...],
[eta_p1, eta_p2, ...], [tau_p1, tau_p2, ...]]
N_samples : int
Number of random events to sample.
Returns
-------
:class:`numpy.ndarray` of floats
Array with simulated events.
"""
if not isinstance(N_samples,int):
raise TypeError("N_samples must be of type int")
li_eta_m = t_args[0]
li_tau_m = t_args[1]
li_eta_p = t_args[2]
li_tau_p = t_args[3]
if theta == 1:
rvs = h_m_emg_rvs(mu, sigma, li_eta_m, li_tau_m, N_samples=N_samples)
elif theta == 0:
rvs = h_p_emg_rvs(mu, sigma, li_eta_p, li_tau_p, N_samples=N_samples)
else:
neg = np.random.choice([1,0],size=N_samples,p = [theta,1-theta]) # randomly distribute ions between h_m_emg and h_p_emg according to weight theta
N_m = int(np.sum(neg)) #int(np.round(theta*N_samples)) # np.rint(theta*N_samples,dtype=int)
N_p = N_samples - N_m # np.rint((1-theta)*N_samples,dtype=int)
rvs_m = h_m_emg_rvs(mu, sigma, li_eta_m, li_tau_m, N_samples=N_m)
rvs_p = h_p_emg_rvs(mu, sigma, li_eta_p, li_tau_p, N_samples=N_p)
rvs = np.append(rvs_m,rvs_p)
return rvs
################################################################################
##### Define functions for creating simulated spectra
[docs]def simulate_events(shape_pars, mus, amps, bkg_c, N_events, x_min,
x_max, out='hist', N_bins=None, bin_cens=None):
"""Create simulated ion events drawn from a user-defined probability
distribution function (PDF)
Events can either be output as a list of single events (mass stamps) or as a
histogram. In histogram output mode, uniform binning is easily realized by
specifying the `N_bins` argument. More control over the binning can be
achieved by parsing the desired bin centers to the `bin_cens` argument (e.g.
for non-uniform binning).
Parameters
----------
shape_pars : dict
Peak-shape parameters to use for sampling. The dictionary must follow
the structure of the :attr:`~spectrum.shape_cal_pars` attribute of the
:class:`~spectrum.spectrum` class.
mus : float or list of float [u]
Nominal peak positions of peaks in simulated spectrum.
amps : float or list of float [counts per u]
Nominal amplitudes of peaks in simulated spectrum.
bkg_c : float [counts per bin], optional, default: 0.0
Nominal amplitude of uniform background in simulated spectrum.
x_min : float [u]
Beginning of sampling mass range.
x_max : float [u]
End of sampling mass range.
N_events : int, optional, default: 1000
Total number of events to simulate (signal and background events).
out : str, optional
Output format of sampled data. Options:
- ``'hist'`` for binned mass spectrum (default). The centres of the mass
bins must be specified with the `bin_cens` argument.
- ``'list'`` for unbinned list of single ion and background events.
N_bins : int, optional
Number of uniform bins to use in ``'hist'`` output mode. The **outer**
edges of the first and last bin are fixed to the start and end of the
sampling range respectively (i.e. `x_min` and `x_max`). In between, bins
are distributed with a fixed spacing of (`x_max`-`x_min`)/`N_bins`.
bin_cens : :class:`numpy.ndarray` [u]
Centres of mass bins to use in ``'hist'`` output mode. This argument
allows the realization of non-uniform binning. Bin edges are centred
between neighboring bins. Note: Bins outside the sampling range defined
with `x_min` and `x_max` will be empty.
Returns
-------
:class:`numpy.array` or :class:`pandas.Dataframe`
If out='hist' a dataframe with a histogram of the format
[bin centre [u], counts in bin] is returned. If out='list' an unbinned
array with mass values [u] of single ion or background events is returned.
Notes
-----
Random events are created via custom Hyper-EMG extensions of Scipy's
:meth:`scipy.stats.exponnorm.rvs` method.
Currently, all simulated peaks have identical width and shape (no re-scaling
of mass-dependent shape parameters to a peak's mass centroid).
Routine requires tail arguments in shape_cal_pars dict to be ordered
(eta_m1,eta_m2,...) etc..
**Mind the different units for peak amplitudes `amps` (counts per u) and the
background level `bkg_c` (counts per bin)**. When spectrum data is
simulated counts are randomly distributed between the different peaks and
the background with probability weights `amps` * <bin width in> and
`bkg_c` * <number of bins>, respectively. As a consequence, simply changing
`N_events` (while keeping all other arguments constant), will cause `amps`
and `bkg_c` to deviate from their nominal units.
"""
# TODO: Implement rescaling of peak-shape parameters with mass
mus = np.atleast_1d(mus)
amps = np.atleast_1d(amps)
assert len(mus) == len(amps), "Lengths of `mus` and `amps` arrays must match."
if (mus < x_min).any() or (mus > x_max).any():
import warnings
msg = str("At least one peak centroid in `mus` is outside the sampling range.")
warnings.warn(msg, UserWarning)
sample_range = x_max - x_min
if N_bins is not None and bin_cens is not None:
msg = "Either specify the `N_bins` OR the `bin_cens` argument."
raise Exception(msg)
elif bin_cens is not None: # user-defined bins
N_bins = len(bin_cens)
bin_edges = np.empty((N_bins+1,))
spacings = bin_cens[1:] - bin_cens[:-1]
inner_edges = bin_cens[:-1] + spacings/2
bin_edges[1:-1] = inner_edges # set inner bin edges
# Get outer edges
width_start = bin_cens[1] - bin_cens[0]
bin_edges[0] = bin_cens[0] - width_start/2 # set first bin edge
width_end = bin_cens[-1] - bin_cens[-2]
bin_edges[-1] = bin_cens[-1] + width_end/2 # set last bin edge
bin_width = (bin_edges[-1] - bin_edges[0])/N_bins # AVERAGE bin width
elif N_bins is not None: # automatic uniform binning
bin_edges = np.linspace(x_min, x_max, num=N_bins+1, endpoint=True)
bin_width = sample_range/N_bins
bin_cens = bin_edges[:-1] + bin_width/2
else:
raise Exception("`N_bins` or `bin_cens` argument must be specified!")
# Prepare shape parameters
sigma = shape_pars['sigma']
theta = shape_pars['theta']
li_eta_m = []
li_tau_m = []
li_eta_p = []
li_tau_p = []
for key, val in shape_pars.items():
if key.startswith('eta_m'):
li_eta_m.append(val)
if key.startswith('tau_m'):
li_tau_m.append(val)
if key.startswith('eta_p'):
li_eta_p.append(val)
if key.startswith('tau_p'):
li_tau_p.append(val)
if len(li_eta_m) == 0 and len(li_tau_m) != 0:
li_eta_m = [1]
if len(li_eta_p) == 0 and len(li_tau_p) != 0:
li_eta_p = [1]
# Distribute counts over different peaks and background
# randomly distribute ions using amps and c_bkg as prob. weights
N_peaks = len(amps)
counts = np.append(amps/bin_width,bkg_c*N_bins) # cts in each peak & background
weights = counts/np.sum(counts) # normalized probability weights
peak_dist = np.random.choice(range(N_peaks+1),size=N_events,p = weights)
N_bkg = np.count_nonzero(peak_dist == N_peaks) # calc. number of background counts
events = np.array([])
# Loop over peaks and create random samples from each peak
for i in range(N_peaks):
N_i = np.count_nonzero(peak_dist == i) # get no. of ions in peak
mu = mus[i]
events_i = h_emg_rvs(mu,sigma,theta,li_eta_m, li_tau_m, li_eta_p,li_tau_p, N_samples=N_i)
events = np.append(events,events_i)
# Create background events
bkg = uniform.rvs(size=N_bkg,loc=x_min,scale=sample_range)
events = np.append(events,bkg)
if out == 'list': # return unbinned list of events
return events
elif out == 'hist': # return histogram
y = np.histogram(events,bins=bin_edges)[0]
df = pd.DataFrame(data=y,index=bin_cens,columns = ['Counts'])
df.index.rename('Mass [u]',inplace=True)
return df
[docs]def simulate_spectrum(spec, x_cen=None, x_range=None, mus=None, amps=None,
bkg_c=None, N_events=None, copy_spec=False):
"""Create a simulated spectrum using the attributes of a reference spectrum
The peak shape of the sampling probability distribution function (PDF)
follows the shape calibration of the reference spectrum (`spec`). By
default, all other parameters of the sampling PDF are identical to the
best-fit parameters of the reference spectrum. If desired, the positions,
amplitudes and number of peaks in the sampling PDF as well as the background
level can be changed with the `mus`, `amps` and `bkg_c` arguments.
Parameters
----------
spec : :class:`~emgfit.spectrum.spectrum`
Reference spectrum object whose best-fit parameters will be used to
sample from.
mus : float or list of float [u], optional
Nominal peak centres of peaks in simulated spectrum. Defaults to the
mus of the reference spectrum fit.
amps : float or list of float [counts per u], optional
Nominal amplitudes of peaks in simulated spectrum. Defaults to the
amplitudes of the reference spectrum fit.
bkg_c : float [counts per bin], optional
Nominal amplitude of uniform background in simulated spectrum. Defaults
to the c_bkg obtained in the fit of the first peak in the reference
spectrum.
x_cen : float [u], optional
Mass center of simulated spectrum. Defaults to `x_cen` of `spec`.
x_range : float [u], optional
Covered mass range of simulated spectrum. Defaults to `x_range` of
`spectrum`.
N_events : int, optional
Number of ion events to simulate (including background events). Defaults
to total number of events in `spec`.
copy_spec : bool, optional, default: False
If ``False`` (default), this function returns a fresh
:class:`~spectrum.spectrum` object created from the simulated mass data.
If ``True``, this function returns an exact copy of `spec` with only the
:attr`data` attribute replaced by the new simulated mass data.
Returns
-------
:class:`spectrum.spectrum`
If `copy_spec = False` (default) a fresh spectrum object holding the
simulated mass data is returned. If `copy_spec = True`, a copy of the
reference spectrum `spec` is returned with only the :attr:`data`
attribute replaced by the new simulated mass data.
Notes
-----
Random events are created via custom Hyper-EMG extensions of Scipy's
:meth:`scipy.stats.exponnorm.rvs` method.
Currently, all simulated peaks have identical width and shape (no re-scaling
of mass-dependent shape parameters to a peak's mass centroid).
The returned spectrum follows the binning of the reference spectrum.
Mind the different units for peak amplitudes `amps` (counts per u) and the
background level `bkg_c` (counts per bin). When spectrum data is
simulated counts are distributed between the different peaks and the
background with probability weights `amps` * <bin width in> and
`bkg_c` * <number of bins>, respectively. As a consequence, simply changing
`N_events` (while keeping all other arguments constant), will render `amps`
and `bkg_c` away from their nominal units.
"""
if spec.fit_results is [] or None:
raise Exception("No fit results found in reference spectrum `spec`.")
if x_cen is None and x_range is None:
x_min = spec.data.index.values[0]
x_max = spec.data.index.values[-1]
indeces = range(len(spec.peaks)) # get peak indeces in sampling range
else:
x_min = x_cen - x_range
x_max = x_cen + x_range
# Get peak indeces in sampling range:
peaks = spec.peaks
indeces = [i for i in range(len(peaks)) if x_min <= peaks[i].x_pos <= x_max]
if mus is None:
if len(indeces) == 0:
import warnings
msg = str("No peaks in sampling range.")
warnings.warn(msg, UserWarning)
mus = []
for i in indeces:
result = spec.fit_results[i]
pref = 'p{0}_'.format(i)
mus.append(result.best_values[pref+'mu'])
if amps is None:
amps = []
for i in indeces:
result = spec.fit_results[i]
pref = 'p{0}_'.format(i)
amps.append(result.best_values[pref+'amp'])
if bkg_c is None:
assert len(indeces) != 0, "Zero background and no peaks in sampling range."
result = spec.fit_results[indeces[0]]
bkg_c = result.best_values['bkg_c']
if N_events is None:
N_events = int(np.sum(spec.data['Counts'])) # total number of counts in spectrum
# Create histogram with Monte Carlo events
x = spec.data[x_min:x_max].index.values
df = simulate_events(spec.shape_cal_pars,mus,amps,bkg_c,N_events,x_min,
x_max,out='hist',N_bins=None,bin_cens=x)
# Copy original spectrum and overwrite data
# This copies all results such as peak assignments, PS calibration,
# fit_results etc.
if copy_spec:
from copy import deepcopy
new_spec = deepcopy(spec)
new_spec.data = df
else: # Define a fresh spectrum with sampled data
from emgfit import spectrum
new_spec = spectrum.spectrum(df=df)
return new_spec