# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
#
# PMDA
# Copyright (c) 2017 The MDAnalysis Development Team and contributors
# (see the file AUTHORS for the full list of names)
#
# Released under the GNU Public Licence, v2 or any higher version
"""Utility functions --- :mod:`pmda.util`
=========================================
This module contains helper functions and classes that can be used throughout
:mod:`pmda`.
"""
from __future__ import absolute_import, division
import time
import functools
import numpy as np
[docs]class timeit(object):
"""measure time spend in context
:class:`timeit` is a context manager (to be used with the :keyword:`with`
statement) that records the execution time for the enclosed context block
in :attr:`elapsed`.
Attributes
----------
elapsed : float
Time in seconds that elapsed between entering
and exiting the context.
Example
-------
Use as a context manager::
with timeit() as total:
# code to be timed
print(total.elapsed, "seconds")
See Also
--------
:func:`time.time`
"""
def __enter__(self):
self._start_time = time.time()
return self
def __exit__(self, exc_type, exc_val, exc_tb):
end_time = time.time()
self.elapsed = end_time - self._start_time
# always propagate exceptions forward
return False
[docs]def make_balanced_slices(n_frames, n_blocks, start=None, stop=None, step=None):
"""Divide `n_frames` into `n_blocks` balanced blocks.
The blocks are generated in such a way that they contain equal numbers of
frames when possible, but there are also no empty blocks (which can happen
with a naive distribution of ``ceil(n_frames/n_blocks)`` per block and a
remainder block).
If the trajectory is sliced in any way (``u.trajectory[start:stop:step]``)
then the appropriate values for `start`, `stop`, and `step` must be passed
to this function. Defaults can be set to ``None``. Only a subset of legal
values for slices is supported: ``0 ≤ start ≤ stop`` and ``step ≥ 1``.
Arguments
---------
n_frames : int
number of frames in the trajectory (≥0). This must be the
number of frames *after* the trajectory has been sliced,
i.e. ``len(u.trajectory[start:stop:step])``. If any of
`start`, `stop, and `step` are not the defaults (left empty or
set to ``None``) they must be provided as parameters.
n_blocks : int
number of blocks (>0 and <n_frames)
start : int or None
The first index of the trajectory (default is ``None``, which
is interpreted as "first frame", i.e., 0).
stop : int or None
The index of the last frame + 1 (default is ``None``, which is
interpreted as "up to and including the last frame".
step : int or None
Step size by which the trajectory is sliced; the default is
``None`` which corresponds to ``step=1``.
Returns
-------
slices : list of slice
List of length ``n_blocks`` with one :class:`slice`
for each block.
If `n_frames` = 0 then an empty list ``[]`` is returned.
Example
-------
For a trajectory with 5 frames and 4 blocks we get block sizes ``[2, 1, 1,
1]`` (instead of ``[2, 2, 1, 0]`` with the naive algorithm).
The slices will be ``[slice(0, 2, None), slice(2, 3, None),
slice(3, 4, None), slice(4, 5, None)]``.
The indices can be used to slice a trajectory into blocks::
n_blocks = 5
n_frames = len(u.trajectory[start:stop:step])
slices = make_balanced_slices(n_frames, n_blocks,
start=start, stop=stop, step=step)
for i_block, block in enumerate(slices):
for ts in u.trajectory[block]:
# do stuff for block number i_block
Notes
-----
Explanation of the algorithm: For `M` frames in the trajectory and
`N` blocks (or processes), where `i` with 0 ≤ `i` ≤ `N` - 1 is the
block number and `m[i]` is the number of frames for block `i` we
get a *balanced distribution* (one that does not contain blocks of
size 0) with the algorithm ::
m[i] = M // N # initial frames for block i
r = M % N # remaining frames 0 ≤ r < N
for i in range(r):
m[i] += 1 # distribute the remaining frames
# over the first r blocks
For a `step` > 1, we use ``m[i] *= step``. The last slice will
never go beyond the original `stop` if a value was provided.
.. versionadded:: 0.2.0
.. versionchanged:: 0.3.0
raise ValueError when n_blocks is larger than n_frames
"""
start = int(start) if start is not None else 0
stop = int(stop) if stop is not None else None
step = int(step) if step is not None else 1
if n_frames < 0:
raise ValueError("n_frames must be >= 0")
elif n_blocks < 1:
raise ValueError("n_blocks must be > 0")
elif n_frames != 0 and n_blocks > n_frames:
raise ValueError(f"n_blocks must be smaller than n_frames: "
f"{n_frames}")
elif start < 0:
raise ValueError("start must be >= 0 or None")
elif stop is not None and stop < start:
raise ValueError("stop must be >= start and >= 0 or None")
elif step < 1:
raise ValueError("step must be > 0 or None")
if n_frames == 0:
# not very useful but allows calling code to work more gracefully
return []
bsizes = np.ones(n_blocks, dtype=np.int64) * n_frames // n_blocks
bsizes += (np.arange(n_blocks, dtype=np.int64) < n_frames % n_blocks)
# This can give a last index that is larger than the real last index;
# this is not a problem for slicing but it's not pretty.
# Example: original [0:20:3] -> n_frames=7, start=0, step=3:
# last frame 21 instead of 20
bsizes *= step
idx = np.cumsum(np.concatenate(([start], bsizes)))
slices = [slice(bstart, bstop, step)
for bstart, bstop in zip(idx[:-1], idx[1:])]
# fix very last stop index: make sure it's within trajectory range or None
# (no really critical because the slices will work regardless, but neater)
last = slices[-1]
last_stop = min(last.stop, stop) if stop is not None else stop
slices[-1] = slice(last.start, last_stop, last.step)
return slices
[docs]def second_order_moments(S1, S2):
r"""Calculates the combined second order moment.
Given the partial centered moments of two partitions (S1 and S2) of a data
set S, calculates the second order moments of S = S1 ∪ S2.
Parameters
----------
S1 : array
Contains `(T1, mu1, M1)` where `T1` is an integer (number of elements
in the partition, e.g., the number of time frames), `mu1` is an
`n x m` array of the means for `n` atoms (and for example, `m=3` for
the center of geometry), `M1` is also an `n x m` array of the sum of
squares.
S2 : array
Contains `(T2, mu2, M2)` where `T2` is an integer (number of elements
in the partition, e.g., the number of time frames), `mu2` is an
`n x m` array of the means for `n` atoms (and for example, `m=3` for
the center of geometry), `M2` is also an `n x m` array of the sum of
squares.
Returns
-------
S : (T, mu, M)
The returned tuple contains the total number of elements in the
partition `T`, the mean `mu` and the "second moment" `M` (sum of
squares) for the combined data.
Notes
-----
For a given set of data, the sum of squares, also known as the second
order moment about the mean, is defined as the sum of the squares of the
differences between each data point and the mean (sum of the individual
deviations from the mean) of the data set,
.. math::
M_{2, S_{i}} = \sum_{t=t_{0}}^{T}(x_{t} - \mu_{i})^2,
where :math:`\mu_{i}` is the time average of :math:`x_{t}`. If the
average of the squares of the individual deviations is taken (instead of
the sum), this yields the variance:
.. math::
\sigma_{i}^{2} = \frac{1}{T}\sum_{t=t_{0}}^{T}(x_{t} - \mu_{i})^2
In order to combine the mean and second order moments of two separate
partitions, [CGL1979]_ derived the following formulae:
.. math::
\mu = \frac{T_{1}\mu_{1} + T_{2}\mu_{2}}{T}
and
.. math::
M_{2, S} = M_{2, S_{1}} + M_{2, S_{2}} + \
\frac{T_{1}T_{2}}{T}(\mu_{2} - \mu_{1})^{2},
where :math:`T`, :math:`T_{1}`, and :math:`T_{2}` are the respective
cardinalities of :math:`S`, :math:`S_{1}`, and :math:`S_{2}`, :math:`\mu`,
:math:`\mu_{1}`, and :math:`\mu_{2}` are the respective means of
:math:`S`, :math:`S_{1}`, and :math:`S_{2}`, and :math:`M_{2, S}`,
:math:`M_{2, S_{1}}`, and :math:`M_{2, S_{2}}` are the respective second
order moments of :math:`S`, :math:`S_{1}`, and :math:`S_{2}`. This is
similar notation to [Pebay2008]_. With a combined sum of squares and mean,
it is possible to calculate the root-mean-square fluctuations, otherwise
known as the population standard deviation:
.. math::
\sigma_{i} = \sqrt{\frac{1}{T} \
\sum_{t=t_{0}}^{T}(x_{t} - \mu_{i})^2}
References
----------
.. [CGL1979] T. F. Chan, G. H. Golub, and R. J. LeVeque. "Updating
formulae and a pairwise algorithm for computing sample variances."
Technical Report STAN-CS-79-773, Stanford University, Department of
Computer Science, 1979.
.. [Pebay2008] P. Pebay. "Formulas for robust one-pass parallel
computation of co-variances and arbitrary-order statistical moments."
Technical Report SAND2008-6212, 2008.
.. versionadded:: 0.3.0
"""
T = S1[0] + S2[0]
mu = (S1[0]*S1[1] + S2[0]*S2[1])/T
M = S1[2] + S2[2] + (S1[0] * S2[0]/T) * (S2[1] - S1[1])**2
S = T, mu, M
return S
[docs]def fold_second_order_moments(*args):
"""Fold action for :func:`second_order_moments` calculation.
Takes in data that can be combined associatively (order doesn't matter) and
applies a combining function in a recursive fashion. In this case, it takes
in a list of lists that each contain the total number of time steps, an `n
x m` array of mean positions for `n` atoms, and an `n x m` array of second
order moments for `n` atoms, for a given partition of a trajectory. It
takes the first partition, combines it with the second, combines that
result with the third, and that result with the fourth, etc. using
:func:`second_order_moments`. The final result is a list of the summed time
steps, combined mean positions, and combined second order moments of all
atoms in the combined trajectory.
See Also
--------
`Haskell fold/reduce method <https://wiki.haskell.org/Fold>`_
.. versionadded:: 0.3.0
"""
return functools.reduce(second_order_moments, *args)