Source code for pmda.util

# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
#
# PMDA
# Copyright (c) 2017 The MDAnalysis Development Team and contributors
# (see the file AUTHORS for the full list of names)
#
# Released under the GNU Public Licence, v2 or any higher version
"""Utility functions --- :mod:`pmda.util`
=========================================


This module contains helper functions and classes that can be used throughout
:mod:`pmda`.

"""
from __future__ import absolute_import, division

import time

import functools

import numpy as np


[docs]class timeit(object):
    """measure time spend in context

    :class:`timeit` is a context manager (to be used with the :keyword:`with`
    statement) that records the execution time for the enclosed context block
    in :attr:`elapsed`.

    Attributes
    ----------
    elapsed : float
        Time in seconds that elapsed between entering
        and exiting the context.

    Example
    -------
    Use as a context manager::

       with timeit() as total:
          # code to be timed

       print(total.elapsed, "seconds")

    See Also
    --------
    :func:`time.time`

    """
    def __enter__(self):
        self._start_time = time.time()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        end_time = time.time()
        self.elapsed = end_time - self._start_time
        # always propagate exceptions forward
        return False


[docs]def make_balanced_slices(n_frames, n_blocks, start=None, stop=None, step=None):
    """Divide `n_frames` into `n_blocks` balanced blocks.

    The blocks are generated in such a way that they contain equal numbers of
    frames when possible, but there are also no empty blocks (which can happen
    with a naive distribution of ``ceil(n_frames/n_blocks)`` per block and a
    remainder block).

    If the trajectory is sliced in any way (``u.trajectory[start:stop:step]``)
    then the appropriate values for `start`, `stop`, and `step` must be passed
    to this function. Defaults can be set to ``None``. Only a subset of legal
    values for slices is supported: ``0 ≤ start ≤ stop`` and ``step ≥ 1``.

    Arguments
    ---------
    n_frames : int
        number of frames in the trajectory (≥0). This must be the
        number of frames *after* the trajectory has been sliced,
        i.e. ``len(u.trajectory[start:stop:step])``. If any of
        `start`, `stop, and `step` are not the defaults (left empty or
        set to ``None``) they must be provided as parameters.
    n_blocks : int
        number of blocks (>0 and <n_frames)
    start : int or None
        The first index of the trajectory (default is ``None``, which
        is interpreted as "first frame", i.e., 0).
    stop : int or None
        The index of the last frame + 1 (default is ``None``, which is
        interpreted as "up to and including the last frame".
    step : int or None
        Step size by which the trajectory is sliced; the default is
        ``None`` which corresponds to ``step=1``.

    Returns
    -------
    slices : list of slice
        List of length ``n_blocks`` with one :class:`slice`
        for each block.

        If `n_frames` = 0 then an empty list ``[]`` is returned.

    Example
    -------
    For a trajectory with 5 frames and 4 blocks we get block sizes ``[2, 1, 1,
    1]`` (instead of ``[2, 2, 1, 0]`` with the naive algorithm).

    The slices will be ``[slice(0, 2, None), slice(2, 3, None),
    slice(3, 4, None), slice(4, 5, None)]``.

    The indices can be used to slice a trajectory into blocks::

        n_blocks = 5
        n_frames = len(u.trajectory[start:stop:step])

        slices = make_balanced_slices(n_frames, n_blocks,
                                      start=start, stop=stop, step=step)
        for i_block, block in enumerate(slices):
           for ts in u.trajectory[block]:
               # do stuff for block number i_block

    Notes
    -----
    Explanation of the algorithm: For `M` frames in the trajectory and
    `N` blocks (or processes), where `i` with 0 ≤ `i` ≤ `N` - 1 is the
    block number and `m[i]` is the number of frames for block `i` we
    get a *balanced distribution* (one that does not contain blocks of
    size 0) with the algorithm ::

        m[i] = M // N     # initial frames for block i
        r = M % N         # remaining frames 0 ≤ r < N
        for i in range(r):
            m[i] += 1     # distribute the remaining frames
                          # over the first r blocks

    For a `step` > 1, we use ``m[i] *= step``. The last slice will
    never go beyond the original `stop` if a value was provided.


    .. versionadded:: 0.2.0
    .. versionchanged:: 0.3.0
        raise ValueError when n_blocks is larger than n_frames
    """

    start = int(start) if start is not None else 0
    stop = int(stop) if stop is not None else None
    step = int(step) if step is not None else 1

    if n_frames < 0:
        raise ValueError("n_frames must be >= 0")
    elif n_blocks < 1:
        raise ValueError("n_blocks must be > 0")
    elif n_frames != 0 and n_blocks > n_frames:
        raise ValueError(f"n_blocks must be smaller than n_frames: "
                         f"{n_frames}")
    elif start < 0:
        raise ValueError("start must be >= 0 or None")
    elif stop is not None and stop < start:
        raise ValueError("stop must be >= start and >= 0 or None")
    elif step < 1:
        raise ValueError("step must be > 0 or None")

    if n_frames == 0:
        # not very useful but allows calling code to work more gracefully
        return []

    bsizes = np.ones(n_blocks, dtype=np.int64) * n_frames // n_blocks
    bsizes += (np.arange(n_blocks, dtype=np.int64) < n_frames % n_blocks)
    # This can give a last index that is larger than the real last index;
    # this is not a problem for slicing but it's not pretty.
    # Example: original [0:20:3] -> n_frames=7, start=0, step=3:
    #          last frame 21 instead of 20
    bsizes *= step
    idx = np.cumsum(np.concatenate(([start], bsizes)))
    slices = [slice(bstart, bstop, step)
              for bstart, bstop in zip(idx[:-1], idx[1:])]

    # fix very last stop index: make sure it's within trajectory range or None
    # (no really critical because the slices will work regardless, but neater)
    last = slices[-1]
    last_stop = min(last.stop, stop) if stop is not None else stop
    slices[-1] = slice(last.start, last_stop, last.step)

    return slices


[docs]def second_order_moments(S1, S2):
    r"""Calculates the combined second order moment.

    Given the partial centered moments of two partitions (S1 and S2) of a data
    set S, calculates the second order moments of S = S1 ∪ S2.

    Parameters
    ----------
    S1 : array
       Contains `(T1, mu1, M1)` where `T1` is an integer (number of elements
       in the partition, e.g., the number of time frames), `mu1` is an
       `n x m` array of the means for `n` atoms (and for example, `m=3` for
       the center of geometry), `M1` is also an `n x m` array of the sum of
       squares.
    S2 : array
       Contains `(T2, mu2, M2)` where `T2` is an integer (number of elements
       in the partition, e.g., the number of time frames), `mu2` is an
       `n x m` array of the means for `n` atoms (and for example, `m=3` for
       the center of geometry), `M2` is also an `n x m` array of the sum of
       squares.

    Returns
    -------
    S : (T, mu, M)
       The returned tuple contains the total number of elements in the
       partition `T`, the mean `mu` and the "second moment" `M` (sum of
       squares) for the combined data.

    Notes
    -----
    For a given set of data, the sum of squares, also known as the second
    order moment about the mean, is defined as the sum of the squares of the
    differences between each data point and the mean (sum of the individual
    deviations from the mean) of the data set,

    .. math::

        M_{2, S_{i}} = \sum_{t=t_{0}}^{T}(x_{t} - \mu_{i})^2,

    where :math:`\mu_{i}` is the time average of :math:`x_{t}`. If the
    average of the squares of the individual deviations is taken (instead of
    the sum), this yields the variance:

    .. math::

        \sigma_{i}^{2} = \frac{1}{T}\sum_{t=t_{0}}^{T}(x_{t} - \mu_{i})^2

    In order to combine the mean and second order moments of two separate
    partitions, [CGL1979]_ derived the following formulae:

    .. math::

        \mu = \frac{T_{1}\mu_{1} + T_{2}\mu_{2}}{T}

    and

    .. math::

        M_{2, S} = M_{2, S_{1}} + M_{2, S_{2}} + \
        \frac{T_{1}T_{2}}{T}(\mu_{2} - \mu_{1})^{2},

    where :math:`T`, :math:`T_{1}`, and :math:`T_{2}` are the respective
    cardinalities of :math:`S`, :math:`S_{1}`, and :math:`S_{2}`, :math:`\mu`,
    :math:`\mu_{1}`, and :math:`\mu_{2}` are the respective means of
    :math:`S`, :math:`S_{1}`, and :math:`S_{2}`, and :math:`M_{2, S}`,
    :math:`M_{2, S_{1}}`, and :math:`M_{2, S_{2}}` are the respective second
    order moments of :math:`S`, :math:`S_{1}`, and :math:`S_{2}`. This is
    similar notation to [Pebay2008]_. With a combined sum of squares and mean,
    it is possible to calculate the root-mean-square fluctuations, otherwise
    known as the population standard deviation:

    .. math::

        \sigma_{i} = \sqrt{\frac{1}{T} \
        \sum_{t=t_{0}}^{T}(x_{t} - \mu_{i})^2}

    References
    ----------
    .. [CGL1979] T. F. Chan, G. H. Golub, and R. J. LeVeque. "Updating
       formulae and a pairwise algorithm for computing sample variances."
       Technical Report STAN-CS-79-773, Stanford University, Department of
       Computer Science, 1979.
    .. [Pebay2008] P. Pebay. "Formulas for robust one-pass parallel
       computation of co-variances and arbitrary-order statistical moments."
       Technical Report SAND2008-6212, 2008.


    .. versionadded:: 0.3.0
    """

    T = S1[0] + S2[0]
    mu = (S1[0]*S1[1] + S2[0]*S2[1])/T
    M = S1[2] + S2[2] + (S1[0] * S2[0]/T) * (S2[1] - S1[1])**2
    S = T, mu, M

    return S


[docs]def fold_second_order_moments(*args):
    """Fold action for :func:`second_order_moments` calculation.

    Takes in data that can be combined associatively (order doesn't matter) and
    applies a combining function in a recursive fashion. In this case, it takes
    in a list of lists that each contain the total number of time steps, an `n
    x m` array of mean positions for `n` atoms, and an `n x m` array of second
    order moments for `n` atoms, for a given partition of a trajectory. It
    takes the first partition, combines it with the second, combines that
    result with the third, and that result with the fourth, etc. using
    :func:`second_order_moments`. The final result is a list of the summed time
    steps, combined mean positions, and combined second order moments of all
    atoms in the combined trajectory.

    See Also
    --------
    `Haskell fold/reduce method <https://wiki.haskell.org/Fold>`_


    .. versionadded:: 0.3.0
    """
    return functools.reduce(second_order_moments, *args)