Source code for pmda.rms.rmsf

# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
#
# PMDA
# Copyright (c) 2019 The MDAnalysis Development Team and contributors
# (see the file AUTHORS for the full list of names)
#
# Released under the GNU Public Licence, v2 or any higher version

"""
Calculating Root-Mean-Square Fluctuations (RMSF) --- :mod:`pmda.rmsf`
=====================================================================

This module contains parallel versions of analysis tasks in
:mod:`MDAnalysis.analysis.rms`.

.. autoclass:: RMSF
    :members:
    :inherited-members:


See Also
--------
MDAnalysis.analysis.rms.RMSF

"""

from __future__ import absolute_import, division

import numpy as np

from pmda.parallel import ParallelAnalysisBase

from pmda.util import fold_second_order_moments


[docs]class RMSF(ParallelAnalysisBase):
    r"""Parallel RMSF analysis.

    Calculates RMSF of given atoms across a trajectory.

    Attributes
    ----------
    rmsf : array
         ``N``-length :class:`numpy.ndarray` array of RMSF values,
         where ``N`` is the number of atoms in the `atomgroup` of
         interest. Returned values have units of ångströms.

    Parameters
    ----------
    atomgroup : AtomGroup
       Atoms for which RMSF is calculated

    Raises
    ------
    ValueError
        raised if negative values are calculated, which indicates that a
        numerical overflow or underflow occured

    See Also
    --------
    MDAnalysis.analysis.rms.RMSF

    Notes
    -----
    No RMSD-superposition is performed; it is assumed that the user is
    providing a trajectory where the protein of interest has been structurally
    aligned to a reference structure (see the Examples section below). The
    protein also has be whole because periodic boundaries are not taken into
    account.

    Run the analysis with :meth:`RMSF.run`, which stores the results
    in the array :attr:`RMSF.rmsf`.

    The root mean square fluctuation of an atom :math:`i` is computed as the
    time average:

    .. math::

        \sigma_{i} = \sqrt{\left\langle (\mathbf{x}_{i} -
                           \langle\mathbf{x}_{i}\rangle)^2
                           \right\rangle}

    No mass weighting is performed.
    This method implements an algorithm for computing sums of squares while
    avoiding overflows and underflows [Welford1962]_, as well as an algorithm
    for combining the sum of squares and means of separate partitions of a
    given trajectory to calculate the RMSF of the entire trajectory
    [CGL1979]_.

    For more details about RMSF calculations, refer to [Awtrey2019]_.

    References
    ----------
    .. [Welford1962] B. P. Welford (1962). "Note on a Method for
       Calculating Corrected Sums of Squares and Products." Technometrics
       4(3):419-420.

    Examples
    --------
    In this example we calculate the residue RMSF fluctuations by analyzing
    the :math:`\text{C}_\alpha` atoms. First we need to fit the trajectory
    to the average structure as a reference. That requires calculating the
    average structure first. Because we need to analyze and manipulate the
    same trajectory multiple times, we are going to load it into memory
    using the :mod:`~MDAnalysis.coordinates.MemoryReader`. (If your
    trajectory does not fit into memory, you will need to :ref:`write out
    intermediate trajectories <writing-trajectories>` to disk or
    :ref:`generate an in-memory universe
    <creating-in-memory-trajectory-label>` that only contains, say, the
    protein)::

        import MDAnalysis as mda
        from MDAnalysis.analysis import align
        from MDAnalysis.tests.datafiles import TPR, XTC
        u = mda.Universe(TPR, XTC, in_memory=True)
        protein = u.select_atoms("protein")

        # TODO: Need to center and make whole (this test trajectory
        # contains the protein being split across periodic boundaries
        # and the results will be WRONG!)

        # Fit to the initial frame to get a better average structure
        # (the trajectory is changed in memory)
        prealigner = align.AlignTraj(u, u, select="protein and name CA",
                                     in_memory=True).run()
        # ref = average structure
        ref_coordinates = u.trajectory.timeseries(asel=protein).mean(axis=1)
        # Make a reference structure (need to reshape into a
        # 1-frame "trajectory").
        ref = mda.Merge(protein).load_new(ref_coordinates[:, None, :],
                                          order="afc")

    We created a new universe ``reference`` that contains a single frame
    with the averaged coordinates of the protein.  Now we need to fit the
    whole trajectory to the reference by minimizing the RMSD. We use
    :class:`MDAnalysis.analysis.align.AlignTraj`::

        aligner = align.AlignTraj(u, ref, select="protein and name CA",
                                  in_memory=True).run()
        # need to write the trajectory to disk for PMDA 0.3.0 (see issue #15)
        with mda.Writer("rmsfit.xtc", n_atoms=u.atoms.n_atoms) as W:
            for ts in u.trajectory:
                W.write(u.atoms)

    (For use with PMDA we cannot currently use a in-memory trajectory
    (see `Issue #15`_) so we must write out the RMS-superimposed
    trajectory to the file "rmsfit.xtc".)

    The trajectory is now fitted to the reference (the RMSD is stored as
    `aligner.rmsd` for further inspection). Now we can calculate the RMSF::

        from pmda.rms import RMSF

        u = mda.Universe(TPR, "rmsfit.xtc")
        calphas = protein.select_atoms("protein and name CA")

        rmsfer = RMSF(calphas).run()

    and plot::

        import matplotlib.pyplot as plt
        plt.plot(calphas.resnums, rmsfer.rmsf)


    .. versionadded:: 0.3.0


    .. _`Issue #15`: https://github.com/MDAnalysis/pmda/issues/15

    """

    def __init__(self, atomgroup):
        u = atomgroup.universe
        super(RMSF, self).__init__(u, (atomgroup, ))
        self._atomgroup = atomgroup
        self._top = u.filename
        self._traj = u.trajectory.filename

    def _single_frame(self, ts, atomgroups):
        # mean and sum of squares calculations done in _reduce()
        return atomgroups[0]

    def _conclude(self):
        """
        self._results : Array
            (n_blocks x 2 x N x 3) array
        """
        n_blocks = len(self._results)
        # serial case
        if n_blocks == 1:
            # get length of trajectory slice
            self.mean = self._results[0, 0]
            self.sumsquares = self._results[0, 1]
            self.rmsf = np.sqrt(self.sumsquares.sum(axis=1) / self.n_frames)
        # parallel case
        else:
            mean = self._results[:, 0]
            sos = self._results[:, 1]
            # create list of (timesteps, mean, sumsq tuples for each block
            vals = []
            for i in range(n_blocks):
                vals.append((len(self._blocks[i]), mean[i], sos[i]))
            # combine block results using fold method
            results = fold_second_order_moments(vals)
            self.mean = results[1]
            self.sumsquares = results[2]
            self.rmsf = np.sqrt(self.sumsquares.sum(axis=1) / self.n_frames)
            self._negative_rmsf(self.rmsf)

    @staticmethod
    def _reduce(res, result_single_frame):
        """
        'sum' action for time series
        """
        atoms = result_single_frame
        positions = atoms.positions.astype(np.float64)
        # initial time step case
        if isinstance(res, list) and len(res) == 0:
            # initial mean position = initial position
            mean = positions
            # create new zero-array for sum of squares to prevent blocks from
            # using data from previous blocks
            sumsq = np.zeros((atoms.n_atoms, 3))
            # set initial time step for each block to zero
            k = 0
            # assign initial (sum of squares and mean) zero-arrays to res
            res = [mean, sumsq, k]
        else:
            # update time step
            k = res[2] + 1
            # update sum of squares
            res[1] += (k / (k + 1)) * (positions - res[0]) ** 2
            # update mean
            res[0] = (k * res[0] + positions) / (k + 1)
            # update time step in res
            res[2] = k
        return res

    @staticmethod
    def _negative_rmsf(rmsf):
        if not (rmsf >= 0).all():
            raise ValueError("Some RMSF values negative; overflow " +
                             "or underflow occurred")