Source code for MDAnalysis.topology.MMCIFParser

# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
#
"""
MMCIF Topology Parser
====================

Read topology information from mmCIF/PDBx coordinate files using the `Gemmi library <https://github.com/project-gemmi/gemmi>`_

mmCIF files contain topology information about the molecules in the structure. For each atom the following attributes are read
and stored in the relevant topology attributes:
    - :class:`..core.topologyattrs.AtomAttr` subclasses:
        - :class:`MDAnalysis.core.topologyattrs.AltLocs`
        - :class:`MDAnalysis.core.topologyattrs.Atomids`
        - :class:`MDAnalysis.core.topologyattrs.Atomnames`
        - :class:`MDAnalysis.core.topologyattrs.Atomtypes`
        - :class:`MDAnalysis.core.topologyattrs.ChainIDs`
        - :class:`MDAnalysis.core.topologyattrs.Elements`
        - :class:`MDAnalysis.core.topologyattrs.FormalCharges`
        - :class:`MDAnalysis.core.topologyattrs.Masses`
        - :class:`MDAnalysis.core.topologyattrs.Occupancies`
        - :class:`MDAnalysis.core.topologyattrs.RecordTypes`
        - :class:`MDAnalysis.core.topologyattrs.Tempfactors`
    - :class:`..core.topologyattrs.ResidueAttr` subclasses:
        - :class:`MDAnalysis.core.topologyattrs.Resnums`
        - :class:`MDAnalysis.core.topologyattrs.ICodes`
        - :class:`MDAnalysis.core.topologyattrs.Resids`
        - :class:`MDAnalysis.core.topologyattrs.Resnames`
    - :class:`..core.topologyattrs.SegmentAttr` subclasses:
        - :class:`MDAnalysis.core.topologyattrs.Segids`

Classes
-------

.. autoclass:: MMCIFParser
   :members:
   :inherited-members:

.. versionadded:: 2.9.0
"""

try:
    import gemmi
except ImportError:
    HAS_GEMMI = False
else:
    HAS_GEMMI = True

import itertools
import warnings

import numpy as np

from ..core.topology import Topology
from ..core.topologyattrs import (
    AltLocs,
    AtomAttr,
    Atomids,
    Atomnames,
    Atomtypes,
    ChainIDs,
    Elements,
    FormalCharges,
    ICodes,
    Masses,
    Occupancies,
    RecordTypes,
    Resids,
    ResidueAttr,
    Resnames,
    Resnums,
    Segids,
    SegmentAttr,
    Tempfactors,
)
from .base import TopologyReaderBase


def _into_idx(arr: list) -> list[int]:
    """Replace consecutive identical elements of an array with their indices.

    Example
    -------
    .. code-block:: python

        arr: list[int] = [1, 1, 5, 5, 7, 3, 3]
        assert _into_idx(arr) == [0, 0, 1, 1, 2, 3, 3]

    Parameters
    ----------
    arr
        input array of elements that can be compared with `__eq__`

    Returns
    -------
        list[int] -- array where these elements are replaced with their unique indices, in order of appearance.

    .. versionadded:: 2.9.0
    """
    return [
        idx
        for idx, (_, group) in enumerate(itertools.groupby(arr))
        for _ in group
    ]


def get_Atomattrs(model: "gemmi.Model") -> tuple[list[AtomAttr], np.ndarray]:
    """Extract all attributes that are subclasses of :class:`..core.topologyattrs.AtomAttr` from a  ``gemmi.Model`` object,
    and a `residx` index with indices of all atoms in residues.

    Parameters
    ----------
    model
        input `gemmi.Model`, e.g. `gemmi.read_structure('file.cif')[0]`

    Returns
    -------
        tuple[list[AtomAttr], np.ndarray] -- first element is list of all extracted attributes, second element is `segidx`

    Raises
    ------
    ValueError
        if any of the records is neither 'ATOM' nor 'HETATM'

    .. versionadded:: 2.9.0
    """
    (
        altlocs,  # at.altloc
        serials,  # at.serial
        names,  # at.name
        atomtypes,  # at.name
        # ------------------
        chainids,  # chain.name
        elements,  # at.element.name
        formalcharges,  # at.charge
        weights,  # at.element.weight
        # ------------------
        occupancies,  # at.occ
        record_types,  # res.het_flag
        tempfactors,  # at.b_iso
        residx,  # _into_idx(res.label_seq or res.seqid.num)
    ) = map(  # this construct takes np.ndarray of all lists of attributes, extracted from the `gemmi.Model`
        np.array,
        list(
            zip(
                *[
                    (
                        # tuple of attributes
                        # extracted from residue, atom or chain in the structure
                        # ------------------
                        atom.altloc,  # altlocs
                        atom.serial,  # serials
                        atom.name,  # names
                        atom.name,  # atomtypes
                        # ------------------
                        chain.name,  # chainids
                        atom.element.name,  # elements
                        atom.charge,  # formalcharges
                        atom.element.weight,  # weights
                        # ------------------
                        atom.occ,  # occupancies
                        residue.het_flag,  # record_types
                        atom.b_iso,  # tempfactors
                        # residue.seqid.num,
                        (
                            residue.label_seq
                            if residue.label_seq is not None
                            else residue.seqid.num
                        ),  # residx, later translated into continious repr
                    )
                    # the main loop over the `gemmi.Model` object
                    for chain in model
                    for residue in chain
                    for atom in residue
                ]
            )
        ),
    )

    # transform *idx into continious numpy arrays
    print(f"Before: {len(residx)=}")
    residx = np.array(_into_idx(residx))
    print(f"After: {len(residx)=}")

    # fill in altlocs, since gemmi has '' as default
    altlocs = ["A" if not elem else elem for elem in altlocs]

    # convert default gemmi record types to default MDAnalysis record types
    record_types = [
        "ATOM" if record == "A" else "HETATM" if record == "H" else None
        for record in record_types
    ]
    if any((elem is None for elem in record_types)):
        raise ValueError("Found an atom that is neither ATOM or HETATM")

    attrs = [
        AltLocs(altlocs),
        Atomids(serials),
        Atomnames(names),
        Atomtypes(atomtypes),
        # ----------------------------
        ChainIDs(chainids),
        Elements(elements),
        FormalCharges(formalcharges),
        Masses(weights),
        # ----------------------------
        Occupancies(occupancies),
        RecordTypes(record_types),
        Tempfactors(tempfactors),
    ]

    return attrs, residx


def make_resid(residue: "gemmi.Residue") -> str:
    # return residue.seqid.num
    # return f'{residue.seqid.num}{residue.seqid.icode.strip()}'
    return (
        residue.label_seq
        if residue.label_seq is not None
        else residue.seqid.num
    )


def get_Residueattrs(
    model: "gemmi.Model",
) -> tuple[list[ResidueAttr], np.ndarray]:
    """Extract all attributes that are subclasses of :class:`..core.topologyattrs.ResidueAttr` from a  ``gemmi.Model`` object,
    and a `segidx` index witn indices of all residues in segments.

    Parameters
    ----------
    model
        input `gemmi.Model`, e.g. `gemmi.read_structure('file.cif')[0]`

    Returns
    -------
        tuple[list[ResidueAttr], np.ndarray] -- first element is list of all extracted attributes, second element is `segidx`

    .. versionadded:: 2.9.0
    """
    (
        icodes,  # residue.seqid.icode
        resids,  # residue.seqid.num # FIXME: perhaps this is what's wrong, and not residx per se?
        resnames,  # residue.name
        segidx,  # chain.name
        resnums,  # residue.seqid.num
    ) = map(
        np.array,
        list(
            zip(
                *[
                    (
                        residue.seqid.icode,
                        residue.seqid.num,
                        residue.name,
                        chain.name,
                        residue.seqid.num,
                    )
                    for chain in model
                    for residue in chain
                ]
            )
        ),
    )
    segidx = np.array(_into_idx(segidx))

    attrs = [
        Resnums(resnums),
        ICodes([icode.strip() for icode in icodes]),
        Resids(resids),
        Resnames(resnames),
    ]
    return attrs, segidx


def get_Segmentattrs(model: "gemmi.Model") -> list[SegmentAttr]:
    """Extract all attributes that are subclasses of :class:`..core.topologyattrs.SegmentAttr` from a  ``gemmi.Model`` object.

    Parameters
    ----------
    model
        input `gemmi.Model`, e.g. `gemmi.read_structure('file.cif')[0]`

    Returns
    -------
        list[SegmentAttr] -- list of all extracted attributes

    .. versionadded:: 2.9.0
    """
    segids = [chain.name for chain in model]
    return [Segids(segids)]


[docs]class MMCIFParser(TopologyReaderBase): """Parser that obtains a list of atoms from a standard MMCIF/PDBx file using ``gemmi`` library (https://github.com/project-gemmi/gemmi). Creates the following Attributes (if present): - :class:`..core.topologyattrs.AtomAttr` subclasses: - :class:`..core.topologyattrs.AltLocs` - :class:`..core.topologyattrs.Atomids` - :class:`..core.topologyattrs.Atomnames` - :class:`..core.topologyattrs.Atomtypes` - :class:`..core.topologyattrs.ChainIDs` - :class:`..core.topologyattrs.Elements` - :class:`..core.topologyattrs.FormalCharges` - :class:`..core.topologyattrs.Masses` - :class:`..core.topologyattrs.Occupancies` - :class:`..core.topologyattrs.RecordTypes` - :class:`..core.topologyattrs.Tempfactors` - :class:`..core.topologyattrs.ResidueAttr` subclasses: - :class:`..core.topologyattrs.Resnums` - :class:`..core.topologyattrs.ICodes` - :class:`..core.topologyattrs.Resids` - :class:`..core.topologyattrs.Resnames` - :class:`..core.topologyattrs.SegmentAttr` subclasses: - :class:`..core.topologyattrs.Segids` .. versionadded:: 2.9.0 """ format = ["cif", "cif.gz", "mmcif", "mmcif.gz"]
[docs] def parse(self, **kwargs) -> Topology: """Read the file and return the structure. Returns ------- MDAnalysis Topology object """ structure = gemmi.read_structure(self.filename) if len(structure) > 1: warnings.warn( f"MMCIF model {self.filename} contains {len(structure)=} different models, " "but only the first one will be used to assign the topology" ) model = structure[0] atomAttrs, residx = get_Atomattrs(model) residAttrs, segidx = get_Residueattrs(model) segmentAttrs = get_Segmentattrs(model) attrs = atomAttrs + residAttrs + segmentAttrs # due to the list(map(...)) construction, all elements in array have equal lengths n_atoms = len(atomAttrs[0]) n_residues = len(residAttrs[0]) n_segments = len(segmentAttrs[0]) return Topology( n_atoms, n_residues, n_segments, attrs=attrs, atom_resindex=residx, residue_segindex=segidx, )