Source code for ampal.pdb_parser

"""Contains code for parsing PDB files."""

from collections import OrderedDict
import itertools
import pathlib

from .base_ampal import Atom
from .assembly import AmpalContainer, Assembly
from .protein import Polypeptide, Residue
from .nucleic_acid import Polynucleotide, Nucleotide
from .ligands import Ligand, LigandGroup
from .amino_acids import standard_amino_acids
from .data import PDB_ATOM_COLUMNS


[docs]def load_pdb(pdb, path=True, pdb_id="", ignore_end=False):
    """Converts a PDB file into an AMPAL object.

    Parameters
    ----------
    pdb : str
        Either a path to a PDB file or a string containing PDB
        format structural data.
    path : bool, optional
        If `true`, flags `pdb` as a path and not a PDB string.
    pdb_id : str, optional
        Identifier for the `Assembly`.
    ignore_end : bool, optional
        If `false`, parsing of the file will stop when an "END"
        record is encountered.

    Returns
    -------
    ampal : ampal.Assembly or ampal.AmpalContainer
        AMPAL object that contains the structural information from
        the PDB file provided. If the PDB file has a single state
        then an `Assembly` will be returned, otherwise an
        `AmpalContainer` will be returned.
    """
    pdb_p = PdbParser(pdb, path=path, pdb_id=pdb_id, ignore_end=ignore_end)
    return pdb_p.make_ampal()


[docs]class PdbParser(object):
    """Parses a PDB file and produces and AMPAL Assembly.

    Parameters
    ----------
    pdb : str
        Either a path to a PDB file or a string containing PDB
        format structural data.
    path : bool, optional
        If `true`, flags `pdb` as a path and not a PDB string.
    pdb_id : str, optional
        Identifier for the `Assembly`.
    ignore_end : bool, optional
        If `false`, parsing of the file will stop when an "END"
        record is encountered.

    Attributes
    ----------
    proc_functions : dict
        Keys are PDB labels i.e. "ATOM" or "END", values are the
        function that processes that specific line.
    id : str
        Identifier for the `Assembly`.
    pdb_lines : [str]
        Input PDB split into line.
    new_labels : bool
        Indicates if new atom or residue labels have been found while
        parsing the PDB file.
    state : int
        Current state being appended to. This is used on multi-state
        files like NMR structures.
    pdb_parse_tree : dict
        This dictionary represents the parse tree of the PDB file.
        Each line of the structure is broken down into a key, the
        entry label, and a value, the data.
    current_line : int
        The line that is currently being parsed.
    ignore_end : bool, optional
        If `false`, parsing of the file will stop when an "END"
        record is encountered.
    """

    def __init__(self, pdb, path=True, pdb_id="", ignore_end=False):
        self.proc_functions = {
            "ATOM": self.proc_atom,
            "HETATM": self.proc_atom,
            "ENDMDL": self.change_state,
            "END": self.end,
        }
        if path:
            pdb_path = pathlib.PurePath(pdb)
            with open(str(pdb_path), "r") as inf:
                pdb_str = inf.read()
            self.id = pdb_path.stem
        else:
            pdb_str = pdb
            self.id = pdb_id
        self.pdb_lines = pdb_str.splitlines()
        self.new_labels = False
        self.state = 0
        self.pdb_parse_tree = None
        self.current_line = None
        self.ignore_end = ignore_end
        self.parse_pdb_file()

[docs]    def parse_pdb_file(self):
        """Runs the PDB parser."""
        self.pdb_parse_tree = {"info": {}, "data": {self.state: {}}}
        try:
            for line in self.pdb_lines:
                self.current_line = line
                record_name = line[:6].strip()
                if record_name in self.proc_functions:
                    self.proc_functions[record_name]()
                else:
                    if record_name not in self.pdb_parse_tree["info"]:
                        self.pdb_parse_tree["info"][record_name] = []
                    self.pdb_parse_tree["info"][record_name].append(line)
        except EOFError:
            # Raised by END record
            pass
        return

[docs]    def proc_atom(self):
        """Processes an "ATOM" or "HETATM" record."""
        atom_data = self.proc_line_coordinate(self.current_line)
        (
            at_type,
            at_ser,
            _,
            _,
            res_name,
            chain_id,
            res_seq,
            i_code,
            _,
            _,
            _,
            _,
            _,
            _,
            _,
        ) = atom_data
        # currently active state
        a_state = self.pdb_parse_tree["data"][self.state]
        res_id = (res_seq, i_code)
        if chain_id not in a_state:
            a_state[chain_id] = (set(), OrderedDict())
        if res_id not in a_state[chain_id][1]:
            a_state[chain_id][1][res_id] = (set(), OrderedDict())
        if at_type == "ATOM":
            if res_name in standard_amino_acids.values():
                poly = "P"
            else:
                poly = "N"
        else:
            poly = "H"
        a_state[chain_id][0].add((chain_id, at_type, poly))
        a_state[chain_id][1][res_id][0].add((at_type, res_seq, res_name, i_code))
        if at_ser not in a_state[chain_id][1][res_id][1]:
            a_state[chain_id][1][res_id][1][at_ser] = [atom_data]
        else:
            a_state[chain_id][1][res_id][1][at_ser].append(atom_data)
        return

[docs]    def change_state(self):
        """Increments current state and adds a new dict to the parse tree."""
        self.state += 1
        self.pdb_parse_tree["data"][self.state] = {}
        return

[docs]    def end(self):
        """Processes an "END" record."""
        if not self.ignore_end:
            raise EOFError
        else:
            return

[docs]    def proc_line_coordinate(self, line):
        """Extracts data from columns in ATOM/HETATM record."""
        at_type = line[0:6].strip()  # 0
        at_ser = int(line[6:11].strip())  # 1
        at_name = line[12:16].strip()  # 2
        alt_loc = line[16].strip()  # 3
        res_name = line[17:20].strip()  # 4
        chain_id = line[21].strip()  # 5
        res_seq = int(line[22:26].strip())  # 6
        i_code = line[26].strip()  # 7
        x = float(line[30:38].strip())  # 8
        y = float(line[38:46].strip())  # 9
        z = float(line[46:54].strip())  # 10
        occupancy = float(line[54:60].strip())  # 11
        _temp_factor = line[60:66].strip()
        if _temp_factor:
            temp_factor = float(_temp_factor)
        else:
            temp_factor = 0.0
        element = line[76:78].strip()  # 13
        charge = line[78:80].strip()  # 14
        if at_name not in PDB_ATOM_COLUMNS:
            PDB_ATOM_COLUMNS[at_name] = line[12:16]
            self.new_labels = True
        return (
            at_type,
            at_ser,
            at_name,
            alt_loc,
            res_name,
            chain_id,
            res_seq,
            i_code,
            x,
            y,
            z,
            occupancy,
            temp_factor,
            element,
            charge,
        )

    # Generate PDB from parse tree
[docs]    def make_ampal(self):
        """Generates an AMPAL object from the parse tree.

        Notes
        -----
        Will create an `Assembly` if there is a single state in the
        parese tree or an `AmpalContainer` if there is more than one.
        """
        data = self.pdb_parse_tree["data"]
        if len(data) > 1:
            ac = AmpalContainer(id=self.id)
            for state, chains in sorted(data.items()):
                if chains:
                    ac.append(
                        self.proc_state(chains, self.id + "_state_{}".format(state + 1))
                    )
            return ac
        elif len(data) == 1:
            return self.proc_state(data[0], self.id)
        else:
            raise ValueError("Empty parse tree, check input PDB format.")

[docs]    def proc_state(self, state_data, state_id):
        """Processes a state into an `Assembly`.

        Parameters
        ----------
        state_data : dict
            Contains information about the state, including all
            the per line structural data.
        state_id : str
            ID given to `Assembly` that represents the state.
        """
        assembly = Assembly(assembly_id=state_id)
        for _, chain in sorted(state_data.items()):
            assembly._molecules.append(self.proc_chain(chain, assembly))
        return assembly

[docs]    def proc_chain(self, chain_info, parent):
        """Converts a chain into a `Polymer` type object.

        Parameters
        ----------
        chain_info : (set, OrderedDict)
            Contains a set of chain labels and atom records.
        parent : ampal.Assembly
            `Assembly` used to assign `parent` on created
            `Polymer`.

        Raises
        ------
        ValueError
            Raised if multiple or unknown atom types found
            within the same chain.
        AttributeError
            Raised if unknown `Monomer` type encountered.
        """
        hetatom_filters = {"nc_aas": self.check_for_non_canonical}

        polymer = False
        chain_labels, chain_data = chain_info
        chain_label = list(chain_labels)[0]
        monomer_types = {x[2] for x in chain_labels if x[2]}
        if ("P" in monomer_types) and ("N" in monomer_types):
            raise ValueError('Malformed PDB, multiple "ATOM" types in a single chain.')
        # Changes Polymer type based on chain composition
        if "P" in monomer_types:
            polymer_class = Polypeptide
            polymer = True
        elif "N" in monomer_types:
            polymer_class = Polynucleotide
            polymer = True
        elif "H" in monomer_types:
            polymer_class = LigandGroup
        else:
            raise AttributeError("Malformed parse tree, check inout PDB.")
        chain = polymer_class(polymer_id=chain_label[0], parent=parent)
        # Changes where the ligands should go based on the chain composition
        if polymer:
            chain.ligands = LigandGroup(polymer_id=chain_label[0], parent=parent)
            ligands = chain.ligands
        else:
            ligands = chain

        for residue in chain_data.values():
            res_info = list(residue[0])[0]
            if res_info[0] == "ATOM":
                chain._monomers.append(self.proc_monomer(residue, chain))
            elif res_info[0] == "HETATM":
                mon_cls = None
                on_chain = False
                for filt_func in hetatom_filters.values():
                    filt_res = filt_func(residue)
                    if filt_res:
                        mon_cls, on_chain = filt_res
                        break
                    mon_cls = Ligand
                if on_chain:
                    chain._monomers.append(
                        self.proc_monomer(residue, chain, mon_cls=mon_cls)
                    )
                else:
                    ligands._monomers.append(
                        self.proc_monomer(residue, chain, mon_cls=mon_cls)
                    )
            else:
                raise ValueError("Malformed PDB, unknown record type for data")
        return chain

[docs]    def proc_monomer(self, monomer_info, parent, mon_cls=False):
        """Processes a records into a `Monomer`.

        Parameters
        ----------
        monomer_info : (set, OrderedDict)
            Labels and data for a monomer.
        parent : ampal.Polymer
            `Polymer` used to assign `parent` on created
            `Monomer`.
        mon_cls : `Monomer class or subclass`, optional
            A `Monomer` class can be defined explicitly.
        """
        monomer_labels, monomer_data = monomer_info
        if len(monomer_labels) > 1:
            raise ValueError(
                "Malformed PDB, single monomer id with "
                "multiple labels. {}".format(monomer_labels)
            )
        else:
            monomer_label = list(monomer_labels)[0]
        if mon_cls:
            monomer_class = mon_cls
            het = True
        elif monomer_label[0] == "ATOM":
            if monomer_label[2] in standard_amino_acids.values():
                monomer_class = Residue
            else:
                monomer_class = Nucleotide
            het = False
        else:
            raise ValueError("Unknown Monomer type.")
        monomer = monomer_class(
            atoms=None,
            mol_code=monomer_label[2],
            monomer_id=monomer_label[1],
            insertion_code=monomer_label[3],
            is_hetero=het,
            parent=parent,
        )
        monomer.states = self.gen_states(monomer_data.values(), monomer)
        monomer._active_state = sorted(monomer.states.keys())[0]
        return monomer

[docs]    def gen_states(self, monomer_data, parent):
        """Generates the `states` dictionary for a `Monomer`.

        monomer_data : list
            A list of atom data parsed from the input PDB.
        parent : ampal.Monomer
            `Monomer` used to assign `parent` on created
            `Atoms`.
        """
        states = {}
        for atoms in monomer_data:
            for atom in atoms:
                state = "A" if not atom[3] else atom[3]
                if state not in states:
                    states[state] = OrderedDict()
                states[state][atom[2]] = Atom(
                    tuple(atom[8:11]),
                    atom[13],
                    atom_id=atom[1],
                    res_label=atom[2],
                    occupancy=atom[11],
                    bfactor=atom[12],
                    charge=atom[14],
                    state=state,
                    parent=parent,
                )

        # This code is to check if there are alternate states and populate any
        # both states with the full complement of atoms
        states_len = [(k, len(x)) for k, x in states.items()]
        if (len(states) > 1) and (len(set([x[1] for x in states_len])) > 1):
            for t_state, t_state_d in states.items():
                new_s_dict = OrderedDict()
                for k, v in states[
                    sorted(states_len, key=lambda x: x[0])[0][0]
                ].items():
                    if k not in t_state_d:
                        c_atom = Atom(
                            v._vector,
                            v.element,
                            atom_id=v.id,
                            res_label=v.res_label,
                            occupancy=v.tags["occupancy"],
                            bfactor=v.tags["bfactor"],
                            charge=v.tags["charge"],
                            state=t_state[0],
                            parent=v.parent,
                        )
                        new_s_dict[k] = c_atom
                    else:
                        new_s_dict[k] = t_state_d[k]
                states[t_state] = new_s_dict
        return states

    # HETATM filters
[docs]    @staticmethod
    def check_for_non_canonical(residue):
        """Checks to see if the residue is non-canonical."""
        res_label = list(residue[0])[0][2]
        atom_labels = {
            x[2] for x in itertools.chain(*residue[1].values())
        }  # Used to find unnatural aas
        if (all(x in atom_labels for x in ["N", "CA", "C", "O"])) and (
            len(res_label) == 3
        ):
            return Residue, True
        return None


__author__ = "Christopher W. Wood"
Source code for ampal.pdb_parser

AMPAL

Navigation

Related Topics