Source code for pywindow.io_tools

"""Module contains classes for input/output processing."""

import os
import json
import numpy as np

from .utilities import decipher_atom_key, unit_cell_to_lattice_array


class _CorruptedPDBFile(Exception):
    def __init__(self, message):
        self.message = message


class _CorruptedXYZFile(Exception):
    def __init__(self, message):
        self.message = message


class _FileAlreadyExists(Exception):
    def __init__(self, message):
        self.message = message


class _NotADictionary(Exception):
    def __init__(self, message):
        self.message = message


class _FileTypeError(Exception):
    def __init__(self, message):
        self.message = message


[docs]class Input(object):
    """Class used to load and process input files."""

    def __init__(self):
        self._load_funcs = {
            '.xyz': self._read_xyz,
            '.pdb': self._read_pdb,
            '.mol': self._read_mol,
        }

[docs]    def load_file(self, filepath):
        """
        This function opens any type of a readable file and decompose
        the file object into a list, for each line, of lists containing
        splitted line strings using space as a spacer.

        Parameters
        ----------
        filepath : :class:`str`
            The full path or a relative path to any type of file.

        Returns
        -------
        :class:`dict`
            Returns a dictionary containing the molecular information
            extracted from the input files. This information will
            vary with file type and information stored in it.
            The data is sorted into lists that contain one feature
            for example key atom_id: [atom_id_1, atom_id_2]
            Over the process of analysis this dictionary will be updated
            with new data.
        """
        self.file_path = filepath
        _, self.file_type = os.path.splitext(filepath)
        _, self.file_name = os.path.split(filepath)
        with open(filepath) as ffile:
            self.file_content = ffile.readlines()

        return (self._load_funcs[self.file_type]())

[docs]    def load_rdkit_mol(self, mol):
        """
        Return molecular data from :class:`rdkit.Chem.rdchem.Mol` object.

        Parameters
        ----------
        mol : :class:`rdkit.Chem.rdchem.Mol`
            A molecule object from RDKit.

        Returns
        -------
        :class:`dict`
            A dictionary with ``elements`` and ``coordinates`` as keys
            containing molecular data extracted from
            :class:`rdkit.Chem.rdchem.Mol` object.

        """
        self.system = {
            'elements': np.empty(
                mol.GetNumAtoms(), dtype=str),
            'coordinates': np.empty((mol.GetNumAtoms(), 3))
        }
        for atom in mol.GetAtoms():
            atom_id = atom.GetIdx()
            atom_sym = atom.GetSymbol()
            x, y, z = mol.GetConformer().GetAtomPosition(atom_id)
            self.system['elements'][atom_id] = atom_sym
            self.system['coordinates'][atom_id] = x, y, z
        return self.system

    def _read_xyz(self):
        """"""
        try:
            self.system = dict()
            self.file_remarks = self.file_content[1]
            self.system['elements'] = np.array(
                [i.split()[0] for i in self.file_content[2:]])
            self.system['coordinates'] = np.array(
                [[float(j[0]), float(j[1]), float(j[2])]
                 for j in [i.split()[1:] for i in self.file_content[2:]]])
            return self.system
        except IndexError:
            raise _CorruptedXYZFile(
                "The XYZ file is corrupted in some way. For example, an empty "
                "line at the end etc. or it is a trajectory. If the latter is "
                "the case, please use `trajectory` module, otherwise fix it.")

    def _read_pdb(self):
        """"""
        if sum([i.count('END ') for i in self.file_content]) > 1:
            raise _CorruptedPDBFile(
                "Multiple 'END' statements were found in this PDB file."
                "If this is a trajectory, use a trajectory module, "
                "Otherwise, fix it.")
        self.system = dict()
        self.system['remarks'] = [
            i for i in self.file_content if i[:6] == 'REMARK'
        ]
        self.system['unit_cell'] = np.array([
            float(x)
            for i in self.file_content for x in
            [i[6:15], i[15:24], i[24:33], i[33:40], i[40:47], i[47:54]]
            if i[:6] == 'CRYST1'
        ])
        if self.system['unit_cell'].any():
            self.system['lattice'] = unit_cell_to_lattice_array(self.system[
                'unit_cell'])
        self.system['atom_ids'] = np.array(
            [
                i[12:16].strip() for i in self.file_content
                if i[:6] == 'HETATM' or i[:6] == 'ATOM  '
            ],
            dtype='<U8')
        self.system['elements'] = np.array(
            [
                i[76:78].strip() for i in self.file_content
                if i[:6] == 'HETATM' or i[:6] == 'ATOM  '
            ],
            dtype='<U8')
        self.system['coordinates'] = np.array(
            [[float(i[30:38]), float(i[38:46]), float(i[46:54])]
             for i in self.file_content
             if i[:6] == 'HETATM' or i[:6] == 'ATOM  '])
        return self.system

    def _read_mol(self):
        """-V3000"""
        self.system = dict()
        if self.file_content[2] != '\n':
            self.system['remarks'] = self.file_content[2]
        file_body = [i.split() for i in self.file_content]
        elements = []
        coordinates = []
        atom_data = False
        for line in file_body:
            if len(line) > 2:
                if line[2] == 'END' and line[3] == 'ATOM':
                    atom_data = False
                if atom_data is True:
                    elements.append(line[3])
                    coordinates.append(line[4:7])
                if line[2] == 'BEGIN' and line[3] == 'ATOM':
                    atom_data = True
        self.system['elements'] = np.array(elements)
        self.system['coordinates'] = np.array(coordinates, dtype=float)
        return self.system


[docs]class Output(object):
    """Class used to process and save output files."""

    def __init__(self):
        self.cwd = os.getcwd()
        self._save_funcs = {
            'xyz': self._save_xyz,
            'pdb': self._save_pdb,
        }

[docs]    def dump2json(self, obj, filepath, override=False, **kwargs):
        """
        Dump a dictionary into a JSON dictionary.

        Uses the json.dump() function.

        Parameters
        ----------
        obj : :class:`dict`
            A dictionary to be dumpped as JSON file.

        filepath : :class:`str`
           The filepath for the dumped file.

        override : :class:`bool`
           If True, any file in the filepath will be override. (default=False)
        """
        # We make sure that the object passed by the user is a dictionary.
        if isinstance(obj, dict):
            pass
        else:
            raise _NotADictionary(
                "This function only accepts dictionaries as input")
        # We check if the filepath has a json extenstion, if not we add it.
        if str(filepath[-4:]) == 'json':
            pass
        else:
            filepath = ".".join((str(filepath), "json"))
        # First we check if the file already exists. If yes and the override
        # keyword is False (default), we will raise an exception. Otherwise
        # the file will be overwritten.
        if override is False:
            if os.path.isfile(filepath):
                raise _FileAlreadyExists(
                    "The file {0} already exists. Use a different filepath, "
                    "or set the 'override' kwarg to True.".format(filepath))
        # We dump the object to the json file. Additional kwargs can be passed.
        with open(filepath, 'w+') as json_file:
            json.dump(obj, json_file, **kwargs)

[docs]    def dump2file(self, obj, filepath, override=False, **kwargs):
        """
        Dump a dictionary into a file. (Extensions: XYZ or PDB)

        Parameters
        ----------
        obj : :class:`dict`
            A dictionary containing molecular information.

        filepath : :class:`str`
           The filepath for the dumped file.

        override : :class:`bool`
           If True, any file in the filepath will be override. (default=False)
        """
        # First we check if the file already exists. If yes and the override
        # keyword is False (default), we will raise an exception. Otherwise
        # the file will be overwritten.
        if override is False:
            if os.path.isfile(filepath):
                raise _FileAlreadyExists(
                    "The file {0} already exists. Use a different filepath, "
                    "or set the 'override' kwarg to True.".format(filepath))
        if str(filepath[-3:]) not in self._save_funcs.keys():
            raise _FileTypeError(
                "The {0} file extension is "
                "not supported for dumping a MolecularSystem or a Molecule. "
                "Please use XYZ or PDB.".format(str(filepath[-3:])))
        self._save_funcs[str(filepath[-3:])](obj, filepath, **kwargs)

    def _save_xyz(self, system, filepath, **kwargs):
        """"""
        # Initial settings.
        settings = {
            'elements': 'elements',
            'coordinates': 'coordinates',
            'remark': " ",
            'decipher': False,
            'forcefield': None,
        }
        settings.update(kwargs)
        # Extract neccessary data.
        elements = system['elements']
        coordinates = system['coordinates']
        if settings['decipher'] is True:
            elements = np.array([
                decipher_atom_key(
                    key, forcefield=settings['forcefield']) for key in elements
            ])
        string = '{0:0d}\n{1}\n'.format(len(elements), str(settings['remark']))
        for i, j in zip(elements, coordinates):
            string += '{0} {1:.2f} {2:.2f} {3:.2f}\n'.format(i, *j)
        with open(filepath, 'w+') as file_:
            file_.write(string)

    def _save_pdb(self, system, filepath, **kwargs):
        """"""
        settings = {
            'atom_ids': 'atom_ids',
            'elements': 'elements',
            'coordinates': 'coordinates',
            'cryst': 'unit_cell',
            'connect': None,
            'remarks': None,
            'space_group': None,
            'resName': "MOL",
            'chainID': 'A',
            'resSeq': 1,
            'decipher': False,
            'forcefield': None,
        }
        settings.update(kwargs)
        # We create initial string that we will gradually extend while we
        # process the data and in the end it will be written into a pdb file.
        string = "REMARK File generated using pyWINDOW."
        # Number of items (atoms) in the provided system.
        len_ = system[settings['atom_ids']].shape[0]
        # We process the remarks, if any, given by the user (optional).
        if isinstance(settings['remarks'], (list, tuple)):
            # If a list or tuple of remarks each is written at a new line
            # with the REMARK prefix not to have to long remark line.
            for remark in settings['remarks']:
                string = "\n".join([string, 'REMARK {0}'.format(remark)])
        else:
            # Otherwise if it's a single string or an int/float we just write
            # it under single remark line, otherwise nothing happens.
            if isinstance(settings['remarks'], (str, int, float)):
                remark = settings['remarks']
                string = "\n".join([string, 'REMARK {0}'.format(remark)])
        # If there is a unit cell (crystal data) provided we need to add it.
        if settings['cryst'] in system.keys():
            if system[settings['cryst']].any():
                cryst_line = "CRYST1"
                cryst = system[settings['cryst']]
                # The user have to provide the crystal data as a list/array
                # of six items containing unit cell edges lengths a, b and c
                # in x, y and z directions and three angles, or it can be.
                # Other options are not allowed for simplicity. It can convert
                # from the lattice array using function from utilities.
                for i in cryst[:3]:
                    cryst_line = "".join([cryst_line, "{0:9.3f}".format(i)])
                for i in cryst[3:]:
                    cryst_line = "".join([cryst_line, "{0:7.2f}".format(i)])
                # This is kind of messy, by default the data written in PDB
                # file should be P1 symmetry group therefore containing all
                # atom coordinates and not considering symmetry operations.
                # But, user can still define a space group if he wishes to.
                if settings['space_group'] is not None:
                    space_group = settings['space_group']
                else:
                    space_group = "{0}".format("P1")
                cryst_line = " ".join([cryst_line, space_group])
                # We add the unit cell parameters to the main string.
                string = "\n".join([string, cryst_line])
        # For the sake of code readability we extract interesting data from the
        # system. Atom_ids are the atom ids written at the third column of a
        # PDB file and the user has here the freedom to use the forcefield
        # assigned ones. However, they have to specify it directly using the
        # atom_ids key. Otherwise, the 'elements' array from system object
        # will be used, that is also used for elements in the last column of
        # a PDB file. Other parameters like residue name (resName), chain id
        # (chainID) and residue sequence (resSeq) can be controlled by
        # appropriate parameter keyword passed to this function, Otherwise
        # the default values from settings dictionary are used.
        atom_ids = system[settings['atom_ids']]
        elements = system[settings['elements']]
        # If the 'elements' array of the system need deciphering atom keys this
        # is done if the user sets decipher to True. They can also provided
        # forcefield, otherwise it's None which equals to DLF.
        if settings['decipher'] is True:
            elements = np.array([
                decipher_atom_key(
                    key, forcefield=settings['forcefield']) for key in elements
            ])
        coordinates = system[settings['coordinates']]
        for i in range(len_):
            atom_line = "ATOM  {0:5d}".format(i + 1)
            atom_id = "{0:4}".format(atom_ids[i].center(4))
            resName = "{0:3}".format(settings['resName'])
            chainID = settings['chainID']
            atom_line = " ".join([atom_line, atom_id, resName, chainID])
            resSeq = str(settings['resSeq']).rjust(4)
            atom_line = "".join([atom_line, resSeq])
            coor = "{0:8.3f}{1:8.3f}{2:8.3f}".format(
                coordinates[i][0],
                coordinates[i][1],
                coordinates[i][2], )
            atom_line = "    ".join([atom_line, coor])
            big_space = "{0}".format(" ".center(22))
            element = "{0:2}  ".format(elements[i].rjust(2))
            atom_line = "".join([atom_line, big_space, element])
            string = "\n".join([string, atom_line])
        # The connectivity part is to be written after a function calculating
        # connectivity is finished
        # "Everything that has a beginning has an end" by Neo. :)
        string = "\n".join([string, 'END'])
        # Check if .pdb extension is missing from filepath.
        if filepath[-4:].lower() != '.pdb':
            filepath = ".".join((filepath, 'pdb'))
        # Write the string to a a PDB file.
        with open(filepath, 'w+') as file:
            file.write(string)