Spaces:

sandl
/

private_polymer_compound_prediction

Sleeping

File size: 5,305 Bytes

4f5540c

import os, pickle
import qml
import numpy as np
import pandas as pd
from tqdm import tqdm
from polymerlearn.utils.comparison_rep.element_info import *

import ase
from dscribe.descriptors import SOAP
from dscribe.descriptors import MBTR as MBTR_

import pyarrow.feather as feather

def Atom_XYZ(xyz_list):
    '''
    Parses the atom lines of one XYZ chunk.

    xyz_list: iterable of strings, one per atom, each laid out as
        "<element> <x> <y> <z>" with fields separated by spaces and/or tabs.
        Leading/trailing whitespace (including the line ending) is ignored,
        columns after z are ignored, and blank lines are skipped.

    Returns (atoms, charges, coords):
        atoms:   list of element symbols, in input order
        charges: list with AtomicNumber(symbol) for every atom
        coords:  float ndarray of shape (n_atoms, 3)

    Raises IndexError if a non-blank line has fewer than four fields and
    ValueError if a coordinate cannot be parsed as a float.
    '''
    atoms = []
    charges = []
    rows = []

    for line in xyz_list:
        # split() with no separator trims the line and collapses any run of
        # spaces/tabs. Unlike slicing off the last character, it cannot eat a
        # digit when the final line of a file has no trailing newline.
        fields = line.split()
        if not fields:
            continue

        symbol = fields[0]
        # Convert to float here: appending the raw strings to a float array
        # makes numpy promote the whole coordinate array to a string dtype.
        xyz = [float(fields[1]), float(fields[2]), float(fields[3])]

        atoms.append(symbol)
        rows.append(xyz)
        charges.append(AtomicNumber(symbol))

    # Build the array once (np.append in a loop re-copies it every iteration);
    # the reshape keeps the (0, 3) shape for empty input.
    coords = np.array(rows, dtype=float).reshape(-1, 3)

    return atoms, charges, coords

def CM(new_struct):
    '''
    Builds the Coulomb matrix for one structure via qml.

    new_struct: one chunk of XYZ file (atom lines only)

    Returns whatever qml.representations.generate_coulomb_matrix produces
    when sized to the number of atoms and sorted by 'row-norm'.
    '''
    symbols, nuclear_charges, positions = Atom_XYZ(new_struct)
    n_atoms = len(symbols)

    return qml.representations.generate_coulomb_matrix(
        nuclear_charges=nuclear_charges,
        coordinates=positions,
        size=n_atoms,
        sorting='row-norm',
    )

def BOB(new_struct):
    '''
    Builds the qml "bob" representation for one structure.

    new_struct: one chunk of XYZ file (atom lines only)

    Returns whatever qml.representations.generate_bob produces, sized to the
    number of atoms, with per-element counts taken from this structure.
    '''
    symbols, nuclear_charges, positions = Atom_XYZ(new_struct)

    # Number of occurrences of every element symbol, keyed in order of
    # first appearance.
    element_counts = {}
    for symbol in symbols:
        element_counts[symbol] = element_counts.get(symbol, 0) + 1

    element_types = np.unique(np.asarray(symbols))

    return qml.representations.generate_bob(
        nuclear_charges=nuclear_charges,
        coordinates=positions,
        atomtypes=element_types,
        size=len(symbols),
        asize=element_counts,
    )

def mySOAP(new_struct):
    '''
    Builds a flattened SOAP descriptor (dscribe) for one structure.

    new_struct: one chunk of XYZ file (atom lines only)

    The descriptor is non-periodic with rcut=3.0, nmax=5, lmax=4, and is
    evaluated at the atom coordinates themselves. Returns the result of
    create() flattened to 1-D.
    '''
    symbols, atomic_numbers, xyz = Atom_XYZ(new_struct)

    descriptor = SOAP(
        species=np.unique(np.asarray(symbols)),
        periodic=False,
        rcut=3.0,
        nmax=5,
        lmax=4,
    )

    molecule = ase.Atoms(positions=xyz, numbers=atomic_numbers)
    features = descriptor.create(system=molecule, positions=xyz, n_jobs=1)

    return features.flatten()

def MBTR(new_struct):
    '''
    Builds an MBTR descriptor (dscribe) for one structure.

    new_struct: one chunk of XYZ file (atom lines only)

    Uses k1 (atomic number), k2 (inverse distance) and k3 (cosine) terms,
    each on a 100-point grid with sigma 0.1, non-periodic, "l2_each"
    normalization and flattened output. Returns the result of create().
    '''
    symbols, atomic_numbers, xyz = Atom_XYZ(new_struct)
    element_types = np.unique(np.asarray(symbols))

    k1_setup = {
        "geometry": {"function": "atomic_number"},
        "grid": {"min": 0, "max": 8, "n": 100, "sigma": 0.1},
    }
    k2_setup = {
        "geometry": {"function": "inverse_distance"},
        "grid": {"min": 0, "max": 1, "n": 100, "sigma": 0.1},
        "weighting": {"function": "exponential", "scale": 0.5, "cutoff": 1e-3},
    }
    k3_setup = {
        "geometry": {"function": "cosine"},
        "grid": {"min": -1, "max": 1, "n": 100, "sigma": 0.1},
        "weighting": {"function": "exponential", "scale": 0.5, "cutoff": 1e-3},
    }

    descriptor = MBTR_(
        species=element_types,
        k1=k1_setup,
        k2=k2_setup,
        k3=k3_setup,
        periodic=False,
        normalization="l2_each",
        flatten=True,
    )

    molecule = ase.Atoms(positions=xyz, numbers=atomic_numbers)

    # The number of jobs is fixed at 1.
    return descriptor.create(system=molecule, n_jobs=1)

def get_one_top_xyz(filename):
    '''
    Gets the top XYZ chunk for the top conformer in file. Ready to input to representation generator

    filename: path to an XYZ file whose first line is the atom count and
        whose second line is a comment; further conformers may follow.

    Returns the list of atom lines (line endings kept) of the first
    conformer only. If the file holds fewer atom lines than the header
    declares, the shorter list is returned.

    Raises IndexError on an empty file and ValueError if the first line is
    not an integer.
    '''
    with open(filename) as f:
        ff = f.readlines()
    natoms = int(ff[0])

    # Line 0 is the atom count and line 1 the comment, so the natoms atom
    # records sit at indices 2 .. natoms + 1 inclusive. Stopping the slice at
    # natoms + 1 would silently drop the last atom.
    start = 2
    return ff[start:start + natoms]

def screen_build(all_AG, 
        xyz_loc = '../../../Structures/AG/xyz', 
        rep_dir_loc = '../../../Representations',
        reps_to_screen = ('MBTR',)):
    '''
    Screens all acids/glycols in a dataframe, builds representations of a given type

    all_AG: iterable of acid/glycol names; each name X is read from
        <xyz_loc>/X.xyz
    xyz_loc: directory holding the XYZ files
    rep_dir_loc: root output directory; results are written to
        <rep_dir_loc>/<rep>/AG/<name lowercased>.pickle
    reps_to_screen: iterable of representation names, each one of
        'CM', 'SOAP', 'BOB', 'MBTR'

    Names whose pickle file already exists are skipped, so the function can
    be re-run to fill in only the missing representations.

    Raises KeyError for an unknown representation name (when something has
    to be built for it) and propagates any error from reading the XYZ file
    or building the representation.
    '''

    gen_dict = {
        'CM': CM,
        'SOAP': mySOAP,
        'BOB': BOB,
        'MBTR': MBTR
    }

    for rep in reps_to_screen:
        print('REP', rep)
        # The output directory only depends on the representation.
        rloc_rep = os.path.join(rep_dir_loc, rep, 'AG')

        for ag in all_AG:
            pickle_path = os.path.join(rloc_rep, ag.lower() + '.pickle')
            if os.path.exists(pickle_path):
                continue

            fchunk = get_one_top_xyz(os.path.join(xyz_loc, ag + '.xyz'))
            mol = gen_dict[rep](fchunk)

            # Create the target directory on first use so a fresh checkout
            # does not fail on the first write.
            os.makedirs(rloc_rep, exist_ok=True)
            # Context manager guarantees the file is flushed and closed.
            with open(pickle_path, 'wb') as f:
                pickle.dump(mol, f)


if __name__ == '__main__':

    # Positional column ranges of the dataset that are read as acid and
    # glycol columns respectively.
    acid_cols = slice(20, 33)
    glycol_cols = slice(34, 46)

    data = pd.read_csv('../../../dataset/pub_data.csv')

    # The first character of every column header is dropped to get the name.
    acid_names = []
    for col in data.columns[acid_cols].tolist():
        if '95% trans' in col:
            continue
        acid_names.append(col[1:])
    glycol_names = [col[1:] for col in data.columns[glycol_cols].tolist()]

    full_names = acid_names + glycol_names

    # NOTE(review): xyz_loc is assigned but never passed to screen_build, which
    # therefore uses its own default directory -- confirm which one is intended.
    xyz_loc = '../../../Structures/AG'

    screen_build(full_names)