From 1d21583cee08f0f5af25d6b92a4feadac5556798 Mon Sep 17 00:00:00 2001
From: Luna Pratali Maffei
Date: Fri, 11 Oct 2024 17:52:11 +0200
Subject: [PATCH] super functional groups for aromatics

---
 automol/graph/_3super_func_group.py    | 236 +++++++++++++++++++++++++
 automol/tests/test_super_func_group.py | 117 ++++++++++++
 2 files changed, 353 insertions(+)
 create mode 100644 automol/graph/_3super_func_group.py
 create mode 100644 automol/tests/test_super_func_group.py

diff --git a/automol/graph/_3super_func_group.py b/automol/graph/_3super_func_group.py
new file mode 100644
index 00000000..3da88fec
--- /dev/null
+++ b/automol/graph/_3super_func_group.py
@@ -0,0 +1,236 @@
+"""
+Assign species class based on CRECK classification
+Implemented for aromatic species
+Features: automatically recognize functional groups from species InChI
+Identify them in graphs, and quantify how many there are
+"""
+import itertools
+
+from .base import (
+    # base functions
+    implicit,
+    # classification functions
+    bonds_of_type,
+    functional_group_dct
+)
+
+
+def gra_has_grp(gra, grp):
+    """ filter species based on the presence of a functional group
+
+        :param gra: molecular graph
+        :param grp: super functional group label, e.g., 'A1,OH-M'
+        :type grp: str
+        :returns: True if the graph has at least one group of this type
+        :rtype: bool
+    """
+    return grp in classify_species(gra)
+
+
+def classify_species(gra):
+    """ uses the SuperFunctionalGroup to classify a species
+        according to species classes
+
+        :param gra: molecular graph
+        :returns: number of groups for each type found in the graph;
+            types that are not found are left out
+        :rtype: dict[str: int]
+    """
+    fc_grps = SuperFunctionalGroup()
+    fc_grps.assign_grps(gra)
+    fc_grps.count_grps()
+    return fc_grps.dct_count_grps
+
+
+# super functional group label -> key of functional_group_dct
+BASE_GRP_DCT = {
+    'C5-M': "cyclopentadiene",
+    'C5O-M': "cyclopentadienone",
+    'C5CH2-M': "fulvene",
+    'FUR-M': "furan",
+    'C5-RSR': "cyclopentadienyl",
+    'C5H2-RSR': "cyclopentenyl",
+    # SUBSTITUTED C5 RINGS
+    'C5O-RSR': "cyclopentadienonyl",
+    # AROMATICS
+    'A1-M': "benzene",
+    'A1-R': "phenyl",
+    # SUBSTITUTED AROMATICS
+    'A1CH2-RSR': "benzyl",
+    # OXYGENATED AROMATICS
+    'A1O-RSR': "phenoxy",
+}
+
+# substituent label -> key of functional_group_dct
+SUBSTITUENTS_GRP_DCT = {
+    'OH': "alcohol",
+    'CHO': "aldehyde",
+    'CH3': "methyl",
+    'C2H': "alkyne",
+    'C2H3': "alkene",
+    'C2H5': "alkane",
+    'C3.DD': "allene",
+    'C3.ST': "propyne",
+    'OCH3': "alkoxy_oc",
+}
+
+# POTENTIALLY, THE COMPOSITE GROUP LIST CAN BE MADE OF ALL THE STRUCTURES
+# FROM THE BASE GROUP DICTIONARY COMBINED WITH ANY NUMBER AND TYPE OF
+# SUBSTITUENTS. BUT THIS MAKES THE LIST SIMPLER AND MORE EFFECTIVE
+# AND THE CODE FASTER
+# label format: '<base>,<substituent>[,<substituent>...]-<base type>'
+COMPOSITE_GRP_LIST = [
+    # molecules - alkylated
+    'C5,CH3-M',
+    'A1,CH3-M',
+    'A1,C2H-M',
+    'A1,C2H3-M',
+    'A1,C3.DD-M',
+    'A1,C3.ST-M',
+    # molecules - oxygenated
+    'C5,OH-M',
+    'A1,OH-M',
+    'A1,OH,OH-M',
+    'A1,OH,CHO-M',
+    'A1,OH,OCH3-M',
+    'A1,CHO-M',
+    'A1,OCH3-M',
+    # radicals
+    'C5,CH3-RSR',
+    'A1,CH3-R',
+    'A1,OH-R',
+    'A1O,OH-RSR',
+]
+
+
+class SuperFunctionalGroup:
+    """ super functional groups composed of combinations of basic
+        functional groups; classification reflects that adopted in
+        CRECK model for aromatic hydrocarbons
+
+        grp_fct_dct: output of functional_group_dct for the last graph
+        sup_grps: atom groups found for each super functional group
+        dct_count_grps: number of groups for each type that was found
+    """
+
+    def __init__(self):
+        # set here so that they can be read before the first assignment
+        self.grp_fct_dct = {}
+        self.sup_grps = {}
+        self.dct_count_grps = {}
+
+    def assign_grps(self, gra, verbose=False):
+        """ find the base and composite groups of a graph,
+            and store them in self.sup_grps
+
+            :param gra: molecular graph
+            :param verbose: print the groups that were found
+            :type verbose: bool
+        """
+        # call functional group dct
+        self.grp_fct_dct = functional_group_dct(gra)
+
+        # assign base groups
+        self.sup_grps = {
+            key: self.grp_fct_dct[fct] for key, fct in BASE_GRP_DCT.items()}
+
+        # assign substituents
+        subs_fct_dct = {
+            key: self.grp_fct_dct[fct]
+            for key, fct in SUBSTITUENTS_GRP_DCT.items()}
+
+        # TODO: check C6H5C2H2, C6H5C2H4
+        # assign composite
+        for comp_grp in COMPOSITE_GRP_LIST:
+            # e.g., 'A1,OH,CHO-M' -> base 'A1-M', subs ['OH', 'CHO']
+            base_and_subs, base_type = comp_grp.split('-')
+            base_name, *subs = base_and_subs.split(',')
+            # base groups to search substituents in
+            base_grps = self.sup_grps[base_name + '-' + base_type]
+            for sub in subs:
+                # intersection becomes the new base_grps;
+                # filter by bond type, e.g., C-C, C-O..
+                # (first character of the label is used as atom symbol)
+                # with bonded_grps only: fails for OCH3
+                # (CH2-O bonded to an aromatic would work too)
+                base_grps = bonded_grps_checksymb(
+                    gra, base_grps, subs_fct_dct[sub], "C", sub[0])
+            # add to dct
+            self.sup_grps[comp_grp] = base_grps
+
+        if verbose:
+            for dct in (self.grp_fct_dct, self.sup_grps):
+                for key, val in dct.items():
+                    if val:
+                        print("'" + key + "':", val, ",")
+                print('\n')
+
+    def count_grps(self):
+        """ count the groups of each type and store the result in
+            self.dct_count_grps; types with no groups are left out
+        """
+        self.dct_count_grps = {
+            fgrp: len(grp_idx_lst)
+            for fgrp, grp_idx_lst in self.sup_grps.items() if grp_idx_lst}
+
+
+def bonded_grps(gra, grps1, grps2):
+    """ check if there is a bond between group1 and group2 of atoms
+        in a graph; return tuple of bonded groups
+
+        grps1, grps2: tuple(tuple), groups of bonded atoms
+    """
+    return _merge_bonded_grps(gra, grps1, grps2)
+
+
+def bonded_grps_checksymb(gra, grps1, grps2, symb1, symb2):
+    """ check if there is a bond between group1 and group2 of atoms
+        in a graph; return tuple of bonded groups
+
+        grps1, grps2: tuple(tuple), groups of bonded atoms
+        symb1, symb2: atom symbols of the bonded group sym1-sym2
+    """
+    # NOTE(review): bonds are compared as (group1 atom, group2 atom)
+    # pairs; presumably bonds_of_type returns the symb1 atom first -
+    # confirm against its implementation
+    correct_bonds = set(bonds_of_type(gra, symb1, symb2))
+    if not correct_bonds:
+        return ()
+    return _merge_bonded_grps(gra, grps1, grps2, correct_bonds)
+
+
+def _merge_bonded_grps(gra, grps1, grps2, allowed_bonds=None):
+    """ merge each group of grps1 with the groups of grps2 bonded to it
+
+        atoms that are not keys of the implicit graph are dropped, and
+        so are atoms of group2 already in group1; merged groups with
+        the same atoms are returned only once
+
+        allowed_bonds: if given, one of the (group1 atom, group2 atom)
+            pairs bonded in the graph must belong to this set
+    """
+    # set, for fast membership tests inside the loops
+    heavy_atms = set(implicit(gra)[0])
+    bnd_dct = gra[1]
+
+    grps = ()
+    seen = set()
+    for grp1 in grps1:
+        # keep only heavy atoms
+        atms1 = tuple(atm for atm in grp1 if atm in heavy_atms)
+        for grp2 in grps2:
+            atms2 = tuple(atm for atm in grp2
+                          if atm in heavy_atms and atm not in atms1)
+            bonded = any(
+                frozenset(bond) in bnd_dct
+                and (allowed_bonds is None or bond in allowed_bonds)
+                for bond in itertools.product(atms1, atms2))
+            if not bonded:
+                continue
+            # same atoms in a different order: same group
+            grp_key = tuple(sorted(atms1 + atms2))
+            if grp_key not in seen:
+                seen.add(grp_key)
+                grps += (atms1 + atms2,)
+
+    return grps
diff --git a/automol/tests/test_super_func_group.py b/automol/tests/test_super_func_group.py
new file mode 100644
index 00000000..b96b4f41
--- /dev/null
+++ b/automol/tests/test_super_func_group.py
@@ -0,0 +1,117 @@
+"""
+  Tests the super functional groups
+"""
+
+import automol
+# imported from its own module, so that it is found whether or not
+# automol.graph re-exports it
+from automol.graph._3super_func_group import SuperFunctionalGroup
+
+SPCS_CHECKS_SMI = {
+    'C6H5': 'C=1C=C[C]=CC=1',
+    'AMN': 'Cc1ccc2ccccc2c1',
+    'INDENYL': 'C=1C=CC=2C=C[CH]C=2C=1',
+    'C12H8': 'C=1C=C2C=CC=C3C=CC(C=1)=C32',
+    'C9H7O': '[O]C=1CC=C2C=CC=CC2=1',
+    'C5H4O': 'O=C1C=CC=C1',
+    'OC6H4CH3': 'CC1=CC=CC=C1[O]',
+    'HOC6H4CH3': 'CC1=CC=CC(=C1)O',
+    'C6H5CH2OOH': 'OOCC=1C=CC=CC=1',
+    'BZFUR': 'C=1C=CC2=C(C=1)C=CO2',
+    'C10H7CH2': '[CH2]C1=CC=CC=2C=CC=CC1=2',
+    'C6H5C2H3': 'C=CC1=CC=CC=C1',
+    'C10H9': 'C=1C=CC=2[CH]CC=CC=2C=1',
+    'CH3C6H4': 'CC1=CC=CC=[C]1',
+    'C6H5CCC6H5': 'C=1C=CC(=CC=1)C#CC2=CC=CC=C2',
+    'C6H5C3H3-A': 'C=C=CC1=CC=CC=C1',
+    'C6H5C3H3-P': 'C#CCC1=CC=CC=C1',
+    'CYC5H7': 'C1=CCC[CH]1',
+    'MEINDENYL': 'C[C]1C=CC=2C=CC=CC=21',
+    'BENZOFLUORENE': 'C=1C=CC2=C(C=1)C=CC=3CC=4C=CC=CC=4C=32',
+    'C6H5OCH3': 'COC1=CC=CC=C1',
+    'CATECHOL': 'OC=1C=CC=CC=1O',
+    'SALICALD': 'O=CC1=CC=CC=C1O',
+}
+
+# atom groups expected in sup_grps, for each species and group label
+SPCS_GRPS = {
+    'C6H5': {'A1-R': ((0, 1, 3, 5, 4, 2),)},
+    'INDENYL': {'C5-RSR': ((2, 5, 7, 8, 6),)},
+    'C12H8': {
+        'A1-M': ((0, 2, 8, 11, 9, 4), (1, 3, 8, 11, 10, 5)),
+    },
+    'C9H7O': {'C5O-RSR': ((4, 5, 8, 7, 6, 9),)},
+    'C5H4O': {'C5O-M': ((0, 1, 3, 4, 2, 5),)},
+    'OC6H4CH3': {'A1O-RSR': ((1, 2, 4, 6, 5, 3, 7),)},
+    'HOC6H4CH3': {
+        'A1-M': ((1, 2, 5, 4, 6, 3),),
+        'A1,OH-M': ((1, 2, 5, 4, 6, 3, 7),),
+    },  # others to be added
+    'C6H5CH2OOH': {'A1-M': ((0, 1, 3, 6, 4, 2),)},
+    'BZFUR': {
+        'FUR-M': ((4, 5, 8, 7, 6),),
+        'A1-M': ((0, 1, 3, 7, 6, 2),),
+    },
+    'C10H7CH2': {'A1CH2-RSR': ((3, 4, 8, 10, 9, 6, 0),)},
+    'C6H5C2H3': {'A1-M': ((2, 3, 5, 7, 6, 4),)},
+    'C10H9': {
+        'A1CH2-RSR': ((0, 1, 5, 9, 8, 4, 7), (0, 1, 5, 9, 8, 4, 6)),
+    },
+    'CH3C6H4': {'A1-R': ((1, 2, 4, 6, 5, 3),)},
+    'C6H5CCC6H5': {
+        'A1-M': ((1, 4, 8, 13, 9, 5), (0, 2, 6, 12, 7, 3)),
+    },
+    'C6H5C3H3-A': {
+        'A1-M': ((2, 3, 6, 8, 7, 4),),
+        'A1,C3.DD-M': ((2, 3, 6, 8, 7, 4, 0, 1, 5),),
+    },
+    'C6H5C3H3-P': {
+        'A1-M': ((2, 3, 6, 8, 7, 4),),
+        'A1,C3.ST-M': ((2, 3, 6, 8, 7, 4, 0, 1, 5),),
+    },
+    'CYC5H7': {'C5H2-RSR': ((0, 1, 3, 4, 2),)},
+    'MEINDENYL': {'C5-RSR': ((5, 6, 8, 9, 7),)},
+    'BENZOFLUORENE': {
+        'C5-M': ((10, 12, 15, 16, 13),),
+        'A1-M': (
+            (0, 2, 6, 14, 11, 4),
+            (8, 9, 13, 16, 14, 11),
+            (1, 3, 7, 15, 12, 5),
+        ),
+    },
+    'C6H5OCH3': {
+        'A1-M': ((1, 2, 4, 6, 5, 3),),
+        'A1,OCH3-M': ((1, 2, 4, 6, 5, 3, 7, 0),),
+    },
+    'CATECHOL': {
+        'A1-M': ((0, 1, 3, 5, 4, 2),),
+        'A1,OH-M': ((0, 1, 3, 5, 4, 2, 6), (0, 1, 3, 5, 4, 2, 7)),
+        'A1,OH,OH-M': ((0, 1, 3, 5, 4, 2, 6, 7),),
+    },
+    'SALICALD': {
+        'A1-M': ((0, 1, 3, 6, 5, 2),),
+        'A1,OH-M': ((0, 1, 3, 6, 5, 2, 8),),
+    },
+}
+
+
+def _graph_from_smiles(smi):
+    """ graph of the geometry generated from a SMILES string
+    """
+    return automol.geom.graph(
+        automol.chi.geometry(automol.smiles.chi(smi)))
+
+
+def test_super_functional_group_dct():
+    """ test SuperFunctionalGroup.assign_grps
+    """
+    for spc, ref_dct in SPCS_GRPS.items():
+        gra = _graph_from_smiles(SPCS_CHECKS_SMI[spc])
+        fgrps = SuperFunctionalGroup()
+        fgrps.assign_grps(gra)
+        for key, val in ref_dct.items():
+            assert val == fgrps.sup_grps[key], (spc, key)
+
+
+if __name__ == '__main__':
+    test_super_functional_group_dct()