diff --git a/SynRD/features_extractor/__init__.py b/SynRD/features_extractor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/SynRD/features_extractor/features.json b/SynRD/features_extractor/features.json new file mode 100644 index 0000000..555464c --- /dev/null +++ b/SynRD/features_extractor/features.json @@ -0,0 +1,649 @@ +{ + "adult": { + "general": { + "attr_to_inst": 0.00042996222474739717, + "cat_to_num": 1.3333333333333333, + "freq_class.mean": 0.5, + "freq_class.sd": 0.3665506390973169, + "inst_to_attr": 2325.785714285714, + "nr_attr": 14, + "nr_bin": 1, + "nr_cat": 8, + "nr_class": 2, + "nr_inst": 32561, + "nr_num": 6, + "num_to_cat": 0.75 + }, + "info-theory": { + "attr_conc.mean": 0.0458436925407551, + "attr_conc.sd": 0.13948556412198465, + "attr_ent.mean": 2.2197528515718576, + "attr_ent.sd": 1.5139967849269165, + "class_conc.mean": 0.02703239097275378, + "class_conc.sd": 0.03320210419262275, + "class_ent": 0.7963839552022132, + "eq_num_attr": 12.006187880013178, + "joint_ent.mean": 2.9498056812610374, + "joint_ent.sd": 1.5033030990636995, + "mut_inf.mean": 0.0663311255130333, + "mut_inf.sd": 0.05319818541302973, + "ns_ratio": 32.464724658346135 + }, + "itemset": { + "one_itemset.mean": 0.07106598984771574, + "one_itemset.sd": 0.155956043719432, + "two_itemset.mean": 0.13962906444418358, + "two_itemset.sd": 0.19991073911332466 + }, + "statistical": { + "can_cor.mean": 0.6075382184616811, + "can_cor.sd": null, + "cor.mean": 0.020575612632741463, + "cor.sd": 0.04535153811741261, + "cov.mean": 211.1074182124332, + "cov.sd": 8025.120485282461, + "eigenvalues.mean": 110846564.91990046, + "eigenvalues.sd": 1108509665.939067, + "g_mean.mean": 1586.32115786632, + "g_mean.sd": 15933.907433185732, + "gravity": 4511.757083547598, + "h_mean.mean": 1251.6757800002574, + "h_mean.sd": 12571.548183529578, + "iq_range.mean": 1180.7722772277227, + "iq_range.sd": 11863.197745870368, + "kurtosis.mean": 834.6502878290377, + "kurtosis.sd": 3300.235280828971, + "lh_trace": 0.5850439987975836, + "mad.mean": 879.4019881188119, + "mad.sd": 8835.794743563976, + "max.mean": 15736.237623762376, + "max.sd": 147964.76551477704, + "mean.mean": 1891.4795428899488, + "mean.sd": 18882.70462422157, + "median.mean": 1766.8019801980197, + "median.sd": 17746.995603774634, + "min.mean": 121.82178217821782, + "min.sd": 1222.3854579962863, + "nr_cor_attr": 0.0011881188118811883, + "nr_disc": 1, + "nr_norm": 3.0, + "nr_outliers": 95, + "p_trace": 0.36910268689159337, + "range.mean": 15614.415841584158, + "range.sd": 146745.1749871366, + "roy_root": 0.5850439987975836, + "sd.mean": 1122.5829209841374, + "sd.sd": 10520.562543441738, + "sd_ratio": null, + "skewness.mean": 17.616097299843247, + "skewness.sd": 23.04905261843532, + "sparsity.mean": 0.09852711143124308, + "sparsity.sd": 0.13239694804959984, + "t_mean.mean": 1758.7537427080476, + "t_mean.sd": 17665.95817440692, + "var.mean": 110846564.91990048, + "var.sd": 1108509663.0036533, + "w_lambda": 0.6308973131084066 + } + }, + "fairman2019marijuana": { + "general": { + "attr_to_inst": 0.00001703107489926119, + "cat_to_num": 4.0, + "freq_class.mean": 0.16666666666666666, + "freq_class.sd": 0.14470247838587796, + "inst_to_attr": 58716.2, + "nr_attr": 5, + "nr_bin": 1, + "nr_cat": 4, + "nr_class": 6, + "nr_inst": 293581, + "nr_num": 1, + "num_to_cat": 0.25 + }, + "info-theory": { + "attr_conc.mean": 0.00862284316796299, + "attr_conc.sd": 0.024786672418147204, + "attr_ent.mean": 2.521293122340193, + "attr_ent.sd": 1.083027203766085, + "class_conc.mean": 0.07817273846496639, + "class_conc.sd": 0.14361253242900696, + "class_ent": 2.1567101571814913, + "eq_num_attr": 8.469380238328613, + "joint_ent.mean": 4.423355348210112, + "joint_ent.sd": 1.0000881548804132, + "mut_inf.mean": 0.25464793131157215, + "mut_inf.sd": 0.4320763698298892, + "ns_ratio": 8.90109406879606 + }, + "itemset": { + "one_itemset.mean": 0.10869565217391304, + "one_itemset.sd": 0.1291662893565054, + "two_itemset.mean": 0.2068095838587642, + "two_itemset.sd": 0.15217973907789384 + }, + "statistical": { + "can_cor.mean": 0.28979750344903177, + "can_cor.sd": 0.36940043439051734, + "cor.mean": 0.033916475598542994, + "cor.sd": 0.04940358423991378, + "cov.mean": 0.01143781554280118, + "cov.sd": 0.05534029193569672, + "eigenvalues.mean": 1.86415005899953, + "eigenvalues.sd": 9.414023090683303, + "g_mean.mean": null, + "g_mean.sd": null, + "gravity": 13.446017977834678, + "h_mean.mean": 0.0, + "h_mean.sd": 0.0, + "iq_range.mean": 0.5357142857142857, + "iq_range.sd": 2.645501322750992, + "kurtosis.mean": 15.126693058873796, + "kurtosis.sd": 38.79828388336963, + "lh_trace": 8.852862245584198, + "mad.mean": 0.26475, + "mad.sd": 1.4009253192087006, + "max.mean": 1.7142857142857142, + "max.sd": 3.7796447300922718, + "mean.mean": 0.3903238921164127, + "mean.sd": 1.518058035058907, + "median.mean": 0.39285714285714285, + "median.sd": 2.0788046015507495, + "min.mean": 0.0, + "min.sd": 0.0, + "nr_cor_attr": 0.0, + "nr_disc": 5, + "nr_norm": 11.0, + "nr_outliers": 26, + "p_trace": 0.9657396887380695, + "range.mean": 1.7142857142857142, + "range.sd": 3.7796447300922718, + "roy_root": 8.783409166902468, + "sd.mean": 0.5252950734977363, + "sd.sd": 1.2833697549409835, + "sd_ratio": null, + "skewness.mean": 3.2629118443516076, + "skewness.sd": 2.590241657401533, + "sparsity.mean": 0.17584134858261163, + "sparsity.sd": 0.1844815935639719, + "t_mean.mean": 0.30357294919966293, + "t_mean.sd": 1.512107557228859, + "var.mean": 1.8641500589990838, + "var.sd": 9.409284946491422, + "w_lambda": 0.09542781732881896 + } + }, + "fruiht2018naturally": { + "general": { + "attr_to_inst": 0.0023963575365444525, + "cat_to_num": 0.0, + "freq_class.mean": 0.07692307692307693, + "freq_class.sd": 0.09732973624554023, + "inst_to_attr": 417.3, + "nr_attr": 10, + "nr_bin": 8, + "nr_cat": 0, + "nr_class": 13, + "nr_inst": 4173, + "nr_num": 10, + "num_to_cat": null + }, + "info-theory": { + "attr_conc.mean": 0.08443177796724899, + "attr_conc.sd": 0.22978046900859486, + "attr_ent.mean": 1.0156816369401709, + "attr_ent.sd": 0.6593692099153022, + "class_conc.mean": 0.12440914830592446, + "class_conc.sd": 0.3109366084954623, + "class_ent": 2.8175661528428, + "eq_num_attr": 27.168989187285476, + "joint_ent.mean": 3.72954256418844, + "joint_ent.sd": 0.7277575442618792, + "mut_inf.mean": 0.1037052255945305, + "mut_inf.sd": 0.2561992452064771, + "ns_ratio": 8.793929198045529 + }, + "itemset": { + "one_itemset.mean": 0.37037037037037035, + "one_itemset.sd": 0.28483736758582634, + "two_itemset.mean": 0.48417721518987344, + "two_itemset.sd": 0.20428261038074555 + }, + "statistical": { + "can_cor.mean": 0.16798613010999006, + "can_cor.sd": 0.30500122840629146, + "cor.mean": 0.1274799626397623, + "cor.sd": 0.18247284301061095, + "cov.mean": 0.030737729912916505, + "cov.sd": 0.04508699250430289, + "eigenvalues.mean": 0.5260493369349912, + "eigenvalues.sd": 0.9527701055836804, + "g_mean.mean": 1.5355053816343125, + "g_mean.sd": 4.855694365410508, + "gravity": 1.3251618277297676, + "h_mean.mean": 1.5251679969670537, + "h_mean.sd": 4.823004684812668, + "iq_range.mean": 0.8, + "iq_range.sd": 0.9189365834726815, + "kurtosis.mean": 1.7466195684725043, + "kurtosis.sd": 5.476645027785524, + "lh_trace": 175703465.65490502, + "mad.mean": 0.14826, + "mad.sd": 0.4688392858965639, + "max.mean": 3.4, + "max.sd": 6.310485101972924, + "mean.mean": 2.0397555715312725, + "mean.sd": 4.7284946820689155, + "median.mean": 2.1, + "median.sd": 4.557045826702499, + "min.mean": 1.1, + "min.sd": 3.478505426185217, + "nr_cor_attr": 0.06666666666666667, + "nr_disc": 10, + "nr_norm": 0.0, + "nr_outliers": 5, + "p_trace": 1.119425143057426, + "range.mean": 2.3, + "range.sd": 2.9832867780352594, + "roy_root": 175703465.5260015, + "sd.mean": 0.5881618650527289, + "sd.sd": 0.44735637972511794, + "sd_ratio": null, + "skewness.mean": 0.6322671598235938, + "skewness.sd": 1.7834962915526746, + "sparsity.mean": 0.4256199337575176, + "sparsity.sd": 0.1575698045779039, + "t_mean.mean": 2.0454690618762474, + "t_mean.sd": 4.73431893634828, + "var.mean": 0.5260493369349921, + "var.sd": 0.9372545899022917, + "w_lambda": 5.027594768321065E-9 + } + }, + "iverson22football": { + "general": { + "attr_to_inst": 0.014755959137343927, + "cat_to_num": 0.0, + "freq_class.mean": 0.5, + "freq_class.sd": 0.46070294256649064, + "inst_to_attr": 67.76923076923077, + "nr_attr": 26, + "nr_bin": 1, + "nr_cat": 0, + "nr_class": 2, + "nr_inst": 1762, + "nr_num": 26, + "num_to_cat": null + }, + "info-theory": { + "attr_conc.mean": 0.25323533267430237, + "attr_conc.sd": 0.24824770519055797, + "attr_ent.mean": 0.4985664656944789, + "attr_ent.sd": 0.9924918415361424, + "class_conc.mean": 0.35137502737347687, + "class_conc.sd": 0.22793497984259123, + "class_ent": 0.6672989356393069, + "eq_num_attr": 189.4880525349412, + "joint_ent.mean": 1.1623438128921282, + "joint_ent.sd": 0.9914437226702077, + "mut_inf.mean": 0.0035215884416578626, + "mut_inf.sd": 0.00989859442438207, + "ns_ratio": 140.57431339698746 + }, + "itemset": { + "one_itemset.mean": 0.4406779661016949, + "one_itemset.sd": 0.42328118527056224, + "two_itemset.mean": 0.5191944619257395, + "two_itemset.sd": 0.3621615952024312 + }, + "statistical": { + "can_cor.mean": null, + "can_cor.sd": null, + "cor.mean": null, + "cor.sd": null, + "cov.mean": null, + "cov.sd": null, + "eigenvalues.mean": null, + "eigenvalues.sd": null, + "g_mean.mean": null, + "g_mean.sd": null, + "gravity": null, + "h_mean.mean": null, + "h_mean.sd": null, + "iq_range.mean": null, + "iq_range.sd": null, + "kurtosis.mean": null, + "kurtosis.sd": null, + "lh_trace": null, + "mad.mean": null, + "mad.sd": null, + "max.mean": null, + "max.sd": null, + "mean.mean": null, + "mean.sd": null, + "median.mean": null, + "median.sd": null, + "min.mean": null, + "min.sd": null, + "nr_cor_attr": 0.0, + "nr_disc": null, + "nr_norm": 18.0, + "nr_outliers": 4, + "p_trace": null, + "range.mean": null, + "range.sd": null, + "roy_root": null, + "sd.mean": null, + "sd.sd": null, + "sd_ratio": null, + "skewness.mean": null, + "skewness.sd": null, + "sparsity.mean": 0.2998697442692332, + "sparsity.sd": 0.17959195161730515, + "t_mean.mean": null, + "t_mean.sd": null, + "var.mean": null, + "var.sd": null, + "w_lambda": null + } + }, + "jeong2021math": { + "general": { + "attr_to_inst": 0.0037199415437757406, + "cat_to_num": 0.0, + "freq_class.mean": 0.5, + "freq_class.sd": 0.01324592216650769, + "inst_to_attr": 268.82142857142856, + "nr_attr": 56, + "nr_bin": 14, + "nr_cat": 0, + "nr_class": 2, + "nr_inst": 15054, + "nr_num": 56, + "num_to_cat": null + }, + "info-theory": { + "attr_conc.mean": 0.010090896851239785, + "attr_conc.sd": 0.033703906974763255, + "attr_ent.mean": 1.6774033809497375, + "attr_ent.sd": 1.1130854889416772, + "class_conc.mean": 0.016853385542393653, + "class_conc.sd": 0.06683286177402585, + "class_ent": 0.9997468579230723, + "eq_num_attr": 50.597586033055094, + "joint_ent.mean": 2.657391453201297, + "joint_ent.sd": 1.10145806941284, + "mut_inf.mean": 0.01975878567151294, + "mut_inf.sd": 0.025945288082994112, + "ns_ratio": 83.89405213641845 + }, + "itemset": { + "one_itemset.mean": 0.1728395061728395, + "one_itemset.sd": 0.21827694955900606, + "two_itemset.mean": 0.2904204594711747, + "two_itemset.sd": 0.22445489887183492 + }, + "statistical": { + "can_cor.mean": 0.5403579611152178, + "can_cor.sd": null, + "cor.mean": 0.06554819579321111, + "cor.sd": 0.09786276332742616, + "cov.mean": 0.16468940993285874, + "cov.sd": 1.0329496620057688, + "eigenvalues.mean": 5.269255637375979, + "eigenvalues.sd": 16.19876176989202, + "g_mean.mean": null, + "g_mean.sd": null, + "gravity": 5.282793071239359, + "h_mean.mean": 0.0, + "h_mean.sd": 0.0, + "iq_range.mean": 1.9116517857142858, + "iq_range.sd": 3.201587258220015, + "kurtosis.mean": 7.720150421216947, + "kurtosis.sd": 39.111865301024565, + "lh_trace": 0.41240289825212684, + "mad.mean": 1.1516625, + "mad.sd": 2.303620744110588, + "max.mean": 5.61625, + "max.sd": 7.8327054567835805, + "mean.mean": 2.5655758792157752, + "mean.sd": 4.365244583529911, + "median.mean": 2.514464285714286, + "median.sd": 4.392893401286324, + "min.mean": 0.0, + "min.sd": 0.0, + "nr_cor_attr": 0.016883116883116882, + "nr_disc": 1, + "nr_norm": 0.0, + "nr_outliers": 32, + "p_trace": 0.2919867261405953, + "range.mean": 5.61625, + "range.sd": 7.8327054567835805, + "roy_root": 0.41240289825212684, + "sd.mean": 1.3640819995400726, + "sd.sd": 1.8629303037310834, + "sd_ratio": 1.0200179969901564, + "skewness.mean": 0.3431678774719039, + "skewness.sd": 2.8750837173281245, + "sparsity.mean": 0.25647443354243105, + "sparsity.sd": 0.16495250052343224, + "t_mean.mean": 2.5380055306935705, + "t_mean.sd": 4.466928721617069, + "var.mean": 5.269255637375979, + "var.sd": 14.183393684074005, + "w_lambda": 0.7080132738594047 + } + }, + "lee2021ability": { + "general": { + "attr_to_inst": 0.000548885077186964, + "cat_to_num": 0.0, + "freq_class.mean": 0.00006961849067112225, + "freq_class.sd": 0.00000837287816198932, + "inst_to_attr": 1821.875, + "nr_attr": 8, + "nr_bin": 1, + "nr_cat": 0, + "nr_class": 14364, + "nr_inst": 14575, + "nr_num": 8, + "num_to_cat": null + }, + "info-theory": { + "attr_conc.mean": 0.0067981219400851745, + "attr_conc.sd": 0.015967117588566067, + "attr_ent.mean": 2.9264965465129515, + "attr_ent.sd": 1.2768155520476885, + "class_conc.mean": 0.9847774711734155, + "class_conc.sd": 0.0015393240740786877, + "class_ent": 13.802099195221773, + "eq_num_attr": 4.821933447480678, + "joint_ent.mean": 13.866237862773751, + "joint_ent.sd": 0.035495585619732886, + "mut_inf.mean": 2.8623578789609745, + "mut_inf.sd": 1.2421986714275994, + "ns_ratio": 0.022407633938233872 + }, + "itemset": { + "one_itemset.mean": 0.08602150537634405, + "one_itemset.sd": 0.10389229939062207, + "two_itemset.mean": 0.1674641148325359, + "two_itemset.sd": 0.1286196232436834 + }, + "statistical": null + }, + "mushrooms": { + "general": { + "attr_to_inst": 0.0027080256031511572, + "cat_to_num": 0.0, + "freq_class.mean": 0.00012309207287050715, + "freq_class.sd": 0.0, + "inst_to_attr": 369.27272727272725, + "nr_attr": 22, + "nr_bin": 0, + "nr_cat": 0, + "nr_class": 8124, + "nr_inst": 8124, + "nr_num": 22, + "num_to_cat": null + }, + "info-theory": { + "attr_conc.mean": 0.5, + "attr_conc.sd": 0.0, + "attr_ent.mean": 0.0, + "attr_ent.sd": 0.0, + "class_conc.mean": 0.5000203091753722, + "class_conc.sd": 1.136349493877995E-16, + "class_ent": 12.98797452429615, + "eq_num_attr": -13847.43282808942, + "joint_ent.mean": 12.98891245804066, + "joint_ent.sd": 0.0, + "mut_inf.mean": -0.000937933744509678, + "mut_inf.sd": 0.0, + "ns_ratio": -1.0 + }, + "itemset": { + "one_itemset.mean": 1.0, + "one_itemset.sd": 0.0, + "two_itemset.mean": 0.0, + "two_itemset.sd": 0.0 + }, + "statistical": { + "can_cor.mean": null, + "can_cor.sd": null, + "cor.mean": null, + "cor.sd": null, + "cov.mean": null, + "cov.sd": null, + "eigenvalues.mean": null, + "eigenvalues.sd": null, + "g_mean.mean": null, + "g_mean.sd": null, + "gravity": null, + "h_mean.mean": null, + "h_mean.sd": null, + "iq_range.mean": null, + "iq_range.sd": null, + "kurtosis.mean": null, + "kurtosis.sd": null, + "lh_trace": null, + "mad.mean": null, + "mad.sd": null, + "max.mean": null, + "max.sd": null, + "mean.mean": null, + "mean.sd": null, + "median.mean": null, + "median.sd": null, + "min.mean": null, + "min.sd": null, + "nr_cor_attr": 0.0, + "nr_disc": null, + "nr_norm": 22.0, + "nr_outliers": 0, + "p_trace": null, + "range.mean": null, + "range.sd": null, + "roy_root": null, + "sd.mean": null, + "sd.sd": null, + "sd_ratio": null, + "skewness.mean": null, + "skewness.sd": null, + "sparsity.mean": 1.0, + "sparsity.sd": 0.0, + "t_mean.mean": null, + "t_mean.sd": null, + "var.mean": null, + "var.sd": null, + "w_lambda": null + } + }, + "saw2018cross": { + "general": { + "attr_to_inst": 0.0003952178638474459, + "cat_to_num": null, + "freq_class.mean": 0.5, + "freq_class.sd": 0.5121127068567723, + "inst_to_attr": 2530.25, + "nr_attr": 8, + "nr_bin": 4, + "nr_cat": 8, + "nr_class": 2, + "nr_inst": 20242, + "nr_num": 0, + "num_to_cat": 0.0 + }, + "info-theory": { + "attr_conc.mean": 0.0841495924145353, + "attr_conc.sd": 0.227670095417584, + "attr_ent.mean": 1.270132629600944, + "attr_ent.sd": 0.6556396727229159, + "class_conc.mean": 0.17201296378306205, + "class_conc.sd": 0.24549176852151852, + "class_ent": 0.5786639974253174, + "eq_num_attr": 4.06014146306263, + "joint_ent.mean": 1.7062735147924277, + "joint_ent.sd": 0.7180867078110545, + "mut_inf.mean": 0.14252311223383382, + "mut_inf.sd": 0.14517552019309857, + "ns_ratio": 7.911766026530994 + }, + "itemset": { + "one_itemset.mean": 0.24242424242424243, + "one_itemset.sd": 0.2623268785252508, + "two_itemset.mean": 0.3854625550660793, + "two_itemset.sd": 0.23647531117015527 + }, + "statistical": { + "can_cor.mean": 0.9415954799514438, + "can_cor.sd": null, + "cor.mean": 0.05235378612739712, + "cor.sd": 0.12169954685116255, + "cov.mean": 0.006570606870586722, + "cov.sd": 0.0210517529628225, + "eigenvalues.mean": 0.09794872814709431, + "eigenvalues.sd": 0.13949616361808648, + "g_mean.mean": null, + "g_mean.sd": null, + "gravity": 1.2696149183358842, + "h_mean.mean": 0.0, + "h_mean.sd": 0.0, + "iq_range.mean": 0.14423076923076922, + "iq_range.sd": 0.3476569927606586, + "kurtosis.mean": 42.944514865439814, + "kurtosis.sd": 90.49556593722505, + "lh_trace": 7.818501403000752, + "mad.mean": 0.02851153846153846, + "mad.sd": 0.1453808909779358, + "max.mean": 1.0, + "max.sd": 0.0, + "mean.mean": 0.14086096691570457, + "mean.sd": 0.15491377046678032, + "median.mean": 0.057692307692307696, + "median.sd": 0.21572774865200242, + "min.mean": 0.0, + "min.sd": 0.0, + "nr_cor_attr": 0.01846153846153846, + "nr_disc": 1, + "nr_norm": 0.0, + "nr_outliers": 22, + "p_trace": 0.8866020478649898, + "range.mean": 1.0, + "range.sd": 0.0, + "roy_root": 7.818501403000752, + "sd.mean": 0.27888261379095614, + "sd.sd": 0.14484524330162923, + "sd_ratio": null, + "skewness.mean": 4.6893540450473745, + "skewness.sd": 4.885739045540896, + "sparsity.mean": 0.3355326500196442, + "sparsity.sd": 0.17948327036390738, + "t_mean.mean": 0.06796159546036049, + "t_mean.sd": 0.16136020097951118, + "var.mean": 0.09794872814709499, + "var.sd": 0.08276203933293032, + "w_lambda": 0.11339795213501022 + } + } +} \ No newline at end of file diff --git a/SynRD/features_extractor/features_extractor.py b/SynRD/features_extractor/features_extractor.py new file mode 100644 index 0000000..5a4da06 --- /dev/null +++ b/SynRD/features_extractor/features_extractor.py @@ -0,0 +1,143 @@ +import json +import os +import math +import numpy as np +import pandas as pd +from typing import Tuple, Union, List + +from pymfe.mfe import MFE + +# FEATURE_GROUPS = ["general", "statistical", "info-theory", "concept", "itemset", "complexity"] +FEATURE_GROUPS = ["general", "statistical", "info-theory", "itemset"] + +MAPPINGS = { + "29621-0001-Data.tsv": {"name": "fairman2019marijuana", "target": "CLASS", + "cat_cols": ["YEAR", "CLASS", "SEX", "AGE", "RACE"]}, + "adult.data": {"name": "adult", "target": "income", + "columns": ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", + "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", + "hours-per-week", "native-country", "income"], + "cat_cols": ["workclass", "education", "marital-status", "occupation", "relationship", "race", + "sex", "native-country", "income"]}, + "agaricus-lepiota.data": {"name": "mushrooms", "target": "class", + "columns": ["class", "cap-shape", "cap-surface", "cap-color", "bruises?", "odor", + "gill-attachment", "gill-spacing", "gill-size", "gill-color", "stalk-shape", + "stalk-root", "stalk-surface-above-ring", "stalk-surface-below-ring", + "stalk-color-above-ring", "stalk-color-below-ring", "veil-type", "veil-color", + "ring-number", "ring-type", "spore-print-color", "population", "habitat"], + "cat_cols": "all"}, + "saw2018cross_dataframe.tsv": {"name": "saw2018cross", "target": "stem_career_aspirations", "cat_cols": "all"}, + "lee2021ability_dataframe.tsv": {"name": "lee2021ability", "target": "math", "cat_cols": "auto"}, + "jeong2021math_dataframe.tsv": {"name": "jeong2021math", "target": "TARGET", "cat_cols": "auto"}, + "iverson22football_dataframe.tsv": {"name": "iverson22football", "target": "H5ID6G"}, + "fruiht2018naturally_dataframe.tsv": {"name": "fruiht2018naturally", "target": "EDU_ATTAINED"}, +} + +PAPER2TARGET = {d['name']: d['target'] for d in MAPPINGS.values()} +PAPER2FEATURES = {d['name']: d.get('features') for d in MAPPINGS.values()} +PAPER2CAT_COLS = {d['name']: d.get('cat_cols', 'auto') for d in MAPPINGS.values()} + + +def load_data(input_file_path, column_names=None): + if input_file_path.endswith('.tsv'): + return pd.read_csv(input_file_path, sep='\t') + if input_file_path.endswith('.data'): + return pd.read_csv(input_file_path, sep=',\s', names=column_names, index_col=None) + if input_file_path.endswith('.csv'): + return pd.read_csv(input_file_path) + if input_file_path.endswith('.json'): + return pd.read_json(input_file_path) + raise ValueError(f'file {input_file_path} is not supported') + + +def load_dataframes(input_dir='data/papers'): + dataframes = dict() + for input_file_path in MAPPINGS: + input_file_dict = MAPPINGS[input_file_path] + dataframe = load_data(os.path.join(input_dir, input_file_path), column_names=input_file_dict.get('columns')) + dataframes[input_file_dict["name"]] = dataframe + return dataframes + + +def features_target_split(dataframe, target, features=None) -> Tuple[pd.DataFrame, pd.DataFrame]: + if features is not None: + features = features + [target] + dataframe = dataframe[features] + X = dataframe.drop(columns=[target], errors='ignore') + y = dataframe[[target]] + return X, y + + +def get_categorical_columns(dataframe_name: str, X: pd.DataFrame) -> Union[str, List[int]]: + cat_cols = PAPER2CAT_COLS[dataframe_name] + if cat_cols == 'all': + return [i for i in range(len(X.columns))] + if cat_cols == 'auto': + return cat_cols + return [i for i, c in enumerate(X.columns) if c in cat_cols] + + +def __replace_nans(vals): + return [None if math.isnan(v) else v for v in vals] + + +class NpEncoder(json.JSONEncoder): + """ + based on: https://stackoverflow.com/a/57915246 + """ + def default(self, obj): + if isinstance(obj, np.integer): + return int(obj) + super(NpEncoder, self).default(obj) + + +def get_features_from_files(): + features = dict() + for peper_name in PAPER2TARGET.keys(): + file_name = f'features_{peper_name}.json' + features[peper_name] = json.load(open(file_name)) + return features + + +def calculate_features(dataframe_name, dataframe): + X, y = features_target_split(dataframe, features=PAPER2FEATURES[dataframe_name], + target=PAPER2TARGET[dataframe_name]) + dataframe_features = dict() + for group_name in FEATURE_GROUPS: + mfe = MFE(groups=group_name) + cat_cols = get_categorical_columns(dataframe_name, X) + mfe.fit(X.to_numpy(), y.to_numpy(), cat_cols=cat_cols) + features_names, features_vals = mfe.extract() + dataframe_features[group_name] = dict(zip(features_names, __replace_nans(features_vals))) + print(f'Calculated features for {dataframe_name}') + return dataframe_features + + +def save_to_file(objs, file_name): + with open(file_name, 'w') as output_file: + json.dump(objs, output_file, cls=NpEncoder) + print(f'Saved to file {file_name}') + + +def load_from_file(file_name): + objs = json.load(open(file_name)) + print(f'Loaded from {file_name}') + return objs + + +def main(): + dataframes = load_dataframes() + features = dict() + for dataframe_name, dataframe in dataframes.items(): + features_file_name = f'features_{dataframe_name}.json' + if os.path.exists(features_file_name): + dataframe_features = load_from_file(features_file_name) + else: + dataframe_features = calculate_features(dataframe_name, dataframe) + save_to_file(dataframe_features, features_file_name) + features[dataframe_name] = dataframe_features + save_to_file(features, 'features.json') + + +if __name__ == '__main__': + main() diff --git a/SynRD/publication.py b/SynRD/publication.py index 6bfee50..a22aa12 100644 --- a/SynRD/publication.py +++ b/SynRD/publication.py @@ -82,8 +82,13 @@ def __init__(self, dataframe=None, description=None): self.dataframe = dataframe self.real_dataframe = dataframe else: - raise ValueError("Must set dataframe to initialize a paper class.") - + try: + self.dataframe = self._recreate_dataframe() + except NotImplementedError: + raise ValueError("Must set dataframe to initialize a paper class or implement _recreate_dataframe().") + except Exception as e: + raise ValueError(f"Couldn't initialize dataframe for paper.\nCaught {e}.") + self._description = description self.columns = self.real_dataframe.columns diff --git a/setup.py b/setup.py index ceecab4..c3f7d23 100644 --- a/setup.py +++ b/setup.py @@ -24,18 +24,21 @@ "SynRD.datasets"], package_data={'SynRD': ['papers/process.R']}, # setup_requires=['wheel'], - install_requires=["DataSynthesizer", - "smartnoise-synth", - "pandas", - "numpy", - "tqdm", - "requests", - "scikit-learn", - "disjoint-set", - "networkx", - "diffprivlib", - "pathlib", - "statsmodels"], + install_requires=[ + "DataSynthesizer", + "smartnoise-synth", + "pandas", + "numpy", + "tqdm", + "requests", + "scikit-learn", + "disjoint-set", + "networkx", + "diffprivlib", + "pathlib", + "statsmodels", + "pymfe", + ], ) # NOTE: Independent installation of mbi required with: