diff --git a/.travis.yml b/.travis.yml index 7b948df..5307017 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,6 +2,7 @@ language: python python: - "3.4" - "3.5" + - "3.6" before_install: - "wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh" - "bash miniconda.sh -b -p $HOME/miniconda" @@ -17,7 +18,7 @@ before_install: install: - "conda install -q nomkl" - "conda install -q jinja2" - - "conda install -q numpy" + - "conda install -q 'numpy<1.12'" - "conda install -q pandas" - "conda install -q scipy" - "conda install -q patsy" diff --git a/README.mkd b/README.mkd index eacd237..8f629f4 100644 --- a/README.mkd +++ b/README.mkd @@ -16,7 +16,7 @@ tool: > **genipe: an automated genome-wide imputation pipeline with automatic reporting > and statistical tools.** > *Bioinformatics* 2016, Epub ahead of print. -> [DOI:[10.1093/bioinformatics/btw487](http://dx.doi.org/10.1093/bioinformatics/btw487)]. +> (DOI:[10.1093/bioinformatics/btw487](http://dx.doi.org/10.1093/bioinformatics/btw487)). ## Documentation @@ -78,6 +78,9 @@ and Cox's regressions), `genipe` requires the following Python modules: * `pyfaidx` version 0.3.7 or latest * `drmaa` version 0.7.6 or latest +Note that `statsmodels` (specifically MixedLM analysis) version 0.6 **is not +compatible** with `numpy` version 1.12 and latest. + Finally, the tool requires a LaTeX installation to compile the automatically generated report in PDF format. diff --git a/conda_build.sh b/conda_build.sh index cfb2258..1176219 100755 --- a/conda_build.sh +++ b/conda_build.sh @@ -26,7 +26,7 @@ then fi # The different python versions and platforms -python_versions="3.4 3.5" +python_versions="3.4 3.5 3.6" platforms="linux-32 linux-64 osx-64" # Building diff --git a/genipe/tests/test_arguments.py b/genipe/tests/test_arguments.py index d854fcc..784b1c6 100644 --- a/genipe/tests/test_arguments.py +++ b/genipe/tests/test_arguments.py @@ -787,8 +787,8 @@ def test_segment_length_large(self): # Checking the warning is logged with self.assertLogs(level="WARNING") as cm: check_args(self.args) - log_m = ["WARNING:root:segment length (5e+06 bp) is more than 5Mb"] - self.assertEqual(log_m, cm.output) + log_m = "WARNING:root:segment length (5e+06 bp) is more than 5Mb" + self.assertTrue(log_m in cm.output) def test_segment_length_small(self): """Tests different invalid segment length (too small).""" @@ -797,8 +797,8 @@ def test_segment_length_small(self): # Checking the warning is logged with self.assertLogs(level="WARNING") as cm: check_args(self.args) - log_m = ["WARNING:root:segment length (999 bp) is too small"] - self.assertEqual(log_m, cm.output) + log_m = "WARNING:root:segment length (999 bp) is too small" + self.assertTrue(log_m in cm.output) def test_segment_length_invalid(self): """Tests different invalid segment length.""" diff --git a/genipe/tests/test_formats.py b/genipe/tests/test_formats.py index e918b75..fc48022 100644 --- a/genipe/tests/test_formats.py +++ b/genipe/tests/test_formats.py @@ -51,10 +51,8 @@ def test_matrix_from_line(self): self.assertTrue(np.allclose(expected_geno, observed_geno)) # An invalid line should raise an exception - with self.assertRaises(ValueError) as cm: + with self.assertRaises(ValueError): impute2.matrix_from_line(input_line.split(" ")[:-1]) - error_m = "total size of new array must be unchanged" - self.assertEqual(error_m, str(cm.exception)) def test_get_good_probs(self): """Tests the 'get_good_probs' function.""" diff --git a/genipe/tests/test_imputed_stats.py b/genipe/tests/test_imputed_stats.py index 30af08f..7bde78e 100644 --- a/genipe/tests/test_imputed_stats.py +++ b/genipe/tests/test_imputed_stats.py @@ -2172,21 +2172,21 @@ def setUpClass(cls): # Computing the random effect for REML data = data.drop(["snp1", "snp2", "snp3"], axis=1).dropna() - cls.random_effects = imputed_stats.smf.mixedlm( - formula="y ~ C1 + C2 + C3 + age + C(gender) + C(visit)", - data=data, - groups=data.index, - ).fit(reml=True).random_effects.rename( - columns={"Intercept": "RE"}, + cls.random_effects = imputed_stats._extract_mixedlm_random_effect( + imputed_stats.smf.mixedlm( + formula="y ~ C1 + C2 + C3 + age + C(gender) + C(visit)", + data=data, + groups=data.index, + ).fit(reml=True), ) # Computing the random effect for ML - cls.random_effects_ml = imputed_stats.smf.mixedlm( - formula="y ~ C1 + C2 + C3 + age + C(gender) + C(visit)", - data=data, - groups=data.index, - ).fit(reml=False).random_effects.rename( - columns={"Intercept": "RE"}, + cls.random_effects_ml = imputed_stats._extract_mixedlm_random_effect( + imputed_stats.smf.mixedlm( + formula="y ~ C1 + C2 + C3 + age + C(gender) + C(visit)", + data=data, + groups=data.index, + ).fit(reml=False), ) def setUp(self): @@ -2234,10 +2234,10 @@ def test_fit_mixedlm_snp1(self): # Comparing the results self.assertAlmostEqual(expected_coef, observed_coef, places=10) - self.assertAlmostEqual(expected_se, observed_se, places=7) - self.assertAlmostEqual(expected_min_ci, observed_min_ci, places=7) - self.assertAlmostEqual(expected_max_ci, observed_max_ci, places=7) - self.assertAlmostEqual(expected_z, observed_z, places=5) + self.assertAlmostEqual(expected_se, observed_se, places=6) + self.assertAlmostEqual(expected_min_ci, observed_min_ci, places=6) + self.assertAlmostEqual(expected_max_ci, observed_max_ci, places=6) + self.assertAlmostEqual(expected_z, observed_z, places=4) self.assertAlmostEqual(np.log10(expected_p), np.log10(observed_p), places=4) self.assertEqual(expected_type, observed_type) @@ -2319,12 +2319,12 @@ def test_fit_mixedlm_snp3(self): # Comparing the results self.assertAlmostEqual(expected_coef, observed_coef, places=10) - self.assertAlmostEqual(expected_se, observed_se, places=8) - self.assertAlmostEqual(expected_min_ci, observed_min_ci, places=7) - self.assertAlmostEqual(expected_max_ci, observed_max_ci, places=7) - self.assertAlmostEqual(expected_z, observed_z, places=6) + self.assertAlmostEqual(expected_se, observed_se, places=6) + self.assertAlmostEqual(expected_min_ci, observed_min_ci, places=6) + self.assertAlmostEqual(expected_max_ci, observed_max_ci, places=6) + self.assertAlmostEqual(expected_z, observed_z, places=4) self.assertAlmostEqual(np.log10(expected_p), np.log10(observed_p), - places=6) + places=4) self.assertEqual(expected_type, observed_type) def test_fit_mixedlm_invalid_column(self): @@ -2396,10 +2396,10 @@ def test_fit_mixedlm_snp1_use_ml(self): # Comparing the results self.assertAlmostEqual(expected_coef, observed_coef, places=10) - self.assertAlmostEqual(expected_se, observed_se, places=7) - self.assertAlmostEqual(expected_min_ci, observed_min_ci, places=7) - self.assertAlmostEqual(expected_max_ci, observed_max_ci, places=7) - self.assertAlmostEqual(expected_z, observed_z, places=5) + self.assertAlmostEqual(expected_se, observed_se, places=6) + self.assertAlmostEqual(expected_min_ci, observed_min_ci, places=6) + self.assertAlmostEqual(expected_max_ci, observed_max_ci, places=6) + self.assertAlmostEqual(expected_z, observed_z, places=4) self.assertAlmostEqual(np.log10(expected_p), np.log10(observed_p), places=4) self.assertEqual(expected_type, observed_type) @@ -2481,12 +2481,12 @@ def test_fit_mixedlm_snp3_use_ml(self): # Comparing the results self.assertAlmostEqual(expected_coef, observed_coef, places=10) - self.assertAlmostEqual(expected_se, observed_se, places=9) - self.assertAlmostEqual(expected_min_ci, observed_min_ci, places=9) - self.assertAlmostEqual(expected_max_ci, observed_max_ci, places=9) - self.assertAlmostEqual(expected_z, observed_z, places=7) + self.assertAlmostEqual(expected_se, observed_se, places=6) + self.assertAlmostEqual(expected_min_ci, observed_min_ci, places=6) + self.assertAlmostEqual(expected_max_ci, observed_max_ci, places=6) + self.assertAlmostEqual(expected_z, observed_z, places=4) self.assertAlmostEqual(np.log10(expected_p), np.log10(observed_p), - places=7) + places=4) self.assertEqual(expected_type, observed_type) def test_fit_mixedlm_interaction(self): @@ -2525,12 +2525,12 @@ def test_fit_mixedlm_interaction(self): # Comparing the results self.assertAlmostEqual(expected_coef, observed_coef, places=10) - self.assertAlmostEqual(expected_se, observed_se, places=8) - self.assertAlmostEqual(expected_min_ci, observed_min_ci, places=8) - self.assertAlmostEqual(expected_max_ci, observed_max_ci, places=8) - self.assertAlmostEqual(expected_z, observed_z, places=7) + self.assertAlmostEqual(expected_se, observed_se, places=6) + self.assertAlmostEqual(expected_min_ci, observed_min_ci, places=6) + self.assertAlmostEqual(expected_max_ci, observed_max_ci, places=6) + self.assertAlmostEqual(expected_z, observed_z, places=5) self.assertAlmostEqual(np.log10(expected_p), np.log10(observed_p), - places=7) + places=5) self.assertEqual(expected_type, observed_type) def test_fit_mixedlm_interaction_use_ml(self): @@ -2569,12 +2569,12 @@ def test_fit_mixedlm_interaction_use_ml(self): # Comparing the results self.assertAlmostEqual(expected_coef, observed_coef, places=10) - self.assertAlmostEqual(expected_se, observed_se, places=9) - self.assertAlmostEqual(expected_min_ci, observed_min_ci, places=9) - self.assertAlmostEqual(expected_max_ci, observed_max_ci, places=9) - self.assertAlmostEqual(expected_z, observed_z, places=8) + self.assertAlmostEqual(expected_se, observed_se, places=6) + self.assertAlmostEqual(expected_min_ci, observed_min_ci, places=6) + self.assertAlmostEqual(expected_max_ci, observed_max_ci, places=6) + self.assertAlmostEqual(expected_z, observed_z, places=5) self.assertAlmostEqual(np.log10(expected_p), np.log10(observed_p), - places=9) + places=5) self.assertEqual(expected_type, observed_type) def test_full_fit_mixedlm(self): @@ -2642,26 +2642,26 @@ def test_full_fit_mixedlm(self): # The standard error expected = np.array([0.026125844399462177, np.nan, 0.028692891145471563]) - np.testing.assert_array_almost_equal(expected, observed.se, 8) + np.testing.assert_array_almost_equal(expected, observed.se, 7) # The lower CI expected = np.array([0.07144597572112760, np.nan, -0.21073684365058973]) - np.testing.assert_array_almost_equal(expected, observed.lower, 8) + np.testing.assert_array_almost_equal(expected, observed.lower, 7) # The upper CI expected = np.array([0.1738574038984143, np.nan, -0.09826277713568479]) - np.testing.assert_array_almost_equal(expected, observed.upper, 8) + np.testing.assert_array_almost_equal(expected, observed.upper, 7) # The Z statistics expected = np.array([4.6946497856465763, np.nan, -5.3846023954097459]) - np.testing.assert_array_almost_equal(expected, observed.z, 6) + np.testing.assert_array_almost_equal(expected, observed.z, 5) # The p values expected = np.array([2.670638639346024e-06, 0.6780830649776308, 7.260496248662207e-08]) np.testing.assert_array_almost_equal( - np.log10(expected), np.log10(observed.p), 6, + np.log10(expected), np.log10(observed.p), 4, ) # The analysis type @@ -2734,26 +2734,26 @@ def test_full_fit_mixedlm_use_ml(self): # The standard error expected = np.array([0.026109971717277716, np.nan, 0.02867546407987996]) - np.testing.assert_array_almost_equal(expected, observed.se, 9) + np.testing.assert_array_almost_equal(expected, observed.se, 7) # The lower CI expected = np.array([0.07147708560653579, np.nan, -0.21070268722968036]) - np.testing.assert_array_almost_equal(expected, observed.lower, 8) + np.testing.assert_array_almost_equal(expected, observed.lower, 7) # The upper CI expected = np.array([0.1738262940129832, np.nan, -0.09829693355660693]) - np.testing.assert_array_almost_equal(expected, observed.upper, 8) + np.testing.assert_array_almost_equal(expected, observed.upper, 7) # The Z statistics expected = np.array([4.6975037406339810, np.nan, -5.3878748034473105]) - np.testing.assert_array_almost_equal(expected, observed.z, 6) + np.testing.assert_array_almost_equal(expected, observed.z, 5) # The p values expected = np.array([2.633603631840842e-06, 0.6780830649776319, 7.129568602159964e-08]) np.testing.assert_array_almost_equal( - np.log10(expected), np.log10(observed.p), 6, + np.log10(expected), np.log10(observed.p), 4, ) # The type @@ -2828,26 +2828,26 @@ def test_full_fit_mixedlm_multiprocess(self): # The standard error expected = np.array([0.026125844399462177, np.nan, 0.028692891145471563]) - np.testing.assert_array_almost_equal(expected, observed.se, 8) + np.testing.assert_array_almost_equal(expected, observed.se, 7) # The lower CI expected = np.array([0.07144597572112760, np.nan, -0.21073684365058973]) - np.testing.assert_array_almost_equal(expected, observed.lower, 8) + np.testing.assert_array_almost_equal(expected, observed.lower, 7) # The upper CI expected = np.array([0.1738574038984143, np.nan, -0.09826277713568479]) - np.testing.assert_array_almost_equal(expected, observed.upper, 8) + np.testing.assert_array_almost_equal(expected, observed.upper, 7) # The Z statistics expected = np.array([4.6946497856465763, np.nan, -5.3846023954097459]) - np.testing.assert_array_almost_equal(expected, observed.z, 6) + np.testing.assert_array_almost_equal(expected, observed.z, 5) # The p values expected = np.array([2.670638639346024e-06, 0.6780830649776308, 7.260496248662207e-08]) np.testing.assert_array_almost_equal( - np.log10(expected), np.log10(observed.p), 6, + np.log10(expected), np.log10(observed.p), 4, ) # The type @@ -2923,26 +2923,26 @@ def test_full_fit_mixedlm_multiprocess_use_ml(self): # The standard error expected = np.array([0.026109971717277716, np.nan, 0.02867546407987996]) - np.testing.assert_array_almost_equal(expected, observed.se, 9) + np.testing.assert_array_almost_equal(expected, observed.se, 7) # The lower CI expected = np.array([0.07147708560653579, np.nan, -0.21070268722968036]) - np.testing.assert_array_almost_equal(expected, observed.lower, 8) + np.testing.assert_array_almost_equal(expected, observed.lower, 7) # The upper CI expected = np.array([0.1738262940129832, np.nan, -0.09829693355660693]) - np.testing.assert_array_almost_equal(expected, observed.upper, 8) + np.testing.assert_array_almost_equal(expected, observed.upper, 7) # The Z statistics expected = np.array([4.6975037406339810, np.nan, -5.3878748034473105]) - np.testing.assert_array_almost_equal(expected, observed.z, 6) + np.testing.assert_array_almost_equal(expected, observed.z, 5) # The p values expected = np.array([2.633603631840842e-06, 0.6780830649776319, 7.129568602159964e-08]) np.testing.assert_array_almost_equal( - np.log10(expected), np.log10(observed.p), 6, + np.log10(expected), np.log10(observed.p), 4, ) # The type @@ -3015,28 +3015,28 @@ def test_full_fit_mixedlm_interaction(self): # The standard error expected = np.array([0.06049308027425798, 0.044179901027834402, 0.066498803156555431]) - np.testing.assert_array_almost_equal(expected, observed.se, 9) + np.testing.assert_array_almost_equal(expected, observed.se, 7) # The lower CI expected = np.array([-0.06700243069143892, -0.07256251047675560, -0.23149945885188331]) - np.testing.assert_array_almost_equal(expected, observed.lower, 8) + np.testing.assert_array_almost_equal(expected, observed.lower, 6) # The upper CI expected = np.array([0.1701260866114330, 0.10061951923344345, 0.02917105955185087]) - np.testing.assert_array_almost_equal(expected, observed.upper, 8) + np.testing.assert_array_almost_equal(expected, observed.upper, 6) # The Z statistics expected = np.array([0.8523591082852910, 0.3175313672501355, -1.5212935398528820]) - np.testing.assert_array_almost_equal(expected, observed.z, 7) + np.testing.assert_array_almost_equal(expected, observed.z, 5) # The p values expected = np.array([3.940148087160935e-01, 7.508404423440460e-01, 1.281861906944759e-01]) np.testing.assert_array_almost_equal( - np.log10(expected), np.log10(observed.p), 7, + np.log10(expected), np.log10(observed.p), 5, ) # The analysis type @@ -3110,28 +3110,28 @@ def test_full_fit_mixedlm_interaction_use_ml(self): # The standard error expected = np.array([0.060482583773306557, 0.044172234988136105, 0.06648726468681930]) - np.testing.assert_array_almost_equal(expected, observed.se, 9) + np.testing.assert_array_almost_equal(expected, observed.se, 7) # The lower CI expected = np.array([-0.06698185792762956, -0.07254748531507074, -0.23147684386674616]) - np.testing.assert_array_almost_equal(expected, observed.lower, 8) + np.testing.assert_array_almost_equal(expected, observed.lower, 6) # The upper CI expected = np.array([0.1701055138475855, 0.10060449407170288, 0.02914844456674892]) - np.testing.assert_array_almost_equal(expected, observed.upper, 8) + np.testing.assert_array_almost_equal(expected, observed.upper, 6) # The Z statistics expected = np.array([0.8525070316644490, 0.3175864744467622, -1.5215575513073230]) - np.testing.assert_array_almost_equal(expected, observed.z, 7) + np.testing.assert_array_almost_equal(expected, observed.z, 5) # The p values expected = np.array([3.939327379391016e-01, 7.507986352043505e-01, 1.281199805761517e-01]) np.testing.assert_array_almost_equal( - np.log10(expected), np.log10(observed.p), 8, + np.log10(expected), np.log10(observed.p), 5, ) # The analysis type diff --git a/genipe/tools/imputed_stats.py b/genipe/tools/imputed_stats.py index 9b44aec..2e0db20 100644 --- a/genipe/tools/imputed_stats.py +++ b/genipe/tools/imputed_stats.py @@ -706,6 +706,32 @@ def _skat_write_marker(name, dosage, snp_set, genotype_files): print(name, *dosage, sep=",", file=file_object) +def _extract_mixedlm_random_effect(fitted): + """Extracts the random effects from a MixedLM fit object. + + Args: + fitted (MixedLMResultsWrapper): The fitted object. + + Returns: + pandas.DataFrame: The random effects as a DataFrame (with a column + named "RE"). + + Note + ==== + Depending of the version of StatsModels, the object might be a pandas + DataFrame or a dictionary... + + """ + # Getting the random effects + random_effects = fitted.random_effects + + # If it's a dictionary, we need to create a DataFrame + if isinstance(random_effects, dict): + return pd.DataFrame(random_effects).T.rename(columns={"groups": "RE"}) + + return random_effects.rename(columns={"Intercept": "RE"}) + + def compute_statistics(impute2_filename, samples, markers_to_extract, phenotypes, remove_gender, out_prefix, options): """Parses IMPUTE2 file while computing statistics. @@ -789,13 +815,11 @@ def compute_statistics(impute2_filename, samples, markers_to_extract, # We need to compute the random effects if it's a MixedLM analysis random_effects = None if options.analysis_type == "mixedlm" and options.interaction is None: - random_effects = smf.mixedlm( + random_effects = _extract_mixedlm_random_effect(smf.mixedlm( formula=formula.replace("_GenoD + ", ""), data=phenotypes, groups=phenotypes.index, - ).fit(reml=not options.use_ml).random_effects.rename( - columns={"Intercept": "RE"}, - ) + ).fit(reml=not options.use_ml)) # Reading the file nb_processed = 0 diff --git a/setup.py b/setup.py index 11988e2..02d6eae 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ MAJOR = 1 MINOR = 3 -MICRO = 1 +MICRO = 2 VERSION = "{0}.{1}.{2}".format(MAJOR, MINOR, MICRO)