Merge pull request #1 from Noble-Lab/bug-v0.0.1

Fixed read file bug
Noble-Lab · Dec 9, 2020 · 1491f3a · 1491f3a
2 parents 5525da2 + ae0b71a
commit 1491f3a
Show file tree

Hide file tree

Showing 5 changed files with 85 additions and 13 deletions.
diff --git a/crema/__pycache__/parsers.cpython-37.pyc b/crema/__pycache__/parsers.cpython-37.pyc
diff --git a/crema/parsers.py b/crema/parsers.py
@@ -12,7 +12,6 @@ def read_file(
     spectrum_col="scan",
     score_col="combined p-value",
     target_col="target/decoy",
-    delimiter="," or "\t",
 ):
     """
     Read tab-delimited files.
@@ -27,9 +26,6 @@ def read_file(
         name of the column that defines the scores (p-values) of the psms
     target_col : str
         name of the column that indicates if a psm is a target/decoy
-    delimiter : str
-        string character equal to what is used to separate columns
-        within the tab-delimited file
 
     Returns
     -------
@@ -46,13 +42,15 @@ def read_file(
             fields.append(col)
         fields.append(score_col)
         fields.append(target_col)
-    # fields = [spectrum_col, score_col, target_col]
+
     # Create empty Pandas dataframe
     data = pd.DataFrame()
+
     # Loop through all given files
     for file in input_files:
         data = data.append(
-            pd.read_csv(file, sep=delimiter, usecols=fields), ignore_index=True
+            pd.read_csv(file, sep=None, usecols=fields, engine="python"),
+            ignore_index=True,
         )
     data = convert_target_col(data, target_col)
     return PsmDataset(data, spectrum_col, score_col, target_col)

diff --git a/data/single_basic_tab.txt b/data/single_basic_tab.txt
@@ -0,0 +1,11 @@
+scan	combined p-value	target/decoy
+1	0.7	TRUE
+2	0.4	FALSE
+3	0.1	TRUE
+4	0.55	TRUE
+5	0.25	TRUE
+1	0.6	TRUE
+2	0.2	FALSE
+3	0.7	FALSE
+4	0.56	TRUE
+5	0.3	FALSE
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -28,6 +28,28 @@ def test_cli_basic(tmp_path):
     assert os.path.isfile(os.path.join(tmp_path, "crema.logfile.log"))
 
 
+def test_cli_basic_tab(tmp_path):
+    """
+    Test that the cli works with a tab delimited file. Reads in a file
+    with crux default column names.
+
+    Parameters
+    ----------
+    tmp_path : pytest fixture of a temporary directory
+        A pytest temporary directory unique to the test invocation
+
+    Returns
+    -------
+    None
+        Asserts that the results file and the log file both exist
+        in the temporary output directory after the command runs.
+    """
+    cmd = ["crema", "data/single_basic_tab.txt", "--output_dir", tmp_path]
+    subprocess.run(cmd, check=True)
+    assert os.path.isfile(os.path.join(tmp_path, "crema.psm_results.txt"))
+    assert os.path.isfile(os.path.join(tmp_path, "crema.logfile.log"))
+
+
 def test_cli_custom_root(tmp_path):
     """
     Test that the cli works with custom file root.

diff --git a/tests/test_crema_methods.py b/tests/test_crema_methods.py
@@ -341,12 +341,30 @@ def test_single_basic_dataset_class():
     -------
     PsmDataset
         A :py:class:`~crema.dataset.PsmDataset` object
-        containing the PSM data from the given tab-delimited file.
+        containing the PSM data from the given comma separated value file.
     """
     psm = read_file(["data/single_basic.csv"])
     return psm
 
 
+@pytest.fixture
+def test_single_basic_tab_dataset_class():
+    """
+    Creates a pytest fixture of a PsmDataset object
+    by reading in "single_basic_tab.txt" to use in
+    subsequent test cases. This file has crux
+    default column names.
+
+    Returns
+    -------
+    PsmDataset
+        A :py:class:`~crema.dataset.PsmDataset` object
+        containing the PSM data from the given tab delimited file.
+    """
+    psm = read_file(["data/single_basic_tab.txt"])
+    return psm
+
+
 @pytest.fixture
 def test_single_int_targets_dataset_class():
     """
@@ -361,7 +379,7 @@ def test_single_int_targets_dataset_class():
     -------
     PsmDataset
         A :py:class:`~crema.dataset.PsmDataset` object
-        containing the PSM data from the given tab-delimited file.
+        containing the PSM data from the given comma separated value file.
     """
     psm = read_file(["data/single_int_targets.csv"])
     return psm
@@ -380,7 +398,7 @@ def test_single_text_scan_dataset_class():
     -------
     PsmDataset
         A :py:class:`~crema.dataset.PsmDataset` object
-        containing the PSM data from the given tab-delimited file.
+        containing the PSM data from the given comma separated value file.
     """
     psm = read_file(["data/single_text_scan.csv"])
     return psm
@@ -399,7 +417,7 @@ def test_single_add_spectrum_dataset_class():
     -------
     PsmDataset
         A :py:class:`~crema.dataset.PsmDataset` object
-        containing the PSM data from the given tab-delimited file.
+        containing the PSM data from the given comma separated value file.
     """
     psm = read_file(
         ["data/single_add_spectrum.csv"], spectrum_col=["scan", "extras"]
@@ -420,7 +438,7 @@ def test_single_arbitrary_dataset_class():
     -------
     PsmDataset
         A :py:class:`~crema.dataset.PsmDataset` object
-        containing the PSM data from the given tab-delimited file.
+        containing the PSM data from the given comma separated value file.
     """
     psm = read_file(["data/single_arbitrary.csv"])
     return psm
@@ -439,7 +457,7 @@ def test_single_noncrux_dataset_class():
     -------
     PsmDataset
         A :py:class:`~crema.dataset.PsmDataset` object
-        containing the PSM data from the given tab-delimited file.
+        containing the PSM data from the given comma separated value file.
     """
     psm = read_file(["data/single_noncrux.csv"], "scan", "p-value", "target")
     return psm
@@ -458,7 +476,7 @@ def test_multi_dataset_class():
     -------
     PsmDataset
         A :py:class:`~crema.dataset.PsmDataset` object
-        containing the PSM data from the given tab-delimited files.
+        containing the PSM data from the given comma separated value files.
     """
     files = ["data/multi_target.csv", "data/multi_decoy.csv"]
     psm = read_file(files)
@@ -511,6 +529,29 @@ def test_single_basic_tdc(test_single_basic_dataset_class):
     pd.testing.assert_frame_equal(actual, compare)
 
 
+def test_single_basic_tab_data(test_single_basic_tab_dataset_class):
+    """
+    Checks whether or not a PsmDataset object is created properly
+    after reading in a single file with crux default column names that
+    is tab delimited instead of comma separated. The point of this test is
+    to ensure that crema can read data separated by tabs as well as commas.
+
+    Parameters
+    ----------
+    test_single_basic_tab_dataset_class : pytest fixture of a PsmDataset object
+        A psm object created by reading in the "single_basic_tab.txt" file
+
+    Returns
+    -------
+    Pandas Assert Frame
+        Asserts whether or not the data in the PsmDataset object
+        is equal to the dataframe named "testframe_single_basic_data"
+    """
+    actual = testframe_single_basic_data.copy()
+    compare = test_single_basic_tab_dataset_class.data
+    pd.testing.assert_frame_equal(actual, compare)
+
+
 def test_single_int_targets_data(test_single_int_targets_dataset_class):
     """
     Checks whether or not a PsmDataset object is created properly