From 62ecf54be36340ad4cb4b95fc831857e51adb1f9 Mon Sep 17 00:00:00 2001 From: d33bs Date: Wed, 20 Nov 2024 08:06:48 -0700 Subject: [PATCH 1/4] isolate focus on specific columns --- src/cosmicqc/analyze.py | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/src/cosmicqc/analyze.py b/src/cosmicqc/analyze.py index 5770cc7..b6df089 100644 --- a/src/cosmicqc/analyze.py +++ b/src/cosmicqc/analyze.py @@ -175,26 +175,29 @@ def find_outliers( Outlier data frame for the given conditions. """ - # interpret the df as CytoDataFrame - df = CytoDataFrame(data=df) - + # Resolve feature_thresholds if provided as a string if isinstance(feature_thresholds, str): feature_thresholds = read_thresholds_set_from_file( feature_thresholds=feature_thresholds, feature_thresholds_file=feature_thresholds_file, ) - # Filter DataFrame for outliers using all conditions - outliers_df = df[ - # use identify outliers as a mask on the full dataframe - identify_outliers( - df=df, - feature_thresholds=feature_thresholds, - feature_thresholds_file=feature_thresholds_file, - ) - ] + # Determine the columns required for processing + required_columns = list(feature_thresholds.keys()) + metadata_columns + + # Interpret the df as CytoDataFrame + df = CytoDataFrame(data=df)[required_columns] + + # Filter DataFrame for outliers using identify_outliers + outliers_mask = identify_outliers( + # Select only the required columns from the DataFrame + df=df, + feature_thresholds=feature_thresholds, + feature_thresholds_file=feature_thresholds_file, + ) + outliers_df = df[outliers_mask] - # Print outliers count and range for each feature + # Print outlier count and range for each feature print( "Number of outliers:", outliers_df.shape[0], @@ -206,15 +209,13 @@ def find_outliers( print(f"{feature} Max:", outliers_df[feature].max()) # Include metadata columns in the output DataFrame - columns_to_include = list(feature_thresholds.keys()) + metadata_columns - - result = 
outliers_df[columns_to_include] + result = outliers_df[required_columns] - # export the file if specified + # Export the file if specified if export_path is not None: result.export(file_path=export_path) - # Return outliers DataFrame with specified columns + # Return the resulting DataFrame return result From 52d57ffb528d64a0dac28fcb59a85455c662c4e9 Mon Sep 17 00:00:00 2001 From: d33bs Date: Wed, 20 Nov 2024 08:24:52 -0700 Subject: [PATCH 2/4] update for efficiencies in identify outliers --- src/cosmicqc/analyze.py | 98 +++++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 47 deletions(-) diff --git a/src/cosmicqc/analyze.py b/src/cosmicqc/analyze.py index b6df089..bd0649c 100644 --- a/src/cosmicqc/analyze.py +++ b/src/cosmicqc/analyze.py @@ -60,80 +60,84 @@ def identify_outliers( or not for use within other functions. """ - # interpret the df as CytoDataFrame + # Ensure the input is a CytoDataFrame, converting if necessary df = CytoDataFrame(data=df) - # create a copy of the dataframe to ensure - # we don't modify the supplied dataframe inplace. - outlier_df = df.copy() + # Create a copy of the DataFrame only if we intend to export results + # Otherwise, operate directly on the input to save memory. 
+ outlier_df = df.copy() if export_path else df + # Define the naming scheme for z-score columns based on thresholds thresholds_name = ( f"cqc.{feature_thresholds}" if isinstance(feature_thresholds, str) else "cqc.custom" ) + # If feature_thresholds is a string, load the thresholds from the specified file if isinstance(feature_thresholds, str): feature_thresholds = read_thresholds_set_from_file( feature_thresholds=feature_thresholds, feature_thresholds_file=feature_thresholds_file, ) - # Create z-score columns for each feature to reference during outlier detection + # Dictionary to store mappings of features to their z-score column names zscore_columns = {} for feature in feature_thresholds: + # Ensure the feature exists in the DataFrame if feature not in df.columns: raise ValueError(f"Feature '{feature}' does not exist in the DataFrame.") - outlier_df[(colname := f"{thresholds_name}.Z_Score.{feature}")] = scipy_zscore( - df[feature] - ) - zscore_columns[feature] = colname - # Create outlier detection conditions for each feature - conditions = [] - for feature, threshold in feature_thresholds.items(): - # For positive thresholds, look for outliers that are - # that number of std "above" the mean + # Construct the z-score column name + zscore_col = f"{thresholds_name}.Z_Score.{feature}" + + # Calculate and store z-scores only if not already present + if zscore_col not in outlier_df: + outlier_df[zscore_col] = scipy_zscore(df[feature]) + + # Add the column name to the zscore_columns dictionary + zscore_columns[feature] = zscore_col + + # Helper function to create outlier detection conditions + def create_condition(feature: str, threshold: float) -> pd.Series: + # Positive threshold checks for outliers above the mean if threshold > 0: - condition = outlier_df[zscore_columns[feature]] > threshold - # For negative thresholds, look for outliers that are - # that number of std "below" the mean - else: - condition = outlier_df[zscore_columns[feature]] < threshold - 
conditions.append(condition) - - result = ( - # create a boolean pd.series identifier for dataframe - # based on all conditions for use within other functions. - reduce(operator.and_, conditions) - if not include_threshold_scores - # otherwise, provide the threshold zscore col and the above column - else CytoDataFrame( - data=pd.concat( - [ - # grab only the outlier zscore columns from the outlier_df - outlier_df[zscore_columns.values()], - CytoDataFrame( - { - f"{thresholds_name}.is_outlier": reduce( - operator.and_, conditions - ) - } - ), - ], - axis=1, - ), + return outlier_df[zscore_columns[feature]] > threshold + # Negative threshold checks for outliers below the mean + return outlier_df[zscore_columns[feature]] < threshold + + # Generate outlier detection conditions for all features + conditions = [ + create_condition(feature, threshold) + for feature, threshold in feature_thresholds.items() + ] + + # Construct the result based on whether threshold scores should be included + if include_threshold_scores: + # Extract z-score columns for each feature + zscore_df = outlier_df[list(zscore_columns.values())] + + # Combine conditions into a single Series indicating outlier status + is_outlier_series = reduce(operator.and_, conditions).rename( + f"{thresholds_name}.is_outlier" + ) + + # Combine z-scores and outlier status into a single DataFrame + result = CytoDataFrame( + data=pd.concat([zscore_df, is_outlier_series], axis=1), data_context_dir=df._custom_attrs["data_context_dir"], data_mask_context_dir=df._custom_attrs["data_mask_context_dir"], ) - ) + else: + # Combine conditions into a single Series of boolean values + result = reduce(operator.and_, conditions) + # Export the result if an export path is specified if export_path is not None: - if isinstance(result, pd.Series): - CytoDataFrame(result).export(file_path=export_path) - else: - result.export(file_path=export_path) + export_df = CytoDataFrame(result) if isinstance(result, pd.Series) else result + 
export_df.export(file_path=export_path) + # Return the resulting Series or DataFrame return result From e2e274eba70905b23f97cbaabdb9f301ffba2076 Mon Sep 17 00:00:00 2001 From: d33bs Date: Wed, 20 Nov 2024 08:25:01 -0700 Subject: [PATCH 3/4] clean up tests and add new coverage --- media/coverage-badge.svg | 2 +- tests/conftest.py | 132 --------------------------------------- 2 files changed, 1 insertion(+), 133 deletions(-) diff --git a/media/coverage-badge.svg b/media/coverage-badge.svg index 21d9b08..be6b515 100644 --- a/media/coverage-badge.svg +++ b/media/coverage-badge.svg @@ -1 +1 @@ -coverage: 87.44%coverage87.44% \ No newline at end of file +coverage: 96.27%coverage96.27% \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 5b74e25..1b4251d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,14 +6,8 @@ import pathlib -import numpy as np import pandas as pd -import plotly.colors as pc import pytest -import skimage -from PIL import Image - -import cosmicqc @pytest.fixture(name="cytotable_CFReT_data_df") @@ -26,25 +20,6 @@ def fixture_cytotable_CFReT_df(): ) -@pytest.fixture(name="cytotable_NF1_data_parquet_shrunken") -def fixture_cytotable_NF1_data_parquet_shrunken(): - """ - Return df to test CytoTable NF1 data through shrunken parquet file - """ - return ( - "tests/data/cytotable/NF1_cellpainting_data_shrunken/" - "Plate_2_with_image_data_shrunken.parquet" - ) - - -@pytest.fixture(name="cytotable_nuclear_speckles_data_parquet") -def fixture_cytotable_nuclear_speckle_data_parquet(): - """ - Return df to test CytoTable nuclear speckles data through shrunken parquet file - """ - return "tests/data/cytotable/nuclear_speckles/test_slide1_converted.parquet" - - @pytest.fixture(name="basic_outlier_dataframe") def fixture_basic_outlier_dataframe(): """ @@ -66,110 +41,3 @@ def fixture_basic_outlier_csv( ) return csv_path - - -@pytest.fixture(name="basic_outlier_csv_gz") -def fixture_basic_outlier_csv_gz( - tmp_path: 
pathlib.Path, basic_outlier_dataframe: pd.DataFrame -): - """ - Creates basic example data csv for use in tests - """ - - basic_outlier_dataframe.to_csv( - csv_gz_path := tmp_path / "example.csv.gz", index=False, compression="gzip" - ) - - return csv_gz_path - - -@pytest.fixture(name="basic_outlier_tsv") -def fixture_basic_outlier_tsv( - tmp_path: pathlib.Path, basic_outlier_dataframe: pd.DataFrame -): - """ - Creates basic example data tsv for use in tests - """ - - basic_outlier_dataframe.to_csv( - tsv_path := tmp_path / "example.tsv", sep="\t", index=False - ) - - return tsv_path - - -@pytest.fixture(name="basic_outlier_parquet") -def fixture_basic_outlier_parquet( - tmp_path: pathlib.Path, basic_outlier_dataframe: pd.DataFrame -): - """ - Creates basic example data parquet for use in tests - """ - - basic_outlier_dataframe.to_parquet( - parquet_path := tmp_path / "example.parquet", index=False - ) - - return parquet_path - - -@pytest.fixture(name="generate_show_report_html_output") -def fixture_generate_show_report_html_output(cytotable_CFReT_data_df: pd.DataFrame): - """ - Used for generating report output for use with other tests. 
- """ - - # create outliers dataframe - df = cosmicqc.analyze.label_outliers( - df=cytotable_CFReT_data_df, - include_threshold_scores=True, - ) - - # show a report - df.show_report( - report_path=( - report_path := pathlib.Path(__file__).parent - / "data" - / "coSMicQC" - / "show_report" - / "cosmicqc_example_report.html" - ), - color_palette=pc.qualitative.Dark24[0:2], - auto_open=False, - ) - - return report_path - - -@pytest.fixture -def fixture_dark_image(): - # Create a dark image (50x50 pixels, almost black) - dark_img_array = np.zeros((50, 50, 3), dtype=np.uint8) - return Image.fromarray(dark_img_array) - - -@pytest.fixture -def fixture_mid_brightness_image(): - # Create an image with medium brightness (50x50 pixels, mid gray) - mid_brightness_img_array = np.full((50, 50, 3), 128, dtype=np.uint8) - return Image.fromarray(mid_brightness_img_array) - - -@pytest.fixture -def fixture_bright_image(): - # Create a bright image (50x50 pixels, almost white) - bright_img_array = np.full((50, 50, 3), 255, dtype=np.uint8) - return Image.fromarray(bright_img_array) - - -@pytest.fixture -def fixture_nuclear_speckle_example_image(): - # create an image array from example nuclear speckle data - return Image.fromarray( - ( - skimage.io.imread( - "tests/data/cytotable/nuclear_speckles/images/plate1/slide1_A1_M10_CH0_Z09_illumcorrect.tiff" - ) - / 256 - ).astype(np.uint8) - ).convert("RGBA") From fadbebc342d015a5273ac3ae9a6e778e1262ba85 Mon Sep 17 00:00:00 2001 From: d33bs Date: Wed, 20 Nov 2024 08:39:37 -0700 Subject: [PATCH 4/4] update code comment --- src/cosmicqc/analyze.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/cosmicqc/analyze.py b/src/cosmicqc/analyze.py index bd0649c..3b0c5bc 100644 --- a/src/cosmicqc/analyze.py +++ b/src/cosmicqc/analyze.py @@ -63,9 +63,8 @@ def identify_outliers( # Ensure the input is a CytoDataFrame, converting if necessary df = CytoDataFrame(data=df) - # Create a copy of the DataFrame only if we intend to 
export results - # Otherwise, operate directly on the input to save memory. - outlier_df = df.copy() if export_path else df + # alias df as outlier_df (no copy is made); z-score columns are added to df itself + outlier_df = df # Define the naming scheme for z-score columns based on thresholds thresholds_name = (