Add from_frame conversion support for int16 and int8 columns

Summary: This avoids errors such as `TypeError: Cannot interpret 'Int16Dtype()' as a data type` when running something like `sample.covars().plot()` on an input DataFrame that contains an int16 or int8 column. We convert both into float16 (using float16 for int8 as well, since float8 doesn't exist, see: https://stackoverflow.com/a/40507235/256662). Reviewed By: SarigT Differential Revision: D42923742 fbshipit-source-id: 0c2054116878d66adcbb8cdde1a42aea0a5fded1
facebookresearch · Feb 1, 2023 · 5b5110f · 5b5110f
1 parent c164d40
commit 5b5110f
Show file tree

Hide file tree

Showing 4 changed files with 21 additions and 4 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,8 @@
-0.4.0 (the future)
+0.3.1 (2023-02-01)
 ==================
+### Bug Fixes
+- Sample.from_frame now also converts int16 and int8 columns to float16 (both map to float16, since float8 doesn't exist). This helps avoid `TypeError: Cannot interpret 'Int16Dtype()' as a data type` style errors.
+
 ### Documentation
 - Added ISSUE_TEMPLATE
 

diff --git a/balance/__init__.py b/balance/__init__.py
@@ -20,7 +20,7 @@
 # TODO: verify this works.
 
 global __version__
-__version__ = "0.3.0"  # open source version
+__version__ = "0.3.1"  # open source version
 
 
 def setup_logging(

diff --git a/balance/sample_class.py b/balance/sample_class.py
@@ -207,8 +207,16 @@ def from_frame(
             #           for x in df.columns:
             #               if (is_numeric_dtype(df[x])) and (not is_bool_dtype(df[x])):
             #                   df[x] = df[x].astype("float64")
-            input_type = ["Int64", "Int32", "int64", "int32", "string"]
-            output_type = ["float64", "float32", "float64", "float32", "object"]
+            input_type = ["Int64", "Int32", "int64", "int32", "int16", "int8", "string"]
+            output_type = [
+                "float64",
+                "float32",  # This changes Int32Dtype() into dtype('int32') (from pandas to numpy)
+                "float64",
+                "float32",
+                "float16",
+                "float16",  # Using float16 since float8 doesn't exist, see: https://stackoverflow.com/a/40507235/256662
+                "object",
+            ]
             for i_input, i_output in zip(input_type, output_type):
                 sample._df = balance_util._pd_convert_all_types(
                     sample._df, i_input, i_output

diff --git a/tests/test_sample.py b/tests/test_sample.py
@@ -169,6 +169,12 @@ def test_Sample_from_frame(self):
         df = pd.DataFrame({"id": (1, 2), "a": (1, 2)}, dtype=np.int32)
         self.assertEqual(df.a.dtype.type, np.int32)
         self.assertEqual(Sample.from_frame(df).df.a.dtype.type, np.float32)
+        df = pd.DataFrame({"id": (1, 2), "a": (1, 2)}, dtype=np.int16)
+        self.assertEqual(df.a.dtype.type, np.int16)
+        self.assertEqual(Sample.from_frame(df).df.a.dtype.type, np.float16)
+        df = pd.DataFrame({"id": (1, 2), "a": (1, 2)}, dtype=np.int8)
+        self.assertEqual(df.a.dtype.type, np.int8)
+        self.assertEqual(Sample.from_frame(df).df.a.dtype.type, np.float16)
         # TODO: add tests for other types of conversions
 
     def test_Sample_adjust(self):