BUG: ArrowNotImplementedError when setting multiindex with dictionary[pyarrow] columns #52657
Closed
2 of 3 tasks
Labels
Arrow
pyarrow functionality
Bug
Needs Tests
Unit test(s) needed to prevent regressions
Upstream issue
Issue related to pandas dependency
Pandas version checks
I have checked that this issue has not already been reported.
I have confirmed this bug exists on the latest version of pandas.
I have confirmed this bug exists on the main branch of pandas.
Reproducible Example
Issue Description
ArrowNotImplementedError Traceback (most recent call last)
Cell In[8], line 2
1 # Setting multiple "pyarrow categorical" columns as index DOES NOT WORK
----> 2 display(dff.set_index(['x_1','x_2']).head())
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\frame.py:5923, in DataFrame.set_index(self, keys, drop, append, inplace, verify_integrity)
5915 if len(arrays[-1]) != len(self):
5916 # check newest element against length of calling frame, since
5917 # ensure_index_from_sequences would not raise for append=False.
5918 raise ValueError(
5919 f"Length mismatch: Expected {len(self)} rows, "
5920 f"received array of length {len(arrays[-1])}"
5921 )
-> 5923 index = ensure_index_from_sequences(arrays, names)
5925 if verify_integrity and not index.is_unique:
5926 duplicates = index[index.duplicated()].unique()
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\indexes\base.py:7066, in ensure_index_from_sequences(sequences, names)
7064 return Index(sequences[0], name=names)
7065 else:
-> 7066 return MultiIndex.from_arrays(sequences, names=names)
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\indexes\multi.py:505, in MultiIndex.from_arrays(cls, arrays, sortorder, names)
502 if len(arrays[i]) != len(arrays[i - 1]):
503 raise ValueError("all arrays must be same length")
--> 505 codes, levels = factorize_from_iterables(arrays)
506 if names is lib.no_default:
507 names = [getattr(arr, "name", None) for arr in arrays]
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\arrays\categorical.py:2603, in factorize_from_iterables(iterables)
2599 if len(iterables) == 0:
2600 # For consistency, it should return two empty lists.
2601 return [], []
-> 2603 codes, categories = zip(*(factorize_from_iterable(it) for it in iterables))
2604 return list(codes), list(categories)
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\arrays\categorical.py:2603, in <genexpr>(.0)
2599 if len(iterables) == 0:
2600 # For consistency, it should return two empty lists.
2601 return [], []
-> 2603 codes, categories = zip(*(factorize_from_iterable(it) for it in iterables))
2604 return list(codes), list(categories)
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\arrays\categorical.py:2576, in factorize_from_iterable(values)
2571 codes = values.codes
2572 else:
2573 # The value of ordered is irrelevant since we don't use cat as such,
2574 # but only the resulting categories, the order of which is independent
2575 # from ordered. Set ordered to False as default. See GH #15457
-> 2576 cat = Categorical(values, ordered=False)
2577 categories = cat.categories
2578 codes = cat.codes
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\arrays\categorical.py:425, in Categorical.__init__(self, values, categories, ordered, dtype, fastpath, copy)
423 if dtype.categories is None:
424 try:
--> 425 codes, categories = factorize(values, sort=True)
426 except TypeError as err:
427 codes, categories = factorize(values, sort=False)
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\algorithms.py:747, in factorize(values, sort, use_na_sentinel, size_hint)
738 # Implementation notes: This method is responsible for 3 things
739 # 1.) coercing data to array-like (ndarray, Index, extension array)
740 # 2.) factorizing codes and uniques
(...)
744 # responsible only for factorization. All data coercion, sorting and boxing
745 # should happen here.
746 if isinstance(values, (ABCIndex, ABCSeries)):
--> 747 return values.factorize(sort=sort, use_na_sentinel=use_na_sentinel)
749 values = _ensure_arraylike(values)
750 original = values
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\base.py:1164, in IndexOpsMixin.factorize(self, sort, use_na_sentinel)
1146 @doc(
1147 algorithms.factorize,
1148 values="",
(...)
1162 use_na_sentinel: bool = True,
1163 ) -> tuple[npt.NDArray[np.intp], Index]:
-> 1164 codes, uniques = algorithms.factorize(
1165 self._values, sort=sort, use_na_sentinel=use_na_sentinel
1166 )
1167 if uniques.dtype == np.float16:
1168 uniques = uniques.astype(np.float32)
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\algorithms.py:763, in factorize(values, sort, use_na_sentinel, size_hint)
759 return codes, uniques
761 elif not isinstance(values, np.ndarray):
762 # i.e. ExtensionArray
--> 763 codes, uniques = values.factorize(use_na_sentinel=use_na_sentinel)
765 else:
766 values = np.asarray(values) # convert DTA/TDA/MultiIndex
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\arrays\arrow\array.py:841, in ArrowExtensionArray.factorize(self, use_na_sentinel)
838 else:
839 data = self._data
--> 841 encoded = data.dictionary_encode(null_encoding=null_encoding)
842 if encoded.length() == 0:
843 indices = np.array([], dtype=np.intp)
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pyarrow\table.pxi:586, in pyarrow.lib.ChunkedArray.dictionary_encode()
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pyarrow\_compute.pyx:560, in pyarrow._compute.call_function()
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pyarrow\_compute.pyx:355, in pyarrow._compute.Function.call()
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pyarrow\error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pyarrow\error.pxi:121, in pyarrow.lib.check_status()
ArrowNotImplementedError: Function 'dictionary_encode' has no kernel matching input types (dictionary<values=int32, indices=int8, ordered=0>)
Expected Behavior
Installed Versions
INSTALLED VERSIONS
commit : 478d340
python : 3.8.16.final.0
python-bits : 64
OS : Windows
OS-release : 10
Version : 10.0.19045
machine : AMD64
processor : Intel64 Family 6 Model 142 Stepping 12, GenuineIntel
byteorder : little
LC_ALL : None
LANG : None
LOCALE : English_United States.1252
pandas : 2.0.0
numpy : 1.23.5
pytz : 2023.3
dateutil : 2.8.2
setuptools : 67.6.1
pip : 23.0.1
Cython : None
pytest : None
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : 3.0.9
lxml.etree : 4.9.2
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 3.1.2
IPython : 8.12.0
pandas_datareader: None
bs4 : 4.12.2
bottleneck : None
brotli :
fastparquet : None
fsspec : None
gcsfs : None
matplotlib : 3.7.1
numba : 0.56.4
numexpr : None
odfpy : None
openpyxl : 3.1.2
pandas_gbq : None
pyarrow : 11.0.0
pyreadstat : 1.2.1
pyxlsb : None
s3fs : None
scipy : 1.10.1
snappy : None
sqlalchemy : 2.0.9
tables : None
tabulate : None
xarray : None
xlrd : 1.2.0
zstandard : None
tzdata : 2023.3
qtpy : None
pyqt5 : None
The text was updated successfully, but these errors were encountered: