BUG: ArrowNotImplementedError when setting multiindex with dictionary[pyarrow] columns #52657

aprandin · 2023-04-13T17:11:51Z

Pandas version checks

I have checked that this issue has not already been reported.
I have confirmed this bug exists on the latest version of pandas.
I have confirmed this bug exists on the main branch of pandas.

Reproducible Example

import pandas as pd
import numpy as np

# Build a small frame of four integer columns, 20 random values each,
# drawn from non-overlapping ranges.
data = {
    'x_1':np.random.randint(0,5,size=20),
    'x_2':np.random.randint(5,10,size=20),
    'x_3':np.random.randint(10,15,size=20),
    'x_4':np.random.randint(15,20,size=20),
    
}
df = pd.DataFrame(data=data)
# Only x_1 and x_2 are converted to pandas categoricals; x_3 and x_4 stay
# plain integers so the two kinds of column can be compared below.
for col in ['x_1','x_2']:
    df[col] = df[col].astype('category')
# Round-trip through Feather: reading back with the pyarrow dtype backend
# is what yields the "pyarrow categorical" columns exercised below (the
# error reported for this script names a pyarrow dictionary<...> type).
df.to_feather('example.fh')

dff = pd.read_feather('example.fh', dtype_backend='pyarrow')
# NOTE(review): `display` is not imported here; presumably the IPython /
# Jupyter builtin, since the traceback refers to "Cell In[8]" - confirm.
display(dff.dtypes)

# Setting only one "pyarrow categorical" column as index works fine
display(dff.set_index('x_1').head())

# Setting multiple NOT "pyarrow categorical" columns as index works fine
display(dff.set_index(['x_3','x_4']).head())

# Setting multiple "pyarrow categorical" columns as index DOES NOT WORK
display(dff.set_index(['x_1','x_2']).head())

Issue Description

ArrowNotImplementedError Traceback (most recent call last)
Cell In[8], line 2
1 # Setting multiple "pyarrow categorical" columns as index DOES NOT WORK
----> 2 display(dff.set_index(['x_1','x_2']).head())

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\frame.py:5923, in DataFrame.set_index(self, keys, drop, append, inplace, verify_integrity)
5915 if len(arrays[-1]) != len(self):
5916 # check newest element against length of calling frame, since
5917 # ensure_index_from_sequences would not raise for append=False.
5918 raise ValueError(
5919 f"Length mismatch: Expected {len(self)} rows, "
5920 f"received array of length {len(arrays[-1])}"
5921 )
-> 5923 index = ensure_index_from_sequences(arrays, names)
5925 if verify_integrity and not index.is_unique:
5926 duplicates = index[index.duplicated()].unique()

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\indexes\base.py:7066, in ensure_index_from_sequences(sequences, names)
7064 return Index(sequences[0], name=names)
7065 else:
-> 7066 return MultiIndex.from_arrays(sequences, names=names)

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\indexes\multi.py:505, in MultiIndex.from_arrays(cls, arrays, sortorder, names)
502 if len(arrays[i]) != len(arrays[i - 1]):
503 raise ValueError("all arrays must be same length")
--> 505 codes, levels = factorize_from_iterables(arrays)
506 if names is lib.no_default:
507 names = [getattr(arr, "name", None) for arr in arrays]

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\arrays\categorical.py:2603, in factorize_from_iterables(iterables)
2599 if len(iterables) == 0:
2600 # For consistency, it should return two empty lists.
2601 return [], []
-> 2603 codes, categories = zip(*(factorize_from_iterable(it) for it in iterables))
2604 return list(codes), list(categories)

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\arrays\categorical.py:2603, in <genexpr>(.0)
2599 if len(iterables) == 0:
2600 # For consistency, it should return two empty lists.
2601 return [], []
-> 2603 codes, categories = zip(*(factorize_from_iterable(it) for it in iterables))
2604 return list(codes), list(categories)

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\arrays\categorical.py:2576, in factorize_from_iterable(values)
2571 codes = values.codes
2572 else:
2573 # The value of ordered is irrelevant since we don't use cat as such,
2574 # but only the resulting categories, the order of which is independent
2575 # from ordered. Set ordered to False as default. See GH #15457
-> 2576 cat = Categorical(values, ordered=False)
2577 categories = cat.categories
2578 codes = cat.codes

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\arrays\categorical.py:425, in Categorical.__init__(self, values, categories, ordered, dtype, fastpath, copy)
423 if dtype.categories is None:
424 try:
--> 425 codes, categories = factorize(values, sort=True)
426 except TypeError as err:
427 codes, categories = factorize(values, sort=False)

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\algorithms.py:747, in factorize(values, sort, use_na_sentinel, size_hint)
738 # Implementation notes: This method is responsible for 3 things
739 # 1.) coercing data to array-like (ndarray, Index, extension array)
740 # 2.) factorizing codes and uniques
(...)
744 # responsible only for factorization. All data coercion, sorting and boxing
745 # should happen here.
746 if isinstance(values, (ABCIndex, ABCSeries)):
--> 747 return values.factorize(sort=sort, use_na_sentinel=use_na_sentinel)
749 values = _ensure_arraylike(values)
750 original = values

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\base.py:1164, in IndexOpsMixin.factorize(self, sort, use_na_sentinel)
1146 @doc(
1147 algorithms.factorize,
1148 values="",
(...)
1162 use_na_sentinel: bool = True,
1163 ) -> tuple[npt.NDArray[np.intp], Index]:
-> 1164 codes, uniques = algorithms.factorize(
1165 self._values, sort=sort, use_na_sentinel=use_na_sentinel
1166 )
1167 if uniques.dtype == np.float16:
1168 uniques = uniques.astype(np.float32)

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\algorithms.py:763, in factorize(values, sort, use_na_sentinel, size_hint)
759 return codes, uniques
761 elif not isinstance(values, np.ndarray):
762 # i.e. ExtensionArray
--> 763 codes, uniques = values.factorize(use_na_sentinel=use_na_sentinel)
765 else:
766 values = np.asarray(values) # convert DTA/TDA/MultiIndex

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\arrays\arrow\array.py:841, in ArrowExtensionArray.factorize(self, use_na_sentinel)
838 else:
839 data = self._data
--> 841 encoded = data.dictionary_encode(null_encoding=null_encoding)
842 if encoded.length() == 0:
843 indices = np.array([], dtype=np.intp)

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pyarrow\table.pxi:586, in pyarrow.lib.ChunkedArray.dictionary_encode()

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pyarrow\_compute.pyx:560, in pyarrow._compute.call_function()

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pyarrow\_compute.pyx:355, in pyarrow._compute.Function.call()

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pyarrow\error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pyarrow\error.pxi:121, in pyarrow.lib.check_status()

ArrowNotImplementedError: Function 'dictionary_encode' has no kernel matching input types (dictionary<values=int32, indices=int8, ordered=0>)

Expected Behavior

Installed Versions

INSTALLED VERSIONS

commit : 478d340
python : 3.8.16.final.0
python-bits : 64
OS : Windows
OS-release : 10
Version : 10.0.19045
machine : AMD64
processor : Intel64 Family 6 Model 142 Stepping 12, GenuineIntel
byteorder : little
LC_ALL : None
LANG : None
LOCALE : English_United States.1252

pandas : 2.0.0
numpy : 1.23.5
pytz : 2023.3
dateutil : 2.8.2
setuptools : 67.6.1
pip : 23.0.1
Cython : None
pytest : None
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : 3.0.9
lxml.etree : 4.9.2
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 3.1.2
IPython : 8.12.0
pandas_datareader: None
bs4 : 4.12.2
bottleneck : None
brotli :
fastparquet : None
fsspec : None
gcsfs : None
matplotlib : 3.7.1
numba : 0.56.4
numexpr : None
odfpy : None
openpyxl : 3.1.2
pandas_gbq : None
pyarrow : 11.0.0
pyreadstat : 1.2.1
pyxlsb : None
s3fs : None
scipy : 1.10.1
snappy : None
sqlalchemy : 2.0.9
tables : None
tabulate : None
xarray : None
xlrd : 1.2.0
zstandard : None
tzdata : 2023.3
qtpy : None
pyqt5 : None

The text was updated successfully, but these errors were encountered:

phofl · 2023-04-22T17:09:08Z

An xfailed test is welcome

aprandin · 2023-04-27T13:22:58Z

Hi @phofl many thanks for working on the issue.

Do you mean a test like the one below?

import pandas as pd
import numpy as np
import pyarrow as pa
import pytest


# Module-level fixture data shared by the tests below: four integer
# columns of 20 random values each, drawn from non-overlapping ranges.
data = {
    'x_1':np.random.randint(0,5,size=20),
    'x_2':np.random.randint(5,10,size=20),
    'x_3':np.random.randint(10,15,size=20),
    'x_4':np.random.randint(15,20,size=20),
}
df = pd.DataFrame(data=data)
# Only x_1 and x_2 become categoricals; x_3 and x_4 stay plain integers.
for col in ['x_1','x_2']:
    df[col] = df[col].astype('category')
# NOTE(review): this writes 'example.fh' into the current working directory
# at import time and never removes it.
df.to_feather('example.fh')

# Read back with the pyarrow dtype backend; `dff` is the frame every test
# in this snippet operates on.
dff = pd.read_feather('example.fh', dtype_backend='pyarrow')

def test_index_one_cat():
    """Set a single pyarrow-categorical column as the index.

    The bug report states that this case works; only the multi-column
    categorical case raises. The test is therefore a plain passing test
    rather than an expected failure, and it checks that the chosen column
    really became the index.
    """
    result = dff.set_index('x_1')
    assert result.index.name == 'x_1'
    
def test_index_two_non_cat():
    """Set two non-categorical pyarrow columns as a MultiIndex.

    The bug report states that this case works, so it is a plain passing
    test rather than an expected failure. It checks that both columns
    became index levels, in the order given.
    """
    result = dff.set_index(['x_3', 'x_4'])
    assert list(result.index.names) == ['x_3', 'x_4']

@pytest.mark.xfail(
    reason="Unable to set multiple pyarrow-categorical columns to index",
    raises=pa.lib.ArrowNotImplementedError,
)
def test_index_two_cat():
    """Set two pyarrow-categorical columns as a MultiIndex.

    This is the failing case from the bug report: building the MultiIndex
    ends in pyarrow's ``dictionary_encode``, which raises
    ``ArrowNotImplementedError`` for dictionary-typed input.
    """
    index_columns = ['x_1', 'x_2']
    dff.set_index(index_columns)

Please let me know if this is in line with your expectations.

Thank you.

mroeschke · 2023-12-06T20:16:03Z

Looks like this needs to be fixed upstream in pyarrow so closing

aprandin added the Bug and Needs Triage (issue that has not been reviewed by a pandas team member) labels on Apr 13, 2023

phofl added the Arrow (pyarrow functionality) and Upstream issue (issue related to pandas dependency) labels and removed the Needs Triage (issue that has not been reviewed by a pandas team member) label on Apr 22, 2023

phofl added the Needs Tests (unit test(s) needed to prevent regressions) label on Apr 22, 2023

mroeschke closed this as completed Dec 6, 2023

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

BUG: ArrowNotImplementedError when setting multiindex with dictionary[pyarrow] columns #52657

BUG: ArrowNotImplementedError when setting multiindex with dictionary[pyarrow] columns #52657

aprandin commented Apr 13, 2023

phofl commented Apr 22, 2023

aprandin commented Apr 27, 2023 •

edited

Loading

mroeschke commented Dec 6, 2023

BUG: ArrowNotImplementedError when setting multiindex with dictionary[pyarrow] columns #52657

BUG: ArrowNotImplementedError when setting multiindex with dictionary[pyarrow] columns #52657

Comments

aprandin commented Apr 13, 2023

Pandas version checks

Reproducible Example

Issue Description

Expected Behavior

Installed Versions

INSTALLED VERSIONS

phofl commented Apr 22, 2023

aprandin commented Apr 27, 2023 • edited Loading

mroeschke commented Dec 6, 2023

aprandin commented Apr 27, 2023 •

edited

Loading