Skip to content

Commit

Permalink
TST/CLN: Remove makeStringIndex (#56155)
Browse files: browse the repository at this point in the history
* TST/CLN: Remove makeStringIndex

* Fix failures

* Fix one more test

* Remove makeBoolIndex too

* Remove name

* Adjust test

* Fix benchmarks

* Fix another benchmark
  • Loading branch information
mroeschke authored Nov 27, 2023
1 parent 27ec887 commit a38ecd5
Show file tree
Hide file tree
Showing 48 changed files with 253 additions and 274 deletions.
15 changes: 9 additions & 6 deletions asv_bench/benchmarks/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@

import pandas as pd

from .pandas_vb_common import tm

for imp in ["pandas.util", "pandas.tools.hashing"]:
try:
hashing = import_module(imp)
Expand Down Expand Up @@ -47,9 +45,12 @@ def setup(self, unique, sort, dtype):
elif dtype == "datetime64[ns, tz]":
data = pd.date_range("2011-01-01", freq="h", periods=N, tz="Asia/Tokyo")
elif dtype == "object_str":
data = tm.makeStringIndex(N)
data = pd.Index([f"i-{i}" for i in range(N)], dtype=object)
elif dtype == "string[pyarrow]":
data = pd.array(tm.makeStringIndex(N), dtype="string[pyarrow]")
data = pd.array(
pd.Index([f"i-{i}" for i in range(N)], dtype=object),
dtype="string[pyarrow]",
)
else:
raise NotImplementedError

Expand Down Expand Up @@ -88,7 +89,7 @@ def setup(self, unique, keep, dtype):
elif dtype == "float64":
data = pd.Index(np.random.randn(N), dtype="float64")
elif dtype == "string":
data = tm.makeStringIndex(N)
data = pd.Index([f"i-{i}" for i in range(N)], dtype=object)
elif dtype == "datetime64[ns]":
data = pd.date_range("2011-01-01", freq="h", periods=N)
elif dtype == "datetime64[ns, tz]":
Expand Down Expand Up @@ -136,7 +137,9 @@ def setup_cache(self):
df = pd.DataFrame(
{
"strings": pd.Series(
tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=N))
pd.Index([f"i-{i}" for i in range(10000)], dtype=object).take(
np.random.randint(0, 10000, size=N)
)
),
"floats": np.random.randn(N),
"ints": np.arange(N),
Expand Down
6 changes: 3 additions & 3 deletions asv_bench/benchmarks/algos/isin.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
date_range,
)

from ..pandas_vb_common import tm


class IsIn:
params = [
Expand Down Expand Up @@ -60,7 +58,9 @@ def setup(self, dtype):

elif dtype in ["str", "string[python]", "string[pyarrow]"]:
try:
self.series = Series(tm.makeStringIndex(N), dtype=dtype)
self.series = Series(
Index([f"i-{i}" for i in range(N)], dtype=object), dtype=dtype
)
except ImportError:
raise NotImplementedError
self.values = list(self.series[:2])
Expand Down
4 changes: 1 addition & 3 deletions asv_bench/benchmarks/ctors.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
date_range,
)

from .pandas_vb_common import tm


def no_change(arr):
return arr
Expand Down Expand Up @@ -115,7 +113,7 @@ def time_dtindex_from_index_with_series(self):
class MultiIndexConstructor:
def setup(self):
N = 10**4
self.iterables = [tm.makeStringIndex(N), range(20)]
self.iterables = [Index([f"i-{i}" for i in range(N)], dtype=object), range(20)]

def time_multiindex_from_iterables(self):
MultiIndex.from_product(self.iterables)
Expand Down
9 changes: 6 additions & 3 deletions asv_bench/benchmarks/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
import numpy as np

import pandas as pd
from pandas import DataFrame
from pandas import (
DataFrame,
Index,
)
import pandas._testing as tm
from pandas.api.types import (
is_extension_array_dtype,
Expand Down Expand Up @@ -73,8 +76,8 @@ class SelectDtypes:

def setup(self, dtype):
N, K = 5000, 50
self.index = tm.makeStringIndex(N)
self.columns = tm.makeStringIndex(K)
self.index = Index([f"i-{i}" for i in range(N)], dtype=object)
self.columns = Index([f"i-{i}" for i in range(K)], dtype=object)

def create_df(data):
return DataFrame(data, index=self.index, columns=self.columns)
Expand Down
6 changes: 2 additions & 4 deletions asv_bench/benchmarks/frame_ctor.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@
date_range,
)

from .pandas_vb_common import tm

try:
from pandas.tseries.offsets import (
Hour,
Expand All @@ -30,8 +28,8 @@
class FromDicts:
def setup(self):
N, K = 5000, 50
self.index = tm.makeStringIndex(N)
self.columns = tm.makeStringIndex(K)
self.index = pd.Index([f"i-{i}" for i in range(N)], dtype=object)
self.columns = pd.Index([f"i-{i}" for i in range(K)], dtype=object)
frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns)
self.data = frame.to_dict()
self.dict_list = frame.to_dict(orient="records")
Expand Down
11 changes: 7 additions & 4 deletions asv_bench/benchmarks/frame_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from pandas import (
DataFrame,
Index,
MultiIndex,
NaT,
Series,
Expand All @@ -14,8 +15,6 @@
timedelta_range,
)

from .pandas_vb_common import tm


class AsType:
params = [
Expand Down Expand Up @@ -703,8 +702,12 @@ def setup(self, monotonic):
K = 10
df = DataFrame(
{
"key1": tm.makeStringIndex(N).values.repeat(K),
"key2": tm.makeStringIndex(N).values.repeat(K),
"key1": Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(
K
),
"key2": Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(
K
),
"value": np.random.randn(N * K),
}
)
Expand Down
6 changes: 2 additions & 4 deletions asv_bench/benchmarks/gil.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,14 @@

from pandas import (
DataFrame,
Index,
Series,
date_range,
factorize,
read_csv,
)
from pandas.core.algorithms import take_nd

from .pandas_vb_common import tm

try:
from pandas import (
rolling_kurt,
Expand All @@ -34,7 +33,6 @@
except ImportError:
from pandas import algos


from .pandas_vb_common import BaseIO # isort:skip


Expand Down Expand Up @@ -305,7 +303,7 @@ class ParallelFactorize:
param_names = ["threads"]

def setup(self, threads):
strings = tm.makeStringIndex(100000)
strings = Index([f"i-{i}" for i in range(100000)], dtype=object)

@test_parallel(num_threads=threads)
def parallel():
Expand Down
12 changes: 7 additions & 5 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@
to_timedelta,
)

from .pandas_vb_common import tm

method_blocklist = {
"object": {
"diff",
Expand Down Expand Up @@ -167,10 +165,14 @@ def setup_cache(self):
"int64_small": Series(np.random.randint(0, 100, size=size)),
"int64_large": Series(np.random.randint(0, 10000, size=size)),
"object_small": Series(
tm.makeStringIndex(100).take(np.random.randint(0, 100, size=size))
Index([f"i-{i}" for i in range(100)], dtype=object).take(
np.random.randint(0, 100, size=size)
)
),
"object_large": Series(
tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=size))
Index([f"i-{i}" for i in range(10000)], dtype=object).take(
np.random.randint(0, 10000, size=size)
)
),
}
return data
Expand Down Expand Up @@ -912,7 +914,7 @@ def setup(self):
n1 = 400
n2 = 250
index = MultiIndex(
levels=[np.arange(n1), tm.makeStringIndex(n2)],
levels=[np.arange(n1), Index([f"i-{i}" for i in range(n2)], dtype=object)],
codes=[np.repeat(range(n1), n2).tolist(), list(range(n2)) * n1],
names=["lev1", "lev2"],
)
Expand Down
11 changes: 7 additions & 4 deletions asv_bench/benchmarks/index_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@
date_range,
)

from .pandas_vb_common import tm


class SetOperations:
params = (
Expand All @@ -30,7 +28,7 @@ def setup(self, index_structure, dtype, method):
date_str_left = Index(dates_left.strftime(fmt))
int_left = Index(np.arange(N))
ea_int_left = Index(np.arange(N), dtype="Int64")
str_left = tm.makeStringIndex(N)
str_left = Index([f"i-{i}" for i in range(N)], dtype=object)

data = {
"datetime": dates_left,
Expand Down Expand Up @@ -155,7 +153,12 @@ class Indexing:

def setup(self, dtype):
N = 10**6
self.idx = getattr(tm, f"make{dtype}Index")(N)
if dtype == "String":
self.idx = Index([f"i-{i}" for i in range(N)], dtype=object)
elif dtype == "Float":
self.idx = Index(np.arange(N), dtype=np.float64)
elif dtype == "Int":
self.idx = Index(np.arange(N), dtype=np.int64)
self.array_mask = (np.arange(N) % 3) == 0
self.series_mask = Series(self.array_mask)
self.sorted = self.idx.sort_values()
Expand Down
8 changes: 3 additions & 5 deletions asv_bench/benchmarks/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@
period_range,
)

from .pandas_vb_common import tm


class NumericSeriesIndexing:
params = [
Expand Down Expand Up @@ -124,7 +122,7 @@ class NonNumericSeriesIndexing:
def setup(self, index, index_structure):
N = 10**6
if index == "string":
index = tm.makeStringIndex(N)
index = Index([f"i-{i}" for i in range(N)], dtype=object)
elif index == "datetime":
index = date_range("1900", periods=N, freq="s")
elif index == "period":
Expand Down Expand Up @@ -156,8 +154,8 @@ def time_getitem_list_like(self, index, index_structure):

class DataFrameStringIndexing:
def setup(self):
index = tm.makeStringIndex(1000)
columns = tm.makeStringIndex(30)
index = Index([f"i-{i}" for i in range(1000)], dtype=object)
columns = Index([f"i-{i}" for i in range(30)], dtype=object)
with warnings.catch_warnings(record=True):
self.df = DataFrame(np.random.randn(1000, 30), index=index, columns=columns)
self.idx_scalar = index[100]
Expand Down
8 changes: 3 additions & 5 deletions asv_bench/benchmarks/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import numpy as np

from pandas import (
Index,
NaT,
Series,
date_range,
Expand All @@ -17,10 +18,7 @@
to_timedelta,
)

from .pandas_vb_common import (
lib,
tm,
)
from .pandas_vb_common import lib


class ToNumeric:
Expand All @@ -31,7 +29,7 @@ def setup(self, errors):
N = 10000
self.float = Series(np.random.randn(N))
self.numstr = self.float.astype("str")
self.str = Series(tm.makeStringIndex(N))
self.str = Series(Index([f"i-{i}" for i in range(N)], dtype=object))

def time_from_float(self, errors):
to_numeric(self.float, errors=errors)
Expand Down
8 changes: 3 additions & 5 deletions asv_bench/benchmarks/io/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,15 @@
from pandas import (
Categorical,
DataFrame,
Index,
concat,
date_range,
period_range,
read_csv,
to_datetime,
)

from ..pandas_vb_common import (
BaseIO,
tm,
)
from ..pandas_vb_common import BaseIO


class ToCSV(BaseIO):
Expand Down Expand Up @@ -288,7 +286,7 @@ class ReadCSVSkipRows(BaseIO):

def setup(self, skiprows, engine):
N = 20000
index = tm.makeStringIndex(N)
index = Index([f"i-{i}" for i in range(N)], dtype=object)
df = DataFrame(
{
"float1": np.random.randn(N),
Expand Down
5 changes: 2 additions & 3 deletions asv_bench/benchmarks/io/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,11 @@
from pandas import (
DataFrame,
ExcelWriter,
Index,
date_range,
read_excel,
)

from ..pandas_vb_common import tm


def _generate_dataframe():
N = 2000
Expand All @@ -27,7 +26,7 @@ def _generate_dataframe():
columns=[f"float{i}" for i in range(C)],
index=date_range("20000101", periods=N, freq="h"),
)
df["object"] = tm.makeStringIndex(N)
df["object"] = Index([f"i-{i}" for i in range(N)], dtype=object)
return df


Expand Down
Loading

0 comments on commit a38ecd5

Please sign in to comment.