Skip to content

Commit

Permalink
TST/ CLN: Remove makeCustomIndex/DataFrame (#56331)
Browse files Browse the repository at this point in the history
* Remove makeCustom

* Fix mismatches

* Remove makeCustomIndex in all

* Fix excel
  • Loading branch information
mroeschke authored Dec 5, 2023
1 parent dc4c474 commit 0b9e784
Show file tree
Hide file tree
Showing 19 changed files with 311 additions and 403 deletions.
6 changes: 3 additions & 3 deletions doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1490,9 +1490,9 @@ rows will skip the intervening rows.

.. ipython:: python
from pandas._testing import makeCustomDataframe as mkdf
df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
mi_idx = pd.MultiIndex.from_arrays([[1, 2, 3, 4], list("abcd")], names=list("ab"))
mi_col = pd.MultiIndex.from_arrays([[1, 2], list("ab")], names=list("cd"))
df = pd.DataFrame(np.ones((4, 2)), index=mi_idx, columns=mi_col)
df.to_csv("mi.csv")
print(open("mi.csv").read())
pd.read_csv("mi.csv", header=[0, 1, 2, 3], index_col=[0, 1])
Expand Down
6 changes: 3 additions & 3 deletions doc/source/whatsnew/v0.12.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -250,9 +250,9 @@ IO enhancements

.. ipython:: python
from pandas._testing import makeCustomDataframe as mkdf
df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
mi_idx = pd.MultiIndex.from_arrays([[1, 2, 3, 4], list("abcd")], names=list("ab"))
mi_col = pd.MultiIndex.from_arrays([[1, 2], list("ab")], names=list("cd"))
df = pd.DataFrame(np.ones((4, 2)), index=mi_idx, columns=mi_col)
df.to_csv("mi.csv")
print(open("mi.csv").read())
pd.read_csv("mi.csv", header=[0, 1, 2, 3], index_col=[0, 1])
Expand Down
235 changes: 1 addition & 234 deletions pandas/_testing/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from __future__ import annotations

import collections
from collections import Counter
from decimal import Decimal
import operator
import os
Expand All @@ -24,10 +22,7 @@

from pandas.compat import pa_version_under10p1

from pandas.core.dtypes.common import (
is_sequence,
is_string_dtype,
)
from pandas.core.dtypes.common import is_string_dtype

import pandas as pd
from pandas import (
Expand All @@ -38,9 +33,6 @@
MultiIndex,
RangeIndex,
Series,
date_range,
period_range,
timedelta_range,
)
from pandas._testing._io import (
round_trip_localpath,
Expand Down Expand Up @@ -332,229 +324,6 @@ def to_array(obj):
return extract_array(obj, extract_numpy=True)


# -----------------------------------------------------------------------------
# Others


def makeCustomIndex(
    nentries,
    nlevels,
    prefix: str = "#",
    names: bool | str | list[str] | None = False,
    ndupe_l=None,
    idx_type=None,
) -> Index:
    """
    Create an Index/MultiIndex with given dimensions, levels, names, etc.

    Parameters
    ----------
    nentries : int
        Number of entries in the index.
    nlevels : int
        Number of levels (> 1 produces a MultiIndex, except when
        ``nentries == 1``; see Returns).
    prefix : str, default "#"
        String prefix used for the generated labels and the default names.
    names : bool, str, list of str or None, default False
        If True, default names ``prefix + str(level)`` are used; if False or
        None, no names are set; if a list is given, the name of each level is
        taken from the list (its length must equal ``nlevels``). A single
        string is accepted when ``nlevels == 1``.
    ndupe_l : sequence of int, optional
        The number of rows for which the label is repeated at the
        corresponding level. You may specify just the first few levels; the
        rest use a multiplicity of 1. ``len(ndupe_l) <= nlevels``. The
        sequence passed in is not modified.
    idx_type : {"i", "f", "s", "dt", "p", "td"}, optional
        If not None, ``nlevels`` must be 1.
        "i"/"f" creates an integer/float index,
        "s" creates a string index,
        "dt" creates a datetime index,
        "td" creates a timedelta index,
        "p" creates a period index.
        If unspecified, string labels of the form
        ``{prefix}_l{level}_g{group}`` are generated.

    Returns
    -------
    Index
        A MultiIndex when ``nlevels > 1`` and ``nentries != 1``; otherwise a
        flat Index. With ``nentries == 1`` the labels of the single generated
        tuple become the values of a flat Index.

    Raises
    ------
    AssertionError
        If the arguments are inconsistent with each other.
    ValueError
        If ``idx_type`` is not one of the supported codes. Note that "u"
        passes the assertion below but has no generator, so it ends up here.
    """
    if ndupe_l is None:
        ndupe_l = [1] * nlevels
    assert is_sequence(ndupe_l) and len(ndupe_l) <= nlevels
    # Lengths are compared with ``==``; ``is`` on ints only works by accident
    # of small-integer caching. A bare string is allowed for a single level,
    # matching the normalisation a few lines below.
    assert (
        names is None
        or names is False
        or names is True
        or (isinstance(names, str) and nlevels == 1)
        or len(names) == nlevels
    )
    assert idx_type is None or (
        idx_type in ("i", "f", "s", "u", "dt", "p", "td") and nlevels == 1
    )

    if names is True:
        # build default names
        names = [prefix + str(i) for i in range(nlevels)]
    if names is False:
        # pass None to index constructor for no name
        names = None

    # make singleton case uniform
    if isinstance(names, str) and nlevels == 1:
        names = [names]

    # specific 1D index type requested?
    idx_func_dict: dict[str, Callable[..., Index]] = {
        "i": lambda n: Index(np.arange(n), dtype=np.int64),
        "f": lambda n: Index(np.arange(n), dtype=np.float64),
        "s": lambda n: Index([f"{i}_{chr(i)}" for i in range(97, 97 + n)]),
        "dt": lambda n: date_range("2020-01-01", periods=n),
        "td": lambda n: timedelta_range("1 day", periods=n),
        "p": lambda n: period_range("2020-01-01", periods=n, freq="D"),
    }
    idx_func = idx_func_dict.get(idx_type)
    if idx_func:
        idx = idx_func(nentries)
        # but we need to fill in the name
        if names:
            idx.name = names[0]
        return idx
    elif idx_type is not None:
        raise ValueError(
            f"{repr(idx_type)} is not a legal value for `idx_type`, "
            "use 'i'/'f'/'s'/'dt'/'p'/'td'."
        )

    # Pad a *copy* up to ``nlevels`` so the caller's sequence is never
    # mutated (and so tuples are accepted as well as lists).
    ndupe_l = list(ndupe_l) + [1] * (nlevels - len(ndupe_l))
    assert len(ndupe_l) == nlevels

    assert all(x > 0 for x in ndupe_l)

    def keyfunc(x):
        # Sort labels by the integers embedded in them (level, group)
        # rather than lexicographically, so "g10" sorts after "g9".
        numeric_tuple = re.sub(r"[^\d_]_?", "", x).split("_")
        return [int(num) for num in numeric_tuple]

    # build a list of lists to create the index from
    list_of_lists = []
    for i in range(nlevels):
        # One group more than strictly needed, so that slicing to
        # ``nentries`` below always has enough labels to draw from.
        div_factor = nentries // ndupe_l[i] + 1

        cnt: Counter[str] = collections.Counter()
        for j in range(div_factor):
            label = f"{prefix}_l{i}_g{j}"
            cnt[label] = ndupe_l[i]
        # Counter.elements() repeats each label ``ndupe_l[i]`` times
        result = sorted(cnt.elements(), key=keyfunc)[:nentries]
        list_of_lists.append(result)

    tuples = list(zip(*list_of_lists))

    # convert tuples to index
    if nentries == 1:
        # we have a single level of tuples, i.e. a regular Index
        name = None if names is None else names[0]
        index = Index(tuples[0], name=name)
    elif nlevels == 1:
        name = None if names is None else names[0]
        index = Index((x[0] for x in tuples), name=name)
    else:
        index = MultiIndex.from_tuples(tuples, names=names)
    return index


def makeCustomDataframe(
    nrows,
    ncols,
    c_idx_names: bool | list[str] = True,
    r_idx_names: bool | list[str] = True,
    c_idx_nlevels: int = 1,
    r_idx_nlevels: int = 1,
    data_gen_f=None,
    c_ndupe_l=None,
    r_ndupe_l=None,
    dtype=None,
    c_idx_type=None,
    r_idx_type=None,
) -> DataFrame:
    """
    Build a DataFrame whose shape, axes and cell values are generated.

    Both axes are produced by ``makeCustomIndex`` (columns with prefix "C",
    rows with prefix "R"); the cells are filled by calling ``data_gen_f``
    for every (row, column) position.

    Parameters
    ----------
    nrows, ncols : int
        Number of data rows / columns.
    c_idx_names, r_idx_names : bool or list of str, default True
        Forwarded as ``names`` to ``makeCustomIndex`` for the column / row
        axis: False gives no names, True gives default names, a list gives
        one name per level.
    c_idx_nlevels, r_idx_nlevels : int, default 1
        Number of levels of the column / row axis; must be positive.
    data_gen_f : callable, optional
        Function ``f(row, col)`` returning the value stored at that
        position. By default cells hold the string ``"R{row}C{col}"``.
    c_ndupe_l, r_ndupe_l : list of int, optional
        Forwarded as ``ndupe_l`` to ``makeCustomIndex``: how many times each
        label is repeated at each level of the corresponding axis.
    dtype : optional
        Passed to the DataFrame constructor as is.
    c_idx_type, r_idx_type : {"i", "f", "s", "dt", "p", "td"}, optional
        Forwarded as ``idx_type`` to ``makeCustomIndex``. When given, the
        matching ``*_idx_nlevels`` must be 1.

    Returns
    -------
    DataFrame

    Examples
    --------
    # 5 rows, 3 columns, default names, single-level axes
    >> makeCustomDataframe(5, 3)

    # random integer data
    >> makeCustomDataframe(5, 3, data_gen_f=lambda r, c: randint(1, 100))

    # 2-level rows, first-level labels duplicated twice
    >> makeCustomDataframe(5, 3, r_idx_nlevels=2, r_ndupe_l=[2])

    # datetime rows, string columns, no names on either axis
    >> makeCustomDataframe(5, 3, c_idx_names=False, r_idx_names=False,
                           r_idx_type="dt", c_idx_type="s")

    # 4-level named rows, 2-level columns with default names
    >> makeCustomDataframe(5, 3, r_idx_nlevels=4,
                           r_idx_names=["FEE", "FIH", "FOH", "FUM"],
                           c_idx_nlevels=2)
    """
    supported_idx_types = ("i", "f", "s", "dt", "p", "td")

    assert c_idx_nlevels > 0
    assert r_idx_nlevels > 0
    # a typed axis is only meaningful for a single level
    assert r_idx_type is None or (
        r_idx_type in supported_idx_types and r_idx_nlevels == 1
    )
    assert c_idx_type is None or (
        c_idx_type in supported_idx_types and c_idx_nlevels == 1
    )

    col_axis = makeCustomIndex(
        ncols,
        nlevels=c_idx_nlevels,
        prefix="C",
        names=c_idx_names,
        ndupe_l=c_ndupe_l,
        idx_type=c_idx_type,
    )
    row_axis = makeCustomIndex(
        nrows,
        nlevels=r_idx_nlevels,
        prefix="R",
        names=r_idx_names,
        ndupe_l=r_ndupe_l,
        idx_type=r_idx_type,
    )

    def _position_label(r, c):
        # default cell value: encodes the cell's own location
        return f"R{r}C{c}"

    cell_value = _position_label if data_gen_f is None else data_gen_f

    # fill row by row, left to right
    rows = []
    for r in range(nrows):
        row = []
        for c in range(ncols):
            row.append(cell_value(r, c))
        rows.append(row)

    return DataFrame(rows, index=row_axis, columns=col_axis, dtype=dtype)


class SubclassedSeries(Series):
_metadata = ["testattr", "name"]

Expand Down Expand Up @@ -868,8 +637,6 @@ def shares_memory(left, right) -> bool:
"iat",
"iloc",
"loc",
"makeCustomDataframe",
"makeCustomIndex",
"maybe_produces_warning",
"NARROW_NP_DTYPES",
"NP_NAT_OBJECTS",
Expand Down
Loading

0 comments on commit 0b9e784

Please sign in to comment.