diff --git a/ci/code_checks.sh b/ci/code_checks.sh index e91629744463f..e41f625e583c0 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -14,6 +14,8 @@ # $ ./ci/code_checks.sh single-docs # check single-page docs build warning-free # $ ./ci/code_checks.sh notebooks # check execution of documentation notebooks +set -uo pipefail + [[ -z "$1" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "single-docs" || "$1" == "notebooks" ]] || \ { echo "Unknown command $1. Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 9999; } diff --git a/doc/make.py b/doc/make.py index dfa8ae6c1e34c..c4b7ab124f68f 100755 --- a/doc/make.py +++ b/doc/make.py @@ -236,8 +236,9 @@ def html(self): os.remove(zip_fname) if ret_code == 0: - if self.single_doc_html is not None and not self.no_browser: - self._open_browser(self.single_doc_html) + if self.single_doc_html is not None: + if not self.no_browser: + self._open_browser(self.single_doc_html) else: self._add_redirects() if self.whatsnew and not self.no_browser: diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 6eb1387c63a0a..c98fbb836cc6d 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -270,7 +270,7 @@ def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame: Examples -------- >>> import scipy.sparse - >>> mat = scipy.sparse.eye(3) + >>> mat = scipy.sparse.eye(3, dtype=float) >>> pd.DataFrame.sparse.from_spmatrix(mat) 0 1 2 0 1.0 0.0 0.0 diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 90946b8d9b5f4..ef10958ac1153 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3955,7 +3955,7 @@ def to_csv( >>> df = pd.DataFrame({{'name': ['Raphael', 'Donatello'], ... 'mask': ['red', 'purple'], ... 'weapon': ['sai', 'bo staff']}}) - >>> df.to_csv('out.csv', index=False) # doctest: +SKIP + >>> df.to_csv('out.csv', index=False) # doctest: +SKIP Create 'out.zip' containing 'out.csv' @@ -8972,7 +8972,7 @@ def clip( Clips using specific lower and upper thresholds per column: - >>> df.clip([-2, -1], [4,5]) + >>> df.clip([-2, -1], [4, 5]) col_0 col_1 0 4 -1 1 -2 -1 diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 204083ac6c04e..c4cc30f9631ea 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -470,10 +470,9 @@ def _aggregate_named(self, func, *args, **kwargs): __examples_series_doc = dedent( """ - >>> ser = pd.Series( - ... [390.0, 350.0, 30.0, 20.0], - ... index=["Falcon", "Falcon", "Parrot", "Parrot"], - ... name="Max Speed") + >>> ser = pd.Series([390.0, 350.0, 30.0, 20.0], + ... index=["Falcon", "Falcon", "Parrot", "Parrot"], + ... name="Max Speed") >>> grouped = ser.groupby([1, 1, 2, 2]) >>> grouped.transform(lambda x: (x - x.mean()) / x.std()) Falcon 0.707107 @@ -1331,14 +1330,10 @@ class DataFrameGroupBy(GroupBy[DataFrame]): """ Examples -------- - >>> df = pd.DataFrame( - ... { - ... "A": [1, 1, 2, 2], + >>> data = {"A": [1, 1, 2, 2], ... "B": [1, 2, 3, 4], - ... "C": [0.362838, 0.227877, 1.267767, -0.562860], - ... } - ... ) - + ... "C": [0.362838, 0.227877, 1.267767, -0.562860]} + >>> df = pd.DataFrame(data) >>> df A B C 0 1 1 0.362838 @@ -1393,7 +1388,8 @@ class DataFrameGroupBy(GroupBy[DataFrame]): >>> df.groupby("A").agg( ... b_min=pd.NamedAgg(column="B", aggfunc="min"), - ... c_sum=pd.NamedAgg(column="C", aggfunc="sum")) + ... c_sum=pd.NamedAgg(column="C", aggfunc="sum") + ... ) b_min c_sum A 1 1 0.590715 @@ -2154,7 +2150,7 @@ def idxmax( >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], ... 'co2_emissions': [37.2, 19.66, 1712]}, - ... index=['Pork', 'Wheat Products', 'Beef']) + ... index=['Pork', 'Wheat Products', 'Beef']) >>> df consumption co2_emissions @@ -2236,7 +2232,7 @@ def idxmin( >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], ... 'co2_emissions': [37.2, 19.66, 1712]}, - ... index=['Pork', 'Wheat Products', 'Beef']) + ... index=['Pork', 'Wheat Products', 'Beef']) >>> df consumption co2_emissions @@ -2319,9 +2315,9 @@ def value_counts( Examples -------- >>> df = pd.DataFrame({ - ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], - ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], - ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] + ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], + ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], + ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] ... }) >>> df diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 7d284db4eba2c..e51983f0aabb7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -232,8 +232,8 @@ class providing the base-class of operations. """, "dataframe_examples": """ >>> df = pd.DataFrame({'A': 'a a b'.split(), - ... 'B': [1,2,3], - ... 'C': [4,6,5]}) + ... 'B': [1, 2, 3], + ... 'C': [4, 6, 5]}) >>> g1 = df.groupby('A', group_keys=False) >>> g2 = df.groupby('A', group_keys=True) @@ -313,7 +313,7 @@ class providing the base-class of operations. The resulting dtype will reflect the return value of the passed ``func``. - >>> g1.apply(lambda x: x*2 if x.name == 'a' else x/2) + >>> g1.apply(lambda x: x * 2 if x.name == 'a' else x / 2) a 0.0 a 2.0 b 1.0 @@ -322,7 +322,7 @@ class providing the base-class of operations. In the above, the groups are not part of the index. We can have them included by using ``g2`` where ``group_keys=True``: - >>> g2.apply(lambda x: x*2 if x.name == 'a' else x/2) + >>> g2.apply(lambda x: x * 2 if x.name == 'a' else x / 2) a a 0.0 a 2.0 b b 1.0 @@ -421,14 +421,18 @@ class providing the base-class of operations. functions that expect Series, DataFrames, GroupBy or Resampler objects. Instead of writing ->>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c) # doctest: +SKIP +>>> h = lambda x, arg2, arg3: x + 1 - arg2 * arg3 +>>> g = lambda x, arg1: x * 5 / arg1 +>>> f = lambda x: x ** 4 +>>> df = pd.DataFrame([["a", 4], ["b", 5]], columns=["group", "value"]) +>>> h(g(f(df.groupby('group')), arg1=1), arg2=2, arg3=3) # doctest: +SKIP You can write >>> (df.groupby('group') ... .pipe(f) -... .pipe(g, arg1=a) -... .pipe(h, arg2=b, arg3=c)) # doctest: +SKIP +... .pipe(g, arg1=1) +... .pipe(h, arg2=2, arg3=3)) # doctest: +SKIP which is much more readable. diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 86693f241ddb1..46343b84afb43 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -862,7 +862,8 @@ def levels(self) -> FrozenList: Examples -------- >>> index = pd.MultiIndex.from_product([['mammal'], - ... ('goat', 'human', 'cat', 'dog')], names=['Category', 'Animals']) + ... ('goat', 'human', 'cat', 'dog')], + ... names=['Category', 'Animals']) >>> leg_num = pd.DataFrame(data=(4, 2, 4, 4), index=index, columns=['Legs']) >>> leg_num Legs diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 7e9aa2d8a9e3c..d54f6d31f6144 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -859,7 +859,7 @@ def fillna(self, method, limit: int | None = None): Missing values present before the upsampling are not affected. >>> sm = pd.Series([1, None, 3], - ... index=pd.date_range('20180101', periods=3, freq='h')) + ... index=pd.date_range('20180101', periods=3, freq='h')) >>> sm 2018-01-01 00:00:00 1.0 2018-01-01 01:00:00 NaN @@ -1028,13 +1028,8 @@ def interpolate( Examples -------- - >>> import datetime as dt - >>> timesteps = [ - ... dt.datetime(2023, 3, 1, 7, 0, 0), - ... dt.datetime(2023, 3, 1, 7, 0, 1), - ... dt.datetime(2023, 3, 1, 7, 0, 2), - ... dt.datetime(2023, 3, 1, 7, 0, 3), - ... dt.datetime(2023, 3, 1, 7, 0, 4)] + >>> start = "2023-03-01T07:00:00" + >>> timesteps = pd.date_range(start, periods=5, freq="s") >>> series = pd.Series(data=[1, -1, 2, 1, 3], index=timesteps) >>> series 2023-03-01 07:00:00 1 @@ -1042,7 +1037,7 @@ def interpolate( 2023-03-01 07:00:02 2 2023-03-01 07:00:03 1 2023-03-01 07:00:04 3 - dtype: int64 + Freq: s, dtype: int64 Upsample the dataframe to 0.5Hz by providing the period time of 2s. diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index fdb8ef1cc5dad..25f7e7e9f832b 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -797,7 +797,7 @@ ... 'B': ['a', 'b', 'c', 'd', 'e'], ... 'C': ['f', 'g', 'h', 'i', 'j']}}) - >>> df.replace(to_replace='^[a-g]', value = 'e', regex=True) + >>> df.replace(to_replace='^[a-g]', value='e', regex=True) A B C 0 0 e e 1 1 e e @@ -808,7 +808,7 @@ If ``value`` is not ``None`` and `to_replace` is a dictionary, the dictionary keys will be the DataFrame columns that the replacement will be applied. - >>> df.replace(to_replace={{'B': '^[a-c]', 'C': '^[h-j]'}}, value = 'e', regex=True) + >>> df.replace(to_replace={{'B': '^[a-c]', 'C': '^[h-j]'}}, value='e', regex=True) A B C 0 0 e f 1 1 e g diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index f90863a8ea1ef..f268d36d7fdc4 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -2439,14 +2439,14 @@ def var( create_section_header("Examples"), dedent( """\ - >>> ser = pd.Series([1, 5, 2, 7, 12, 6]) + >>> ser = pd.Series([1, 5, 2, 7, 15, 6]) >>> ser.rolling(3).skew().round(6) 0 NaN 1 NaN 2 1.293343 3 -0.585583 - 4 0.000000 - 5 1.545393 + 4 0.670284 + 5 1.652317 dtype: float64 """ ), @@ -2794,12 +2794,12 @@ def cov( >>> v1 = [3, 3, 3, 5, 8] >>> v2 = [3, 4, 4, 4, 8] - >>> # numpy returns a 2X2 array, the correlation coefficient - >>> # is the number at entry [0][1] - >>> print(f"{{np.corrcoef(v1[:-1], v2[:-1])[0][1]:.6f}}") - 0.333333 - >>> print(f"{{np.corrcoef(v1[1:], v2[1:])[0][1]:.6f}}") - 0.916949 + >>> np.corrcoef(v1[:-1], v2[:-1]) + array([[1. , 0.33333333], + [0.33333333, 1. ]]) + >>> np.corrcoef(v1[1:], v2[1:]) + array([[1. , 0.9169493], + [0.9169493, 1. ]]) >>> s1 = pd.Series(v1) >>> s2 = pd.Series(v2) >>> s1.rolling(4).corr(s2) @@ -2813,15 +2813,18 @@ def cov( The below example shows a similar rolling calculation on a DataFrame using the pairwise option. - >>> matrix = np.array([[51., 35.], [49., 30.], [47., 32.],\ - [46., 31.], [50., 36.]]) - >>> print(np.corrcoef(matrix[:-1,0], matrix[:-1,1]).round(7)) - [[1. 0.6263001] - [0.6263001 1. ]] - >>> print(np.corrcoef(matrix[1:,0], matrix[1:,1]).round(7)) - [[1. 0.5553681] - [0.5553681 1. ]] - >>> df = pd.DataFrame(matrix, columns=['X','Y']) + >>> matrix = np.array([[51., 35.], + ... [49., 30.], + ... [47., 32.], + ... [46., 31.], + ... [50., 36.]]) + >>> np.corrcoef(matrix[:-1, 0], matrix[:-1, 1]) + array([[1. , 0.6263001], + [0.6263001, 1. ]]) + >>> np.corrcoef(matrix[1:, 0], matrix[1:, 1]) + array([[1. , 0.55536811], + [0.55536811, 1. ]]) + >>> df = pd.DataFrame(matrix, columns=['X', 'Y']) >>> df X Y 0 51.0 35.0 diff --git a/pandas/io/sql.py b/pandas/io/sql.py index d4b6602d9f0eb..a83c2bf241450 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -680,7 +680,7 @@ def read_sql( pandas now supports reading via ADBC drivers - >>> from adbc_driver_postgresql import dbapi + >>> from adbc_driver_postgresql import dbapi # doctest:+SKIP >>> with dbapi.connect('postgres:///db_name') as conn: # doctest:+SKIP ... pd.read_sql('SELECT int_column FROM test_data', conn) int_column diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index a017787f2dc2d..bd04e812e0be1 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -241,10 +241,10 @@ def hist_frame( .. plot:: :context: close-figs - >>> df = pd.DataFrame({ - ... 'length': [1.5, 0.5, 1.2, 0.9, 3], - ... 'width': [0.7, 0.2, 0.15, 0.2, 1.1] - ... }, index=['pig', 'rabbit', 'duck', 'chicken', 'horse']) + >>> data = {'length': [1.5, 0.5, 1.2, 0.9, 3], + ... 'width': [0.7, 0.2, 0.15, 0.2, 1.1]} + >>> index = ['pig', 'rabbit', 'duck', 'chicken', 'horse'] + >>> df = pd.DataFrame(data, index=index) >>> hist = df.hist(bins=3) """ plot_backend = _get_plot_backend(backend) @@ -607,10 +607,10 @@ def boxplot_frame_groupby( >>> import itertools >>> tuples = [t for t in itertools.product(range(1000), range(4))] >>> index = pd.MultiIndex.from_tuples(tuples, names=['lvl0', 'lvl1']) - >>> data = np.random.randn(len(index),4) + >>> data = np.random.randn(len(index), 4) >>> df = pd.DataFrame(data, columns=list('ABCD'), index=index) >>> grouped = df.groupby(level='lvl1') - >>> grouped.boxplot(rot=45, fontsize=12, figsize=(8,10)) # doctest: +SKIP + >>> grouped.boxplot(rot=45, fontsize=12, figsize=(8, 10)) # doctest: +SKIP The ``subplots=False`` option shows the boxplots in a single figure. @@ -1400,9 +1400,7 @@ def hist( .. plot:: :context: close-figs - >>> df = pd.DataFrame( - ... np.random.randint(1, 7, 6000), - ... columns = ['one']) + >>> df = pd.DataFrame(np.random.randint(1, 7, 6000), columns=['one']) >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000) >>> ax = df.plot.hist(bins=12, alpha=0.5) diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 5e5a55b4d0a98..18db460d388a4 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -439,7 +439,7 @@ def bootstrap_plot( :context: close-figs >>> s = pd.Series(np.random.uniform(size=100)) - >>> pd.plotting.bootstrap_plot(s) + >>> pd.plotting.bootstrap_plot(s) # doctest: +SKIP
""" plot_backend = _get_plot_backend("matplotlib") diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 0a6a852bb0f85..98b55f8d690cf 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -228,11 +228,12 @@ def validate_pep8(self): file.name, ] response = subprocess.run(cmd, capture_output=True, check=False, text=True) - stdout = response.stdout - stdout = stdout.replace(file.name, "") - messages = stdout.strip("\n").splitlines() - if messages: - error_messages.extend(messages) + for output in ("stdout", "stderr"): + out = getattr(response, output) + out = out.replace(file.name, "") + messages = out.strip("\n").splitlines() + if messages: + error_messages.extend(messages) finally: file.close() os.unlink(file.name)