diff --git a/docs/user-guide/advanced/Pandas_API.ipynb b/docs/user-guide/advanced/Pandas_API.ipynb index 239c4c8..56c8d38 100644 --- a/docs/user-guide/advanced/Pandas_API.ipynb +++ b/docs/user-guide/advanced/Pandas_API.ipynb @@ -646,6 +646,110 @@ "tab.mode(dropna=False)" ] }, + { + "cell_type": "markdown", + "id": "f5c66579", + "metadata": {}, + "source": [ + "### Table.std()\n", + "\n", + "```\n", + "Table.std(axis=0, ddof=1, numeric_only=False)\n", + "```\n", + "\n", + "Return the sample standard deviation over the requested axis. Normalized by N-1 by default. This can be changed using the ddof argument.\n", + "\n", + "\n", + "**Parameters:**\n", + "\n", + "| Name | Type | Description | Default |\n", + "| :----------: | :--: | :------------------------------------------------------------------------------- | :-----: |\n", + "| axis | int | The axis to calculate the standard deviation across: 0 is columns, 1 is rows. | 0 |\n", + "| skipna | bool | Not yet implemented. | True |\n", + "| numeric_only | bool | Only use columns of the table that are of a numeric data type. | False |\n", + "| ddof | int | Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N represents the number of elements. | 1 |\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :----------------: | :------------------------------------------------------------------- |\n", + "| Table | The std across each row / column with the key corresponding to the row number or column name. 
|" + ] + }, + { + "cell_type": "markdown", + "id": "c2767afd", + "metadata": {}, + "source": [ + "**Examples:**\n", + "\n", + "Calculate the std across the columns of a table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87b94fd0", + "metadata": {}, + "outputs": [], + "source": [ + "tab = kx.Table(data=\n", + " {\n", + " 'a': [1, 2, 2, 4],\n", + " 'b': [1, 2, 6, 7],\n", + " 'c': [7, 8, 9, 10],\n", + " 'd': [7, 11, 14, 14]\n", + " }\n", + ")\n", + "tab" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e54d557", + "metadata": {}, + "outputs": [], + "source": [ + "tab.std()" + ] + }, + { + "cell_type": "markdown", + "id": "14950833", + "metadata": {}, + "source": [ + "Calculate the std across the rows of a table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f19161ed", + "metadata": {}, + "outputs": [], + "source": [ + "tab.std(axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "a8ea5a38", + "metadata": {}, + "source": [ + "Calculate the std across columns with ddof=0:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6361dcb7", + "metadata": {}, + "outputs": [], + "source": [ + "tab.std(ddof=0)" + ] + }, { "cell_type": "markdown", "id": "7e2813b4", @@ -1813,6 +1917,136 @@ "df.astype({'c4':kx.SymbolVector, 'c5':kx.SymbolVector})" ] }, + { + "cell_type": "markdown", + "id": "0f8813a0", + "metadata": {}, + "source": [ + "### Table.add_prefix()\n", + "\n", + "```\n", + "Table.add_prefix(prefix, axis=0)\n", + "```\n", + "\n", + "Rename the columns of a table by adding a prefix and return the resulting Table object.\n", + "\n", + "**Parameters:**\n", + "\n", + "| Name | Type | Description | Default |\n", + "| :-----: | :-------------: | :------------------------------------------------------------------ | :--------: |\n", + "| prefix | str | The string that will be concatenated with the name of the columns | _required_ |\n", + "| axis | int | Axis to add prefix on. Only `1` (columns) is currently implemented. 
| 0 |\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :---: | :----------------------------------------------------------------- |\n", + "| Table | A table with the given column(s) renamed by adding a prefix. |" + ] + }, + { + "cell_type": "markdown", + "id": "9186ed86", + "metadata": {}, + "source": [ + "**Examples:**\n", + "\n", + "The initial table whose columns will be given a prefix:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f20131b", + "metadata": {}, + "outputs": [], + "source": [ + "tab.head()" + ] + }, + { + "cell_type": "markdown", + "id": "73c2b08f", + "metadata": {}, + "source": [ + "Add the prefix \"col_\" to the table columns:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "926c8295", + "metadata": {}, + "outputs": [], + "source": [ + "tab.add_prefix(prefix=\"col_\", axis=1).head()" + ] + }, + { + "cell_type": "markdown", + "id": "0a4abc8c", + "metadata": {}, + "source": [ + "### Table.add_suffix()\n", + "\n", + "```\n", + "Table.add_suffix(suffix, axis=0)\n", + "```\n", + "\n", + "Rename the columns of a table by adding a suffix and return the resulting Table object.\n", + "\n", + "**Parameters:**\n", + "\n", + "| Name | Type | Description | Default |\n", + "| :-----: | :-------------: | :------------------------------------------------------------------ | :--------: |\n", + "| suffix | str | The string that will be concatenated with the name of the columns | _required_ |\n", + "| axis | int | Axis to add suffix on. Only `1` (columns) is currently implemented. | 0 |\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :---: | :----------------------------------------------------------------- |\n", + "| Table | A table with the given column(s) renamed by adding a suffix. 
|" + ] + }, + { + "cell_type": "markdown", + "id": "c22262b8", + "metadata": {}, + "source": [ + "**Examples:**\n", + "\n", + "The initial table whose columns will be given a suffix:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55c1f504", + "metadata": {}, + "outputs": [], + "source": [ + "tab.head()" + ] + }, + { + "cell_type": "markdown", + "id": "b4687851", + "metadata": {}, + "source": [ + "Add the suffix \"_col\" to the table columns:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e00d0f5c", + "metadata": {}, + "outputs": [], + "source": [ + "tab.add_suffix(suffix=\"_col\", axis=1).head()" + ] + }, { "cell_type": "markdown", "id": "718584f8", @@ -2507,6 +2741,82 @@ "tab.prod(numeric_only=True)" ] }, + { + "cell_type": "markdown", + "id": "c87d4f95", + "metadata": {}, + "source": [ + "### Table.count()\n", + "\n", + "```\n", + "Table.count(axis=0, numeric_only=False)\n", + "```\n", + "\n", + "Returns the count of non-null values across the given axis.\n", + "\n", + "**Parameters:**\n", + "\n", + "| Name | Type | Description | Default |\n", + "| :----------: | :--: | :------------------------------------------------------------------------------- | :-----: |\n", + "| axis | int | The axis to count elements across: 0 is columns, 1 is rows. | 0 |\n", + "| numeric_only | bool | Only use columns of the table that are of a numeric data type. | False |\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :----------------: | :------------------------------------------------------------------- |\n", + "| Dictionary | A dictionary where the keys represent the column name / row number and the values are the result of calling `count` on that column / row. 
|" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6520c195", + "metadata": {}, + "outputs": [], + "source": [ + "tab.count()" + ] + }, + { + "cell_type": "markdown", + "id": "ce85797d", + "metadata": {}, + "source": [ + "### Table.skew()\n", + "\n", + "```\n", + "Table.skew(axis=0, skipna=True, numeric_only=False)\n", + "```\n", + "\n", + "Returns the skewness of all values across the given axis.\n", + "\n", + "**Parameters:**\n", + "\n", + "| Name | Type | Description | Default |\n", + "| :----------: | :--: | :------------------------------------------------------------------------------- | :-----: |\n", + "| axis | int | The axis to calculate the skewness across: 0 is columns, 1 is rows. | 0 |\n", + "| skipna | bool | Ignore any null values along the axis. | True |\n", + "| numeric_only | bool | Only use columns of the table that are of a numeric data type. | False |\n", + "\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :----------------: | :------------------------------------------------------------------- |\n", + "| Dictionary | A dictionary where the keys represent the column name / row number and the values are the result of calling `skew` on that column / row. 
|"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3fb5dce1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tab.skew(numeric_only=True)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "499025cb",
diff --git a/src/pykx/pandas_api/pandas_indexing.py b/src/pykx/pandas_api/pandas_indexing.py
index 954896d..e3dfeb0 100644
--- a/src/pykx/pandas_api/pandas_indexing.py
+++ b/src/pykx/pandas_api/pandas_indexing.py
@@ -454,6 +454,60 @@ def rename(self, labels=None, index=None, columns=None, axis=0,
 
         return t
 
+    def add_suffix(self, suffix, axis=0):
+        """Append `suffix` to every column name and return the renamed table.
+
+        For a keyed table only the value columns are renamed.
+
+        Parameters:
+            suffix: Text appended to each column name.
+            axis: Axis to rename along; only `1` is implemented.
+
+        Returns:
+            The table with its columns renamed.
+
+        Raises:
+            ValueError: `axis` is `0` (raised as 'nyi') or not `0`/`1`.
+        """
+        t = self
+        if axis == 1:
+            t = q('''{[s;t]
+                     c:$[99h~type t;cols value@;cols] t;
+                     (c!`$string[c],\\:string s) xcol t
+                     }''', suffix, t)
+        elif axis == 0:
+            raise ValueError('nyi')
+        else:
+            raise ValueError(f'No axis named {axis}')
+        return t
+
+    def add_prefix(self, prefix, axis=0):
+        """Prepend `prefix` to every column name and return the renamed table.
+
+        For a keyed table only the value columns are renamed.
+
+        Parameters:
+            prefix: Text prepended to each column name.
+            axis: Axis to rename along; only `1` is implemented.
+
+        Returns:
+            The table with its columns renamed.
+
+        Raises:
+            ValueError: `axis` is `0` (raised as 'nyi') or not `0`/`1`.
+        """
+        t = self
+        if axis == 1:
+            t = q('''{[s;t]
+                     c:$[99h~type t;cols value@;cols] t;
+                     (c!`$string[s],/:string[c]) xcol t
+                     }''', prefix, t)
+        elif axis == 0:
+            raise ValueError('nyi')
+        else:
+            raise ValueError(f'No axis named {axis}')
+        return t
+
     def sample(self, n=None, frac=None, replace=False, weights=None,
                random_state=None, axis=None, ignore_index=False):
         if n is None and frac is None:
diff --git a/src/pykx/pandas_api/pandas_meta.py b/src/pykx/pandas_api/pandas_meta.py
index 91cc147..4ee270a 100644
--- a/src/pykx/pandas_api/pandas_meta.py
+++ b/src/pykx/pandas_api/pandas_meta.py
@@ -154,6 +154,42 @@ def mean(self, axis: int = 0, numeric_only: bool = False):
             tab
         )
 
+    @api_return
+    def std(self, axis: int = 0, ddof: int = 1, numeric_only: bool = False):
+        """Return the standard deviation along `axis`, normalised by N - ddof.
+
+        Parameters:
+            axis: `0` reduces each column, `1` reduces each row.
+            ddof: Delta degrees of freedom; the divisor is N - ddof, where N
+                is the number of elements along the reduced axis.
+            numeric_only: Only use the numeric columns of the table.
+
+        Returns:
+            A q dictionary (subject to `api_return` post-processing) keyed by
+            column name for `axis=0` or by row number, as a symbol, for
+            `axis=1`. An entry is null when `ddof` is not smaller than N.
+        """
+        tab = self
+        if 'Keyed' in str(type(tab)):
+            tab = q.value(tab)
+        if numeric_only:
+            tab = _get_numeric_only_subtable(tab)
+
+        axis_keys = q('{[axis;tab] $[0~axis;cols;`$string til count @] tab}', axis, tab)
+
+        # The null guard lives in q and is applied per reduced vector so that N
+        # is the row count for axis=0 and the column count for axis=1. Comparing
+        # ddof with len(tab) here would use the row count for both axes.
+        return q(
+            '''{[tab;axis;ddof;axis_keys]
+                tab:$[0~axis;(::);flip] value flip tab;
+                d:$[0~ddof;dev;
+                    1~ddof;sdev;
+                    {[ddof;x] avg sqrt (sum xexp[x-avg x;2]) % count[x]-ddof}ddof];
+                axis_keys!{[d;ddof;x] $[ddof>=count x;0n;d x]}[d;ddof] each tab
+            }''', tab, axis, ddof, axis_keys
+        )
+
     @api_return
     def median(self, axis: int = 0, numeric_only: bool = False):
         tab = self
@@ -245,6 +281,26 @@ def prod(self, axis=0, skipna=True, numeric_only=False, min_count=0):
             min_count
         ), cols)
 
+    @convert_result
+    def skew(self, axis=0, skipna=True, numeric_only=False):
+        """Return the skewness along `axis`.
+
+        With m_k the k-th central moment of a vector of length n, each result
+        is `(m_3 / m_2^1.5) * sqrt(n * (n - 1)) / (n - 2)`.
+
+        Parameters:
+            axis: Axis to reduce along, forwarded to `preparse_computations`.
+            skipna: Forwarded to `preparse_computations`.
+            numeric_only: Forwarded to `preparse_computations`.
+        """
+        res, cols = preparse_computations(self, axis, skipna, numeric_only)
+        return (q(
+            '''{[row]
+                m:{(sum (x - avg x) xexp y) % count x};
+                g1:{[m;x]m:m[x]; m[3] % m[2] xexp 3%2}[m];
+                (g1 each row) * {sqrt[n * n-1] % neg[2] + n:count x} each row
+                }''', res), cols)
+
     @convert_result
     def sum(self, axis=0, skipna=True, numeric_only=False, min_count=0):
         res, cols = preparse_computations(self, axis, skipna, numeric_only)
@@ -311,3 +367,17 @@ def agg(self, func, axis=0, *args, **kwargs): # noqa: C901
             return data
         else:
             return (q('{(flip enlist[`function]!enlist x)!y}', keyname, data))
+
+    @convert_result
+    def count(self, axis=0, numeric_only=False):
+        """Count the entries along `axis`.
+
+        `preparse_computations` is called with `skipna=True`, so (presumably)
+        null entries are excluded before counting.
+
+        Parameters:
+            axis: Axis to reduce along, forwarded to `preparse_computations`.
+            numeric_only: Forwarded to `preparse_computations`.
+        """
+        res, cols = preparse_computations(self, axis, True, numeric_only)
+        return (q('count each', res), cols)
diff --git a/tests/test_pandas_api.py b/tests/test_pandas_api.py
index d1247ea..4ea8076 100644
--- a/tests/test_pandas_api.py
+++ b/tests/test_pandas_api.py
@@ -2038,3 +2038,172 @@ def test_keyed_loc_fixes(q):
         mkt[['k1', 'y']]
     with pytest.raises(KeyError):
         mkt['k1']
+
+
+def test_pandas_count(q):
+    """Table.count skips nulls on both axes and honours numeric_only."""
+    tab = q('([] k1: 0n 2 0n 2 0n ; k2: (`a;`;`b;`;`c))')
+    df = tab.pd()
+
+    qcount = tab.count(axis=1).py()
+    pcount = df.count(axis=1)
+
+    assert int(qcount[0]) == int(pcount[0])
+    assert int(qcount[1]) == 1
+
+    qcount = tab.count().py()
+    pcount = df.count()
+
+    assert int(qcount["k1"]) == int(pcount["k1"])
+    assert int(qcount["k2"]) == 3
+
+    qcount = tab.count(numeric_only=True).py()
+    pcount = df.count(numeric_only=True)
+
+    assert int(qcount["k1"]) == int(pcount["k1"])
+
+
+def test_df_add_prefix(kx, q):
+    """add_prefix renames columns for axis=1 and rejects every other axis."""
+    t = q('([] til 5; 5?5; 5?1f; (5;5)#100?" ")')
+
+    q_add_prefix = t.add_prefix("col_", axis=1)
+
+    assert(q('~', q_add_prefix, t.pd().add_prefix("col_", axis=1)))
+
+    kt = kx.q('([idx:til 5] til 5; 5?5; 5?1f; (5;5)#100?" ")')
+
+    q_add_prefix = kt.add_prefix("col_", axis=1)
+    assert(q('~', q_add_prefix, kt.pd().add_prefix("col_", axis=1)))
+
+    with pytest.raises(ValueError) as err:
+        t.add_prefix("col_", axis=0)
+    assert 'nyi' in str(err.value)
+
+    with pytest.raises(ValueError) as err:
+        t.add_prefix("col_", axis=3)
+    assert 'No axis named 3' in str(err.value)
+
+
+def test_df_add_suffix(kx, q):
+    """add_suffix renames columns for axis=1 and rejects every other axis."""
+    t = q('([] til 5; 5?5; 5?1f; (5;5)#100?" ")')
+
+    q_add_suffix = t.add_suffix("_col", axis=1)
+
+    assert(q('~', q_add_suffix, t.pd().add_suffix("_col", axis=1)))
+
+    kt = kx.q('([idx:til 5] til 5; 5?5; 5?1f; (5;5)#100?" ")')
+
+    q_add_suffix = kt.add_suffix("_col", axis=1)
+    assert(q('~', q_add_suffix, kt.pd().add_suffix("_col", axis=1)))
+
+    with pytest.raises(ValueError) as err:
+        t.add_suffix("_col", axis=0)
+    assert 'nyi' in str(err.value)
+
+    with pytest.raises(ValueError) as err:
+        t.add_suffix("_col", axis=3)
+    assert 'No axis named 3' in str(err.value)
+
+
+def test_pandas_skew(q):
+    """Table.skew matches pandas to six decimal places on both axes."""
+    tab = q('([] price: 250.0f - 100?500.0f; ints: 100 - 100?200)')
+    df = tab.pd()
+    qskew = tab.skew().py()
+    pskew = df.skew()
+    assert round(float(qskew['price']), 6) == round(float(pskew['price']), 6)
+    assert round(float(qskew['ints']), 6) == round(float(pskew['ints']), 6)
+
+    tab = q('^', q('([]sym:100?`foo`bar`baz`qux)'), tab)
+    df = tab.pd()
+    qskew = tab.skew(numeric_only=True).py()
+    pskew = df.skew(numeric_only=True)
+    assert round(float(qskew['price']), 6) == round(float(pskew['price']), 6)
+    assert round(float(qskew['ints']), 6) == round(float(pskew['ints']), 6)
+
+    tab = q('^', q('([]foo:(5#0n),95?500.0f)'), tab)
+    df = tab.pd()
+    qskew = tab.skew(numeric_only=True, skipna=True).py()
+    pskew = df.skew(numeric_only=True, skipna=True)
+    assert round(float(qskew['foo']), 6) == round(float(pskew['foo']), 6)
+
+    tab = q('_', 5, tab)  # discard rows with null "foo"s
+    df = tab.pd()
+    qskew = tab.skew(numeric_only=True, axis=1).py()
+    pskew = df.skew(numeric_only=True, axis=1)
+    for r in range(len(qskew)):
+        assert round(float(qskew[r]), 6) == round(float(pskew[r]), 6)
+
+
+def test_std(kx, q):
+    """Table.std matches pandas across axes, ddof values and keyed tables."""
+    df = pd.DataFrame(
+        {
+            'a': [1, 2, 2, 4],
+            'b': [1, 2, 6, 7],
+            'c': [7, 8, 9, 10],
+            'd': [7, 11, 14, 14]
+        }
+    )
+    tab = kx.toq(df)
+    p_m = df.std()
+    q_m = tab.std()
+    for c in q.key(q_m).py():
+        assert p_m[c] == q_m[c].py()
+    p_m = df.std(axis=1)
+    q_m = tab.std(axis=1)
+    for c in range(len(df)):
+        assert p_m[c] == q_m[q('{`$string x}', c)].py()
+    p_m = df.std(ddof=0)
+    q_m = tab.std(ddof=0)
+    for c in q.key(q_m).py():
+        assert p_m[c] == q_m[c].py()
+
+    p_m = df.std(ddof=4)
+    q_m = tab.std(ddof=4)
+    for c in q.key(q_m).py():
+        assert np.isnan(p_m[c]) == np.isnan(q_m[c].py())
+
+    # Regression: with 3 rows x 4 columns, ddof=3 on axis=1 leaves N - ddof = 1
+    wide_df = pd.DataFrame({'a': [1, 2, 3], 'b': [2, 4, 7], 'c': [3, 6, 9], 'd': [5, 8, 13]})
+    wide_tab = kx.toq(wide_df)
+    p_m = wide_df.std(axis=1, ddof=3)
+    q_m = wide_tab.std(axis=1, ddof=3)
+    for r in range(len(wide_df)):
+        assert np.isclose(p_m[r], q_m[q('{`$string x}', r)].py())
+
+    q['tab'] = kx.toq(df)
+    tab = q('1!`idx xcols update idx: til count tab from tab')
+    p_m = df.std()
+    q_m = tab.std()
+    for c in q.key(q_m).py():
+        assert p_m[c] == q_m[c].py()
+    p_m = df.std(axis=1)
+    q_m = tab.std(axis=1)
+    for c in range(len(df)):
+        assert p_m[c] == q_m[q('{`$string x}', c)].py()
+
+    df = pd.DataFrame(
+        {
+            'a': [1, 2, 2, 4],
+            'b': [1, 2, 6, 7],
+            'c': [7, 8, 9, 10],
+            'd': ['foo', 'bar', 'baz', 'qux']
+        }
+    )
+    tab = kx.toq(df)
+    p_m = df.std(numeric_only=True)
+    q_m = tab.std(numeric_only=True)
+    for c in q.key(q_m).py():
+        assert p_m[c] == q_m[c].py()
+    p_m = df.std(axis=1, numeric_only=True)
+    q_m = tab.std(axis=1, numeric_only=True)
+    for c in range(len(df)):
+        assert p_m[c] == q_m[q('{`$string x}', c)].py()
+
+    with pytest.raises(kx.QError):
+        q_m = tab.std()
+    with pytest.raises(kx.QError):
+        q_m = tab.std(axis=1)