From 34ee237f36102cddc19f32d9f12f22a54f73ad87 Mon Sep 17 00:00:00 2001 From: chraberturas Date: Tue, 12 Dec 2023 12:03:25 +0100 Subject: [PATCH 1/7] Added implementation of nunique function --- docs/user-guide/advanced/Pandas_API.ipynb | 40 +++++++++++++++++++++++ src/pykx/pandas_api/pandas_meta.py | 9 +++++ tests/test_pandas_api.py | 37 +++++++++++++++++++++ 3 files changed, 86 insertions(+) diff --git a/docs/user-guide/advanced/Pandas_API.ipynb b/docs/user-guide/advanced/Pandas_API.ipynb index cb98590..cb2385c 100644 --- a/docs/user-guide/advanced/Pandas_API.ipynb +++ b/docs/user-guide/advanced/Pandas_API.ipynb @@ -2507,6 +2507,46 @@ "tab.prod(numeric_only=True)" ] }, + { + "cell_type": "markdown", + "source": [ + "### Table.nunique()\n", + "```\n", + "Table.nunique(axis=0, dropna=True)\n", + "```\n", + "\n", + "Returns the number of unique elements across the given axis.\n", + "\n", + "**Parameters:**\n", + "\n", + "| Name | Type | Description | Default |\n", + "| :----------: | :--: | :------------------------------------------------------------------------------- | :-----: |\n", + "| axis | int | The axis to calculate the sum across 0 is columns, 1 is rows. | 0 |\n", + "| dropna | bool | Don’t include NaN in the counts. | True |\n", + "\n", + "**Returns:**\n", + "\n", + " | Type | Description |\n", + " | :----------------: | :------------------------------------------------------------------- |\n", + " | Dictionary | A dictionary where the keys represent the column name / row number and the values are the result of calling `nunique` on that column / row. 
|" + ], + "metadata": { + "collapsed": false + }, + "id": "7b39a07bd7cd0af7" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "tab.nunique()" + ], + "metadata": { + "collapsed": false + }, + "id": "f5592b19b69ad46d" + }, { "cell_type": "markdown", "id": "655c3ad2", diff --git a/src/pykx/pandas_api/pandas_meta.py b/src/pykx/pandas_api/pandas_meta.py index d0e44ec..a164230 100644 --- a/src/pykx/pandas_api/pandas_meta.py +++ b/src/pykx/pandas_api/pandas_meta.py @@ -255,3 +255,12 @@ def sum(self, axis=0, skipna=True, numeric_only=False, min_count=0): res, min_count ), cols) + + @convert_result + def nunique(self, axis=0, dropna=True): + res, cols = preparse_computations(self, axis, skipna=False) + filterNan = q('{$[11h = type x;x;' + '0h = type x;(x where not null x except w),(w:x where 10h=type each x);' + 'x where not null x]}each') + res = filterNan(res) if dropna else res + return q('(\'[count;distinct]\')', res), cols diff --git a/tests/test_pandas_api.py b/tests/test_pandas_api.py index acfe55f..ddb0873 100644 --- a/tests/test_pandas_api.py +++ b/tests/test_pandas_api.py @@ -2029,3 +2029,40 @@ def test_keyed_loc_fixes(q): mkt[['k1', 'y']] with pytest.raises(KeyError): mkt['k1'] + + +def test_nunique(kx, q): + df = pd.DataFrame( + { + 'a': [1, 2, 2, 4], + 'b': [1, 2, 6, 7], + 'c': [7, 8, 9, 10], + 'd': ['foo', 'baz', 'baz', 'qux'] + } + ) + tab = kx.toq(df) + p_m = df.nunique() + q_m = tab.nunique() + for c in q.key(q_m).py(): + assert p_m[c] == q_m[c].py() + p_m = df.nunique(axis=1) + q_m = tab.nunique(axis=1) + for c in range(len(tab)): + assert p_m[c] == q_m[c].py() + + tab = kx.q('([]A:4 0n 7 6;B:4 0n 0n 7;C:``foo`foo`)') + df = tab.pd() + p_m = df.nunique() + q_m = tab.nunique() + for c in q.key(q_m).py(): + assert p_m[c] == q_m[c].py() + p_m = df.nunique(axis=1, dropna=False) + q_m = tab.nunique(axis=1, dropna=False) + for c in range(len(tab)): + assert p_m[c] == q_m[c].py() + p_m = df.nunique(dropna=False) + q_m = 
tab.nunique(dropna=False) + for c in q.key(q_m).py(): + assert p_m[c] == q_m[c].py() + + From 217cedf05fb88d00a987758da95e6a0131f31a9e Mon Sep 17 00:00:00 2001 From: chraberturas Date: Tue, 16 Jan 2024 13:09:01 +0100 Subject: [PATCH 2/7] Added test for handling strings nulls (" "), differentiating behavior between Python and kdb+ --- tests/test_pandas_api.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_pandas_api.py b/tests/test_pandas_api.py index ddb0873..700077f 100644 --- a/tests/test_pandas_api.py +++ b/tests/test_pandas_api.py @@ -2065,4 +2065,9 @@ def test_nunique(kx, q): for c in q.key(q_m).py(): assert p_m[c] == q_m[c].py() + tab = kx.q('([]A:("";" ";"";"foo"))') + df = tab.pd() + p_m = df.nunique() + q_m = tab.nunique() + assert p_m['A'] == 1 + q_m['A'].py() From 1c5dbdaeda9461884bb65d1e3959624281989089 Mon Sep 17 00:00:00 2001 From: chraberturas Date: Tue, 16 Jan 2024 13:09:01 +0100 Subject: [PATCH 3/7] Added test for handling strings nulls (" "), differentiating behavior between Python and kdb+ --- docs/user-guide/advanced/Pandas_API.ipynb | 10 +++++----- tests/test_pandas_api.py | 5 +++++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/docs/user-guide/advanced/Pandas_API.ipynb b/docs/user-guide/advanced/Pandas_API.ipynb index cb2385c..c66259a 100644 --- a/docs/user-guide/advanced/Pandas_API.ipynb +++ b/docs/user-guide/advanced/Pandas_API.ipynb @@ -2519,10 +2519,10 @@ "\n", "**Parameters:**\n", "\n", - "| Name | Type | Description | Default |\n", - "| :----------: | :--: | :------------------------------------------------------------------------------- | :-----: |\n", - "| axis | int | The axis to calculate the sum across 0 is columns, 1 is rows. | 0 |\n", - "| dropna | bool | Don’t include NaN in the counts. 
| True |\n", + "| Name | Type | Description | Default |\n", + "| :----------: | :--: |:------------------------------------------------------------------------------------| :-----: |\n", + "| axis | int | The axis to calculate the number of unique elements across 0 is columns, 1 is rows. | 0 |\n", + "| dropna | bool | Don’t include NaN in the counts. | True |\n", "\n", "**Returns:**\n", "\n", @@ -2533,7 +2533,7 @@ "metadata": { "collapsed": false }, - "id": "7b39a07bd7cd0af7" + "id": "5bc5e813e9673a84" }, { "cell_type": "code", diff --git a/tests/test_pandas_api.py b/tests/test_pandas_api.py index ddb0873..700077f 100644 --- a/tests/test_pandas_api.py +++ b/tests/test_pandas_api.py @@ -2065,4 +2065,9 @@ def test_nunique(kx, q): for c in q.key(q_m).py(): assert p_m[c] == q_m[c].py() + tab = kx.q('([]A:("";" ";"";"foo"))') + df = tab.pd() + p_m = df.nunique() + q_m = tab.nunique() + assert p_m['A'] == 1 + q_m['A'].py() From 9fe428c3057d4d4e8d80a96bfd535271cae0bb46 Mon Sep 17 00:00:00 2001 From: chraberturas Date: Thu, 18 Jan 2024 08:39:42 +0100 Subject: [PATCH 4/7] Suggested changes. Error with mixed lists and tests for this case. 
--- src/pykx/pandas_api/pandas_meta.py | 9 ++++---- tests/test_pandas_api.py | 37 ++++++++++++------------------ 2 files changed, 20 insertions(+), 26 deletions(-) diff --git a/src/pykx/pandas_api/pandas_meta.py b/src/pykx/pandas_api/pandas_meta.py index 7bbaaec..e4698d1 100644 --- a/src/pykx/pandas_api/pandas_meta.py +++ b/src/pykx/pandas_api/pandas_meta.py @@ -260,11 +260,12 @@ def sum(self, axis=0, skipna=True, numeric_only=False, min_count=0): @convert_result def nunique(self, axis=0, dropna=True): res, cols = preparse_computations(self, axis, skipna=False) - filterNan = q('{$[11h = type x;x;' - '0h = type x;(x where not null x except w),(w:x where 10h=type each x);' - 'x where not null x]}each') + if q("any('[1<>count distinct@;type']')@", res).py(): + raise NotImplementedError("Table contains a column whose type is mixed") + filterNan = q('{$[all[10h=type each x]|11h = type x;x;' + 'x where not null x]}each') res = filterNan(res) if dropna else res - return q('(\'[count;distinct]\')', res), cols + return q("('[count;distinct]')", res), cols def agg(self, func, axis=0, *args, **kwargs): # noqa: C901 if 'KeyedTable' in str(type(self)): diff --git a/tests/test_pandas_api.py b/tests/test_pandas_api.py index 700077f..0c5f893 100644 --- a/tests/test_pandas_api.py +++ b/tests/test_pandas_api.py @@ -2032,12 +2032,22 @@ def test_keyed_loc_fixes(q): def test_nunique(kx, q): + tab = kx.q('([]a:4 0n 7 6;b:4 0n 0n 7;c:``foo`foo`)') + df = tab.pd() + p_m = df.nunique() + q_m = tab.nunique() + for c in q.key(q_m).py(): + assert p_m[c] == q_m[c].py() + p_m = df.nunique(dropna=False) + q_m = tab.nunique(dropna=False) + for c in q.key(q_m).py(): + assert p_m[c] == q_m[c].py() + df = pd.DataFrame( { 'a': [1, 2, 2, 4], 'b': [1, 2, 6, 7], 'c': [7, 8, 9, 10], - 'd': ['foo', 'baz', 'baz', 'qux'] } ) tab = kx.toq(df) @@ -2050,24 +2060,7 @@ def test_nunique(kx, q): for c in range(len(tab)): assert p_m[c] == q_m[c].py() - tab = kx.q('([]A:4 0n 7 6;B:4 0n 0n 7;C:``foo`foo`)') - df = 
tab.pd() - p_m = df.nunique() - q_m = tab.nunique() - for c in q.key(q_m).py(): - assert p_m[c] == q_m[c].py() - p_m = df.nunique(axis=1, dropna=False) - q_m = tab.nunique(axis=1, dropna=False) - for c in range(len(tab)): - assert p_m[c] == q_m[c].py() - p_m = df.nunique(dropna=False) - q_m = tab.nunique(dropna=False) - for c in q.key(q_m).py(): - assert p_m[c] == q_m[c].py() - - tab = kx.q('([]A:("";" ";"";"foo"))') - df = tab.pd() - p_m = df.nunique() - q_m = tab.nunique() - assert p_m['A'] == 1 + q_m['A'].py() - + tab = kx.q('([]a:("";" ";"";"foo"))') + with pytest.raises(NotImplementedError, + match=r"Table contains a column whose type is mixed"): + raise tab.nunique() From 0f04d8e8dee2c3bab97437b41a1e2fcc35df846f Mon Sep 17 00:00:00 2001 From: chraberturas Date: Thu, 18 Jan 2024 08:39:42 +0100 Subject: [PATCH 5/7] Suggested changes. Error with mixed lists and tests for this case. --- src/pykx/pandas_api/pandas_meta.py | 9 ++++---- tests/test_pandas_api.py | 37 ++++++++++++------------------ 2 files changed, 20 insertions(+), 26 deletions(-) diff --git a/src/pykx/pandas_api/pandas_meta.py b/src/pykx/pandas_api/pandas_meta.py index 7bbaaec..1ba34f4 100644 --- a/src/pykx/pandas_api/pandas_meta.py +++ b/src/pykx/pandas_api/pandas_meta.py @@ -260,11 +260,12 @@ def sum(self, axis=0, skipna=True, numeric_only=False, min_count=0): @convert_result def nunique(self, axis=0, dropna=True): res, cols = preparse_computations(self, axis, skipna=False) - filterNan = q('{$[11h = type x;x;' - '0h = type x;(x where not null x except w),(w:x where 10h=type each x);' - 'x where not null x]}each') + if q("any('[1<>count distinct@;type']')@", res).py(): + raise NotImplementedError("Table contains a column whose type is mixed") + filterNan = q('{$[all[10h=type each x]|11h = type x;x;' + 'x where not null x]}each') res = filterNan(res) if dropna else res - return q('(\'[count;distinct]\')', res), cols + return (q("('[count;distinct]')", res), cols) def agg(self, func, axis=0, *args, 
**kwargs): # noqa: C901 if 'KeyedTable' in str(type(self)): diff --git a/tests/test_pandas_api.py b/tests/test_pandas_api.py index 700077f..0c5f893 100644 --- a/tests/test_pandas_api.py +++ b/tests/test_pandas_api.py @@ -2032,12 +2032,22 @@ def test_keyed_loc_fixes(q): def test_nunique(kx, q): + tab = kx.q('([]a:4 0n 7 6;b:4 0n 0n 7;c:``foo`foo`)') + df = tab.pd() + p_m = df.nunique() + q_m = tab.nunique() + for c in q.key(q_m).py(): + assert p_m[c] == q_m[c].py() + p_m = df.nunique(dropna=False) + q_m = tab.nunique(dropna=False) + for c in q.key(q_m).py(): + assert p_m[c] == q_m[c].py() + df = pd.DataFrame( { 'a': [1, 2, 2, 4], 'b': [1, 2, 6, 7], 'c': [7, 8, 9, 10], - 'd': ['foo', 'baz', 'baz', 'qux'] } ) tab = kx.toq(df) @@ -2050,24 +2060,7 @@ def test_nunique(kx, q): for c in range(len(tab)): assert p_m[c] == q_m[c].py() - tab = kx.q('([]A:4 0n 7 6;B:4 0n 0n 7;C:``foo`foo`)') - df = tab.pd() - p_m = df.nunique() - q_m = tab.nunique() - for c in q.key(q_m).py(): - assert p_m[c] == q_m[c].py() - p_m = df.nunique(axis=1, dropna=False) - q_m = tab.nunique(axis=1, dropna=False) - for c in range(len(tab)): - assert p_m[c] == q_m[c].py() - p_m = df.nunique(dropna=False) - q_m = tab.nunique(dropna=False) - for c in q.key(q_m).py(): - assert p_m[c] == q_m[c].py() - - tab = kx.q('([]A:("";" ";"";"foo"))') - df = tab.pd() - p_m = df.nunique() - q_m = tab.nunique() - assert p_m['A'] == 1 + q_m['A'].py() - + tab = kx.q('([]a:("";" ";"";"foo"))') + with pytest.raises(NotImplementedError, + match=r"Table contains a column whose type is mixed"): + raise tab.nunique() From 2aa3a6e60235a670327163f1dad8b88d693f57cb Mon Sep 17 00:00:00 2001 From: chraberturas Date: Thu, 18 Jan 2024 12:05:15 +0100 Subject: [PATCH 6/7] QError for mixed lists (suggested by Kx) --- src/pykx/pandas_api/pandas_meta.py | 2 -- tests/test_pandas_api.py | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/pykx/pandas_api/pandas_meta.py b/src/pykx/pandas_api/pandas_meta.py index 
1ba34f4..59de3d8 100644 --- a/src/pykx/pandas_api/pandas_meta.py +++ b/src/pykx/pandas_api/pandas_meta.py @@ -260,8 +260,6 @@ def sum(self, axis=0, skipna=True, numeric_only=False, min_count=0): @convert_result def nunique(self, axis=0, dropna=True): res, cols = preparse_computations(self, axis, skipna=False) - if q("any('[1<>count distinct@;type']')@", res).py(): - raise NotImplementedError("Table contains a column whose type is mixed") filterNan = q('{$[all[10h=type each x]|11h = type x;x;' 'x where not null x]}each') res = filterNan(res) if dropna else res diff --git a/tests/test_pandas_api.py b/tests/test_pandas_api.py index 0c5f893..e6e9891 100644 --- a/tests/test_pandas_api.py +++ b/tests/test_pandas_api.py @@ -2061,6 +2061,5 @@ def test_nunique(kx, q): assert p_m[c] == q_m[c].py() tab = kx.q('([]a:("";" ";"";"foo"))') - with pytest.raises(NotImplementedError, - match=r"Table contains a column whose type is mixed"): + with pytest.raises(kx.QError): raise tab.nunique() From 4aff510f97b4a5d80471fca423dcff5ecdcfaf26 Mon Sep 17 00:00:00 2001 From: chraberturas Date: Mon, 22 Jan 2024 10:35:03 +0100 Subject: [PATCH 7/7] minor: rename filternan (suggested) --- src/pykx/pandas_api/pandas_meta.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pykx/pandas_api/pandas_meta.py b/src/pykx/pandas_api/pandas_meta.py index 59de3d8..75805a3 100644 --- a/src/pykx/pandas_api/pandas_meta.py +++ b/src/pykx/pandas_api/pandas_meta.py @@ -260,9 +260,9 @@ def sum(self, axis=0, skipna=True, numeric_only=False, min_count=0): @convert_result def nunique(self, axis=0, dropna=True): res, cols = preparse_computations(self, axis, skipna=False) - filterNan = q('{$[all[10h=type each x]|11h = type x;x;' + filternan = q('{$[all[10h=type each x]|11h = type x;x;' 'x where not null x]}each') - res = filterNan(res) if dropna else res + res = filternan(res) if dropna else res return (q("('[count;distinct]')", res), cols) def agg(self, func, axis=0, *args, **kwargs): # noqa: 
C901