Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#902: fixed memory related bugs with emit dataframe #414

Merged
merged 26 commits into from
Jun 7, 2024
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
b1c239e
#902: fixed memory leak with emit dataframe
tomuben May 23, 2024
67862d5
Test
tomuben May 24, 2024
1c93f0c
Several fixes:
tomuben May 24, 2024
0035320
Reverted formatting
tomuben May 24, 2024
64a3fa8
Reverted release PyList_Append()
tomuben May 24, 2024
04d4bdf
Fixed formatting
tomuben May 24, 2024
f9cd794
Fixed formatting
tomuben May 24, 2024
8fe96dc
Fixed error code
tomuben May 24, 2024
f148427
Avoid copy of int/double
tomuben May 26, 2024
d6b1177
Tests and fix for datetime only columns
tomuben May 28, 2024
7e300f0
Added more comments
tomuben May 28, 2024
e8f3f90
Decreased batch count in emit_dtypes_memory_leak.py
tomuben May 28, 2024
170aa4c
Use a single variable for max memory usage
tomuben May 29, 2024
0daddff
Modified dateframe_memory_leak.py
tomuben May 31, 2024
0b5233d
Fixed findings from review
tomuben May 31, 2024
2a927fb
Uncommented tests
tomuben May 31, 2024
9685734
Adjusted memory limit for test_dataframe_set_emits()
tomuben May 31, 2024
7c931b2
Fixes from review
tomuben Jun 3, 2024
ebee2e2
Merge branch 'master' into bug/902_fix_memory_leak_emit_df
tomuben Jun 3, 2024
93b609d
Merge branch 'master' into bug/902_fix_memory_leak_emit_df
tomuben Jun 4, 2024
53bc403
fixed dataframe_memory_leak.py
tomuben Jun 4, 2024
96b58aa
Merge branch 'master' into bug/902_fix_memory_leak_emit_df
tkilias Jun 6, 2024
5ea9ef2
Merge branch 'master' into bug/902_fix_memory_leak_emit_df
tomuben Jun 6, 2024
f7f8fb7
Merge branch 'master' into bug/902_fix_memory_leak_emit_df
tomuben Jun 7, 2024
0bb3b39
Increased memory limit in test
tomuben Jun 7, 2024
36a15c0
Fixed include file for std::all_of
tomuben Jun 7, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 90 additions & 67 deletions exaudfclient/base/python/python3/python_ext_dataframe.cc
Original file line number Diff line number Diff line change
Expand Up @@ -162,12 +162,18 @@ struct PyPtr {
PyUniquePtr ptr;
};

inline void checkPyPtrIsNull(const PyPtr& obj) {
inline void checkPyPtrIsNotNull(const PyPtr& obj) {
// Error message set by Python
if (!obj)
throw std::runtime_error("F-UDF-CL-SL-PYTHON-1039");
}

inline void checkPyObjIsNotNull(const PyObject *obj) {
// Error message set by Python
if (!obj)
throw std::runtime_error("F-UDF-CL-SL-PYTHON-1142");
}



struct ColumnInfo
Expand Down Expand Up @@ -620,7 +626,7 @@ inline void handleEmitNpyUint64(
throw std::runtime_error(ss.str().c_str());
}
}
checkPyPtrIsNull(pyValue);
checkPyPtrIsNotNull(pyValue);
pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pyColSetMethods[c].second.get(), pyColSetMethods[c].first.get(), pyValue.get(), NULL));
}

Expand Down Expand Up @@ -657,7 +663,7 @@ inline void handleEmitNpyUint32(
throw std::runtime_error(ss.str().c_str());
}
}
checkPyPtrIsNull(pyValue);
checkPyPtrIsNotNull(pyValue);
pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pyColSetMethods[c].second.get(), pyColSetMethods[c].first.get(), pyValue.get(), NULL));
}

Expand Down Expand Up @@ -694,7 +700,7 @@ inline void handleEmitNpyUint16(
throw std::runtime_error(ss.str().c_str());
}
}
checkPyPtrIsNull(pyValue);
checkPyPtrIsNotNull(pyValue);
pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pyColSetMethods[c].second.get(), pyColSetMethods[c].first.get(), pyValue.get(), NULL));
}

Expand Down Expand Up @@ -731,7 +737,7 @@ inline void handleEmitNpyUint8(
throw std::runtime_error(ss.str().c_str());
}
}
checkPyPtrIsNull(pyValue);
checkPyPtrIsNotNull(pyValue);
pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pyColSetMethods[c].second.get(), pyColSetMethods[c].first.get(), pyValue.get(), NULL));
}

Expand Down Expand Up @@ -768,7 +774,7 @@ inline void handleEmitNpyFloat64(
throw std::runtime_error(ss.str().c_str());
}
}
checkPyPtrIsNull(pyValue);
checkPyPtrIsNotNull(pyValue);
pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pyColSetMethods[c].second.get(), pyColSetMethods[c].first.get(), pyValue.get(), NULL));
}

Expand Down Expand Up @@ -805,7 +811,7 @@ inline void handleEmitNpyFloat32(
throw std::runtime_error(ss.str().c_str());
}
}
checkPyPtrIsNull(pyValue);
checkPyPtrIsNotNull(pyValue);
pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pyColSetMethods[c].second.get(), pyColSetMethods[c].first.get(), pyValue.get(), NULL));
}

Expand Down Expand Up @@ -842,7 +848,7 @@ inline void handleEmitNpyBool(
throw std::runtime_error(ss.str().c_str());
}
}
checkPyPtrIsNull(pyValue);
checkPyPtrIsNotNull(pyValue);
pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pyColSetMethods[c].second.get(), pyColSetMethods[c].first.get(), pyValue.get(), NULL));
}

Expand All @@ -856,19 +862,20 @@ inline void handleEmitPyBool(
PyPtr& pyValue,
PyPtr& pyResult,
PyPtr& pySetNullMethodName){
PyPtr pyBool(PyList_GetItem(columnArrays[c].get(), r));
checkPyPtrIsNull(pyBool);
if (isNoneOrNA(pyBool.get())) {
//PyList_GetItem returns a 'borrowed' reference. Must not call XDECREF() on it.
PyObject *pyBool = PyList_GetItem(columnArrays[c].get(), r);
checkPyObjIsNotNull(pyBool);
if (isNoneOrNA(pyBool)) {
pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL));
return;
}
switch (colInfo[c].type) {
case SWIGVMContainers::BOOLEAN:
if (pyBool.get() == Py_True) {
if (pyBool == Py_True) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this makes also only slightly sense, but we don't change it int this PR

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My understanding is that Py_True (which comes from the Python header file) is a static pointer to the (immortal) "True" object of Python.
See https://docs.python.org/3/c-api/bool.html#c.Py_True

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The thing is, what ever pyBool is, it is either Py_True or Py_False, otherwise we would need to use a Python comparison instead of a c++. That means, the comparison is meaningless, because we do both cases the same. We could replace this with INCREF(pyBool)

Py_INCREF(Py_True);
pyValue.reset(Py_True);
}
else if (pyBool.get() == Py_False) {
else if (pyBool == Py_False) {
Py_INCREF(Py_False);
pyValue.reset(Py_False);
}
Expand All @@ -883,7 +890,7 @@ inline void handleEmitPyBool(
throw std::runtime_error(ss.str().c_str());
}
}
checkPyPtrIsNull(pyValue);
checkPyPtrIsNotNull(pyValue);
pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pyColSetMethods[c].second.get(), pyColSetMethods[c].first.get(), pyValue.get(), NULL));
}

Expand All @@ -897,24 +904,29 @@ inline void handleEmitPyInt(
PyPtr& pyValue,
PyPtr& pyResult,
PyPtr& pySetNullMethodName){
PyPtr pyInt(PyList_GetItem(columnArrays[c].get(), r));
checkPyPtrIsNull(pyInt);
if (isNoneOrNA(pyInt.get())) {
//PyList_GetItem returns a 'borrowed' reference. Must not call XDECREF() on it.
PyObject *pyInt = PyList_GetItem(columnArrays[c].get(), r);
checkPyObjIsNotNull(pyInt);
if (isNoneOrNA(pyInt)) {
pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL));
return;
}

switch (colInfo[c].type) {
case SWIGVMContainers::INT64:
case SWIGVMContainers::INT32:
pyValue.reset(pyInt.release());
{
//pyInt points to a 'borrowed' reference. We need to explicitly increase the ref counter here, as pyValue will decrease it again later.
Py_INCREF(pyInt);
pyValue.reset(pyInt);
break;
}
case SWIGVMContainers::NUMERIC:
pyValue.reset(PyObject_Str(pyInt.get()));
pyValue.reset(PyObject_Str(pyInt));
break;
case SWIGVMContainers::DOUBLE:
{
double value = PyFloat_AsDouble(pyInt.get());
double value = PyFloat_AsDouble(pyInt);
if (value < 0 && PyErr_Occurred())
throw std::runtime_error("F-UDF-CL-SL-PYTHON-1067: emit() PY_INT: PyFloat_AsDouble error");
pyValue.reset(PyFloat_FromDouble(value));
Expand All @@ -940,9 +952,10 @@ inline void handleEmitPyFloat(
PyPtr& pyValue,
PyPtr& pyResult,
PyPtr& pySetNullMethodName){
PyPtr pyFloat(PyList_GetItem(columnArrays[c].get(), r));
checkPyPtrIsNull(pyFloat);
if (isNoneOrNA(pyFloat.get())) {
//PyList_GetItem returns a 'borrowed' reference. Must not call XDECREF() on it.
PyObject *pyFloat = PyList_GetItem(columnArrays[c].get(), r);
checkPyObjIsNotNull(pyFloat);
if (isNoneOrNA(pyFloat)) {
pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL));
return;
}
Expand All @@ -951,7 +964,7 @@ inline void handleEmitPyFloat(
case SWIGVMContainers::INT64:
case SWIGVMContainers::INT32:
{
double value = PyFloat_AsDouble(pyFloat.get());
double value = PyFloat_AsDouble(pyFloat);
if (value < 0 && PyErr_Occurred())
throw std::runtime_error("F-UDF-CL-SL-PYTHON-1139: emit() PY_FLOAT: PyFloat_AsDouble error");
if (npy_isnan(value)) {
Expand All @@ -963,11 +976,15 @@ inline void handleEmitPyFloat(
break;
}
case SWIGVMContainers::NUMERIC:
pyValue.reset(PyObject_Str(pyFloat.get()));
pyValue.reset(PyObject_Str(pyFloat));
break;
case SWIGVMContainers::DOUBLE:
pyValue.reset(pyFloat.release());
{
//pyFloat points to a 'borrowed' reference. We need to explicitly increase the ref counter here, as pyValue will decrease it again later.
Py_INCREF(pyFloat);
pyValue.reset(pyFloat);
break;
}
default:
{
std::stringstream ss;
Expand All @@ -991,8 +1008,9 @@ inline void handleEmitPyDecimal(
PyPtr& pyIntMethodName,
PyPtr& pyFloatMethodName
){
PyPtr pyDecimal(PyList_GetItem(columnArrays[c].get(), r));
if (isNoneOrNA(pyDecimal.get())) {
//PyList_GetItem returns a 'borrowed' reference. Must not call XDECREF() on it.
PyObject *pyDecimal = PyList_GetItem(columnArrays[c].get(), r);
if (isNoneOrNA(pyDecimal)) {
pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL));
return;
}
Expand All @@ -1001,16 +1019,16 @@ inline void handleEmitPyDecimal(
case SWIGVMContainers::INT64:
case SWIGVMContainers::INT32:
{
PyPtr pyInt(PyObject_CallMethodObjArgs(pyDecimal.get(), pyIntMethodName.get(), NULL));
PyPtr pyInt(PyObject_CallMethodObjArgs(pyDecimal, pyIntMethodName.get(), NULL));
pyValue.reset(pyInt.release());
break;
}
case SWIGVMContainers::NUMERIC:
pyValue.reset(PyObject_Str(pyDecimal.get()));
pyValue.reset(PyObject_Str(pyDecimal));
break;
case SWIGVMContainers::DOUBLE:
{
PyPtr pyFloat(PyObject_CallMethodObjArgs(pyDecimal.get(), pyFloatMethodName.get(), NULL));
PyPtr pyFloat(PyObject_CallMethodObjArgs(pyDecimal, pyFloatMethodName.get(), NULL));
pyValue.reset(pyFloat.release());
break;
}
Expand All @@ -1034,25 +1052,25 @@ inline void handleEmitPyStr(
PyPtr& pyValue,
PyPtr& pyResult,
PyPtr& pySetNullMethodName){

PyPtr pyString(PyList_GetItem(columnArrays[c].get(), r));
if (isNoneOrNA(pyString.get())) {
//PyList_GetItem returns a 'borrowed' reference. Must not call XDECREF() on it.
PyObject *pyString = PyList_GetItem(columnArrays[c].get(), r);
if (isNoneOrNA(pyString)) {
pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL));
return;
}

switch (colInfo[c].type) {
case SWIGVMContainers::NUMERIC:
pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pyColSetMethods[c].second.get(), pyColSetMethods[c].first.get(), pyString.get(), NULL));
pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pyColSetMethods[c].second.get(), pyColSetMethods[c].first.get(), pyString, NULL));
break;
case SWIGVMContainers::STRING:
{
Py_ssize_t size = -1;
const char *str = PyUnicode_AsUTF8AndSize(pyString.get(), &size);
const char *str = PyUnicode_AsUTF8AndSize(pyString, &size);
if (!str && size < 0)
throw std::runtime_error("F-UDF-CL-SL-PYTHON-1137: invalid size of string");
PyPtr pySize(PyLong_FromSsize_t(size));
pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pyColSetMethods[c].second.get(), pyColSetMethods[c].first.get(), pyString.get(), pySize.get(), NULL));
pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pyColSetMethods[c].second.get(), pyColSetMethods[c].first.get(), pyString, pySize.get(), NULL));
break;
}
default:
Expand All @@ -1075,16 +1093,17 @@ inline void handleEmitPyDate(
PyPtr& pyResult,
PyPtr& pySetNullMethodName,
PyPtr& pyIsoformatMethodName){
PyPtr pyDate(PyList_GetItem(columnArrays[c].get(), r));
if (isNoneOrNA(pyDate.get())) {
//PyList_GetItem returns a 'borrowed' reference. Must not call XDECREF() on it.
PyObject *pyDate = PyList_GetItem(columnArrays[c].get(), r);
if (isNoneOrNA(pyDate)) {
pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL));
return;
}

switch (colInfo[c].type) {
case SWIGVMContainers::DATE:
{
PyPtr pyIsoDate(PyObject_CallMethodObjArgs(pyDate.get(), pyIsoformatMethodName.get(), NULL));
PyPtr pyIsoDate(PyObject_CallMethodObjArgs(pyDate, pyIsoformatMethodName.get(), NULL));
pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pyColSetMethods[c].second.get(), pyColSetMethods[c].first.get(), pyIsoDate.get(), NULL));
break;
}
Expand All @@ -1106,8 +1125,9 @@ inline void handleEmitPyTimestamp(
PyPtr& pyValue,
PyPtr& pyResult,
PyPtr& pySetNullMethodName){
PyPtr pyTimestamp(PyList_GetItem(columnArrays[c].get(), r));
if (isNoneOrNA(pyTimestamp.get())) {
//PyList_GetItem returns a 'borrowed' reference. Must not call XDECREF() on it.
PyObject *pyTimestamp = PyList_GetItem(columnArrays[c].get(), r);
if (isNoneOrNA(pyTimestamp)) {
pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL));
return;
}
Expand All @@ -1118,8 +1138,8 @@ inline void handleEmitPyTimestamp(
// We call here pandas.Timestamp.tz_localize(None), because we need to remove the timezone from the timestamp.
// Exasol doesn't support timezones, and if we don't remove the timezone, pandas.Timestamp.isoformat will add
// it to the generated string.
pyTimestamp.reset(PyObject_CallMethod(pyTimestamp.get(), "tz_localize", "z", NULL));
PyPtr pyIsoDatetime(PyObject_CallMethod(pyTimestamp.get(), "isoformat", "s", " "));
PyPtr pyTzLocalize(PyObject_CallMethod(pyTimestamp, "tz_localize", "z", NULL));
PyPtr pyIsoDatetime(PyObject_CallMethod(pyTzLocalize.get(), "isoformat", "s", " "));
pyResult.reset(PyObject_CallMethodObjArgs(
resultHandler, pyColSetMethods[c].second.get(), pyColSetMethods[c].first.get(), pyIsoDatetime.get(), NULL));
break;
Expand All @@ -1145,16 +1165,17 @@ inline void handleEmitNpyDateTime(
PyPtr& pyResult,
PyPtr& pySetNullMethodName,
PyPtr& pdNaT){
PyPtr pyTimestamp(PyList_GetItem(columnArrays[c].get(), r));
if (pyTimestamp.get() == pdNaT.get()) {
//PyList_GetItem returns a 'borrowed' reference. Must not call XDECREF() on it.
PyObject *pyTimestamp = PyList_GetItem(columnArrays[c].get(), r);
if (pyTimestamp == pdNaT.get()) {
pyResult.reset(PyObject_CallMethodObjArgs(resultHandler, pySetNullMethodName.get(), pyColSetMethods[c].first.get(), NULL));
return;
}

switch (colInfo[c].type) {
case SWIGVMContainers::TIMESTAMP:
{
PyPtr pyIsoDatetime(PyObject_CallMethod(pyTimestamp.get(), "isoformat", "s", " "));
PyPtr pyIsoDatetime(PyObject_CallMethod(pyTimestamp, "isoformat", "s", " "));
pyResult.reset(PyObject_CallMethodObjArgs(
resultHandler, pyColSetMethods[c].second.get(), pyColSetMethods[c].first.get(), pyIsoDatetime.get(), NULL));
break;
Expand Down Expand Up @@ -1189,32 +1210,34 @@ void emit(PyObject *resultHandler, std::vector<ColumnInfo>& colInfo, PyObject *d


PyPtr data;
PyPtr arrayPtr;
PyArrayObject *pyArray;
PyPtr colArray;
if(colTypes.size()==1 && colTypes.at(0).second == NPY_DATETIME){
// if we get an dataframe with a single datetime column with type datetime[ns],
// it doesn't get transformed into a 2D Array with the attributes values,
// instead we get a DatetimeIndex like this:
// DatetimeIndex(['2020-07-27 14:22:33.600699', ...], dtype='datetime64[ns]', freq=None)
// As a workaround we add a column to the dataframe via resetIndex, use values to get a 2D Array and
// slice out the datetime column, which then will be an Array of pandas.Timestamp objects
PyPtr resetIndex(PyObject_CallMethod(dataframe, "reset_index", NULL));
data=PyPtr(PyObject_GetAttrString(resetIndex.get(), "values"));
pyArray = reinterpret_cast<PyArrayObject*>(PyArray_FROM_OTF(data.get(), NPY_OBJECT, NPY_ARRAY_IN_ARRAY));
bool allColsAreDateTime =
std::all_of(colTypes.begin(), colTypes.end(),
[](std::pair<std::string, int> colType) {
return colType.second == NPY_DATETIME;
});
if(allColsAreDateTime) {
// if we get an dataframe with only datetime columns with type datetime[ns],
// As we call PyArray_FROM_OTF(data.get(), NPY_OBJECT, NPY_ARRAY_IN_ARRAY) with parameter NPY_OBJECT we need
// to explicitly cast the values to type 'object'. Per default the dtypes of values are datetime[ns].
PyPtr asTypeFunc (PyObject_GetAttrString(dataframe, "astype"));
PyPtr keywordArgs(PyDict_New());
PyDict_SetItemString(keywordArgs.get(), "copy", Py_False);
PyPtr funcArgs(Py_BuildValue("(s)", "object"));
PyPtr castedValues(PyObject_Call(asTypeFunc.get(), funcArgs.get(), keywordArgs.get()));
data.reset(PyObject_GetAttrString(castedValues.get(), "values"));
arrayPtr = PyPtr(PyArray_FROM_OTF(data.get(), NPY_OBJECT, NPY_ARRAY_IN_ARRAY));
pyArray = reinterpret_cast<PyArrayObject*>(arrayPtr.get());
numRows = PyArray_DIM(pyArray, 0);
numCols = PyArray_DIM(pyArray, 1)-1;
numCols = PyArray_DIM(pyArray, 1);
// Transpose to column-major
PyPtr transpose = PyPtr(PyArray_Transpose(pyArray, NULL));

PyPtr pyStart(PyLong_FromLong(1));
PyPtr pyStop(PyLong_FromLong(2));
PyPtr slice(PySlice_New(pyStart.get(), pyStop.get(), Py_None));
colArray=PyPtr(PyObject_GetItem(transpose.get(), slice.get()));


colArray = PyPtr(PyArray_Transpose(pyArray, NULL));
}else{
data=PyPtr(PyObject_GetAttrString(dataframe, "values"));
pyArray = reinterpret_cast<PyArrayObject*>(PyArray_FROM_OTF(data.get(), NPY_OBJECT, NPY_ARRAY_IN_ARRAY));
arrayPtr = PyPtr(PyArray_FROM_OTF(data.get(), NPY_OBJECT, NPY_ARRAY_IN_ARRAY));
pyArray = reinterpret_cast<PyArrayObject*>(arrayPtr.get());
numRows = PyArray_DIM(pyArray, 0);
numCols = PyArray_DIM(pyArray, 1);
// Transpose to column-major
Expand Down
1 change: 1 addition & 0 deletions test_container/build/deps/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ lxml
docker
scipy
https://github.com/exasol/exasol-python-test-framework/releases/download/0.4.0/exasol_python_test_framework-0.4.0-py3-none-any.whl
humanfriendly
Loading
Loading