From c478e58f5458ed9a3d7a8f422d5a79f9f5efab26 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 15 Sep 2023 15:13:24 -0400 Subject: [PATCH 1/6] replace cpplint with clang-format --- .pre-commit-config.yaml | 20 +- .../pandas/datetime/date_conversions.h | 6 +- .../include/pandas/datetime/pd_datetime.h | 6 +- pandas/_libs/include/pandas/inline_helper.h | 24 +- pandas/_libs/include/pandas/parser/io.h | 8 +- .../_libs/include/pandas/parser/pd_parser.h | 11 +- .../_libs/include/pandas/parser/tokenizer.h | 227 +- pandas/_libs/include/pandas/portable.h | 5 +- pandas/_libs/include/pandas/skiplist.h | 388 +- .../include/pandas/vendored/klib/khash.h | 674 ++-- .../pandas/vendored/klib/khash_python.h | 544 ++- .../vendored/numpy/datetime/np_datetime.h | 68 +- .../numpy/datetime/np_datetime_strings.h | 42 +- .../pandas/vendored/ujson/lib/ultrajson.h | 96 +- .../pandas/vendored/ujson/python/version.h | 17 +- pandas/_libs/src/datetime/date_conversions.c | 118 +- pandas/_libs/src/datetime/pd_datetime.c | 235 +- pandas/_libs/src/parser/io.c | 126 +- pandas/_libs/src/parser/pd_parser.c | 3 +- pandas/_libs/src/parser/tokenizer.c | 3346 ++++++++--------- .../src/vendored/numpy/datetime/np_datetime.c | 1624 ++++---- .../numpy/datetime/np_datetime_strings.c | 1737 ++++----- .../src/vendored/ujson/lib/ultrajsondec.c | 1524 ++++---- .../src/vendored/ujson/lib/ultrajsonenc.c | 1429 ++++--- .../src/vendored/ujson/python/JSONtoObj.c | 690 ++-- .../src/vendored/ujson/python/objToJSON.c | 3054 ++++++++------- .../_libs/src/vendored/ujson/python/ujson.c | 636 ++-- 27 files changed, 8305 insertions(+), 8353 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c01bf65818167..dfa4f0db0c03c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -70,19 +70,13 @@ repos: - id: fix-encoding-pragma args: [--remove] - id: trailing-whitespace -- repo: https://github.com/cpplint/cpplint - rev: 1.6.1 - hooks: - - id: cpplint - exclude: ^pandas/_libs/include/pandas/vendored/klib - args: [ - --quiet, - '--extensions=c,h', - '--headers=h', - --recursive, - --linelength=88, - '--filter=-readability/casting,-runtime/int,-build/include_subdir,-readability/fn_size' - ] +- repo: https://github.com/pocc/pre-commit-hooks + rev: v1.3.5 + hooks: + - id: clang-format + include: ^pandas/_libs/src|^pandas/_libs/include + args: [-i] + types_or: [c, c++] - repo: https://github.com/pylint-dev/pylint rev: v3.0.0a7 hooks: diff --git a/pandas/_libs/include/pandas/datetime/date_conversions.h b/pandas/_libs/include/pandas/datetime/date_conversions.h index a5ad926924dc5..42a16f33cc2ea 100644 --- a/pandas/_libs/include/pandas/datetime/date_conversions.h +++ b/pandas/_libs/include/pandas/datetime/date_conversions.h @@ -18,10 +18,8 @@ int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit); // up to precision `base` e.g. base="s" yields 2020-01-03T00:00:00Z // while base="ns" yields "2020-01-01T00:00:00.000000000Z" // len is mutated to save the length of the returned string -char *int64ToIso(int64_t value, - NPY_DATETIMEUNIT valueUnit, - NPY_DATETIMEUNIT base, - size_t *len); +char *int64ToIso(int64_t value, NPY_DATETIMEUNIT valueUnit, + NPY_DATETIMEUNIT base, size_t *len); // TODO(username): this function doesn't do a lot; should augment or // replace with scaleNanosecToUnit diff --git a/pandas/_libs/include/pandas/datetime/pd_datetime.h b/pandas/_libs/include/pandas/datetime/pd_datetime.h index 3e362deb87807..714d264924750 100644 --- a/pandas/_libs/include/pandas/datetime/pd_datetime.h +++ b/pandas/_libs/include/pandas/datetime/pd_datetime.h @@ -19,12 +19,12 @@ See NUMPY_LICENSE.txt for the license. #ifndef NPY_NO_DEPRECATED_API #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION -#endif // NPY_NO_DEPRECATED_API +#endif // NPY_NO_DEPRECATED_API -#include +#include "pandas/datetime/date_conversions.h" #include "pandas/vendored/numpy/datetime/np_datetime.h" #include "pandas/vendored/numpy/datetime/np_datetime_strings.h" -#include "pandas/datetime/date_conversions.h" +#include #ifdef __cplusplus extern "C" { diff --git a/pandas/_libs/include/pandas/inline_helper.h b/pandas/_libs/include/pandas/inline_helper.h index c77da0e52b9d3..1e03da1327470 100644 --- a/pandas/_libs/include/pandas/inline_helper.h +++ b/pandas/_libs/include/pandas/inline_helper.h @@ -10,15 +10,15 @@ The full license is in the LICENSE file, distributed with this software. #pragma once #ifndef PANDAS_INLINE - #if defined(__clang__) - #define PANDAS_INLINE static __inline__ __attribute__ ((__unused__)) - #elif defined(__GNUC__) - #define PANDAS_INLINE static __inline__ - #elif defined(_MSC_VER) - #define PANDAS_INLINE static __inline - #elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L - #define PANDAS_INLINE static inline - #else - #define PANDAS_INLINE - #endif // __GNUC__ -#endif // PANDAS_INLINE +#if defined(__clang__) +#define PANDAS_INLINE static __inline__ __attribute__((__unused__)) +#elif defined(__GNUC__) +#define PANDAS_INLINE static __inline__ +#elif defined(_MSC_VER) +#define PANDAS_INLINE static __inline +#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L +#define PANDAS_INLINE static inline +#else +#define PANDAS_INLINE +#endif // __GNUC__ +#endif // PANDAS_INLINE diff --git a/pandas/_libs/include/pandas/parser/io.h b/pandas/_libs/include/pandas/parser/io.h index 9032eb6759358..cbe6bc04b7663 100644 --- a/pandas/_libs/include/pandas/parser/io.h +++ b/pandas/_libs/include/pandas/parser/io.h @@ -10,15 +10,15 @@ The full license is in the LICENSE file, distributed with this software. #pragma once #define PY_SSIZE_T_CLEAN -#include #include "tokenizer.h" +#include #define FS(source) ((file_source *)source) typedef struct _rd_source { - PyObject *obj; - PyObject *buffer; - size_t position; + PyObject *obj; + PyObject *buffer; + size_t position; } rd_source; #define RDS(source) ((rd_source *)source) diff --git a/pandas/_libs/include/pandas/parser/pd_parser.h b/pandas/_libs/include/pandas/parser/pd_parser.h index 1ea94fde593ef..61f15dcef8d27 100644 --- a/pandas/_libs/include/pandas/parser/pd_parser.h +++ b/pandas/_libs/include/pandas/parser/pd_parser.h @@ -13,8 +13,8 @@ extern "C" { #endif #define PY_SSIZE_T_CLEAN -#include #include "pandas/parser/tokenizer.h" +#include typedef struct { int (*to_double)(char *, double *, char, char, int *); @@ -81,11 +81,10 @@ static PandasParser_CAPI *PandasParserAPI = NULL; PandasParserAPI->parser_set_default_options((self)) #define parser_consume_rows(self, nrows) \ PandasParserAPI->parser_consume_rows((self), (nrows)) -#define parser_trim_buffers(self) \ - PandasParserAPI->parser_trim_buffers((self)) -#define tokenize_all_rows(self, encoding_errors) \ +#define parser_trim_buffers(self) PandasParserAPI->parser_trim_buffers((self)) +#define tokenize_all_rows(self, encoding_errors) \ PandasParserAPI->tokenize_all_rows((self), (encoding_errors)) -#define tokenize_nrows(self, nrows, encoding_errors) \ +#define tokenize_nrows(self, nrows, encoding_errors) \ PandasParserAPI->tokenize_nrows((self), (nrows), (encoding_errors)) #define str_to_int64(p_item, int_min, int_max, error, t_sep) \ PandasParserAPI->str_to_int64((p_item), (int_min), (int_max), (error), \ @@ -104,7 +103,7 @@ static PandasParser_CAPI *PandasParserAPI = NULL; PandasParserAPI->round_trip((p), (q), (decimal), (sci), (tsep), \ (skip_trailing), (error), (maybe_int)) #define to_boolean(item, val) PandasParserAPI->to_boolean((item), (val)) -#endif /* !defined(_PANDAS_PARSER_IMPL) */ +#endif /* !defined(_PANDAS_PARSER_IMPL) */ #ifdef __cplusplus } diff --git a/pandas/_libs/include/pandas/parser/tokenizer.h b/pandas/_libs/include/pandas/parser/tokenizer.h index a53d09012116d..6a46ad637a401 100644 --- a/pandas/_libs/include/pandas/parser/tokenizer.h +++ b/pandas/_libs/include/pandas/parser/tokenizer.h @@ -18,9 +18,9 @@ See LICENSE for the license #define ERROR_OVERFLOW 2 #define ERROR_INVALID_CHARS 3 -#include #include "pandas/inline_helper.h" #include "pandas/portable.h" +#include #include "pandas/vendored/klib/khash.h" @@ -29,7 +29,6 @@ See LICENSE for the license #define REACHED_EOF 1 #define CALLING_READ_FAILED 2 - /* C flat file parsing low level code for pandas / NumPy @@ -46,7 +45,7 @@ See LICENSE for the license #define TRACE(X) printf X; #else #define TRACE(X) -#endif // VERBOSE +#endif // VERBOSE #define PARSER_OUT_OF_MEMORY -1 @@ -56,131 +55,127 @@ See LICENSE for the license */ typedef enum { - START_RECORD, - START_FIELD, - ESCAPED_CHAR, - IN_FIELD, - IN_QUOTED_FIELD, - ESCAPE_IN_QUOTED_FIELD, - QUOTE_IN_QUOTED_FIELD, - EAT_CRNL, - EAT_CRNL_NOP, - EAT_WHITESPACE, - EAT_COMMENT, - EAT_LINE_COMMENT, - WHITESPACE_LINE, - START_FIELD_IN_SKIP_LINE, - IN_FIELD_IN_SKIP_LINE, - IN_QUOTED_FIELD_IN_SKIP_LINE, - QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE, - FINISHED + START_RECORD, + START_FIELD, + ESCAPED_CHAR, + IN_FIELD, + IN_QUOTED_FIELD, + ESCAPE_IN_QUOTED_FIELD, + QUOTE_IN_QUOTED_FIELD, + EAT_CRNL, + EAT_CRNL_NOP, + EAT_WHITESPACE, + EAT_COMMENT, + EAT_LINE_COMMENT, + WHITESPACE_LINE, + START_FIELD_IN_SKIP_LINE, + IN_FIELD_IN_SKIP_LINE, + IN_QUOTED_FIELD_IN_SKIP_LINE, + QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE, + FINISHED } ParserState; typedef enum { - QUOTE_MINIMAL, - QUOTE_ALL, - QUOTE_NONNUMERIC, - QUOTE_NONE + QUOTE_MINIMAL, + QUOTE_ALL, + QUOTE_NONNUMERIC, + QUOTE_NONE } QuoteStyle; -typedef enum { - ERROR, - WARN, - SKIP -} BadLineHandleMethod; +typedef enum { ERROR, WARN, SKIP } BadLineHandleMethod; typedef void *(*io_callback)(void *src, size_t nbytes, size_t *bytes_read, int *status, const char *encoding_errors); typedef int (*io_cleanup)(void *src); typedef struct parser_t { - void *source; - io_callback cb_io; - io_cleanup cb_cleanup; - - int64_t chunksize; // Number of bytes to prepare for each chunk - char *data; // pointer to data to be processed - int64_t datalen; // amount of data available - int64_t datapos; - - // where to write out tokenized data - char *stream; - uint64_t stream_len; - uint64_t stream_cap; - - // Store words in (potentially ragged) matrix for now, hmm - char **words; - int64_t *word_starts; // where we are in the stream - uint64_t words_len; - uint64_t words_cap; - uint64_t max_words_cap; // maximum word cap encountered - - char *pword_start; // pointer to stream start of current field - int64_t word_start; // position start of current field - - int64_t *line_start; // position in words for start of line - int64_t *line_fields; // Number of fields in each line - uint64_t lines; // Number of (good) lines observed - uint64_t file_lines; // Number of lines (including bad or skipped) - uint64_t lines_cap; // Vector capacity - - // Tokenizing stuff - ParserState state; - int doublequote; /* is " represented by ""? */ - char delimiter; /* field separator */ - int delim_whitespace; /* delimit by consuming space/tabs instead */ - char quotechar; /* quote character */ - char escapechar; /* escape character */ - char lineterminator; - int skipinitialspace; /* ignore spaces following delimiter? */ - int quoting; /* style of quoting to write */ - - char commentchar; - int allow_embedded_newline; - - int usecols; // Boolean: 1: usecols provided, 0: none provided - - Py_ssize_t expected_fields; - BadLineHandleMethod on_bad_lines; - - // floating point options - char decimal; - char sci; - - // thousands separator (comma, period) - char thousands; - - int header; // Boolean: 1: has header, 0: no header - int64_t header_start; // header row start - uint64_t header_end; // header row end - - void *skipset; - PyObject *skipfunc; - int64_t skip_first_N_rows; - int64_t skip_footer; - double (*double_converter)(const char *, char **, - char, char, char, int, int *, int *); - - // error handling - char *warn_msg; - char *error_msg; - - int skip_empty_lines; + void *source; + io_callback cb_io; + io_cleanup cb_cleanup; + + int64_t chunksize; // Number of bytes to prepare for each chunk + char *data; // pointer to data to be processed + int64_t datalen; // amount of data available + int64_t datapos; + + // where to write out tokenized data + char *stream; + uint64_t stream_len; + uint64_t stream_cap; + + // Store words in (potentially ragged) matrix for now, hmm + char **words; + int64_t *word_starts; // where we are in the stream + uint64_t words_len; + uint64_t words_cap; + uint64_t max_words_cap; // maximum word cap encountered + + char *pword_start; // pointer to stream start of current field + int64_t word_start; // position start of current field + + int64_t *line_start; // position in words for start of line + int64_t *line_fields; // Number of fields in each line + uint64_t lines; // Number of (good) lines observed + uint64_t file_lines; // Number of lines (including bad or skipped) + uint64_t lines_cap; // Vector capacity + + // Tokenizing stuff + ParserState state; + int doublequote; /* is " represented by ""? */ + char delimiter; /* field separator */ + int delim_whitespace; /* delimit by consuming space/tabs instead */ + char quotechar; /* quote character */ + char escapechar; /* escape character */ + char lineterminator; + int skipinitialspace; /* ignore spaces following delimiter? */ + int quoting; /* style of quoting to write */ + + char commentchar; + int allow_embedded_newline; + + int usecols; // Boolean: 1: usecols provided, 0: none provided + + Py_ssize_t expected_fields; + BadLineHandleMethod on_bad_lines; + + // floating point options + char decimal; + char sci; + + // thousands separator (comma, period) + char thousands; + + int header; // Boolean: 1: has header, 0: no header + int64_t header_start; // header row start + uint64_t header_end; // header row end + + void *skipset; + PyObject *skipfunc; + int64_t skip_first_N_rows; + int64_t skip_footer; + double (*double_converter)(const char *, char **, char, char, char, int, + int *, int *); + + // error handling + char *warn_msg; + char *error_msg; + + int skip_empty_lines; } parser_t; typedef struct coliter_t { - char **words; - int64_t *line_start; - int64_t col; + char **words; + int64_t *line_start; + int64_t col; } coliter_t; void coliter_setup(coliter_t *self, parser_t *parser, int64_t i, int64_t start); -#define COLITER_NEXT(iter, word) \ - do { \ - const int64_t i = *iter.line_start++ + iter.col; \ - word = i >= *iter.line_start ? "" : iter.words[i]; \ - } while (0) +#define COLITER_NEXT(iter, word) \ + do { \ + const int64_t i = *iter.line_start++ + iter.col; \ + word = i >= *iter.line_start ? "" : iter.words[i]; \ + } while (0) parser_t *parser_new(void); @@ -208,9 +203,9 @@ int tokenize_all_rows(parser_t *self, const char *encoding_errors); // and want to free memory from the token stream typedef struct uint_state { - int seen_sint; - int seen_uint; - int seen_null; + int seen_sint; + int seen_uint; + int seen_null; } uint_state; void uint_state_init(uint_state *self); @@ -223,9 +218,9 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep); double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int); -double precise_xstrtod(const char *p, char **q, char decimal, - char sci, char tsep, int skip_trailing, - int *error, int *maybe_int); +double precise_xstrtod(const char *p, char **q, char decimal, char sci, + char tsep, int skip_trailing, int *error, + int *maybe_int); // GH-15140 - round_trip requires and acquires the GIL on its own double round_trip(const char *p, char **q, char decimal, char sci, char tsep, diff --git a/pandas/_libs/include/pandas/portable.h b/pandas/_libs/include/pandas/portable.h index 2569aa4fded08..588f070372a8a 100644 --- a/pandas/_libs/include/pandas/portable.h +++ b/pandas/_libs/include/pandas/portable.h @@ -18,7 +18,8 @@ The full license is in the LICENSE file, distributed with this software. // GH-23516 - works around locale perf issues // from MUSL libc, licence at LICENSES/MUSL_LICENSE #define isdigit_ascii(c) (((unsigned)(c) - '0') < 10u) -#define getdigit_ascii(c, default) (isdigit_ascii(c) ? ((int)((c) - '0')) : default) +#define getdigit_ascii(c, default) \ + (isdigit_ascii(c) ? ((int)((c) - '0')) : default) #define isspace_ascii(c) (((c) == ' ') || (((unsigned)(c) - '\t') < 5)) -#define toupper_ascii(c) ((((unsigned)(c) - 'a') < 26) ? ((c) & 0x5f) : (c)) +#define toupper_ascii(c) ((((unsigned)(c) - 'a') < 26) ? ((c)&0x5f) : (c)) #define tolower_ascii(c) ((((unsigned)(c) - 'A') < 26) ? ((c) | 0x20) : (c)) diff --git a/pandas/_libs/include/pandas/skiplist.h b/pandas/_libs/include/pandas/skiplist.h index 3be9e51f42e09..d002dba193279 100644 --- a/pandas/_libs/include/pandas/skiplist.h +++ b/pandas/_libs/include/pandas/skiplist.h @@ -15,18 +15,18 @@ Python recipe (https://rhettinger.wordpress.com/2010/02/06/lost-knowledge/) #pragma once +#include "pandas/inline_helper.h" #include #include #include #include -#include "pandas/inline_helper.h" PANDAS_INLINE float __skiplist_nanf(void) { - const union { - int __i; - float __f; - } __bint = {0x7fc00000UL}; - return __bint.__f; + const union { + int __i; + float __f; + } __bint = {0x7fc00000UL}; + return __bint.__f; } #define PANDAS_NAN ((double)__skiplist_nanf()) @@ -35,46 +35,46 @@ PANDAS_INLINE double Log2(double val) { return log(val) / log(2.); } typedef struct node_t node_t; struct node_t { - node_t **next; - int *width; - double value; - int is_nil; - int levels; - int ref_count; + node_t **next; + int *width; + double value; + int is_nil; + int levels; + int ref_count; }; typedef struct { - node_t *head; - node_t **tmp_chain; - int *tmp_steps; - int size; - int maxlevels; + node_t *head; + node_t **tmp_chain; + int *tmp_steps; + int size; + int maxlevels; } skiplist_t; PANDAS_INLINE double urand(void) { - return ((double)rand() + 1) / ((double)RAND_MAX + 2); + return ((double)rand() + 1) / ((double)RAND_MAX + 2); } PANDAS_INLINE int int_min(int a, int b) { return a < b ? a : b; } PANDAS_INLINE node_t *node_init(double value, int levels) { - node_t *result; - result = (node_t *)malloc(sizeof(node_t)); - if (result) { - result->value = value; - result->levels = levels; - result->is_nil = 0; - result->ref_count = 0; - result->next = (node_t **)malloc(levels * sizeof(node_t *)); - result->width = (int *)malloc(levels * sizeof(int)); - if (!(result->next && result->width) && (levels != 0)) { - free(result->next); - free(result->width); - free(result); - return NULL; - } + node_t *result; + result = (node_t *)malloc(sizeof(node_t)); + if (result) { + result->value = value; + result->levels = levels; + result->is_nil = 0; + result->ref_count = 0; + result->next = (node_t **)malloc(levels * sizeof(node_t *)); + result->width = (int *)malloc(levels * sizeof(int)); + if (!(result->next && result->width) && (levels != 0)) { + free(result->next); + free(result->width); + free(result); + return NULL; } - return result; + } + return result; } // do this ourselves @@ -83,215 +83,215 @@ PANDAS_INLINE void node_incref(node_t *node) { ++(node->ref_count); } PANDAS_INLINE void node_decref(node_t *node) { --(node->ref_count); } static void node_destroy(node_t *node) { - int i; - if (node) { - if (node->ref_count <= 1) { - for (i = 0; i < node->levels; ++i) { - node_destroy(node->next[i]); - } - free(node->next); - free(node->width); - // printf("Reference count was 1, freeing\n"); - free(node); - } else { - node_decref(node); - } - // pretty sure that freeing the struct above will be enough + int i; + if (node) { + if (node->ref_count <= 1) { + for (i = 0; i < node->levels; ++i) { + node_destroy(node->next[i]); + } + free(node->next); + free(node->width); + // printf("Reference count was 1, freeing\n"); + free(node); + } else { + node_decref(node); } + // pretty sure that freeing the struct above will be enough + } } PANDAS_INLINE void skiplist_destroy(skiplist_t *skp) { - if (skp) { - node_destroy(skp->head); - free(skp->tmp_steps); - free(skp->tmp_chain); - free(skp); - } + if (skp) { + node_destroy(skp->head); + free(skp->tmp_steps); + free(skp->tmp_chain); + free(skp); + } } PANDAS_INLINE skiplist_t *skiplist_init(int expected_size) { - skiplist_t *result; - node_t *NIL, *head; - int maxlevels, i; - - maxlevels = 1 + Log2((double)expected_size); - result = (skiplist_t *)malloc(sizeof(skiplist_t)); - if (!result) { - return NULL; - } - result->tmp_chain = (node_t **)malloc(maxlevels * sizeof(node_t *)); - result->tmp_steps = (int *)malloc(maxlevels * sizeof(int)); - result->maxlevels = maxlevels; - result->size = 0; - - head = result->head = node_init(PANDAS_NAN, maxlevels); - NIL = node_init(0.0, 0); - - if (!(result->tmp_chain && result->tmp_steps && result->head && NIL)) { - skiplist_destroy(result); - node_destroy(NIL); - return NULL; - } - - node_incref(head); - - NIL->is_nil = 1; - - for (i = 0; i < maxlevels; ++i) { - head->next[i] = NIL; - head->width[i] = 1; - node_incref(NIL); - } - - return result; + skiplist_t *result; + node_t *NIL, *head; + int maxlevels, i; + + maxlevels = 1 + Log2((double)expected_size); + result = (skiplist_t *)malloc(sizeof(skiplist_t)); + if (!result) { + return NULL; + } + result->tmp_chain = (node_t **)malloc(maxlevels * sizeof(node_t *)); + result->tmp_steps = (int *)malloc(maxlevels * sizeof(int)); + result->maxlevels = maxlevels; + result->size = 0; + + head = result->head = node_init(PANDAS_NAN, maxlevels); + NIL = node_init(0.0, 0); + + if (!(result->tmp_chain && result->tmp_steps && result->head && NIL)) { + skiplist_destroy(result); + node_destroy(NIL); + return NULL; + } + + node_incref(head); + + NIL->is_nil = 1; + + for (i = 0; i < maxlevels; ++i) { + head->next[i] = NIL; + head->width[i] = 1; + node_incref(NIL); + } + + return result; } // 1 if left < right, 0 if left == right, -1 if left > right PANDAS_INLINE int _node_cmp(node_t *node, double value) { - if (node->is_nil || node->value > value) { - return -1; - } else if (node->value < value) { - return 1; - } else { - return 0; - } + if (node->is_nil || node->value > value) { + return -1; + } else if (node->value < value) { + return 1; + } else { + return 0; + } } PANDAS_INLINE double skiplist_get(skiplist_t *skp, int i, int *ret) { - node_t *node; - int level; - - if (i < 0 || i >= skp->size) { - *ret = 0; - return 0; - } - - node = skp->head; - ++i; - for (level = skp->maxlevels - 1; level >= 0; --level) { - while (node->width[level] <= i) { - i -= node->width[level]; - node = node->next[level]; - } + node_t *node; + int level; + + if (i < 0 || i >= skp->size) { + *ret = 0; + return 0; + } + + node = skp->head; + ++i; + for (level = skp->maxlevels - 1; level >= 0; --level) { + while (node->width[level] <= i) { + i -= node->width[level]; + node = node->next[level]; } + } - *ret = 1; - return node->value; + *ret = 1; + return node->value; } // Returns the lowest rank of all elements with value `value`, as opposed to the // highest rank returned by `skiplist_insert`. PANDAS_INLINE int skiplist_min_rank(skiplist_t *skp, double value) { - node_t *node; - int level, rank = 0; - - node = skp->head; - for (level = skp->maxlevels - 1; level >= 0; --level) { - while (_node_cmp(node->next[level], value) > 0) { - rank += node->width[level]; - node = node->next[level]; - } + node_t *node; + int level, rank = 0; + + node = skp->head; + for (level = skp->maxlevels - 1; level >= 0; --level) { + while (_node_cmp(node->next[level], value) > 0) { + rank += node->width[level]; + node = node->next[level]; } + } - return rank + 1; + return rank + 1; } // Returns the rank of the inserted element. When there are duplicates, // `rank` is the highest of the group, i.e. the 'max' method of // https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rank.html PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) { - node_t *node, *prevnode, *newnode, *next_at_level; - int *steps_at_level; - int size, steps, level, rank = 0; - node_t **chain; - - chain = skp->tmp_chain; - - steps_at_level = skp->tmp_steps; - memset(steps_at_level, 0, skp->maxlevels * sizeof(int)); - - node = skp->head; - - for (level = skp->maxlevels - 1; level >= 0; --level) { - next_at_level = node->next[level]; - while (_node_cmp(next_at_level, value) >= 0) { - steps_at_level[level] += node->width[level]; - rank += node->width[level]; - node = next_at_level; - next_at_level = node->next[level]; - } - chain[level] = node; - } + node_t *node, *prevnode, *newnode, *next_at_level; + int *steps_at_level; + int size, steps, level, rank = 0; + node_t **chain; - size = int_min(skp->maxlevels, 1 - ((int)Log2(urand()))); + chain = skp->tmp_chain; - newnode = node_init(value, size); - if (!newnode) { - return -1; + steps_at_level = skp->tmp_steps; + memset(steps_at_level, 0, skp->maxlevels * sizeof(int)); + + node = skp->head; + + for (level = skp->maxlevels - 1; level >= 0; --level) { + next_at_level = node->next[level]; + while (_node_cmp(next_at_level, value) >= 0) { + steps_at_level[level] += node->width[level]; + rank += node->width[level]; + node = next_at_level; + next_at_level = node->next[level]; } - steps = 0; + chain[level] = node; + } - for (level = 0; level < size; ++level) { - prevnode = chain[level]; - newnode->next[level] = prevnode->next[level]; + size = int_min(skp->maxlevels, 1 - ((int)Log2(urand()))); - prevnode->next[level] = newnode; - node_incref(newnode); // increment the reference count + newnode = node_init(value, size); + if (!newnode) { + return -1; + } + steps = 0; - newnode->width[level] = prevnode->width[level] - steps; - prevnode->width[level] = steps + 1; + for (level = 0; level < size; ++level) { + prevnode = chain[level]; + newnode->next[level] = prevnode->next[level]; - steps += steps_at_level[level]; - } + prevnode->next[level] = newnode; + node_incref(newnode); // increment the reference count - for (level = size; level < skp->maxlevels; ++level) { - chain[level]->width[level] += 1; - } + newnode->width[level] = prevnode->width[level] - steps; + prevnode->width[level] = steps + 1; + + steps += steps_at_level[level]; + } - ++(skp->size); + for (level = size; level < skp->maxlevels; ++level) { + chain[level]->width[level] += 1; + } - return rank + 1; + ++(skp->size); + + return rank + 1; } PANDAS_INLINE int skiplist_remove(skiplist_t *skp, double value) { - int level, size; - node_t *node, *prevnode, *tmpnode, *next_at_level; - node_t **chain; - - chain = skp->tmp_chain; - node = skp->head; - - for (level = skp->maxlevels - 1; level >= 0; --level) { - next_at_level = node->next[level]; - while (_node_cmp(next_at_level, value) > 0) { - node = next_at_level; - next_at_level = node->next[level]; - } - chain[level] = node; + int level, size; + node_t *node, *prevnode, *tmpnode, *next_at_level; + node_t **chain; + + chain = skp->tmp_chain; + node = skp->head; + + for (level = skp->maxlevels - 1; level >= 0; --level) { + next_at_level = node->next[level]; + while (_node_cmp(next_at_level, value) > 0) { + node = next_at_level; + next_at_level = node->next[level]; } + chain[level] = node; + } - if (value != chain[0]->next[0]->value) { - return 0; - } + if (value != chain[0]->next[0]->value) { + return 0; + } - size = chain[0]->next[0]->levels; + size = chain[0]->next[0]->levels; - for (level = 0; level < size; ++level) { - prevnode = chain[level]; + for (level = 0; level < size; ++level) { + prevnode = chain[level]; - tmpnode = prevnode->next[level]; + tmpnode = prevnode->next[level]; - prevnode->width[level] += tmpnode->width[level] - 1; - prevnode->next[level] = tmpnode->next[level]; + prevnode->width[level] += tmpnode->width[level] - 1; + prevnode->next[level] = tmpnode->next[level]; - tmpnode->next[level] = NULL; - node_destroy(tmpnode); // decrement refcount or free - } + tmpnode->next[level] = NULL; + node_destroy(tmpnode); // decrement refcount or free + } - for (level = size; level < skp->maxlevels; ++level) { - --(chain[level]->width[level]); - } + for (level = size; level < skp->maxlevels; ++level) { + --(chain[level]->width[level]); + } - --(skp->size); - return 1; + --(skp->size); + return 1; } diff --git a/pandas/_libs/include/pandas/vendored/klib/khash.h b/pandas/_libs/include/pandas/vendored/klib/khash.h index 758f089cc11a5..31d12a3b30001 100644 --- a/pandas/_libs/include/pandas/vendored/klib/khash.h +++ b/pandas/_libs/include/pandas/vendored/klib/khash.h @@ -1,4 +1,4 @@ -//Licence at LICENSES/KLIB_LICENSE +// Licence at LICENSES/KLIB_LICENSE /* An example: @@ -6,38 +6,38 @@ #include "khash.h" KHASH_MAP_INIT_INT(32, char) int main() { - int ret, is_missing; - khiter_t k; - khash_t(32) *h = kh_init(32); - k = kh_put(32, h, 5, &ret); - if (!ret) kh_del(32, h, k); - kh_value(h, k) = 10; - k = kh_get(32, h, 10); - is_missing = (k == kh_end(h)); - k = kh_get(32, h, 5); - kh_del(32, h, k); - for (k = kh_begin(h); k != kh_end(h); ++k) - if (kh_exist(h, k)) kh_value(h, k) = 1; - kh_destroy(32, h); - return 0; + int ret, is_missing; + khiter_t k; + khash_t(32) *h = kh_init(32); + k = kh_put(32, h, 5, &ret); + if (!ret) kh_del(32, h, k); + kh_value(h, k) = 10; + k = kh_get(32, h, 10); + is_missing = (k == kh_end(h)); + k = kh_get(32, h, 5); + kh_del(32, h, k); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k)) kh_value(h, k) = 1; + kh_destroy(32, h); + return 0; } */ /* 2011-09-16 (0.2.6): - * The capacity is a power of 2. This seems to dramatically improve the - speed for simple keys. Thank Zilong Tan for the suggestion. Reference: + * The capacity is a power of 2. This seems to dramatically improve the + speed for simple keys. Thank Zilong Tan for the suggestion. Reference: - - https://github.com/stefanocasazza/ULib - - https://nothings.org/computer/judy/ + - https://github.com/stefanocasazza/ULib + - https://nothings.org/computer/judy/ - * Allow to optionally use linear probing which usually has better - performance for random input. Double hashing is still the default as it - is more robust to certain non-random input. + * Allow to optionally use linear probing which usually has better + performance for random input. Double hashing is still the default as + it is more robust to certain non-random input. - * Added Wang's integer hash function (not used by default). This hash - function is more robust to certain non-random input. + * Added Wang's integer hash function (not used by default). This hash + function is more robust to certain non-random input. 2011-02-14 (0.2.5): @@ -49,32 +49,31 @@ int main() { 2008-09-19 (0.2.3): - * Corrected the example - * Improved interfaces + * Corrected the example + * Improved interfaces 2008-09-11 (0.2.2): - * Improved speed a little in kh_put() + * Improved speed a little in kh_put() 2008-09-10 (0.2.1): - * Added kh_clear() - * Fixed a compiling error + * Added kh_clear() + * Fixed a compiling error 2008-09-02 (0.2.0): - * Changed to token concatenation which increases flexibility. + * Changed to token concatenation which increases flexibility. 2008-08-31 (0.1.2): - * Fixed a bug in kh_get(), which has not been tested previously. + * Fixed a bug in kh_get(), which has not been tested previously. 2008-08-31 (0.1.1): - * Added destructor + * Added destructor */ - #ifndef __AC_KHASH_H #define __AC_KHASH_H @@ -86,11 +85,10 @@ int main() { #define AC_VERSION_KHASH_H "0.2.6" +#include "pandas/inline_helper.h" +#include #include #include -#include -#include "pandas/inline_helper.h" - // hooks for memory allocator, C-runtime allocator used per default #ifndef KHASH_MALLOC @@ -109,7 +107,6 @@ int main() { #define KHASH_FREE free #endif - #if UINT_MAX == 0xffffffffu typedef unsigned int khuint32_t; typedef signed int khint32_t; @@ -145,262 +142,311 @@ typedef float khfloat32_t; typedef khuint32_t khuint_t; typedef khuint_t khiter_t; -#define __ac_isempty(flag, i) ((flag[i>>5]>>(i&0x1fU))&1) +#define __ac_isempty(flag, i) ((flag[i >> 5] >> (i & 0x1fU)) & 1) #define __ac_isdel(flag, i) (0) #define __ac_iseither(flag, i) __ac_isempty(flag, i) #define __ac_set_isdel_false(flag, i) (0) -#define __ac_set_isempty_false(flag, i) (flag[i>>5]&=~(1ul<<(i&0x1fU))) -#define __ac_set_isempty_true(flag, i) (flag[i>>5]|=(1ul<<(i&0x1fU))) +#define __ac_set_isempty_false(flag, i) (flag[i >> 5] &= ~(1ul << (i & 0x1fU))) +#define __ac_set_isempty_true(flag, i) (flag[i >> 5] |= (1ul << (i & 0x1fU))) #define __ac_set_isboth_false(flag, i) __ac_set_isempty_false(flag, i) #define __ac_set_isdel_true(flag, i) ((void)0) - -// specializations of https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp -khuint32_t PANDAS_INLINE murmur2_32to32(khuint32_t k){ - const khuint32_t SEED = 0xc70f6907UL; - // 'm' and 'r' are mixing constants generated offline. - // They're not really 'magic', they just happen to work well. - const khuint32_t M_32 = 0x5bd1e995; - const int R_32 = 24; - - // Initialize the hash to a 'random' value - khuint32_t h = SEED ^ 4; - - //handle 4 bytes: - k *= M_32; - k ^= k >> R_32; - k *= M_32; - - h *= M_32; - h ^= k; - - // Do a few final mixes of the hash to ensure the "last few - // bytes" are well-incorporated. (Really needed here?) - h ^= h >> 13; - h *= M_32; - h ^= h >> 15; - return h; +// specializations of +// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp +khuint32_t PANDAS_INLINE murmur2_32to32(khuint32_t k) { + const khuint32_t SEED = 0xc70f6907UL; + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. + const khuint32_t M_32 = 0x5bd1e995; + const int R_32 = 24; + + // Initialize the hash to a 'random' value + khuint32_t h = SEED ^ 4; + + // handle 4 bytes: + k *= M_32; + k ^= k >> R_32; + k *= M_32; + + h *= M_32; + h ^= k; + + // Do a few final mixes of the hash to ensure the "last few + // bytes" are well-incorporated. (Really needed here?) + h ^= h >> 13; + h *= M_32; + h ^= h >> 15; + return h; } -// it is possible to have a special x64-version, which would need less operations, but -// using 32bit version always has also some benefits: +// it is possible to have a special x64-version, which would need less +// operations, but using 32bit version always has also some benefits: // - one code for 32bit and 64bit builds // - the same case for 32bit and 64bit builds -// - no performance difference could be measured compared to a possible x64-version - -khuint32_t PANDAS_INLINE murmur2_32_32to32(khuint32_t k1, khuint32_t k2){ - const khuint32_t SEED = 0xc70f6907UL; - // 'm' and 'r' are mixing constants generated offline. - // They're not really 'magic', they just happen to work well. - const khuint32_t M_32 = 0x5bd1e995; - const int R_32 = 24; - - // Initialize the hash to a 'random' value - khuint32_t h = SEED ^ 4; - - //handle first 4 bytes: - k1 *= M_32; - k1 ^= k1 >> R_32; - k1 *= M_32; - - h *= M_32; - h ^= k1; - - //handle second 4 bytes: - k2 *= M_32; - k2 ^= k2 >> R_32; - k2 *= M_32; - - h *= M_32; - h ^= k2; - - // Do a few final mixes of the hash to ensure the "last few - // bytes" are well-incorporated. - h ^= h >> 13; - h *= M_32; - h ^= h >> 15; - return h; +// - no performance difference could be measured compared to a possible +// x64-version + +khuint32_t PANDAS_INLINE murmur2_32_32to32(khuint32_t k1, khuint32_t k2) { + const khuint32_t SEED = 0xc70f6907UL; + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. + const khuint32_t M_32 = 0x5bd1e995; + const int R_32 = 24; + + // Initialize the hash to a 'random' value + khuint32_t h = SEED ^ 4; + + // handle first 4 bytes: + k1 *= M_32; + k1 ^= k1 >> R_32; + k1 *= M_32; + + h *= M_32; + h ^= k1; + + // handle second 4 bytes: + k2 *= M_32; + k2 ^= k2 >> R_32; + k2 *= M_32; + + h *= M_32; + h ^= k2; + + // Do a few final mixes of the hash to ensure the "last few + // bytes" are well-incorporated. + h ^= h >> 13; + h *= M_32; + h ^= h >> 15; + return h; } -khuint32_t PANDAS_INLINE murmur2_64to32(khuint64_t k){ - khuint32_t k1 = (khuint32_t)k; - khuint32_t k2 = (khuint32_t)(k >> 32); +khuint32_t PANDAS_INLINE murmur2_64to32(khuint64_t k) { + khuint32_t k1 = (khuint32_t)k; + khuint32_t k2 = (khuint32_t)(k >> 32); - return murmur2_32_32to32(k1, k2); + return murmur2_32_32to32(k1, k2); } - #ifdef KHASH_LINEAR #define __ac_inc(k, m) 1 #else #define __ac_inc(k, m) (murmur2_32to32(k) | 1) & (m) #endif -#define __ac_fsize(m) ((m) < 32? 1 : (m)>>5) +#define __ac_fsize(m) ((m) < 32 ? 1 : (m) >> 5) #ifndef kroundup32 -#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#define kroundup32(x) \ + (--(x), (x) |= (x) >> 1, (x) |= (x) >> 2, (x) |= (x) >> 4, (x) |= (x) >> 8, \ + (x) |= (x) >> 16, ++(x)) #endif static const double __ac_HASH_UPPER = 0.77; -#define KHASH_DECLARE(name, khkey_t, khval_t) \ - typedef struct { \ - khuint_t n_buckets, size, n_occupied, upper_bound; \ - khuint32_t *flags; \ - khkey_t *keys; \ - khval_t *vals; \ - } kh_##name##_t; \ - extern kh_##name##_t *kh_init_##name(); \ - extern void kh_destroy_##name(kh_##name##_t *h); \ - extern void kh_clear_##name(kh_##name##_t *h); \ - extern khuint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ - extern void kh_resize_##name(kh_##name##_t *h, khuint_t new_n_buckets); \ - extern khuint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ - extern void kh_del_##name(kh_##name##_t *h, khuint_t x); - -#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - typedef struct { \ - khuint_t n_buckets, size, n_occupied, upper_bound; \ - khuint32_t *flags; \ - khkey_t *keys; \ - khval_t *vals; \ - } kh_##name##_t; \ - SCOPE kh_##name##_t *kh_init_##name(void) { \ - return (kh_##name##_t*)KHASH_CALLOC(1, sizeof(kh_##name##_t)); \ - } \ - SCOPE void kh_destroy_##name(kh_##name##_t *h) \ - { \ - if (h) { \ - KHASH_FREE(h->keys); KHASH_FREE(h->flags); \ - KHASH_FREE(h->vals); \ - KHASH_FREE(h); \ - } \ - } \ - SCOPE void kh_clear_##name(kh_##name##_t *h) \ - { \ - if (h && h->flags) { \ - memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khuint32_t)); \ - h->size = h->n_occupied = 0; \ - } \ - } \ - SCOPE khuint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ - { \ - if (h->n_buckets) { \ - khuint_t inc, k, i, last, mask; \ - mask = h->n_buckets - 1; \ - k = __hash_func(key); i = k & mask; \ - inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \ - while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ - i = (i + inc) & mask; \ - if (i == last) return h->n_buckets; \ - } \ - return __ac_iseither(h->flags, i)? h->n_buckets : i; \ - } else return 0; \ - } \ - SCOPE void kh_resize_##name(kh_##name##_t *h, khuint_t new_n_buckets) \ - { /* This function uses 0.25*n_bucktes bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ - khuint32_t *new_flags = 0; \ - khuint_t j = 1; \ - { \ - kroundup32(new_n_buckets); \ - if (new_n_buckets < 4) new_n_buckets = 4; \ - if (h->size >= (khuint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ - else { /* hash table size to be changed (shrink or expand); rehash */ \ - new_flags = (khuint32_t*)KHASH_MALLOC(__ac_fsize(new_n_buckets) * sizeof(khuint32_t)); \ - memset(new_flags, 0xff, __ac_fsize(new_n_buckets) * sizeof(khuint32_t)); \ - if (h->n_buckets < new_n_buckets) { /* expand */ \ - h->keys = (khkey_t*)KHASH_REALLOC(h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) h->vals = (khval_t*)KHASH_REALLOC(h->vals, new_n_buckets * sizeof(khval_t)); \ - } /* otherwise shrink */ \ - } \ - } \ - if (j) { /* rehashing is needed */ \ - for (j = 0; j != h->n_buckets; ++j) { \ - if (__ac_iseither(h->flags, j) == 0) { \ - khkey_t key = h->keys[j]; \ - khval_t val; \ - khuint_t new_mask; \ - new_mask = new_n_buckets - 1; \ - if (kh_is_map) val = h->vals[j]; \ - __ac_set_isempty_true(h->flags, j); \ - while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ - khuint_t inc, k, i; \ - k = __hash_func(key); \ - i = k & new_mask; \ - inc = __ac_inc(k, new_mask); \ - while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \ - __ac_set_isempty_false(new_flags, i); \ - if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ - { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ - if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ - __ac_set_isempty_true(h->flags, i); /* mark it as deleted in the old hash table */ \ - } else { /* write the element and jump out of the loop */ \ - h->keys[i] = key; \ - if (kh_is_map) h->vals[i] = val; \ - break; \ - } \ - } \ - } \ - } \ - if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ - h->keys = (khkey_t*)KHASH_REALLOC(h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) h->vals = (khval_t*)KHASH_REALLOC(h->vals, new_n_buckets * sizeof(khval_t)); \ - } \ - KHASH_FREE(h->flags); /* free the working space */ \ - h->flags = new_flags; \ - h->n_buckets = new_n_buckets; \ - h->n_occupied = h->size; \ - h->upper_bound = (khuint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ - } \ - } \ - SCOPE khuint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ - { \ - khuint_t x; \ - if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ - if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); /* clear "deleted" elements */ \ - else kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \ - } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ - { \ - khuint_t inc, k, i, site, last, mask = h->n_buckets - 1; \ - x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ - if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ - else { \ - inc = __ac_inc(k, mask); last = i; \ - while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ - if (__ac_isdel(h->flags, i)) site = i; \ - i = (i + inc) & mask; \ - if (i == last) { x = site; break; } \ - } \ - if (x == h->n_buckets) { \ - if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ - else x = i; \ - } \ - } \ - } \ - if (__ac_isempty(h->flags, x)) { /* not present at all */ \ - h->keys[x] = key; \ - __ac_set_isboth_false(h->flags, x); \ - ++h->size; ++h->n_occupied; \ - *ret = 1; \ - } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ - h->keys[x] = key; \ - __ac_set_isboth_false(h->flags, x); \ - ++h->size; \ - *ret = 2; \ - } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ - return x; \ - } \ - SCOPE void kh_del_##name(kh_##name##_t *h, khuint_t x) \ - { \ - if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ - __ac_set_isdel_true(h->flags, x); \ - --h->size; \ - } \ - } - -#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - KHASH_INIT2(name, PANDAS_INLINE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + typedef struct { \ + khuint_t n_buckets, size, n_occupied, upper_bound; \ + khuint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; \ + extern kh_##name##_t *kh_init_##name(); \ + extern void kh_destroy_##name(kh_##name##_t *h); \ + extern void kh_clear_##name(kh_##name##_t *h); \ + extern khuint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ + extern void kh_resize_##name(kh_##name##_t *h, khuint_t new_n_buckets); \ + extern khuint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ + extern void kh_del_##name(kh_##name##_t *h, khuint_t x); + +#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, \ + __hash_equal) \ + typedef struct { \ + khuint_t n_buckets, size, n_occupied, upper_bound; \ + khuint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; \ + SCOPE kh_##name##_t *kh_init_##name(void) { \ + return (kh_##name##_t *)KHASH_CALLOC(1, sizeof(kh_##name##_t)); \ + } \ + SCOPE void kh_destroy_##name(kh_##name##_t *h) { \ + if (h) { \ + KHASH_FREE(h->keys); \ + KHASH_FREE(h->flags); \ + KHASH_FREE(h->vals); \ + KHASH_FREE(h); \ + } \ + } \ + SCOPE void kh_clear_##name(kh_##name##_t *h) { \ + if (h && h->flags) { \ + memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khuint32_t)); \ + h->size = h->n_occupied = 0; \ + } \ + } \ + SCOPE khuint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) { \ + if (h->n_buckets) { \ + khuint_t inc, k, i, last, mask; \ + mask = h->n_buckets - 1; \ + k = __hash_func(key); \ + i = k & mask; \ + inc = __ac_inc(k, mask); \ + last = i; /* inc==1 for linear probing */ \ + while (!__ac_isempty(h->flags, i) && \ + (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + i = (i + inc) & mask; \ + if (i == last) \ + return h->n_buckets; \ + } \ + return __ac_iseither(h->flags, i) ? h->n_buckets : i; \ + } else \ + return 0; \ + } \ + SCOPE void kh_resize_##name( \ + kh_##name##_t *h, \ + khuint_t new_n_buckets) { /* This function uses 0.25*n_bucktes bytes of \ + working space instead of \ + [sizeof(key_t+val_t)+.25]*n_buckets. */ \ + khuint32_t *new_flags = 0; \ + khuint_t j = 1; \ + { \ + kroundup32(new_n_buckets); \ + if (new_n_buckets < 4) \ + new_n_buckets = 4; \ + if (h->size >= (khuint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) \ + j = 0; /* requested size is too small */ \ + else { /* hash table size to be changed (shrink or expand); rehash */ \ + new_flags = (khuint32_t *)KHASH_MALLOC(__ac_fsize(new_n_buckets) * \ + sizeof(khuint32_t)); \ + memset(new_flags, 0xff, \ + __ac_fsize(new_n_buckets) * sizeof(khuint32_t)); \ + if (h->n_buckets < new_n_buckets) { /* expand */ \ + h->keys = (khkey_t *)KHASH_REALLOC(h->keys, \ + new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) \ + h->vals = (khval_t *)KHASH_REALLOC(h->vals, new_n_buckets * \ + sizeof(khval_t)); \ + } /* otherwise shrink */ \ + } \ + } \ + if (j) { /* rehashing is needed */ \ + for (j = 0; j != h->n_buckets; ++j) { \ + if (__ac_iseither(h->flags, j) == 0) { \ + khkey_t key = h->keys[j]; \ + khval_t val; \ + khuint_t new_mask; \ + new_mask = new_n_buckets - 1; \ + if (kh_is_map) \ + val = h->vals[j]; \ + __ac_set_isempty_true(h->flags, j); \ + while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ + khuint_t inc, k, i; \ + k = __hash_func(key); \ + i = k & new_mask; \ + inc = __ac_inc(k, new_mask); \ + while (!__ac_isempty(new_flags, i)) \ + i = (i + inc) & new_mask; \ + __ac_set_isempty_false(new_flags, i); \ + if (i < h->n_buckets && \ + __ac_iseither(h->flags, i) == \ + 0) { /* kick out the existing element */ \ + { \ + khkey_t tmp = h->keys[i]; \ + h->keys[i] = key; \ + key = tmp; \ + } \ + if (kh_is_map) { \ + khval_t tmp = h->vals[i]; \ + h->vals[i] = val; \ + val = tmp; \ + } \ + __ac_set_isempty_true( \ + h->flags, i); /* mark it as deleted in the old hash table */ \ + } else { /* write the element and jump out of the loop */ \ + h->keys[i] = key; \ + if (kh_is_map) \ + h->vals[i] = val; \ + break; \ + } \ + } \ + } \ + } \ + if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ + h->keys = (khkey_t *)KHASH_REALLOC(h->keys, \ + new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) \ + h->vals = (khval_t *)KHASH_REALLOC(h->vals, \ + new_n_buckets * sizeof(khval_t)); \ + } \ + KHASH_FREE(h->flags); /* free the working space */ \ + h->flags = new_flags; \ + h->n_buckets = new_n_buckets; \ + h->n_occupied = h->size; \ + h->upper_bound = (khuint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ + } \ + } \ + SCOPE khuint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) { \ + khuint_t x; \ + if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ + if (h->n_buckets > (h->size << 1)) \ + kh_resize_##name(h, h->n_buckets - 1); /* clear "deleted" elements */ \ + else \ + kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \ + } /* TODO: to implement automatically shrinking; resize() already support \ + shrinking */ \ + { \ + khuint_t inc, k, i, site, last, mask = h->n_buckets - 1; \ + x = site = h->n_buckets; \ + k = __hash_func(key); \ + i = k & mask; \ + if (__ac_isempty(h->flags, i)) \ + x = i; /* for speed up */ \ + else { \ + inc = __ac_inc(k, mask); \ + last = i; \ + while (!__ac_isempty(h->flags, i) && \ + (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (__ac_isdel(h->flags, i)) \ + site = i; \ + i = (i + inc) & mask; \ + if (i == last) { \ + x = site; \ + break; \ + } \ + } \ + if (x == h->n_buckets) { \ + if (__ac_isempty(h->flags, i) && site != h->n_buckets) \ + x = site; \ + else \ + x = i; \ + } \ + } \ + } \ + if (__ac_isempty(h->flags, x)) { /* not present at all */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + ++h->n_occupied; \ + *ret = 1; \ + } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + *ret = 2; \ + } else \ + *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ + return x; \ + } \ + SCOPE void kh_del_##name(kh_##name##_t *h, khuint_t x) { \ + if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ + __ac_set_isdel_true(h->flags, x); \ + --h->size; \ + } \ + } + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, \ + __hash_equal) \ + KHASH_INIT2(name, PANDAS_INLINE, khkey_t, khval_t, kh_is_map, __hash_func, \ + __hash_equal) /* --- BEGIN OF HASH FUNCTIONS --- */ @@ -419,9 +465,8 @@ static const double __ac_HASH_UPPER = 0.77; @param key The integer [khuint64_t] @return The hash value [khuint_t] */ -PANDAS_INLINE khuint_t kh_int64_hash_func(khuint64_t key) -{ - return (khuint_t)((key)>>33^(key)^(key)<<11); +PANDAS_INLINE khuint_t kh_int64_hash_func(khuint64_t key) { + return (khuint_t)((key) >> 33 ^ (key) ^ (key) << 11); } /*! @function @abstract 64-bit integer comparison function @@ -433,11 +478,12 @@ PANDAS_INLINE khuint_t kh_int64_hash_func(khuint64_t key) @param s Pointer to a null terminated string @return The hash value */ -PANDAS_INLINE khuint_t __ac_X31_hash_string(const char *s) -{ - khuint_t h = *s; - if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; - return h; +PANDAS_INLINE khuint_t __ac_X31_hash_string(const char *s) { + khuint_t h = *s; + if (h) + for (++s; *s; ++s) + h = (h << 5) - h + *s; + return h; } /*! @function @abstract Another interface to const char* hash function @@ -450,15 +496,14 @@ PANDAS_INLINE khuint_t __ac_X31_hash_string(const char *s) */ #define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) -PANDAS_INLINE khuint_t __ac_Wang_hash(khuint_t key) -{ - key += ~(key << 15); - key ^= (key >> 10); - key += (key << 3); - key ^= (key >> 6); - key += ~(key << 11); - key ^= (key >> 16); - return key; +PANDAS_INLINE khuint_t __ac_Wang_hash(khuint_t key) { + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; } #define kh_int_hash_func2(k) __ac_Wang_hash((khuint_t)key) @@ -508,7 +553,7 @@ PANDAS_INLINE khuint_t __ac_Wang_hash(khuint_t key) @param k Key [type of keys] @param r Extra return code: 0 if the key is present in the hash table; 1 if the bucket is empty (never used); 2 if the element in - the bucket has been deleted [int*] + the bucket has been deleted [int*] @return Iterator to the inserted element [khuint_t] */ #define kh_put(name, h, k, r) kh_put_##name(h, k, r) @@ -518,7 +563,8 @@ PANDAS_INLINE khuint_t __ac_Wang_hash(khuint_t key) @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param k Key [type of keys] - @return Iterator to the found element, or kh_end(h) is the element is absent [khuint_t] + @return Iterator to the found element, or kh_end(h) is the element is + absent [khuint_t] */ #define kh_get(name, h, k) kh_get_##name(h, k) @@ -594,81 +640,80 @@ PANDAS_INLINE khuint_t __ac_Wang_hash(khuint_t key) @abstract Instantiate a hash set containing integer keys @param name Name of the hash table [symbol] */ -#define KHASH_SET_INIT_INT(name) \ - KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) +#define KHASH_SET_INIT_INT(name) \ + KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) /*! @function @abstract Instantiate a hash map containing integer keys @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ -#define KHASH_MAP_INIT_INT(name, khval_t) \ - KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) +#define KHASH_MAP_INIT_INT(name, khval_t) \ + KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) -#define KHASH_MAP_INIT_UINT(name, khval_t) \ - KHASH_INIT(name, khuint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) +#define KHASH_MAP_INIT_UINT(name, khval_t) \ + KHASH_INIT(name, khuint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) /*! @function @abstract Instantiate a hash map containing 64-bit integer keys @param name Name of the hash table [symbol] */ -#define KHASH_SET_INIT_UINT64(name) \ - KHASH_INIT(name, khuint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) +#define KHASH_SET_INIT_UINT64(name) \ + KHASH_INIT(name, khuint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) -#define KHASH_SET_INIT_INT64(name) \ - KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) +#define KHASH_SET_INIT_INT64(name) \ + KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) /*! @function @abstract Instantiate a hash map containing 64-bit integer keys @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ -#define KHASH_MAP_INIT_UINT64(name, khval_t) \ - KHASH_INIT(name, khuint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) +#define KHASH_MAP_INIT_UINT64(name, khval_t) \ + KHASH_INIT(name, khuint64_t, khval_t, 1, kh_int64_hash_func, \ + kh_int64_hash_equal) -#define KHASH_MAP_INIT_INT64(name, khval_t) \ - KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) +#define KHASH_MAP_INIT_INT64(name, khval_t) \ + KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, \ + kh_int64_hash_equal) /*! @function @abstract Instantiate a hash map containing 16bit-integer keys @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ -#define KHASH_MAP_INIT_INT16(name, khval_t) \ - KHASH_INIT(name, khint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) +#define KHASH_MAP_INIT_INT16(name, khval_t) \ + KHASH_INIT(name, khint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) -#define KHASH_MAP_INIT_UINT16(name, khval_t) \ - KHASH_INIT(name, khuint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) +#define KHASH_MAP_INIT_UINT16(name, khval_t) \ + KHASH_INIT(name, khuint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) /*! @function @abstract Instantiate a hash map containing 8bit-integer keys @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ -#define KHASH_MAP_INIT_INT8(name, khval_t) \ - KHASH_INIT(name, khint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) - -#define KHASH_MAP_INIT_UINT8(name, khval_t) \ - KHASH_INIT(name, khuint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) - +#define KHASH_MAP_INIT_INT8(name, khval_t) \ + KHASH_INIT(name, khint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) +#define KHASH_MAP_INIT_UINT8(name, khval_t) \ + KHASH_INIT(name, khuint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) typedef const char *kh_cstr_t; /*! @function @abstract Instantiate a hash map containing const char* keys @param name Name of the hash table [symbol] */ -#define KHASH_SET_INIT_STR(name) \ - KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) +#define KHASH_SET_INIT_STR(name) \ + KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) /*! @function @abstract Instantiate a hash map containing const char* keys @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ -#define KHASH_MAP_INIT_STR(name, khval_t) \ - KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) - +#define KHASH_MAP_INIT_STR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) #define kh_exist_str(h, k) (kh_exist(h, k)) #define kh_exist_float64(h, k) (kh_exist(h, k)) @@ -692,5 +737,4 @@ KHASH_MAP_INIT_UINT16(uint16, size_t) KHASH_MAP_INIT_INT8(int8, size_t) KHASH_MAP_INIT_UINT8(uint8, size_t) - #endif /* __AC_KHASH_H */ diff --git a/pandas/_libs/include/pandas/vendored/klib/khash_python.h b/pandas/_libs/include/pandas/vendored/klib/khash_python.h index 1009671adc56b..dc16a4ada1716 100644 --- a/pandas/_libs/include/pandas/vendored/klib/khash_python.h +++ b/pandas/_libs/include/pandas/vendored/klib/khash_python.h @@ -1,20 +1,17 @@ -//Licence at LICENSES/KLIB_LICENSE +// Licence at LICENSES/KLIB_LICENSE -#include #include - +#include typedef struct { - float real; - float imag; + float real; + float imag; } khcomplex64_t; typedef struct { - double real; - double imag; + double real; + double imag; } khcomplex128_t; - - // khash should report usage to tracemalloc #if PY_VERSION_HEX >= 0x03060000 #include @@ -27,43 +24,41 @@ typedef struct { #define PyTraceMalloc_Untrack(...) #endif - static const int KHASH_TRACE_DOMAIN = 424242; -void *traced_malloc(size_t size){ - void * ptr = malloc(size); - if(ptr!=NULL){ - PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size); - } - return ptr; +void *traced_malloc(size_t size) { + void *ptr = malloc(size); + if (ptr != NULL) { + PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size); + } + return ptr; } -void *traced_calloc(size_t num, size_t size){ - void * ptr = calloc(num, size); - if(ptr!=NULL){ - PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, num*size); - } - return ptr; +void *traced_calloc(size_t num, size_t size) { + void *ptr = calloc(num, size); + if (ptr != NULL) { + PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, num * size); + } + return ptr; } -void *traced_realloc(void* old_ptr, size_t size){ - void * ptr = realloc(old_ptr, size); - if(ptr!=NULL){ - if(old_ptr != ptr){ - PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)old_ptr); - } - PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size); +void *traced_realloc(void *old_ptr, size_t size) { + void *ptr = realloc(old_ptr, size); + if (ptr != NULL) { + if (old_ptr != ptr) { + PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)old_ptr); } - return ptr; + PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size); + } + return ptr; } -void traced_free(void* ptr){ - if(ptr!=NULL){ - PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)ptr); - } - free(ptr); +void traced_free(void *ptr) { + if (ptr != NULL) { + PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)ptr); + } + free(ptr); } - #define KHASH_MALLOC traced_malloc #define KHASH_REALLOC traced_realloc #define KHASH_CALLOC traced_calloc @@ -74,327 +69,295 @@ void traced_free(void* ptr){ // python 2.7 https://github.com/python/cpython/blob/2.7/Objects/object.c#L1021 // python 3.5 https://github.com/python/cpython/blob/3.5/Python/pyhash.c#L85 -// The python 3 hash function has the invariant hash(x) == hash(int(x)) == hash(decimal(x)) -// and the size of hash may be different by platform / version (long in py2, Py_ssize_t in py3). -// We don't need those invariants because types will be cast before hashing, and if Py_ssize_t -// is 64 bits the truncation causes collision issues. Given all that, we use our own -// simple hash, viewing the double bytes as an int64 and using khash's default -// hash for 64 bit integers. -// GH 13436 showed that _Py_HashDouble doesn't work well with khash -// GH 28303 showed, that the simple xoring-version isn't good enough -// See GH 36729 for evaluation of the currently used murmur2-hash version -// An interesting alternative to expensive murmur2-hash would be to change -// the probing strategy and use e.g. the probing strategy from CPython's +// The python 3 hash function has the invariant hash(x) == hash(int(x)) == +// hash(decimal(x)) and the size of hash may be different by platform / version +// (long in py2, Py_ssize_t in py3). We don't need those invariants because +// types will be cast before hashing, and if Py_ssize_t is 64 bits the +// truncation causes collision issues. Given all that, we use our own simple +// hash, viewing the double bytes as an int64 and using khash's default hash for +// 64 bit integers. GH 13436 showed that _Py_HashDouble doesn't work well with +// khash GH 28303 showed, that the simple xoring-version isn't good enough See +// GH 36729 for evaluation of the currently used murmur2-hash version An +// interesting alternative to expensive murmur2-hash would be to change the +// probing strategy and use e.g. the probing strategy from CPython's // implementation of dicts, which shines for smaller sizes but is more // predisposed to superlinear running times (see GH 36729 for comparison) - khuint64_t PANDAS_INLINE asuint64(double key) { - khuint64_t val; - memcpy(&val, &key, sizeof(double)); - return val; + khuint64_t val; + memcpy(&val, &key, sizeof(double)); + return val; } khuint32_t PANDAS_INLINE asuint32(float key) { - khuint32_t val; - memcpy(&val, &key, sizeof(float)); - return val; + khuint32_t val; + memcpy(&val, &key, sizeof(float)); + return val; } #define ZERO_HASH 0 -#define NAN_HASH 0 - -khuint32_t PANDAS_INLINE kh_float64_hash_func(double val){ - // 0.0 and -0.0 should have the same hash: - if (val == 0.0){ - return ZERO_HASH; - } - // all nans should have the same hash: - if ( val!=val ){ - return NAN_HASH; - } - khuint64_t as_int = asuint64(val); - return murmur2_64to32(as_int); +#define NAN_HASH 0 + +khuint32_t PANDAS_INLINE kh_float64_hash_func(double val) { + // 0.0 and -0.0 should have the same hash: + if (val == 0.0) { + return ZERO_HASH; + } + // all nans should have the same hash: + if (val != val) { + return NAN_HASH; + } + khuint64_t as_int = asuint64(val); + return murmur2_64to32(as_int); } -khuint32_t PANDAS_INLINE kh_float32_hash_func(float val){ - // 0.0 and -0.0 should have the same hash: - if (val == 0.0f){ - return ZERO_HASH; - } - // all nans should have the same hash: - if ( val!=val ){ - return NAN_HASH; - } - khuint32_t as_int = asuint32(val); - return murmur2_32to32(as_int); +khuint32_t PANDAS_INLINE kh_float32_hash_func(float val) { + // 0.0 and -0.0 should have the same hash: + if (val == 0.0f) { + return ZERO_HASH; + } + // all nans should have the same hash: + if (val != val) { + return NAN_HASH; + } + khuint32_t as_int = asuint32(val); + return murmur2_32to32(as_int); } #define kh_floats_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a))) -#define KHASH_MAP_INIT_FLOAT64(name, khval_t) \ - KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, kh_floats_hash_equal) +#define KHASH_MAP_INIT_FLOAT64(name, khval_t) \ + KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, \ + kh_floats_hash_equal) KHASH_MAP_INIT_FLOAT64(float64, size_t) -#define KHASH_MAP_INIT_FLOAT32(name, khval_t) \ - KHASH_INIT(name, khfloat32_t, khval_t, 1, kh_float32_hash_func, kh_floats_hash_equal) +#define KHASH_MAP_INIT_FLOAT32(name, khval_t) \ + KHASH_INIT(name, khfloat32_t, khval_t, 1, kh_float32_hash_func, \ + kh_floats_hash_equal) KHASH_MAP_INIT_FLOAT32(float32, size_t) -khint32_t PANDAS_INLINE kh_complex128_hash_func(khcomplex128_t val){ - return kh_float64_hash_func(val.real)^kh_float64_hash_func(val.imag); +khint32_t PANDAS_INLINE kh_complex128_hash_func(khcomplex128_t val) { + return kh_float64_hash_func(val.real) ^ kh_float64_hash_func(val.imag); } -khint32_t PANDAS_INLINE kh_complex64_hash_func(khcomplex64_t val){ - return kh_float32_hash_func(val.real)^kh_float32_hash_func(val.imag); +khint32_t PANDAS_INLINE kh_complex64_hash_func(khcomplex64_t val) { + return kh_float32_hash_func(val.real) ^ kh_float32_hash_func(val.imag); } -#define kh_complex_hash_equal(a, b) \ +#define kh_complex_hash_equal(a, b) \ (kh_floats_hash_equal(a.real, b.real) && kh_floats_hash_equal(a.imag, b.imag)) - -#define KHASH_MAP_INIT_COMPLEX64(name, khval_t) \ - KHASH_INIT(name, khcomplex64_t, khval_t, 1, kh_complex64_hash_func, kh_complex_hash_equal) +#define KHASH_MAP_INIT_COMPLEX64(name, khval_t) \ + KHASH_INIT(name, khcomplex64_t, khval_t, 1, kh_complex64_hash_func, \ + kh_complex_hash_equal) KHASH_MAP_INIT_COMPLEX64(complex64, size_t) - -#define KHASH_MAP_INIT_COMPLEX128(name, khval_t) \ - KHASH_INIT(name, khcomplex128_t, khval_t, 1, kh_complex128_hash_func, kh_complex_hash_equal) +#define KHASH_MAP_INIT_COMPLEX128(name, khval_t) \ + KHASH_INIT(name, khcomplex128_t, khval_t, 1, kh_complex128_hash_func, \ + kh_complex_hash_equal) KHASH_MAP_INIT_COMPLEX128(complex128, size_t) - #define kh_exist_complex64(h, k) (kh_exist(h, k)) #define kh_exist_complex128(h, k) (kh_exist(h, k)) - // NaN-floats should be in the same equivalency class, see GH 22119 -int PANDAS_INLINE floatobject_cmp(PyFloatObject* a, PyFloatObject* b){ - return ( - Py_IS_NAN(PyFloat_AS_DOUBLE(a)) && - Py_IS_NAN(PyFloat_AS_DOUBLE(b)) - ) - || - ( PyFloat_AS_DOUBLE(a) == PyFloat_AS_DOUBLE(b) ); +int PANDAS_INLINE floatobject_cmp(PyFloatObject *a, PyFloatObject *b) { + return (Py_IS_NAN(PyFloat_AS_DOUBLE(a)) && Py_IS_NAN(PyFloat_AS_DOUBLE(b))) || + (PyFloat_AS_DOUBLE(a) == PyFloat_AS_DOUBLE(b)); } - // NaNs should be in the same equivalency class, see GH 41836 // PyObject_RichCompareBool for complexobjects has a different behavior // needs to be replaced -int PANDAS_INLINE complexobject_cmp(PyComplexObject* a, PyComplexObject* b){ - return ( - Py_IS_NAN(a->cval.real) && - Py_IS_NAN(b->cval.real) && - Py_IS_NAN(a->cval.imag) && - Py_IS_NAN(b->cval.imag) - ) - || - ( - Py_IS_NAN(a->cval.real) && - Py_IS_NAN(b->cval.real) && - a->cval.imag == b->cval.imag - ) - || - ( - a->cval.real == b->cval.real && - Py_IS_NAN(a->cval.imag) && - Py_IS_NAN(b->cval.imag) - ) - || - ( - a->cval.real == b->cval.real && - a->cval.imag == b->cval.imag - ); +int PANDAS_INLINE complexobject_cmp(PyComplexObject *a, PyComplexObject *b) { + return (Py_IS_NAN(a->cval.real) && Py_IS_NAN(b->cval.real) && + Py_IS_NAN(a->cval.imag) && Py_IS_NAN(b->cval.imag)) || + (Py_IS_NAN(a->cval.real) && Py_IS_NAN(b->cval.real) && + a->cval.imag == b->cval.imag) || + (a->cval.real == b->cval.real && Py_IS_NAN(a->cval.imag) && + Py_IS_NAN(b->cval.imag)) || + (a->cval.real == b->cval.real && a->cval.imag == b->cval.imag); } -int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b); - +int PANDAS_INLINE pyobject_cmp(PyObject *a, PyObject *b); // replacing PyObject_RichCompareBool (NaN!=NaN) with pyobject_cmp (NaN==NaN), // which treats NaNs as equivalent // see GH 41836 -int PANDAS_INLINE tupleobject_cmp(PyTupleObject* a, PyTupleObject* b){ - Py_ssize_t i; +int PANDAS_INLINE tupleobject_cmp(PyTupleObject *a, PyTupleObject *b) { + Py_ssize_t i; - if (Py_SIZE(a) != Py_SIZE(b)) { - return 0; - } + if (Py_SIZE(a) != Py_SIZE(b)) { + return 0; + } - for (i = 0; i < Py_SIZE(a); ++i) { - if (!pyobject_cmp(PyTuple_GET_ITEM(a, i), PyTuple_GET_ITEM(b, i))) { - return 0; - } + for (i = 0; i < Py_SIZE(a); ++i) { + if (!pyobject_cmp(PyTuple_GET_ITEM(a, i), PyTuple_GET_ITEM(b, i))) { + return 0; } - return 1; + } + return 1; } - -int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { - if (a == b) { - return 1; +int PANDAS_INLINE pyobject_cmp(PyObject *a, PyObject *b) { + if (a == b) { + return 1; + } + if (Py_TYPE(a) == Py_TYPE(b)) { + // special handling for some built-in types which could have NaNs + // as we would like to have them equivalent, but the usual + // PyObject_RichCompareBool would return False + if (PyFloat_CheckExact(a)) { + return floatobject_cmp((PyFloatObject *)a, (PyFloatObject *)b); + } + if (PyComplex_CheckExact(a)) { + return complexobject_cmp((PyComplexObject *)a, (PyComplexObject *)b); } - if (Py_TYPE(a) == Py_TYPE(b)) { - // special handling for some built-in types which could have NaNs - // as we would like to have them equivalent, but the usual - // PyObject_RichCompareBool would return False - if (PyFloat_CheckExact(a)) { - return floatobject_cmp((PyFloatObject*)a, (PyFloatObject*)b); - } - if (PyComplex_CheckExact(a)) { - return complexobject_cmp((PyComplexObject*)a, (PyComplexObject*)b); - } - if (PyTuple_CheckExact(a)) { - return tupleobject_cmp((PyTupleObject*)a, (PyTupleObject*)b); - } - // frozenset isn't yet supported + if (PyTuple_CheckExact(a)) { + return tupleobject_cmp((PyTupleObject *)a, (PyTupleObject *)b); } + // frozenset isn't yet supported + } - int result = PyObject_RichCompareBool(a, b, Py_EQ); - if (result < 0) { - PyErr_Clear(); - return 0; - } - return result; + int result = PyObject_RichCompareBool(a, b, Py_EQ); + if (result < 0) { + PyErr_Clear(); + return 0; + } + return result; } - Py_hash_t PANDAS_INLINE _Pandas_HashDouble(double val) { - //Since Python3.10, nan is no longer has hash 0 - if (Py_IS_NAN(val)) { - return 0; - } + // Since Python3.10, nan is no longer has hash 0 + if (Py_IS_NAN(val)) { + return 0; + } #if PY_VERSION_HEX < 0x030A0000 - return _Py_HashDouble(val); + return _Py_HashDouble(val); #else - return _Py_HashDouble(NULL, val); + return _Py_HashDouble(NULL, val); #endif } - -Py_hash_t PANDAS_INLINE floatobject_hash(PyFloatObject* key) { - return _Pandas_HashDouble(PyFloat_AS_DOUBLE(key)); +Py_hash_t PANDAS_INLINE floatobject_hash(PyFloatObject *key) { + return _Pandas_HashDouble(PyFloat_AS_DOUBLE(key)); } - #define _PandasHASH_IMAG 1000003UL // replaces _Py_HashDouble with _Pandas_HashDouble -Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key) { - Py_uhash_t realhash = (Py_uhash_t)_Pandas_HashDouble(key->cval.real); - Py_uhash_t imaghash = (Py_uhash_t)_Pandas_HashDouble(key->cval.imag); - if (realhash == (Py_uhash_t)-1 || imaghash == (Py_uhash_t)-1) { - return -1; - } - Py_uhash_t combined = realhash + _PandasHASH_IMAG * imaghash; - if (combined == (Py_uhash_t)-1) { - return -2; - } - return (Py_hash_t)combined; +Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject *key) { + Py_uhash_t realhash = (Py_uhash_t)_Pandas_HashDouble(key->cval.real); + Py_uhash_t imaghash = (Py_uhash_t)_Pandas_HashDouble(key->cval.imag); + if (realhash == (Py_uhash_t)-1 || imaghash == (Py_uhash_t)-1) { + return -1; + } + Py_uhash_t combined = realhash + _PandasHASH_IMAG * imaghash; + if (combined == (Py_uhash_t)-1) { + return -2; + } + return (Py_hash_t)combined; } +khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject *key); -khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key); - -//we could use any hashing algorithm, this is the original CPython's for tuples +// we could use any hashing algorithm, this is the original CPython's for tuples #if SIZEOF_PY_UHASH_T > 4 #define _PandasHASH_XXPRIME_1 ((Py_uhash_t)11400714785074694791ULL) #define _PandasHASH_XXPRIME_2 ((Py_uhash_t)14029467366897019727ULL) #define _PandasHASH_XXPRIME_5 ((Py_uhash_t)2870177450012600261ULL) -#define _PandasHASH_XXROTATE(x) ((x << 31) | (x >> 33)) /* Rotate left 31 bits */ +#define _PandasHASH_XXROTATE(x) \ + ((x << 31) | (x >> 33)) /* Rotate left 31 bits */ #else #define _PandasHASH_XXPRIME_1 ((Py_uhash_t)2654435761UL) #define _PandasHASH_XXPRIME_2 ((Py_uhash_t)2246822519UL) #define _PandasHASH_XXPRIME_5 ((Py_uhash_t)374761393UL) -#define _PandasHASH_XXROTATE(x) ((x << 13) | (x >> 19)) /* Rotate left 13 bits */ +#define _PandasHASH_XXROTATE(x) \ + ((x << 13) | (x >> 19)) /* Rotate left 13 bits */ #endif -Py_hash_t PANDAS_INLINE tupleobject_hash(PyTupleObject* key) { - Py_ssize_t i, len = Py_SIZE(key); - PyObject **item = key->ob_item; - - Py_uhash_t acc = _PandasHASH_XXPRIME_5; - for (i = 0; i < len; i++) { - Py_uhash_t lane = kh_python_hash_func(item[i]); - if (lane == (Py_uhash_t)-1) { - return -1; - } - acc += lane * _PandasHASH_XXPRIME_2; - acc = _PandasHASH_XXROTATE(acc); - acc *= _PandasHASH_XXPRIME_1; - } - - /* Add input length, mangled to keep the historical value of hash(()). */ - acc += len ^ (_PandasHASH_XXPRIME_5 ^ 3527539UL); +Py_hash_t PANDAS_INLINE tupleobject_hash(PyTupleObject *key) { + Py_ssize_t i, len = Py_SIZE(key); + PyObject **item = key->ob_item; - if (acc == (Py_uhash_t)-1) { - return 1546275796; + Py_uhash_t acc = _PandasHASH_XXPRIME_5; + for (i = 0; i < len; i++) { + Py_uhash_t lane = kh_python_hash_func(item[i]); + if (lane == (Py_uhash_t)-1) { + return -1; } - return acc; + acc += lane * _PandasHASH_XXPRIME_2; + acc = _PandasHASH_XXROTATE(acc); + acc *= _PandasHASH_XXPRIME_1; + } + + /* Add input length, mangled to keep the historical value of hash(()). */ + acc += len ^ (_PandasHASH_XXPRIME_5 ^ 3527539UL); + + if (acc == (Py_uhash_t)-1) { + return 1546275796; + } + return acc; } - -khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) { - Py_hash_t hash; - // For PyObject_Hash holds: - // hash(0.0) == 0 == hash(-0.0) - // yet for different nan-objects different hash-values - // are possible - if (PyFloat_CheckExact(key)) { - // we cannot use kh_float64_hash_func - // because float(k) == k holds for any int-object k - // and kh_float64_hash_func doesn't respect it - hash = floatobject_hash((PyFloatObject*)key); - } - else if (PyComplex_CheckExact(key)) { - // we cannot use kh_complex128_hash_func - // because complex(k,0) == k holds for any int-object k - // and kh_complex128_hash_func doesn't respect it - hash = complexobject_hash((PyComplexObject*)key); - } - else if (PyTuple_CheckExact(key)) { - hash = tupleobject_hash((PyTupleObject*)key); - } - else { - hash = PyObject_Hash(key); - } - - if (hash == -1) { - PyErr_Clear(); - return 0; - } - #if SIZEOF_PY_HASH_T == 4 - // it is already 32bit value - return hash; - #else - // for 64bit builds, - // we need information of the upper 32bits as well - // see GH 37615 - khuint64_t as_uint = (khuint64_t) hash; - // uints avoid undefined behavior of signed ints - return (as_uint>>32)^as_uint; - #endif +khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject *key) { + Py_hash_t hash; + // For PyObject_Hash holds: + // hash(0.0) == 0 == hash(-0.0) + // yet for different nan-objects different hash-values + // are possible + if (PyFloat_CheckExact(key)) { + // we cannot use kh_float64_hash_func + // because float(k) == k holds for any int-object k + // and kh_float64_hash_func doesn't respect it + hash = floatobject_hash((PyFloatObject *)key); + } else if (PyComplex_CheckExact(key)) { + // we cannot use kh_complex128_hash_func + // because complex(k,0) == k holds for any int-object k + // and kh_complex128_hash_func doesn't respect it + hash = complexobject_hash((PyComplexObject *)key); + } else if (PyTuple_CheckExact(key)) { + hash = tupleobject_hash((PyTupleObject *)key); + } else { + hash = PyObject_Hash(key); + } + + if (hash == -1) { + PyErr_Clear(); + return 0; + } +#if SIZEOF_PY_HASH_T == 4 + // it is already 32bit value + return hash; +#else + // for 64bit builds, + // we need information of the upper 32bits as well + // see GH 37615 + khuint64_t as_uint = (khuint64_t)hash; + // uints avoid undefined behavior of signed ints + return (as_uint >> 32) ^ as_uint; +#endif } - #define kh_python_hash_equal(a, b) (pyobject_cmp(a, b)) - // Python object -typedef PyObject* kh_pyobject_t; +typedef PyObject *kh_pyobject_t; -#define KHASH_MAP_INIT_PYOBJECT(name, khval_t) \ - KHASH_INIT(name, kh_pyobject_t, khval_t, 1, \ - kh_python_hash_func, kh_python_hash_equal) +#define KHASH_MAP_INIT_PYOBJECT(name, khval_t) \ + KHASH_INIT(name, kh_pyobject_t, khval_t, 1, kh_python_hash_func, \ + kh_python_hash_equal) KHASH_MAP_INIT_PYOBJECT(pymap, Py_ssize_t) -#define KHASH_SET_INIT_PYOBJECT(name) \ - KHASH_INIT(name, kh_pyobject_t, char, 0, \ - kh_python_hash_func, kh_python_hash_equal) +#define KHASH_SET_INIT_PYOBJECT(name) \ + KHASH_INIT(name, kh_pyobject_t, char, 0, kh_python_hash_func, \ + kh_python_hash_equal) KHASH_SET_INIT_PYOBJECT(pyset) @@ -404,49 +367,52 @@ KHASH_SET_INIT_PYOBJECT(pyset) KHASH_MAP_INIT_STR(strbox, kh_pyobject_t) typedef struct { - kh_str_t *table; - int starts[256]; + kh_str_t *table; + int starts[256]; } kh_str_starts_t; -typedef kh_str_starts_t* p_kh_str_starts_t; +typedef kh_str_starts_t *p_kh_str_starts_t; p_kh_str_starts_t PANDAS_INLINE kh_init_str_starts(void) { - kh_str_starts_t *result = (kh_str_starts_t*)KHASH_CALLOC(1, sizeof(kh_str_starts_t)); - result->table = kh_init_str(); - return result; + kh_str_starts_t *result = + (kh_str_starts_t *)KHASH_CALLOC(1, sizeof(kh_str_starts_t)); + result->table = kh_init_str(); + return result; } -khuint_t PANDAS_INLINE kh_put_str_starts_item(kh_str_starts_t* table, char* key, int* ret) { - khuint_t result = kh_put_str(table->table, key, ret); - if (*ret != 0) { - table->starts[(unsigned char)key[0]] = 1; - } - return result; +khuint_t PANDAS_INLINE kh_put_str_starts_item(kh_str_starts_t *table, char *key, + int *ret) { + khuint_t result = kh_put_str(table->table, key, ret); + if (*ret != 0) { + table->starts[(unsigned char)key[0]] = 1; + } + return result; } -khuint_t PANDAS_INLINE kh_get_str_starts_item(const kh_str_starts_t* table, const char* key) { - unsigned char ch = *key; - if (table->starts[ch]) { - if (ch == '\0' || kh_get_str(table->table, key) != table->table->n_buckets) return 1; - } - return 0; +khuint_t PANDAS_INLINE kh_get_str_starts_item(const kh_str_starts_t *table, + const char *key) { + unsigned char ch = *key; + if (table->starts[ch]) { + if (ch == '\0' || kh_get_str(table->table, key) != table->table->n_buckets) + return 1; + } + return 0; } -void PANDAS_INLINE kh_destroy_str_starts(kh_str_starts_t* table) { - kh_destroy_str(table->table); - KHASH_FREE(table); +void PANDAS_INLINE kh_destroy_str_starts(kh_str_starts_t *table) { + kh_destroy_str(table->table); + KHASH_FREE(table); } -void PANDAS_INLINE kh_resize_str_starts(kh_str_starts_t* table, khuint_t val) { - kh_resize_str(table->table, val); +void PANDAS_INLINE kh_resize_str_starts(kh_str_starts_t *table, khuint_t val) { + kh_resize_str(table->table, val); } // utility function: given the number of elements // returns number of necessary buckets -khuint_t PANDAS_INLINE kh_needed_n_buckets(khuint_t n_elements){ - khuint_t candidate = n_elements; - kroundup32(candidate); - khuint_t upper_bound = (khuint_t)(candidate * __ac_HASH_UPPER + 0.5); - return (upper_bound < n_elements) ? 2*candidate : candidate; - +khuint_t PANDAS_INLINE kh_needed_n_buckets(khuint_t n_elements) { + khuint_t candidate = n_elements; + kroundup32(candidate); + khuint_t upper_bound = (khuint_t)(candidate * __ac_HASH_UPPER + 0.5); + return (upper_bound < n_elements) ? 2 * candidate : candidate; } diff --git a/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h b/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h index 6b5135f559482..e4e90a7ea24cf 100644 --- a/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h +++ b/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h @@ -18,44 +18,44 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt #ifndef NPY_NO_DEPRECATED_API #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION -#endif // NPY_NO_DEPRECATED_API +#endif // NPY_NO_DEPRECATED_API #include typedef struct { - npy_int64 days; - npy_int32 hrs, min, sec, ms, us, ns, seconds, microseconds, nanoseconds; + npy_int64 days; + npy_int32 hrs, min, sec, ms, us, ns, seconds, microseconds, nanoseconds; } pandas_timedeltastruct; -static const npy_datetimestruct _AS_MIN_DTS = { - 1969, 12, 31, 23, 59, 50, 776627, 963145, 224193}; -static const npy_datetimestruct _FS_MIN_DTS = { - 1969, 12, 31, 21, 26, 16, 627963, 145224, 193000}; -static const npy_datetimestruct _PS_MIN_DTS = { - 1969, 9, 16, 5, 57, 7, 963145, 224193, 0}; -static const npy_datetimestruct _NS_MIN_DTS = { - 1677, 9, 21, 0, 12, 43, 145224, 193000, 0}; -static const npy_datetimestruct _US_MIN_DTS = { - -290308, 12, 21, 19, 59, 05, 224193, 0, 0}; -static const npy_datetimestruct _MS_MIN_DTS = { - -292275055, 5, 16, 16, 47, 4, 193000, 0, 0}; +static const npy_datetimestruct _AS_MIN_DTS = {1969, 12, 31, 23, 59, + 50, 776627, 963145, 224193}; +static const npy_datetimestruct _FS_MIN_DTS = {1969, 12, 31, 21, 26, + 16, 627963, 145224, 193000}; +static const npy_datetimestruct _PS_MIN_DTS = {1969, 9, 16, 5, 57, + 7, 963145, 224193, 0}; +static const npy_datetimestruct _NS_MIN_DTS = {1677, 9, 21, 0, 12, + 43, 145224, 193000, 0}; +static const npy_datetimestruct _US_MIN_DTS = {-290308, 12, 21, 19, 59, + 05, 224193, 0, 0}; +static const npy_datetimestruct _MS_MIN_DTS = {-292275055, 5, 16, 16, 47, + 4, 193000, 0, 0}; static const npy_datetimestruct _S_MIN_DTS = { -292277022657, 1, 27, 8, 29, 53, 0, 0, 0}; static const npy_datetimestruct _M_MIN_DTS = { -17536621475646, 5, 4, 5, 53, 0, 0, 0, 0}; -static const npy_datetimestruct _AS_MAX_DTS = { - 1970, 1, 1, 0, 0, 9, 223372, 36854, 775807}; -static const npy_datetimestruct _FS_MAX_DTS = { - 1970, 1, 1, 2, 33, 43, 372036, 854775, 807000}; -static const npy_datetimestruct _PS_MAX_DTS = { - 1970, 4, 17, 18, 2, 52, 36854, 775807, 0}; -static const npy_datetimestruct _NS_MAX_DTS = { - 2262, 4, 11, 23, 47, 16, 854775, 807000, 0}; -static const npy_datetimestruct _US_MAX_DTS = { - 294247, 1, 10, 4, 0, 54, 775807, 0, 0}; -static const npy_datetimestruct _MS_MAX_DTS = { - 292278994, 8, 17, 7, 12, 55, 807000, 0, 0}; +static const npy_datetimestruct _AS_MAX_DTS = {1970, 1, 1, 0, 0, + 9, 223372, 36854, 775807}; +static const npy_datetimestruct _FS_MAX_DTS = {1970, 1, 1, 2, 33, + 43, 372036, 854775, 807000}; +static const npy_datetimestruct _PS_MAX_DTS = {1970, 4, 17, 18, 2, + 52, 36854, 775807, 0}; +static const npy_datetimestruct _NS_MAX_DTS = {2262, 4, 11, 23, 47, + 16, 854775, 807000, 0}; +static const npy_datetimestruct _US_MAX_DTS = {294247, 1, 10, 4, 0, + 54, 775807, 0, 0}; +static const npy_datetimestruct _MS_MAX_DTS = {292278994, 8, 17, 7, 12, + 55, 807000, 0, 0}; static const npy_datetimestruct _S_MAX_DTS = { 292277026596, 12, 4, 15, 30, 7, 0, 0, 0}; static const npy_datetimestruct _M_MAX_DTS = { @@ -72,8 +72,7 @@ npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base, void pandas_datetime_to_datetimestruct(npy_datetime val, NPY_DATETIMEUNIT fr, npy_datetimestruct *result); -void pandas_timedelta_to_timedeltastruct(npy_timedelta val, - NPY_DATETIMEUNIT fr, +void pandas_timedelta_to_timedeltastruct(npy_timedelta val, NPY_DATETIMEUNIT fr, pandas_timedeltastruct *result); extern const int days_per_month_table[2][12]; @@ -86,9 +85,7 @@ int is_leapyear(npy_int64 year); /* * Calculates the days offset from the 1970 epoch. */ -npy_int64 -get_datetimestruct_days(const npy_datetimestruct *dts); - +npy_int64 get_datetimestruct_days(const npy_datetimestruct *dts); /* * Compares two npy_datetimestruct objects chronologically @@ -96,17 +93,14 @@ get_datetimestruct_days(const npy_datetimestruct *dts); int cmp_npy_datetimestruct(const npy_datetimestruct *a, const npy_datetimestruct *b); - /* * Adjusts a datetimestruct based on a minutes offset. Assumes * the current values are valid. */ -void -add_minutes_to_datetimestruct(npy_datetimestruct *dts, int minutes); +void add_minutes_to_datetimestruct(npy_datetimestruct *dts, int minutes); /* * This function returns the DateTimeMetaData * contained within the provided datetime dtype. */ -PyArray_DatetimeMetaData get_datetime_metadata_from_dtype( - PyArray_Descr *dtype); +PyArray_DatetimeMetaData get_datetime_metadata_from_dtype(PyArray_Descr *dtype); diff --git a/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h b/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h index 1098637e798fe..d96ca79d70cb7 100644 --- a/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h +++ b/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h @@ -23,7 +23,7 @@ This file implements string parsing and creation for NumPy datetime. #ifndef NPY_NO_DEPRECATED_API #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION -#endif // NPY_NO_DEPRECATED_API +#endif // NPY_NO_DEPRECATED_API /* 'format_requirement' can be one of three values: * * PARTIAL_MATCH : Only require a partial match with 'format'. @@ -34,11 +34,7 @@ This file implements string parsing and creation for NumPy datetime. * be able to parse it without error is '%Y-%m-%d'; * * INFER_FORMAT: parse without comparing 'format' (i.e. infer it). */ -typedef enum { - PARTIAL_MATCH, - EXACT_MATCH, - INFER_FORMAT -} FormatRequirement; +typedef enum { PARTIAL_MATCH, EXACT_MATCH, INFER_FORMAT } FormatRequirement; /* * Parses (almost) standard ISO 8601 date strings. The differences are: @@ -58,31 +54,26 @@ typedef enum { * 'str' must be a NULL-terminated string, and 'len' must be its length. * * 'out' gets filled with the parsed date-time. - * 'out_local' gets whether returned value contains timezone. 0 for UTC, 1 for local time. - * 'out_tzoffset' gets set to timezone offset by minutes - * if the parsed time was in local time, - * to 0 otherwise. The values 'now' and 'today' don't get counted - * as local, and neither do UTC +/-#### timezone offsets, because - * they aren't using the computer's local timezone offset. + * 'out_local' gets whether returned value contains timezone. 0 for UTC, 1 for + * local time. 'out_tzoffset' gets set to timezone offset by minutes if the + * parsed time was in local time, to 0 otherwise. The values 'now' and 'today' + * don't get counted as local, and neither do UTC +/-#### timezone offsets, + * because they aren't using the computer's local timezone offset. * * Returns 0 on success, -1 on failure. */ -int -parse_iso_8601_datetime(const char *str, int len, int want_exc, - npy_datetimestruct *out, - NPY_DATETIMEUNIT *out_bestunit, - int *out_local, - int *out_tzoffset, - const char* format, - int format_len, - FormatRequirement format_requirement); +int parse_iso_8601_datetime(const char *str, int len, int want_exc, + npy_datetimestruct *out, + NPY_DATETIMEUNIT *out_bestunit, int *out_local, + int *out_tzoffset, const char *format, + int format_len, + FormatRequirement format_requirement); /* * Provides a string length to use for converting datetime * objects with the given local and unit settings. */ -int -get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base); +int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base); /* * Converts an npy_datetimestruct to an (almost) ISO 8601 @@ -94,9 +85,8 @@ get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base); * Returns 0 on success, -1 on failure (for example if the output * string was too short). */ -int -make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, - int utc, NPY_DATETIMEUNIT base); +int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, + int utc, NPY_DATETIMEUNIT base); /* * Converts an pandas_timedeltastruct to an ISO 8601 string. diff --git a/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h b/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h index 54bcca9e4136c..d60335fbaee4d 100644 --- a/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h +++ b/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h @@ -16,18 +16,19 @@ modification, are permitted provided that the following conditions are met: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) https://github.com/client9/stringencoders -Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights +reserved. Numeric decoder derived from TCL library https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms @@ -51,9 +52,9 @@ tree doesn't have cyclic references. #pragma once +#include "pandas/portable.h" #include #include -#include "pandas/portable.h" // Don't output any extra whitespaces when encoding #define JSON_NO_EXTRA_WHITESPACE @@ -74,7 +75,8 @@ tree doesn't have cyclic references. #endif /* -Dictates and limits how much stack space for buffers UltraJSON will use before resorting to provided heap functions */ +Dictates and limits how much stack space for buffers UltraJSON will use before +resorting to provided heap functions */ #ifndef JSON_MAX_STACK_BUFFER_SIZE #define JSON_MAX_STACK_BUFFER_SIZE 131072 #endif @@ -138,23 +140,23 @@ typedef int64_t JSLONG; #endif enum JSTYPES { - JT_NULL, // NULL - JT_TRUE, // boolean true - JT_FALSE, // boolean false - JT_INT, // (JSINT32 (signed 32-bit)) - JT_LONG, // (JSINT64 (signed 64-bit)) - JT_DOUBLE, // (double) - JT_BIGNUM, // integer larger than sys.maxsize - JT_UTF8, // (char 8-bit) - JT_ARRAY, // Array structure - JT_OBJECT, // Key/Value structure - JT_INVALID, // Internal, do not return nor expect - JT_POS_INF, // Positive infinity - JT_NEG_INF, // Negative infinity + JT_NULL, // NULL + JT_TRUE, // boolean true + JT_FALSE, // boolean false + JT_INT, // (JSINT32 (signed 32-bit)) + JT_LONG, // (JSINT64 (signed 64-bit)) + JT_DOUBLE, // (double) + JT_BIGNUM, // integer larger than sys.maxsize + JT_UTF8, // (char 8-bit) + JT_ARRAY, // Array structure + JT_OBJECT, // Key/Value structure + JT_INVALID, // Internal, do not return nor expect + JT_POS_INF, // Positive infinity + JT_NEG_INF, // Negative infinity }; -typedef void * JSOBJ; -typedef void * JSITER; +typedef void *JSOBJ; +typedef void *JSITER; typedef struct __JSONTypeContext { int type; @@ -183,7 +185,7 @@ typedef struct __JSONObjectEncoder { JSINT32 (*getIntValue)(JSOBJ obj, JSONTypeContext *tc); double (*getDoubleValue)(JSOBJ obj, JSONTypeContext *tc); const char *(*getBigNumStringValue)(JSOBJ obj, JSONTypeContext *tc, - size_t *_outLen); + size_t *_outLen); /* Begin iteration of an iterable object (JS_ARRAY or JS_OBJECT) @@ -192,8 +194,9 @@ typedef struct __JSONObjectEncoder { JSPFN_ITERBEGIN iterBegin; /* - Retrieve next object in an iteration. Should return 0 to indicate iteration has reached end or 1 if there are more items. - Implementor is responsible for keeping state of the iteration. Use ti->prv fields for this + Retrieve next object in an iteration. Should return 0 to indicate iteration + has reached end or 1 if there are more items. Implementor is responsible for + keeping state of the iteration. Use ti->prv fields for this */ JSPFN_ITERNEXT iterNext; @@ -205,19 +208,22 @@ typedef struct __JSONObjectEncoder { /* Returns a reference to the value object of an iterator - The is responsible for the life-cycle of the returned string. Use iterNext/iterEnd and ti->prv to keep track of current object + The is responsible for the life-cycle of the returned string. Use + iterNext/iterEnd and ti->prv to keep track of current object */ JSPFN_ITERGETVALUE iterGetValue; /* Return name of iterator. - The is responsible for the life-cycle of the returned string. Use iterNext/iterEnd and ti->prv to keep track of current object + The is responsible for the life-cycle of the returned string. Use + iterNext/iterEnd and ti->prv to keep track of current object */ JSPFN_ITERGETNAME iterGetName; /* - Release a value as indicated by setting ti->release = 1 in the previous getValue call. - The ti->prv array should contain the necessary context to release the value + Release a value as indicated by setting ti->release = 1 in the previous + getValue call. The ti->prv array should contain the necessary context to + release the value */ void (*releaseObject)(JSOBJ obj); @@ -228,19 +234,23 @@ typedef struct __JSONObjectEncoder { JSPFN_FREE free; /* - Configuration for max recursion, set to 0 to use default (see JSON_MAX_RECURSION_DEPTH)*/ + Configuration for max recursion, set to 0 to use default (see + JSON_MAX_RECURSION_DEPTH)*/ int recursionMax; /* - Configuration for max decimals of double floating point numbers to encode (0-9) */ + Configuration for max decimals of double floating point numbers to encode + (0-9) */ int doublePrecision; /* - If true output will be ASCII with all characters above 127 encoded as \uXXXX. If false output will be UTF-8 or what ever charset strings are brought as */ + If true output will be ASCII with all characters above 127 encoded as \uXXXX. + If false output will be UTF-8 or what ever charset strings are brought as */ int forceASCII; /* - If true, '<', '>', and '&' characters will be encoded as \u003c, \u003e, and \u0026, respectively. If false, no special encoding will be used. */ + If true, '<', '>', and '&' characters will be encoded as \u003c, \u003e, and + \u0026, respectively. If false, no special encoding will be used. */ int encodeHTMLChars; /* @@ -266,18 +276,20 @@ Encode an object structure into JSON. Arguments: obj - An anonymous type representing the object enc - Function definitions for querying JSOBJ type -buffer - Preallocated buffer to store result in. If NULL function allocates own buffer -cbBuffer - Length of buffer (ignored if buffer is NULL) +buffer - Preallocated buffer to store result in. If NULL function allocates own +buffer cbBuffer - Length of buffer (ignored if buffer is NULL) Returns: Encoded JSON object as a null terminated char string. NOTE: -If the supplied buffer wasn't enough to hold the result the function will allocate a new buffer. -Life cycle of the provided buffer must still be handled by caller. +If the supplied buffer wasn't enough to hold the result the function will +allocate a new buffer. Life cycle of the provided buffer must still be handled +by caller. -If the return value doesn't equal the specified buffer caller must release the memory using -JSONObjectEncoder.free or free() as specified when calling this function. +If the return value doesn't equal the specified buffer caller must release the +memory using JSONObjectEncoder.free or free() as specified when calling this +function. */ EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *buffer, size_t cbBuffer); diff --git a/pandas/_libs/include/pandas/vendored/ujson/python/version.h b/pandas/_libs/include/pandas/vendored/ujson/python/version.h index 97232dd821387..4b00670c946af 100644 --- a/pandas/_libs/include/pandas/vendored/ujson/python/version.h +++ b/pandas/_libs/include/pandas/vendored/ujson/python/version.h @@ -16,18 +16,19 @@ modification, are permitted provided that the following conditions are met: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) https://github.com/client9/stringencoders -Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights +reserved. Numeric decoder derived from TCL library https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms diff --git a/pandas/_libs/src/datetime/date_conversions.c b/pandas/_libs/src/datetime/date_conversions.c index 3bc3275be1cfe..4b172349de8d3 100644 --- a/pandas/_libs/src/datetime/date_conversions.c +++ b/pandas/_libs/src/datetime/date_conversions.c @@ -21,83 +21,81 @@ The full license is in the LICENSE file, distributed with this software. * Mutates the provided value directly. Returns 0 on success, non-zero on error. */ int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit) { - switch (unit) { - case NPY_FR_ns: - break; - case NPY_FR_us: - *value /= 1000LL; - break; - case NPY_FR_ms: - *value /= 1000000LL; - break; - case NPY_FR_s: - *value /= 1000000000LL; - break; - default: - return -1; - } + switch (unit) { + case NPY_FR_ns: + break; + case NPY_FR_us: + *value /= 1000LL; + break; + case NPY_FR_ms: + *value /= 1000000LL; + break; + case NPY_FR_s: + *value /= 1000000000LL; + break; + default: + return -1; + } - return 0; + return 0; } /* Converts the int64_t representation of a datetime to ISO; mutates len */ -char *int64ToIso(int64_t value, - NPY_DATETIMEUNIT valueUnit, - NPY_DATETIMEUNIT base, - size_t *len) { - npy_datetimestruct dts; - int ret_code; +char *int64ToIso(int64_t value, NPY_DATETIMEUNIT valueUnit, + NPY_DATETIMEUNIT base, size_t *len) { + npy_datetimestruct dts; + int ret_code; - pandas_datetime_to_datetimestruct(value, valueUnit, &dts); + pandas_datetime_to_datetimestruct(value, valueUnit, &dts); - *len = (size_t)get_datetime_iso_8601_strlen(0, base); - char *result = PyObject_Malloc(*len); + *len = (size_t)get_datetime_iso_8601_strlen(0, base); + char *result = PyObject_Malloc(*len); - if (result == NULL) { - PyErr_NoMemory(); - return NULL; - } - // datetime64 is always naive - ret_code = make_iso_8601_datetime(&dts, result, *len, 0, base); - if (ret_code != 0) { - PyErr_SetString(PyExc_ValueError, - "Could not convert datetime value to string"); - PyObject_Free(result); - } + if (result == NULL) { + PyErr_NoMemory(); + return NULL; + } + // datetime64 is always naive + ret_code = make_iso_8601_datetime(&dts, result, *len, 0, base); + if (ret_code != 0) { + PyErr_SetString(PyExc_ValueError, + "Could not convert datetime value to string"); + PyObject_Free(result); + } - // Note that get_datetime_iso_8601_strlen just gives a generic size - // for ISO string conversion, not the actual size used - *len = strlen(result); - return result; + // Note that get_datetime_iso_8601_strlen just gives a generic size + // for ISO string conversion, not the actual size used + *len = strlen(result); + return result; } npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base) { - scaleNanosecToUnit(&dt, base); - return dt; + scaleNanosecToUnit(&dt, base); + return dt; } /* Converts the int64_t representation of a duration to ISO; mutates len */ char *int64ToIsoDuration(int64_t value, size_t *len) { - pandas_timedeltastruct tds; - int ret_code; + pandas_timedeltastruct tds; + int ret_code; - pandas_timedelta_to_timedeltastruct(value, NPY_FR_ns, &tds); + pandas_timedelta_to_timedeltastruct(value, NPY_FR_ns, &tds); - // Max theoretical length of ISO Duration with 64 bit day - // as the largest unit is 70 characters + 1 for a null terminator - char *result = PyObject_Malloc(71); - if (result == NULL) { - PyErr_NoMemory(); - return NULL; - } + // Max theoretical length of ISO Duration with 64 bit day + // as the largest unit is 70 characters + 1 for a null terminator + char *result = PyObject_Malloc(71); + if (result == NULL) { + PyErr_NoMemory(); + return NULL; + } - ret_code = make_iso_8601_timedelta(&tds, result, len); - if (ret_code == -1) { - PyErr_SetString(PyExc_ValueError, - "Could not convert timedelta value to string"); - PyObject_Free(result); - return NULL; - } + ret_code = make_iso_8601_timedelta(&tds, result, len); + if (ret_code == -1) { + PyErr_SetString(PyExc_ValueError, + "Could not convert timedelta value to string"); + PyObject_Free(result); + return NULL; + } - return result; + return result; } diff --git a/pandas/_libs/src/datetime/pd_datetime.c b/pandas/_libs/src/datetime/pd_datetime.c index fc2cbcab90174..b201023114f8a 100644 --- a/pandas/_libs/src/datetime/pd_datetime.c +++ b/pandas/_libs/src/datetime/pd_datetime.c @@ -22,7 +22,6 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt #include "datetime.h" #include "pandas/datetime/pd_datetime.h" - static void pandas_datetime_destructor(PyObject *op) { void *ptr = PyCapsule_GetPointer(op, PandasDateTime_CAPSULE_NAME); PyMem_Free(ptr); @@ -42,77 +41,77 @@ static void pandas_datetime_destructor(PyObject *op) { * if obj doesn't have the needed date or datetime attributes. */ static int convert_pydatetime_to_datetimestruct(PyObject *dtobj, - npy_datetimestruct *out) { - // Assumes that obj is a valid datetime object - PyObject *tmp; - PyObject *obj = (PyObject*)dtobj; - - /* Initialize the output to all zeros */ - memset(out, 0, sizeof(npy_datetimestruct)); - out->month = 1; - out->day = 1; - - out->year = PyLong_AsLong(PyObject_GetAttrString(obj, "year")); - out->month = PyLong_AsLong(PyObject_GetAttrString(obj, "month")); - out->day = PyLong_AsLong(PyObject_GetAttrString(obj, "day")); - - // TODO(anyone): If we can get PyDateTime_IMPORT to work, we could use - // PyDateTime_Check here, and less verbose attribute lookups. - - /* Check for time attributes (if not there, return success as a date) */ - if (!PyObject_HasAttrString(obj, "hour") || - !PyObject_HasAttrString(obj, "minute") || - !PyObject_HasAttrString(obj, "second") || - !PyObject_HasAttrString(obj, "microsecond")) { - return 0; - } + npy_datetimestruct *out) { + // Assumes that obj is a valid datetime object + PyObject *tmp; + PyObject *obj = (PyObject *)dtobj; + + /* Initialize the output to all zeros */ + memset(out, 0, sizeof(npy_datetimestruct)); + out->month = 1; + out->day = 1; + + out->year = PyLong_AsLong(PyObject_GetAttrString(obj, "year")); + out->month = PyLong_AsLong(PyObject_GetAttrString(obj, "month")); + out->day = PyLong_AsLong(PyObject_GetAttrString(obj, "day")); + + // TODO(anyone): If we can get PyDateTime_IMPORT to work, we could use + // PyDateTime_Check here, and less verbose attribute lookups. + + /* Check for time attributes (if not there, return success as a date) */ + if (!PyObject_HasAttrString(obj, "hour") || + !PyObject_HasAttrString(obj, "minute") || + !PyObject_HasAttrString(obj, "second") || + !PyObject_HasAttrString(obj, "microsecond")) { + return 0; + } - out->hour = PyLong_AsLong(PyObject_GetAttrString(obj, "hour")); - out->min = PyLong_AsLong(PyObject_GetAttrString(obj, "minute")); - out->sec = PyLong_AsLong(PyObject_GetAttrString(obj, "second")); - out->us = PyLong_AsLong(PyObject_GetAttrString(obj, "microsecond")); - - if (PyObject_HasAttrString(obj, "tzinfo")) { - PyObject *offset = extract_utc_offset(obj); - /* Apply the time zone offset if datetime obj is tz-aware */ - if (offset != NULL) { - if (offset == Py_None) { - Py_DECREF(offset); - return 0; - } - PyObject *tmp_int; - int seconds_offset, minutes_offset; - /* - * The timedelta should have a function "total_seconds" - * which contains the value we want. - */ - tmp = PyObject_CallMethod(offset, "total_seconds", ""); - Py_DECREF(offset); - if (tmp == NULL) { - return -1; - } - tmp_int = PyNumber_Long(tmp); - if (tmp_int == NULL) { - Py_DECREF(tmp); - return -1; - } - seconds_offset = PyLong_AsLong(tmp_int); - if (seconds_offset == -1 && PyErr_Occurred()) { - Py_DECREF(tmp_int); - Py_DECREF(tmp); - return -1; - } - Py_DECREF(tmp_int); - Py_DECREF(tmp); - - /* Convert to a minutes offset and apply it */ - minutes_offset = seconds_offset / 60; - - add_minutes_to_datetimestruct(out, -minutes_offset); - } + out->hour = PyLong_AsLong(PyObject_GetAttrString(obj, "hour")); + out->min = PyLong_AsLong(PyObject_GetAttrString(obj, "minute")); + out->sec = PyLong_AsLong(PyObject_GetAttrString(obj, "second")); + out->us = PyLong_AsLong(PyObject_GetAttrString(obj, "microsecond")); + + if (PyObject_HasAttrString(obj, "tzinfo")) { + PyObject *offset = extract_utc_offset(obj); + /* Apply the time zone offset if datetime obj is tz-aware */ + if (offset != NULL) { + if (offset == Py_None) { + Py_DECREF(offset); + return 0; + } + PyObject *tmp_int; + int seconds_offset, minutes_offset; + /* + * The timedelta should have a function "total_seconds" + * which contains the value we want. + */ + tmp = PyObject_CallMethod(offset, "total_seconds", ""); + Py_DECREF(offset); + if (tmp == NULL) { + return -1; + } + tmp_int = PyNumber_Long(tmp); + if (tmp_int == NULL) { + Py_DECREF(tmp); + return -1; + } + seconds_offset = PyLong_AsLong(tmp_int); + if (seconds_offset == -1 && PyErr_Occurred()) { + Py_DECREF(tmp_int); + Py_DECREF(tmp); + return -1; + } + Py_DECREF(tmp_int); + Py_DECREF(tmp); + + /* Convert to a minutes offset and apply it */ + minutes_offset = seconds_offset / 60; + + add_minutes_to_datetimestruct(out, -minutes_offset); } + } - return 0; + return 0; } // Converts a Python object representing a Date / Datetime to ISO format @@ -120,66 +119,66 @@ static int convert_pydatetime_to_datetimestruct(PyObject *dtobj, // while base="ns" yields "2020-01-01T00:00:00.000000000Z" // len is mutated to save the length of the returned string static char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base, - size_t *len) { - npy_datetimestruct dts; - int ret; - - ret = convert_pydatetime_to_datetimestruct(obj, &dts); - if (ret != 0) { - if (!PyErr_Occurred()) { - PyErr_SetString(PyExc_ValueError, - "Could not convert PyDateTime to numpy datetime"); - } - return NULL; + size_t *len) { + npy_datetimestruct dts; + int ret; + + ret = convert_pydatetime_to_datetimestruct(obj, &dts); + if (ret != 0) { + if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_ValueError, + "Could not convert PyDateTime to numpy datetime"); } + return NULL; + } - *len = (size_t)get_datetime_iso_8601_strlen(0, base); - char *result = PyObject_Malloc(*len); - // Check to see if PyDateTime has a timezone. - // Don't convert to UTC if it doesn't. - int is_tz_aware = 0; - if (PyObject_HasAttrString(obj, "tzinfo")) { - PyObject *offset = extract_utc_offset(obj); - if (offset == NULL) { - PyObject_Free(result); - return NULL; - } - is_tz_aware = offset != Py_None; - Py_DECREF(offset); + *len = (size_t)get_datetime_iso_8601_strlen(0, base); + char *result = PyObject_Malloc(*len); + // Check to see if PyDateTime has a timezone. + // Don't convert to UTC if it doesn't. + int is_tz_aware = 0; + if (PyObject_HasAttrString(obj, "tzinfo")) { + PyObject *offset = extract_utc_offset(obj); + if (offset == NULL) { + PyObject_Free(result); + return NULL; } - ret = make_iso_8601_datetime(&dts, result, *len, is_tz_aware, base); + is_tz_aware = offset != Py_None; + Py_DECREF(offset); + } + ret = make_iso_8601_datetime(&dts, result, *len, is_tz_aware, base); - if (ret != 0) { - PyErr_SetString(PyExc_ValueError, - "Could not convert datetime value to string"); - PyObject_Free(result); - return NULL; - } + if (ret != 0) { + PyErr_SetString(PyExc_ValueError, + "Could not convert datetime value to string"); + PyObject_Free(result); + return NULL; + } - // Note that get_datetime_iso_8601_strlen just gives a generic size - // for ISO string conversion, not the actual size used - *len = strlen(result); - return result; + // Note that get_datetime_iso_8601_strlen just gives a generic size + // for ISO string conversion, not the actual size used + *len = strlen(result); + return result; } // Convert a Python Date/Datetime to Unix epoch with resolution base static npy_datetime PyDateTimeToEpoch(PyObject *dt, NPY_DATETIMEUNIT base) { - npy_datetimestruct dts; - int ret; - - ret = convert_pydatetime_to_datetimestruct(dt, &dts); - if (ret != 0) { - if (!PyErr_Occurred()) { - PyErr_SetString(PyExc_ValueError, - "Could not convert PyDateTime to numpy datetime"); - } - // TODO(username): is setting errMsg required? - // ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; - // return NULL; + npy_datetimestruct dts; + int ret; + + ret = convert_pydatetime_to_datetimestruct(dt, &dts); + if (ret != 0) { + if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_ValueError, + "Could not convert PyDateTime to numpy datetime"); } + // TODO(username): is setting errMsg required? + // ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; + // return NULL; + } - npy_datetime npy_dt = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts); - return NpyDateTimeToEpoch(npy_dt, base); + npy_datetime npy_dt = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts); + return NpyDateTimeToEpoch(npy_dt, base); } static int pandas_datetime_exec(PyObject *module) { diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c index e00c5c1e807a7..29c2c8d095907 100644 --- a/pandas/_libs/src/parser/io.c +++ b/pandas/_libs/src/parser/io.c @@ -14,19 +14,19 @@ The full license is in the LICENSE file, distributed with this software. */ void *new_rd_source(PyObject *obj) { - rd_source *rds = (rd_source *)malloc(sizeof(rd_source)); - - if (rds == NULL) { - PyErr_NoMemory(); - return NULL; - } - /* hold on to this object */ - Py_INCREF(obj); - rds->obj = obj; - rds->buffer = NULL; - rds->position = 0; - - return (void *)rds; + rd_source *rds = (rd_source *)malloc(sizeof(rd_source)); + + if (rds == NULL) { + PyErr_NoMemory(); + return NULL; + } + /* hold on to this object */ + Py_INCREF(obj); + rds->obj = obj; + rds->buffer = NULL; + rds->position = 0; + + return (void *)rds; } /* @@ -36,11 +36,11 @@ void *new_rd_source(PyObject *obj) { */ int del_rd_source(void *rds) { - Py_XDECREF(RDS(rds)->obj); - Py_XDECREF(RDS(rds)->buffer); - free(rds); + Py_XDECREF(RDS(rds)->obj); + Py_XDECREF(RDS(rds)->buffer); + free(rds); - return 0; + return 0; } /* @@ -51,57 +51,57 @@ int del_rd_source(void *rds) { void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, int *status, const char *encoding_errors) { - PyGILState_STATE state; - PyObject *result, *func, *args, *tmp; - - void *retval; - - size_t length; - rd_source *src = RDS(source); - state = PyGILState_Ensure(); - - /* delete old object */ - Py_XDECREF(src->buffer); - src->buffer = NULL; - args = Py_BuildValue("(i)", nbytes); - - func = PyObject_GetAttrString(src->obj, "read"); - - /* Note: PyObject_CallObject requires the GIL */ - result = PyObject_CallObject(func, args); - Py_XDECREF(args); - Py_XDECREF(func); - - if (result == NULL) { - PyGILState_Release(state); - *bytes_read = 0; - *status = CALLING_READ_FAILED; - return NULL; - } else if (!PyBytes_Check(result)) { - tmp = PyUnicode_AsEncodedString(result, "utf-8", encoding_errors); - Py_DECREF(result); - if (tmp == NULL) { - PyGILState_Release(state); - return NULL; - } - result = tmp; - } + PyGILState_STATE state; + PyObject *result, *func, *args, *tmp; + + void *retval; + + size_t length; + rd_source *src = RDS(source); + state = PyGILState_Ensure(); - length = PySequence_Length(result); + /* delete old object */ + Py_XDECREF(src->buffer); + src->buffer = NULL; + args = Py_BuildValue("(i)", nbytes); - if (length == 0) - *status = REACHED_EOF; - else - *status = 0; + func = PyObject_GetAttrString(src->obj, "read"); - /* hang on to the Python object */ - src->buffer = result; - retval = (void *)PyBytes_AsString(result); + /* Note: PyObject_CallObject requires the GIL */ + result = PyObject_CallObject(func, args); + Py_XDECREF(args); + Py_XDECREF(func); + if (result == NULL) { PyGILState_Release(state); + *bytes_read = 0; + *status = CALLING_READ_FAILED; + return NULL; + } else if (!PyBytes_Check(result)) { + tmp = PyUnicode_AsEncodedString(result, "utf-8", encoding_errors); + Py_DECREF(result); + if (tmp == NULL) { + PyGILState_Release(state); + return NULL; + } + result = tmp; + } + + length = PySequence_Length(result); + + if (length == 0) + *status = REACHED_EOF; + else + *status = 0; + + /* hang on to the Python object */ + src->buffer = result; + retval = (void *)PyBytes_AsString(result); + + PyGILState_Release(state); - /* TODO: more error handling */ - *bytes_read = length; + /* TODO: more error handling */ + *bytes_read = length; - return retval; + return retval; } diff --git a/pandas/_libs/src/parser/pd_parser.c b/pandas/_libs/src/parser/pd_parser.c index c429f17c1cb8b..41689704ccffc 100644 --- a/pandas/_libs/src/parser/pd_parser.c +++ b/pandas/_libs/src/parser/pd_parser.c @@ -12,7 +12,7 @@ Distributed under the terms of the BSD Simplified License. #include "pandas/parser/io.h" static int to_double(char *item, double *p_value, char sci, char decimal, - int *maybe_int) { + int *maybe_int) { char *p_end = NULL; int error = 0; @@ -95,7 +95,6 @@ static int floatify(PyObject *str, double *result, int *maybe_int) { return -1; } - static void pandas_parser_destructor(PyObject *op) { void *ptr = PyCapsule_GetPointer(op, PandasParser_CAPSULE_NAME); PyMem_Free(ptr); diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index ce8a38df172ef..c9466c485ae94 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -27,18 +27,18 @@ GitHub. See Python Software Foundation License and BSD licenses for these. void coliter_setup(coliter_t *self, parser_t *parser, int64_t i, int64_t start) { - // column i, starting at 0 - self->words = parser->words; - self->col = i; - self->line_start = parser->line_start + start; + // column i, starting at 0 + self->words = parser->words; + self->col = i; + self->line_start = parser->line_start + start; } static void free_if_not_null(void **ptr) { - TRACE(("free_if_not_null %p\n", *ptr)) - if (*ptr != NULL) { - free(*ptr); - *ptr = NULL; - } + TRACE(("free_if_not_null %p\n", *ptr)) + if (*ptr != NULL) { + free(*ptr); + *ptr = NULL; + } } /* @@ -49,542 +49,529 @@ static void free_if_not_null(void **ptr) { static void *grow_buffer(void *buffer, uint64_t length, uint64_t *capacity, int64_t space, int64_t elsize, int *error) { - uint64_t cap = *capacity; - void *newbuffer = buffer; - - // Can we fit potentially nbytes tokens (+ null terminators) in the stream? - while ((length + space >= cap) && (newbuffer != NULL)) { - cap = cap ? cap << 1 : 2; - buffer = newbuffer; - newbuffer = realloc(newbuffer, elsize * cap); - } - - if (newbuffer == NULL) { - // realloc failed so don't change *capacity, set *error to errno - // and return the last good realloc'd buffer so it can be freed - *error = errno; - newbuffer = buffer; - } else { - // realloc worked, update *capacity and set *error to 0 - // sigh, multiple return values - *capacity = cap; - *error = 0; - } - return newbuffer; + uint64_t cap = *capacity; + void *newbuffer = buffer; + + // Can we fit potentially nbytes tokens (+ null terminators) in the stream? + while ((length + space >= cap) && (newbuffer != NULL)) { + cap = cap ? cap << 1 : 2; + buffer = newbuffer; + newbuffer = realloc(newbuffer, elsize * cap); + } + + if (newbuffer == NULL) { + // realloc failed so don't change *capacity, set *error to errno + // and return the last good realloc'd buffer so it can be freed + *error = errno; + newbuffer = buffer; + } else { + // realloc worked, update *capacity and set *error to 0 + // sigh, multiple return values + *capacity = cap; + *error = 0; + } + return newbuffer; } void parser_set_default_options(parser_t *self) { - self->decimal = '.'; - self->sci = 'E'; + self->decimal = '.'; + self->sci = 'E'; - // For tokenization - self->state = START_RECORD; + // For tokenization + self->state = START_RECORD; - self->delimiter = ','; // XXX - self->delim_whitespace = 0; + self->delimiter = ','; // XXX + self->delim_whitespace = 0; - self->doublequote = 0; - self->quotechar = '"'; - self->escapechar = 0; + self->doublequote = 0; + self->quotechar = '"'; + self->escapechar = 0; - self->lineterminator = '\0'; /* NUL->standard logic */ + self->lineterminator = '\0'; /* NUL->standard logic */ - self->skipinitialspace = 0; - self->quoting = QUOTE_MINIMAL; - self->allow_embedded_newline = 1; + self->skipinitialspace = 0; + self->quoting = QUOTE_MINIMAL; + self->allow_embedded_newline = 1; - self->expected_fields = -1; - self->on_bad_lines = ERROR; + self->expected_fields = -1; + self->on_bad_lines = ERROR; - self->commentchar = '#'; - self->thousands = '\0'; + self->commentchar = '#'; + self->thousands = '\0'; - self->skipset = NULL; - self->skipfunc = NULL; - self->skip_first_N_rows = -1; - self->skip_footer = 0; + self->skipset = NULL; + self->skipfunc = NULL; + self->skip_first_N_rows = -1; + self->skip_footer = 0; } parser_t *parser_new(void) { return (parser_t *)calloc(1, sizeof(parser_t)); } int parser_clear_data_buffers(parser_t *self) { - free_if_not_null((void *)&self->stream); - free_if_not_null((void *)&self->words); - free_if_not_null((void *)&self->word_starts); - free_if_not_null((void *)&self->line_start); - free_if_not_null((void *)&self->line_fields); - return 0; + free_if_not_null((void *)&self->stream); + free_if_not_null((void *)&self->words); + free_if_not_null((void *)&self->word_starts); + free_if_not_null((void *)&self->line_start); + free_if_not_null((void *)&self->line_fields); + return 0; } int parser_cleanup(parser_t *self) { - int status = 0; + int status = 0; - // XXX where to put this - free_if_not_null((void *)&self->error_msg); - free_if_not_null((void *)&self->warn_msg); + // XXX where to put this + free_if_not_null((void *)&self->error_msg); + free_if_not_null((void *)&self->warn_msg); - if (self->skipset != NULL) { - kh_destroy_int64((kh_int64_t *)self->skipset); - self->skipset = NULL; - } + if (self->skipset != NULL) { + kh_destroy_int64((kh_int64_t *)self->skipset); + self->skipset = NULL; + } - if (parser_clear_data_buffers(self) < 0) { - status = -1; - } + if (parser_clear_data_buffers(self) < 0) { + status = -1; + } - if (self->cb_cleanup != NULL) { - if (self->cb_cleanup(self->source) < 0) { - status = -1; - } - self->cb_cleanup = NULL; + if (self->cb_cleanup != NULL) { + if (self->cb_cleanup(self->source) < 0) { + status = -1; } + self->cb_cleanup = NULL; + } - return status; + return status; } int parser_init(parser_t *self) { - int64_t sz; - - /* - Initialize data buffers - */ - - self->stream = NULL; - self->words = NULL; - self->word_starts = NULL; - self->line_start = NULL; - self->line_fields = NULL; - self->error_msg = NULL; - self->warn_msg = NULL; - - // token stream - self->stream = malloc(STREAM_INIT_SIZE * sizeof(char)); - if (self->stream == NULL) { - parser_cleanup(self); - return PARSER_OUT_OF_MEMORY; - } - self->stream_cap = STREAM_INIT_SIZE; - self->stream_len = 0; - - // word pointers and metadata - sz = STREAM_INIT_SIZE / 10; - sz = sz ? sz : 1; - self->words = malloc(sz * sizeof(char *)); - self->word_starts = malloc(sz * sizeof(int64_t)); - self->max_words_cap = sz; - self->words_cap = sz; - self->words_len = 0; - - // line pointers and metadata - self->line_start = malloc(sz * sizeof(int64_t)); - - self->line_fields = malloc(sz * sizeof(int64_t)); - - self->lines_cap = sz; - self->lines = 0; - self->file_lines = 0; - - if (self->stream == NULL || self->words == NULL || - self->word_starts == NULL || self->line_start == NULL || - self->line_fields == NULL) { - parser_cleanup(self); + int64_t sz; + + /* + Initialize data buffers + */ + + self->stream = NULL; + self->words = NULL; + self->word_starts = NULL; + self->line_start = NULL; + self->line_fields = NULL; + self->error_msg = NULL; + self->warn_msg = NULL; + + // token stream + self->stream = malloc(STREAM_INIT_SIZE * sizeof(char)); + if (self->stream == NULL) { + parser_cleanup(self); + return PARSER_OUT_OF_MEMORY; + } + self->stream_cap = STREAM_INIT_SIZE; + self->stream_len = 0; + + // word pointers and metadata + sz = STREAM_INIT_SIZE / 10; + sz = sz ? sz : 1; + self->words = malloc(sz * sizeof(char *)); + self->word_starts = malloc(sz * sizeof(int64_t)); + self->max_words_cap = sz; + self->words_cap = sz; + self->words_len = 0; + + // line pointers and metadata + self->line_start = malloc(sz * sizeof(int64_t)); + + self->line_fields = malloc(sz * sizeof(int64_t)); + + self->lines_cap = sz; + self->lines = 0; + self->file_lines = 0; + + if (self->stream == NULL || self->words == NULL || + self->word_starts == NULL || self->line_start == NULL || + self->line_fields == NULL) { + parser_cleanup(self); - return PARSER_OUT_OF_MEMORY; - } + return PARSER_OUT_OF_MEMORY; + } - /* amount of bytes buffered */ - self->datalen = 0; - self->datapos = 0; + /* amount of bytes buffered */ + self->datalen = 0; + self->datapos = 0; - self->line_start[0] = 0; - self->line_fields[0] = 0; + self->line_start[0] = 0; + self->line_fields[0] = 0; - self->pword_start = self->stream; - self->word_start = 0; + self->pword_start = self->stream; + self->word_start = 0; - self->state = START_RECORD; + self->state = START_RECORD; - self->error_msg = NULL; - self->warn_msg = NULL; + self->error_msg = NULL; + self->warn_msg = NULL; - self->commentchar = '\0'; + self->commentchar = '\0'; - return 0; + return 0; } void parser_free(parser_t *self) { - // opposite of parser_init - parser_cleanup(self); + // opposite of parser_init + parser_cleanup(self); } -void parser_del(parser_t *self) { - free(self); -} +void parser_del(parser_t *self) { free(self); } static int make_stream_space(parser_t *self, size_t nbytes) { - uint64_t i, cap, length; - int status; - void *orig_ptr, *newptr; + uint64_t i, cap, length; + int status; + void *orig_ptr, *newptr; - // Can we fit potentially nbytes tokens (+ null terminators) in the stream? + // Can we fit potentially nbytes tokens (+ null terminators) in the stream? - /* - TOKEN STREAM - */ + /* + TOKEN STREAM + */ - orig_ptr = (void *)self->stream; - TRACE( - ("\n\nmake_stream_space: nbytes = %zu. grow_buffer(self->stream...)\n", + orig_ptr = (void *)self->stream; + TRACE(("\n\nmake_stream_space: nbytes = %zu. grow_buffer(self->stream...)\n", nbytes)) - self->stream = (char *)grow_buffer((void *)self->stream, self->stream_len, - &self->stream_cap, nbytes * 2, - sizeof(char), &status); - TRACE( - ("make_stream_space: self->stream=%p, self->stream_len = %zu, " + self->stream = + (char *)grow_buffer((void *)self->stream, self->stream_len, + &self->stream_cap, nbytes * 2, sizeof(char), &status); + TRACE(("make_stream_space: self->stream=%p, self->stream_len = %zu, " "self->stream_cap=%zu, status=%zu\n", self->stream, self->stream_len, self->stream_cap, status)) - if (status != 0) { - return PARSER_OUT_OF_MEMORY; - } - - // realloc sets errno when moving buffer? - if (self->stream != orig_ptr) { - self->pword_start = self->stream + self->word_start; - - for (i = 0; i < self->words_len; ++i) { - self->words[i] = self->stream + self->word_starts[i]; - } - } - - /* - WORD VECTORS - */ - - cap = self->words_cap; - - /** - * If we are reading in chunks, we need to be aware of the maximum number - * of words we have seen in previous chunks (self->max_words_cap), so - * that way, we can properly allocate when reading subsequent ones. - * - * Otherwise, we risk a buffer overflow if we mistakenly under-allocate - * just because a recent chunk did not have as many words. - */ - if (self->words_len + nbytes < self->max_words_cap) { - length = self->max_words_cap - nbytes - 1; - } else { - length = self->words_len; - } - - self->words = - (char **)grow_buffer((void *)self->words, length, - &self->words_cap, nbytes, - sizeof(char *), &status); - TRACE( - ("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, " + if (status != 0) { + return PARSER_OUT_OF_MEMORY; + } + + // realloc sets errno when moving buffer? + if (self->stream != orig_ptr) { + self->pword_start = self->stream + self->word_start; + + for (i = 0; i < self->words_len; ++i) { + self->words[i] = self->stream + self->word_starts[i]; + } + } + + /* + WORD VECTORS + */ + + cap = self->words_cap; + + /** + * If we are reading in chunks, we need to be aware of the maximum number + * of words we have seen in previous chunks (self->max_words_cap), so + * that way, we can properly allocate when reading subsequent ones. + * + * Otherwise, we risk a buffer overflow if we mistakenly under-allocate + * just because a recent chunk did not have as many words. + */ + if (self->words_len + nbytes < self->max_words_cap) { + length = self->max_words_cap - nbytes - 1; + } else { + length = self->words_len; + } + + self->words = + (char **)grow_buffer((void *)self->words, length, &self->words_cap, + nbytes, sizeof(char *), &status); + TRACE(("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, " "%d)\n", self->words_len, self->words_cap, nbytes, status)) - if (status != 0) { - return PARSER_OUT_OF_MEMORY; - } - - // realloc took place - if (cap != self->words_cap) { - TRACE( - ("make_stream_space: cap != self->words_cap, nbytes = %d, " - "self->words_cap=%d\n", - nbytes, self->words_cap)) - newptr = realloc((void *)self->word_starts, - sizeof(int64_t) * self->words_cap); - if (newptr == NULL) { - return PARSER_OUT_OF_MEMORY; - } else { - self->word_starts = (int64_t *)newptr; - } - } - - /* - LINE VECTORS - */ - cap = self->lines_cap; - self->line_start = - (int64_t *)grow_buffer((void *)self->line_start, self->lines + 1, - &self->lines_cap, nbytes, - sizeof(int64_t), &status); - TRACE(( - "make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", - self->lines + 1, self->lines_cap, nbytes, status)) - if (status != 0) { - return PARSER_OUT_OF_MEMORY; - } - - // realloc took place - if (cap != self->lines_cap) { - TRACE(("make_stream_space: cap != self->lines_cap, nbytes = %d\n", - nbytes)) - newptr = realloc((void *)self->line_fields, - sizeof(int64_t) * self->lines_cap); - if (newptr == NULL) { - return PARSER_OUT_OF_MEMORY; - } else { - self->line_fields = (int64_t *)newptr; - } + if (status != 0) { + return PARSER_OUT_OF_MEMORY; + } + + // realloc took place + if (cap != self->words_cap) { + TRACE(("make_stream_space: cap != self->words_cap, nbytes = %d, " + "self->words_cap=%d\n", + nbytes, self->words_cap)) + newptr = + realloc((void *)self->word_starts, sizeof(int64_t) * self->words_cap); + if (newptr == NULL) { + return PARSER_OUT_OF_MEMORY; + } else { + self->word_starts = (int64_t *)newptr; + } + } + + /* + LINE VECTORS + */ + cap = self->lines_cap; + self->line_start = (int64_t *)grow_buffer((void *)self->line_start, + self->lines + 1, &self->lines_cap, + nbytes, sizeof(int64_t), &status); + TRACE( + ("make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", + self->lines + 1, self->lines_cap, nbytes, status)) + if (status != 0) { + return PARSER_OUT_OF_MEMORY; + } + + // realloc took place + if (cap != self->lines_cap) { + TRACE(("make_stream_space: cap != self->lines_cap, nbytes = %d\n", nbytes)) + newptr = + realloc((void *)self->line_fields, sizeof(int64_t) * self->lines_cap); + if (newptr == NULL) { + return PARSER_OUT_OF_MEMORY; + } else { + self->line_fields = (int64_t *)newptr; } + } - return 0; + return 0; } static int push_char(parser_t *self, char c) { - TRACE(("push_char: self->stream[%zu] = %x, stream_cap=%zu\n", - self->stream_len + 1, c, self->stream_cap)) - if (self->stream_len >= self->stream_cap) { - TRACE( - ("push_char: ERROR!!! self->stream_len(%d) >= " - "self->stream_cap(%d)\n", - self->stream_len, self->stream_cap)) - int64_t bufsize = 100; - self->error_msg = malloc(bufsize); - snprintf(self->error_msg, bufsize, - "Buffer overflow caught - possible malformed input file.\n"); - return PARSER_OUT_OF_MEMORY; - } - self->stream[self->stream_len++] = c; - return 0; + TRACE(("push_char: self->stream[%zu] = %x, stream_cap=%zu\n", + self->stream_len + 1, c, self->stream_cap)) + if (self->stream_len >= self->stream_cap) { + TRACE(("push_char: ERROR!!! self->stream_len(%d) >= " + "self->stream_cap(%d)\n", + self->stream_len, self->stream_cap)) + int64_t bufsize = 100; + self->error_msg = malloc(bufsize); + snprintf(self->error_msg, bufsize, + "Buffer overflow caught - possible malformed input file.\n"); + return PARSER_OUT_OF_MEMORY; + } + self->stream[self->stream_len++] = c; + return 0; } int PANDAS_INLINE end_field(parser_t *self) { - // XXX cruft - if (self->words_len >= self->words_cap) { - TRACE( - ("end_field: ERROR!!! self->words_len(%zu) >= " - "self->words_cap(%zu)\n", - self->words_len, self->words_cap)) - int64_t bufsize = 100; - self->error_msg = malloc(bufsize); - snprintf(self->error_msg, bufsize, - "Buffer overflow caught - possible malformed input file.\n"); - return PARSER_OUT_OF_MEMORY; - } + // XXX cruft + if (self->words_len >= self->words_cap) { + TRACE(("end_field: ERROR!!! self->words_len(%zu) >= " + "self->words_cap(%zu)\n", + self->words_len, self->words_cap)) + int64_t bufsize = 100; + self->error_msg = malloc(bufsize); + snprintf(self->error_msg, bufsize, + "Buffer overflow caught - possible malformed input file.\n"); + return PARSER_OUT_OF_MEMORY; + } - // null terminate token - push_char(self, '\0'); + // null terminate token + push_char(self, '\0'); - // set pointer and metadata - self->words[self->words_len] = self->pword_start; + // set pointer and metadata + self->words[self->words_len] = self->pword_start; - TRACE(("end_field: Char diff: %d\n", self->pword_start - self->words[0])); + TRACE(("end_field: Char diff: %d\n", self->pword_start - self->words[0])); - TRACE(("end_field: Saw word %s at: %d. Total: %d\n", self->pword_start, - self->word_start, self->words_len + 1)) + TRACE(("end_field: Saw word %s at: %d. Total: %d\n", self->pword_start, + self->word_start, self->words_len + 1)) - self->word_starts[self->words_len] = self->word_start; - self->words_len++; + self->word_starts[self->words_len] = self->word_start; + self->words_len++; - // increment line field count - self->line_fields[self->lines]++; + // increment line field count + self->line_fields[self->lines]++; - // New field begin in stream - self->pword_start = self->stream + self->stream_len; - self->word_start = self->stream_len; + // New field begin in stream + self->pword_start = self->stream + self->stream_len; + self->word_start = self->stream_len; - return 0; + return 0; } static void append_warning(parser_t *self, const char *msg) { - int64_t ex_length; - int64_t length = strlen(msg); - void *newptr; - - if (self->warn_msg == NULL) { - self->warn_msg = malloc(length + 1); - snprintf(self->warn_msg, length + 1, "%s", msg); - } else { - ex_length = strlen(self->warn_msg); - newptr = realloc(self->warn_msg, ex_length + length + 1); - if (newptr != NULL) { - self->warn_msg = (char *)newptr; - snprintf(self->warn_msg + ex_length, length + 1, "%s", msg); - } - } + int64_t ex_length; + int64_t length = strlen(msg); + void *newptr; + + if (self->warn_msg == NULL) { + self->warn_msg = malloc(length + 1); + snprintf(self->warn_msg, length + 1, "%s", msg); + } else { + ex_length = strlen(self->warn_msg); + newptr = realloc(self->warn_msg, ex_length + length + 1); + if (newptr != NULL) { + self->warn_msg = (char *)newptr; + snprintf(self->warn_msg + ex_length, length + 1, "%s", msg); + } + } } static int end_line(parser_t *self) { - char *msg; - int64_t fields; - int64_t ex_fields = self->expected_fields; - int64_t bufsize = 100; // for error or warning messages + char *msg; + int64_t fields; + int64_t ex_fields = self->expected_fields; + int64_t bufsize = 100; // for error or warning messages - fields = self->line_fields[self->lines]; + fields = self->line_fields[self->lines]; - TRACE(("end_line: Line end, nfields: %d\n", fields)); + TRACE(("end_line: Line end, nfields: %d\n", fields)); - TRACE(("end_line: lines: %d\n", self->lines)); - if (self->lines > 0) { - if (self->expected_fields >= 0) { - ex_fields = self->expected_fields; - } else { - ex_fields = self->line_fields[self->lines - 1]; - } + TRACE(("end_line: lines: %d\n", self->lines)); + if (self->lines > 0) { + if (self->expected_fields >= 0) { + ex_fields = self->expected_fields; + } else { + ex_fields = self->line_fields[self->lines - 1]; } - TRACE(("end_line: ex_fields: %d\n", ex_fields)); - - if (self->state == START_FIELD_IN_SKIP_LINE || - self->state == IN_FIELD_IN_SKIP_LINE || - self->state == IN_QUOTED_FIELD_IN_SKIP_LINE || - self->state == QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE) { - TRACE(("end_line: Skipping row %d\n", self->file_lines)); - // increment file line count - self->file_lines++; + } + TRACE(("end_line: ex_fields: %d\n", ex_fields)); - // skip the tokens from this bad line - self->line_start[self->lines] += fields; + if (self->state == START_FIELD_IN_SKIP_LINE || + self->state == IN_FIELD_IN_SKIP_LINE || + self->state == IN_QUOTED_FIELD_IN_SKIP_LINE || + self->state == QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE) { + TRACE(("end_line: Skipping row %d\n", self->file_lines)); + // increment file line count + self->file_lines++; - // reset field count - self->line_fields[self->lines] = 0; - return 0; - } + // skip the tokens from this bad line + self->line_start[self->lines] += fields; - if (!(self->lines <= self->header_end + 1) && - (fields > ex_fields) && !(self->usecols)) { - // increment file line count - self->file_lines++; + // reset field count + self->line_fields[self->lines] = 0; + return 0; + } - // skip the tokens from this bad line - self->line_start[self->lines] += fields; + if (!(self->lines <= self->header_end + 1) && (fields > ex_fields) && + !(self->usecols)) { + // increment file line count + self->file_lines++; - // reset field count - self->line_fields[self->lines] = 0; + // skip the tokens from this bad line + self->line_start[self->lines] += fields; - // file_lines is now the actual file line number (starting at 1) - if (self->on_bad_lines == ERROR) { - self->error_msg = malloc(bufsize); - snprintf(self->error_msg, bufsize, - "Expected %" PRId64 " fields in line %" PRIu64 ", saw %" - PRId64 "\n", ex_fields, self->file_lines, fields); + // reset field count + self->line_fields[self->lines] = 0; - TRACE(("Error at line %d, %d fields\n", self->file_lines, fields)); + // file_lines is now the actual file line number (starting at 1) + if (self->on_bad_lines == ERROR) { + self->error_msg = malloc(bufsize); + snprintf(self->error_msg, bufsize, + "Expected %" PRId64 " fields in line %" PRIu64 ", saw %" PRId64 + "\n", + ex_fields, self->file_lines, fields); - return -1; - } else { - // simply skip bad lines - if (self->on_bad_lines == WARN) { - // pass up error message - msg = malloc(bufsize); - snprintf(msg, bufsize, - "Skipping line %" PRIu64 ": expected %" PRId64 - " fields, saw %" PRId64 "\n", - self->file_lines, ex_fields, fields); - append_warning(self, msg); - free(msg); - } - } + TRACE(("Error at line %d, %d fields\n", self->file_lines, fields)); + + return -1; } else { - // missing trailing delimiters - if ((self->lines >= self->header_end + 1) && - fields < ex_fields) { - // might overrun the buffer when closing fields - if (make_stream_space(self, ex_fields - fields) < 0) { - int64_t bufsize = 100; - self->error_msg = malloc(bufsize); - snprintf(self->error_msg, bufsize, "out of memory"); - return -1; - } - - while (fields < ex_fields) { - end_field(self); - fields++; - } - } + // simply skip bad lines + if (self->on_bad_lines == WARN) { + // pass up error message + msg = malloc(bufsize); + snprintf(msg, bufsize, + "Skipping line %" PRIu64 ": expected %" PRId64 + " fields, saw %" PRId64 "\n", + self->file_lines, ex_fields, fields); + append_warning(self, msg); + free(msg); + } + } + } else { + // missing trailing delimiters + if ((self->lines >= self->header_end + 1) && fields < ex_fields) { + // might overrun the buffer when closing fields + if (make_stream_space(self, ex_fields - fields) < 0) { + int64_t bufsize = 100; + self->error_msg = malloc(bufsize); + snprintf(self->error_msg, bufsize, "out of memory"); + return -1; + } - // increment both line counts - self->file_lines++; - self->lines++; - - // good line, set new start point - if (self->lines >= self->lines_cap) { - TRACE(( - "end_line: ERROR!!! self->lines(%zu) >= self->lines_cap(%zu)\n", - self->lines, self->lines_cap)) - int64_t bufsize = 100; - self->error_msg = malloc(bufsize); - snprintf(self->error_msg, bufsize, - "Buffer overflow caught - " - "possible malformed input file.\n"); - return PARSER_OUT_OF_MEMORY; - } - self->line_start[self->lines] = - (self->line_start[self->lines - 1] + fields); + while (fields < ex_fields) { + end_field(self); + fields++; + } + } - TRACE( - ("end_line: new line start: %d\n", self->line_start[self->lines])); + // increment both line counts + self->file_lines++; + self->lines++; - // new line start with 0 fields - self->line_fields[self->lines] = 0; + // good line, set new start point + if (self->lines >= self->lines_cap) { + TRACE(("end_line: ERROR!!! self->lines(%zu) >= self->lines_cap(%zu)\n", + self->lines, self->lines_cap)) + int64_t bufsize = 100; + self->error_msg = malloc(bufsize); + snprintf(self->error_msg, bufsize, + "Buffer overflow caught - " + "possible malformed input file.\n"); + return PARSER_OUT_OF_MEMORY; } + self->line_start[self->lines] = + (self->line_start[self->lines - 1] + fields); - TRACE(("end_line: Finished line, at %d\n", self->lines)); + TRACE(("end_line: new line start: %d\n", self->line_start[self->lines])); - return 0; + // new line start with 0 fields + self->line_fields[self->lines] = 0; + } + + TRACE(("end_line: Finished line, at %d\n", self->lines)); + + return 0; } int parser_add_skiprow(parser_t *self, int64_t row) { - khiter_t k; - kh_int64_t *set; - int ret = 0; + khiter_t k; + kh_int64_t *set; + int ret = 0; - if (self->skipset == NULL) { - self->skipset = (void *)kh_init_int64(); - } + if (self->skipset == NULL) { + self->skipset = (void *)kh_init_int64(); + } - set = (kh_int64_t *)self->skipset; + set = (kh_int64_t *)self->skipset; - k = kh_put_int64(set, row, &ret); - set->keys[k] = row; + k = kh_put_int64(set, row, &ret); + set->keys[k] = row; - return 0; + return 0; } int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) { - // self->file_lines is zero based so subtract 1 from nrows - if (nrows > 0) { - self->skip_first_N_rows = nrows - 1; - } + // self->file_lines is zero based so subtract 1 from nrows + if (nrows > 0) { + self->skip_first_N_rows = nrows - 1; + } - return 0; + return 0; } static int parser_buffer_bytes(parser_t *self, size_t nbytes, const char *encoding_errors) { - int status; - size_t bytes_read; - - status = 0; - self->datapos = 0; - self->data = self->cb_io(self->source, nbytes, &bytes_read, &status, - encoding_errors); - TRACE(( - "parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n", - nbytes, bytes_read, status)); - self->datalen = bytes_read; - - if (status != REACHED_EOF && self->data == NULL) { - int64_t bufsize = 200; - self->error_msg = malloc(bufsize); - - if (status == CALLING_READ_FAILED) { - snprintf(self->error_msg, bufsize, - "Calling read(nbytes) on source failed. " - "Try engine='python'."); - } else { - snprintf(self->error_msg, bufsize, "Unknown error in IO callback"); - } - return -1; + int status; + size_t bytes_read; + + status = 0; + self->datapos = 0; + self->data = + self->cb_io(self->source, nbytes, &bytes_read, &status, encoding_errors); + TRACE( + ("parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n", + nbytes, bytes_read, status)); + self->datalen = bytes_read; + + if (status != REACHED_EOF && self->data == NULL) { + int64_t bufsize = 200; + self->error_msg = malloc(bufsize); + + if (status == CALLING_READ_FAILED) { + snprintf(self->error_msg, bufsize, + "Calling read(nbytes) on source failed. " + "Try engine='python'."); + } else { + snprintf(self->error_msg, bufsize, "Unknown error in IO callback"); } + return -1; + } - TRACE(("datalen: %d\n", self->datalen)); + TRACE(("datalen: %d\n", self->datalen)); - return status; + return status; } /* @@ -593,63 +580,61 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes, */ -#define PUSH_CHAR(c) \ - TRACE( \ - ("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", \ - c, slen, self->stream_cap, self->stream_len)) \ - if (slen >= self->stream_cap) { \ - TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= stream_cap(%d)\n", slen, \ - self->stream_cap)) \ - int64_t bufsize = 100; \ - self->error_msg = malloc(bufsize); \ - snprintf(self->error_msg, bufsize, \ - "Buffer overflow caught - possible malformed input file.\n");\ - return PARSER_OUT_OF_MEMORY; \ - } \ - *stream++ = c; \ - slen++; +#define PUSH_CHAR(c) \ + TRACE(("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", \ + c, slen, self->stream_cap, self->stream_len)) \ + if (slen >= self->stream_cap) { \ + TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= stream_cap(%d)\n", slen, \ + self->stream_cap)) \ + int64_t bufsize = 100; \ + self->error_msg = malloc(bufsize); \ + snprintf(self->error_msg, bufsize, \ + "Buffer overflow caught - possible malformed input file.\n"); \ + return PARSER_OUT_OF_MEMORY; \ + } \ + *stream++ = c; \ + slen++; // This is a little bit of a hack but works for now -#define END_FIELD() \ - self->stream_len = slen; \ - if (end_field(self) < 0) { \ - goto parsingerror; \ - } \ - stream = self->stream + self->stream_len; \ - slen = self->stream_len; - -#define END_LINE_STATE(STATE) \ - self->stream_len = slen; \ - if (end_line(self) < 0) { \ - goto parsingerror; \ - } \ - stream = self->stream + self->stream_len; \ - slen = self->stream_len; \ - self->state = STATE; \ - if (line_limit > 0 && self->lines == start_lines + line_limit) { \ - goto linelimit; \ - } - -#define END_LINE_AND_FIELD_STATE(STATE) \ - self->stream_len = slen; \ - if (end_line(self) < 0) { \ - goto parsingerror; \ - } \ - if (end_field(self) < 0) { \ - goto parsingerror; \ - } \ - stream = self->stream + self->stream_len; \ - slen = self->stream_len; \ - self->state = STATE; \ - if (line_limit > 0 && self->lines == start_lines + line_limit) { \ - goto linelimit; \ - } +#define END_FIELD() \ + self->stream_len = slen; \ + if (end_field(self) < 0) { \ + goto parsingerror; \ + } \ + stream = self->stream + self->stream_len; \ + slen = self->stream_len; + +#define END_LINE_STATE(STATE) \ + self->stream_len = slen; \ + if (end_line(self) < 0) { \ + goto parsingerror; \ + } \ + stream = self->stream + self->stream_len; \ + slen = self->stream_len; \ + self->state = STATE; \ + if (line_limit > 0 && self->lines == start_lines + line_limit) { \ + goto linelimit; \ + } + +#define END_LINE_AND_FIELD_STATE(STATE) \ + self->stream_len = slen; \ + if (end_line(self) < 0) { \ + goto parsingerror; \ + } \ + if (end_field(self) < 0) { \ + goto parsingerror; \ + } \ + stream = self->stream + self->stream_len; \ + slen = self->stream_len; \ + self->state = STATE; \ + if (line_limit > 0 && self->lines == start_lines + line_limit) { \ + goto linelimit; \ + } #define END_LINE() END_LINE_STATE(START_RECORD) -#define IS_TERMINATOR(c) \ - (c == lineterminator) +#define IS_TERMINATOR(c) (c == lineterminator) #define IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE)) @@ -660,678 +645,671 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes, #define IS_ESCAPE_CHAR(c) (c == escape_symbol) -#define IS_SKIPPABLE_SPACE(c) \ - ((!self->delim_whitespace && c == ' ' && self->skipinitialspace)) +#define IS_SKIPPABLE_SPACE(c) \ + ((!self->delim_whitespace && c == ' ' && self->skipinitialspace)) // applied when in a field -#define IS_DELIMITER(c) \ - ((!delim_whitespace && c == delimiter) || (delim_whitespace && isblank(c))) - -#define _TOKEN_CLEANUP() \ - self->stream_len = slen; \ - self->datapos = i; \ - TRACE(("_TOKEN_CLEANUP: datapos: %d, datalen: %d\n", self->datapos, \ - self->datalen)); - -#define CHECK_FOR_BOM() \ - if (*buf == '\xef' && *(buf + 1) == '\xbb' && *(buf + 2) == '\xbf') { \ - buf += 3; \ - self->datapos += 3; \ - } +#define IS_DELIMITER(c) \ + ((!delim_whitespace && c == delimiter) || (delim_whitespace && isblank(c))) -int skip_this_line(parser_t *self, int64_t rownum) { - int should_skip; - PyObject *result; - PyGILState_STATE state; - - if (self->skipfunc != NULL) { - state = PyGILState_Ensure(); - result = PyObject_CallFunction(self->skipfunc, "i", rownum); - - // Error occurred. It will be processed - // and caught at the Cython level. - if (result == NULL) { - should_skip = -1; - } else { - should_skip = PyObject_IsTrue(result); - } +#define _TOKEN_CLEANUP() \ + self->stream_len = slen; \ + self->datapos = i; \ + TRACE(("_TOKEN_CLEANUP: datapos: %d, datalen: %d\n", self->datapos, \ + self->datalen)); - Py_XDECREF(result); - PyGILState_Release(state); +#define CHECK_FOR_BOM() \ + if (*buf == '\xef' && *(buf + 1) == '\xbb' && *(buf + 2) == '\xbf') { \ + buf += 3; \ + self->datapos += 3; \ + } - return should_skip; - } else if (self->skipset != NULL) { - return (kh_get_int64((kh_int64_t *)self->skipset, self->file_lines) != - ((kh_int64_t *)self->skipset)->n_buckets); +int skip_this_line(parser_t *self, int64_t rownum) { + int should_skip; + PyObject *result; + PyGILState_STATE state; + + if (self->skipfunc != NULL) { + state = PyGILState_Ensure(); + result = PyObject_CallFunction(self->skipfunc, "i", rownum); + + // Error occurred. It will be processed + // and caught at the Cython level. + if (result == NULL) { + should_skip = -1; } else { - return (rownum <= self->skip_first_N_rows); + should_skip = PyObject_IsTrue(result); } + + Py_XDECREF(result); + PyGILState_Release(state); + + return should_skip; + } else if (self->skipset != NULL) { + return (kh_get_int64((kh_int64_t *)self->skipset, self->file_lines) != + ((kh_int64_t *)self->skipset)->n_buckets); + } else { + return (rownum <= self->skip_first_N_rows); + } } -int tokenize_bytes(parser_t *self, - size_t line_limit, uint64_t start_lines) { - int64_t i; - uint64_t slen; - int should_skip; - char c; - char *stream; - char *buf = self->data + self->datapos; - - const char lineterminator = (self->lineterminator == '\0') ? - '\n' : self->lineterminator; - - const int delim_whitespace = self->delim_whitespace; - const char delimiter = self->delimiter; - - // 1000 is something that couldn't fit in "char" - // thus comparing a char to it would always be "false" - const int carriage_symbol = (self->lineterminator == '\0') ? '\r' : 1000; - const int comment_symbol = (self->commentchar != '\0') ? - self->commentchar : 1000; - const int escape_symbol = (self->escapechar != '\0') ? - self->escapechar : 1000; - - if (make_stream_space(self, self->datalen - self->datapos) < 0) { - int64_t bufsize = 100; - self->error_msg = malloc(bufsize); - snprintf(self->error_msg, bufsize, "out of memory"); - return -1; - } +int tokenize_bytes(parser_t *self, size_t line_limit, uint64_t start_lines) { + int64_t i; + uint64_t slen; + int should_skip; + char c; + char *stream; + char *buf = self->data + self->datapos; + + const char lineterminator = + (self->lineterminator == '\0') ? '\n' : self->lineterminator; + + const int delim_whitespace = self->delim_whitespace; + const char delimiter = self->delimiter; + + // 1000 is something that couldn't fit in "char" + // thus comparing a char to it would always be "false" + const int carriage_symbol = (self->lineterminator == '\0') ? '\r' : 1000; + const int comment_symbol = + (self->commentchar != '\0') ? self->commentchar : 1000; + const int escape_symbol = + (self->escapechar != '\0') ? self->escapechar : 1000; + + if (make_stream_space(self, self->datalen - self->datapos) < 0) { + int64_t bufsize = 100; + self->error_msg = malloc(bufsize); + snprintf(self->error_msg, bufsize, "out of memory"); + return -1; + } - stream = self->stream + self->stream_len; - slen = self->stream_len; + stream = self->stream + self->stream_len; + slen = self->stream_len; - TRACE(("%s\n", buf)); + TRACE(("%s\n", buf)); - if (self->file_lines == 0) { - CHECK_FOR_BOM(); - } + if (self->file_lines == 0) { + CHECK_FOR_BOM(); + } + + for (i = self->datapos; i < self->datalen; ++i) { + // next character in file + c = *buf++; + + TRACE(("tokenize_bytes - Iter: %d Char: 0x%x Line %d field_count %d, " + "state %d\n", + i, c, self->file_lines + 1, self->line_fields[self->lines], + self->state)); + + switch (self->state) { + case START_FIELD_IN_SKIP_LINE: + if (IS_TERMINATOR(c)) { + END_LINE(); + } else if (IS_CARRIAGE(c)) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } else if (IS_QUOTE(c)) { + self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; + } else if (IS_DELIMITER(c)) { + // Do nothing, we're starting a new field again. + } else { + self->state = IN_FIELD_IN_SKIP_LINE; + } + break; + + case IN_FIELD_IN_SKIP_LINE: + if (IS_TERMINATOR(c)) { + END_LINE(); + } else if (IS_CARRIAGE(c)) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } else if (IS_DELIMITER(c)) { + self->state = START_FIELD_IN_SKIP_LINE; + } + break; + + case IN_QUOTED_FIELD_IN_SKIP_LINE: + if (IS_QUOTE(c)) { + if (self->doublequote) { + self->state = QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE; + } else { + self->state = IN_FIELD_IN_SKIP_LINE; + } + } + break; + + case QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE: + if (IS_QUOTE(c)) { + self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; + } else if (IS_TERMINATOR(c)) { + END_LINE(); + } else if (IS_CARRIAGE(c)) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } else if (IS_DELIMITER(c)) { + self->state = START_FIELD_IN_SKIP_LINE; + } else { + self->state = IN_FIELD_IN_SKIP_LINE; + } + break; + + case WHITESPACE_LINE: + if (IS_TERMINATOR(c)) { + self->file_lines++; + self->state = START_RECORD; + break; + } else if (IS_CARRIAGE(c)) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + break; + } else if (!self->delim_whitespace) { + if (isblank(c) && c != self->delimiter) { + } else { // backtrack + // use i + 1 because buf has been incremented but not i + do { + --buf; + --i; + } while (i + 1 > self->datapos && !IS_TERMINATOR(*buf)); + + // reached a newline rather than the beginning + if (IS_TERMINATOR(*buf)) { + ++buf; // move pointer to first char after newline + ++i; + } + self->state = START_FIELD; + } + break; + } + // fall through + + case EAT_WHITESPACE: + if (IS_TERMINATOR(c)) { + END_LINE(); + self->state = START_RECORD; + break; + } else if (IS_CARRIAGE(c)) { + self->state = EAT_CRNL; + break; + } else if (IS_COMMENT_CHAR(c)) { + self->state = EAT_COMMENT; + break; + } else if (!isblank(c)) { + self->state = START_FIELD; + // fall through to subsequent state + } else { + // if whitespace char, keep slurping + break; + } + + case START_RECORD: + // start of record + should_skip = skip_this_line(self, self->file_lines); + + if (should_skip == -1) { + goto parsingerror; + } else if (should_skip) { + if (IS_QUOTE(c)) { + self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; + } else { + self->state = IN_FIELD_IN_SKIP_LINE; - for (i = self->datapos; i < self->datalen; ++i) { - // next character in file - c = *buf++; - - TRACE( - ("tokenize_bytes - Iter: %d Char: 0x%x Line %d field_count %d, " - "state %d\n", - i, c, self->file_lines + 1, self->line_fields[self->lines], - self->state)); - - switch (self->state) { - case START_FIELD_IN_SKIP_LINE: - if (IS_TERMINATOR(c)) { - END_LINE(); - } else if (IS_CARRIAGE(c)) { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } else if (IS_QUOTE(c)) { - self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; - } else if (IS_DELIMITER(c)) { - // Do nothing, we're starting a new field again. - } else { - self->state = IN_FIELD_IN_SKIP_LINE; - } - break; - - case IN_FIELD_IN_SKIP_LINE: - if (IS_TERMINATOR(c)) { - END_LINE(); - } else if (IS_CARRIAGE(c)) { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } else if (IS_DELIMITER(c)) { - self->state = START_FIELD_IN_SKIP_LINE; - } - break; - - case IN_QUOTED_FIELD_IN_SKIP_LINE: - if (IS_QUOTE(c)) { - if (self->doublequote) { - self->state = QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE; - } else { - self->state = IN_FIELD_IN_SKIP_LINE; - } - } - break; - - case QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE: - if (IS_QUOTE(c)) { - self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; - } else if (IS_TERMINATOR(c)) { - END_LINE(); - } else if (IS_CARRIAGE(c)) { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } else if (IS_DELIMITER(c)) { - self->state = START_FIELD_IN_SKIP_LINE; - } else { - self->state = IN_FIELD_IN_SKIP_LINE; - } - break; - - case WHITESPACE_LINE: - if (IS_TERMINATOR(c)) { - self->file_lines++; - self->state = START_RECORD; - break; - } else if (IS_CARRIAGE(c)) { - self->file_lines++; - self->state = EAT_CRNL_NOP; - break; - } else if (!self->delim_whitespace) { - if (isblank(c) && c != self->delimiter) { - } else { // backtrack - // use i + 1 because buf has been incremented but not i - do { - --buf; - --i; - } while (i + 1 > self->datapos && !IS_TERMINATOR(*buf)); - - // reached a newline rather than the beginning - if (IS_TERMINATOR(*buf)) { - ++buf; // move pointer to first char after newline - ++i; - } - self->state = START_FIELD; - } - break; - } - // fall through - - case EAT_WHITESPACE: - if (IS_TERMINATOR(c)) { - END_LINE(); - self->state = START_RECORD; - break; - } else if (IS_CARRIAGE(c)) { - self->state = EAT_CRNL; - break; - } else if (IS_COMMENT_CHAR(c)) { - self->state = EAT_COMMENT; - break; - } else if (!isblank(c)) { - self->state = START_FIELD; - // fall through to subsequent state - } else { - // if whitespace char, keep slurping - break; - } - - case START_RECORD: - // start of record - should_skip = skip_this_line(self, self->file_lines); - - if (should_skip == -1) { - goto parsingerror; - } else if (should_skip) { - if (IS_QUOTE(c)) { - self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; - } else { - self->state = IN_FIELD_IN_SKIP_LINE; - - if (IS_TERMINATOR(c)) { - END_LINE(); - } - } - break; - } else if (IS_TERMINATOR(c)) { - // \n\r possible? - if (self->skip_empty_lines) { - self->file_lines++; - } else { - END_LINE(); - } - break; - } else if (IS_CARRIAGE(c)) { - if (self->skip_empty_lines) { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } else { - self->state = EAT_CRNL; - } - break; - } else if (IS_COMMENT_CHAR(c)) { - self->state = EAT_LINE_COMMENT; - break; - } else if (isblank(c)) { - if (self->delim_whitespace) { - if (self->skip_empty_lines) { - self->state = WHITESPACE_LINE; - } else { - self->state = EAT_WHITESPACE; - } - break; - } else if (c != self->delimiter && self->skip_empty_lines) { - self->state = WHITESPACE_LINE; - break; - } - // fall through - } - - // normal character - fall through - // to handle as START_FIELD - self->state = START_FIELD; - - case START_FIELD: - // expecting field - if (IS_TERMINATOR(c)) { - END_FIELD(); - END_LINE(); - } else if (IS_CARRIAGE(c)) { - END_FIELD(); - self->state = EAT_CRNL; - } else if (IS_QUOTE(c)) { - // start quoted field - self->state = IN_QUOTED_FIELD; - } else if (IS_ESCAPE_CHAR(c)) { - // possible escaped character - self->state = ESCAPED_CHAR; - } else if (IS_SKIPPABLE_SPACE(c)) { - // ignore space at start of field - } else if (IS_DELIMITER(c)) { - if (self->delim_whitespace) { - self->state = EAT_WHITESPACE; - } else { - // save empty field - END_FIELD(); - } - } else if (IS_COMMENT_CHAR(c)) { - END_FIELD(); - self->state = EAT_COMMENT; - } else { - // begin new unquoted field - PUSH_CHAR(c); - self->state = IN_FIELD; - } - break; - - case ESCAPED_CHAR: - PUSH_CHAR(c); - self->state = IN_FIELD; - break; - - case EAT_LINE_COMMENT: - if (IS_TERMINATOR(c)) { - self->file_lines++; - self->state = START_RECORD; - } else if (IS_CARRIAGE(c)) { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } - break; - - case IN_FIELD: - // in unquoted field - if (IS_TERMINATOR(c)) { - END_FIELD(); - END_LINE(); - } else if (IS_CARRIAGE(c)) { - END_FIELD(); - self->state = EAT_CRNL; - } else if (IS_ESCAPE_CHAR(c)) { - // possible escaped character - self->state = ESCAPED_CHAR; - } else if (IS_DELIMITER(c)) { - // end of field - end of line not reached yet - END_FIELD(); - - if (self->delim_whitespace) { - self->state = EAT_WHITESPACE; - } else { - self->state = START_FIELD; - } - } else if (IS_COMMENT_CHAR(c)) { - END_FIELD(); - self->state = EAT_COMMENT; - } else { - // normal character - save in field - PUSH_CHAR(c); - } - break; - - case IN_QUOTED_FIELD: - // in quoted field - if (IS_ESCAPE_CHAR(c)) { - // possible escape character - self->state = ESCAPE_IN_QUOTED_FIELD; - } else if (IS_QUOTE(c)) { - if (self->doublequote) { - // double quote - " represented by "" - self->state = QUOTE_IN_QUOTED_FIELD; - } else { - // end of quote part of field - self->state = IN_FIELD; - } - } else { - // normal character - save in field - PUSH_CHAR(c); - } - break; - - case ESCAPE_IN_QUOTED_FIELD: - PUSH_CHAR(c); - self->state = IN_QUOTED_FIELD; - break; - - case QUOTE_IN_QUOTED_FIELD: - // double quote - seen a quote in an quoted field - if (IS_QUOTE(c)) { - // save "" as " - - PUSH_CHAR(c); - self->state = IN_QUOTED_FIELD; - } else if (IS_DELIMITER(c)) { - // end of field - end of line not reached yet - END_FIELD(); - - if (self->delim_whitespace) { - self->state = EAT_WHITESPACE; - } else { - self->state = START_FIELD; - } - } else if (IS_TERMINATOR(c)) { - END_FIELD(); - END_LINE(); - } else if (IS_CARRIAGE(c)) { - END_FIELD(); - self->state = EAT_CRNL; - } else { - PUSH_CHAR(c); - self->state = IN_FIELD; - } - break; - - case EAT_COMMENT: - if (IS_TERMINATOR(c)) { - END_LINE(); - } else if (IS_CARRIAGE(c)) { - self->state = EAT_CRNL; - } - break; - - // only occurs with non-custom line terminator, - // which is why we directly check for '\n' - case EAT_CRNL: - if (c == '\n') { - END_LINE(); - } else if (IS_DELIMITER(c)) { - if (self->delim_whitespace) { - END_LINE_STATE(EAT_WHITESPACE); - } else { - // Handle \r-delimited files - END_LINE_AND_FIELD_STATE(START_FIELD); - } - } else { - if (self->delim_whitespace) { - /* XXX - * first character of a new record--need to back up and - * reread - * to handle properly... - */ - i--; - buf--; // back up one character (HACK!) - END_LINE_STATE(START_RECORD); - } else { - // \r line terminator - // UGH. we don't actually want - // to consume the token. fix this later - self->stream_len = slen; - if (end_line(self) < 0) { - goto parsingerror; - } - - stream = self->stream + self->stream_len; - slen = self->stream_len; - self->state = START_RECORD; - - --i; - buf--; // let's try this character again (HACK!) - if (line_limit > 0 && - self->lines == start_lines + line_limit) { - goto linelimit; - } - } - } - break; - - // only occurs with non-custom line terminator, - // which is why we directly check for '\n' - case EAT_CRNL_NOP: // inside an ignored comment line - self->state = START_RECORD; - // \r line terminator -- parse this character again - if (c != '\n' && !IS_DELIMITER(c)) { - --i; - --buf; - } - break; - default: - break; + if (IS_TERMINATOR(c)) { + END_LINE(); + } + } + break; + } else if (IS_TERMINATOR(c)) { + // \n\r possible? + if (self->skip_empty_lines) { + self->file_lines++; + } else { + END_LINE(); + } + break; + } else if (IS_CARRIAGE(c)) { + if (self->skip_empty_lines) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } else { + self->state = EAT_CRNL; + } + break; + } else if (IS_COMMENT_CHAR(c)) { + self->state = EAT_LINE_COMMENT; + break; + } else if (isblank(c)) { + if (self->delim_whitespace) { + if (self->skip_empty_lines) { + self->state = WHITESPACE_LINE; + } else { + self->state = EAT_WHITESPACE; + } + break; + } else if (c != self->delimiter && self->skip_empty_lines) { + self->state = WHITESPACE_LINE; + break; } + // fall through + } + + // normal character - fall through + // to handle as START_FIELD + self->state = START_FIELD; + + case START_FIELD: + // expecting field + if (IS_TERMINATOR(c)) { + END_FIELD(); + END_LINE(); + } else if (IS_CARRIAGE(c)) { + END_FIELD(); + self->state = EAT_CRNL; + } else if (IS_QUOTE(c)) { + // start quoted field + self->state = IN_QUOTED_FIELD; + } else if (IS_ESCAPE_CHAR(c)) { + // possible escaped character + self->state = ESCAPED_CHAR; + } else if (IS_SKIPPABLE_SPACE(c)) { + // ignore space at start of field + } else if (IS_DELIMITER(c)) { + if (self->delim_whitespace) { + self->state = EAT_WHITESPACE; + } else { + // save empty field + END_FIELD(); + } + } else if (IS_COMMENT_CHAR(c)) { + END_FIELD(); + self->state = EAT_COMMENT; + } else { + // begin new unquoted field + PUSH_CHAR(c); + self->state = IN_FIELD; + } + break; + + case ESCAPED_CHAR: + PUSH_CHAR(c); + self->state = IN_FIELD; + break; + + case EAT_LINE_COMMENT: + if (IS_TERMINATOR(c)) { + self->file_lines++; + self->state = START_RECORD; + } else if (IS_CARRIAGE(c)) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } + break; + + case IN_FIELD: + // in unquoted field + if (IS_TERMINATOR(c)) { + END_FIELD(); + END_LINE(); + } else if (IS_CARRIAGE(c)) { + END_FIELD(); + self->state = EAT_CRNL; + } else if (IS_ESCAPE_CHAR(c)) { + // possible escaped character + self->state = ESCAPED_CHAR; + } else if (IS_DELIMITER(c)) { + // end of field - end of line not reached yet + END_FIELD(); + + if (self->delim_whitespace) { + self->state = EAT_WHITESPACE; + } else { + self->state = START_FIELD; + } + } else if (IS_COMMENT_CHAR(c)) { + END_FIELD(); + self->state = EAT_COMMENT; + } else { + // normal character - save in field + PUSH_CHAR(c); + } + break; + + case IN_QUOTED_FIELD: + // in quoted field + if (IS_ESCAPE_CHAR(c)) { + // possible escape character + self->state = ESCAPE_IN_QUOTED_FIELD; + } else if (IS_QUOTE(c)) { + if (self->doublequote) { + // double quote - " represented by "" + self->state = QUOTE_IN_QUOTED_FIELD; + } else { + // end of quote part of field + self->state = IN_FIELD; + } + } else { + // normal character - save in field + PUSH_CHAR(c); + } + break; + + case ESCAPE_IN_QUOTED_FIELD: + PUSH_CHAR(c); + self->state = IN_QUOTED_FIELD; + break; + + case QUOTE_IN_QUOTED_FIELD: + // double quote - seen a quote in an quoted field + if (IS_QUOTE(c)) { + // save "" as " + + PUSH_CHAR(c); + self->state = IN_QUOTED_FIELD; + } else if (IS_DELIMITER(c)) { + // end of field - end of line not reached yet + END_FIELD(); + + if (self->delim_whitespace) { + self->state = EAT_WHITESPACE; + } else { + self->state = START_FIELD; + } + } else if (IS_TERMINATOR(c)) { + END_FIELD(); + END_LINE(); + } else if (IS_CARRIAGE(c)) { + END_FIELD(); + self->state = EAT_CRNL; + } else { + PUSH_CHAR(c); + self->state = IN_FIELD; + } + break; + + case EAT_COMMENT: + if (IS_TERMINATOR(c)) { + END_LINE(); + } else if (IS_CARRIAGE(c)) { + self->state = EAT_CRNL; + } + break; + + // only occurs with non-custom line terminator, + // which is why we directly check for '\n' + case EAT_CRNL: + if (c == '\n') { + END_LINE(); + } else if (IS_DELIMITER(c)) { + if (self->delim_whitespace) { + END_LINE_STATE(EAT_WHITESPACE); + } else { + // Handle \r-delimited files + END_LINE_AND_FIELD_STATE(START_FIELD); + } + } else { + if (self->delim_whitespace) { + /* XXX + * first character of a new record--need to back up and + * reread + * to handle properly... + */ + i--; + buf--; // back up one character (HACK!) + END_LINE_STATE(START_RECORD); + } else { + // \r line terminator + // UGH. we don't actually want + // to consume the token. fix this later + self->stream_len = slen; + if (end_line(self) < 0) { + goto parsingerror; + } + + stream = self->stream + self->stream_len; + slen = self->stream_len; + self->state = START_RECORD; + + --i; + buf--; // let's try this character again (HACK!) + if (line_limit > 0 && self->lines == start_lines + line_limit) { + goto linelimit; + } + } + } + break; + + // only occurs with non-custom line terminator, + // which is why we directly check for '\n' + case EAT_CRNL_NOP: // inside an ignored comment line + self->state = START_RECORD; + // \r line terminator -- parse this character again + if (c != '\n' && !IS_DELIMITER(c)) { + --i; + --buf; + } + break; + default: + break; } + } - _TOKEN_CLEANUP(); + _TOKEN_CLEANUP(); - TRACE(("Finished tokenizing input\n")) + TRACE(("Finished tokenizing input\n")) - return 0; + return 0; parsingerror: - i++; - _TOKEN_CLEANUP(); + i++; + _TOKEN_CLEANUP(); - return -1; + return -1; linelimit: - i++; - _TOKEN_CLEANUP(); + i++; + _TOKEN_CLEANUP(); - return 0; + return 0; } static int parser_handle_eof(parser_t *self) { - int64_t bufsize = 100; - - TRACE( - ("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state)) + int64_t bufsize = 100; - if (self->datalen != 0) return -1; + TRACE(("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state)) - switch (self->state) { - case START_RECORD: - case WHITESPACE_LINE: - case EAT_CRNL_NOP: - case EAT_LINE_COMMENT: - return 0; - - case ESCAPE_IN_QUOTED_FIELD: - case IN_QUOTED_FIELD: - self->error_msg = (char *)malloc(bufsize); - snprintf(self->error_msg, bufsize, - "EOF inside string starting at row %" PRIu64, - self->file_lines); - return -1; - - case ESCAPED_CHAR: - self->error_msg = (char *)malloc(bufsize); - snprintf(self->error_msg, bufsize, - "EOF following escape character"); - return -1; - - case IN_FIELD: - case START_FIELD: - case QUOTE_IN_QUOTED_FIELD: - if (end_field(self) < 0) return -1; - break; - - default: - break; - } - - if (end_line(self) < 0) - return -1; - else - return 0; -} + if (self->datalen != 0) + return -1; -int parser_consume_rows(parser_t *self, size_t nrows) { - int64_t offset, word_deletions; - uint64_t char_count, i; + switch (self->state) { + case START_RECORD: + case WHITESPACE_LINE: + case EAT_CRNL_NOP: + case EAT_LINE_COMMENT: + return 0; - if (nrows > self->lines) { - nrows = self->lines; - } + case ESCAPE_IN_QUOTED_FIELD: + case IN_QUOTED_FIELD: + self->error_msg = (char *)malloc(bufsize); + snprintf(self->error_msg, bufsize, + "EOF inside string starting at row %" PRIu64, self->file_lines); + return -1; - /* do nothing */ - if (nrows == 0) return 0; + case ESCAPED_CHAR: + self->error_msg = (char *)malloc(bufsize); + snprintf(self->error_msg, bufsize, "EOF following escape character"); + return -1; - /* cannot guarantee that nrows + 1 has been observed */ - word_deletions = self->line_start[nrows - 1] + self->line_fields[nrows - 1]; - if (word_deletions >= 1) { - char_count = (self->word_starts[word_deletions - 1] + - strlen(self->words[word_deletions - 1]) + 1); - } else { - /* if word_deletions == 0 (i.e. this case) then char_count must - * be 0 too, as no data needs to be skipped */ - char_count = 0; - } + case IN_FIELD: + case START_FIELD: + case QUOTE_IN_QUOTED_FIELD: + if (end_field(self) < 0) + return -1; + break; - TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions, - char_count)); + default: + break; + } - /* move stream, only if something to move */ - if (char_count < self->stream_len) { - memmove(self->stream, (self->stream + char_count), - self->stream_len - char_count); - } - /* buffer counts */ - self->stream_len -= char_count; + if (end_line(self) < 0) + return -1; + else + return 0; +} - /* move token metadata */ - // Note: We should always have words_len < word_deletions, so this - // subtraction will remain appropriately-typed. - for (i = 0; i < self->words_len - word_deletions; ++i) { - offset = i + word_deletions; +int parser_consume_rows(parser_t *self, size_t nrows) { + int64_t offset, word_deletions; + uint64_t char_count, i; - self->words[i] = self->words[offset] - char_count; - self->word_starts[i] = self->word_starts[offset] - char_count; - } - self->words_len -= word_deletions; - - /* move current word pointer to stream */ - self->pword_start -= char_count; - self->word_start -= char_count; - - /* move line metadata */ - // Note: We should always have self->lines - nrows + 1 >= 0, so this - // subtraction will remain appropriately-typed. - for (i = 0; i < self->lines - nrows + 1; ++i) { - offset = i + nrows; - self->line_start[i] = self->line_start[offset] - word_deletions; - self->line_fields[i] = self->line_fields[offset]; - } - self->lines -= nrows; + if (nrows > self->lines) { + nrows = self->lines; + } + /* do nothing */ + if (nrows == 0) return 0; + + /* cannot guarantee that nrows + 1 has been observed */ + word_deletions = self->line_start[nrows - 1] + self->line_fields[nrows - 1]; + if (word_deletions >= 1) { + char_count = (self->word_starts[word_deletions - 1] + + strlen(self->words[word_deletions - 1]) + 1); + } else { + /* if word_deletions == 0 (i.e. this case) then char_count must + * be 0 too, as no data needs to be skipped */ + char_count = 0; + } + + TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions, + char_count)); + + /* move stream, only if something to move */ + if (char_count < self->stream_len) { + memmove(self->stream, (self->stream + char_count), + self->stream_len - char_count); + } + /* buffer counts */ + self->stream_len -= char_count; + + /* move token metadata */ + // Note: We should always have words_len < word_deletions, so this + // subtraction will remain appropriately-typed. + for (i = 0; i < self->words_len - word_deletions; ++i) { + offset = i + word_deletions; + + self->words[i] = self->words[offset] - char_count; + self->word_starts[i] = self->word_starts[offset] - char_count; + } + self->words_len -= word_deletions; + + /* move current word pointer to stream */ + self->pword_start -= char_count; + self->word_start -= char_count; + + /* move line metadata */ + // Note: We should always have self->lines - nrows + 1 >= 0, so this + // subtraction will remain appropriately-typed. + for (i = 0; i < self->lines - nrows + 1; ++i) { + offset = i + nrows; + self->line_start[i] = self->line_start[offset] - word_deletions; + self->line_fields[i] = self->line_fields[offset]; + } + self->lines -= nrows; + + return 0; } static size_t _next_pow2(size_t sz) { - size_t result = 1; - while (result < sz) result *= 2; - return result; + size_t result = 1; + while (result < sz) + result *= 2; + return result; } int parser_trim_buffers(parser_t *self) { - /* - Free memory - */ - size_t new_cap; - void *newptr; - - uint64_t i; - - /** - * Before we free up space and trim, we should - * save how many words we saw when parsing, if - * it exceeds the maximum number we saw before. - * - * This is important for when we read in chunks, - * so that we can inform subsequent chunk parsing - * as to how many words we could possibly see. - */ - if (self->words_cap > self->max_words_cap) { - self->max_words_cap = self->words_cap; - } - - /* trim words, word_starts */ - new_cap = _next_pow2(self->words_len) + 1; - if (new_cap < self->words_cap) { - TRACE(("parser_trim_buffers: new_cap < self->words_cap\n")); - self->words = realloc(self->words, new_cap * sizeof(char *)); - if (self->words == NULL) { - return PARSER_OUT_OF_MEMORY; - } - self->word_starts = realloc(self->word_starts, - new_cap * sizeof(int64_t)); - if (self->word_starts == NULL) { - return PARSER_OUT_OF_MEMORY; - } - self->words_cap = new_cap; - } - - /* trim stream */ - new_cap = _next_pow2(self->stream_len) + 1; - TRACE( - ("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = " + /* + Free memory + */ + size_t new_cap; + void *newptr; + + uint64_t i; + + /** + * Before we free up space and trim, we should + * save how many words we saw when parsing, if + * it exceeds the maximum number we saw before. + * + * This is important for when we read in chunks, + * so that we can inform subsequent chunk parsing + * as to how many words we could possibly see. + */ + if (self->words_cap > self->max_words_cap) { + self->max_words_cap = self->words_cap; + } + + /* trim words, word_starts */ + new_cap = _next_pow2(self->words_len) + 1; + if (new_cap < self->words_cap) { + TRACE(("parser_trim_buffers: new_cap < self->words_cap\n")); + self->words = realloc(self->words, new_cap * sizeof(char *)); + if (self->words == NULL) { + return PARSER_OUT_OF_MEMORY; + } + self->word_starts = realloc(self->word_starts, new_cap * sizeof(int64_t)); + if (self->word_starts == NULL) { + return PARSER_OUT_OF_MEMORY; + } + self->words_cap = new_cap; + } + + /* trim stream */ + new_cap = _next_pow2(self->stream_len) + 1; + TRACE(("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = " "%zu\n", new_cap, self->stream_cap, self->lines_cap)); - if (new_cap < self->stream_cap) { - TRACE( - ("parser_trim_buffers: new_cap < self->stream_cap, calling " - "realloc\n")); - newptr = realloc(self->stream, new_cap); - if (newptr == NULL) { - return PARSER_OUT_OF_MEMORY; - } else { - // Update the pointers in the self->words array (char **) if - // `realloc` - // moved the `self->stream` buffer. This block mirrors a similar - // block in - // `make_stream_space`. - if (self->stream != newptr) { - self->pword_start = (char *)newptr + self->word_start; - - for (i = 0; i < self->words_len; ++i) { - self->words[i] = (char *)newptr + self->word_starts[i]; - } - } - - self->stream = newptr; - self->stream_cap = new_cap; + if (new_cap < self->stream_cap) { + TRACE(("parser_trim_buffers: new_cap < self->stream_cap, calling " + "realloc\n")); + newptr = realloc(self->stream, new_cap); + if (newptr == NULL) { + return PARSER_OUT_OF_MEMORY; + } else { + // Update the pointers in the self->words array (char **) if + // `realloc` + // moved the `self->stream` buffer. This block mirrors a similar + // block in + // `make_stream_space`. + if (self->stream != newptr) { + self->pword_start = (char *)newptr + self->word_start; + + for (i = 0; i < self->words_len; ++i) { + self->words[i] = (char *)newptr + self->word_starts[i]; } + } + + self->stream = newptr; + self->stream_cap = new_cap; } + } - /* trim line_start, line_fields */ - new_cap = _next_pow2(self->lines) + 1; - if (new_cap < self->lines_cap) { - TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n")); - newptr = realloc(self->line_start, - new_cap * sizeof(int64_t)); - if (newptr == NULL) { - return PARSER_OUT_OF_MEMORY; - } else { - self->line_start = newptr; - } - newptr = realloc(self->line_fields, - new_cap * sizeof(int64_t)); - if (newptr == NULL) { - return PARSER_OUT_OF_MEMORY; - } else { - self->line_fields = newptr; - self->lines_cap = new_cap; - } + /* trim line_start, line_fields */ + new_cap = _next_pow2(self->lines) + 1; + if (new_cap < self->lines_cap) { + TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n")); + newptr = realloc(self->line_start, new_cap * sizeof(int64_t)); + if (newptr == NULL) { + return PARSER_OUT_OF_MEMORY; + } else { + self->line_start = newptr; + } + newptr = realloc(self->line_fields, new_cap * sizeof(int64_t)); + if (newptr == NULL) { + return PARSER_OUT_OF_MEMORY; + } else { + self->line_fields = newptr; + self->lines_cap = new_cap; } + } - return 0; + return 0; } /* @@ -1341,63 +1319,61 @@ int parser_trim_buffers(parser_t *self) { int _tokenize_helper(parser_t *self, size_t nrows, int all, const char *encoding_errors) { - int status = 0; - uint64_t start_lines = self->lines; - - if (self->state == FINISHED) { - return 0; - } - - TRACE(( - "_tokenize_helper: Asked to tokenize %d rows, datapos=%d, datalen=%d\n", - nrows, self->datapos, self->datalen)); + int status = 0; + uint64_t start_lines = self->lines; - while (1) { - if (!all && self->lines - start_lines >= nrows) break; - - if (self->datapos == self->datalen) { - status = parser_buffer_bytes(self, self->chunksize, - encoding_errors); - - if (status == REACHED_EOF) { - // close out last line - status = parser_handle_eof(self); - self->state = FINISHED; - break; - } else if (status != 0) { - return status; - } - } - - TRACE( - ("_tokenize_helper: Trying to process %d bytes, datalen=%d, " - "datapos= %d\n", - self->datalen - self->datapos, self->datalen, self->datapos)); - - status = tokenize_bytes(self, nrows, start_lines); - - if (status < 0) { - // XXX - TRACE( - ("_tokenize_helper: Status %d returned from tokenize_bytes, " - "breaking\n", - status)); - status = -1; - break; - } - } - TRACE(("leaving tokenize_helper\n")); - return status; + if (self->state == FINISHED) { + return 0; + } + + TRACE( + ("_tokenize_helper: Asked to tokenize %d rows, datapos=%d, datalen=%d\n", + nrows, self->datapos, self->datalen)); + + while (1) { + if (!all && self->lines - start_lines >= nrows) + break; + + if (self->datapos == self->datalen) { + status = parser_buffer_bytes(self, self->chunksize, encoding_errors); + + if (status == REACHED_EOF) { + // close out last line + status = parser_handle_eof(self); + self->state = FINISHED; + break; + } else if (status != 0) { + return status; + } + } + + TRACE(("_tokenize_helper: Trying to process %d bytes, datalen=%d, " + "datapos= %d\n", + self->datalen - self->datapos, self->datalen, self->datapos)); + + status = tokenize_bytes(self, nrows, start_lines); + + if (status < 0) { + // XXX + TRACE(("_tokenize_helper: Status %d returned from tokenize_bytes, " + "breaking\n", + status)); + status = -1; + break; + } + } + TRACE(("leaving tokenize_helper\n")); + return status; } int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) { - int status = _tokenize_helper(self, nrows, 0, encoding_errors); - return status; + int status = _tokenize_helper(self, nrows, 0, encoding_errors); + return status; } int tokenize_all_rows(parser_t *self, const char *encoding_errors) { - int status = _tokenize_helper(self, -1, 1, encoding_errors); - return status; + int status = _tokenize_helper(self, -1, 1, encoding_errors); + return status; } /* @@ -1415,15 +1391,15 @@ int tokenize_all_rows(parser_t *self, const char *encoding_errors) { * leaves the value of *val unmodified. */ int to_boolean(const char *item, uint8_t *val) { - if (strcasecmp(item, "TRUE") == 0) { - *val = 1; - return 0; - } else if (strcasecmp(item, "FALSE") == 0) { - *val = 0; - return 0; - } + if (strcasecmp(item, "TRUE") == 0) { + *val = 1; + return 0; + } else if (strcasecmp(item, "FALSE") == 0) { + *val = 0; + return 0; + } - return -1; + return -1; } // --------------------------------------------------------------------------- @@ -1479,301 +1455,320 @@ const int max_int_decimal_digits = (sizeof(unsigned int) * 8) / 4; double xstrtod(const char *str, char **endptr, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int) { - double number; - unsigned int i_number = 0; - int exponent; - int negative; - char *p = (char *)str; - double p10; - int n; - int num_digits; - int num_decimals; - - if (maybe_int != NULL) *maybe_int = 1; - // Skip leading whitespace. - while (isspace_ascii(*p)) p++; - - // Handle optional sign. - negative = 0; - switch (*p) { - case '-': - negative = 1; // Fall through to increment position. - case '+': - p++; - } - - exponent = 0; - num_digits = 0; - num_decimals = 0; - - // Process string of digits. - while (isdigit_ascii(*p) && num_digits <= max_int_decimal_digits) { - i_number = i_number * 10 + (*p - '0'); - p++; - num_digits++; - - p += (tsep != '\0' && *p == tsep); - } - number = i_number; - - if (num_digits > max_int_decimal_digits) { - // process what's left as double - while (isdigit_ascii(*p)) { - number = number * 10. + (*p - '0'); - p++; - num_digits++; - - p += (tsep != '\0' && *p == tsep); - } - } - - // Process decimal part. - if (*p == decimal) { - if (maybe_int != NULL) *maybe_int = 0; - p++; - - while (isdigit_ascii(*p)) { - number = number * 10. + (*p - '0'); - p++; - num_digits++; - num_decimals++; - } - - exponent -= num_decimals; - } - - if (num_digits == 0) { - *error = ERANGE; - return 0.0; - } - - // Correct for sign. - if (negative) number = -number; - - // Process an exponent string. - if (toupper_ascii(*p) == toupper_ascii(sci)) { - if (maybe_int != NULL) *maybe_int = 0; - - // Handle optional sign. - negative = 0; - switch (*++p) { - case '-': - negative = 1; // Fall through to increment pos. - case '+': - p++; - } - - // Process string of digits. - num_digits = 0; - n = 0; - while (isdigit_ascii(*p)) { - n = n * 10 + (*p - '0'); - num_digits++; - p++; - } - - if (negative) - exponent -= n; - else - exponent += n; + double number; + unsigned int i_number = 0; + int exponent; + int negative; + char *p = (char *)str; + double p10; + int n; + int num_digits; + int num_decimals; + + if (maybe_int != NULL) + *maybe_int = 1; + // Skip leading whitespace. + while (isspace_ascii(*p)) + p++; + + // Handle optional sign. + negative = 0; + switch (*p) { + case '-': + negative = 1; // Fall through to increment position. + case '+': + p++; + } + + exponent = 0; + num_digits = 0; + num_decimals = 0; + + // Process string of digits. + while (isdigit_ascii(*p) && num_digits <= max_int_decimal_digits) { + i_number = i_number * 10 + (*p - '0'); + p++; + num_digits++; + + p += (tsep != '\0' && *p == tsep); + } + number = i_number; + + if (num_digits > max_int_decimal_digits) { + // process what's left as double + while (isdigit_ascii(*p)) { + number = number * 10. + (*p - '0'); + p++; + num_digits++; - // If no digits, after the 'e'/'E', un-consume it - if (num_digits == 0) p--; + p += (tsep != '\0' && *p == tsep); } + } - if (exponent < DBL_MIN_EXP || exponent > DBL_MAX_EXP) { - *error = ERANGE; - return HUGE_VAL; - } + // Process decimal part. + if (*p == decimal) { + if (maybe_int != NULL) + *maybe_int = 0; + p++; - // Scale the result. - p10 = 10.; - n = exponent; - if (n < 0) n = -n; - while (n) { - if (n & 1) { - if (exponent < 0) - number /= p10; - else - number *= p10; - } - n >>= 1; - p10 *= p10; + while (isdigit_ascii(*p)) { + number = number * 10. + (*p - '0'); + p++; + num_digits++; + num_decimals++; } - if (number == HUGE_VAL) { - *error = ERANGE; - } + exponent -= num_decimals; + } - if (skip_trailing) { - // Skip trailing whitespace. - while (isspace_ascii(*p)) p++; - } + if (num_digits == 0) { + *error = ERANGE; + return 0.0; + } - if (endptr) *endptr = p; - return number; -} + // Correct for sign. + if (negative) + number = -number; -double precise_xstrtod(const char *str, char **endptr, char decimal, - char sci, char tsep, int skip_trailing, - int *error, int *maybe_int) { - double number; - int exponent; - int negative; - char *p = (char *)str; - int num_digits; - int num_decimals; - int max_digits = 17; - int n; - - if (maybe_int != NULL) *maybe_int = 1; - // Cache powers of 10 in memory. - static double e[] = { - 1., 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, - 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, - 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29, - 1e30, 1e31, 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, - 1e40, 1e41, 1e42, 1e43, 1e44, 1e45, 1e46, 1e47, 1e48, 1e49, - 1e50, 1e51, 1e52, 1e53, 1e54, 1e55, 1e56, 1e57, 1e58, 1e59, - 1e60, 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67, 1e68, 1e69, - 1e70, 1e71, 1e72, 1e73, 1e74, 1e75, 1e76, 1e77, 1e78, 1e79, - 1e80, 1e81, 1e82, 1e83, 1e84, 1e85, 1e86, 1e87, 1e88, 1e89, - 1e90, 1e91, 1e92, 1e93, 1e94, 1e95, 1e96, 1e97, 1e98, 1e99, - 1e100, 1e101, 1e102, 1e103, 1e104, 1e105, 1e106, 1e107, 1e108, 1e109, - 1e110, 1e111, 1e112, 1e113, 1e114, 1e115, 1e116, 1e117, 1e118, 1e119, - 1e120, 1e121, 1e122, 1e123, 1e124, 1e125, 1e126, 1e127, 1e128, 1e129, - 1e130, 1e131, 1e132, 1e133, 1e134, 1e135, 1e136, 1e137, 1e138, 1e139, - 1e140, 1e141, 1e142, 1e143, 1e144, 1e145, 1e146, 1e147, 1e148, 1e149, - 1e150, 1e151, 1e152, 1e153, 1e154, 1e155, 1e156, 1e157, 1e158, 1e159, - 1e160, 1e161, 1e162, 1e163, 1e164, 1e165, 1e166, 1e167, 1e168, 1e169, - 1e170, 1e171, 1e172, 1e173, 1e174, 1e175, 1e176, 1e177, 1e178, 1e179, - 1e180, 1e181, 1e182, 1e183, 1e184, 1e185, 1e186, 1e187, 1e188, 1e189, - 1e190, 1e191, 1e192, 1e193, 1e194, 1e195, 1e196, 1e197, 1e198, 1e199, - 1e200, 1e201, 1e202, 1e203, 1e204, 1e205, 1e206, 1e207, 1e208, 1e209, - 1e210, 1e211, 1e212, 1e213, 1e214, 1e215, 1e216, 1e217, 1e218, 1e219, - 1e220, 1e221, 1e222, 1e223, 1e224, 1e225, 1e226, 1e227, 1e228, 1e229, - 1e230, 1e231, 1e232, 1e233, 1e234, 1e235, 1e236, 1e237, 1e238, 1e239, - 1e240, 1e241, 1e242, 1e243, 1e244, 1e245, 1e246, 1e247, 1e248, 1e249, - 1e250, 1e251, 1e252, 1e253, 1e254, 1e255, 1e256, 1e257, 1e258, 1e259, - 1e260, 1e261, 1e262, 1e263, 1e264, 1e265, 1e266, 1e267, 1e268, 1e269, - 1e270, 1e271, 1e272, 1e273, 1e274, 1e275, 1e276, 1e277, 1e278, 1e279, - 1e280, 1e281, 1e282, 1e283, 1e284, 1e285, 1e286, 1e287, 1e288, 1e289, - 1e290, 1e291, 1e292, 1e293, 1e294, 1e295, 1e296, 1e297, 1e298, 1e299, - 1e300, 1e301, 1e302, 1e303, 1e304, 1e305, 1e306, 1e307, 1e308}; - - // Skip leading whitespace. - while (isspace_ascii(*p)) p++; + // Process an exponent string. + if (toupper_ascii(*p) == toupper_ascii(sci)) { + if (maybe_int != NULL) + *maybe_int = 0; // Handle optional sign. negative = 0; - switch (*p) { - case '-': - negative = 1; // Fall through to increment position. - case '+': - p++; + switch (*++p) { + case '-': + negative = 1; // Fall through to increment pos. + case '+': + p++; } - number = 0.; - exponent = 0; - num_digits = 0; - num_decimals = 0; - // Process string of digits. + num_digits = 0; + n = 0; while (isdigit_ascii(*p)) { - if (num_digits < max_digits) { - number = number * 10. + (*p - '0'); - num_digits++; - } else { - ++exponent; - } - - p++; - p += (tsep != '\0' && *p == tsep); + n = n * 10 + (*p - '0'); + num_digits++; + p++; } - // Process decimal part - if (*p == decimal) { - if (maybe_int != NULL) *maybe_int = 0; - p++; + if (negative) + exponent -= n; + else + exponent += n; + + // If no digits, after the 'e'/'E', un-consume it + if (num_digits == 0) + p--; + } + + if (exponent < DBL_MIN_EXP || exponent > DBL_MAX_EXP) { + *error = ERANGE; + return HUGE_VAL; + } + + // Scale the result. + p10 = 10.; + n = exponent; + if (n < 0) + n = -n; + while (n) { + if (n & 1) { + if (exponent < 0) + number /= p10; + else + number *= p10; + } + n >>= 1; + p10 *= p10; + } + + if (number == HUGE_VAL) { + *error = ERANGE; + } + + if (skip_trailing) { + // Skip trailing whitespace. + while (isspace_ascii(*p)) + p++; + } + + if (endptr) + *endptr = p; + return number; +} - while (num_digits < max_digits && isdigit_ascii(*p)) { - number = number * 10. + (*p - '0'); - p++; - num_digits++; - num_decimals++; - } +double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, + char tsep, int skip_trailing, int *error, + int *maybe_int) { + double number; + int exponent; + int negative; + char *p = (char *)str; + int num_digits; + int num_decimals; + int max_digits = 17; + int n; + + if (maybe_int != NULL) + *maybe_int = 1; + // Cache powers of 10 in memory. + static double e[] = { + 1., 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, + 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, + 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29, + 1e30, 1e31, 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, + 1e40, 1e41, 1e42, 1e43, 1e44, 1e45, 1e46, 1e47, 1e48, 1e49, + 1e50, 1e51, 1e52, 1e53, 1e54, 1e55, 1e56, 1e57, 1e58, 1e59, + 1e60, 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67, 1e68, 1e69, + 1e70, 1e71, 1e72, 1e73, 1e74, 1e75, 1e76, 1e77, 1e78, 1e79, + 1e80, 1e81, 1e82, 1e83, 1e84, 1e85, 1e86, 1e87, 1e88, 1e89, + 1e90, 1e91, 1e92, 1e93, 1e94, 1e95, 1e96, 1e97, 1e98, 1e99, + 1e100, 1e101, 1e102, 1e103, 1e104, 1e105, 1e106, 1e107, 1e108, 1e109, + 1e110, 1e111, 1e112, 1e113, 1e114, 1e115, 1e116, 1e117, 1e118, 1e119, + 1e120, 1e121, 1e122, 1e123, 1e124, 1e125, 1e126, 1e127, 1e128, 1e129, + 1e130, 1e131, 1e132, 1e133, 1e134, 1e135, 1e136, 1e137, 1e138, 1e139, + 1e140, 1e141, 1e142, 1e143, 1e144, 1e145, 1e146, 1e147, 1e148, 1e149, + 1e150, 1e151, 1e152, 1e153, 1e154, 1e155, 1e156, 1e157, 1e158, 1e159, + 1e160, 1e161, 1e162, 1e163, 1e164, 1e165, 1e166, 1e167, 1e168, 1e169, + 1e170, 1e171, 1e172, 1e173, 1e174, 1e175, 1e176, 1e177, 1e178, 1e179, + 1e180, 1e181, 1e182, 1e183, 1e184, 1e185, 1e186, 1e187, 1e188, 1e189, + 1e190, 1e191, 1e192, 1e193, 1e194, 1e195, 1e196, 1e197, 1e198, 1e199, + 1e200, 1e201, 1e202, 1e203, 1e204, 1e205, 1e206, 1e207, 1e208, 1e209, + 1e210, 1e211, 1e212, 1e213, 1e214, 1e215, 1e216, 1e217, 1e218, 1e219, + 1e220, 1e221, 1e222, 1e223, 1e224, 1e225, 1e226, 1e227, 1e228, 1e229, + 1e230, 1e231, 1e232, 1e233, 1e234, 1e235, 1e236, 1e237, 1e238, 1e239, + 1e240, 1e241, 1e242, 1e243, 1e244, 1e245, 1e246, 1e247, 1e248, 1e249, + 1e250, 1e251, 1e252, 1e253, 1e254, 1e255, 1e256, 1e257, 1e258, 1e259, + 1e260, 1e261, 1e262, 1e263, 1e264, 1e265, 1e266, 1e267, 1e268, 1e269, + 1e270, 1e271, 1e272, 1e273, 1e274, 1e275, 1e276, 1e277, 1e278, 1e279, + 1e280, 1e281, 1e282, 1e283, 1e284, 1e285, 1e286, 1e287, 1e288, 1e289, + 1e290, 1e291, 1e292, 1e293, 1e294, 1e295, 1e296, 1e297, 1e298, 1e299, + 1e300, 1e301, 1e302, 1e303, 1e304, 1e305, 1e306, 1e307, 1e308}; + + // Skip leading whitespace. + while (isspace_ascii(*p)) + p++; + + // Handle optional sign. + negative = 0; + switch (*p) { + case '-': + negative = 1; // Fall through to increment position. + case '+': + p++; + } + + number = 0.; + exponent = 0; + num_digits = 0; + num_decimals = 0; + + // Process string of digits. + while (isdigit_ascii(*p)) { + if (num_digits < max_digits) { + number = number * 10. + (*p - '0'); + num_digits++; + } else { + ++exponent; + } - if (num_digits >= max_digits) // Consume extra decimal digits. - while (isdigit_ascii(*p)) ++p; + p++; + p += (tsep != '\0' && *p == tsep); + } - exponent -= num_decimals; - } + // Process decimal part + if (*p == decimal) { + if (maybe_int != NULL) + *maybe_int = 0; + p++; - if (num_digits == 0) { - *error = ERANGE; - return 0.0; + while (num_digits < max_digits && isdigit_ascii(*p)) { + number = number * 10. + (*p - '0'); + p++; + num_digits++; + num_decimals++; } - // Correct for sign. - if (negative) number = -number; + if (num_digits >= max_digits) // Consume extra decimal digits. + while (isdigit_ascii(*p)) + ++p; - // Process an exponent string. - if (toupper_ascii(*p) == toupper_ascii(sci)) { - if (maybe_int != NULL) *maybe_int = 0; + exponent -= num_decimals; + } - // Handle optional sign - negative = 0; - switch (*++p) { - case '-': - negative = 1; // Fall through to increment pos. - case '+': - p++; - } + if (num_digits == 0) { + *error = ERANGE; + return 0.0; + } - // Process string of digits. - num_digits = 0; - n = 0; - while (num_digits < max_digits && isdigit_ascii(*p)) { - n = n * 10 + (*p - '0'); - num_digits++; - p++; - } + // Correct for sign. + if (negative) + number = -number; - if (negative) - exponent -= n; - else - exponent += n; + // Process an exponent string. + if (toupper_ascii(*p) == toupper_ascii(sci)) { + if (maybe_int != NULL) + *maybe_int = 0; - // If no digits after the 'e'/'E', un-consume it. - if (num_digits == 0) p--; + // Handle optional sign + negative = 0; + switch (*++p) { + case '-': + negative = 1; // Fall through to increment pos. + case '+': + p++; } - if (exponent > 308) { - *error = ERANGE; - return HUGE_VAL; - } else if (exponent > 0) { - number *= e[exponent]; - } else if (exponent < -308) { // Subnormal - if (exponent < -616) { // Prevent invalid array access. - number = 0.; - } else { - number /= e[-308 - exponent]; - number /= e[308]; - } + // Process string of digits. + num_digits = 0; + n = 0; + while (num_digits < max_digits && isdigit_ascii(*p)) { + n = n * 10 + (*p - '0'); + num_digits++; + p++; + } + if (negative) + exponent -= n; + else + exponent += n; + + // If no digits after the 'e'/'E', un-consume it. + if (num_digits == 0) + p--; + } + + if (exponent > 308) { + *error = ERANGE; + return HUGE_VAL; + } else if (exponent > 0) { + number *= e[exponent]; + } else if (exponent < -308) { // Subnormal + if (exponent < -616) { // Prevent invalid array access. + number = 0.; } else { - number /= e[-exponent]; + number /= e[-308 - exponent]; + number /= e[308]; } - if (number == HUGE_VAL || number == -HUGE_VAL) *error = ERANGE; + } else { + number /= e[-exponent]; + } - if (skip_trailing) { - // Skip trailing whitespace. - while (isspace_ascii(*p)) p++; - } + if (number == HUGE_VAL || number == -HUGE_VAL) + *error = ERANGE; + + if (skip_trailing) { + // Skip trailing whitespace. + while (isspace_ascii(*p)) + p++; + } - if (endptr) *endptr = p; - return number; + if (endptr) + *endptr = p; + return number; } /* copy a decimal number string with `decimal`, `tsep` as decimal point @@ -1782,306 +1777,309 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, with a call to `free`. */ -char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal, +char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal, char tsep) { - const char *p = s; - size_t length = strlen(s); - char *s_copy = malloc(length + 1); - char *dst = s_copy; - // Skip leading whitespace. - while (isspace_ascii(*p)) p++; - // Copy Leading sign + const char *p = s; + size_t length = strlen(s); + char *s_copy = malloc(length + 1); + char *dst = s_copy; + // Skip leading whitespace. + while (isspace_ascii(*p)) + p++; + // Copy Leading sign + if (*p == '+' || *p == '-') { + *dst++ = *p++; + } + // Copy integer part dropping `tsep` + while (isdigit_ascii(*p)) { + *dst++ = *p++; + p += (tsep != '\0' && *p == tsep); + } + // Replace `decimal` with '.' + if (*p == decimal) { + *dst++ = '.'; + p++; + } + // Copy fractional part after decimal (if any) + while (isdigit_ascii(*p)) { + *dst++ = *p++; + } + // Copy exponent if any + if (toupper_ascii(*p) == toupper_ascii('E')) { + *dst++ = *p++; + // Copy leading exponent sign (if any) if (*p == '+' || *p == '-') { - *dst++ = *p++; - } - // Copy integer part dropping `tsep` - while (isdigit_ascii(*p)) { - *dst++ = *p++; - p += (tsep != '\0' && *p == tsep); - } - // Replace `decimal` with '.' - if (*p == decimal) { - *dst++ = '.'; - p++; + *dst++ = *p++; } - // Copy fractional part after decimal (if any) + // Copy exponent digits while (isdigit_ascii(*p)) { - *dst++ = *p++; + *dst++ = *p++; } - // Copy exponent if any - if (toupper_ascii(*p) == toupper_ascii('E')) { - *dst++ = *p++; - // Copy leading exponent sign (if any) - if (*p == '+' || *p == '-') { - *dst++ = *p++; - } - // Copy exponent digits - while (isdigit_ascii(*p)) { - *dst++ = *p++; - } - } - *dst++ = '\0'; // terminate - if (endpos != NULL) - *endpos = (char *)p; - return s_copy; + } + *dst++ = '\0'; // terminate + if (endpos != NULL) + *endpos = (char *)p; + return s_copy; } - double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int) { - // 'normalize' representation to C-locale; replace decimal with '.' and - // remove thousands separator. - char *endptr; - char *pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep); - // This is called from a nogil block in parsers.pyx - // so need to explicitly get GIL before Python calls - PyGILState_STATE gstate; - gstate = PyGILState_Ensure(); - char *endpc; - double r = PyOS_string_to_double(pc, &endpc, 0); - // PyOS_string_to_double needs to consume the whole string - if (endpc == pc + strlen(pc)) { - if (q != NULL) { - // report endptr from source string (p) - *q = endptr; - } - } else { - *error = -1; - if (q != NULL) { - // p and pc are different len due to tsep removal. Can't report - // how much it has consumed of p. Just rewind to beginning. - *q = (char *)p; // TODO(willayd): this could be undefined behavior - } - } - if (maybe_int != NULL) *maybe_int = 0; - if (PyErr_Occurred() != NULL) *error = -1; - else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL; - PyErr_Clear(); - - PyGILState_Release(gstate); - free(pc); - if (skip_trailing && q != NULL && *q != p) { - while (isspace_ascii(**q)) { - (*q)++; - } - } - return r; + // 'normalize' representation to C-locale; replace decimal with '.' and + // remove thousands separator. + char *endptr; + char *pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep); + // This is called from a nogil block in parsers.pyx + // so need to explicitly get GIL before Python calls + PyGILState_STATE gstate; + gstate = PyGILState_Ensure(); + char *endpc; + double r = PyOS_string_to_double(pc, &endpc, 0); + // PyOS_string_to_double needs to consume the whole string + if (endpc == pc + strlen(pc)) { + if (q != NULL) { + // report endptr from source string (p) + *q = endptr; + } + } else { + *error = -1; + if (q != NULL) { + // p and pc are different len due to tsep removal. Can't report + // how much it has consumed of p. Just rewind to beginning. + *q = (char *)p; // TODO(willayd): this could be undefined behavior + } + } + if (maybe_int != NULL) + *maybe_int = 0; + if (PyErr_Occurred() != NULL) + *error = -1; + else if (r == Py_HUGE_VAL) + *error = (int)Py_HUGE_VAL; + PyErr_Clear(); + + PyGILState_Release(gstate); + free(pc); + if (skip_trailing && q != NULL && *q != p) { + while (isspace_ascii(**q)) { + (*q)++; + } + } + return r; } // End of xstrtod code // --------------------------------------------------------------------------- void uint_state_init(uint_state *self) { - self->seen_sint = 0; - self->seen_uint = 0; - self->seen_null = 0; + self->seen_sint = 0; + self->seen_uint = 0; + self->seen_null = 0; } int uint64_conflict(uint_state *self) { - return self->seen_uint && (self->seen_sint || self->seen_null); + return self->seen_uint && (self->seen_sint || self->seen_null); } int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep) { - const char *p = p_item; - int isneg = 0; - int64_t number = 0; - int d; + const char *p = p_item; + int isneg = 0; + int64_t number = 0; + int d; + + // Skip leading spaces. + while (isspace_ascii(*p)) { + ++p; + } + + // Handle sign. + if (*p == '-') { + isneg = 1; + ++p; + } else if (*p == '+') { + p++; + } + + // Check that there is a first digit. + if (!isdigit_ascii(*p)) { + // Error... + *error = ERROR_NO_DIGITS; + return 0; + } - // Skip leading spaces. - while (isspace_ascii(*p)) { - ++p; - } + if (isneg) { + // If number is greater than pre_min, at least one more digit + // can be processed without overflowing. + int dig_pre_min = -(int_min % 10); + int64_t pre_min = int_min / 10; - // Handle sign. - if (*p == '-') { - isneg = 1; - ++p; - } else if (*p == '+') { - p++; + // Process the digits. + d = *p; + if (tsep != '\0') { + while (1) { + if (d == tsep) { + d = *++p; + continue; + } else if (!isdigit_ascii(d)) { + break; + } + if ((number > pre_min) || + ((number == pre_min) && (d - '0' <= dig_pre_min))) { + number = number * 10 - (d - '0'); + d = *++p; + } else { + *error = ERROR_OVERFLOW; + return 0; + } + } + } else { + while (isdigit_ascii(d)) { + if ((number > pre_min) || + ((number == pre_min) && (d - '0' <= dig_pre_min))) { + number = number * 10 - (d - '0'); + d = *++p; + } else { + *error = ERROR_OVERFLOW; + return 0; + } + } } + } else { + // If number is less than pre_max, at least one more digit + // can be processed without overflowing. + int64_t pre_max = int_max / 10; + int dig_pre_max = int_max % 10; - // Check that there is a first digit. - if (!isdigit_ascii(*p)) { - // Error... - *error = ERROR_NO_DIGITS; - return 0; - } + // Process the digits. + d = *p; + if (tsep != '\0') { + while (1) { + if (d == tsep) { + d = *++p; + continue; + } else if (!isdigit_ascii(d)) { + break; + } + if ((number < pre_max) || + ((number == pre_max) && (d - '0' <= dig_pre_max))) { + number = number * 10 + (d - '0'); + d = *++p; - if (isneg) { - // If number is greater than pre_min, at least one more digit - // can be processed without overflowing. - int dig_pre_min = -(int_min % 10); - int64_t pre_min = int_min / 10; - - // Process the digits. - d = *p; - if (tsep != '\0') { - while (1) { - if (d == tsep) { - d = *++p; - continue; - } else if (!isdigit_ascii(d)) { - break; - } - if ((number > pre_min) || - ((number == pre_min) && (d - '0' <= dig_pre_min))) { - number = number * 10 - (d - '0'); - d = *++p; - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } } else { - while (isdigit_ascii(d)) { - if ((number > pre_min) || - ((number == pre_min) && (d - '0' <= dig_pre_min))) { - number = number * 10 - (d - '0'); - d = *++p; - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } + *error = ERROR_OVERFLOW; + return 0; } + } } else { - // If number is less than pre_max, at least one more digit - // can be processed without overflowing. - int64_t pre_max = int_max / 10; - int dig_pre_max = int_max % 10; - - // Process the digits. - d = *p; - if (tsep != '\0') { - while (1) { - if (d == tsep) { - d = *++p; - continue; - } else if (!isdigit_ascii(d)) { - break; - } - if ((number < pre_max) || - ((number == pre_max) && (d - '0' <= dig_pre_max))) { - number = number * 10 + (d - '0'); - d = *++p; - - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } + while (isdigit_ascii(d)) { + if ((number < pre_max) || + ((number == pre_max) && (d - '0' <= dig_pre_max))) { + number = number * 10 + (d - '0'); + d = *++p; + } else { - while (isdigit_ascii(d)) { - if ((number < pre_max) || - ((number == pre_max) && (d - '0' <= dig_pre_max))) { - number = number * 10 + (d - '0'); - d = *++p; - - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } + *error = ERROR_OVERFLOW; + return 0; } + } } + } - // Skip trailing spaces. - while (isspace_ascii(*p)) { - ++p; - } + // Skip trailing spaces. + while (isspace_ascii(*p)) { + ++p; + } - // Did we use up all the characters? - if (*p) { - *error = ERROR_INVALID_CHARS; - return 0; - } + // Did we use up all the characters? + if (*p) { + *error = ERROR_INVALID_CHARS; + return 0; + } - *error = 0; - return number; + *error = 0; + return number; } uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, uint64_t uint_max, int *error, char tsep) { - const char *p = p_item; - uint64_t pre_max = uint_max / 10; - int dig_pre_max = uint_max % 10; - uint64_t number = 0; - int d; - - // Skip leading spaces. - while (isspace_ascii(*p)) { - ++p; - } - - // Handle sign. - if (*p == '-') { - state->seen_sint = 1; - *error = 0; + const char *p = p_item; + uint64_t pre_max = uint_max / 10; + int dig_pre_max = uint_max % 10; + uint64_t number = 0; + int d; + + // Skip leading spaces. + while (isspace_ascii(*p)) { + ++p; + } + + // Handle sign. + if (*p == '-') { + state->seen_sint = 1; + *error = 0; + return 0; + } else if (*p == '+') { + p++; + } + + // Check that there is a first digit. + if (!isdigit_ascii(*p)) { + // Error... + *error = ERROR_NO_DIGITS; + return 0; + } + + // If number is less than pre_max, at least one more digit + // can be processed without overflowing. + // + // Process the digits. + d = *p; + if (tsep != '\0') { + while (1) { + if (d == tsep) { + d = *++p; + continue; + } else if (!isdigit_ascii(d)) { + break; + } + if ((number < pre_max) || + ((number == pre_max) && (d - '0' <= dig_pre_max))) { + number = number * 10 + (d - '0'); + d = *++p; + + } else { + *error = ERROR_OVERFLOW; return 0; - } else if (*p == '+') { - p++; + } } + } else { + while (isdigit_ascii(d)) { + if ((number < pre_max) || + ((number == pre_max) && (d - '0' <= dig_pre_max))) { + number = number * 10 + (d - '0'); + d = *++p; - // Check that there is a first digit. - if (!isdigit_ascii(*p)) { - // Error... - *error = ERROR_NO_DIGITS; + } else { + *error = ERROR_OVERFLOW; return 0; + } } + } - // If number is less than pre_max, at least one more digit - // can be processed without overflowing. - // - // Process the digits. - d = *p; - if (tsep != '\0') { - while (1) { - if (d == tsep) { - d = *++p; - continue; - } else if (!isdigit_ascii(d)) { - break; - } - if ((number < pre_max) || - ((number == pre_max) && (d - '0' <= dig_pre_max))) { - number = number * 10 + (d - '0'); - d = *++p; - - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } - } else { - while (isdigit_ascii(d)) { - if ((number < pre_max) || - ((number == pre_max) && (d - '0' <= dig_pre_max))) { - number = number * 10 + (d - '0'); - d = *++p; - - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } - } - - // Skip trailing spaces. - while (isspace_ascii(*p)) { - ++p; - } + // Skip trailing spaces. + while (isspace_ascii(*p)) { + ++p; + } - // Did we use up all the characters? - if (*p) { - *error = ERROR_INVALID_CHARS; - return 0; - } + // Did we use up all the characters? + if (*p) { + *error = ERROR_INVALID_CHARS; + return 0; + } - if (number > (uint64_t)int_max) { - state->seen_uint = 1; - } + if (number > (uint64_t)int_max) { + state->seen_uint = 1; + } - *error = 0; - return number; + *error = 0; + return number; } diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c index 49016f79de5b9..db1c735bd6094 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c @@ -20,15 +20,14 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt #ifndef NPY_NO_DEPRECATED_API #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION -#endif // NPY_NO_DEPRECATED_API +#endif // NPY_NO_DEPRECATED_API #include +#include "pandas/vendored/numpy/datetime/np_datetime.h" #include #include #include -#include "pandas/vendored/numpy/datetime/np_datetime.h" - const int days_per_month_table[2][12] = { {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}, @@ -38,8 +37,8 @@ const int days_per_month_table[2][12] = { * Returns 1 if the given year is a leap year, 0 otherwise. */ int is_leapyear(npy_int64 year) { - return (year & 0x3) == 0 && /* year % 4 == 0 */ - ((year % 100) != 0 || (year % 400) == 0); + return (year & 0x3) == 0 && /* year % 4 == 0 */ + ((year % 100) != 0 || (year % 400) == 0); } /* @@ -47,108 +46,108 @@ int is_leapyear(npy_int64 year) { * the current values are valid.g */ void add_minutes_to_datetimestruct(npy_datetimestruct *dts, int minutes) { - int isleap; - - /* MINUTES */ - dts->min += minutes; - while (dts->min < 0) { - dts->min += 60; - dts->hour--; - } - while (dts->min >= 60) { - dts->min -= 60; - dts->hour++; - } - - /* HOURS */ - while (dts->hour < 0) { - dts->hour += 24; - dts->day--; - } - while (dts->hour >= 24) { - dts->hour -= 24; - dts->day++; - } - - /* DAYS */ - if (dts->day < 1) { - dts->month--; - if (dts->month < 1) { - dts->year--; - dts->month = 12; - } - isleap = is_leapyear(dts->year); - dts->day += days_per_month_table[isleap][dts->month - 1]; - } else if (dts->day > 28) { - isleap = is_leapyear(dts->year); - if (dts->day > days_per_month_table[isleap][dts->month - 1]) { - dts->day -= days_per_month_table[isleap][dts->month - 1]; - dts->month++; - if (dts->month > 12) { - dts->year++; - dts->month = 1; - } - } + int isleap; + + /* MINUTES */ + dts->min += minutes; + while (dts->min < 0) { + dts->min += 60; + dts->hour--; + } + while (dts->min >= 60) { + dts->min -= 60; + dts->hour++; + } + + /* HOURS */ + while (dts->hour < 0) { + dts->hour += 24; + dts->day--; + } + while (dts->hour >= 24) { + dts->hour -= 24; + dts->day++; + } + + /* DAYS */ + if (dts->day < 1) { + dts->month--; + if (dts->month < 1) { + dts->year--; + dts->month = 12; } + isleap = is_leapyear(dts->year); + dts->day += days_per_month_table[isleap][dts->month - 1]; + } else if (dts->day > 28) { + isleap = is_leapyear(dts->year); + if (dts->day > days_per_month_table[isleap][dts->month - 1]) { + dts->day -= days_per_month_table[isleap][dts->month - 1]; + dts->month++; + if (dts->month > 12) { + dts->year++; + dts->month = 1; + } + } + } } /* * Calculates the days offset from the 1970 epoch. */ npy_int64 get_datetimestruct_days(const npy_datetimestruct *dts) { - int i, month; - npy_int64 year, days = 0; - const int *month_lengths; - - year = dts->year - 1970; - days = year * 365; - - /* Adjust for leap years */ - if (days >= 0) { - /* - * 1968 is the closest leap year before 1970. - * Exclude the current year, so add 1. - */ - year += 1; - /* Add one day for each 4 years */ - days += year / 4; - /* 1900 is the closest previous year divisible by 100 */ - year += 68; - /* Subtract one day for each 100 years */ - days -= year / 100; - /* 1600 is the closest previous year divisible by 400 */ - year += 300; - /* Add one day for each 400 years */ - days += year / 400; - } else { - /* - * 1972 is the closest later year after 1970. - * Include the current year, so subtract 2. - */ - year -= 2; - /* Subtract one day for each 4 years */ - days += year / 4; - /* 2000 is the closest later year divisible by 100 */ - year -= 28; - /* Add one day for each 100 years */ - days -= year / 100; - /* 2000 is also the closest later year divisible by 400 */ - /* Subtract one day for each 400 years */ - days += year / 400; - } - - month_lengths = days_per_month_table[is_leapyear(dts->year)]; - month = dts->month - 1; - - /* Add the months */ - for (i = 0; i < month; ++i) { - days += month_lengths[i]; - } - - /* Add the days */ - days += dts->day - 1; - - return days; + int i, month; + npy_int64 year, days = 0; + const int *month_lengths; + + year = dts->year - 1970; + days = year * 365; + + /* Adjust for leap years */ + if (days >= 0) { + /* + * 1968 is the closest leap year before 1970. + * Exclude the current year, so add 1. + */ + year += 1; + /* Add one day for each 4 years */ + days += year / 4; + /* 1900 is the closest previous year divisible by 100 */ + year += 68; + /* Subtract one day for each 100 years */ + days -= year / 100; + /* 1600 is the closest previous year divisible by 400 */ + year += 300; + /* Add one day for each 400 years */ + days += year / 400; + } else { + /* + * 1972 is the closest later year after 1970. + * Include the current year, so subtract 2. + */ + year -= 2; + /* Subtract one day for each 4 years */ + days += year / 4; + /* 2000 is the closest later year divisible by 100 */ + year -= 28; + /* Add one day for each 100 years */ + days -= year / 100; + /* 2000 is also the closest later year divisible by 400 */ + /* Subtract one day for each 400 years */ + days += year / 400; + } + + month_lengths = days_per_month_table[is_leapyear(dts->year)]; + month = dts->month - 1; + + /* Add the months */ + for (i = 0; i < month; ++i) { + days += month_lengths[i]; + } + + /* Add the days */ + days += dts->day - 1; + + return days; } /* @@ -156,62 +155,61 @@ npy_int64 get_datetimestruct_days(const npy_datetimestruct *dts) { * and returns the year. */ static npy_int64 days_to_yearsdays(npy_int64 *days_) { - const npy_int64 days_per_400years = (400 * 365 + 100 - 4 + 1); - /* Adjust so it's relative to the year 2000 (divisible by 400) */ - npy_int64 days = (*days_) - (365 * 30 + 7); - npy_int64 year; - - /* Break down the 400 year cycle to get the year and day within the year */ - if (days >= 0) { - year = 400 * (days / days_per_400years); - days = days % days_per_400years; - } else { - year = 400 * ((days - (days_per_400years - 1)) / days_per_400years); - days = days % days_per_400years; - if (days < 0) { - days += days_per_400years; - } - } - - /* Work out the year/day within the 400 year cycle */ - if (days >= 366) { - year += 100 * ((days - 1) / (100 * 365 + 25 - 1)); - days = (days - 1) % (100 * 365 + 25 - 1); - if (days >= 365) { - year += 4 * ((days + 1) / (4 * 365 + 1)); - days = (days + 1) % (4 * 365 + 1); - if (days >= 366) { - year += (days - 1) / 365; - days = (days - 1) % 365; - } - } - } - - *days_ = days; - return year + 2000; -} + const npy_int64 days_per_400years = (400 * 365 + 100 - 4 + 1); + /* Adjust so it's relative to the year 2000 (divisible by 400) */ + npy_int64 days = (*days_) - (365 * 30 + 7); + npy_int64 year; + + /* Break down the 400 year cycle to get the year and day within the year */ + if (days >= 0) { + year = 400 * (days / days_per_400years); + days = days % days_per_400years; + } else { + year = 400 * ((days - (days_per_400years - 1)) / days_per_400years); + days = days % days_per_400years; + if (days < 0) { + days += days_per_400years; + } + } + + /* Work out the year/day within the 400 year cycle */ + if (days >= 366) { + year += 100 * ((days - 1) / (100 * 365 + 25 - 1)); + days = (days - 1) % (100 * 365 + 25 - 1); + if (days >= 365) { + year += 4 * ((days + 1) / (4 * 365 + 1)); + days = (days + 1) % (4 * 365 + 1); + if (days >= 366) { + year += (days - 1) / 365; + days = (days - 1) % 365; + } + } + } + *days_ = days; + return year + 2000; +} /* * Fills in the year, month, day in 'dts' based on the days * offset from 1970. */ static void set_datetimestruct_days(npy_int64 days, npy_datetimestruct *dts) { - const int *month_lengths; - int i; + const int *month_lengths; + int i; - dts->year = days_to_yearsdays(&days); - month_lengths = days_per_month_table[is_leapyear(dts->year)]; + dts->year = days_to_yearsdays(&days); + month_lengths = days_per_month_table[is_leapyear(dts->year)]; - for (i = 0; i < 12; ++i) { - if (days < month_lengths[i]) { - dts->month = i + 1; - dts->day = days + 1; - return; - } else { - days -= month_lengths[i]; - } + for (i = 0; i < 12; ++i) { + if (days < month_lengths[i]) { + dts->month = i + 1; + dts->day = days + 1; + return; + } else { + days -= month_lengths[i]; } + } } /* @@ -219,86 +217,86 @@ static void set_datetimestruct_days(npy_int64 days, npy_datetimestruct *dts) { */ int cmp_npy_datetimestruct(const npy_datetimestruct *a, const npy_datetimestruct *b) { - if (a->year > b->year) { - return 1; - } else if (a->year < b->year) { - return -1; - } - - if (a->month > b->month) { - return 1; - } else if (a->month < b->month) { - return -1; - } - - if (a->day > b->day) { - return 1; - } else if (a->day < b->day) { - return -1; - } - - if (a->hour > b->hour) { - return 1; - } else if (a->hour < b->hour) { - return -1; - } - - if (a->min > b->min) { - return 1; - } else if (a->min < b->min) { - return -1; - } - - if (a->sec > b->sec) { - return 1; - } else if (a->sec < b->sec) { - return -1; - } - - if (a->us > b->us) { - return 1; - } else if (a->us < b->us) { - return -1; - } - - if (a->ps > b->ps) { - return 1; - } else if (a->ps < b->ps) { - return -1; - } - - if (a->as > b->as) { - return 1; - } else if (a->as < b->as) { - return -1; - } - - return 0; + if (a->year > b->year) { + return 1; + } else if (a->year < b->year) { + return -1; + } + + if (a->month > b->month) { + return 1; + } else if (a->month < b->month) { + return -1; + } + + if (a->day > b->day) { + return 1; + } else if (a->day < b->day) { + return -1; + } + + if (a->hour > b->hour) { + return 1; + } else if (a->hour < b->hour) { + return -1; + } + + if (a->min > b->min) { + return 1; + } else if (a->min < b->min) { + return -1; + } + + if (a->sec > b->sec) { + return 1; + } else if (a->sec < b->sec) { + return -1; + } + + if (a->us > b->us) { + return 1; + } else if (a->us < b->us) { + return -1; + } + + if (a->ps > b->ps) { + return 1; + } else if (a->ps < b->ps) { + return -1; + } + + if (a->as > b->as) { + return 1; + } else if (a->as < b->as) { + return -1; + } + + return 0; } /* -* Returns the offset from utc of the timezone as a timedelta. -* The caller is responsible for ensuring that the tzinfo -* attribute exists on the datetime object. -* -* If the passed object is timezone naive, Py_None is returned. -* If extraction of the offset fails, NULL is returned. -* -* NOTE: This function is not vendored from numpy. -*/ + * Returns the offset from utc of the timezone as a timedelta. + * The caller is responsible for ensuring that the tzinfo + * attribute exists on the datetime object. + * + * If the passed object is timezone naive, Py_None is returned. + * If extraction of the offset fails, NULL is returned. + * + * NOTE: This function is not vendored from numpy. + */ PyObject *extract_utc_offset(PyObject *obj) { - PyObject *tmp = PyObject_GetAttrString(obj, "tzinfo"); - if (tmp == NULL) { - return NULL; - } - if (tmp != Py_None) { - PyObject *offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj); - if (offset == NULL) { - Py_DECREF(tmp); - return NULL; - } - return offset; - } - return tmp; + PyObject *tmp = PyObject_GetAttrString(obj, "tzinfo"); + if (tmp == NULL) { + return NULL; + } + if (tmp != Py_None) { + PyObject *offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj); + if (offset == NULL) { + Py_DECREF(tmp); + return NULL; + } + return offset; + } + return tmp; } /* @@ -307,98 +305,90 @@ PyObject *extract_utc_offset(PyObject *obj) { */ npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base, const npy_datetimestruct *dts) { - npy_datetime ret; - - if (base == NPY_FR_Y) { - /* Truncate to the year */ - ret = dts->year - 1970; - } else if (base == NPY_FR_M) { - /* Truncate to the month */ - ret = 12 * (dts->year - 1970) + (dts->month - 1); - } else { - /* Otherwise calculate the number of days to start */ - npy_int64 days = get_datetimestruct_days(dts); - - switch (base) { - case NPY_FR_W: - /* Truncate to weeks */ - if (days >= 0) { - ret = days / 7; - } else { - ret = (days - 6) / 7; - } - break; - case NPY_FR_D: - ret = days; - break; - case NPY_FR_h: - ret = days * 24 + dts->hour; - break; - case NPY_FR_m: - ret = (days * 24 + dts->hour) * 60 + dts->min; - break; - case NPY_FR_s: - ret = ((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec; - break; - case NPY_FR_ms: - ret = (((days * 24 + dts->hour) * 60 + dts->min) * 60 + - dts->sec) * - 1000 + - dts->us / 1000; - break; - case NPY_FR_us: - ret = (((days * 24 + dts->hour) * 60 + dts->min) * 60 + - dts->sec) * - 1000000 + - dts->us; - break; - case NPY_FR_ns: - ret = ((((days * 24 + dts->hour) * 60 + dts->min) * 60 + - dts->sec) * - 1000000 + - dts->us) * - 1000 + - dts->ps / 1000; - break; - case NPY_FR_ps: - ret = ((((days * 24 + dts->hour) * 60 + dts->min) * 60 + - dts->sec) * - 1000000 + - dts->us) * - 1000000 + - dts->ps; - break; - case NPY_FR_fs: - /* only 2.6 hours */ - ret = (((((days * 24 + dts->hour) * 60 + dts->min) * 60 + - dts->sec) * - 1000000 + - dts->us) * - 1000000 + - dts->ps) * - 1000 + - dts->as / 1000; - break; - case NPY_FR_as: - /* only 9.2 secs */ - ret = (((((days * 24 + dts->hour) * 60 + dts->min) * 60 + - dts->sec) * - 1000000 + - dts->us) * - 1000000 + - dts->ps) * - 1000000 + - dts->as; - break; - default: - /* Something got corrupted */ - PyErr_SetString( - PyExc_ValueError, - "NumPy datetime metadata with corrupt unit value"); - return -1; - } - } - return ret; + npy_datetime ret; + + if (base == NPY_FR_Y) { + /* Truncate to the year */ + ret = dts->year - 1970; + } else if (base == NPY_FR_M) { + /* Truncate to the month */ + ret = 12 * (dts->year - 1970) + (dts->month - 1); + } else { + /* Otherwise calculate the number of days to start */ + npy_int64 days = get_datetimestruct_days(dts); + + switch (base) { + case NPY_FR_W: + /* Truncate to weeks */ + if (days >= 0) { + ret = days / 7; + } else { + ret = (days - 6) / 7; + } + break; + case NPY_FR_D: + ret = days; + break; + case NPY_FR_h: + ret = days * 24 + dts->hour; + break; + case NPY_FR_m: + ret = (days * 24 + dts->hour) * 60 + dts->min; + break; + case NPY_FR_s: + ret = ((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec; + break; + case NPY_FR_ms: + ret = (((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec) * 1000 + + dts->us / 1000; + break; + case NPY_FR_us: + ret = (((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec) * + 1000000 + + dts->us; + break; + case NPY_FR_ns: + ret = ((((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec) * + 1000000 + + dts->us) * + 1000 + + dts->ps / 1000; + break; + case NPY_FR_ps: + ret = ((((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec) * + 1000000 + + dts->us) * + 1000000 + + dts->ps; + break; + case NPY_FR_fs: + /* only 2.6 hours */ + ret = (((((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec) * + 1000000 + + dts->us) * + 1000000 + + dts->ps) * + 1000 + + dts->as / 1000; + break; + case NPY_FR_as: + /* only 9.2 secs */ + ret = (((((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec) * + 1000000 + + dts->us) * + 1000000 + + dts->ps) * + 1000000 + + dts->as; + break; + default: + /* Something got corrupted */ + PyErr_SetString(PyExc_ValueError, + "NumPy datetime metadata with corrupt unit value"); + return -1; + } + } + return ret; } /* @@ -410,164 +400,161 @@ npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base, * for subsequent calls to this command - it is able to deduce that `*d >= 0`. */ npy_int64 extract_unit(npy_datetime *d, npy_datetime unit) { - assert(unit > 0); - npy_int64 div = *d / unit; - npy_int64 mod = *d % unit; - if (mod < 0) { - mod += unit; - div -= 1; - } - assert(mod >= 0); - *d = mod; - return div; + assert(unit > 0); + npy_int64 div = *d / unit; + npy_int64 mod = *d % unit; + if (mod < 0) { + mod += unit; + div -= 1; + } + assert(mod >= 0); + *d = mod; + return div; } /* * Converts a datetime based on the given metadata into a datetimestruct */ -void pandas_datetime_to_datetimestruct(npy_datetime dt, - NPY_DATETIMEUNIT base, +void pandas_datetime_to_datetimestruct(npy_datetime dt, NPY_DATETIMEUNIT base, npy_datetimestruct *out) { - npy_int64 perday; - - /* Initialize the output to all zeros */ - memset(out, 0, sizeof(npy_datetimestruct)); - out->year = 1970; - out->month = 1; - out->day = 1; - - /* - * Note that care must be taken with the / and % operators - * for negative values. - */ - switch (base) { - case NPY_FR_Y: - out->year = 1970 + dt; - break; - - case NPY_FR_M: - out->year = 1970 + extract_unit(&dt, 12); - out->month = dt + 1; - break; - - case NPY_FR_W: - /* A week is 7 days */ - set_datetimestruct_days(dt * 7, out); - break; - - case NPY_FR_D: - set_datetimestruct_days(dt, out); - break; - - case NPY_FR_h: - perday = 24LL; - - set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = dt; - break; - - case NPY_FR_m: - perday = 24LL * 60; - - set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (int)extract_unit(&dt, 60); - out->min = (int)dt; - break; - - case NPY_FR_s: - perday = 24LL * 60 * 60; - - set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (int)extract_unit(&dt, 60 * 60); - out->min = (int)extract_unit(&dt, 60); - out->sec = (int)dt; - break; - - case NPY_FR_ms: - perday = 24LL * 60 * 60 * 1000; - - set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (int)extract_unit(&dt, 1000LL * 60 * 60); - out->min = (int)extract_unit(&dt, 1000LL * 60); - out->sec = (int)extract_unit(&dt, 1000LL); - out->us = (int)(dt * 1000); - break; - - case NPY_FR_us: - perday = 24LL * 60LL * 60LL * 1000LL * 1000LL; - - set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 60 * 60); - out->min = (int)extract_unit(&dt, 1000LL * 1000 * 60); - out->sec = (int)extract_unit(&dt, 1000LL * 1000); - out->us = (int)dt; - break; - - case NPY_FR_ns: - perday = 24LL * 60LL * 60LL * 1000LL * 1000LL * 1000LL; - - set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60); - out->min = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60); - out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000); - out->us = (int)extract_unit(&dt, 1000LL); - out->ps = (int)(dt * 1000); - break; - - case NPY_FR_ps: - perday = 24LL * 60 * 60 * 1000 * 1000 * 1000 * 1000; - - set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60); - out->min = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60); - out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000); - out->us = (int)extract_unit(&dt, 1000LL); - out->ps = (int)(dt * 1000); - break; - - case NPY_FR_fs: - /* entire range is only +- 2.6 hours */ - out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * - 1000 * 60 * 60); - if (out->hour < 0) { - out->year = 1969; - out->month = 12; - out->day = 31; - out->hour += 24; - assert(out->hour >= 0); - } - out->min = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * - 1000 * 60); - out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * - 1000); - out->us = (int)extract_unit(&dt, 1000LL * 1000 * 1000); - out->ps = (int)extract_unit(&dt, 1000LL); - out->as = (int)(dt * 1000); - break; - - case NPY_FR_as: - /* entire range is only +- 9.2 seconds */ - out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * - 1000 * 1000); - if (out->sec < 0) { - out->year = 1969; - out->month = 12; - out->day = 31; - out->hour = 23; - out->min = 59; - out->sec += 60; - assert(out->sec >= 0); - } - out->us = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000); - out->ps = (int)extract_unit(&dt, 1000LL * 1000); - out->as = (int)dt; - break; - - default: - PyErr_SetString(PyExc_RuntimeError, - "NumPy datetime metadata is corrupted with invalid " - "base unit"); + npy_int64 perday; + + /* Initialize the output to all zeros */ + memset(out, 0, sizeof(npy_datetimestruct)); + out->year = 1970; + out->month = 1; + out->day = 1; + + /* + * Note that care must be taken with the / and % operators + * for negative values. + */ + switch (base) { + case NPY_FR_Y: + out->year = 1970 + dt; + break; + + case NPY_FR_M: + out->year = 1970 + extract_unit(&dt, 12); + out->month = dt + 1; + break; + + case NPY_FR_W: + /* A week is 7 days */ + set_datetimestruct_days(dt * 7, out); + break; + + case NPY_FR_D: + set_datetimestruct_days(dt, out); + break; + + case NPY_FR_h: + perday = 24LL; + + set_datetimestruct_days(extract_unit(&dt, perday), out); + out->hour = dt; + break; + + case NPY_FR_m: + perday = 24LL * 60; + + set_datetimestruct_days(extract_unit(&dt, perday), out); + out->hour = (int)extract_unit(&dt, 60); + out->min = (int)dt; + break; + + case NPY_FR_s: + perday = 24LL * 60 * 60; + + set_datetimestruct_days(extract_unit(&dt, perday), out); + out->hour = (int)extract_unit(&dt, 60 * 60); + out->min = (int)extract_unit(&dt, 60); + out->sec = (int)dt; + break; + + case NPY_FR_ms: + perday = 24LL * 60 * 60 * 1000; + + set_datetimestruct_days(extract_unit(&dt, perday), out); + out->hour = (int)extract_unit(&dt, 1000LL * 60 * 60); + out->min = (int)extract_unit(&dt, 1000LL * 60); + out->sec = (int)extract_unit(&dt, 1000LL); + out->us = (int)(dt * 1000); + break; + + case NPY_FR_us: + perday = 24LL * 60LL * 60LL * 1000LL * 1000LL; + + set_datetimestruct_days(extract_unit(&dt, perday), out); + out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 60 * 60); + out->min = (int)extract_unit(&dt, 1000LL * 1000 * 60); + out->sec = (int)extract_unit(&dt, 1000LL * 1000); + out->us = (int)dt; + break; + + case NPY_FR_ns: + perday = 24LL * 60LL * 60LL * 1000LL * 1000LL * 1000LL; + + set_datetimestruct_days(extract_unit(&dt, perday), out); + out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60); + out->min = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60); + out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000); + out->us = (int)extract_unit(&dt, 1000LL); + out->ps = (int)(dt * 1000); + break; + + case NPY_FR_ps: + perday = 24LL * 60 * 60 * 1000 * 1000 * 1000 * 1000; + + set_datetimestruct_days(extract_unit(&dt, perday), out); + out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60); + out->min = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60); + out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000); + out->us = (int)extract_unit(&dt, 1000LL); + out->ps = (int)(dt * 1000); + break; + + case NPY_FR_fs: + /* entire range is only +- 2.6 hours */ + out->hour = + (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 1000 * 60 * 60); + if (out->hour < 0) { + out->year = 1969; + out->month = 12; + out->day = 31; + out->hour += 24; + assert(out->hour >= 0); } + out->min = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 1000 * 60); + out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 1000); + out->us = (int)extract_unit(&dt, 1000LL * 1000 * 1000); + out->ps = (int)extract_unit(&dt, 1000LL); + out->as = (int)(dt * 1000); + break; + + case NPY_FR_as: + /* entire range is only +- 9.2 seconds */ + out->sec = + (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 1000 * 1000); + if (out->sec < 0) { + out->year = 1969; + out->month = 12; + out->day = 31; + out->hour = 23; + out->min = 59; + out->sec += 60; + assert(out->sec >= 0); + } + out->us = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000); + out->ps = (int)extract_unit(&dt, 1000LL * 1000); + out->as = (int)dt; + break; + + default: + PyErr_SetString(PyExc_RuntimeError, + "NumPy datetime metadata is corrupted with invalid " + "base unit"); + } } /* @@ -579,363 +566,358 @@ void pandas_datetime_to_datetimestruct(npy_datetime dt, void pandas_timedelta_to_timedeltastruct(npy_timedelta td, NPY_DATETIMEUNIT base, pandas_timedeltastruct *out) { - npy_int64 frac; - npy_int64 sfrac; - npy_int64 ifrac; - int sign; - npy_int64 per_day; - npy_int64 per_sec; + npy_int64 frac; + npy_int64 sfrac; + npy_int64 ifrac; + int sign; + npy_int64 per_day; + npy_int64 per_sec; + + /* Initialize the output to all zeros */ + memset(out, 0, sizeof(pandas_timedeltastruct)); + + switch (base) { + case NPY_FR_ns: + + per_day = 86400000000000LL; + per_sec = 1000LL * 1000LL * 1000LL; + + // put frac in seconds + if (td < 0 && td % per_sec != 0) + frac = td / per_sec - 1; + else + frac = td / per_sec; + + if (frac < 0) { + sign = -1; + + // even fraction + if ((-frac % 86400LL) != 0) { + out->days = -frac / 86400LL + 1; + frac += 86400LL * out->days; + } else { + frac = -frac; + } + } else { + sign = 1; + out->days = 0; + } - /* Initialize the output to all zeros */ - memset(out, 0, sizeof(pandas_timedeltastruct)); + if (frac >= 86400) { + out->days += frac / 86400LL; + frac -= out->days * 86400LL; + } - switch (base) { - case NPY_FR_ns: - - per_day = 86400000000000LL; - per_sec = 1000LL * 1000LL * 1000LL; - - // put frac in seconds - if (td < 0 && td % per_sec != 0) - frac = td / per_sec - 1; - else - frac = td / per_sec; - - if (frac < 0) { - sign = -1; - - // even fraction - if ((-frac % 86400LL) != 0) { - out->days = -frac / 86400LL + 1; - frac += 86400LL * out->days; - } else { - frac = -frac; - } - } else { - sign = 1; - out->days = 0; - } - - if (frac >= 86400) { - out->days += frac / 86400LL; - frac -= out->days * 86400LL; - } - - if (frac >= 3600) { - out->hrs = frac / 3600LL; - frac -= out->hrs * 3600LL; - } else { - out->hrs = 0; - } - - if (frac >= 60) { - out->min = frac / 60LL; - frac -= out->min * 60LL; - } else { - out->min = 0; - } - - if (frac >= 0) { - out->sec = frac; - frac -= out->sec; - } else { - out->sec = 0; - } - - sfrac = (out->hrs * 3600LL + out->min * 60LL - + out->sec) * per_sec; - - if (sign < 0) - out->days = -out->days; - - ifrac = td - (out->days * per_day + sfrac); - - if (ifrac != 0) { - out->ms = ifrac / (1000LL * 1000LL); - ifrac -= out->ms * 1000LL * 1000LL; - out->us = ifrac / 1000LL; - ifrac -= out->us * 1000LL; - out->ns = ifrac; - } else { - out->ms = 0; - out->us = 0; - out->ns = 0; - } - break; - - case NPY_FR_us: - - per_day = 86400000000LL; - per_sec = 1000LL * 1000LL; - - // put frac in seconds - if (td < 0 && td % per_sec != 0) - frac = td / per_sec - 1; - else - frac = td / per_sec; - - if (frac < 0) { - sign = -1; - - // even fraction - if ((-frac % 86400LL) != 0) { - out->days = -frac / 86400LL + 1; - frac += 86400LL * out->days; - } else { - frac = -frac; - } - } else { - sign = 1; - out->days = 0; - } - - if (frac >= 86400) { - out->days += frac / 86400LL; - frac -= out->days * 86400LL; - } - - if (frac >= 3600) { - out->hrs = frac / 3600LL; - frac -= out->hrs * 3600LL; - } else { - out->hrs = 0; - } - - if (frac >= 60) { - out->min = frac / 60LL; - frac -= out->min * 60LL; - } else { - out->min = 0; - } - - if (frac >= 0) { - out->sec = frac; - frac -= out->sec; - } else { - out->sec = 0; - } - - sfrac = (out->hrs * 3600LL + out->min * 60LL - + out->sec) * per_sec; - - if (sign < 0) - out->days = -out->days; - - ifrac = td - (out->days * per_day + sfrac); - - if (ifrac != 0) { - out->ms = ifrac / 1000LL; - ifrac -= out->ms * 1000LL; - out->us = ifrac / 1L; - ifrac -= out->us * 1L; - out->ns = ifrac; - } else { - out->ms = 0; - out->us = 0; - out->ns = 0; - } - break; - - case NPY_FR_ms: - - per_day = 86400000LL; - per_sec = 1000LL; - - // put frac in seconds - if (td < 0 && td % per_sec != 0) - frac = td / per_sec - 1; - else - frac = td / per_sec; - - if (frac < 0) { - sign = -1; - - // even fraction - if ((-frac % 86400LL) != 0) { - out->days = -frac / 86400LL + 1; - frac += 86400LL * out->days; - } else { - frac = -frac; - } - } else { - sign = 1; - out->days = 0; - } - - if (frac >= 86400) { - out->days += frac / 86400LL; - frac -= out->days * 86400LL; - } - - if (frac >= 3600) { - out->hrs = frac / 3600LL; - frac -= out->hrs * 3600LL; - } else { - out->hrs = 0; - } - - if (frac >= 60) { - out->min = frac / 60LL; - frac -= out->min * 60LL; - } else { - out->min = 0; - } - - if (frac >= 0) { - out->sec = frac; - frac -= out->sec; - } else { - out->sec = 0; - } - - sfrac = (out->hrs * 3600LL + out->min * 60LL - + out->sec) * per_sec; - - if (sign < 0) - out->days = -out->days; - - ifrac = td - (out->days * per_day + sfrac); - - if (ifrac != 0) { - out->ms = ifrac; - out->us = 0; - out->ns = 0; - } else { - out->ms = 0; - out->us = 0; - out->ns = 0; - } - break; - - case NPY_FR_s: - // special case where we can simplify many expressions bc per_sec=1 - - per_day = 86400LL; - per_sec = 1L; - - // put frac in seconds - if (td < 0 && td % per_sec != 0) - frac = td / per_sec - 1; - else - frac = td / per_sec; - - if (frac < 0) { - sign = -1; - - // even fraction - if ((-frac % 86400LL) != 0) { - out->days = -frac / 86400LL + 1; - frac += 86400LL * out->days; - } else { - frac = -frac; - } - } else { - sign = 1; - out->days = 0; - } - - if (frac >= 86400) { - out->days += frac / 86400LL; - frac -= out->days * 86400LL; - } - - if (frac >= 3600) { - out->hrs = frac / 3600LL; - frac -= out->hrs * 3600LL; - } else { - out->hrs = 0; - } - - if (frac >= 60) { - out->min = frac / 60LL; - frac -= out->min * 60LL; - } else { - out->min = 0; - } - - if (frac >= 0) { - out->sec = frac; - frac -= out->sec; - } else { - out->sec = 0; - } - - sfrac = (out->hrs * 3600LL + out->min * 60LL - + out->sec) * per_sec; - - if (sign < 0) - out->days = -out->days; - - ifrac = td - (out->days * per_day + sfrac); - - if (ifrac != 0) { - out->ms = 0; - out->us = 0; - out->ns = 0; - } else { - out->ms = 0; - out->us = 0; - out->ns = 0; - } - break; - - case NPY_FR_m: - - out->days = td / 1440LL; - td -= out->days * 1440LL; - out->hrs = td / 60LL; - td -= out->hrs * 60LL; - out->min = td; - - out->sec = 0; - out->ms = 0; - out->us = 0; - out->ns = 0; - break; - - case NPY_FR_h: - out->days = td / 24LL; - td -= out->days * 24LL; - out->hrs = td; - - out->min = 0; - out->sec = 0; - out->ms = 0; - out->us = 0; - out->ns = 0; - break; - - case NPY_FR_D: - out->days = td; - out->hrs = 0; - out->min = 0; - out->sec = 0; - out->ms = 0; - out->us = 0; - out->ns = 0; - break; - - case NPY_FR_W: - out->days = 7 * td; - out->hrs = 0; - out->min = 0; - out->sec = 0; - out->ms = 0; - out->us = 0; - out->ns = 0; - break; - - default: - PyErr_SetString(PyExc_RuntimeError, - "NumPy timedelta metadata is corrupted with " - "invalid base unit"); - } - - out->seconds = out->hrs * 3600 + out->min * 60 + out->sec; - out->microseconds = out->ms * 1000 + out->us; - out->nanoseconds = out->ns; -} + if (frac >= 3600) { + out->hrs = frac / 3600LL; + frac -= out->hrs * 3600LL; + } else { + out->hrs = 0; + } + + if (frac >= 60) { + out->min = frac / 60LL; + frac -= out->min * 60LL; + } else { + out->min = 0; + } + + if (frac >= 0) { + out->sec = frac; + frac -= out->sec; + } else { + out->sec = 0; + } + + sfrac = (out->hrs * 3600LL + out->min * 60LL + out->sec) * per_sec; + + if (sign < 0) + out->days = -out->days; + + ifrac = td - (out->days * per_day + sfrac); + + if (ifrac != 0) { + out->ms = ifrac / (1000LL * 1000LL); + ifrac -= out->ms * 1000LL * 1000LL; + out->us = ifrac / 1000LL; + ifrac -= out->us * 1000LL; + out->ns = ifrac; + } else { + out->ms = 0; + out->us = 0; + out->ns = 0; + } + break; + + case NPY_FR_us: + + per_day = 86400000000LL; + per_sec = 1000LL * 1000LL; + + // put frac in seconds + if (td < 0 && td % per_sec != 0) + frac = td / per_sec - 1; + else + frac = td / per_sec; + + if (frac < 0) { + sign = -1; + + // even fraction + if ((-frac % 86400LL) != 0) { + out->days = -frac / 86400LL + 1; + frac += 86400LL * out->days; + } else { + frac = -frac; + } + } else { + sign = 1; + out->days = 0; + } + + if (frac >= 86400) { + out->days += frac / 86400LL; + frac -= out->days * 86400LL; + } + + if (frac >= 3600) { + out->hrs = frac / 3600LL; + frac -= out->hrs * 3600LL; + } else { + out->hrs = 0; + } + + if (frac >= 60) { + out->min = frac / 60LL; + frac -= out->min * 60LL; + } else { + out->min = 0; + } + + if (frac >= 0) { + out->sec = frac; + frac -= out->sec; + } else { + out->sec = 0; + } + + sfrac = (out->hrs * 3600LL + out->min * 60LL + out->sec) * per_sec; + + if (sign < 0) + out->days = -out->days; + + ifrac = td - (out->days * per_day + sfrac); + + if (ifrac != 0) { + out->ms = ifrac / 1000LL; + ifrac -= out->ms * 1000LL; + out->us = ifrac / 1L; + ifrac -= out->us * 1L; + out->ns = ifrac; + } else { + out->ms = 0; + out->us = 0; + out->ns = 0; + } + break; + + case NPY_FR_ms: + + per_day = 86400000LL; + per_sec = 1000LL; + + // put frac in seconds + if (td < 0 && td % per_sec != 0) + frac = td / per_sec - 1; + else + frac = td / per_sec; + + if (frac < 0) { + sign = -1; + + // even fraction + if ((-frac % 86400LL) != 0) { + out->days = -frac / 86400LL + 1; + frac += 86400LL * out->days; + } else { + frac = -frac; + } + } else { + sign = 1; + out->days = 0; + } + + if (frac >= 86400) { + out->days += frac / 86400LL; + frac -= out->days * 86400LL; + } + if (frac >= 3600) { + out->hrs = frac / 3600LL; + frac -= out->hrs * 3600LL; + } else { + out->hrs = 0; + } + + if (frac >= 60) { + out->min = frac / 60LL; + frac -= out->min * 60LL; + } else { + out->min = 0; + } + + if (frac >= 0) { + out->sec = frac; + frac -= out->sec; + } else { + out->sec = 0; + } + + sfrac = (out->hrs * 3600LL + out->min * 60LL + out->sec) * per_sec; + + if (sign < 0) + out->days = -out->days; + + ifrac = td - (out->days * per_day + sfrac); + + if (ifrac != 0) { + out->ms = ifrac; + out->us = 0; + out->ns = 0; + } else { + out->ms = 0; + out->us = 0; + out->ns = 0; + } + break; + + case NPY_FR_s: + // special case where we can simplify many expressions bc per_sec=1 + + per_day = 86400LL; + per_sec = 1L; + + // put frac in seconds + if (td < 0 && td % per_sec != 0) + frac = td / per_sec - 1; + else + frac = td / per_sec; + + if (frac < 0) { + sign = -1; + + // even fraction + if ((-frac % 86400LL) != 0) { + out->days = -frac / 86400LL + 1; + frac += 86400LL * out->days; + } else { + frac = -frac; + } + } else { + sign = 1; + out->days = 0; + } + + if (frac >= 86400) { + out->days += frac / 86400LL; + frac -= out->days * 86400LL; + } + + if (frac >= 3600) { + out->hrs = frac / 3600LL; + frac -= out->hrs * 3600LL; + } else { + out->hrs = 0; + } + + if (frac >= 60) { + out->min = frac / 60LL; + frac -= out->min * 60LL; + } else { + out->min = 0; + } + + if (frac >= 0) { + out->sec = frac; + frac -= out->sec; + } else { + out->sec = 0; + } + + sfrac = (out->hrs * 3600LL + out->min * 60LL + out->sec) * per_sec; + + if (sign < 0) + out->days = -out->days; + + ifrac = td - (out->days * per_day + sfrac); + + if (ifrac != 0) { + out->ms = 0; + out->us = 0; + out->ns = 0; + } else { + out->ms = 0; + out->us = 0; + out->ns = 0; + } + break; + + case NPY_FR_m: + + out->days = td / 1440LL; + td -= out->days * 1440LL; + out->hrs = td / 60LL; + td -= out->hrs * 60LL; + out->min = td; + + out->sec = 0; + out->ms = 0; + out->us = 0; + out->ns = 0; + break; + + case NPY_FR_h: + out->days = td / 24LL; + td -= out->days * 24LL; + out->hrs = td; + + out->min = 0; + out->sec = 0; + out->ms = 0; + out->us = 0; + out->ns = 0; + break; + + case NPY_FR_D: + out->days = td; + out->hrs = 0; + out->min = 0; + out->sec = 0; + out->ms = 0; + out->us = 0; + out->ns = 0; + break; + + case NPY_FR_W: + out->days = 7 * td; + out->hrs = 0; + out->min = 0; + out->sec = 0; + out->ms = 0; + out->us = 0; + out->ns = 0; + break; + + default: + PyErr_SetString(PyExc_RuntimeError, + "NumPy timedelta metadata is corrupted with " + "invalid base unit"); + } + + out->seconds = out->hrs * 3600 + out->min * 60 + out->sec; + out->microseconds = out->ms * 1000 + out->us; + out->nanoseconds = out->ns; +} /* * This function returns a pointer to the DateTimeMetaData @@ -945,5 +927,5 @@ void pandas_timedelta_to_timedeltastruct(npy_timedelta td, */ PyArray_DatetimeMetaData get_datetime_metadata_from_dtype(PyArray_Descr *dtype) { - return (((PyArray_DatetimeDTypeMetaData *)dtype->c_metadata)->meta); + return (((PyArray_DatetimeDTypeMetaData *)dtype->c_metadata)->meta); } diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c index 9646183fa1786..a0d56efc14bd9 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c @@ -26,7 +26,7 @@ This file implements string parsing and creation for NumPy datetime. #ifndef NPY_NO_DEPRECATED_API #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION -#endif // NPY_NO_DEPRECATED_API +#endif // NPY_NO_DEPRECATED_API #include @@ -39,7 +39,6 @@ This file implements string parsing and creation for NumPy datetime. #include "pandas/vendored/numpy/datetime/np_datetime.h" #include "pandas/vendored/numpy/datetime/np_datetime_strings.h" - /* * Parses (almost) standard ISO 8601 date strings. The differences are: * @@ -70,22 +69,19 @@ This file implements string parsing and creation for NumPy datetime. */ typedef enum { - COMPARISON_SUCCESS, - COMPLETED_PARTIAL_MATCH, - COMPARISON_ERROR + COMPARISON_SUCCESS, + COMPLETED_PARTIAL_MATCH, + COMPARISON_ERROR } DatetimePartParseResult; // This function will advance the pointer on format // and decrement characters_remaining by n on success // On failure will return COMPARISON_ERROR without incrementing // If `format_requirement` is PARTIAL_MATCH, and the `format` string has // been exhausted, then return COMPLETED_PARTIAL_MATCH. -static DatetimePartParseResult compare_format( - const char **format, - int *characters_remaining, - const char *compare_to, - int n, - const FormatRequirement format_requirement -) { +static DatetimePartParseResult +compare_format(const char **format, int *characters_remaining, + const char *compare_to, int n, + const FormatRequirement format_requirement) { if (format_requirement == INFER_FORMAT) { return COMPARISON_SUCCESS; } @@ -113,636 +109,649 @@ static DatetimePartParseResult compare_format( int parse_iso_8601_datetime(const char *str, int len, int want_exc, npy_datetimestruct *out, - NPY_DATETIMEUNIT *out_bestunit, - int *out_local, int *out_tzoffset, - const char* format, int format_len, + NPY_DATETIMEUNIT *out_bestunit, int *out_local, + int *out_tzoffset, const char *format, + int format_len, FormatRequirement format_requirement) { - if (len < 0 || format_len < 0) - goto parse_error; - int year_leap = 0; - int i, numdigits; - const char *substr; - int sublen; - NPY_DATETIMEUNIT bestunit = NPY_FR_GENERIC; - DatetimePartParseResult comparison; - - /* If year-month-day are separated by a valid separator, - * months/days without leading zeroes will be parsed - * (though not iso8601). If the components aren't separated, - * 4 (YYYY) or 8 (YYYYMMDD) digits are expected. 6 digits are - * forbidden here (but parsed as YYMMDD elsewhere). - */ - int has_ymd_sep = 0; - char ymd_sep = '\0'; - char valid_ymd_sep[] = {'-', '.', '/', '\\', ' '}; - int valid_ymd_sep_len = sizeof(valid_ymd_sep); - - /* hour-minute-second may or may not separated by ':'. If not, then - * each component must be 2 digits. */ - int has_hms_sep = 0; - int hour_was_2_digits = 0; - - /* Initialize the output to all zeros */ - memset(out, 0, sizeof(npy_datetimestruct)); - out->month = 1; - out->day = 1; - - substr = str; - sublen = len; - - /* Skip leading whitespace */ - while (sublen > 0 && isspace(*substr)) { - ++substr; - --sublen; - comparison = compare_format(&format, &format_len, " ", 1, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - } - - /* Leading '-' sign for negative year */ - if (*substr == '-') { - ++substr; - --sublen; - } - - if (sublen == 0) { - goto parse_error; - } - - /* PARSE THE YEAR (4 digits) */ - comparison = compare_format(&format, &format_len, "%Y", 2, format_requirement); + if (len < 0 || format_len < 0) + goto parse_error; + int year_leap = 0; + int i, numdigits; + const char *substr; + int sublen; + NPY_DATETIMEUNIT bestunit = NPY_FR_GENERIC; + DatetimePartParseResult comparison; + + /* If year-month-day are separated by a valid separator, + * months/days without leading zeroes will be parsed + * (though not iso8601). If the components aren't separated, + * 4 (YYYY) or 8 (YYYYMMDD) digits are expected. 6 digits are + * forbidden here (but parsed as YYMMDD elsewhere). + */ + int has_ymd_sep = 0; + char ymd_sep = '\0'; + char valid_ymd_sep[] = {'-', '.', '/', '\\', ' '}; + int valid_ymd_sep_len = sizeof(valid_ymd_sep); + + /* hour-minute-second may or may not separated by ':'. If not, then + * each component must be 2 digits. */ + int has_hms_sep = 0; + int hour_was_2_digits = 0; + + /* Initialize the output to all zeros */ + memset(out, 0, sizeof(npy_datetimestruct)); + out->month = 1; + out->day = 1; + + substr = str; + sublen = len; + + /* Skip leading whitespace */ + while (sublen > 0 && isspace(*substr)) { + ++substr; + --sublen; + comparison = + compare_format(&format, &format_len, " ", 1, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + goto parse_error; } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; + goto finish; } + } - out->year = 0; - if (sublen >= 4 && isdigit(substr[0]) && isdigit(substr[1]) && - isdigit(substr[2]) && isdigit(substr[3])) { - out->year = 1000 * (substr[0] - '0') + 100 * (substr[1] - '0') + - 10 * (substr[2] - '0') + (substr[3] - '0'); + /* Leading '-' sign for negative year */ + if (*substr == '-') { + ++substr; + --sublen; + } - substr += 4; - sublen -= 4; - } + if (sublen == 0) { + goto parse_error; + } - /* Negate the year if necessary */ - if (str[0] == '-') { - out->year = -out->year; - } - /* Check whether it's a leap-year */ - year_leap = is_leapyear(out->year); + /* PARSE THE YEAR (4 digits) */ + comparison = + compare_format(&format, &format_len, "%Y", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { + goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; + } - /* Next character must be a separator, start of month, or end of string */ - if (sublen == 0) { - if (out_local != NULL) { - *out_local = 0; - } - if (format_len) { - goto parse_error; - } - bestunit = NPY_FR_Y; - goto finish; - } + out->year = 0; + if (sublen >= 4 && isdigit(substr[0]) && isdigit(substr[1]) && + isdigit(substr[2]) && isdigit(substr[3])) { + out->year = 1000 * (substr[0] - '0') + 100 * (substr[1] - '0') + + 10 * (substr[2] - '0') + (substr[3] - '0'); - if (!isdigit(*substr)) { - for (i = 0; i < valid_ymd_sep_len; ++i) { - if (*substr == valid_ymd_sep[i]) { - break; - } - } - if (i == valid_ymd_sep_len) { - goto parse_error; - } - has_ymd_sep = 1; - ymd_sep = valid_ymd_sep[i]; - ++substr; - --sublen; + substr += 4; + sublen -= 4; + } - comparison = compare_format(&format, &format_len, &ymd_sep, 1, - format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - /* Cannot have trailing separator */ - if (sublen == 0 || !isdigit(*substr)) { - goto parse_error; - } - } + /* Negate the year if necessary */ + if (str[0] == '-') { + out->year = -out->year; + } + /* Check whether it's a leap-year */ + year_leap = is_leapyear(out->year); - /* PARSE THE MONTH */ - comparison = compare_format(&format, &format_len, "%m", 2, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - /* First digit required */ - out->month = (*substr - '0'); - ++substr; - --sublen; - /* Second digit optional if there was a separator */ - if (isdigit(*substr)) { - out->month = 10 * out->month + (*substr - '0'); - ++substr; - --sublen; - } else if (!has_ymd_sep) { - goto parse_error; + /* Next character must be a separator, start of month, or end of string */ + if (sublen == 0) { + if (out_local != NULL) { + *out_local = 0; } - if (out->month < 1 || out->month > 12) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Month out of range in datetime string \"%s\"", str); - } - goto error; + if (format_len) { + goto parse_error; } + bestunit = NPY_FR_Y; + goto finish; + } - /* Next character must be the separator, start of day, or end of string */ - if (sublen == 0) { - bestunit = NPY_FR_M; - /* Forbid YYYYMM. Parsed instead as YYMMDD by someone else. */ - if (!has_ymd_sep) { - goto parse_error; - } - if (format_len) { - goto parse_error; - } - if (out_local != NULL) { - *out_local = 0; - } - goto finish; + if (!isdigit(*substr)) { + for (i = 0; i < valid_ymd_sep_len; ++i) { + if (*substr == valid_ymd_sep[i]) { + break; + } } - - if (has_ymd_sep) { - /* Must have separator, but cannot be trailing */ - if (*substr != ymd_sep || sublen == 1) { - goto parse_error; - } - ++substr; - --sublen; - comparison = compare_format(&format, &format_len, &ymd_sep, 1, - format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } + if (i == valid_ymd_sep_len) { + goto parse_error; } + has_ymd_sep = 1; + ymd_sep = valid_ymd_sep[i]; + ++substr; + --sublen; - /* PARSE THE DAY */ - comparison = compare_format(&format, &format_len, "%d", 2, format_requirement); + comparison = + compare_format(&format, &format_len, &ymd_sep, 1, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + goto parse_error; } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; + goto finish; } - /* First digit required */ - if (!isdigit(*substr)) { - goto parse_error; + /* Cannot have trailing separator */ + if (sublen == 0 || !isdigit(*substr)) { + goto parse_error; } - out->day = (*substr - '0'); + } + + /* PARSE THE MONTH */ + comparison = + compare_format(&format, &format_len, "%m", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { + goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; + } + /* First digit required */ + out->month = (*substr - '0'); + ++substr; + --sublen; + /* Second digit optional if there was a separator */ + if (isdigit(*substr)) { + out->month = 10 * out->month + (*substr - '0'); ++substr; --sublen; - /* Second digit optional if there was a separator */ - if (isdigit(*substr)) { - out->day = 10 * out->day + (*substr - '0'); - ++substr; - --sublen; - } else if (!has_ymd_sep) { - goto parse_error; - } - if (out->day < 1 || - out->day > days_per_month_table[year_leap][out->month - 1]) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Day out of range in datetime string \"%s\"", str); - } - goto error; + } else if (!has_ymd_sep) { + goto parse_error; + } + if (out->month < 1 || out->month > 12) { + if (want_exc) { + PyErr_Format(PyExc_ValueError, + "Month out of range in datetime string \"%s\"", str); } + goto error; + } - /* Next character must be a 'T', ' ', or end of string */ - if (sublen == 0) { - if (out_local != NULL) { - *out_local = 0; - } - if (format_len) { - goto parse_error; - } - bestunit = NPY_FR_D; - goto finish; + /* Next character must be the separator, start of day, or end of string */ + if (sublen == 0) { + bestunit = NPY_FR_M; + /* Forbid YYYYMM. Parsed instead as YYMMDD by someone else. */ + if (!has_ymd_sep) { + goto parse_error; } + if (format_len) { + goto parse_error; + } + if (out_local != NULL) { + *out_local = 0; + } + goto finish; + } - if ((*substr != 'T' && *substr != ' ') || sublen == 1) { - goto parse_error; + if (has_ymd_sep) { + /* Must have separator, but cannot be trailing */ + if (*substr != ymd_sep || sublen == 1) { + goto parse_error; } - comparison = compare_format(&format, &format_len, substr, 1, format_requirement); + ++substr; + --sublen; + comparison = + compare_format(&format, &format_len, &ymd_sep, 1, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + goto parse_error; } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; + goto finish; } + } + + /* PARSE THE DAY */ + comparison = + compare_format(&format, &format_len, "%d", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { + goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; + } + /* First digit required */ + if (!isdigit(*substr)) { + goto parse_error; + } + out->day = (*substr - '0'); + ++substr; + --sublen; + /* Second digit optional if there was a separator */ + if (isdigit(*substr)) { + out->day = 10 * out->day + (*substr - '0'); ++substr; --sublen; + } else if (!has_ymd_sep) { + goto parse_error; + } + if (out->day < 1 || + out->day > days_per_month_table[year_leap][out->month - 1]) { + if (want_exc) { + PyErr_Format(PyExc_ValueError, + "Day out of range in datetime string \"%s\"", str); + } + goto error; + } - /* PARSE THE HOURS */ - comparison = compare_format(&format, &format_len, "%H", 2, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; + /* Next character must be a 'T', ' ', or end of string */ + if (sublen == 0) { + if (out_local != NULL) { + *out_local = 0; } - /* First digit required */ - if (!isdigit(*substr)) { - goto parse_error; + if (format_len) { + goto parse_error; } - out->hour = (*substr - '0'); + bestunit = NPY_FR_D; + goto finish; + } + + if ((*substr != 'T' && *substr != ' ') || sublen == 1) { + goto parse_error; + } + comparison = + compare_format(&format, &format_len, substr, 1, format_requirement); + if (comparison == COMPARISON_ERROR) { + goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; + } + ++substr; + --sublen; + + /* PARSE THE HOURS */ + comparison = + compare_format(&format, &format_len, "%H", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { + goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; + } + /* First digit required */ + if (!isdigit(*substr)) { + goto parse_error; + } + out->hour = (*substr - '0'); + ++substr; + --sublen; + /* Second digit optional */ + if (isdigit(*substr)) { + hour_was_2_digits = 1; + out->hour = 10 * out->hour + (*substr - '0'); ++substr; --sublen; - /* Second digit optional */ - if (isdigit(*substr)) { - hour_was_2_digits = 1; - out->hour = 10 * out->hour + (*substr - '0'); - ++substr; - --sublen; - if (out->hour >= 24) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Hours out of range in datetime string \"%s\"", - str); - } - goto error; - } + if (out->hour >= 24) { + if (want_exc) { + PyErr_Format(PyExc_ValueError, + "Hours out of range in datetime string \"%s\"", str); + } + goto error; } + } - /* Next character must be a ':' or the end of the string */ - if (sublen == 0) { - if (!hour_was_2_digits) { - goto parse_error; - } - if (format_len) { - goto parse_error; - } - bestunit = NPY_FR_h; - goto finish; + /* Next character must be a ':' or the end of the string */ + if (sublen == 0) { + if (!hour_was_2_digits) { + goto parse_error; } - - if (*substr == ':') { - has_hms_sep = 1; - ++substr; - --sublen; - /* Cannot have a trailing separator */ - if (sublen == 0 || !isdigit(*substr)) { - goto parse_error; - } - comparison = compare_format(&format, &format_len, ":", 1, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - } else if (!isdigit(*substr)) { - if (!hour_was_2_digits) { - goto parse_error; - } - goto parse_timezone; + if (format_len) { + goto parse_error; } + bestunit = NPY_FR_h; + goto finish; + } - /* PARSE THE MINUTES */ - comparison = compare_format(&format, &format_len, "%M", 2, format_requirement); + if (*substr == ':') { + has_hms_sep = 1; + ++substr; + --sublen; + /* Cannot have a trailing separator */ + if (sublen == 0 || !isdigit(*substr)) { + goto parse_error; + } + comparison = + compare_format(&format, &format_len, ":", 1, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + goto parse_error; } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; + goto finish; } - /* First digit required */ - out->min = (*substr - '0'); - ++substr; - --sublen; - /* Second digit optional if there was a separator */ - if (isdigit(*substr)) { - out->min = 10 * out->min + (*substr - '0'); - ++substr; - --sublen; - if (out->min >= 60) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Minutes out of range in datetime string \"%s\"", - str); - } - goto error; - } - } else if (!has_hms_sep) { - goto parse_error; + } else if (!isdigit(*substr)) { + if (!hour_was_2_digits) { + goto parse_error; } + goto parse_timezone; + } - if (sublen == 0) { - bestunit = NPY_FR_m; - if (format_len) { - goto parse_error; - } - goto finish; + /* PARSE THE MINUTES */ + comparison = + compare_format(&format, &format_len, "%M", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { + goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; + } + /* First digit required */ + out->min = (*substr - '0'); + ++substr; + --sublen; + /* Second digit optional if there was a separator */ + if (isdigit(*substr)) { + out->min = 10 * out->min + (*substr - '0'); + ++substr; + --sublen; + if (out->min >= 60) { + if (want_exc) { + PyErr_Format(PyExc_ValueError, + "Minutes out of range in datetime string \"%s\"", str); + } + goto error; } + } else if (!has_hms_sep) { + goto parse_error; + } - /* If we make it through this condition block, then the next - * character is a digit. */ - if (has_hms_sep && *substr == ':') { - comparison = compare_format(&format, &format_len, ":", 1, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - ++substr; - --sublen; - /* Cannot have a trailing ':' */ - if (sublen == 0 || !isdigit(*substr)) { - goto parse_error; - } - } else if (!has_hms_sep && isdigit(*substr)) { - } else { - goto parse_timezone; + if (sublen == 0) { + bestunit = NPY_FR_m; + if (format_len) { + goto parse_error; } + goto finish; + } - /* PARSE THE SECONDS */ - comparison = compare_format(&format, &format_len, "%S", 2, format_requirement); + /* If we make it through this condition block, then the next + * character is a digit. */ + if (has_hms_sep && *substr == ':') { + comparison = + compare_format(&format, &format_len, ":", 1, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + goto parse_error; } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; + goto finish; } - /* First digit required */ - out->sec = (*substr - '0'); ++substr; --sublen; - /* Second digit optional if there was a separator */ - if (isdigit(*substr)) { - out->sec = 10 * out->sec + (*substr - '0'); - ++substr; - --sublen; - if (out->sec >= 60) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Seconds out of range in datetime string \"%s\"", - str); - } - goto error; - } - } else if (!has_hms_sep) { - goto parse_error; + /* Cannot have a trailing ':' */ + if (sublen == 0 || !isdigit(*substr)) { + goto parse_error; } + } else if (!has_hms_sep && isdigit(*substr)) { + } else { + goto parse_timezone; + } - /* Next character may be a '.' indicating fractional seconds */ - if (sublen > 0 && *substr == '.') { - ++substr; - --sublen; - comparison = compare_format(&format, &format_len, ".", 1, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - } else { - bestunit = NPY_FR_s; - goto parse_timezone; + /* PARSE THE SECONDS */ + comparison = + compare_format(&format, &format_len, "%S", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { + goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; + } + /* First digit required */ + out->sec = (*substr - '0'); + ++substr; + --sublen; + /* Second digit optional if there was a separator */ + if (isdigit(*substr)) { + out->sec = 10 * out->sec + (*substr - '0'); + ++substr; + --sublen; + if (out->sec >= 60) { + if (want_exc) { + PyErr_Format(PyExc_ValueError, + "Seconds out of range in datetime string \"%s\"", str); + } + goto error; } + } else if (!has_hms_sep) { + goto parse_error; + } - /* PARSE THE MICROSECONDS (0 to 6 digits) */ - comparison = compare_format(&format, &format_len, "%f", 2, format_requirement); + /* Next character may be a '.' indicating fractional seconds */ + if (sublen > 0 && *substr == '.') { + ++substr; + --sublen; + comparison = + compare_format(&format, &format_len, ".", 1, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + goto parse_error; } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - numdigits = 0; - for (i = 0; i < 6; ++i) { - out->us *= 10; - if (sublen > 0 && isdigit(*substr)) { - out->us += (*substr - '0'); - ++substr; - --sublen; - ++numdigits; - } - } - - if (sublen == 0 || !isdigit(*substr)) { - if (numdigits > 3) { - bestunit = NPY_FR_us; - } else { - bestunit = NPY_FR_ms; - } - goto parse_timezone; + goto finish; } + } else { + bestunit = NPY_FR_s; + goto parse_timezone; + } - /* PARSE THE PICOSECONDS (0 to 6 digits) */ - numdigits = 0; - for (i = 0; i < 6; ++i) { - out->ps *= 10; - if (sublen > 0 && isdigit(*substr)) { - out->ps += (*substr - '0'); - ++substr; - --sublen; - ++numdigits; - } + /* PARSE THE MICROSECONDS (0 to 6 digits) */ + comparison = + compare_format(&format, &format_len, "%f", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { + goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; + } + numdigits = 0; + for (i = 0; i < 6; ++i) { + out->us *= 10; + if (sublen > 0 && isdigit(*substr)) { + out->us += (*substr - '0'); + ++substr; + --sublen; + ++numdigits; } + } - if (sublen == 0 || !isdigit(*substr)) { - if (numdigits > 3) { - bestunit = NPY_FR_ps; - } else { - bestunit = NPY_FR_ns; - } - goto parse_timezone; + if (sublen == 0 || !isdigit(*substr)) { + if (numdigits > 3) { + bestunit = NPY_FR_us; + } else { + bestunit = NPY_FR_ms; } + goto parse_timezone; + } - /* PARSE THE ATTOSECONDS (0 to 6 digits) */ - numdigits = 0; - for (i = 0; i < 6; ++i) { - out->as *= 10; - if (sublen > 0 && isdigit(*substr)) { - out->as += (*substr - '0'); - ++substr; - --sublen; - ++numdigits; - } + /* PARSE THE PICOSECONDS (0 to 6 digits) */ + numdigits = 0; + for (i = 0; i < 6; ++i) { + out->ps *= 10; + if (sublen > 0 && isdigit(*substr)) { + out->ps += (*substr - '0'); + ++substr; + --sublen; + ++numdigits; } + } + if (sublen == 0 || !isdigit(*substr)) { if (numdigits > 3) { - bestunit = NPY_FR_as; + bestunit = NPY_FR_ps; } else { - bestunit = NPY_FR_fs; + bestunit = NPY_FR_ns; } + goto parse_timezone; + } -parse_timezone: - /* trim any whitespace between time/timezone */ - while (sublen > 0 && isspace(*substr)) { - ++substr; - --sublen; - comparison = compare_format(&format, &format_len, " ", 1, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } + /* PARSE THE ATTOSECONDS (0 to 6 digits) */ + numdigits = 0; + for (i = 0; i < 6; ++i) { + out->as *= 10; + if (sublen > 0 && isdigit(*substr)) { + out->as += (*substr - '0'); + ++substr; + --sublen; + ++numdigits; } + } - if (sublen == 0) { - // Unlike NumPy, treating no time zone as naive - if (format_len > 0) { - goto parse_error; - } - goto finish; - } + if (numdigits > 3) { + bestunit = NPY_FR_as; + } else { + bestunit = NPY_FR_fs; + } - /* UTC specifier */ - if (*substr == 'Z') { - comparison = compare_format(&format, &format_len, "%z", 2, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - /* "Z" should be equivalent to tz offset "+00:00" */ - if (out_local != NULL) { - *out_local = 1; - } +parse_timezone: + /* trim any whitespace between time/timezone */ + while (sublen > 0 && isspace(*substr)) { + ++substr; + --sublen; + comparison = + compare_format(&format, &format_len, " ", 1, format_requirement); + if (comparison == COMPARISON_ERROR) { + goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; + } + } - if (out_tzoffset != NULL) { - *out_tzoffset = 0; - } + if (sublen == 0) { + // Unlike NumPy, treating no time zone as naive + if (format_len > 0) { + goto parse_error; + } + goto finish; + } - if (sublen == 1) { - if (format_len > 0) { - goto parse_error; - } - goto finish; - } else { - ++substr; - --sublen; - } - } else if (*substr == '-' || *substr == '+') { - comparison = compare_format(&format, &format_len, "%z", 2, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - /* Time zone offset */ - int offset_neg = 0, offset_hour = 0, offset_minute = 0; + /* UTC specifier */ + if (*substr == 'Z') { + comparison = + compare_format(&format, &format_len, "%z", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { + goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; + } + /* "Z" should be equivalent to tz offset "+00:00" */ + if (out_local != NULL) { + *out_local = 1; + } - /* - * Since "local" means local with respect to the current - * machine, we say this is non-local. - */ + if (out_tzoffset != NULL) { + *out_tzoffset = 0; + } - if (*substr == '-') { - offset_neg = 1; - } - ++substr; - --sublen; + if (sublen == 1) { + if (format_len > 0) { + goto parse_error; + } + goto finish; + } else { + ++substr; + --sublen; + } + } else if (*substr == '-' || *substr == '+') { + comparison = + compare_format(&format, &format_len, "%z", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { + goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; + } + /* Time zone offset */ + int offset_neg = 0, offset_hour = 0, offset_minute = 0; - /* The hours offset */ - if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { - offset_hour = 10 * (substr[0] - '0') + (substr[1] - '0'); - substr += 2; - sublen -= 2; - if (offset_hour >= 24) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Timezone hours offset out of range " - "in datetime string \"%s\"", - str); - } - goto error; - } - } else if (sublen >= 1 && isdigit(substr[0])) { - offset_hour = substr[0] - '0'; - ++substr; - --sublen; - } else { - goto parse_error; - } + /* + * Since "local" means local with respect to the current + * machine, we say this is non-local. + */ - /* The minutes offset is optional */ - if (sublen > 0) { - /* Optional ':' */ - if (*substr == ':') { - ++substr; - --sublen; - } - - /* The minutes offset (at the end of the string) */ - if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { - offset_minute = 10 * (substr[0] - '0') + (substr[1] - '0'); - substr += 2; - sublen -= 2; - if (offset_minute >= 60) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Timezone minutes offset out of range " - "in datetime string \"%s\"", - str); - } - goto error; - } - } else if (sublen >= 1 && isdigit(substr[0])) { - offset_minute = substr[0] - '0'; - ++substr; - --sublen; - } else { - goto parse_error; - } - } + if (*substr == '-') { + offset_neg = 1; + } + ++substr; + --sublen; - /* Apply the time zone offset */ - if (offset_neg) { - offset_hour = -offset_hour; - offset_minute = -offset_minute; - } - if (out_local != NULL) { - *out_local = 1; - // Unlike NumPy, do not change internal value to local time - *out_tzoffset = 60 * offset_hour + offset_minute; + /* The hours offset */ + if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { + offset_hour = 10 * (substr[0] - '0') + (substr[1] - '0'); + substr += 2; + sublen -= 2; + if (offset_hour >= 24) { + if (want_exc) { + PyErr_Format(PyExc_ValueError, + "Timezone hours offset out of range " + "in datetime string \"%s\"", + str); } + goto error; + } + } else if (sublen >= 1 && isdigit(substr[0])) { + offset_hour = substr[0] - '0'; + ++substr; + --sublen; + } else { + goto parse_error; } - /* Skip trailing whitespace */ - while (sublen > 0 && isspace(*substr)) { + /* The minutes offset is optional */ + if (sublen > 0) { + /* Optional ':' */ + if (*substr == ':') { ++substr; --sublen; - comparison = compare_format(&format, &format_len, " ", 1, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; + } + + /* The minutes offset (at the end of the string) */ + if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { + offset_minute = 10 * (substr[0] - '0') + (substr[1] - '0'); + substr += 2; + sublen -= 2; + if (offset_minute >= 60) { + if (want_exc) { + PyErr_Format(PyExc_ValueError, + "Timezone minutes offset out of range " + "in datetime string \"%s\"", + str); + } + goto error; } + } else if (sublen >= 1 && isdigit(substr[0])) { + offset_minute = substr[0] - '0'; + ++substr; + --sublen; + } else { + goto parse_error; + } } - if ((sublen != 0) || (format_len != 0)) { - goto parse_error; + /* Apply the time zone offset */ + if (offset_neg) { + offset_hour = -offset_hour; + offset_minute = -offset_minute; + } + if (out_local != NULL) { + *out_local = 1; + // Unlike NumPy, do not change internal value to local time + *out_tzoffset = 60 * offset_hour + offset_minute; } + } -finish: - if (out_bestunit != NULL) { - *out_bestunit = bestunit; + /* Skip trailing whitespace */ + while (sublen > 0 && isspace(*substr)) { + ++substr; + --sublen; + comparison = + compare_format(&format, &format_len, " ", 1, format_requirement); + if (comparison == COMPARISON_ERROR) { + goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; } - return 0; + } + + if ((sublen != 0) || (format_len != 0)) { + goto parse_error; + } + +finish: + if (out_bestunit != NULL) { + *out_bestunit = bestunit; + } + return 0; parse_error: - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Error parsing datetime string \"%s\" at position %d", str, - (int)(substr - str)); - } - return -1; + if (want_exc) { + PyErr_Format(PyExc_ValueError, + "Error parsing datetime string \"%s\" at position %d", str, + (int)(substr - str)); + } + return -1; error: - return -1; + return -1; } /* @@ -750,56 +759,55 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, * objects with the given local and unit settings. */ int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) { - int len = 0; - - switch (base) { - /* Generic units can only be used to represent NaT */ - /* return 4;*/ - case NPY_FR_as: - len += 3; /* "###" */ - case NPY_FR_fs: - len += 3; /* "###" */ - case NPY_FR_ps: - len += 3; /* "###" */ - case NPY_FR_ns: - len += 3; /* "###" */ - case NPY_FR_us: - len += 3; /* "###" */ - case NPY_FR_ms: - len += 4; /* ".###" */ - case NPY_FR_s: - len += 3; /* ":##" */ - case NPY_FR_m: - len += 3; /* ":##" */ - case NPY_FR_h: - len += 3; /* "T##" */ - case NPY_FR_D: - case NPY_FR_W: - len += 3; /* "-##" */ - case NPY_FR_M: - len += 3; /* "-##" */ - case NPY_FR_Y: - len += 21; /* 64-bit year */ - break; - default: - len += 3; /* handle the now defunct NPY_FR_B */ - break; - } + int len = 0; + + switch (base) { + /* Generic units can only be used to represent NaT */ + /* return 4;*/ + case NPY_FR_as: + len += 3; /* "###" */ + case NPY_FR_fs: + len += 3; /* "###" */ + case NPY_FR_ps: + len += 3; /* "###" */ + case NPY_FR_ns: + len += 3; /* "###" */ + case NPY_FR_us: + len += 3; /* "###" */ + case NPY_FR_ms: + len += 4; /* ".###" */ + case NPY_FR_s: + len += 3; /* ":##" */ + case NPY_FR_m: + len += 3; /* ":##" */ + case NPY_FR_h: + len += 3; /* "T##" */ + case NPY_FR_D: + case NPY_FR_W: + len += 3; /* "-##" */ + case NPY_FR_M: + len += 3; /* "-##" */ + case NPY_FR_Y: + len += 21; /* 64-bit year */ + break; + default: + len += 3; /* handle the now defunct NPY_FR_B */ + break; + } - if (base >= NPY_FR_h) { - if (local) { - len += 5; /* "+####" or "-####" */ - } else { - len += 1; /* "Z" */ - } + if (base >= NPY_FR_h) { + if (local) { + len += 5; /* "+####" or "-####" */ + } else { + len += 1; /* "Z" */ } + } - len += 1; /* NULL terminator */ + len += 1; /* NULL terminator */ - return len; + return len; } - /* * Converts an npy_datetimestruct to an (almost) ISO 8601 * NULL-terminated string using timezone Z (UTC). If the string fits in @@ -816,19 +824,19 @@ int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) { */ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, int utc, NPY_DATETIMEUNIT base) { - char *substr = outstr; - int sublen = outlen; - int tmplen; - - /* - * Print weeks with the same precision as days. - * - * TODO: Could print weeks with YYYY-Www format if the week - * epoch is a Monday. - */ - if (base == NPY_FR_W) { - base = NPY_FR_D; - } + char *substr = outstr; + int sublen = outlen; + int tmplen; + + /* + * Print weeks with the same precision as days. + * + * TODO: Could print weeks with YYYY-Www format if the week + * epoch is a Monday. + */ + if (base == NPY_FR_W) { + base = NPY_FR_D; + } /* YEAR */ /* @@ -837,314 +845,309 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, * to have data all the way to the end of the buffer. */ #ifdef _WIN32 - tmplen = _snprintf(substr, sublen, "%04" NPY_INT64_FMT, dts->year); + tmplen = _snprintf(substr, sublen, "%04" NPY_INT64_FMT, dts->year); #else - tmplen = snprintf(substr, sublen, "%04" NPY_INT64_FMT, dts->year); -#endif // _WIN32 - /* If it ran out of space or there isn't space for the NULL terminator */ - if (tmplen < 0 || tmplen > sublen) { - goto string_too_short; - } - substr += tmplen; - sublen -= tmplen; + tmplen = snprintf(substr, sublen, "%04" NPY_INT64_FMT, dts->year); +#endif // _WIN32 + /* If it ran out of space or there isn't space for the NULL terminator */ + if (tmplen < 0 || tmplen > sublen) { + goto string_too_short; + } + substr += tmplen; + sublen -= tmplen; - /* Stop if the unit is years */ - if (base == NPY_FR_Y) { - if (sublen > 0) { - *substr = '\0'; - } - return 0; + /* Stop if the unit is years */ + if (base == NPY_FR_Y) { + if (sublen > 0) { + *substr = '\0'; } + return 0; + } - /* MONTH */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = '-'; - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->month / 10) + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)((dts->month % 10) + '0'); - substr += 3; - sublen -= 3; - - /* Stop if the unit is months */ - if (base == NPY_FR_M) { - if (sublen > 0) { - *substr = '\0'; - } - return 0; - } + /* MONTH */ + if (sublen < 1) { + goto string_too_short; + } + substr[0] = '-'; + if (sublen < 2) { + goto string_too_short; + } + substr[1] = (char)((dts->month / 10) + '0'); + if (sublen < 3) { + goto string_too_short; + } + substr[2] = (char)((dts->month % 10) + '0'); + substr += 3; + sublen -= 3; - /* DAY */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = '-'; - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->day / 10) + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)((dts->day % 10) + '0'); - substr += 3; - sublen -= 3; - - /* Stop if the unit is days */ - if (base == NPY_FR_D) { - if (sublen > 0) { - *substr = '\0'; - } - return 0; + /* Stop if the unit is months */ + if (base == NPY_FR_M) { + if (sublen > 0) { + *substr = '\0'; } + return 0; + } - /* HOUR */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = 'T'; - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->hour / 10) + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)((dts->hour % 10) + '0'); - substr += 3; - sublen -= 3; + /* DAY */ + if (sublen < 1) { + goto string_too_short; + } + substr[0] = '-'; + if (sublen < 2) { + goto string_too_short; + } + substr[1] = (char)((dts->day / 10) + '0'); + if (sublen < 3) { + goto string_too_short; + } + substr[2] = (char)((dts->day % 10) + '0'); + substr += 3; + sublen -= 3; - /* Stop if the unit is hours */ - if (base == NPY_FR_h) { - goto add_time_zone; + /* Stop if the unit is days */ + if (base == NPY_FR_D) { + if (sublen > 0) { + *substr = '\0'; } + return 0; + } - /* MINUTE */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = ':'; - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->min / 10) + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)((dts->min % 10) + '0'); - substr += 3; - sublen -= 3; + /* HOUR */ + if (sublen < 1) { + goto string_too_short; + } + substr[0] = 'T'; + if (sublen < 2) { + goto string_too_short; + } + substr[1] = (char)((dts->hour / 10) + '0'); + if (sublen < 3) { + goto string_too_short; + } + substr[2] = (char)((dts->hour % 10) + '0'); + substr += 3; + sublen -= 3; - /* Stop if the unit is minutes */ - if (base == NPY_FR_m) { - goto add_time_zone; - } + /* Stop if the unit is hours */ + if (base == NPY_FR_h) { + goto add_time_zone; + } - /* SECOND */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = ':'; - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->sec / 10) + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)((dts->sec % 10) + '0'); - substr += 3; - sublen -= 3; + /* MINUTE */ + if (sublen < 1) { + goto string_too_short; + } + substr[0] = ':'; + if (sublen < 2) { + goto string_too_short; + } + substr[1] = (char)((dts->min / 10) + '0'); + if (sublen < 3) { + goto string_too_short; + } + substr[2] = (char)((dts->min % 10) + '0'); + substr += 3; + sublen -= 3; - /* Stop if the unit is seconds */ - if (base == NPY_FR_s) { - goto add_time_zone; - } + /* Stop if the unit is minutes */ + if (base == NPY_FR_m) { + goto add_time_zone; + } - /* MILLISECOND */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = '.'; - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->us / 100000) % 10 + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)((dts->us / 10000) % 10 + '0'); - if (sublen < 4) { - goto string_too_short; - } - substr[3] = (char)((dts->us / 1000) % 10 + '0'); - substr += 4; - sublen -= 4; + /* SECOND */ + if (sublen < 1) { + goto string_too_short; + } + substr[0] = ':'; + if (sublen < 2) { + goto string_too_short; + } + substr[1] = (char)((dts->sec / 10) + '0'); + if (sublen < 3) { + goto string_too_short; + } + substr[2] = (char)((dts->sec % 10) + '0'); + substr += 3; + sublen -= 3; - /* Stop if the unit is milliseconds */ - if (base == NPY_FR_ms) { - goto add_time_zone; - } + /* Stop if the unit is seconds */ + if (base == NPY_FR_s) { + goto add_time_zone; + } - /* MICROSECOND */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = (char)((dts->us / 100) % 10 + '0'); - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->us / 10) % 10 + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)(dts->us % 10 + '0'); - substr += 3; - sublen -= 3; + /* MILLISECOND */ + if (sublen < 1) { + goto string_too_short; + } + substr[0] = '.'; + if (sublen < 2) { + goto string_too_short; + } + substr[1] = (char)((dts->us / 100000) % 10 + '0'); + if (sublen < 3) { + goto string_too_short; + } + substr[2] = (char)((dts->us / 10000) % 10 + '0'); + if (sublen < 4) { + goto string_too_short; + } + substr[3] = (char)((dts->us / 1000) % 10 + '0'); + substr += 4; + sublen -= 4; - /* Stop if the unit is microseconds */ - if (base == NPY_FR_us) { - goto add_time_zone; - } + /* Stop if the unit is milliseconds */ + if (base == NPY_FR_ms) { + goto add_time_zone; + } - /* NANOSECOND */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = (char)((dts->ps / 100000) % 10 + '0'); - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->ps / 10000) % 10 + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)((dts->ps / 1000) % 10 + '0'); - substr += 3; - sublen -= 3; + /* MICROSECOND */ + if (sublen < 1) { + goto string_too_short; + } + substr[0] = (char)((dts->us / 100) % 10 + '0'); + if (sublen < 2) { + goto string_too_short; + } + substr[1] = (char)((dts->us / 10) % 10 + '0'); + if (sublen < 3) { + goto string_too_short; + } + substr[2] = (char)(dts->us % 10 + '0'); + substr += 3; + sublen -= 3; - /* Stop if the unit is nanoseconds */ - if (base == NPY_FR_ns) { - goto add_time_zone; - } + /* Stop if the unit is microseconds */ + if (base == NPY_FR_us) { + goto add_time_zone; + } - /* PICOSECOND */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = (char)((dts->ps / 100) % 10 + '0'); - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->ps / 10) % 10 + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)(dts->ps % 10 + '0'); - substr += 3; - sublen -= 3; + /* NANOSECOND */ + if (sublen < 1) { + goto string_too_short; + } + substr[0] = (char)((dts->ps / 100000) % 10 + '0'); + if (sublen < 2) { + goto string_too_short; + } + substr[1] = (char)((dts->ps / 10000) % 10 + '0'); + if (sublen < 3) { + goto string_too_short; + } + substr[2] = (char)((dts->ps / 1000) % 10 + '0'); + substr += 3; + sublen -= 3; - /* Stop if the unit is picoseconds */ - if (base == NPY_FR_ps) { - goto add_time_zone; - } + /* Stop if the unit is nanoseconds */ + if (base == NPY_FR_ns) { + goto add_time_zone; + } - /* FEMTOSECOND */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = (char)((dts->as / 100000) % 10 + '0'); - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->as / 10000) % 10 + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)((dts->as / 1000) % 10 + '0'); - substr += 3; - sublen -= 3; + /* PICOSECOND */ + if (sublen < 1) { + goto string_too_short; + } + substr[0] = (char)((dts->ps / 100) % 10 + '0'); + if (sublen < 2) { + goto string_too_short; + } + substr[1] = (char)((dts->ps / 10) % 10 + '0'); + if (sublen < 3) { + goto string_too_short; + } + substr[2] = (char)(dts->ps % 10 + '0'); + substr += 3; + sublen -= 3; - /* Stop if the unit is femtoseconds */ - if (base == NPY_FR_fs) { - goto add_time_zone; - } + /* Stop if the unit is picoseconds */ + if (base == NPY_FR_ps) { + goto add_time_zone; + } - /* ATTOSECOND */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = (char)((dts->as / 100) % 10 + '0'); - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->as / 10) % 10 + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)(dts->as % 10 + '0'); - substr += 3; - sublen -= 3; + /* FEMTOSECOND */ + if (sublen < 1) { + goto string_too_short; + } + substr[0] = (char)((dts->as / 100000) % 10 + '0'); + if (sublen < 2) { + goto string_too_short; + } + substr[1] = (char)((dts->as / 10000) % 10 + '0'); + if (sublen < 3) { + goto string_too_short; + } + substr[2] = (char)((dts->as / 1000) % 10 + '0'); + substr += 3; + sublen -= 3; + + /* Stop if the unit is femtoseconds */ + if (base == NPY_FR_fs) { + goto add_time_zone; + } + + /* ATTOSECOND */ + if (sublen < 1) { + goto string_too_short; + } + substr[0] = (char)((dts->as / 100) % 10 + '0'); + if (sublen < 2) { + goto string_too_short; + } + substr[1] = (char)((dts->as / 10) % 10 + '0'); + if (sublen < 3) { + goto string_too_short; + } + substr[2] = (char)(dts->as % 10 + '0'); + substr += 3; + sublen -= 3; add_time_zone: - /* UTC "Zulu" time */ - if (utc) { - if (sublen < 1) { - goto string_too_short; - } - substr[0] = 'Z'; - substr += 1; - sublen -= 1; - } - /* Add a NULL terminator, and return */ - if (sublen > 0) { - substr[0] = '\0'; + /* UTC "Zulu" time */ + if (utc) { + if (sublen < 1) { + goto string_too_short; } + substr[0] = 'Z'; + substr += 1; + sublen -= 1; + } + /* Add a NULL terminator, and return */ + if (sublen > 0) { + substr[0] = '\0'; + } - return 0; + return 0; string_too_short: - PyErr_Format(PyExc_RuntimeError, - "The string provided for NumPy ISO datetime formatting " - "was too short, with length %d", - outlen); - return -1; + PyErr_Format(PyExc_RuntimeError, + "The string provided for NumPy ISO datetime formatting " + "was too short, with length %d", + outlen); + return -1; } - -int make_iso_8601_timedelta(pandas_timedeltastruct *tds, - char *outstr, size_t *outlen) { +int make_iso_8601_timedelta(pandas_timedeltastruct *tds, char *outstr, + size_t *outlen) { *outlen = 0; - *outlen += snprintf(outstr, 60, // NOLINT - "P%" NPY_INT64_FMT - "DT%" NPY_INT32_FMT - "H%" NPY_INT32_FMT - "M%" NPY_INT32_FMT, - tds->days, tds->hrs, tds->min, tds->sec); + *outlen += snprintf(outstr, 60, // NOLINT + "P%" NPY_INT64_FMT "DT%" NPY_INT32_FMT "H%" NPY_INT32_FMT + "M%" NPY_INT32_FMT, + tds->days, tds->hrs, tds->min, tds->sec); outstr += *outlen; if (tds->ns != 0) { - *outlen += snprintf(outstr, 12, // NOLINT - ".%03" NPY_INT32_FMT - "%03" NPY_INT32_FMT - "%03" NPY_INT32_FMT - "S", tds->ms, tds->us, tds->ns); + *outlen += snprintf(outstr, 12, // NOLINT + ".%03" NPY_INT32_FMT "%03" NPY_INT32_FMT + "%03" NPY_INT32_FMT "S", + tds->ms, tds->us, tds->ns); } else if (tds->us != 0) { - *outlen += snprintf(outstr, 9, // NOLINT - ".%03" NPY_INT32_FMT - "%03" NPY_INT32_FMT - "S", tds->ms, tds->us); + *outlen += snprintf(outstr, 9, // NOLINT + ".%03" NPY_INT32_FMT "%03" NPY_INT32_FMT "S", tds->ms, + tds->us); } else if (tds->ms != 0) { - *outlen += snprintf(outstr, 6, // NOLINT + *outlen += snprintf(outstr, 6, // NOLINT ".%03" NPY_INT32_FMT "S", tds->ms); } else { - *outlen += snprintf(outstr, 2, // NOLINT + *outlen += snprintf(outstr, 2, // NOLINT "%s", "S"); } diff --git a/pandas/_libs/src/vendored/ujson/lib/ultrajsondec.c b/pandas/_libs/src/vendored/ujson/lib/ultrajsondec.c index 70794786016fb..a858abf46a598 100644 --- a/pandas/_libs/src/vendored/ujson/lib/ultrajsondec.c +++ b/pandas/_libs/src/vendored/ujson/lib/ultrajsondec.c @@ -40,6 +40,7 @@ Numeric decoder derived from TCL library // Licence at LICENSES/ULTRAJSON_LICENSE +#include "pandas/vendored/ujson/lib/ultrajson.h" #include #include #include @@ -48,7 +49,6 @@ Numeric decoder derived from TCL library #include #include #include -#include "pandas/vendored/ujson/lib/ultrajson.h" #ifndef TRUE #define TRUE 1 @@ -59,15 +59,15 @@ Numeric decoder derived from TCL library #endif struct DecoderState { - char *start; - char *end; - wchar_t *escStart; - wchar_t *escEnd; - int escHeap; - int lastType; - JSUINT32 objDepth; - void *prv; - JSONObjectDecoder *dec; + char *start; + char *end; + wchar_t *escStart; + wchar_t *escEnd; + int escHeap; + int lastType; + JSUINT32 objDepth; + void *prv; + JSONObjectDecoder *dec; }; JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds); @@ -75,349 +75,372 @@ typedef JSOBJ (*PFN_DECODER)(struct DecoderState *ds); static JSOBJ SetError(struct DecoderState *ds, int offset, const char *message) { - ds->dec->errorOffset = ds->start + offset; - ds->dec->errorStr = (char *)message; - return NULL; + ds->dec->errorOffset = ds->start + offset; + ds->dec->errorStr = (char *)message; + return NULL; } double createDouble(double intNeg, double intValue, double frcValue, int frcDecimalCount) { - static const double g_pow10[] = {1.0, - 0.1, - 0.01, - 0.001, - 0.0001, - 0.00001, - 0.000001, - 0.0000001, - 0.00000001, - 0.000000001, - 0.0000000001, - 0.00000000001, - 0.000000000001, - 0.0000000000001, - 0.00000000000001, - 0.000000000000001}; - return (intValue + (frcValue * g_pow10[frcDecimalCount])) * intNeg; + static const double g_pow10[] = {1.0, + 0.1, + 0.01, + 0.001, + 0.0001, + 0.00001, + 0.000001, + 0.0000001, + 0.00000001, + 0.000000001, + 0.0000000001, + 0.00000000001, + 0.000000000001, + 0.0000000000001, + 0.00000000000001, + 0.000000000000001}; + return (intValue + (frcValue * g_pow10[frcDecimalCount])) * intNeg; } JSOBJ FASTCALL_MSVC decodePreciseFloat(struct DecoderState *ds) { - char *end; - double value; - errno = 0; + char *end; + double value; + errno = 0; - value = strtod(ds->start, &end); + value = strtod(ds->start, &end); - if (errno == ERANGE) { - return SetError(ds, -1, "Range error when decoding numeric as double"); - } + if (errno == ERANGE) { + return SetError(ds, -1, "Range error when decoding numeric as double"); + } - ds->start = end; - return ds->dec->newDouble(ds->prv, value); + ds->start = end; + return ds->dec->newDouble(ds->prv, value); } JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { - int intNeg = 1; - JSUINT64 intValue; - JSUINT64 prevIntValue; - int chr; - int decimalCount = 0; - double frcValue = 0.0; - double expNeg; - double expValue; - char *offset = ds->start; - - JSUINT64 overflowLimit = LLONG_MAX; - + int intNeg = 1; + JSUINT64 intValue; + JSUINT64 prevIntValue; + int chr; + int decimalCount = 0; + double frcValue = 0.0; + double expNeg; + double expValue; + char *offset = ds->start; + + JSUINT64 overflowLimit = LLONG_MAX; + + if (*(offset) == 'I') { + goto DECODE_INF; + } else if (*(offset) == 'N') { + goto DECODE_NAN; + } else if (*(offset) == '-') { + offset++; + intNeg = -1; + overflowLimit = LLONG_MIN; if (*(offset) == 'I') { goto DECODE_INF; - } else if (*(offset) == 'N') { - goto DECODE_NAN; - } else if (*(offset) == '-') { - offset++; - intNeg = -1; - overflowLimit = LLONG_MIN; - if (*(offset) == 'I') { - goto DECODE_INF; - } + } + } + + // Scan integer part + intValue = 0; + + while (1) { + chr = (int)(unsigned char)*(offset); + + switch (chr) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + // PERF: Don't do 64-bit arithmetic here unless we have to + prevIntValue = intValue; + intValue = intValue * 10ULL + (JSLONG)(chr - 48); + + if (intNeg == 1 && prevIntValue > intValue) { + return SetError(ds, -1, "Value is too big!"); + } else if (intNeg == -1 && intValue > overflowLimit) { + return SetError(ds, -1, + overflowLimit == LLONG_MAX ? "Value is too big!" + : "Value is too small"); + } + + offset++; + break; + } + case '.': { + offset++; + goto DECODE_FRACTION; + break; + } + case 'e': + case 'E': { + offset++; + goto DECODE_EXPONENT; + break; } - // Scan integer part - intValue = 0; - - while (1) { - chr = (int)(unsigned char)*(offset); - - switch (chr) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': { - // PERF: Don't do 64-bit arithmetic here unless we have to - prevIntValue = intValue; - intValue = intValue * 10ULL + (JSLONG) (chr - 48); - - if (intNeg == 1 && prevIntValue > intValue) { - return SetError(ds, -1, "Value is too big!"); - } else if (intNeg == -1 && intValue > overflowLimit) { - return SetError(ds, -1, overflowLimit == LLONG_MAX ? - "Value is too big!" : "Value is too small"); - } - - offset++; - break; - } - case '.': { - offset++; - goto DECODE_FRACTION; - break; - } - case 'e': - case 'E': { - offset++; - goto DECODE_EXPONENT; - break; - } - - default: { - goto BREAK_INT_LOOP; - break; - } - } + default: { + goto BREAK_INT_LOOP; + break; + } } + } BREAK_INT_LOOP: - ds->lastType = JT_INT; - ds->start = offset; + ds->lastType = JT_INT; + ds->start = offset; - if (intNeg == 1 && (intValue & 0x8000000000000000ULL) != 0) - return ds->dec->newUnsignedLong(ds->prv, intValue); - else if ((intValue >> 31)) - return ds->dec->newLong(ds->prv, (JSINT64)(intValue * (JSINT64)intNeg)); - else - return ds->dec->newInt(ds->prv, (JSINT32)(intValue * intNeg)); + if (intNeg == 1 && (intValue & 0x8000000000000000ULL) != 0) + return ds->dec->newUnsignedLong(ds->prv, intValue); + else if ((intValue >> 31)) + return ds->dec->newLong(ds->prv, (JSINT64)(intValue * (JSINT64)intNeg)); + else + return ds->dec->newInt(ds->prv, (JSINT32)(intValue * intNeg)); DECODE_FRACTION: - if (ds->dec->preciseFloat) { - return decodePreciseFloat(ds); + if (ds->dec->preciseFloat) { + return decodePreciseFloat(ds); + } + + // Scan fraction part + frcValue = 0.0; + for (;;) { + chr = (int)(unsigned char)*(offset); + + switch (chr) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + if (decimalCount < JSON_DOUBLE_MAX_DECIMALS) { + frcValue = frcValue * 10.0 + (double)(chr - 48); + decimalCount++; + } + offset++; + break; } - - // Scan fraction part - frcValue = 0.0; - for (;;) { - chr = (int)(unsigned char)*(offset); - - switch (chr) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': { - if (decimalCount < JSON_DOUBLE_MAX_DECIMALS) { - frcValue = frcValue * 10.0 + (double)(chr - 48); - decimalCount++; - } - offset++; - break; - } - case 'e': - case 'E': { - offset++; - goto DECODE_EXPONENT; - break; - } - default: { goto BREAK_FRC_LOOP; } - } + case 'e': + case 'E': { + offset++; + goto DECODE_EXPONENT; + break; + } + default: { + goto BREAK_FRC_LOOP; } + } + } BREAK_FRC_LOOP: - // FIXME: Check for arithmetic overflow here - ds->lastType = JT_DOUBLE; - ds->start = offset; - return ds->dec->newDouble( - ds->prv, - createDouble((double)intNeg, (double)intValue, frcValue, decimalCount)); + // FIXME: Check for arithmetic overflow here + ds->lastType = JT_DOUBLE; + ds->start = offset; + return ds->dec->newDouble( + ds->prv, + createDouble((double)intNeg, (double)intValue, frcValue, decimalCount)); DECODE_EXPONENT: - if (ds->dec->preciseFloat) { - return decodePreciseFloat(ds); - } + if (ds->dec->preciseFloat) { + return decodePreciseFloat(ds); + } - expNeg = 1.0; + expNeg = 1.0; - if (*(offset) == '-') { - expNeg = -1.0; - offset++; - } else if (*(offset) == '+') { - expNeg = +1.0; - offset++; + if (*(offset) == '-') { + expNeg = -1.0; + offset++; + } else if (*(offset) == '+') { + expNeg = +1.0; + offset++; + } + + expValue = 0.0; + + for (;;) { + chr = (int)(unsigned char)*(offset); + + switch (chr) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + expValue = expValue * 10.0 + (double)(chr - 48); + offset++; + break; + } + default: { + goto BREAK_EXP_LOOP; } - - expValue = 0.0; - - for (;;) { - chr = (int)(unsigned char)*(offset); - - switch (chr) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': { - expValue = expValue * 10.0 + (double)(chr - 48); - offset++; - break; - } - default: { goto BREAK_EXP_LOOP; } - } } + } DECODE_NAN: - offset++; - if (*(offset++) != 'a') goto SET_NAN_ERROR; - if (*(offset++) != 'N') goto SET_NAN_ERROR; + offset++; + if (*(offset++) != 'a') + goto SET_NAN_ERROR; + if (*(offset++) != 'N') + goto SET_NAN_ERROR; - ds->lastType = JT_NULL; - ds->start = offset; - return ds->dec->newNull(ds->prv); + ds->lastType = JT_NULL; + ds->start = offset; + return ds->dec->newNull(ds->prv); SET_NAN_ERROR: - return SetError(ds, -1, "Unexpected character found when decoding 'NaN'"); + return SetError(ds, -1, "Unexpected character found when decoding 'NaN'"); DECODE_INF: - offset++; - if (*(offset++) != 'n') goto SET_INF_ERROR; - if (*(offset++) != 'f') goto SET_INF_ERROR; - if (*(offset++) != 'i') goto SET_INF_ERROR; - if (*(offset++) != 'n') goto SET_INF_ERROR; - if (*(offset++) != 'i') goto SET_INF_ERROR; - if (*(offset++) != 't') goto SET_INF_ERROR; - if (*(offset++) != 'y') goto SET_INF_ERROR; - - ds->start = offset; - - if (intNeg == 1) { - ds->lastType = JT_POS_INF; - return ds->dec->newPosInf(ds->prv); - } else { - ds->lastType = JT_NEG_INF; - return ds->dec->newNegInf(ds->prv); - } + offset++; + if (*(offset++) != 'n') + goto SET_INF_ERROR; + if (*(offset++) != 'f') + goto SET_INF_ERROR; + if (*(offset++) != 'i') + goto SET_INF_ERROR; + if (*(offset++) != 'n') + goto SET_INF_ERROR; + if (*(offset++) != 'i') + goto SET_INF_ERROR; + if (*(offset++) != 't') + goto SET_INF_ERROR; + if (*(offset++) != 'y') + goto SET_INF_ERROR; + + ds->start = offset; + + if (intNeg == 1) { + ds->lastType = JT_POS_INF; + return ds->dec->newPosInf(ds->prv); + } else { + ds->lastType = JT_NEG_INF; + return ds->dec->newNegInf(ds->prv); + } SET_INF_ERROR: - if (intNeg == 1) { - const char *msg = "Unexpected character found when decoding 'Infinity'"; - return SetError(ds, -1, msg); - } else { - const char *msg = "Unexpected character found when decoding '-Infinity'"; - return SetError(ds, -1, msg); - } - + if (intNeg == 1) { + const char *msg = "Unexpected character found when decoding 'Infinity'"; + return SetError(ds, -1, msg); + } else { + const char *msg = "Unexpected character found when decoding '-Infinity'"; + return SetError(ds, -1, msg); + } BREAK_EXP_LOOP: - // FIXME: Check for arithmetic overflow here - ds->lastType = JT_DOUBLE; - ds->start = offset; - return ds->dec->newDouble( - ds->prv, - createDouble((double)intNeg, (double)intValue, frcValue, decimalCount) * - pow(10.0, expValue * expNeg)); + // FIXME: Check for arithmetic overflow here + ds->lastType = JT_DOUBLE; + ds->start = offset; + return ds->dec->newDouble( + ds->prv, + createDouble((double)intNeg, (double)intValue, frcValue, decimalCount) * + pow(10.0, expValue * expNeg)); } JSOBJ FASTCALL_MSVC decode_true(struct DecoderState *ds) { - char *offset = ds->start; - offset++; + char *offset = ds->start; + offset++; - if (*(offset++) != 'r') goto SETERROR; - if (*(offset++) != 'u') goto SETERROR; - if (*(offset++) != 'e') goto SETERROR; + if (*(offset++) != 'r') + goto SETERROR; + if (*(offset++) != 'u') + goto SETERROR; + if (*(offset++) != 'e') + goto SETERROR; - ds->lastType = JT_TRUE; - ds->start = offset; - return ds->dec->newTrue(ds->prv); + ds->lastType = JT_TRUE; + ds->start = offset; + return ds->dec->newTrue(ds->prv); SETERROR: - return SetError(ds, -1, "Unexpected character found when decoding 'true'"); + return SetError(ds, -1, "Unexpected character found when decoding 'true'"); } JSOBJ FASTCALL_MSVC decode_false(struct DecoderState *ds) { - char *offset = ds->start; - offset++; - - if (*(offset++) != 'a') goto SETERROR; - if (*(offset++) != 'l') goto SETERROR; - if (*(offset++) != 's') goto SETERROR; - if (*(offset++) != 'e') goto SETERROR; - - ds->lastType = JT_FALSE; - ds->start = offset; - return ds->dec->newFalse(ds->prv); + char *offset = ds->start; + offset++; + + if (*(offset++) != 'a') + goto SETERROR; + if (*(offset++) != 'l') + goto SETERROR; + if (*(offset++) != 's') + goto SETERROR; + if (*(offset++) != 'e') + goto SETERROR; + + ds->lastType = JT_FALSE; + ds->start = offset; + return ds->dec->newFalse(ds->prv); SETERROR: - return SetError(ds, -1, "Unexpected character found when decoding 'false'"); + return SetError(ds, -1, "Unexpected character found when decoding 'false'"); } JSOBJ FASTCALL_MSVC decode_null(struct DecoderState *ds) { - char *offset = ds->start; - offset++; + char *offset = ds->start; + offset++; - if (*(offset++) != 'u') goto SETERROR; - if (*(offset++) != 'l') goto SETERROR; - if (*(offset++) != 'l') goto SETERROR; + if (*(offset++) != 'u') + goto SETERROR; + if (*(offset++) != 'l') + goto SETERROR; + if (*(offset++) != 'l') + goto SETERROR; - ds->lastType = JT_NULL; - ds->start = offset; - return ds->dec->newNull(ds->prv); + ds->lastType = JT_NULL; + ds->start = offset; + return ds->dec->newNull(ds->prv); SETERROR: - return SetError(ds, -1, "Unexpected character found when decoding 'null'"); + return SetError(ds, -1, "Unexpected character found when decoding 'null'"); } void FASTCALL_MSVC SkipWhitespace(struct DecoderState *ds) { - char *offset; - - for (offset = ds->start; (ds->end - offset) > 0; offset++) { - switch (*offset) { - case ' ': - case '\t': - case '\r': - case '\n': - break; - - default: - ds->start = offset; - return; - } + char *offset; + + for (offset = ds->start; (ds->end - offset) > 0; offset++) { + switch (*offset) { + case ' ': + case '\t': + case '\r': + case '\n': + break; + + default: + ds->start = offset; + return; } + } - if (offset == ds->end) { - ds->start = ds->end; - } + if (offset == ds->end) { + ds->start = ds->end; + } } enum DECODESTRINGSTATE { - DS_ISNULL = 0x32, - DS_ISQUOTE, - DS_ISESCAPE, - DS_UTFLENERROR, + DS_ISNULL = 0x32, + DS_ISQUOTE, + DS_ISESCAPE, + DS_UTFLENERROR, }; static const JSUINT8 g_decoderLookup[256] = { @@ -680,531 +703,520 @@ static const JSUINT8 g_decoderLookup[256] = { }; JSOBJ FASTCALL_MSVC decode_string(struct DecoderState *ds) { - JSUTF16 sur[2] = {0}; - int iSur = 0; - int index; - wchar_t *escOffset; - wchar_t *escStart; - size_t escLen = (ds->escEnd - ds->escStart); - JSUINT8 *inputOffset; - JSUINT8 oct; - JSUTF32 ucs; - ds->lastType = JT_INVALID; - ds->start++; - - if ((size_t)(ds->end - ds->start) > escLen) { - size_t newSize = (ds->end - ds->start); - - if (ds->escHeap) { - if (newSize > (SIZE_MAX / sizeof(wchar_t))) { - return SetError(ds, -1, "Could not reserve memory block"); - } - escStart = (wchar_t *)ds->dec->realloc(ds->escStart, - newSize * sizeof(wchar_t)); - if (!escStart) { - ds->dec->free(ds->escStart); - return SetError(ds, -1, "Could not reserve memory block"); - } - ds->escStart = escStart; - } else { - wchar_t *oldStart = ds->escStart; - if (newSize > (SIZE_MAX / sizeof(wchar_t))) { - return SetError(ds, -1, "Could not reserve memory block"); - } - ds->escStart = - (wchar_t *)ds->dec->malloc(newSize * sizeof(wchar_t)); - if (!ds->escStart) { - return SetError(ds, -1, "Could not reserve memory block"); - } - ds->escHeap = 1; - memcpy(ds->escStart, oldStart, escLen * sizeof(wchar_t)); - } - - ds->escEnd = ds->escStart + newSize; + JSUTF16 sur[2] = {0}; + int iSur = 0; + int index; + wchar_t *escOffset; + wchar_t *escStart; + size_t escLen = (ds->escEnd - ds->escStart); + JSUINT8 *inputOffset; + JSUINT8 oct; + JSUTF32 ucs; + ds->lastType = JT_INVALID; + ds->start++; + + if ((size_t)(ds->end - ds->start) > escLen) { + size_t newSize = (ds->end - ds->start); + + if (ds->escHeap) { + if (newSize > (SIZE_MAX / sizeof(wchar_t))) { + return SetError(ds, -1, "Could not reserve memory block"); + } + escStart = + (wchar_t *)ds->dec->realloc(ds->escStart, newSize * sizeof(wchar_t)); + if (!escStart) { + ds->dec->free(ds->escStart); + return SetError(ds, -1, "Could not reserve memory block"); + } + ds->escStart = escStart; + } else { + wchar_t *oldStart = ds->escStart; + if (newSize > (SIZE_MAX / sizeof(wchar_t))) { + return SetError(ds, -1, "Could not reserve memory block"); + } + ds->escStart = (wchar_t *)ds->dec->malloc(newSize * sizeof(wchar_t)); + if (!ds->escStart) { + return SetError(ds, -1, "Could not reserve memory block"); + } + ds->escHeap = 1; + memcpy(ds->escStart, oldStart, escLen * sizeof(wchar_t)); } - escOffset = ds->escStart; - inputOffset = (JSUINT8 *)ds->start; - - for (;;) { - switch (g_decoderLookup[(JSUINT8)(*inputOffset)]) { - case DS_ISNULL: { - return SetError(ds, -1, - "Unmatched ''\"' when when decoding 'string'"); - } - case DS_ISQUOTE: { - ds->lastType = JT_UTF8; - inputOffset++; - ds->start += ((char *)inputOffset - (ds->start)); - return ds->dec->newString(ds->prv, ds->escStart, escOffset); - } - case DS_UTFLENERROR: { - return SetError( - ds, -1, - "Invalid UTF-8 sequence length when decoding 'string'"); - } - case DS_ISESCAPE: - inputOffset++; - switch (*inputOffset) { - case '\\': - *(escOffset++) = L'\\'; - inputOffset++; - continue; - case '\"': - *(escOffset++) = L'\"'; - inputOffset++; - continue; - case '/': - *(escOffset++) = L'/'; - inputOffset++; - continue; - case 'b': - *(escOffset++) = L'\b'; - inputOffset++; - continue; - case 'f': - *(escOffset++) = L'\f'; - inputOffset++; - continue; - case 'n': - *(escOffset++) = L'\n'; - inputOffset++; - continue; - case 'r': - *(escOffset++) = L'\r'; - inputOffset++; - continue; - case 't': - *(escOffset++) = L'\t'; - inputOffset++; - continue; - - case 'u': { - int index; - inputOffset++; - - for (index = 0; index < 4; index++) { - switch (*inputOffset) { - case '\0': - return SetError(ds, -1, - "Unterminated unicode " - "escape sequence when " - "decoding 'string'"); - default: - return SetError(ds, -1, - "Unexpected character in " - "unicode escape sequence " - "when decoding 'string'"); - - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - sur[iSur] = (sur[iSur] << 4) + - (JSUTF16)(*inputOffset - '0'); - break; - - case 'a': - case 'b': - case 'c': - case 'd': - case 'e': - case 'f': - sur[iSur] = (sur[iSur] << 4) + 10 + - (JSUTF16)(*inputOffset - 'a'); - break; - - case 'A': - case 'B': - case 'C': - case 'D': - case 'E': - case 'F': - sur[iSur] = (sur[iSur] << 4) + 10 + - (JSUTF16)(*inputOffset - 'A'); - break; - } - - inputOffset++; - } - - if (iSur == 0) { - if ((sur[iSur] & 0xfc00) == 0xd800) { - // First of a surrogate pair, continue parsing - iSur++; - break; - } - (*escOffset++) = (wchar_t)sur[iSur]; - iSur = 0; - } else { - // Decode pair - if ((sur[1] & 0xfc00) != 0xdc00) { - return SetError(ds, -1, - "Unpaired high surrogate when " - "decoding 'string'"); - } -#if WCHAR_MAX == 0xffff - (*escOffset++) = (wchar_t)sur[0]; - (*escOffset++) = (wchar_t)sur[1]; -#else - (*escOffset++) = - (wchar_t)0x10000 + - (((sur[0] - 0xd800) << 10) | (sur[1] - 0xdc00)); -#endif - iSur = 0; - } - break; - } - - case '\0': - return SetError(ds, -1, - "Unterminated escape sequence when " - "decoding 'string'"); - default: - return SetError(ds, -1, - "Unrecognized escape sequence when " - "decoding 'string'"); - } - break; - - case 1: { - *(escOffset++) = (wchar_t)(*inputOffset++); - break; - } - - case 2: { - ucs = (*inputOffset++) & 0x1f; - ucs <<= 6; - if (((*inputOffset) & 0x80) != 0x80) { - return SetError(ds, -1, - "Invalid octet in UTF-8 sequence when " - "decoding 'string'"); - } - ucs |= (*inputOffset++) & 0x3f; - if (ucs < 0x80) - return SetError(ds, -1, - "Overlong 2 byte UTF-8 sequence detected " - "when decoding 'string'"); - *(escOffset++) = (wchar_t)ucs; - break; - } - - case 3: { - JSUTF32 ucs = 0; - ucs |= (*inputOffset++) & 0x0f; - - for (index = 0; index < 2; index++) { - ucs <<= 6; - oct = (*inputOffset++); - - if ((oct & 0x80) != 0x80) { - return SetError(ds, -1, - "Invalid octet in UTF-8 sequence when " - "decoding 'string'"); - } - - ucs |= oct & 0x3f; - } - - if (ucs < 0x800) - return SetError(ds, -1, - "Overlong 3 byte UTF-8 sequence detected " - "when encoding string"); - *(escOffset++) = (wchar_t)ucs; - break; - } - - case 4: { - JSUTF32 ucs = 0; - ucs |= (*inputOffset++) & 0x07; - - for (index = 0; index < 3; index++) { - ucs <<= 6; - oct = (*inputOffset++); - - if ((oct & 0x80) != 0x80) { - return SetError(ds, -1, - "Invalid octet in UTF-8 sequence when " - "decoding 'string'"); - } - - ucs |= oct & 0x3f; - } - - if (ucs < 0x10000) - return SetError(ds, -1, - "Overlong 4 byte UTF-8 sequence detected " - "when decoding 'string'"); + ds->escEnd = ds->escStart + newSize; + } + + escOffset = ds->escStart; + inputOffset = (JSUINT8 *)ds->start; + for (;;) { + switch (g_decoderLookup[(JSUINT8)(*inputOffset)]) { + case DS_ISNULL: { + return SetError(ds, -1, "Unmatched ''\"' when when decoding 'string'"); + } + case DS_ISQUOTE: { + ds->lastType = JT_UTF8; + inputOffset++; + ds->start += ((char *)inputOffset - (ds->start)); + return ds->dec->newString(ds->prv, ds->escStart, escOffset); + } + case DS_UTFLENERROR: { + return SetError(ds, -1, + "Invalid UTF-8 sequence length when decoding 'string'"); + } + case DS_ISESCAPE: + inputOffset++; + switch (*inputOffset) { + case '\\': + *(escOffset++) = L'\\'; + inputOffset++; + continue; + case '\"': + *(escOffset++) = L'\"'; + inputOffset++; + continue; + case '/': + *(escOffset++) = L'/'; + inputOffset++; + continue; + case 'b': + *(escOffset++) = L'\b'; + inputOffset++; + continue; + case 'f': + *(escOffset++) = L'\f'; + inputOffset++; + continue; + case 'n': + *(escOffset++) = L'\n'; + inputOffset++; + continue; + case 'r': + *(escOffset++) = L'\r'; + inputOffset++; + continue; + case 't': + *(escOffset++) = L'\t'; + inputOffset++; + continue; + + case 'u': { + int index; + inputOffset++; + + for (index = 0; index < 4; index++) { + switch (*inputOffset) { + case '\0': + return SetError(ds, -1, + "Unterminated unicode " + "escape sequence when " + "decoding 'string'"); + default: + return SetError(ds, -1, + "Unexpected character in " + "unicode escape sequence " + "when decoding 'string'"); + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + sur[iSur] = (sur[iSur] << 4) + (JSUTF16)(*inputOffset - '0'); + break; + + case 'a': + case 'b': + case 'c': + case 'd': + case 'e': + case 'f': + sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16)(*inputOffset - 'a'); + break; + + case 'A': + case 'B': + case 'C': + case 'D': + case 'E': + case 'F': + sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16)(*inputOffset - 'A'); + break; + } + + inputOffset++; + } + + if (iSur == 0) { + if ((sur[iSur] & 0xfc00) == 0xd800) { + // First of a surrogate pair, continue parsing + iSur++; + break; + } + (*escOffset++) = (wchar_t)sur[iSur]; + iSur = 0; + } else { + // Decode pair + if ((sur[1] & 0xfc00) != 0xdc00) { + return SetError(ds, -1, + "Unpaired high surrogate when " + "decoding 'string'"); + } #if WCHAR_MAX == 0xffff - if (ucs >= 0x10000) { - ucs -= 0x10000; - *(escOffset++) = (wchar_t)(ucs >> 10) + 0xd800; - *(escOffset++) = (wchar_t)(ucs & 0x3ff) + 0xdc00; - } else { - *(escOffset++) = (wchar_t)ucs; - } + (*escOffset++) = (wchar_t)sur[0]; + (*escOffset++) = (wchar_t)sur[1]; #else - *(escOffset++) = (wchar_t)ucs; + (*escOffset++) = (wchar_t)0x10000 + + (((sur[0] - 0xd800) << 10) | (sur[1] - 0xdc00)); #endif - break; - } + iSur = 0; } + break; + } + + case '\0': + return SetError(ds, -1, + "Unterminated escape sequence when " + "decoding 'string'"); + default: + return SetError(ds, -1, + "Unrecognized escape sequence when " + "decoding 'string'"); + } + break; + + case 1: { + *(escOffset++) = (wchar_t)(*inputOffset++); + break; } -} -JSOBJ FASTCALL_MSVC decode_array(struct DecoderState *ds) { - JSOBJ itemValue; - JSOBJ newObj; - int len; - ds->objDepth++; - if (ds->objDepth > JSON_MAX_OBJECT_DEPTH) { - return SetError(ds, -1, "Reached object decoding depth limit"); + case 2: { + ucs = (*inputOffset++) & 0x1f; + ucs <<= 6; + if (((*inputOffset) & 0x80) != 0x80) { + return SetError(ds, -1, + "Invalid octet in UTF-8 sequence when " + "decoding 'string'"); + } + ucs |= (*inputOffset++) & 0x3f; + if (ucs < 0x80) + return SetError(ds, -1, + "Overlong 2 byte UTF-8 sequence detected " + "when decoding 'string'"); + *(escOffset++) = (wchar_t)ucs; + break; } - newObj = ds->dec->newArray(ds->prv, ds->dec); - len = 0; + case 3: { + JSUTF32 ucs = 0; + ucs |= (*inputOffset++) & 0x0f; - ds->lastType = JT_INVALID; - ds->start++; - - for (;;) { - SkipWhitespace(ds); - - if ((*ds->start) == ']') { - ds->objDepth--; - if (len == 0) { - ds->start++; - return ds->dec->endArray(ds->prv, newObj); - } - - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - return SetError( - ds, -1, - "Unexpected character found when decoding array value (1)"); + for (index = 0; index < 2; index++) { + ucs <<= 6; + oct = (*inputOffset++); + + if ((oct & 0x80) != 0x80) { + return SetError(ds, -1, + "Invalid octet in UTF-8 sequence when " + "decoding 'string'"); } - itemValue = decode_any(ds); + ucs |= oct & 0x3f; + } - if (itemValue == NULL) { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - return NULL; - } + if (ucs < 0x800) + return SetError(ds, -1, + "Overlong 3 byte UTF-8 sequence detected " + "when encoding string"); + *(escOffset++) = (wchar_t)ucs; + break; + } - if (!ds->dec->arrayAddItem(ds->prv, newObj, itemValue)) { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - return NULL; - } + case 4: { + JSUTF32 ucs = 0; + ucs |= (*inputOffset++) & 0x07; - SkipWhitespace(ds); - - switch (*(ds->start++)) { - case ']': { - ds->objDepth--; - return ds->dec->endArray(ds->prv, newObj); - } - case ',': - break; - - default: - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - return SetError( - ds, -1, - "Unexpected character found when decoding array value (2)"); + for (index = 0; index < 3; index++) { + ucs <<= 6; + oct = (*inputOffset++); + + if ((oct & 0x80) != 0x80) { + return SetError(ds, -1, + "Invalid octet in UTF-8 sequence when " + "decoding 'string'"); } - len++; + ucs |= oct & 0x3f; + } + + if (ucs < 0x10000) + return SetError(ds, -1, + "Overlong 4 byte UTF-8 sequence detected " + "when decoding 'string'"); + +#if WCHAR_MAX == 0xffff + if (ucs >= 0x10000) { + ucs -= 0x10000; + *(escOffset++) = (wchar_t)(ucs >> 10) + 0xd800; + *(escOffset++) = (wchar_t)(ucs & 0x3ff) + 0xdc00; + } else { + *(escOffset++) = (wchar_t)ucs; + } +#else + *(escOffset++) = (wchar_t)ucs; +#endif + break; } + } + } } -JSOBJ FASTCALL_MSVC decode_object(struct DecoderState *ds) { - JSOBJ itemName; - JSOBJ itemValue; - JSOBJ newObj; +JSOBJ FASTCALL_MSVC decode_array(struct DecoderState *ds) { + JSOBJ itemValue; + JSOBJ newObj; + int len; + ds->objDepth++; + if (ds->objDepth > JSON_MAX_OBJECT_DEPTH) { + return SetError(ds, -1, "Reached object decoding depth limit"); + } + + newObj = ds->dec->newArray(ds->prv, ds->dec); + len = 0; + + ds->lastType = JT_INVALID; + ds->start++; + + for (;;) { + SkipWhitespace(ds); + + if ((*ds->start) == ']') { + ds->objDepth--; + if (len == 0) { + ds->start++; + return ds->dec->endArray(ds->prv, newObj); + } + + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + return SetError( + ds, -1, "Unexpected character found when decoding array value (1)"); + } - ds->objDepth++; - if (ds->objDepth > JSON_MAX_OBJECT_DEPTH) { - return SetError(ds, -1, "Reached object decoding depth limit"); + itemValue = decode_any(ds); + + if (itemValue == NULL) { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + return NULL; } - newObj = ds->dec->newObject(ds->prv, ds->dec); + if (!ds->dec->arrayAddItem(ds->prv, newObj, itemValue)) { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + return NULL; + } - ds->start++; + SkipWhitespace(ds); - for (;;) { - SkipWhitespace(ds); + switch (*(ds->start++)) { + case ']': { + ds->objDepth--; + return ds->dec->endArray(ds->prv, newObj); + } + case ',': + break; - if ((*ds->start) == '}') { - ds->objDepth--; - ds->start++; - return ds->dec->endObject(ds->prv, newObj); - } + default: + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + return SetError( + ds, -1, "Unexpected character found when decoding array value (2)"); + } - ds->lastType = JT_INVALID; - itemName = decode_any(ds); + len++; + } +} - if (itemName == NULL) { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - return NULL; - } +JSOBJ FASTCALL_MSVC decode_object(struct DecoderState *ds) { + JSOBJ itemName; + JSOBJ itemValue; + JSOBJ newObj; - if (ds->lastType != JT_UTF8) { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - ds->dec->releaseObject(ds->prv, itemName, ds->dec); - return SetError( - ds, -1, - "Key name of object must be 'string' when decoding 'object'"); - } + ds->objDepth++; + if (ds->objDepth > JSON_MAX_OBJECT_DEPTH) { + return SetError(ds, -1, "Reached object decoding depth limit"); + } - SkipWhitespace(ds); + newObj = ds->dec->newObject(ds->prv, ds->dec); - if (*(ds->start++) != ':') { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - ds->dec->releaseObject(ds->prv, itemName, ds->dec); - return SetError(ds, -1, "No ':' found when decoding object value"); - } + ds->start++; - SkipWhitespace(ds); + for (;;) { + SkipWhitespace(ds); - itemValue = decode_any(ds); + if ((*ds->start) == '}') { + ds->objDepth--; + ds->start++; + return ds->dec->endObject(ds->prv, newObj); + } - if (itemValue == NULL) { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - ds->dec->releaseObject(ds->prv, itemName, ds->dec); - return NULL; - } + ds->lastType = JT_INVALID; + itemName = decode_any(ds); - if (!ds->dec->objectAddKey(ds->prv, newObj, itemName, itemValue)) { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - ds->dec->releaseObject(ds->prv, itemName, ds->dec); - ds->dec->releaseObject(ds->prv, itemValue, ds->dec); - return NULL; - } + if (itemName == NULL) { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + return NULL; + } - SkipWhitespace(ds); - - switch (*(ds->start++)) { - case '}': { - ds->objDepth--; - return ds->dec->endObject(ds->prv, newObj); - } - case ',': - break; - - default: - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - return SetError( - ds, -1, - "Unexpected character found when decoding object value"); - } + if (ds->lastType != JT_UTF8) { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + ds->dec->releaseObject(ds->prv, itemName, ds->dec); + return SetError( + ds, -1, "Key name of object must be 'string' when decoding 'object'"); } -} -JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds) { - for (;;) { - switch (*ds->start) { - case '\"': - return decode_string(ds); - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - case 'I': - case 'N': - case '-': - return decode_numeric(ds); - - case '[': - return decode_array(ds); - case '{': - return decode_object(ds); - case 't': - return decode_true(ds); - case 'f': - return decode_false(ds); - case 'n': - return decode_null(ds); - - case ' ': - case '\t': - case '\r': - case '\n': - // White space - ds->start++; - break; - - default: - return SetError(ds, -1, "Expected object or value"); - } + SkipWhitespace(ds); + + if (*(ds->start++) != ':') { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + ds->dec->releaseObject(ds->prv, itemName, ds->dec); + return SetError(ds, -1, "No ':' found when decoding object value"); } -} -JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, - size_t cbBuffer) { - /* - FIXME: Base the size of escBuffer of that of cbBuffer so that the unicode - escaping doesn't run into the wall each time */ - char *locale; - struct DecoderState ds; - wchar_t escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t))]; - JSOBJ ret; - - ds.start = (char *)buffer; - ds.end = ds.start + cbBuffer; - - ds.escStart = escBuffer; - ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t)); - ds.escHeap = 0; - ds.prv = dec->prv; - ds.dec = dec; - ds.dec->errorStr = NULL; - ds.dec->errorOffset = NULL; - ds.objDepth = 0; - - ds.dec = dec; - - locale = setlocale(LC_NUMERIC, NULL); - if (!locale) { - return SetError(&ds, -1, "setlocale call failed"); + SkipWhitespace(ds); + + itemValue = decode_any(ds); + + if (itemValue == NULL) { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + ds->dec->releaseObject(ds->prv, itemName, ds->dec); + return NULL; } - if (strcmp(locale, "C")) { - size_t len = strlen(locale) + 1; - char *saved_locale = malloc(len); - if (saved_locale == NULL) { - return SetError(&ds, -1, "Could not reserve memory block"); - } - memcpy(saved_locale, locale, len); - setlocale(LC_NUMERIC, "C"); - ret = decode_any(&ds); - setlocale(LC_NUMERIC, saved_locale); - free(saved_locale); - } else { - ret = decode_any(&ds); + if (!ds->dec->objectAddKey(ds->prv, newObj, itemName, itemValue)) { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + ds->dec->releaseObject(ds->prv, itemName, ds->dec); + ds->dec->releaseObject(ds->prv, itemValue, ds->dec); + return NULL; } - if (ds.escHeap) { - dec->free(ds.escStart); + SkipWhitespace(ds); + + switch (*(ds->start++)) { + case '}': { + ds->objDepth--; + return ds->dec->endObject(ds->prv, newObj); } + case ',': + break; - SkipWhitespace(&ds); + default: + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + return SetError(ds, -1, + "Unexpected character found when decoding object value"); + } + } +} - if (ds.start != ds.end && ret) { - dec->releaseObject(ds.prv, ret, ds.dec); - return SetError(&ds, -1, "Trailing data"); +JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds) { + for (;;) { + switch (*ds->start) { + case '\"': + return decode_string(ds); + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case 'I': + case 'N': + case '-': + return decode_numeric(ds); + + case '[': + return decode_array(ds); + case '{': + return decode_object(ds); + case 't': + return decode_true(ds); + case 'f': + return decode_false(ds); + case 'n': + return decode_null(ds); + + case ' ': + case '\t': + case '\r': + case '\n': + // White space + ds->start++; + break; + + default: + return SetError(ds, -1, "Expected object or value"); } + } +} - return ret; +JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, + size_t cbBuffer) { + /* + FIXME: Base the size of escBuffer of that of cbBuffer so that the unicode + escaping doesn't run into the wall each time */ + char *locale; + struct DecoderState ds; + wchar_t escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t))]; + JSOBJ ret; + + ds.start = (char *)buffer; + ds.end = ds.start + cbBuffer; + + ds.escStart = escBuffer; + ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t)); + ds.escHeap = 0; + ds.prv = dec->prv; + ds.dec = dec; + ds.dec->errorStr = NULL; + ds.dec->errorOffset = NULL; + ds.objDepth = 0; + + ds.dec = dec; + + locale = setlocale(LC_NUMERIC, NULL); + if (!locale) { + return SetError(&ds, -1, "setlocale call failed"); + } + + if (strcmp(locale, "C")) { + size_t len = strlen(locale) + 1; + char *saved_locale = malloc(len); + if (saved_locale == NULL) { + return SetError(&ds, -1, "Could not reserve memory block"); + } + memcpy(saved_locale, locale, len); + setlocale(LC_NUMERIC, "C"); + ret = decode_any(&ds); + setlocale(LC_NUMERIC, saved_locale); + free(saved_locale); + } else { + ret = decode_any(&ds); + } + + if (ds.escHeap) { + dec->free(ds.escStart); + } + + SkipWhitespace(&ds); + + if (ds.start != ds.end && ret) { + dec->releaseObject(ds.prv, ret, ds.dec); + return SetError(&ds, -1, "Trailing data"); + } + + return ret; } diff --git a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c index 942bd0b518144..917af4872ecfe 100644 --- a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c @@ -40,6 +40,7 @@ Numeric decoder derived from TCL library // Licence at LICENSES/ULTRAJSON_LICENSE +#include "pandas/vendored/ujson/lib/ultrajson.h" #include #include #include @@ -48,7 +49,6 @@ Numeric decoder derived from TCL library #include #include #include -#include "pandas/vendored/ujson/lib/ultrajson.h" #ifndef TRUE #define TRUE 1 @@ -359,8 +359,8 @@ static const JSUINT8 g_asciiOutputTable[256] = { 1}; static void SetError(JSOBJ obj, JSONObjectEncoder *enc, const char *message) { - enc->errorMsg = message; - enc->errorObj = obj; + enc->errorMsg = message; + enc->errorObj = obj; } /* @@ -368,371 +368,362 @@ FIXME: Keep track of how big these get across several encoder calls and try to make an estimate That way we won't run our head into the wall each call */ void Buffer_Realloc(JSONObjectEncoder *enc, size_t cbNeeded) { - size_t curSize = enc->end - enc->start; - size_t newSize = curSize * 2; - size_t offset = enc->offset - enc->start; + size_t curSize = enc->end - enc->start; + size_t newSize = curSize * 2; + size_t offset = enc->offset - enc->start; - while (newSize < curSize + cbNeeded) { - newSize *= 2; - } + while (newSize < curSize + cbNeeded) { + newSize *= 2; + } - if (enc->heap) { - enc->start = (char *)enc->realloc(enc->start, newSize); - if (!enc->start) { - SetError(NULL, enc, "Could not reserve memory block"); - return; - } - } else { - char *oldStart = enc->start; - enc->heap = 1; - enc->start = (char *)enc->malloc(newSize); - if (!enc->start) { - SetError(NULL, enc, "Could not reserve memory block"); - return; - } - memcpy(enc->start, oldStart, offset); + if (enc->heap) { + enc->start = (char *)enc->realloc(enc->start, newSize); + if (!enc->start) { + SetError(NULL, enc, "Could not reserve memory block"); + return; } - enc->offset = enc->start + offset; - enc->end = enc->start + newSize; + } else { + char *oldStart = enc->start; + enc->heap = 1; + enc->start = (char *)enc->malloc(newSize); + if (!enc->start) { + SetError(NULL, enc, "Could not reserve memory block"); + return; + } + memcpy(enc->start, oldStart, offset); + } + enc->offset = enc->start + offset; + enc->end = enc->start + newSize; } INLINE_PREFIX void FASTCALL_MSVC Buffer_AppendShortHexUnchecked(char *outputOffset, unsigned short value) { - *(outputOffset++) = g_hexChars[(value & 0xf000) >> 12]; - *(outputOffset++) = g_hexChars[(value & 0x0f00) >> 8]; - *(outputOffset++) = g_hexChars[(value & 0x00f0) >> 4]; - *(outputOffset++) = g_hexChars[(value & 0x000f) >> 0]; + *(outputOffset++) = g_hexChars[(value & 0xf000) >> 12]; + *(outputOffset++) = g_hexChars[(value & 0x0f00) >> 8]; + *(outputOffset++) = g_hexChars[(value & 0x00f0) >> 4]; + *(outputOffset++) = g_hexChars[(value & 0x000f) >> 0]; } int Buffer_EscapeStringUnvalidated(JSONObjectEncoder *enc, const char *io, const char *end) { - char *of = (char *)enc->offset; - - for (;;) { - switch (*io) { - case 0x00: { - if (io < end) { - *(of++) = '\\'; - *(of++) = 'u'; - *(of++) = '0'; - *(of++) = '0'; - *(of++) = '0'; - *(of++) = '0'; - break; - } else { - enc->offset += (of - enc->offset); - return TRUE; - } - } - case '\"': - (*of++) = '\\'; - (*of++) = '\"'; - break; - case '\\': - (*of++) = '\\'; - (*of++) = '\\'; - break; - case '/': - (*of++) = '\\'; - (*of++) = '/'; - break; - case '\b': - (*of++) = '\\'; - (*of++) = 'b'; - break; - case '\f': - (*of++) = '\\'; - (*of++) = 'f'; - break; - case '\n': - (*of++) = '\\'; - (*of++) = 'n'; - break; - case '\r': - (*of++) = '\\'; - (*of++) = 'r'; - break; - case '\t': - (*of++) = '\\'; - (*of++) = 't'; - break; - - case 0x26: // '/' - case 0x3c: // '<' - case 0x3e: // '>' - { - if (enc->encodeHTMLChars) { - // Fall through to \u00XX case below. - } else { - // Same as default case below. - (*of++) = (*io); - break; - } - } - case 0x01: - case 0x02: - case 0x03: - case 0x04: - case 0x05: - case 0x06: - case 0x07: - case 0x0b: - case 0x0e: - case 0x0f: - case 0x10: - case 0x11: - case 0x12: - case 0x13: - case 0x14: - case 0x15: - case 0x16: - case 0x17: - case 0x18: - case 0x19: - case 0x1a: - case 0x1b: - case 0x1c: - case 0x1d: - case 0x1e: - case 0x1f: { - *(of++) = '\\'; - *(of++) = 'u'; - *(of++) = '0'; - *(of++) = '0'; - *(of++) = g_hexChars[(unsigned char)(((*io) & 0xf0) >> 4)]; - *(of++) = g_hexChars[(unsigned char)((*io) & 0x0f)]; - break; - } - default: - (*of++) = (*io); - break; - } - io++; + char *of = (char *)enc->offset; + + for (;;) { + switch (*io) { + case 0x00: { + if (io < end) { + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + break; + } else { + enc->offset += (of - enc->offset); + return TRUE; + } + } + case '\"': + (*of++) = '\\'; + (*of++) = '\"'; + break; + case '\\': + (*of++) = '\\'; + (*of++) = '\\'; + break; + case '/': + (*of++) = '\\'; + (*of++) = '/'; + break; + case '\b': + (*of++) = '\\'; + (*of++) = 'b'; + break; + case '\f': + (*of++) = '\\'; + (*of++) = 'f'; + break; + case '\n': + (*of++) = '\\'; + (*of++) = 'n'; + break; + case '\r': + (*of++) = '\\'; + (*of++) = 'r'; + break; + case '\t': + (*of++) = '\\'; + (*of++) = 't'; + break; + + case 0x26: // '/' + case 0x3c: // '<' + case 0x3e: // '>' + { + if (enc->encodeHTMLChars) { + // Fall through to \u00XX case below. + } else { + // Same as default case below. + (*of++) = (*io); + break; + } } + case 0x01: + case 0x02: + case 0x03: + case 0x04: + case 0x05: + case 0x06: + case 0x07: + case 0x0b: + case 0x0e: + case 0x0f: + case 0x10: + case 0x11: + case 0x12: + case 0x13: + case 0x14: + case 0x15: + case 0x16: + case 0x17: + case 0x18: + case 0x19: + case 0x1a: + case 0x1b: + case 0x1c: + case 0x1d: + case 0x1e: + case 0x1f: { + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = g_hexChars[(unsigned char)(((*io) & 0xf0) >> 4)]; + *(of++) = g_hexChars[(unsigned char)((*io) & 0x0f)]; + break; + } + default: + (*of++) = (*io); + break; + } + io++; + } } int Buffer_EscapeStringValidated(JSOBJ obj, JSONObjectEncoder *enc, const char *io, const char *end) { - JSUTF32 ucs; - char *of = (char *)enc->offset; - - for (;;) { - JSUINT8 utflen = g_asciiOutputTable[(unsigned char)*io]; - - switch (utflen) { - case 0: { - if (io < end) { - *(of++) = '\\'; - *(of++) = 'u'; - *(of++) = '0'; - *(of++) = '0'; - *(of++) = '0'; - *(of++) = '0'; - io++; - continue; - } else { - enc->offset += (of - enc->offset); - return TRUE; - } - } - - case 1: { - *(of++) = (*io++); - continue; - } - - case 2: { - JSUTF32 in; - JSUTF16 in16; - - if (end - io < 1) { - enc->offset += (of - enc->offset); - SetError( - obj, enc, - "Unterminated UTF-8 sequence when encoding string"); - return FALSE; - } - - memcpy(&in16, io, sizeof(JSUTF16)); - in = (JSUTF32)in16; + JSUTF32 ucs; + char *of = (char *)enc->offset; + + for (;;) { + JSUINT8 utflen = g_asciiOutputTable[(unsigned char)*io]; + + switch (utflen) { + case 0: { + if (io < end) { + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + io++; + continue; + } else { + enc->offset += (of - enc->offset); + return TRUE; + } + } + + case 1: { + *(of++) = (*io++); + continue; + } + + case 2: { + JSUTF32 in; + JSUTF16 in16; + + if (end - io < 1) { + enc->offset += (of - enc->offset); + SetError(obj, enc, "Unterminated UTF-8 sequence when encoding string"); + return FALSE; + } + + memcpy(&in16, io, sizeof(JSUTF16)); + in = (JSUTF32)in16; #ifdef __LITTLE_ENDIAN__ - ucs = ((in & 0x1f) << 6) | ((in >> 8) & 0x3f); + ucs = ((in & 0x1f) << 6) | ((in >> 8) & 0x3f); #else - ucs = ((in & 0x1f00) >> 2) | (in & 0x3f); + ucs = ((in & 0x1f00) >> 2) | (in & 0x3f); #endif - if (ucs < 0x80) { - enc->offset += (of - enc->offset); - SetError(obj, enc, - "Overlong 2 byte UTF-8 sequence detected when " - "encoding string"); - return FALSE; - } - - io += 2; - break; - } - - case 3: { - JSUTF32 in; - JSUTF16 in16; - JSUINT8 in8; - - if (end - io < 2) { - enc->offset += (of - enc->offset); - SetError( - obj, enc, - "Unterminated UTF-8 sequence when encoding string"); - return FALSE; - } - - memcpy(&in16, io, sizeof(JSUTF16)); - memcpy(&in8, io + 2, sizeof(JSUINT8)); + if (ucs < 0x80) { + enc->offset += (of - enc->offset); + SetError(obj, enc, + "Overlong 2 byte UTF-8 sequence detected when " + "encoding string"); + return FALSE; + } + + io += 2; + break; + } + + case 3: { + JSUTF32 in; + JSUTF16 in16; + JSUINT8 in8; + + if (end - io < 2) { + enc->offset += (of - enc->offset); + SetError(obj, enc, "Unterminated UTF-8 sequence when encoding string"); + return FALSE; + } + + memcpy(&in16, io, sizeof(JSUTF16)); + memcpy(&in8, io + 2, sizeof(JSUINT8)); #ifdef __LITTLE_ENDIAN__ - in = (JSUTF32)in16; - in |= in8 << 16; - ucs = ((in & 0x0f) << 12) | ((in & 0x3f00) >> 2) | - ((in & 0x3f0000) >> 16); + in = (JSUTF32)in16; + in |= in8 << 16; + ucs = + ((in & 0x0f) << 12) | ((in & 0x3f00) >> 2) | ((in & 0x3f0000) >> 16); #else - in = in16 << 8; - in |= in8; - ucs = - ((in & 0x0f0000) >> 4) | ((in & 0x3f00) >> 2) | (in & 0x3f); + in = in16 << 8; + in |= in8; + ucs = ((in & 0x0f0000) >> 4) | ((in & 0x3f00) >> 2) | (in & 0x3f); #endif - if (ucs < 0x800) { - enc->offset += (of - enc->offset); - SetError(obj, enc, - "Overlong 3 byte UTF-8 sequence detected when " - "encoding string"); - return FALSE; - } - - io += 3; - break; - } - case 4: { - JSUTF32 in; - - if (end - io < 3) { - enc->offset += (of - enc->offset); - SetError( - obj, enc, - "Unterminated UTF-8 sequence when encoding string"); - return FALSE; - } - - memcpy(&in, io, sizeof(JSUTF32)); + if (ucs < 0x800) { + enc->offset += (of - enc->offset); + SetError(obj, enc, + "Overlong 3 byte UTF-8 sequence detected when " + "encoding string"); + return FALSE; + } + + io += 3; + break; + } + case 4: { + JSUTF32 in; + + if (end - io < 3) { + enc->offset += (of - enc->offset); + SetError(obj, enc, "Unterminated UTF-8 sequence when encoding string"); + return FALSE; + } + + memcpy(&in, io, sizeof(JSUTF32)); #ifdef __LITTLE_ENDIAN__ - ucs = ((in & 0x07) << 18) | ((in & 0x3f00) << 4) | - ((in & 0x3f0000) >> 10) | ((in & 0x3f000000) >> 24); + ucs = ((in & 0x07) << 18) | ((in & 0x3f00) << 4) | + ((in & 0x3f0000) >> 10) | ((in & 0x3f000000) >> 24); #else - ucs = ((in & 0x07000000) >> 6) | ((in & 0x3f0000) >> 4) | - ((in & 0x3f00) >> 2) | (in & 0x3f); + ucs = ((in & 0x07000000) >> 6) | ((in & 0x3f0000) >> 4) | + ((in & 0x3f00) >> 2) | (in & 0x3f); #endif - if (ucs < 0x10000) { - enc->offset += (of - enc->offset); - SetError(obj, enc, - "Overlong 4 byte UTF-8 sequence detected when " - "encoding string"); - return FALSE; - } - - io += 4; - break; - } - - case 5: - case 6: { - enc->offset += (of - enc->offset); - SetError( - obj, enc, - "Unsupported UTF-8 sequence length when encoding string"); - return FALSE; - } - - case 29: { - if (enc->encodeHTMLChars) { - // Fall through to \u00XX case 30 below. - } else { - // Same as case 1 above. - *(of++) = (*io++); - continue; - } - } - - case 30: { - // \uXXXX encode - *(of++) = '\\'; - *(of++) = 'u'; - *(of++) = '0'; - *(of++) = '0'; - *(of++) = g_hexChars[(unsigned char)(((*io) & 0xf0) >> 4)]; - *(of++) = g_hexChars[(unsigned char)((*io) & 0x0f)]; - io++; - continue; - } - case 10: - case 12: - case 14: - case 16: - case 18: - case 20: - case 22: - case 24: { - *(of++) = *((char *)(g_escapeChars + utflen + 0)); - *(of++) = *((char *)(g_escapeChars + utflen + 1)); - io++; - continue; - } - // This can never happen, it's here to make L4 VC++ happy - default: { - ucs = 0; - break; - } - } - - /* - If the character is a UTF8 sequence of length > 1 we end up here */ - if (ucs >= 0x10000) { - ucs -= 0x10000; - *(of++) = '\\'; - *(of++) = 'u'; - Buffer_AppendShortHexUnchecked( - of, (unsigned short)(ucs >> 10) + 0xd800); - of += 4; - - *(of++) = '\\'; - *(of++) = 'u'; - Buffer_AppendShortHexUnchecked( - of, (unsigned short)(ucs & 0x3ff) + 0xdc00); - of += 4; - } else { - *(of++) = '\\'; - *(of++) = 'u'; - Buffer_AppendShortHexUnchecked(of, (unsigned short)ucs); - of += 4; - } + if (ucs < 0x10000) { + enc->offset += (of - enc->offset); + SetError(obj, enc, + "Overlong 4 byte UTF-8 sequence detected when " + "encoding string"); + return FALSE; + } + + io += 4; + break; + } + + case 5: + case 6: { + enc->offset += (of - enc->offset); + SetError(obj, enc, + "Unsupported UTF-8 sequence length when encoding string"); + return FALSE; } + + case 29: { + if (enc->encodeHTMLChars) { + // Fall through to \u00XX case 30 below. + } else { + // Same as case 1 above. + *(of++) = (*io++); + continue; + } + } + + case 30: { + // \uXXXX encode + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = g_hexChars[(unsigned char)(((*io) & 0xf0) >> 4)]; + *(of++) = g_hexChars[(unsigned char)((*io) & 0x0f)]; + io++; + continue; + } + case 10: + case 12: + case 14: + case 16: + case 18: + case 20: + case 22: + case 24: { + *(of++) = *((char *)(g_escapeChars + utflen + 0)); + *(of++) = *((char *)(g_escapeChars + utflen + 1)); + io++; + continue; + } + // This can never happen, it's here to make L4 VC++ happy + default: { + ucs = 0; + break; + } + } + + /* + If the character is a UTF8 sequence of length > 1 we end up here */ + if (ucs >= 0x10000) { + ucs -= 0x10000; + *(of++) = '\\'; + *(of++) = 'u'; + Buffer_AppendShortHexUnchecked(of, (unsigned short)(ucs >> 10) + 0xd800); + of += 4; + + *(of++) = '\\'; + *(of++) = 'u'; + Buffer_AppendShortHexUnchecked(of, + (unsigned short)(ucs & 0x3ff) + 0xdc00); + of += 4; + } else { + *(of++) = '\\'; + *(of++) = 'u'; + Buffer_AppendShortHexUnchecked(of, (unsigned short)ucs); + of += 4; + } + } } -#define Buffer_Reserve(__enc, __len) \ - if ( (size_t) ((__enc)->end - (__enc)->offset) < (size_t) (__len)) \ - { \ - Buffer_Realloc((__enc), (__len));\ - } \ +#define Buffer_Reserve(__enc, __len) \ + if ((size_t)((__enc)->end - (__enc)->offset) < (size_t)(__len)) { \ + Buffer_Realloc((__enc), (__len)); \ + } #define Buffer_AppendCharUnchecked(__enc, __chr) *((__enc)->offset++) = __chr; -INLINE_PREFIX void FASTCALL_MSVC strreverse(char *begin, - char *end) { - char aux; - while (end > begin) aux = *end, *end-- = *begin, *begin++ = aux; +INLINE_PREFIX void FASTCALL_MSVC strreverse(char *begin, char *end) { + char aux; + while (end > begin) + aux = *end, *end-- = *begin, *begin++ = aux; } void Buffer_AppendIndentNewlineUnchecked(JSONObjectEncoder *enc) { - if (enc->indent > 0) Buffer_AppendCharUnchecked(enc, '\n'); + if (enc->indent > 0) + Buffer_AppendCharUnchecked(enc, '\n'); } // This function could be refactored to only accept enc as an argument, @@ -747,172 +738,174 @@ void Buffer_AppendIndentUnchecked(JSONObjectEncoder *enc, JSINT32 value) { } void Buffer_AppendIntUnchecked(JSONObjectEncoder *enc, JSINT32 value) { - char *wstr; - JSUINT32 uvalue = (value < 0) ? -value : value; - wstr = enc->offset; - - // Conversion. Number is reversed. - do { - *wstr++ = (char)(48 + (uvalue % 10)); - } while (uvalue /= 10); - if (value < 0) *wstr++ = '-'; - - // Reverse string - strreverse(enc->offset, wstr - 1); - enc->offset += (wstr - (enc->offset)); + char *wstr; + JSUINT32 uvalue = (value < 0) ? -value : value; + wstr = enc->offset; + + // Conversion. Number is reversed. + do { + *wstr++ = (char)(48 + (uvalue % 10)); + } while (uvalue /= 10); + if (value < 0) + *wstr++ = '-'; + + // Reverse string + strreverse(enc->offset, wstr - 1); + enc->offset += (wstr - (enc->offset)); } void Buffer_AppendLongUnchecked(JSONObjectEncoder *enc, JSINT64 value) { - char *wstr; - JSUINT64 uvalue; - if (value == INT64_MIN) { - uvalue = INT64_MAX + UINT64_C(1); - } else { - uvalue = (value < 0) ? -value : value; - } + char *wstr; + JSUINT64 uvalue; + if (value == INT64_MIN) { + uvalue = INT64_MAX + UINT64_C(1); + } else { + uvalue = (value < 0) ? -value : value; + } - wstr = enc->offset; - // Conversion. Number is reversed. + wstr = enc->offset; + // Conversion. Number is reversed. - do { - *wstr++ = (char)(48 + (uvalue % 10ULL)); - } while (uvalue /= 10ULL); - if (value < 0) *wstr++ = '-'; + do { + *wstr++ = (char)(48 + (uvalue % 10ULL)); + } while (uvalue /= 10ULL); + if (value < 0) + *wstr++ = '-'; - // Reverse string - strreverse(enc->offset, wstr - 1); - enc->offset += (wstr - (enc->offset)); + // Reverse string + strreverse(enc->offset, wstr - 1); + enc->offset += (wstr - (enc->offset)); } int Buffer_AppendDoubleUnchecked(JSOBJ obj, JSONObjectEncoder *enc, double value) { - /* if input is beyond the thresholds, revert to exponential */ - const double thres_max = (double)1e16 - 1; - const double thres_min = (double)1e-15; - char precision_str[20]; - int count; - double diff = 0.0; - char *str = enc->offset; - char *wstr = str; - unsigned long long whole; - double tmp; - unsigned long long frac; - int neg; - double pow10; - - if (value == HUGE_VAL || value == -HUGE_VAL) { - SetError(obj, enc, "Invalid Inf value when encoding double"); - return FALSE; - } + /* if input is beyond the thresholds, revert to exponential */ + const double thres_max = (double)1e16 - 1; + const double thres_min = (double)1e-15; + char precision_str[20]; + int count; + double diff = 0.0; + char *str = enc->offset; + char *wstr = str; + unsigned long long whole; + double tmp; + unsigned long long frac; + int neg; + double pow10; + + if (value == HUGE_VAL || value == -HUGE_VAL) { + SetError(obj, enc, "Invalid Inf value when encoding double"); + return FALSE; + } - if (!(value == value)) { - SetError(obj, enc, "Invalid Nan value when encoding double"); - return FALSE; - } + if (!(value == value)) { + SetError(obj, enc, "Invalid Nan value when encoding double"); + return FALSE; + } - /* we'll work in positive values and deal with the - negative sign issue later */ - neg = 0; - if (value < 0) { - neg = 1; - value = -value; - } + /* we'll work in positive values and deal with the + negative sign issue later */ + neg = 0; + if (value < 0) { + neg = 1; + value = -value; + } - /* - for very large or small numbers switch back to native sprintf for - exponentials. anyone want to write code to replace this? */ - if (value > thres_max || (value != 0.0 && fabs(value) < thres_min)) { - precision_str[0] = '%'; - precision_str[1] = '.'; + /* + for very large or small numbers switch back to native sprintf for + exponentials. anyone want to write code to replace this? */ + if (value > thres_max || (value != 0.0 && fabs(value) < thres_min)) { + precision_str[0] = '%'; + precision_str[1] = '.'; #if defined(_WIN32) && defined(_MSC_VER) - sprintf_s(precision_str + 2, sizeof(precision_str) - 2, "%ug", - enc->doublePrecision); - enc->offset += sprintf_s(str, enc->end - enc->offset, precision_str, - neg ? -value : value); + sprintf_s(precision_str + 2, sizeof(precision_str) - 2, "%ug", + enc->doublePrecision); + enc->offset += sprintf_s(str, enc->end - enc->offset, precision_str, + neg ? -value : value); #else - snprintf(precision_str + 2, sizeof(precision_str) - 2, "%ug", - enc->doublePrecision); - enc->offset += snprintf(str, enc->end - enc->offset, precision_str, - neg ? -value : value); + snprintf(precision_str + 2, sizeof(precision_str) - 2, "%ug", + enc->doublePrecision); + enc->offset += snprintf(str, enc->end - enc->offset, precision_str, + neg ? -value : value); #endif - return TRUE; - } + return TRUE; + } - pow10 = g_pow10[enc->doublePrecision]; + pow10 = g_pow10[enc->doublePrecision]; - whole = (unsigned long long)value; - tmp = (value - whole) * pow10; - frac = (unsigned long long)(tmp); - diff = tmp - frac; + whole = (unsigned long long)value; + tmp = (value - whole) * pow10; + frac = (unsigned long long)(tmp); + diff = tmp - frac; + + if (diff > 0.5) { + ++frac; + } else if (diff == 0.5 && ((frac == 0) || (frac & 1))) { + /* if halfway, round up if odd, OR + if last digit is 0. That last part is strange */ + ++frac; + } + + // handle rollover, e.g. + // case 0.99 with prec 1 is 1.0 and case 0.95 with prec is 1.0 as well + if (frac >= pow10) { + frac = 0; + ++whole; + } + + if (enc->doublePrecision == 0) { + diff = value - whole; if (diff > 0.5) { - ++frac; - } else if (diff == 0.5 && ((frac == 0) || (frac & 1))) { - /* if halfway, round up if odd, OR - if last digit is 0. That last part is strange */ - ++frac; + /* greater than 0.5, round up, e.g. 1.6 -> 2 */ + ++whole; + } else if (diff == 0.5 && (whole & 1)) { + /* exactly 0.5 and ODD, then round up */ + /* 1.5 -> 2, but 2.5 -> 2 */ + ++whole; } - // handle rollover, e.g. - // case 0.99 with prec 1 is 1.0 and case 0.95 with prec is 1.0 as well - if (frac >= pow10) { - frac = 0; - ++whole; + // vvvvvvvvvvvvvvvvvvv Diff from modp_dto2 + } else if (frac) { + count = enc->doublePrecision; + // now do fractional part, as an unsigned number + // we know it is not 0 but we can have leading zeros, these + // should be removed + while (!(frac % 10)) { + --count; + frac /= 10; } + //^^^^^^^^^^^^^^^^^^^ Diff from modp_dto2 - if (enc->doublePrecision == 0) { - diff = value - whole; - - if (diff > 0.5) { - /* greater than 0.5, round up, e.g. 1.6 -> 2 */ - ++whole; - } else if (diff == 0.5 && (whole & 1)) { - /* exactly 0.5 and ODD, then round up */ - /* 1.5 -> 2, but 2.5 -> 2 */ - ++whole; - } - - // vvvvvvvvvvvvvvvvvvv Diff from modp_dto2 - } else if (frac) { - count = enc->doublePrecision; - // now do fractional part, as an unsigned number - // we know it is not 0 but we can have leading zeros, these - // should be removed - while (!(frac % 10)) { - --count; - frac /= 10; - } - //^^^^^^^^^^^^^^^^^^^ Diff from modp_dto2 - - // now do fractional part, as an unsigned number - do { - --count; - *wstr++ = (char)(48 + (frac % 10)); - } while (frac /= 10); - // add extra 0s - while (count-- > 0) { - *wstr++ = '0'; - } - // add decimal - *wstr++ = '.'; - } else { - *wstr++ = '0'; - *wstr++ = '.'; + // now do fractional part, as an unsigned number + do { + --count; + *wstr++ = (char)(48 + (frac % 10)); + } while (frac /= 10); + // add extra 0s + while (count-- > 0) { + *wstr++ = '0'; } + // add decimal + *wstr++ = '.'; + } else { + *wstr++ = '0'; + *wstr++ = '.'; + } - // Do whole part. Take care of sign - // conversion. Number is reversed. - do { - *wstr++ = (char)(48 + (whole % 10)); - } while (whole /= 10); + // Do whole part. Take care of sign + // conversion. Number is reversed. + do { + *wstr++ = (char)(48 + (whole % 10)); + } while (whole /= 10); - if (neg) { - *wstr++ = '-'; - } - strreverse(str, wstr - 1); - enc->offset += (wstr - (enc->offset)); + if (neg) { + *wstr++ = '-'; + } + strreverse(str, wstr - 1); + enc->offset += (wstr - (enc->offset)); - return TRUE; + return TRUE; } /* @@ -925,291 +918,287 @@ Perhaps implement recursion detection */ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t cbName) { - const char *value; - char *objName; - int count; - JSOBJ iterObj; - size_t szlen; - JSONTypeContext tc; - tc.encoder = enc; - - if (enc->level > enc->recursionMax) { - SetError(obj, enc, "Maximum recursion level reached"); - return; - } + const char *value; + char *objName; + int count; + JSOBJ iterObj; + size_t szlen; + JSONTypeContext tc; + tc.encoder = enc; + + if (enc->level > enc->recursionMax) { + SetError(obj, enc, "Maximum recursion level reached"); + return; + } - /* - This reservation must hold + /* + This reservation must hold - length of _name as encoded worst case + - maxLength of double to string OR maxLength of JSLONG to string - */ + length of _name as encoded worst case + + maxLength of double to string OR maxLength of JSLONG to string + */ - Buffer_Reserve(enc, 256 + RESERVE_STRING(cbName)); - if (enc->errorMsg) { - return; - } + Buffer_Reserve(enc, 256 + RESERVE_STRING(cbName)); + if (enc->errorMsg) { + return; + } - if (name) { - Buffer_AppendCharUnchecked(enc, '\"'); + if (name) { + Buffer_AppendCharUnchecked(enc, '\"'); - if (enc->forceASCII) { - if (!Buffer_EscapeStringValidated(obj, enc, name, name + cbName)) { - return; - } - } else { - if (!Buffer_EscapeStringUnvalidated(enc, name, name + cbName)) { - return; - } - } + if (enc->forceASCII) { + if (!Buffer_EscapeStringValidated(obj, enc, name, name + cbName)) { + return; + } + } else { + if (!Buffer_EscapeStringUnvalidated(enc, name, name + cbName)) { + return; + } + } - Buffer_AppendCharUnchecked(enc, '\"'); + Buffer_AppendCharUnchecked(enc, '\"'); - Buffer_AppendCharUnchecked(enc, ':'); + Buffer_AppendCharUnchecked(enc, ':'); #ifndef JSON_NO_EXTRA_WHITESPACE - Buffer_AppendCharUnchecked(enc, ' '); + Buffer_AppendCharUnchecked(enc, ' '); #endif - } + } - enc->beginTypeContext(obj, &tc); + enc->beginTypeContext(obj, &tc); - switch (tc.type) { - case JT_INVALID: { - return; - } + switch (tc.type) { + case JT_INVALID: { + return; + } - case JT_ARRAY: { - count = 0; - enc->iterBegin(obj, &tc); + case JT_ARRAY: { + count = 0; + enc->iterBegin(obj, &tc); - Buffer_AppendCharUnchecked(enc, '['); - Buffer_AppendIndentNewlineUnchecked(enc); + Buffer_AppendCharUnchecked(enc, '['); + Buffer_AppendIndentNewlineUnchecked(enc); - while (enc->iterNext(obj, &tc)) { - if (count > 0) { - Buffer_AppendCharUnchecked(enc, ','); + while (enc->iterNext(obj, &tc)) { + if (count > 0) { + Buffer_AppendCharUnchecked(enc, ','); #ifndef JSON_NO_EXTRA_WHITESPACE - Buffer_AppendCharUnchecked(buffer, ' '); + Buffer_AppendCharUnchecked(buffer, ' '); #endif - Buffer_AppendIndentNewlineUnchecked(enc); - } - - iterObj = enc->iterGetValue(obj, &tc); - - enc->level++; - Buffer_AppendIndentUnchecked(enc, enc->level); - encode(iterObj, enc, NULL, 0); - count++; - } - - enc->iterEnd(obj, &tc); - Buffer_AppendIndentNewlineUnchecked(enc); - Buffer_AppendIndentUnchecked(enc, enc->level); - Buffer_AppendCharUnchecked(enc, ']'); - break; - } - - case JT_OBJECT: { - count = 0; - enc->iterBegin(obj, &tc); - - Buffer_AppendCharUnchecked(enc, '{'); - Buffer_AppendIndentNewlineUnchecked(enc); - - while (enc->iterNext(obj, &tc)) { - if (count > 0) { - Buffer_AppendCharUnchecked(enc, ','); + Buffer_AppendIndentNewlineUnchecked(enc); + } + + iterObj = enc->iterGetValue(obj, &tc); + + enc->level++; + Buffer_AppendIndentUnchecked(enc, enc->level); + encode(iterObj, enc, NULL, 0); + count++; + } + + enc->iterEnd(obj, &tc); + Buffer_AppendIndentNewlineUnchecked(enc); + Buffer_AppendIndentUnchecked(enc, enc->level); + Buffer_AppendCharUnchecked(enc, ']'); + break; + } + + case JT_OBJECT: { + count = 0; + enc->iterBegin(obj, &tc); + + Buffer_AppendCharUnchecked(enc, '{'); + Buffer_AppendIndentNewlineUnchecked(enc); + + while (enc->iterNext(obj, &tc)) { + if (count > 0) { + Buffer_AppendCharUnchecked(enc, ','); #ifndef JSON_NO_EXTRA_WHITESPACE - Buffer_AppendCharUnchecked(enc, ' '); + Buffer_AppendCharUnchecked(enc, ' '); #endif - Buffer_AppendIndentNewlineUnchecked(enc); - } - - iterObj = enc->iterGetValue(obj, &tc); - objName = enc->iterGetName(obj, &tc, &szlen); - - enc->level++; - Buffer_AppendIndentUnchecked(enc, enc->level); - encode(iterObj, enc, objName, szlen); - count++; - } - - enc->iterEnd(obj, &tc); - Buffer_AppendIndentNewlineUnchecked(enc); - Buffer_AppendIndentUnchecked(enc, enc->level); - Buffer_AppendCharUnchecked(enc, '}'); - break; - } - - case JT_LONG: { - Buffer_AppendLongUnchecked(enc, enc->getLongValue(obj, &tc)); - break; - } - - case JT_INT: { - Buffer_AppendIntUnchecked(enc, enc->getIntValue(obj, &tc)); - break; - } - - case JT_TRUE: { - Buffer_AppendCharUnchecked(enc, 't'); - Buffer_AppendCharUnchecked(enc, 'r'); - Buffer_AppendCharUnchecked(enc, 'u'); - Buffer_AppendCharUnchecked(enc, 'e'); - break; - } - - case JT_FALSE: { - Buffer_AppendCharUnchecked(enc, 'f'); - Buffer_AppendCharUnchecked(enc, 'a'); - Buffer_AppendCharUnchecked(enc, 'l'); - Buffer_AppendCharUnchecked(enc, 's'); - Buffer_AppendCharUnchecked(enc, 'e'); - break; - } - - case JT_NULL: { - Buffer_AppendCharUnchecked(enc, 'n'); - Buffer_AppendCharUnchecked(enc, 'u'); - Buffer_AppendCharUnchecked(enc, 'l'); - Buffer_AppendCharUnchecked(enc, 'l'); - break; - } - - case JT_DOUBLE: { - if (!Buffer_AppendDoubleUnchecked(obj, enc, - enc->getDoubleValue(obj, &tc))) { - enc->endTypeContext(obj, &tc); - enc->level--; - return; - } - break; - } - - case JT_UTF8: { - value = enc->getStringValue(obj, &tc, &szlen); - if (enc->errorMsg) { - enc->endTypeContext(obj, &tc); - return; - } - Buffer_Reserve(enc, RESERVE_STRING(szlen)); - Buffer_AppendCharUnchecked(enc, '\"'); - - if (enc->forceASCII) { - if (!Buffer_EscapeStringValidated(obj, enc, value, - value + szlen)) { - enc->endTypeContext(obj, &tc); - enc->level--; - return; - } - } else { - if (!Buffer_EscapeStringUnvalidated(enc, value, - value + szlen)) { - enc->endTypeContext(obj, &tc); - enc->level--; - return; - } - } - - Buffer_AppendCharUnchecked(enc, '\"'); - break; - } - - case JT_BIGNUM: { - value = enc->getBigNumStringValue(obj, &tc, &szlen); - - Buffer_Reserve(enc, RESERVE_STRING(szlen)); - if (enc->errorMsg) { - enc->endTypeContext(obj, &tc); - return; - } - - if (enc->forceASCII) { - if (!Buffer_EscapeStringValidated(obj, enc, value, - value + szlen)) { - enc->endTypeContext(obj, &tc); - enc->level--; - return; - } - } else { - if (!Buffer_EscapeStringUnvalidated(enc, value, - value + szlen)) { - enc->endTypeContext(obj, &tc); - enc->level--; - return; - } - } - - break; - } + Buffer_AppendIndentNewlineUnchecked(enc); + } + + iterObj = enc->iterGetValue(obj, &tc); + objName = enc->iterGetName(obj, &tc, &szlen); + + enc->level++; + Buffer_AppendIndentUnchecked(enc, enc->level); + encode(iterObj, enc, objName, szlen); + count++; } - enc->endTypeContext(obj, &tc); - enc->level--; -} + enc->iterEnd(obj, &tc); + Buffer_AppendIndentNewlineUnchecked(enc); + Buffer_AppendIndentUnchecked(enc, enc->level); + Buffer_AppendCharUnchecked(enc, '}'); + break; + } -char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, - size_t _cbBuffer) { - char *locale; - enc->malloc = enc->malloc ? enc->malloc : malloc; - enc->free = enc->free ? enc->free : free; - enc->realloc = enc->realloc ? enc->realloc : realloc; - enc->errorMsg = NULL; - enc->errorObj = NULL; - enc->level = 0; - - if (enc->recursionMax < 1) { - enc->recursionMax = JSON_MAX_RECURSION_DEPTH; + case JT_LONG: { + Buffer_AppendLongUnchecked(enc, enc->getLongValue(obj, &tc)); + break; + } + + case JT_INT: { + Buffer_AppendIntUnchecked(enc, enc->getIntValue(obj, &tc)); + break; + } + + case JT_TRUE: { + Buffer_AppendCharUnchecked(enc, 't'); + Buffer_AppendCharUnchecked(enc, 'r'); + Buffer_AppendCharUnchecked(enc, 'u'); + Buffer_AppendCharUnchecked(enc, 'e'); + break; + } + + case JT_FALSE: { + Buffer_AppendCharUnchecked(enc, 'f'); + Buffer_AppendCharUnchecked(enc, 'a'); + Buffer_AppendCharUnchecked(enc, 'l'); + Buffer_AppendCharUnchecked(enc, 's'); + Buffer_AppendCharUnchecked(enc, 'e'); + break; + } + + case JT_NULL: { + Buffer_AppendCharUnchecked(enc, 'n'); + Buffer_AppendCharUnchecked(enc, 'u'); + Buffer_AppendCharUnchecked(enc, 'l'); + Buffer_AppendCharUnchecked(enc, 'l'); + break; + } + + case JT_DOUBLE: { + if (!Buffer_AppendDoubleUnchecked(obj, enc, + enc->getDoubleValue(obj, &tc))) { + enc->endTypeContext(obj, &tc); + enc->level--; + return; } + break; + } - if (enc->doublePrecision < 0 || - enc->doublePrecision > JSON_DOUBLE_MAX_DECIMALS) { - enc->doublePrecision = JSON_DOUBLE_MAX_DECIMALS; + case JT_UTF8: { + value = enc->getStringValue(obj, &tc, &szlen); + if (enc->errorMsg) { + enc->endTypeContext(obj, &tc); + return; } + Buffer_Reserve(enc, RESERVE_STRING(szlen)); + Buffer_AppendCharUnchecked(enc, '\"'); - if (_buffer == NULL) { - _cbBuffer = 32768; - enc->start = (char *)enc->malloc(_cbBuffer); - if (!enc->start) { - SetError(obj, enc, "Could not reserve memory block"); - return NULL; - } - enc->heap = 1; + if (enc->forceASCII) { + if (!Buffer_EscapeStringValidated(obj, enc, value, value + szlen)) { + enc->endTypeContext(obj, &tc); + enc->level--; + return; + } } else { - enc->start = _buffer; - enc->heap = 0; + if (!Buffer_EscapeStringUnvalidated(enc, value, value + szlen)) { + enc->endTypeContext(obj, &tc); + enc->level--; + return; + } } - enc->end = enc->start + _cbBuffer; - enc->offset = enc->start; + Buffer_AppendCharUnchecked(enc, '\"'); + break; + } - locale = setlocale(LC_NUMERIC, NULL); - if (!locale) { - SetError(NULL, enc, "setlocale call failed"); - return NULL; + case JT_BIGNUM: { + value = enc->getBigNumStringValue(obj, &tc, &szlen); + + Buffer_Reserve(enc, RESERVE_STRING(szlen)); + if (enc->errorMsg) { + enc->endTypeContext(obj, &tc); + return; } - if (strcmp(locale, "C")) { - size_t len = strlen(locale) + 1; - char *saved_locale = malloc(len); - if (saved_locale == NULL) { - SetError(NULL, enc, "Could not reserve memory block"); - return NULL; - } - memcpy(saved_locale, locale, len); - setlocale(LC_NUMERIC, "C"); - encode(obj, enc, NULL, 0); - setlocale(LC_NUMERIC, saved_locale); - free(saved_locale); + if (enc->forceASCII) { + if (!Buffer_EscapeStringValidated(obj, enc, value, value + szlen)) { + enc->endTypeContext(obj, &tc); + enc->level--; + return; + } } else { - encode(obj, enc, NULL, 0); + if (!Buffer_EscapeStringUnvalidated(enc, value, value + szlen)) { + enc->endTypeContext(obj, &tc); + enc->level--; + return; + } } - Buffer_Reserve(enc, 1); - if (enc->errorMsg) { - return NULL; + break; + } + } + + enc->endTypeContext(obj, &tc); + enc->level--; +} + +char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, + size_t _cbBuffer) { + char *locale; + enc->malloc = enc->malloc ? enc->malloc : malloc; + enc->free = enc->free ? enc->free : free; + enc->realloc = enc->realloc ? enc->realloc : realloc; + enc->errorMsg = NULL; + enc->errorObj = NULL; + enc->level = 0; + + if (enc->recursionMax < 1) { + enc->recursionMax = JSON_MAX_RECURSION_DEPTH; + } + + if (enc->doublePrecision < 0 || + enc->doublePrecision > JSON_DOUBLE_MAX_DECIMALS) { + enc->doublePrecision = JSON_DOUBLE_MAX_DECIMALS; + } + + if (_buffer == NULL) { + _cbBuffer = 32768; + enc->start = (char *)enc->malloc(_cbBuffer); + if (!enc->start) { + SetError(obj, enc, "Could not reserve memory block"); + return NULL; } - Buffer_AppendCharUnchecked(enc, '\0'); + enc->heap = 1; + } else { + enc->start = _buffer; + enc->heap = 0; + } + + enc->end = enc->start + _cbBuffer; + enc->offset = enc->start; + + locale = setlocale(LC_NUMERIC, NULL); + if (!locale) { + SetError(NULL, enc, "setlocale call failed"); + return NULL; + } + + if (strcmp(locale, "C")) { + size_t len = strlen(locale) + 1; + char *saved_locale = malloc(len); + if (saved_locale == NULL) { + SetError(NULL, enc, "Could not reserve memory block"); + return NULL; + } + memcpy(saved_locale, locale, len); + setlocale(LC_NUMERIC, "C"); + encode(obj, enc, NULL, 0); + setlocale(LC_NUMERIC, saved_locale); + free(saved_locale); + } else { + encode(obj, enc, NULL, 0); + } + + Buffer_Reserve(enc, 1); + if (enc->errorMsg) { + return NULL; + } + Buffer_AppendCharUnchecked(enc, '\0'); - return enc->start; + return enc->start; } diff --git a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c b/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c index d230642b0632d..147282c476c3b 100644 --- a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c +++ b/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c @@ -16,18 +16,19 @@ modification, are permitted provided that the following conditions are met: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) https://github.com/client9/stringencoders -Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights +reserved. Numeric decoder derived from TCL library https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms @@ -40,32 +41,32 @@ Numeric decoder derived from TCL library #define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY #define NO_IMPORT_ARRAY #define PY_SSIZE_T_CLEAN +#include "pandas/vendored/ujson/lib/ultrajson.h" #include #include -#include "pandas/vendored/ujson/lib/ultrajson.h" #define PRINTMARK() typedef struct __PyObjectDecoder { - JSONObjectDecoder dec; + JSONObjectDecoder dec; - void *npyarr; // Numpy context buffer - void *npyarr_addr; // Ref to npyarr ptr to track DECREF calls - npy_intp curdim; // Current array dimension + void *npyarr; // Numpy context buffer + void *npyarr_addr; // Ref to npyarr ptr to track DECREF calls + npy_intp curdim; // Current array dimension - PyArray_Descr *dtype; + PyArray_Descr *dtype; } PyObjectDecoder; typedef struct __NpyArrContext { - PyObject *ret; - PyObject *labels[2]; - PyArray_Dims shape; + PyObject *ret; + PyObject *labels[2]; + PyArray_Dims shape; - PyObjectDecoder *dec; + PyObjectDecoder *dec; - npy_intp i; - npy_intp elsize; - npy_intp elcount; + npy_intp i; + npy_intp elsize; + npy_intp elcount; } NpyArrContext; // Numpy handling based on numpy internal code, specifically the function @@ -87,304 +88,301 @@ int Object_npyArrayListAddItem(void *prv, JSOBJ obj, JSOBJ value); // free the numpy context buffer void Npy_releaseContext(NpyArrContext *npyarr) { - PRINTMARK(); - if (npyarr) { - if (npyarr->shape.ptr) { - PyObject_Free(npyarr->shape.ptr); - } - if (npyarr->dec) { - npyarr->dec->npyarr = NULL; - npyarr->dec->curdim = 0; - } - Py_XDECREF(npyarr->labels[0]); - Py_XDECREF(npyarr->labels[1]); - Py_XDECREF(npyarr->ret); - PyObject_Free(npyarr); + PRINTMARK(); + if (npyarr) { + if (npyarr->shape.ptr) { + PyObject_Free(npyarr->shape.ptr); } + if (npyarr->dec) { + npyarr->dec->npyarr = NULL; + npyarr->dec->curdim = 0; + } + Py_XDECREF(npyarr->labels[0]); + Py_XDECREF(npyarr->labels[1]); + Py_XDECREF(npyarr->ret); + PyObject_Free(npyarr); + } } JSOBJ Object_npyNewArray(void *prv, void *_decoder) { - NpyArrContext *npyarr; - PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder; - PRINTMARK(); - if (decoder->curdim <= 0) { - // start of array - initialise the context buffer - npyarr = decoder->npyarr = PyObject_Malloc(sizeof(NpyArrContext)); - decoder->npyarr_addr = npyarr; - - if (!npyarr) { - PyErr_NoMemory(); - return NULL; - } - - npyarr->dec = decoder; - npyarr->labels[0] = npyarr->labels[1] = NULL; - - npyarr->shape.ptr = PyObject_Malloc(sizeof(npy_intp) * NPY_MAXDIMS); - npyarr->shape.len = 1; - npyarr->ret = NULL; - - npyarr->elsize = 0; - npyarr->elcount = 4; - npyarr->i = 0; - } else { - // starting a new dimension continue the current array (and reshape - // after) - npyarr = (NpyArrContext *)decoder->npyarr; - if (decoder->curdim >= npyarr->shape.len) { - npyarr->shape.len++; - } + NpyArrContext *npyarr; + PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder; + PRINTMARK(); + if (decoder->curdim <= 0) { + // start of array - initialise the context buffer + npyarr = decoder->npyarr = PyObject_Malloc(sizeof(NpyArrContext)); + decoder->npyarr_addr = npyarr; + + if (!npyarr) { + PyErr_NoMemory(); + return NULL; + } + + npyarr->dec = decoder; + npyarr->labels[0] = npyarr->labels[1] = NULL; + + npyarr->shape.ptr = PyObject_Malloc(sizeof(npy_intp) * NPY_MAXDIMS); + npyarr->shape.len = 1; + npyarr->ret = NULL; + + npyarr->elsize = 0; + npyarr->elcount = 4; + npyarr->i = 0; + } else { + // starting a new dimension continue the current array (and reshape + // after) + npyarr = (NpyArrContext *)decoder->npyarr; + if (decoder->curdim >= npyarr->shape.len) { + npyarr->shape.len++; } + } - npyarr->shape.ptr[decoder->curdim] = 0; - decoder->curdim++; - return npyarr; + npyarr->shape.ptr[decoder->curdim] = 0; + decoder->curdim++; + return npyarr; } PyObject *Npy_returnLabelled(NpyArrContext *npyarr) { - PyObject *ret = npyarr->ret; - npy_intp i; - - if (npyarr->labels[0] || npyarr->labels[1]) { - // finished decoding, build tuple with values and labels - ret = PyTuple_New(npyarr->shape.len + 1); - for (i = 0; i < npyarr->shape.len; i++) { - if (npyarr->labels[i]) { - PyTuple_SET_ITEM(ret, i + 1, npyarr->labels[i]); - npyarr->labels[i] = NULL; - } else { - Py_INCREF(Py_None); - PyTuple_SET_ITEM(ret, i + 1, Py_None); - } - } - PyTuple_SET_ITEM(ret, 0, npyarr->ret); + PyObject *ret = npyarr->ret; + npy_intp i; + + if (npyarr->labels[0] || npyarr->labels[1]) { + // finished decoding, build tuple with values and labels + ret = PyTuple_New(npyarr->shape.len + 1); + for (i = 0; i < npyarr->shape.len; i++) { + if (npyarr->labels[i]) { + PyTuple_SET_ITEM(ret, i + 1, npyarr->labels[i]); + npyarr->labels[i] = NULL; + } else { + Py_INCREF(Py_None); + PyTuple_SET_ITEM(ret, i + 1, Py_None); + } } + PyTuple_SET_ITEM(ret, 0, npyarr->ret); + } - return ret; + return ret; } JSOBJ Object_npyEndArray(void *prv, JSOBJ obj) { - PyObject *ret; - char *new_data; - NpyArrContext *npyarr = (NpyArrContext *)obj; - int emptyType = NPY_DEFAULT_TYPE; - npy_intp i; - PRINTMARK(); - if (!npyarr) { - return NULL; - } - - ret = npyarr->ret; - i = npyarr->i; - - npyarr->dec->curdim--; - - if (i == 0 || !npyarr->ret) { - // empty array would not have been initialised so do it now. - if (npyarr->dec->dtype) { - emptyType = npyarr->dec->dtype->type_num; - } - npyarr->ret = ret = - PyArray_EMPTY(npyarr->shape.len, npyarr->shape.ptr, emptyType, 0); - } else if (npyarr->dec->curdim <= 0) { - // realloc to final size - new_data = PyDataMem_RENEW(PyArray_DATA(ret), i * npyarr->elsize); - if (new_data == NULL) { - PyErr_NoMemory(); - Npy_releaseContext(npyarr); - return NULL; - } - ((PyArrayObject *)ret)->data = (void *)new_data; - // PyArray_BYTES(ret) = new_data; - } + PyObject *ret; + char *new_data; + NpyArrContext *npyarr = (NpyArrContext *)obj; + int emptyType = NPY_DEFAULT_TYPE; + npy_intp i; + PRINTMARK(); + if (!npyarr) { + return NULL; + } - if (npyarr->dec->curdim <= 0) { - // finished decoding array, reshape if necessary - if (npyarr->shape.len > 1) { - npyarr->ret = PyArray_Newshape((PyArrayObject *)ret, &npyarr->shape, - NPY_ANYORDER); - Py_DECREF(ret); - } + ret = npyarr->ret; + i = npyarr->i; - ret = Npy_returnLabelled(npyarr); + npyarr->dec->curdim--; - npyarr->ret = NULL; - Npy_releaseContext(npyarr); + if (i == 0 || !npyarr->ret) { + // empty array would not have been initialised so do it now. + if (npyarr->dec->dtype) { + emptyType = npyarr->dec->dtype->type_num; + } + npyarr->ret = ret = + PyArray_EMPTY(npyarr->shape.len, npyarr->shape.ptr, emptyType, 0); + } else if (npyarr->dec->curdim <= 0) { + // realloc to final size + new_data = PyDataMem_RENEW(PyArray_DATA(ret), i * npyarr->elsize); + if (new_data == NULL) { + PyErr_NoMemory(); + Npy_releaseContext(npyarr); + return NULL; + } + ((PyArrayObject *)ret)->data = (void *)new_data; + // PyArray_BYTES(ret) = new_data; + } + + if (npyarr->dec->curdim <= 0) { + // finished decoding array, reshape if necessary + if (npyarr->shape.len > 1) { + npyarr->ret = + PyArray_Newshape((PyArrayObject *)ret, &npyarr->shape, NPY_ANYORDER); + Py_DECREF(ret); } - return ret; + ret = Npy_returnLabelled(npyarr); + + npyarr->ret = NULL; + Npy_releaseContext(npyarr); + } + + return ret; } int Object_npyArrayAddItem(void *prv, JSOBJ obj, JSOBJ value) { - PyObject *type; - PyArray_Descr *dtype; - npy_intp i; - char *new_data, *item; - NpyArrContext *npyarr = (NpyArrContext *)obj; - PRINTMARK(); - if (!npyarr) { - return 0; - } + PyObject *type; + PyArray_Descr *dtype; + npy_intp i; + char *new_data, *item; + NpyArrContext *npyarr = (NpyArrContext *)obj; + PRINTMARK(); + if (!npyarr) { + return 0; + } - i = npyarr->i; + i = npyarr->i; - npyarr->shape.ptr[npyarr->dec->curdim - 1]++; + npyarr->shape.ptr[npyarr->dec->curdim - 1]++; - if (PyArray_Check((PyObject *)value)) { - // multidimensional array, keep decoding values. - return 1; + if (PyArray_Check((PyObject *)value)) { + // multidimensional array, keep decoding values. + return 1; + } + + if (!npyarr->ret) { + // Array not initialised yet. + // We do it here so we can 'sniff' the data type if none was provided + if (!npyarr->dec->dtype) { + type = PyObject_Type(value); + if (!PyArray_DescrConverter(type, &dtype)) { + Py_DECREF(type); + goto fail; + } + Py_INCREF(dtype); + Py_DECREF(type); + } else { + dtype = PyArray_DescrNew(npyarr->dec->dtype); } + // If it's an object or string then fill a Python list and subsequently + // convert. Otherwise we would need to somehow mess about with + // reference counts when renewing memory. + npyarr->elsize = dtype->elsize; + if (PyDataType_REFCHK(dtype) || npyarr->elsize == 0) { + Py_XDECREF(dtype); + + if (npyarr->dec->curdim > 1) { + PyErr_SetString(PyExc_ValueError, + "Cannot decode multidimensional arrays with " + "variable length elements to numpy"); + goto fail; + } + npyarr->elcount = 0; + npyarr->ret = PyList_New(0); + if (!npyarr->ret) { + goto fail; + } + ((JSONObjectDecoder *)npyarr->dec)->newArray = Object_npyNewArrayList; + ((JSONObjectDecoder *)npyarr->dec)->arrayAddItem = + Object_npyArrayListAddItem; + ((JSONObjectDecoder *)npyarr->dec)->endArray = Object_npyEndArrayList; + return Object_npyArrayListAddItem(prv, obj, value); + } + + npyarr->ret = PyArray_NewFromDescr(&PyArray_Type, dtype, 1, + &npyarr->elcount, NULL, NULL, 0, NULL); + if (!npyarr->ret) { - // Array not initialised yet. - // We do it here so we can 'sniff' the data type if none was provided - if (!npyarr->dec->dtype) { - type = PyObject_Type(value); - if (!PyArray_DescrConverter(type, &dtype)) { - Py_DECREF(type); - goto fail; - } - Py_INCREF(dtype); - Py_DECREF(type); - } else { - dtype = PyArray_DescrNew(npyarr->dec->dtype); - } - - // If it's an object or string then fill a Python list and subsequently - // convert. Otherwise we would need to somehow mess about with - // reference counts when renewing memory. - npyarr->elsize = dtype->elsize; - if (PyDataType_REFCHK(dtype) || npyarr->elsize == 0) { - Py_XDECREF(dtype); - - if (npyarr->dec->curdim > 1) { - PyErr_SetString(PyExc_ValueError, - "Cannot decode multidimensional arrays with " - "variable length elements to numpy"); - goto fail; - } - npyarr->elcount = 0; - npyarr->ret = PyList_New(0); - if (!npyarr->ret) { - goto fail; - } - ((JSONObjectDecoder *)npyarr->dec)->newArray = - Object_npyNewArrayList; - ((JSONObjectDecoder *)npyarr->dec)->arrayAddItem = - Object_npyArrayListAddItem; - ((JSONObjectDecoder *)npyarr->dec)->endArray = - Object_npyEndArrayList; - return Object_npyArrayListAddItem(prv, obj, value); - } - - npyarr->ret = PyArray_NewFromDescr( - &PyArray_Type, dtype, 1, &npyarr->elcount, NULL, NULL, 0, NULL); - - if (!npyarr->ret) { - goto fail; - } + goto fail; + } + } + + if (i >= npyarr->elcount) { + // Grow PyArray_DATA(ret): + // this is similar for the strategy for PyListObject, but we use + // 50% overallocation => 0, 4, 8, 14, 23, 36, 56, 86 ... + if (npyarr->elsize == 0) { + PyErr_SetString(PyExc_ValueError, + "Cannot decode multidimensional arrays with " + "variable length elements to numpy"); + goto fail; } - if (i >= npyarr->elcount) { - // Grow PyArray_DATA(ret): - // this is similar for the strategy for PyListObject, but we use - // 50% overallocation => 0, 4, 8, 14, 23, 36, 56, 86 ... - if (npyarr->elsize == 0) { - PyErr_SetString(PyExc_ValueError, - "Cannot decode multidimensional arrays with " - "variable length elements to numpy"); - goto fail; - } - - npyarr->elcount = (i >> 1) + (i < 4 ? 4 : 2) + i; - if (npyarr->elcount <= NPY_MAX_INTP / npyarr->elsize) { - new_data = PyDataMem_RENEW(PyArray_DATA(npyarr->ret), - npyarr->elcount * npyarr->elsize); - } else { - PyErr_NoMemory(); - goto fail; - } - ((PyArrayObject *)npyarr->ret)->data = (void *)new_data; - - // PyArray_BYTES(npyarr->ret) = new_data; + npyarr->elcount = (i >> 1) + (i < 4 ? 4 : 2) + i; + if (npyarr->elcount <= NPY_MAX_INTP / npyarr->elsize) { + new_data = PyDataMem_RENEW(PyArray_DATA(npyarr->ret), + npyarr->elcount * npyarr->elsize); + } else { + PyErr_NoMemory(); + goto fail; } + ((PyArrayObject *)npyarr->ret)->data = (void *)new_data; - PyArray_DIMS(npyarr->ret)[0] = i + 1; + // PyArray_BYTES(npyarr->ret) = new_data; + } - if ((item = PyArray_GETPTR1(npyarr->ret, i)) == NULL || - PyArray_SETITEM(npyarr->ret, item, value) == -1) { - goto fail; - } + PyArray_DIMS(npyarr->ret)[0] = i + 1; - Py_DECREF((PyObject *)value); - npyarr->i++; - return 1; + if ((item = PyArray_GETPTR1(npyarr->ret, i)) == NULL || + PyArray_SETITEM(npyarr->ret, item, value) == -1) { + goto fail; + } + + Py_DECREF((PyObject *)value); + npyarr->i++; + return 1; fail: - Npy_releaseContext(npyarr); - return 0; + Npy_releaseContext(npyarr); + return 0; } JSOBJ Object_npyNewArrayList(void *prv, void *_decoder) { - PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder; - PRINTMARK(); - PyErr_SetString( - PyExc_ValueError, - "nesting not supported for object or variable length dtypes"); - Npy_releaseContext(decoder->npyarr); - return NULL; + PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder; + PRINTMARK(); + PyErr_SetString(PyExc_ValueError, + "nesting not supported for object or variable length dtypes"); + Npy_releaseContext(decoder->npyarr); + return NULL; } JSOBJ Object_npyEndArrayList(void *prv, JSOBJ obj) { - PyObject *list, *ret; - NpyArrContext *npyarr = (NpyArrContext *)obj; - PRINTMARK(); - if (!npyarr) { - return NULL; - } + PyObject *list, *ret; + NpyArrContext *npyarr = (NpyArrContext *)obj; + PRINTMARK(); + if (!npyarr) { + return NULL; + } - // convert decoded list to numpy array - list = (PyObject *)npyarr->ret; - npyarr->ret = PyArray_FROM_O(list); + // convert decoded list to numpy array + list = (PyObject *)npyarr->ret; + npyarr->ret = PyArray_FROM_O(list); - ret = Npy_returnLabelled(npyarr); - npyarr->ret = list; + ret = Npy_returnLabelled(npyarr); + npyarr->ret = list; - ((JSONObjectDecoder *)npyarr->dec)->newArray = Object_npyNewArray; - ((JSONObjectDecoder *)npyarr->dec)->arrayAddItem = Object_npyArrayAddItem; - ((JSONObjectDecoder *)npyarr->dec)->endArray = Object_npyEndArray; - Npy_releaseContext(npyarr); - return ret; + ((JSONObjectDecoder *)npyarr->dec)->newArray = Object_npyNewArray; + ((JSONObjectDecoder *)npyarr->dec)->arrayAddItem = Object_npyArrayAddItem; + ((JSONObjectDecoder *)npyarr->dec)->endArray = Object_npyEndArray; + Npy_releaseContext(npyarr); + return ret; } int Object_npyArrayListAddItem(void *prv, JSOBJ obj, JSOBJ value) { - NpyArrContext *npyarr = (NpyArrContext *)obj; - PRINTMARK(); - if (!npyarr) { - return 0; - } - PyList_Append((PyObject *)npyarr->ret, value); - Py_DECREF((PyObject *)value); - npyarr->elcount++; - return 1; + NpyArrContext *npyarr = (NpyArrContext *)obj; + PRINTMARK(); + if (!npyarr) { + return 0; + } + PyList_Append((PyObject *)npyarr->ret, value); + Py_DECREF((PyObject *)value); + npyarr->elcount++; + return 1; } int Object_objectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) { - int ret = PyDict_SetItem(obj, name, value); - Py_DECREF((PyObject *)name); - Py_DECREF((PyObject *)value); - return ret == 0 ? 1 : 0; + int ret = PyDict_SetItem(obj, name, value); + Py_DECREF((PyObject *)name); + Py_DECREF((PyObject *)value); + return ret == 0 ? 1 : 0; } int Object_arrayAddItem(void *prv, JSOBJ obj, JSOBJ value) { - int ret = PyList_Append(obj, value); - Py_DECREF((PyObject *)value); - return ret == 0 ? 1 : 0; + int ret = PyList_Append(obj, value); + Py_DECREF((PyObject *)value); + return ret == 0 ? 1 : 0; } JSOBJ Object_newString(void *prv, wchar_t *start, wchar_t *end) { - return PyUnicode_FromWideChar(start, (end - start)); + return PyUnicode_FromWideChar(start, (end - start)); } JSOBJ Object_newTrue(void *prv) { Py_RETURN_TRUE; } @@ -406,117 +404,115 @@ JSOBJ Object_newArray(void *prv, void *decoder) { return PyList_New(0); } JSOBJ Object_endArray(void *prv, JSOBJ obj) { return obj; } JSOBJ Object_newInteger(void *prv, JSINT32 value) { - return PyLong_FromLong((long)value); + return PyLong_FromLong((long)value); } JSOBJ Object_newLong(void *prv, JSINT64 value) { - return PyLong_FromLongLong(value); + return PyLong_FromLongLong(value); } JSOBJ Object_newUnsignedLong(void *prv, JSUINT64 value) { - return PyLong_FromUnsignedLongLong(value); + return PyLong_FromUnsignedLongLong(value); } JSOBJ Object_newDouble(void *prv, double value) { - return PyFloat_FromDouble(value); + return PyFloat_FromDouble(value); } static void Object_releaseObject(void *prv, JSOBJ obj, void *_decoder) { - PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder; - if (obj != decoder->npyarr_addr) { - Py_XDECREF(((PyObject *)obj)); - } + PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder; + if (obj != decoder->npyarr_addr) { + Py_XDECREF(((PyObject *)obj)); + } } -static char *g_kwlist[] = {"obj", "precise_float", - "labelled", "dtype", NULL}; +static char *g_kwlist[] = {"obj", "precise_float", "labelled", "dtype", NULL}; PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs) { - PyObject *ret; - PyObject *sarg; - PyObject *arg; - PyObject *opreciseFloat = NULL; - JSONObjectDecoder *decoder; - PyObjectDecoder pyDecoder; - PyArray_Descr *dtype = NULL; - int labelled = 0; - - JSONObjectDecoder dec = { - Object_newString, Object_objectAddKey, Object_arrayAddItem, - Object_newTrue, Object_newFalse, Object_newNull, - Object_newPosInf, Object_newNegInf, Object_newObject, - Object_endObject, Object_newArray, Object_endArray, - Object_newInteger, Object_newLong, Object_newUnsignedLong, - Object_newDouble, - Object_releaseObject, PyObject_Malloc, PyObject_Free, - PyObject_Realloc}; - - dec.preciseFloat = 0; - dec.prv = NULL; - - pyDecoder.dec = dec; - pyDecoder.curdim = 0; - pyDecoder.npyarr = NULL; - pyDecoder.npyarr_addr = NULL; - - decoder = (JSONObjectDecoder *)&pyDecoder; - - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiiO&", g_kwlist, &arg, - &opreciseFloat, &labelled, - PyArray_DescrConverter2, &dtype)) { - Npy_releaseContext(pyDecoder.npyarr); - return NULL; - } - - if (opreciseFloat && PyObject_IsTrue(opreciseFloat)) { - decoder->preciseFloat = 1; - } - - if (PyBytes_Check(arg)) { - sarg = arg; - } else if (PyUnicode_Check(arg)) { - sarg = PyUnicode_AsUTF8String(arg); - if (sarg == NULL) { - // Exception raised above us by codec according to docs - return NULL; - } - } else { - PyErr_Format(PyExc_TypeError, "Expected 'str' or 'bytes'"); - return NULL; + PyObject *ret; + PyObject *sarg; + PyObject *arg; + PyObject *opreciseFloat = NULL; + JSONObjectDecoder *decoder; + PyObjectDecoder pyDecoder; + PyArray_Descr *dtype = NULL; + int labelled = 0; + + JSONObjectDecoder dec = { + Object_newString, Object_objectAddKey, Object_arrayAddItem, + Object_newTrue, Object_newFalse, Object_newNull, + Object_newPosInf, Object_newNegInf, Object_newObject, + Object_endObject, Object_newArray, Object_endArray, + Object_newInteger, Object_newLong, Object_newUnsignedLong, + Object_newDouble, Object_releaseObject, PyObject_Malloc, + PyObject_Free, PyObject_Realloc}; + + dec.preciseFloat = 0; + dec.prv = NULL; + + pyDecoder.dec = dec; + pyDecoder.curdim = 0; + pyDecoder.npyarr = NULL; + pyDecoder.npyarr_addr = NULL; + + decoder = (JSONObjectDecoder *)&pyDecoder; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiiO&", g_kwlist, &arg, + &opreciseFloat, &labelled, + PyArray_DescrConverter2, &dtype)) { + Npy_releaseContext(pyDecoder.npyarr); + return NULL; + } + + if (opreciseFloat && PyObject_IsTrue(opreciseFloat)) { + decoder->preciseFloat = 1; + } + + if (PyBytes_Check(arg)) { + sarg = arg; + } else if (PyUnicode_Check(arg)) { + sarg = PyUnicode_AsUTF8String(arg); + if (sarg == NULL) { + // Exception raised above us by codec according to docs + return NULL; } + } else { + PyErr_Format(PyExc_TypeError, "Expected 'str' or 'bytes'"); + return NULL; + } - decoder->errorStr = NULL; - decoder->errorOffset = NULL; + decoder->errorStr = NULL; + decoder->errorOffset = NULL; - ret = JSON_DecodeObject(decoder, PyBytes_AS_STRING(sarg), - PyBytes_GET_SIZE(sarg)); + ret = JSON_DecodeObject(decoder, PyBytes_AS_STRING(sarg), + PyBytes_GET_SIZE(sarg)); - if (sarg != arg) { - Py_DECREF(sarg); - } + if (sarg != arg) { + Py_DECREF(sarg); + } - if (PyErr_Occurred()) { - if (ret) { - Py_DECREF((PyObject *)ret); - } - Npy_releaseContext(pyDecoder.npyarr); - return NULL; + if (PyErr_Occurred()) { + if (ret) { + Py_DECREF((PyObject *)ret); } + Npy_releaseContext(pyDecoder.npyarr); + return NULL; + } - if (decoder->errorStr) { - /* - FIXME: It's possible to give a much nicer error message here with actual - failing element in input etc*/ - - PyErr_Format(PyExc_ValueError, "%s", decoder->errorStr); + if (decoder->errorStr) { + /* + FIXME: It's possible to give a much nicer error message here with actual + failing element in input etc*/ - if (ret) { - Py_DECREF((PyObject *)ret); - } - Npy_releaseContext(pyDecoder.npyarr); + PyErr_Format(PyExc_ValueError, "%s", decoder->errorStr); - return NULL; + if (ret) { + Py_DECREF((PyObject *)ret); } + Npy_releaseContext(pyDecoder.npyarr); + + return NULL; + } - return ret; + return ret; } diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index 8c55505f61b51..5438d7b398c3a 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -44,13 +44,13 @@ Numeric decoder derived from TCL library #define NO_IMPORT_ARRAY #define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY +#include "datetime.h" +#include "pandas/datetime/pd_datetime.h" +#include "pandas/vendored/ujson/lib/ultrajson.h" #include #include #include #include -#include "pandas/vendored/ujson/lib/ultrajson.h" -#include "datetime.h" -#include "pandas/datetime/pd_datetime.h" npy_int64 get_nat(void) { return NPY_MIN_INT64; } @@ -65,81 +65,81 @@ int object_is_nat_type(PyObject *obj); int object_is_na_type(PyObject *obj); typedef struct __NpyArrContext { - PyObject *array; - char *dataptr; - int curdim; // current dimension in array's order - int stridedim; // dimension we are striding over - int inc; // stride dimension increment (+/- 1) - npy_intp dim; - npy_intp stride; - npy_intp ndim; - npy_intp index[NPY_MAXDIMS]; - int type_num; - PyArray_GetItemFunc *getitem; - - char **rowLabels; - char **columnLabels; + PyObject *array; + char *dataptr; + int curdim; // current dimension in array's order + int stridedim; // dimension we are striding over + int inc; // stride dimension increment (+/- 1) + npy_intp dim; + npy_intp stride; + npy_intp ndim; + npy_intp index[NPY_MAXDIMS]; + int type_num; + PyArray_GetItemFunc *getitem; + + char **rowLabels; + char **columnLabels; } NpyArrContext; typedef struct __PdBlockContext { - int colIdx; - int ncols; - int transpose; + int colIdx; + int ncols; + int transpose; - NpyArrContext **npyCtxts; // NpyArrContext for each column + NpyArrContext **npyCtxts; // NpyArrContext for each column } PdBlockContext; typedef struct __TypeContext { - JSPFN_ITERBEGIN iterBegin; - JSPFN_ITEREND iterEnd; - JSPFN_ITERNEXT iterNext; - JSPFN_ITERGETNAME iterGetName; - JSPFN_ITERGETVALUE iterGetValue; - PFN_PyTypeToUTF8 PyTypeToUTF8; - PyObject *newObj; - PyObject *dictObj; - Py_ssize_t index; - Py_ssize_t size; - PyObject *itemValue; - PyObject *itemName; - PyObject *attrList; - PyObject *iterator; - - double doubleValue; - JSINT64 longValue; - - char *cStr; - NpyArrContext *npyarr; - PdBlockContext *pdblock; - int transpose; - char **rowLabels; - char **columnLabels; - npy_intp rowLabelsLen; - npy_intp columnLabelsLen; + JSPFN_ITERBEGIN iterBegin; + JSPFN_ITEREND iterEnd; + JSPFN_ITERNEXT iterNext; + JSPFN_ITERGETNAME iterGetName; + JSPFN_ITERGETVALUE iterGetValue; + PFN_PyTypeToUTF8 PyTypeToUTF8; + PyObject *newObj; + PyObject *dictObj; + Py_ssize_t index; + Py_ssize_t size; + PyObject *itemValue; + PyObject *itemName; + PyObject *attrList; + PyObject *iterator; + + double doubleValue; + JSINT64 longValue; + + char *cStr; + NpyArrContext *npyarr; + PdBlockContext *pdblock; + int transpose; + char **rowLabels; + char **columnLabels; + npy_intp rowLabelsLen; + npy_intp columnLabelsLen; } TypeContext; typedef struct __PyObjectEncoder { - JSONObjectEncoder enc; + JSONObjectEncoder enc; - // pass through the NpyArrContext when encoding multi-dimensional arrays - NpyArrContext *npyCtxtPassthru; + // pass through the NpyArrContext when encoding multi-dimensional arrays + NpyArrContext *npyCtxtPassthru; - // pass through the PdBlockContext when encoding blocks - PdBlockContext *blkCtxtPassthru; + // pass through the PdBlockContext when encoding blocks + PdBlockContext *blkCtxtPassthru; - // pass-through to encode numpy data directly - int npyType; - void *npyValue; + // pass-through to encode numpy data directly + int npyType; + void *npyValue; - int datetimeIso; - NPY_DATETIMEUNIT datetimeUnit; - NPY_DATETIMEUNIT valueUnit; + int datetimeIso; + NPY_DATETIMEUNIT datetimeUnit; + NPY_DATETIMEUNIT valueUnit; - // output format style for pandas data types - int outputFormat; - int originalOutputFormat; + // output format style for pandas data types + int outputFormat; + int originalOutputFormat; - PyObject *defaultHandler; + PyObject *defaultHandler; } PyObjectEncoder; #define GET_TC(__ptrtc) ((TypeContext *)((__ptrtc)->prv)) @@ -149,245 +149,241 @@ enum PANDAS_FORMAT { SPLIT, RECORDS, INDEX, COLUMNS, VALUES }; int PdBlock_iterNext(JSOBJ, JSONTypeContext *); static TypeContext *createTypeContext(void) { - TypeContext *pc; + TypeContext *pc; - pc = PyObject_Malloc(sizeof(TypeContext)); - if (!pc) { - PyErr_NoMemory(); - return NULL; - } - pc->newObj = NULL; - pc->dictObj = NULL; - pc->itemValue = NULL; - pc->itemName = NULL; - pc->attrList = NULL; - pc->index = 0; - pc->size = 0; - pc->longValue = 0; - pc->doubleValue = 0.0; - pc->cStr = NULL; - pc->npyarr = NULL; - pc->pdblock = NULL; - pc->rowLabels = NULL; - pc->columnLabels = NULL; - pc->transpose = 0; - pc->rowLabelsLen = 0; - pc->columnLabelsLen = 0; - - return pc; + pc = PyObject_Malloc(sizeof(TypeContext)); + if (!pc) { + PyErr_NoMemory(); + return NULL; + } + pc->newObj = NULL; + pc->dictObj = NULL; + pc->itemValue = NULL; + pc->itemName = NULL; + pc->attrList = NULL; + pc->index = 0; + pc->size = 0; + pc->longValue = 0; + pc->doubleValue = 0.0; + pc->cStr = NULL; + pc->npyarr = NULL; + pc->pdblock = NULL; + pc->rowLabels = NULL; + pc->columnLabels = NULL; + pc->transpose = 0; + pc->rowLabelsLen = 0; + pc->columnLabelsLen = 0; + + return pc; } static PyObject *get_values(PyObject *obj) { - PyObject *values = NULL; - - if (object_is_index_type(obj) || object_is_series_type(obj)) { - // The special cases to worry about are dt64tz and category[dt64tz]. - // In both cases we want the UTC-localized datetime64 ndarray, - // without going through and object array of Timestamps. - if (PyObject_HasAttrString(obj, "tz")) { - PyObject *tz = PyObject_GetAttrString(obj, "tz"); - if (tz != Py_None) { - // Go through object array if we have dt64tz, since tz info will - // be lost if values is used directly. - Py_DECREF(tz); - values = PyObject_CallMethod(obj, "__array__", NULL); - return values; - } - Py_DECREF(tz); - } - values = PyObject_GetAttrString(obj, "values"); - if (values == NULL) { - // Clear so we can subsequently try another method - PyErr_Clear(); - } else if (PyObject_HasAttrString(values, "__array__")) { - // We may have gotten a Categorical or Sparse array so call np.array - PyObject *array_values = PyObject_CallMethod(values, "__array__", - NULL); - Py_DECREF(values); - values = array_values; - } else if (!PyArray_CheckExact(values)) { - // Didn't get a numpy array, so keep trying - Py_DECREF(values); - values = NULL; - } - } - + PyObject *values = NULL; + + if (object_is_index_type(obj) || object_is_series_type(obj)) { + // The special cases to worry about are dt64tz and category[dt64tz]. + // In both cases we want the UTC-localized datetime64 ndarray, + // without going through and object array of Timestamps. + if (PyObject_HasAttrString(obj, "tz")) { + PyObject *tz = PyObject_GetAttrString(obj, "tz"); + if (tz != Py_None) { + // Go through object array if we have dt64tz, since tz info will + // be lost if values is used directly. + Py_DECREF(tz); + values = PyObject_CallMethod(obj, "__array__", NULL); + return values; + } + Py_DECREF(tz); + } + values = PyObject_GetAttrString(obj, "values"); if (values == NULL) { - PyObject *typeRepr = PyObject_Repr((PyObject *)Py_TYPE(obj)); - PyObject *repr; - if (PyObject_HasAttrString(obj, "dtype")) { - PyObject *dtype = PyObject_GetAttrString(obj, "dtype"); - repr = PyObject_Repr(dtype); - Py_DECREF(dtype); - } else { - repr = PyUnicode_FromString(""); - } + // Clear so we can subsequently try another method + PyErr_Clear(); + } else if (PyObject_HasAttrString(values, "__array__")) { + // We may have gotten a Categorical or Sparse array so call np.array + PyObject *array_values = PyObject_CallMethod(values, "__array__", NULL); + Py_DECREF(values); + values = array_values; + } else if (!PyArray_CheckExact(values)) { + // Didn't get a numpy array, so keep trying + Py_DECREF(values); + values = NULL; + } + } + + if (values == NULL) { + PyObject *typeRepr = PyObject_Repr((PyObject *)Py_TYPE(obj)); + PyObject *repr; + if (PyObject_HasAttrString(obj, "dtype")) { + PyObject *dtype = PyObject_GetAttrString(obj, "dtype"); + repr = PyObject_Repr(dtype); + Py_DECREF(dtype); + } else { + repr = PyUnicode_FromString(""); + } - PyErr_Format(PyExc_ValueError, "%R or %R are not JSON serializable yet", - repr, typeRepr); - Py_DECREF(repr); - Py_DECREF(typeRepr); + PyErr_Format(PyExc_ValueError, "%R or %R are not JSON serializable yet", + repr, typeRepr); + Py_DECREF(repr); + Py_DECREF(typeRepr); - return NULL; - } + return NULL; + } - return values; + return values; } static PyObject *get_sub_attr(PyObject *obj, char *attr, char *subAttr) { - PyObject *tmp = PyObject_GetAttrString(obj, attr); - PyObject *ret; + PyObject *tmp = PyObject_GetAttrString(obj, attr); + PyObject *ret; - if (tmp == 0) { - return 0; - } - ret = PyObject_GetAttrString(tmp, subAttr); - Py_DECREF(tmp); + if (tmp == 0) { + return 0; + } + ret = PyObject_GetAttrString(tmp, subAttr); + Py_DECREF(tmp); - return ret; + return ret; } static Py_ssize_t get_attr_length(PyObject *obj, char *attr) { - PyObject *tmp = PyObject_GetAttrString(obj, attr); - Py_ssize_t ret; + PyObject *tmp = PyObject_GetAttrString(obj, attr); + Py_ssize_t ret; - if (tmp == 0) { - return 0; - } - ret = PyObject_Length(tmp); - Py_DECREF(tmp); + if (tmp == 0) { + return 0; + } + ret = PyObject_Length(tmp); + Py_DECREF(tmp); - if (ret == -1) { - return 0; - } + if (ret == -1) { + return 0; + } - return ret; + return ret; } - static npy_int64 get_long_attr(PyObject *o, const char *attr) { - // NB we are implicitly assuming that o is a Timedelta or Timestamp, or NaT + // NB we are implicitly assuming that o is a Timedelta or Timestamp, or NaT - npy_int64 long_val; - PyObject *value = PyObject_GetAttrString(o, attr); - long_val = - (PyLong_Check(value) ? PyLong_AsLongLong(value) : PyLong_AsLong(value)); + npy_int64 long_val; + PyObject *value = PyObject_GetAttrString(o, attr); + long_val = + (PyLong_Check(value) ? PyLong_AsLongLong(value) : PyLong_AsLong(value)); - Py_DECREF(value); - - if (object_is_nat_type(o)) { - // i.e. o is NaT, long_val will be NPY_MIN_INT64 - return long_val; - } + Py_DECREF(value); - // ensure we are in nanoseconds, similar to Timestamp._as_creso or _as_unit - PyObject* reso = PyObject_GetAttrString(o, "_creso"); - if (!PyLong_Check(reso)) { - // https://github.com/pandas-dev/pandas/pull/49034#discussion_r1023165139 - Py_DECREF(reso); - return -1; - } + if (object_is_nat_type(o)) { + // i.e. o is NaT, long_val will be NPY_MIN_INT64 + return long_val; + } - long cReso = PyLong_AsLong(reso); + // ensure we are in nanoseconds, similar to Timestamp._as_creso or _as_unit + PyObject *reso = PyObject_GetAttrString(o, "_creso"); + if (!PyLong_Check(reso)) { + // https://github.com/pandas-dev/pandas/pull/49034#discussion_r1023165139 Py_DECREF(reso); - if (cReso == -1 && PyErr_Occurred()) { - return -1; - } + return -1; + } - if (cReso == NPY_FR_us) { - long_val = long_val * 1000L; - } else if (cReso == NPY_FR_ms) { - long_val = long_val * 1000000L; - } else if (cReso == NPY_FR_s) { - long_val = long_val * 1000000000L; - } + long cReso = PyLong_AsLong(reso); + Py_DECREF(reso); + if (cReso == -1 && PyErr_Occurred()) { + return -1; + } - return long_val; + if (cReso == NPY_FR_us) { + long_val = long_val * 1000L; + } else if (cReso == NPY_FR_ms) { + long_val = long_val * 1000000L; + } else if (cReso == NPY_FR_s) { + long_val = long_val * 1000000000L; + } + + return long_val; } static npy_float64 total_seconds(PyObject *td) { - npy_float64 double_val; - PyObject *value = PyObject_CallMethod(td, "total_seconds", NULL); - double_val = PyFloat_AS_DOUBLE(value); - Py_DECREF(value); - return double_val; + npy_float64 double_val; + PyObject *value = PyObject_CallMethod(td, "total_seconds", NULL); + double_val = PyFloat_AS_DOUBLE(value); + Py_DECREF(value); + return double_val; } static char *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc), size_t *_outLen) { - PyObject *obj = (PyObject *)_obj; - *_outLen = PyBytes_GET_SIZE(obj); - return PyBytes_AS_STRING(obj); -} - -static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, - size_t *_outLen) { - char *encoded = (char *)PyUnicode_AsUTF8AndSize(_obj, - (Py_ssize_t *)_outLen); - if (encoded == NULL) { - /* Something went wrong. - Set errorMsg(to tell encoder to stop), - and let Python exception propagate. */ - JSONObjectEncoder *enc = (JSONObjectEncoder *)tc->encoder; - enc->errorMsg = "Encoding failed."; - } - return encoded; + PyObject *obj = (PyObject *)_obj; + *_outLen = PyBytes_GET_SIZE(obj); + return PyBytes_AS_STRING(obj); +} + +static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, size_t *_outLen) { + char *encoded = (char *)PyUnicode_AsUTF8AndSize(_obj, (Py_ssize_t *)_outLen); + if (encoded == NULL) { + /* Something went wrong. + Set errorMsg(to tell encoder to stop), + and let Python exception propagate. */ + JSONObjectEncoder *enc = (JSONObjectEncoder *)tc->encoder; + enc->errorMsg = "Encoding failed."; + } + return encoded; } /* JSON callback. returns a char* and mutates the pointer to *len */ static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), JSONTypeContext *tc, size_t *len) { - NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - NPY_DATETIMEUNIT valueUnit = ((PyObjectEncoder *)tc->encoder)->valueUnit; - GET_TC(tc)->cStr = int64ToIso(GET_TC(tc)->longValue, valueUnit, base, len); - return GET_TC(tc)->cStr; + NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + NPY_DATETIMEUNIT valueUnit = ((PyObjectEncoder *)tc->encoder)->valueUnit; + GET_TC(tc)->cStr = int64ToIso(GET_TC(tc)->longValue, valueUnit, base, len); + return GET_TC(tc)->cStr; } /* JSON callback. returns a char* and mutates the pointer to *len */ static char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused), JSONTypeContext *tc, size_t *len) { - GET_TC(tc)->cStr = int64ToIsoDuration(GET_TC(tc)->longValue, len); - return GET_TC(tc)->cStr; + GET_TC(tc)->cStr = int64ToIsoDuration(GET_TC(tc)->longValue, len); + return GET_TC(tc)->cStr; } /* JSON callback */ static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, size_t *len) { - if (!PyDate_Check(obj) && !PyDateTime_Check(obj)) { - PyErr_SetString(PyExc_TypeError, "Expected date or datetime object"); - ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; - return NULL; - } + if (!PyDate_Check(obj) && !PyDateTime_Check(obj)) { + PyErr_SetString(PyExc_TypeError, "Expected date or datetime object"); + ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; + return NULL; + } - NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - return PyDateTimeToIso(obj, base, len); + NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + return PyDateTimeToIso(obj, base, len); } static char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, size_t *outLen) { - PyObject *obj = (PyObject *)_obj; - PyObject *str; - PyObject *tmp; - - str = PyObject_CallMethod(obj, "isoformat", NULL); - if (str == NULL) { - *outLen = 0; - if (!PyErr_Occurred()) { - PyErr_SetString(PyExc_ValueError, "Failed to convert time"); - } - ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; - return NULL; - } - if (PyUnicode_Check(str)) { - tmp = str; - str = PyUnicode_AsUTF8String(str); - Py_DECREF(tmp); + PyObject *obj = (PyObject *)_obj; + PyObject *str; + PyObject *tmp; + + str = PyObject_CallMethod(obj, "isoformat", NULL); + if (str == NULL) { + *outLen = 0; + if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_ValueError, "Failed to convert time"); } + ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; + return NULL; + } + if (PyUnicode_Check(str)) { + tmp = str; + str = PyUnicode_AsUTF8String(str); + Py_DECREF(tmp); + } - GET_TC(tc)->newObj = str; + GET_TC(tc)->newObj = str; - *outLen = PyBytes_GET_SIZE(str); - char *outValue = PyBytes_AS_STRING(str); - return outValue; + *outLen = PyBytes_GET_SIZE(str); + char *outValue = PyBytes_AS_STRING(str); + return outValue; } //============================================================================= @@ -395,167 +391,167 @@ static char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, size_t *outLen) { //============================================================================= static void NpyArr_freeItemValue(JSOBJ Py_UNUSED(_obj), JSONTypeContext *tc) { - if (GET_TC(tc)->npyarr && - GET_TC(tc)->itemValue != GET_TC(tc)->npyarr->array) { - Py_XDECREF(GET_TC(tc)->itemValue); - GET_TC(tc)->itemValue = NULL; - } + if (GET_TC(tc)->npyarr && + GET_TC(tc)->itemValue != GET_TC(tc)->npyarr->array) { + Py_XDECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = NULL; + } } int NpyArr_iterNextNone(JSOBJ Py_UNUSED(_obj), JSONTypeContext *Py_UNUSED(tc)) { - return 0; + return 0; } void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { - PyArrayObject *obj; - NpyArrContext *npyarr; - - if (GET_TC(tc)->newObj) { - obj = (PyArrayObject *)GET_TC(tc)->newObj; - } else { - obj = (PyArrayObject *)_obj; - } + PyArrayObject *obj; + NpyArrContext *npyarr; - npyarr = PyObject_Malloc(sizeof(NpyArrContext)); - GET_TC(tc)->npyarr = npyarr; + if (GET_TC(tc)->newObj) { + obj = (PyArrayObject *)GET_TC(tc)->newObj; + } else { + obj = (PyArrayObject *)_obj; + } - if (!npyarr) { - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - return; - } + npyarr = PyObject_Malloc(sizeof(NpyArrContext)); + GET_TC(tc)->npyarr = npyarr; - npyarr->array = (PyObject *)obj; - npyarr->getitem = (PyArray_GetItemFunc *)PyArray_DESCR(obj)->f->getitem; - npyarr->dataptr = PyArray_DATA(obj); - npyarr->ndim = PyArray_NDIM(obj) - 1; - npyarr->curdim = 0; - npyarr->type_num = PyArray_DESCR(obj)->type_num; - - if (GET_TC(tc)->transpose) { - npyarr->dim = PyArray_DIM(obj, npyarr->ndim); - npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim); - npyarr->stridedim = npyarr->ndim; - npyarr->index[npyarr->ndim] = 0; - npyarr->inc = -1; - } else { - npyarr->dim = PyArray_DIM(obj, 0); - npyarr->stride = PyArray_STRIDE(obj, 0); - npyarr->stridedim = 0; - npyarr->index[0] = 0; - npyarr->inc = 1; - } - - npyarr->columnLabels = GET_TC(tc)->columnLabels; - npyarr->rowLabels = GET_TC(tc)->rowLabels; + if (!npyarr) { + PyErr_NoMemory(); + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + return; + } + + npyarr->array = (PyObject *)obj; + npyarr->getitem = (PyArray_GetItemFunc *)PyArray_DESCR(obj)->f->getitem; + npyarr->dataptr = PyArray_DATA(obj); + npyarr->ndim = PyArray_NDIM(obj) - 1; + npyarr->curdim = 0; + npyarr->type_num = PyArray_DESCR(obj)->type_num; + + if (GET_TC(tc)->transpose) { + npyarr->dim = PyArray_DIM(obj, npyarr->ndim); + npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim); + npyarr->stridedim = npyarr->ndim; + npyarr->index[npyarr->ndim] = 0; + npyarr->inc = -1; + } else { + npyarr->dim = PyArray_DIM(obj, 0); + npyarr->stride = PyArray_STRIDE(obj, 0); + npyarr->stridedim = 0; + npyarr->index[0] = 0; + npyarr->inc = 1; + } + + npyarr->columnLabels = GET_TC(tc)->columnLabels; + npyarr->rowLabels = GET_TC(tc)->rowLabels; } void NpyArr_iterEnd(JSOBJ obj, JSONTypeContext *tc) { - NpyArrContext *npyarr = GET_TC(tc)->npyarr; + NpyArrContext *npyarr = GET_TC(tc)->npyarr; - if (npyarr) { - NpyArr_freeItemValue(obj, tc); - PyObject_Free(npyarr); - } + if (npyarr) { + NpyArr_freeItemValue(obj, tc); + PyObject_Free(npyarr); + } } void NpyArrPassThru_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc)) {} void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) { - NpyArrContext *npyarr = GET_TC(tc)->npyarr; - // finished this dimension, reset the data pointer - npyarr->curdim--; - npyarr->dataptr -= npyarr->stride * npyarr->index[npyarr->stridedim]; - npyarr->stridedim -= npyarr->inc; - npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); - npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); - npyarr->dataptr += npyarr->stride; + NpyArrContext *npyarr = GET_TC(tc)->npyarr; + // finished this dimension, reset the data pointer + npyarr->curdim--; + npyarr->dataptr -= npyarr->stride * npyarr->index[npyarr->stridedim]; + npyarr->stridedim -= npyarr->inc; + npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); + npyarr->dataptr += npyarr->stride; - NpyArr_freeItemValue(obj, tc); + NpyArr_freeItemValue(obj, tc); } int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { - NpyArrContext *npyarr = GET_TC(tc)->npyarr; + NpyArrContext *npyarr = GET_TC(tc)->npyarr; - if (PyErr_Occurred()) { - return 0; - } + if (PyErr_Occurred()) { + return 0; + } - if (npyarr->index[npyarr->stridedim] >= npyarr->dim) { - return 0; - } + if (npyarr->index[npyarr->stridedim] >= npyarr->dim) { + return 0; + } - NpyArr_freeItemValue(obj, tc); + NpyArr_freeItemValue(obj, tc); - if (PyArray_ISDATETIME(npyarr->array)) { - GET_TC(tc)->itemValue = obj; - Py_INCREF(obj); - ((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(npyarr->array); - // Also write the resolution (unit) of the ndarray - PyArray_Descr *dtype = PyArray_DESCR(npyarr->array); - ((PyObjectEncoder *)tc->encoder)->valueUnit = - get_datetime_metadata_from_dtype(dtype).base; - ((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr; - ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; - } else { - GET_TC(tc)->itemValue = npyarr->getitem(npyarr->dataptr, npyarr->array); - } + if (PyArray_ISDATETIME(npyarr->array)) { + GET_TC(tc)->itemValue = obj; + Py_INCREF(obj); + ((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(npyarr->array); + // Also write the resolution (unit) of the ndarray + PyArray_Descr *dtype = PyArray_DESCR(npyarr->array); + ((PyObjectEncoder *)tc->encoder)->valueUnit = + get_datetime_metadata_from_dtype(dtype).base; + ((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr; + ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; + } else { + GET_TC(tc)->itemValue = npyarr->getitem(npyarr->dataptr, npyarr->array); + } - npyarr->dataptr += npyarr->stride; - npyarr->index[npyarr->stridedim]++; - return 1; + npyarr->dataptr += npyarr->stride; + npyarr->index[npyarr->stridedim]++; + return 1; } int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) { - NpyArrContext *npyarr = GET_TC(tc)->npyarr; + NpyArrContext *npyarr = GET_TC(tc)->npyarr; - if (PyErr_Occurred()) { - return 0; - } + if (PyErr_Occurred()) { + return 0; + } - if (npyarr->curdim >= npyarr->ndim || - npyarr->index[npyarr->stridedim] >= npyarr->dim) { - // innermost dimension, start retrieving item values - GET_TC(tc)->iterNext = NpyArr_iterNextItem; - return NpyArr_iterNextItem(_obj, tc); - } + if (npyarr->curdim >= npyarr->ndim || + npyarr->index[npyarr->stridedim] >= npyarr->dim) { + // innermost dimension, start retrieving item values + GET_TC(tc)->iterNext = NpyArr_iterNextItem; + return NpyArr_iterNextItem(_obj, tc); + } - // dig a dimension deeper - npyarr->index[npyarr->stridedim]++; + // dig a dimension deeper + npyarr->index[npyarr->stridedim]++; - npyarr->curdim++; - npyarr->stridedim += npyarr->inc; - npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); - npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); - npyarr->index[npyarr->stridedim] = 0; + npyarr->curdim++; + npyarr->stridedim += npyarr->inc; + npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); + npyarr->index[npyarr->stridedim] = 0; - ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; - GET_TC(tc)->itemValue = npyarr->array; - return 1; + ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; + GET_TC(tc)->itemValue = npyarr->array; + return 1; } JSOBJ NpyArr_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; + return GET_TC(tc)->itemValue; } char *NpyArr_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, size_t *outLen) { - NpyArrContext *npyarr = GET_TC(tc)->npyarr; - npy_intp idx; - char *cStr; + NpyArrContext *npyarr = GET_TC(tc)->npyarr; + npy_intp idx; + char *cStr; - if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { - idx = npyarr->index[npyarr->stridedim] - 1; - cStr = npyarr->columnLabels[idx]; - } else { - idx = npyarr->index[npyarr->stridedim - npyarr->inc] - 1; - cStr = npyarr->rowLabels[idx]; - } + if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { + idx = npyarr->index[npyarr->stridedim] - 1; + cStr = npyarr->columnLabels[idx]; + } else { + idx = npyarr->index[npyarr->stridedim - npyarr->inc] - 1; + cStr = npyarr->rowLabels[idx]; + } - *outLen = strlen(cStr); + *outLen = strlen(cStr); - return cStr; + return cStr; } //============================================================================= @@ -568,217 +564,216 @@ char *NpyArr_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, //============================================================================= void PdBlockPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; + PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - if (blkCtxt->transpose) { - blkCtxt->colIdx++; - } else { - blkCtxt->colIdx = 0; - } + if (blkCtxt->transpose) { + blkCtxt->colIdx++; + } else { + blkCtxt->colIdx = 0; + } - NpyArr_freeItemValue(obj, tc); + NpyArr_freeItemValue(obj, tc); } int PdBlock_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; + PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - if (blkCtxt->colIdx >= blkCtxt->ncols) { - return 0; - } + if (blkCtxt->colIdx >= blkCtxt->ncols) { + return 0; + } - GET_TC(tc)->npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; - blkCtxt->colIdx++; - return NpyArr_iterNextItem(obj, tc); + GET_TC(tc)->npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; + blkCtxt->colIdx++; + return NpyArr_iterNextItem(obj, tc); } char *PdBlock_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, size_t *outLen) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - NpyArrContext *npyarr = blkCtxt->npyCtxts[0]; - npy_intp idx; - char *cStr; - - if (GET_TC(tc)->iterNext == PdBlock_iterNextItem) { - idx = blkCtxt->colIdx - 1; - cStr = npyarr->columnLabels[idx]; - } else { - idx = GET_TC(tc)->iterNext != PdBlock_iterNext - ? npyarr->index[npyarr->stridedim - npyarr->inc] - 1 - : npyarr->index[npyarr->stridedim]; + PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; + NpyArrContext *npyarr = blkCtxt->npyCtxts[0]; + npy_intp idx; + char *cStr; - cStr = npyarr->rowLabels[idx]; - } + if (GET_TC(tc)->iterNext == PdBlock_iterNextItem) { + idx = blkCtxt->colIdx - 1; + cStr = npyarr->columnLabels[idx]; + } else { + idx = GET_TC(tc)->iterNext != PdBlock_iterNext + ? npyarr->index[npyarr->stridedim - npyarr->inc] - 1 + : npyarr->index[npyarr->stridedim]; - *outLen = strlen(cStr); - return cStr; + cStr = npyarr->rowLabels[idx]; + } + + *outLen = strlen(cStr); + return cStr; } char *PdBlock_iterGetName_Transpose(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, size_t *outLen) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - NpyArrContext *npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; - npy_intp idx; - char *cStr; - - if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { - idx = npyarr->index[npyarr->stridedim] - 1; - cStr = npyarr->columnLabels[idx]; - } else { - idx = blkCtxt->colIdx; - cStr = npyarr->rowLabels[idx]; - } + PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; + NpyArrContext *npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; + npy_intp idx; + char *cStr; + + if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { + idx = npyarr->index[npyarr->stridedim] - 1; + cStr = npyarr->columnLabels[idx]; + } else { + idx = blkCtxt->colIdx; + cStr = npyarr->rowLabels[idx]; + } - *outLen = strlen(cStr); - return cStr; + *outLen = strlen(cStr); + return cStr; } int PdBlock_iterNext(JSOBJ obj, JSONTypeContext *tc) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - NpyArrContext *npyarr; + PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; + NpyArrContext *npyarr; - if (PyErr_Occurred() || ((JSONObjectEncoder *)tc->encoder)->errorMsg) { - return 0; - } + if (PyErr_Occurred() || ((JSONObjectEncoder *)tc->encoder)->errorMsg) { + return 0; + } - if (blkCtxt->transpose) { - if (blkCtxt->colIdx >= blkCtxt->ncols) { - return 0; - } - } else { - npyarr = blkCtxt->npyCtxts[0]; - if (npyarr->index[npyarr->stridedim] >= npyarr->dim) { - return 0; - } + if (blkCtxt->transpose) { + if (blkCtxt->colIdx >= blkCtxt->ncols) { + return 0; } + } else { + npyarr = blkCtxt->npyCtxts[0]; + if (npyarr->index[npyarr->stridedim] >= npyarr->dim) { + return 0; + } + } - ((PyObjectEncoder *)tc->encoder)->blkCtxtPassthru = blkCtxt; - GET_TC(tc)->itemValue = obj; + ((PyObjectEncoder *)tc->encoder)->blkCtxtPassthru = blkCtxt; + GET_TC(tc)->itemValue = obj; - return 1; + return 1; } void PdBlockPassThru_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; + PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - if (blkCtxt->transpose) { - // if transposed we exhaust each column before moving to the next - GET_TC(tc)->iterNext = NpyArr_iterNextItem; - GET_TC(tc)->iterGetName = PdBlock_iterGetName_Transpose; - GET_TC(tc)->npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; - } + if (blkCtxt->transpose) { + // if transposed we exhaust each column before moving to the next + GET_TC(tc)->iterNext = NpyArr_iterNextItem; + GET_TC(tc)->iterGetName = PdBlock_iterGetName_Transpose; + GET_TC(tc)->npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; + } } void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { - PyObject *obj, *values, *arrays, *array; - PdBlockContext *blkCtxt; - NpyArrContext *npyarr; - Py_ssize_t i; - - obj = (PyObject *)_obj; - - GET_TC(tc)->iterGetName = GET_TC(tc)->transpose - ? PdBlock_iterGetName_Transpose - : PdBlock_iterGetName; - - blkCtxt = PyObject_Malloc(sizeof(PdBlockContext)); - if (!blkCtxt) { - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - return; - } - GET_TC(tc)->pdblock = blkCtxt; + PyObject *obj, *values, *arrays, *array; + PdBlockContext *blkCtxt; + NpyArrContext *npyarr; + Py_ssize_t i; - blkCtxt->colIdx = 0; - blkCtxt->transpose = GET_TC(tc)->transpose; - blkCtxt->ncols = get_attr_length(obj, "columns"); - - if (blkCtxt->ncols == 0) { - blkCtxt->npyCtxts = NULL; - - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - return; - } + obj = (PyObject *)_obj; - blkCtxt->npyCtxts = - PyObject_Malloc(sizeof(NpyArrContext *) * blkCtxt->ncols); - if (!blkCtxt->npyCtxts) { - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - return; - } + GET_TC(tc)->iterGetName = GET_TC(tc)->transpose + ? PdBlock_iterGetName_Transpose + : PdBlock_iterGetName; - arrays = get_sub_attr(obj, "_mgr", "column_arrays"); - if (!arrays) { - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - return; - } + blkCtxt = PyObject_Malloc(sizeof(PdBlockContext)); + if (!blkCtxt) { + PyErr_NoMemory(); + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + return; + } + GET_TC(tc)->pdblock = blkCtxt; - for (i = 0; i < PyObject_Length(arrays); i++) { - array = PyList_GET_ITEM(arrays, i); - if (!array) { - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto ARR_RET; - } + blkCtxt->colIdx = 0; + blkCtxt->transpose = GET_TC(tc)->transpose; + blkCtxt->ncols = get_attr_length(obj, "columns"); - // ensure we have a numpy array (i.e. np.asarray) - values = PyObject_CallMethod(array, "__array__", NULL); - if ((!values) || (!PyArray_CheckExact(values))) { - // Didn't get a numpy array - ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto ARR_RET; - } + if (blkCtxt->ncols == 0) { + blkCtxt->npyCtxts = NULL; - GET_TC(tc)->newObj = values; + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + return; + } - // init a dedicated context for this column - NpyArr_iterBegin(obj, tc); - npyarr = GET_TC(tc)->npyarr; + blkCtxt->npyCtxts = PyObject_Malloc(sizeof(NpyArrContext *) * blkCtxt->ncols); + if (!blkCtxt->npyCtxts) { + PyErr_NoMemory(); + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + return; + } - GET_TC(tc)->itemValue = NULL; - ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL; + arrays = get_sub_attr(obj, "_mgr", "column_arrays"); + if (!arrays) { + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + return; + } - blkCtxt->npyCtxts[i] = npyarr; - GET_TC(tc)->newObj = NULL; + for (i = 0; i < PyObject_Length(arrays); i++) { + array = PyList_GET_ITEM(arrays, i); + if (!array) { + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + goto ARR_RET; } - GET_TC(tc)->npyarr = blkCtxt->npyCtxts[0]; - goto ARR_RET; -ARR_RET: - Py_DECREF(arrays); -} + // ensure we have a numpy array (i.e. np.asarray) + values = PyObject_CallMethod(array, "__array__", NULL); + if ((!values) || (!PyArray_CheckExact(values))) { + // Didn't get a numpy array + ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + goto ARR_RET; + } -void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) { - PdBlockContext *blkCtxt; - NpyArrContext *npyarr; - int i; + GET_TC(tc)->newObj = values; - GET_TC(tc)->itemValue = NULL; + // init a dedicated context for this column + NpyArr_iterBegin(obj, tc); npyarr = GET_TC(tc)->npyarr; - blkCtxt = GET_TC(tc)->pdblock; + GET_TC(tc)->itemValue = NULL; + ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL; - if (blkCtxt) { - for (i = 0; i < blkCtxt->ncols; i++) { - npyarr = blkCtxt->npyCtxts[i]; - if (npyarr) { - if (npyarr->array) { - Py_DECREF(npyarr->array); - npyarr->array = NULL; - } + blkCtxt->npyCtxts[i] = npyarr; + GET_TC(tc)->newObj = NULL; + } + GET_TC(tc)->npyarr = blkCtxt->npyCtxts[0]; + goto ARR_RET; - GET_TC(tc)->npyarr = npyarr; - NpyArr_iterEnd(obj, tc); +ARR_RET: + Py_DECREF(arrays); +} - blkCtxt->npyCtxts[i] = NULL; - } +void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) { + PdBlockContext *blkCtxt; + NpyArrContext *npyarr; + int i; + + GET_TC(tc)->itemValue = NULL; + npyarr = GET_TC(tc)->npyarr; + + blkCtxt = GET_TC(tc)->pdblock; + + if (blkCtxt) { + for (i = 0; i < blkCtxt->ncols; i++) { + npyarr = blkCtxt->npyCtxts[i]; + if (npyarr) { + if (npyarr->array) { + Py_DECREF(npyarr->array); + npyarr->array = NULL; } - if (blkCtxt->npyCtxts) { - PyObject_Free(blkCtxt->npyCtxts); - } - PyObject_Free(blkCtxt); + GET_TC(tc)->npyarr = npyarr; + NpyArr_iterEnd(obj, tc); + + blkCtxt->npyCtxts[i] = NULL; + } } + + if (blkCtxt->npyCtxts) { + PyObject_Free(blkCtxt->npyCtxts); + } + PyObject_Free(blkCtxt); + } } //============================================================================= @@ -786,34 +781,34 @@ void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) { // itemValue is borrowed reference, no ref counting //============================================================================= void Tuple_iterBegin(JSOBJ obj, JSONTypeContext *tc) { - GET_TC(tc)->index = 0; - GET_TC(tc)->size = PyTuple_GET_SIZE((PyObject *)obj); - GET_TC(tc)->itemValue = NULL; + GET_TC(tc)->index = 0; + GET_TC(tc)->size = PyTuple_GET_SIZE((PyObject *)obj); + GET_TC(tc)->itemValue = NULL; } int Tuple_iterNext(JSOBJ obj, JSONTypeContext *tc) { - PyObject *item; + PyObject *item; - if (GET_TC(tc)->index >= GET_TC(tc)->size) { - return 0; - } + if (GET_TC(tc)->index >= GET_TC(tc)->size) { + return 0; + } - item = PyTuple_GET_ITEM(obj, GET_TC(tc)->index); + item = PyTuple_GET_ITEM(obj, GET_TC(tc)->index); - GET_TC(tc)->itemValue = item; - GET_TC(tc)->index++; - return 1; + GET_TC(tc)->itemValue = item; + GET_TC(tc)->index++; + return 1; } void Tuple_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc)) {} JSOBJ Tuple_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; + return GET_TC(tc)->itemValue; } char *Tuple_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc), size_t *Py_UNUSED(outLen)) { - return NULL; + return NULL; } //============================================================================= @@ -821,47 +816,47 @@ char *Tuple_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc), // itemValue is borrowed reference, no ref counting //============================================================================= void Set_iterBegin(JSOBJ obj, JSONTypeContext *tc) { - GET_TC(tc)->itemValue = NULL; - GET_TC(tc)->iterator = PyObject_GetIter(obj); + GET_TC(tc)->itemValue = NULL; + GET_TC(tc)->iterator = PyObject_GetIter(obj); } int Set_iterNext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - PyObject *item; + PyObject *item; - if (GET_TC(tc)->itemValue) { - Py_DECREF(GET_TC(tc)->itemValue); - GET_TC(tc)->itemValue = NULL; - } + if (GET_TC(tc)->itemValue) { + Py_DECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = NULL; + } - item = PyIter_Next(GET_TC(tc)->iterator); + item = PyIter_Next(GET_TC(tc)->iterator); - if (item == NULL) { - return 0; - } + if (item == NULL) { + return 0; + } - GET_TC(tc)->itemValue = item; - return 1; + GET_TC(tc)->itemValue = item; + return 1; } void Set_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - if (GET_TC(tc)->itemValue) { - Py_DECREF(GET_TC(tc)->itemValue); - GET_TC(tc)->itemValue = NULL; - } + if (GET_TC(tc)->itemValue) { + Py_DECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = NULL; + } - if (GET_TC(tc)->iterator) { - Py_DECREF(GET_TC(tc)->iterator); - GET_TC(tc)->iterator = NULL; - } + if (GET_TC(tc)->iterator) { + Py_DECREF(GET_TC(tc)->iterator); + GET_TC(tc)->iterator = NULL; + } } JSOBJ Set_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; + return GET_TC(tc)->itemValue; } char *Set_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc), size_t *Py_UNUSED(outLen)) { - return NULL; + return NULL; } //============================================================================= @@ -870,98 +865,98 @@ char *Set_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc), // itemValue ref is from PyObject_GetAttr. Ref counted //============================================================================= void Dir_iterBegin(JSOBJ obj, JSONTypeContext *tc) { - GET_TC(tc)->attrList = PyObject_Dir(obj); - GET_TC(tc)->index = 0; - GET_TC(tc)->size = PyList_GET_SIZE(GET_TC(tc)->attrList); + GET_TC(tc)->attrList = PyObject_Dir(obj); + GET_TC(tc)->index = 0; + GET_TC(tc)->size = PyList_GET_SIZE(GET_TC(tc)->attrList); } void Dir_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - if (GET_TC(tc)->itemValue) { - Py_DECREF(GET_TC(tc)->itemValue); - GET_TC(tc)->itemValue = NULL; - } + if (GET_TC(tc)->itemValue) { + Py_DECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = NULL; + } - if (GET_TC(tc)->itemName) { - Py_DECREF(GET_TC(tc)->itemName); - GET_TC(tc)->itemName = NULL; - } + if (GET_TC(tc)->itemName) { + Py_DECREF(GET_TC(tc)->itemName); + GET_TC(tc)->itemName = NULL; + } - Py_DECREF((PyObject *)GET_TC(tc)->attrList); + Py_DECREF((PyObject *)GET_TC(tc)->attrList); } int Dir_iterNext(JSOBJ _obj, JSONTypeContext *tc) { - PyObject *obj = (PyObject *)_obj; - PyObject *itemValue = GET_TC(tc)->itemValue; - PyObject *itemName = GET_TC(tc)->itemName; - PyObject *attr; - PyObject *attrName; - char *attrStr; - - if (PyErr_Occurred() || ((JSONObjectEncoder *)tc->encoder)->errorMsg) { - return 0; - } - - if (itemValue) { - Py_DECREF(GET_TC(tc)->itemValue); - GET_TC(tc)->itemValue = itemValue = NULL; - } - - if (itemName) { - Py_DECREF(GET_TC(tc)->itemName); - GET_TC(tc)->itemName = itemName = NULL; - } + PyObject *obj = (PyObject *)_obj; + PyObject *itemValue = GET_TC(tc)->itemValue; + PyObject *itemName = GET_TC(tc)->itemName; + PyObject *attr; + PyObject *attrName; + char *attrStr; + + if (PyErr_Occurred() || ((JSONObjectEncoder *)tc->encoder)->errorMsg) { + return 0; + } - for (; GET_TC(tc)->index < GET_TC(tc)->size; GET_TC(tc)->index++) { - attrName = PyList_GET_ITEM(GET_TC(tc)->attrList, GET_TC(tc)->index); - attr = PyUnicode_AsUTF8String(attrName); - attrStr = PyBytes_AS_STRING(attr); + if (itemValue) { + Py_DECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = itemValue = NULL; + } - if (attrStr[0] == '_') { - Py_DECREF(attr); - continue; - } + if (itemName) { + Py_DECREF(GET_TC(tc)->itemName); + GET_TC(tc)->itemName = itemName = NULL; + } - itemValue = PyObject_GetAttr(obj, attrName); - if (itemValue == NULL) { - PyErr_Clear(); - Py_DECREF(attr); - continue; - } + for (; GET_TC(tc)->index < GET_TC(tc)->size; GET_TC(tc)->index++) { + attrName = PyList_GET_ITEM(GET_TC(tc)->attrList, GET_TC(tc)->index); + attr = PyUnicode_AsUTF8String(attrName); + attrStr = PyBytes_AS_STRING(attr); - if (PyCallable_Check(itemValue)) { - Py_DECREF(itemValue); - Py_DECREF(attr); - continue; - } - - GET_TC(tc)->itemName = itemName; - GET_TC(tc)->itemValue = itemValue; + if (attrStr[0] == '_') { + Py_DECREF(attr); + continue; + } - itemName = attr; - break; + itemValue = PyObject_GetAttr(obj, attrName); + if (itemValue == NULL) { + PyErr_Clear(); + Py_DECREF(attr); + continue; } - if (itemName == NULL) { - GET_TC(tc)->index = GET_TC(tc)->size; - GET_TC(tc)->itemValue = NULL; - return 0; + if (PyCallable_Check(itemValue)) { + Py_DECREF(itemValue); + Py_DECREF(attr); + continue; } GET_TC(tc)->itemName = itemName; GET_TC(tc)->itemValue = itemValue; - GET_TC(tc)->index++; - return 1; + itemName = attr; + break; + } + + if (itemName == NULL) { + GET_TC(tc)->index = GET_TC(tc)->size; + GET_TC(tc)->itemValue = NULL; + return 0; + } + + GET_TC(tc)->itemName = itemName; + GET_TC(tc)->itemValue = itemValue; + GET_TC(tc)->index++; + + return 1; } JSOBJ Dir_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; + return GET_TC(tc)->itemValue; } char *Dir_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, size_t *outLen) { - *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName); - return PyBytes_AS_STRING(GET_TC(tc)->itemName); + *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName); + return PyBytes_AS_STRING(GET_TC(tc)->itemName); } //============================================================================= @@ -969,187 +964,187 @@ char *Dir_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, // itemValue is borrowed from object (which is list). No refcounting //============================================================================= void List_iterBegin(JSOBJ obj, JSONTypeContext *tc) { - GET_TC(tc)->index = 0; - GET_TC(tc)->size = PyList_GET_SIZE((PyObject *)obj); + GET_TC(tc)->index = 0; + GET_TC(tc)->size = PyList_GET_SIZE((PyObject *)obj); } int List_iterNext(JSOBJ obj, JSONTypeContext *tc) { - if (GET_TC(tc)->index >= GET_TC(tc)->size) { - return 0; - } + if (GET_TC(tc)->index >= GET_TC(tc)->size) { + return 0; + } - GET_TC(tc)->itemValue = PyList_GET_ITEM(obj, GET_TC(tc)->index); - GET_TC(tc)->index++; - return 1; + GET_TC(tc)->itemValue = PyList_GET_ITEM(obj, GET_TC(tc)->index); + GET_TC(tc)->index++; + return 1; } void List_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc)) {} JSOBJ List_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; + return GET_TC(tc)->itemValue; } char *List_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc), size_t *Py_UNUSED(outLen)) { - return NULL; + return NULL; } //============================================================================= // pandas Index iteration functions //============================================================================= void Index_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); - if (!GET_TC(tc)->cStr) { - PyErr_NoMemory(); - } + GET_TC(tc)->index = 0; + GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); + if (!GET_TC(tc)->cStr) { + PyErr_NoMemory(); + } } int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) { - Py_ssize_t index; - if (!GET_TC(tc)->cStr) { - return 0; - } - - index = GET_TC(tc)->index; - Py_XDECREF(GET_TC(tc)->itemValue); - if (index == 0) { - memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5); - GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); - } else if (index == 1) { - memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); - GET_TC(tc)->itemValue = get_values(obj); - if (!GET_TC(tc)->itemValue) { - return 0; - } - } else { - return 0; - } + Py_ssize_t index; + if (!GET_TC(tc)->cStr) { + return 0; + } + + index = GET_TC(tc)->index; + Py_XDECREF(GET_TC(tc)->itemValue); + if (index == 0) { + memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); + } else if (index == 1) { + memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); + GET_TC(tc)->itemValue = get_values(obj); + if (!GET_TC(tc)->itemValue) { + return 0; + } + } else { + return 0; + } - GET_TC(tc)->index++; - return 1; + GET_TC(tc)->index++; + return 1; } void Index_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc)) {} JSOBJ Index_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; + return GET_TC(tc)->itemValue; } char *Index_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, size_t *outLen) { - *outLen = strlen(GET_TC(tc)->cStr); - return GET_TC(tc)->cStr; + *outLen = strlen(GET_TC(tc)->cStr); + return GET_TC(tc)->cStr; } //============================================================================= // pandas Series iteration functions //============================================================================= void Series_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); - enc->outputFormat = VALUES; // for contained series - if (!GET_TC(tc)->cStr) { - PyErr_NoMemory(); - } + PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; + GET_TC(tc)->index = 0; + GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); + enc->outputFormat = VALUES; // for contained series + if (!GET_TC(tc)->cStr) { + PyErr_NoMemory(); + } } int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) { - Py_ssize_t index; - if (!GET_TC(tc)->cStr) { - return 0; - } - - index = GET_TC(tc)->index; - Py_XDECREF(GET_TC(tc)->itemValue); - if (index == 0) { - memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5); - GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); - } else if (index == 1) { - memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6); - GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); - } else if (index == 2) { - memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); - GET_TC(tc)->itemValue = get_values(obj); - if (!GET_TC(tc)->itemValue) { - return 0; - } - } else { - return 0; - } + Py_ssize_t index; + if (!GET_TC(tc)->cStr) { + return 0; + } + + index = GET_TC(tc)->index; + Py_XDECREF(GET_TC(tc)->itemValue); + if (index == 0) { + memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); + } else if (index == 1) { + memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); + } else if (index == 2) { + memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); + GET_TC(tc)->itemValue = get_values(obj); + if (!GET_TC(tc)->itemValue) { + return 0; + } + } else { + return 0; + } - GET_TC(tc)->index++; - return 1; + GET_TC(tc)->index++; + return 1; } void Series_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - enc->outputFormat = enc->originalOutputFormat; + PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; + enc->outputFormat = enc->originalOutputFormat; } JSOBJ Series_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; + return GET_TC(tc)->itemValue; } char *Series_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, size_t *outLen) { - *outLen = strlen(GET_TC(tc)->cStr); - return GET_TC(tc)->cStr; + *outLen = strlen(GET_TC(tc)->cStr); + return GET_TC(tc)->cStr; } //============================================================================= // pandas DataFrame iteration functions //============================================================================= void DataFrame_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); - enc->outputFormat = VALUES; // for contained series & index - if (!GET_TC(tc)->cStr) { - PyErr_NoMemory(); - } + PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; + GET_TC(tc)->index = 0; + GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); + enc->outputFormat = VALUES; // for contained series & index + if (!GET_TC(tc)->cStr) { + PyErr_NoMemory(); + } } int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { - Py_ssize_t index; - if (!GET_TC(tc)->cStr) { - return 0; - } - - index = GET_TC(tc)->index; - Py_XDECREF(GET_TC(tc)->itemValue); - if (index == 0) { - memcpy(GET_TC(tc)->cStr, "columns", sizeof(char) * 8); - GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "columns"); - } else if (index == 1) { - memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6); - GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); - } else if (index == 2) { - memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); - Py_INCREF(obj); - GET_TC(tc)->itemValue = obj; - } else { - return 0; - } + Py_ssize_t index; + if (!GET_TC(tc)->cStr) { + return 0; + } + + index = GET_TC(tc)->index; + Py_XDECREF(GET_TC(tc)->itemValue); + if (index == 0) { + memcpy(GET_TC(tc)->cStr, "columns", sizeof(char) * 8); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "columns"); + } else if (index == 1) { + memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); + } else if (index == 2) { + memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); + Py_INCREF(obj); + GET_TC(tc)->itemValue = obj; + } else { + return 0; + } - GET_TC(tc)->index++; - return 1; + GET_TC(tc)->index++; + return 1; } void DataFrame_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - enc->outputFormat = enc->originalOutputFormat; + PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; + enc->outputFormat = enc->originalOutputFormat; } JSOBJ DataFrame_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; + return GET_TC(tc)->itemValue; } char *DataFrame_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, size_t *outLen) { - *outLen = strlen(GET_TC(tc)->cStr); - return GET_TC(tc)->cStr; + *outLen = strlen(GET_TC(tc)->cStr); + return GET_TC(tc)->cStr; } //============================================================================= @@ -1158,62 +1153,62 @@ char *DataFrame_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, // itemValue is borrowed from object (which is dict). No refCounting //============================================================================= void Dict_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - GET_TC(tc)->index = 0; + GET_TC(tc)->index = 0; } int Dict_iterNext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - PyObject *itemNameTmp; + PyObject *itemNameTmp; - if (GET_TC(tc)->itemName) { - Py_DECREF(GET_TC(tc)->itemName); - GET_TC(tc)->itemName = NULL; - } + if (GET_TC(tc)->itemName) { + Py_DECREF(GET_TC(tc)->itemName); + GET_TC(tc)->itemName = NULL; + } - if (!PyDict_Next((PyObject *)GET_TC(tc)->dictObj, &GET_TC(tc)->index, - &GET_TC(tc)->itemName, &GET_TC(tc)->itemValue)) { - return 0; - } + if (!PyDict_Next((PyObject *)GET_TC(tc)->dictObj, &GET_TC(tc)->index, + &GET_TC(tc)->itemName, &GET_TC(tc)->itemValue)) { + return 0; + } - if (PyUnicode_Check(GET_TC(tc)->itemName)) { - GET_TC(tc)->itemName = PyUnicode_AsUTF8String(GET_TC(tc)->itemName); - } else if (!PyBytes_Check(GET_TC(tc)->itemName)) { - GET_TC(tc)->itemName = PyObject_Str(GET_TC(tc)->itemName); - itemNameTmp = GET_TC(tc)->itemName; - GET_TC(tc)->itemName = PyUnicode_AsUTF8String(GET_TC(tc)->itemName); - Py_DECREF(itemNameTmp); - } else { - Py_INCREF(GET_TC(tc)->itemName); - } - return 1; + if (PyUnicode_Check(GET_TC(tc)->itemName)) { + GET_TC(tc)->itemName = PyUnicode_AsUTF8String(GET_TC(tc)->itemName); + } else if (!PyBytes_Check(GET_TC(tc)->itemName)) { + GET_TC(tc)->itemName = PyObject_Str(GET_TC(tc)->itemName); + itemNameTmp = GET_TC(tc)->itemName; + GET_TC(tc)->itemName = PyUnicode_AsUTF8String(GET_TC(tc)->itemName); + Py_DECREF(itemNameTmp); + } else { + Py_INCREF(GET_TC(tc)->itemName); + } + return 1; } void Dict_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - if (GET_TC(tc)->itemName) { - Py_DECREF(GET_TC(tc)->itemName); - GET_TC(tc)->itemName = NULL; - } - Py_DECREF(GET_TC(tc)->dictObj); + if (GET_TC(tc)->itemName) { + Py_DECREF(GET_TC(tc)->itemName); + GET_TC(tc)->itemName = NULL; + } + Py_DECREF(GET_TC(tc)->dictObj); } JSOBJ Dict_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; + return GET_TC(tc)->itemValue; } char *Dict_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, size_t *outLen) { - *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName); - return PyBytes_AS_STRING(GET_TC(tc)->itemName); + *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName); + return PyBytes_AS_STRING(GET_TC(tc)->itemName); } void NpyArr_freeLabels(char **labels, npy_intp len) { - npy_intp i; + npy_intp i; - if (labels) { - for (i = 0; i < len; i++) { - PyObject_Free(labels[i]); - } - PyObject_Free(labels); + if (labels) { + for (i = 0; i < len; i++) { + PyObject_Free(labels[i]); } + PyObject_Free(labels); + } } /* @@ -1235,895 +1230,884 @@ void NpyArr_freeLabels(char **labels, npy_intp len) { */ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, npy_intp num) { - // NOTE this function steals a reference to labels. - PyObject *item = NULL; - size_t len; - npy_intp i, stride; - char **ret; - char *dataptr, *cLabel; - int type_num; - PyArray_Descr *dtype; - NPY_DATETIMEUNIT base = enc->datetimeUnit; - - if (!labels) { - return 0; - } - - if (PyArray_SIZE(labels) < num) { - PyErr_SetString( - PyExc_ValueError, - "Label array sizes do not match corresponding data shape"); - Py_DECREF(labels); - return 0; - } - - ret = PyObject_Malloc(sizeof(char *) * num); - if (!ret) { - PyErr_NoMemory(); - Py_DECREF(labels); - return 0; - } - - for (i = 0; i < num; i++) { - ret[i] = NULL; - } - - stride = PyArray_STRIDE(labels, 0); - dataptr = PyArray_DATA(labels); - type_num = PyArray_TYPE(labels); - dtype = PyArray_DESCR(labels); + // NOTE this function steals a reference to labels. + PyObject *item = NULL; + size_t len; + npy_intp i, stride; + char **ret; + char *dataptr, *cLabel; + int type_num; + PyArray_Descr *dtype; + NPY_DATETIMEUNIT base = enc->datetimeUnit; + + if (!labels) { + return 0; + } - for (i = 0; i < num; i++) { - item = PyArray_GETITEM(labels, dataptr); - if (!item) { - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } + if (PyArray_SIZE(labels) < num) { + PyErr_SetString(PyExc_ValueError, + "Label array sizes do not match corresponding data shape"); + Py_DECREF(labels); + return 0; + } - int is_datetimelike = 0; - npy_int64 i8date; - NPY_DATETIMEUNIT dateUnit = NPY_FR_ns; - if (PyTypeNum_ISDATETIME(type_num)) { - is_datetimelike = 1; - PyArray_VectorUnaryFunc *castfunc = - PyArray_GetCastFunc(PyArray_DescrFromType(type_num), NPY_INT64); - if (!castfunc) { - PyErr_Format(PyExc_ValueError, - "Cannot cast numpy dtype %d to long", - enc->npyType); - } - castfunc(dataptr, &i8date, 1, NULL, NULL); - dateUnit = get_datetime_metadata_from_dtype(dtype).base; - } else if (PyDate_Check(item) || PyDelta_Check(item)) { - is_datetimelike = 1; - if (PyObject_HasAttrString(item, "_value")) { - // pd.Timestamp object or pd.NaT - // see test_date_index_and_values for case with non-nano - i8date = get_long_attr(item, "_value"); - } else { - if (PyDelta_Check(item)) { - i8date = total_seconds(item) * - 1000000000LL; // nanoseconds per second - } else { - // datetime.* objects don't follow above rules - i8date = PyDateTimeToEpoch(item, NPY_FR_ns); - } - } + ret = PyObject_Malloc(sizeof(char *) * num); + if (!ret) { + PyErr_NoMemory(); + Py_DECREF(labels); + return 0; + } + + for (i = 0; i < num; i++) { + ret[i] = NULL; + } + + stride = PyArray_STRIDE(labels, 0); + dataptr = PyArray_DATA(labels); + type_num = PyArray_TYPE(labels); + dtype = PyArray_DESCR(labels); + + for (i = 0; i < num; i++) { + item = PyArray_GETITEM(labels, dataptr); + if (!item) { + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + int is_datetimelike = 0; + npy_int64 i8date; + NPY_DATETIMEUNIT dateUnit = NPY_FR_ns; + if (PyTypeNum_ISDATETIME(type_num)) { + is_datetimelike = 1; + PyArray_VectorUnaryFunc *castfunc = + PyArray_GetCastFunc(PyArray_DescrFromType(type_num), NPY_INT64); + if (!castfunc) { + PyErr_Format(PyExc_ValueError, "Cannot cast numpy dtype %d to long", + enc->npyType); + } + castfunc(dataptr, &i8date, 1, NULL, NULL); + dateUnit = get_datetime_metadata_from_dtype(dtype).base; + } else if (PyDate_Check(item) || PyDelta_Check(item)) { + is_datetimelike = 1; + if (PyObject_HasAttrString(item, "_value")) { + // pd.Timestamp object or pd.NaT + // see test_date_index_and_values for case with non-nano + i8date = get_long_attr(item, "_value"); + } else { + if (PyDelta_Check(item)) { + i8date = total_seconds(item) * 1000000000LL; // nanoseconds per second + } else { + // datetime.* objects don't follow above rules + i8date = PyDateTimeToEpoch(item, NPY_FR_ns); } + } + } - if (is_datetimelike) { - if (i8date == get_nat()) { - len = 4; - cLabel = PyObject_Malloc(len + 1); - strncpy(cLabel, "null", len + 1); + if (is_datetimelike) { + if (i8date == get_nat()) { + len = 4; + cLabel = PyObject_Malloc(len + 1); + strncpy(cLabel, "null", len + 1); + } else { + if (enc->datetimeIso) { + if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) { + // TODO(username): non-nano timedelta support? + cLabel = int64ToIsoDuration(i8date, &len); + } else { + if (type_num == NPY_DATETIME) { + cLabel = int64ToIso(i8date, dateUnit, base, &len); } else { - if (enc->datetimeIso) { - if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) { - // TODO(username): non-nano timedelta support? - cLabel = int64ToIsoDuration(i8date, &len); - } else { - if (type_num == NPY_DATETIME) { - cLabel = int64ToIso(i8date, dateUnit, base, &len); - } else { - cLabel = PyDateTimeToIso(item, base, &len); - } - } - if (cLabel == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - } else { - int size_of_cLabel = 21; // 21 chars for int 64 - cLabel = PyObject_Malloc(size_of_cLabel); - snprintf(cLabel, size_of_cLabel, "%" NPY_DATETIME_FMT, - NpyDateTimeToEpoch(i8date, base)); - len = strlen(cLabel); - } - } - } else { // Fallback to string representation - // Replace item with the string to keep it alive. - Py_SETREF(item, PyObject_Str(item)); - if (item == NULL) { - NpyArr_freeLabels(ret, num); - ret = 0; - break; + cLabel = PyDateTimeToIso(item, base, &len); } - - cLabel = (char *)PyUnicode_AsUTF8(item); - len = strlen(cLabel); - } - - // Add 1 to include NULL terminator - ret[i] = PyObject_Malloc(len + 1); - memcpy(ret[i], cLabel, len + 1); - Py_DECREF(item); - - if (is_datetimelike) { - PyObject_Free(cLabel); - } - - if (PyErr_Occurred()) { + } + if (cLabel == NULL) { + Py_DECREF(item); NpyArr_freeLabels(ret, num); ret = 0; break; + } + } else { + int size_of_cLabel = 21; // 21 chars for int 64 + cLabel = PyObject_Malloc(size_of_cLabel); + snprintf(cLabel, size_of_cLabel, "%" NPY_DATETIME_FMT, + NpyDateTimeToEpoch(i8date, base)); + len = strlen(cLabel); } + } + } else { // Fallback to string representation + // Replace item with the string to keep it alive. + Py_SETREF(item, PyObject_Str(item)); + if (item == NULL) { + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } - if (!ret[i]) { - PyErr_NoMemory(); - ret = 0; - break; - } - - dataptr += stride; + cLabel = (char *)PyUnicode_AsUTF8(item); + len = strlen(cLabel); } - Py_DECREF(labels); - return ret; -} + // Add 1 to include NULL terminator + ret[i] = PyObject_Malloc(len + 1); + memcpy(ret[i], cLabel, len + 1); + Py_DECREF(item); -void Object_invokeDefaultHandler(PyObject *obj, PyObjectEncoder *enc) { - PyObject *tmpObj = NULL; - tmpObj = PyObject_CallFunctionObjArgs(enc->defaultHandler, obj, NULL); - if (!PyErr_Occurred()) { - if (tmpObj == NULL) { - PyErr_SetString(PyExc_TypeError, - "Failed to execute default handler"); - } else { - encode(tmpObj, (JSONObjectEncoder *)enc, NULL, 0); - } + if (is_datetimelike) { + PyObject_Free(cLabel); } - Py_XDECREF(tmpObj); - return; -} - -void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { - PyObject *obj, *exc, *toDictFunc, *tmpObj, *values; - TypeContext *pc; - PyObjectEncoder *enc; - double val; - npy_int64 value; - int unit; - tc->prv = NULL; - - if (!_obj) { - tc->type = JT_INVALID; - return; + if (PyErr_Occurred()) { + NpyArr_freeLabels(ret, num); + ret = 0; + break; } - obj = (PyObject *)_obj; - enc = (PyObjectEncoder *)tc->encoder; - - if (PyBool_Check(obj)) { - tc->type = (obj == Py_True) ? JT_TRUE : JT_FALSE; - return; - } else if (obj == Py_None) { - tc->type = JT_NULL; - return; + if (!ret[i]) { + PyErr_NoMemory(); + ret = 0; + break; } - pc = createTypeContext(); - if (!pc) { - tc->type = JT_INVALID; - return; - } - tc->prv = pc; - - if (PyTypeNum_ISDATETIME(enc->npyType)) { - int64_t longVal; - PyArray_VectorUnaryFunc *castfunc = - PyArray_GetCastFunc(PyArray_DescrFromType(enc->npyType), NPY_INT64); - if (!castfunc) { - PyErr_Format(PyExc_ValueError, "Cannot cast numpy dtype %d to long", - enc->npyType); - } - castfunc(enc->npyValue, &longVal, 1, NULL, NULL); - if (longVal == get_nat()) { - tc->type = JT_NULL; - } else { - if (enc->datetimeIso) { - if (enc->npyType == NPY_TIMEDELTA) { - pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback; - } else { - pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback; - } - // Currently no way to pass longVal to iso function, so use - // state management - pc->longValue = longVal; - tc->type = JT_UTF8; - } else { - NPY_DATETIMEUNIT base = - ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - pc->longValue = NpyDateTimeToEpoch(longVal, base); - tc->type = JT_LONG; - } - } + dataptr += stride; + } - // TODO(username): this prevents infinite loop with - // mixed-type DataFrames; - // refactor - enc->npyCtxtPassthru = NULL; - enc->npyType = -1; - return; - } + Py_DECREF(labels); + return ret; +} - if (PyIter_Check(obj) || - (PyArray_Check(obj) && !PyArray_CheckScalar(obj))) { - goto ISITERABLE; +void Object_invokeDefaultHandler(PyObject *obj, PyObjectEncoder *enc) { + PyObject *tmpObj = NULL; + tmpObj = PyObject_CallFunctionObjArgs(enc->defaultHandler, obj, NULL); + if (!PyErr_Occurred()) { + if (tmpObj == NULL) { + PyErr_SetString(PyExc_TypeError, "Failed to execute default handler"); + } else { + encode(tmpObj, (JSONObjectEncoder *)enc, NULL, 0); } + } + Py_XDECREF(tmpObj); + return; +} - if (PyLong_Check(obj)) { - tc->type = JT_LONG; - int overflow = 0; - pc->longValue = PyLong_AsLongLongAndOverflow(obj, &overflow); - int err; - err = (pc->longValue == -1) && PyErr_Occurred(); - - if (overflow) { - tc->type = JT_BIGNUM; - } else if (err) { - goto INVALID; - } +void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { + PyObject *obj, *exc, *toDictFunc, *tmpObj, *values; + TypeContext *pc; + PyObjectEncoder *enc; + double val; + npy_int64 value; + int unit; - return; - } else if (PyFloat_Check(obj)) { - val = PyFloat_AS_DOUBLE(obj); - if (npy_isnan(val) || npy_isinf(val)) { - tc->type = JT_NULL; - } else { - pc->doubleValue = val; - tc->type = JT_DOUBLE; - } - return; - } else if (PyBytes_Check(obj)) { - pc->PyTypeToUTF8 = PyBytesToUTF8; - tc->type = JT_UTF8; - return; - } else if (PyUnicode_Check(obj)) { - pc->PyTypeToUTF8 = PyUnicodeToUTF8; - tc->type = JT_UTF8; - return; - } else if (object_is_decimal_type(obj)) { - pc->doubleValue = PyFloat_AsDouble(obj); - tc->type = JT_DOUBLE; - return; - } else if (PyDateTime_Check(obj) || PyDate_Check(obj)) { - if (object_is_nat_type(obj)) { - tc->type = JT_NULL; - return; - } + tc->prv = NULL; - if (enc->datetimeIso) { - pc->PyTypeToUTF8 = PyDateTimeToIsoCallback; - tc->type = JT_UTF8; - } else { - NPY_DATETIMEUNIT base = - ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - pc->longValue = PyDateTimeToEpoch(obj, base); - tc->type = JT_LONG; - } - return; - } else if (PyTime_Check(obj)) { - pc->PyTypeToUTF8 = PyTimeToJSON; - tc->type = JT_UTF8; - return; - } else if (PyArray_IsScalar(obj, Datetime)) { - npy_int64 longVal; - if (((PyDatetimeScalarObject *)obj)->obval == get_nat()) { - tc->type = JT_NULL; - return; - } - PyArray_Descr *dtype = PyArray_DescrFromScalar(obj); - if (!PyTypeNum_ISDATETIME(dtype->type_num)) { - PyErr_Format(PyExc_ValueError, "Could not get resolution of datetime"); - return; - } + if (!_obj) { + tc->type = JT_INVALID; + return; + } - PyArray_Descr *outcode = PyArray_DescrFromType(NPY_INT64); - PyArray_CastScalarToCtype(obj, &longVal, outcode); - Py_DECREF(outcode); + obj = (PyObject *)_obj; + enc = (PyObjectEncoder *)tc->encoder; - if (enc->datetimeIso) { - GET_TC(tc)->longValue = longVal; - pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback; - enc->valueUnit = get_datetime_metadata_from_dtype(dtype).base; - tc->type = JT_UTF8; - } else { - NPY_DATETIMEUNIT base = - ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - pc->longValue = PyDateTimeToEpoch(obj, base); - tc->type = JT_LONG; - } - return; - } else if (PyDelta_Check(obj)) { - if (PyObject_HasAttrString(obj, "_value")) { - // pd.Timedelta object or pd.NaT - value = get_long_attr(obj, "_value"); - } else { - value = total_seconds(obj) * 1000000000LL; // nanoseconds per sec - } + if (PyBool_Check(obj)) { + tc->type = (obj == Py_True) ? JT_TRUE : JT_FALSE; + return; + } else if (obj == Py_None) { + tc->type = JT_NULL; + return; + } - if (value == get_nat()) { - tc->type = JT_NULL; - return; - } else if (enc->datetimeIso) { - pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback; - tc->type = JT_UTF8; + pc = createTypeContext(); + if (!pc) { + tc->type = JT_INVALID; + return; + } + tc->prv = pc; + + if (PyTypeNum_ISDATETIME(enc->npyType)) { + int64_t longVal; + PyArray_VectorUnaryFunc *castfunc = + PyArray_GetCastFunc(PyArray_DescrFromType(enc->npyType), NPY_INT64); + if (!castfunc) { + PyErr_Format(PyExc_ValueError, "Cannot cast numpy dtype %d to long", + enc->npyType); + } + castfunc(enc->npyValue, &longVal, 1, NULL, NULL); + if (longVal == get_nat()) { + tc->type = JT_NULL; + } else { + if (enc->datetimeIso) { + if (enc->npyType == NPY_TIMEDELTA) { + pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback; } else { - unit = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - if (scaleNanosecToUnit(&value, unit) != 0) { - // TODO(username): Add some kind of error handling here - } + pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback; + } + // Currently no way to pass longVal to iso function, so use + // state management + pc->longValue = longVal; + tc->type = JT_UTF8; + } else { + NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + pc->longValue = NpyDateTimeToEpoch(longVal, base); + tc->type = JT_LONG; + } + } - exc = PyErr_Occurred(); + // TODO(username): this prevents infinite loop with + // mixed-type DataFrames; + // refactor + enc->npyCtxtPassthru = NULL; + enc->npyType = -1; + return; + } - if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { - goto INVALID; - } + if (PyIter_Check(obj) || (PyArray_Check(obj) && !PyArray_CheckScalar(obj))) { + goto ISITERABLE; + } - tc->type = JT_LONG; - } - pc->longValue = value; - return; - } else if (PyArray_IsScalar(obj, Integer)) { - tc->type = JT_LONG; - PyArray_CastScalarToCtype(obj, &(pc->longValue), - PyArray_DescrFromType(NPY_INT64)); + if (PyLong_Check(obj)) { + tc->type = JT_LONG; + int overflow = 0; + pc->longValue = PyLong_AsLongLongAndOverflow(obj, &overflow); + int err; + err = (pc->longValue == -1) && PyErr_Occurred(); - exc = PyErr_Occurred(); + if (overflow) { + tc->type = JT_BIGNUM; + } else if (err) { + goto INVALID; + } - if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { - goto INVALID; - } + return; + } else if (PyFloat_Check(obj)) { + val = PyFloat_AS_DOUBLE(obj); + if (npy_isnan(val) || npy_isinf(val)) { + tc->type = JT_NULL; + } else { + pc->doubleValue = val; + tc->type = JT_DOUBLE; + } + return; + } else if (PyBytes_Check(obj)) { + pc->PyTypeToUTF8 = PyBytesToUTF8; + tc->type = JT_UTF8; + return; + } else if (PyUnicode_Check(obj)) { + pc->PyTypeToUTF8 = PyUnicodeToUTF8; + tc->type = JT_UTF8; + return; + } else if (object_is_decimal_type(obj)) { + pc->doubleValue = PyFloat_AsDouble(obj); + tc->type = JT_DOUBLE; + return; + } else if (PyDateTime_Check(obj) || PyDate_Check(obj)) { + if (object_is_nat_type(obj)) { + tc->type = JT_NULL; + return; + } - return; - } else if (PyArray_IsScalar(obj, Bool)) { - PyArray_CastScalarToCtype(obj, &(pc->longValue), - PyArray_DescrFromType(NPY_BOOL)); - tc->type = (pc->longValue) ? JT_TRUE : JT_FALSE; - return; - } else if (PyArray_IsScalar(obj, Float) || PyArray_IsScalar(obj, Double)) { - PyArray_CastScalarToCtype(obj, &(pc->doubleValue), - PyArray_DescrFromType(NPY_DOUBLE)); - tc->type = JT_DOUBLE; - return; - } else if (PyArray_Check(obj) && PyArray_CheckScalar(obj)) { - PyErr_Format(PyExc_TypeError, - "%R (0d array) is not JSON serializable at the moment", - obj); - goto INVALID; - } else if (object_is_na_type(obj)) { - tc->type = JT_NULL; - return; + if (enc->datetimeIso) { + pc->PyTypeToUTF8 = PyDateTimeToIsoCallback; + tc->type = JT_UTF8; + } else { + NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + pc->longValue = PyDateTimeToEpoch(obj, base); + tc->type = JT_LONG; + } + return; + } else if (PyTime_Check(obj)) { + pc->PyTypeToUTF8 = PyTimeToJSON; + tc->type = JT_UTF8; + return; + } else if (PyArray_IsScalar(obj, Datetime)) { + npy_int64 longVal; + if (((PyDatetimeScalarObject *)obj)->obval == get_nat()) { + tc->type = JT_NULL; + return; + } + PyArray_Descr *dtype = PyArray_DescrFromScalar(obj); + if (!PyTypeNum_ISDATETIME(dtype->type_num)) { + PyErr_Format(PyExc_ValueError, "Could not get resolution of datetime"); + return; + } + + PyArray_Descr *outcode = PyArray_DescrFromType(NPY_INT64); + PyArray_CastScalarToCtype(obj, &longVal, outcode); + Py_DECREF(outcode); + + if (enc->datetimeIso) { + GET_TC(tc)->longValue = longVal; + pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback; + enc->valueUnit = get_datetime_metadata_from_dtype(dtype).base; + tc->type = JT_UTF8; + } else { + NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + pc->longValue = PyDateTimeToEpoch(obj, base); + tc->type = JT_LONG; + } + return; + } else if (PyDelta_Check(obj)) { + if (PyObject_HasAttrString(obj, "_value")) { + // pd.Timedelta object or pd.NaT + value = get_long_attr(obj, "_value"); + } else { + value = total_seconds(obj) * 1000000000LL; // nanoseconds per sec } -ISITERABLE: + if (value == get_nat()) { + tc->type = JT_NULL; + return; + } else if (enc->datetimeIso) { + pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback; + tc->type = JT_UTF8; + } else { + unit = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + if (scaleNanosecToUnit(&value, unit) != 0) { + // TODO(username): Add some kind of error handling here + } - if (object_is_index_type(obj)) { - if (enc->outputFormat == SPLIT) { - tc->type = JT_OBJECT; - pc->iterBegin = Index_iterBegin; - pc->iterEnd = Index_iterEnd; - pc->iterNext = Index_iterNext; - pc->iterGetValue = Index_iterGetValue; - pc->iterGetName = Index_iterGetName; - return; - } + exc = PyErr_Occurred(); - pc->newObj = get_values(obj); - if (pc->newObj) { - tc->type = JT_ARRAY; - pc->iterBegin = NpyArr_iterBegin; - pc->iterEnd = NpyArr_iterEnd; - pc->iterNext = NpyArr_iterNext; - pc->iterGetValue = NpyArr_iterGetValue; - pc->iterGetName = NpyArr_iterGetName; - } else { - goto INVALID; - } + if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { + goto INVALID; + } - return; - } else if (object_is_series_type(obj)) { - if (enc->outputFormat == SPLIT) { - tc->type = JT_OBJECT; - pc->iterBegin = Series_iterBegin; - pc->iterEnd = Series_iterEnd; - pc->iterNext = Series_iterNext; - pc->iterGetValue = Series_iterGetValue; - pc->iterGetName = Series_iterGetName; - return; - } + tc->type = JT_LONG; + } + pc->longValue = value; + return; + } else if (PyArray_IsScalar(obj, Integer)) { + tc->type = JT_LONG; + PyArray_CastScalarToCtype(obj, &(pc->longValue), + PyArray_DescrFromType(NPY_INT64)); - pc->newObj = get_values(obj); - if (!pc->newObj) { - goto INVALID; - } + exc = PyErr_Occurred(); - if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) { - tc->type = JT_OBJECT; - tmpObj = PyObject_GetAttrString(obj, "index"); - if (!tmpObj) { - goto INVALID; - } - values = get_values(tmpObj); - Py_DECREF(tmpObj); - if (!values) { - goto INVALID; - } - pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, - pc->columnLabelsLen); - if (!pc->columnLabels) { - goto INVALID; - } - } else { - tc->type = JT_ARRAY; - } - pc->iterBegin = NpyArr_iterBegin; - pc->iterEnd = NpyArr_iterEnd; - pc->iterNext = NpyArr_iterNext; - pc->iterGetValue = NpyArr_iterGetValue; - pc->iterGetName = NpyArr_iterGetName; - return; - } else if (PyArray_Check(obj)) { - if (enc->npyCtxtPassthru) { - pc->npyarr = enc->npyCtxtPassthru; - tc->type = (pc->npyarr->columnLabels ? JT_OBJECT : JT_ARRAY); - - pc->iterBegin = NpyArrPassThru_iterBegin; - pc->iterNext = NpyArr_iterNext; - pc->iterEnd = NpyArrPassThru_iterEnd; - pc->iterGetValue = NpyArr_iterGetValue; - pc->iterGetName = NpyArr_iterGetName; - - enc->npyCtxtPassthru = NULL; - return; - } + if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { + goto INVALID; + } - tc->type = JT_ARRAY; - pc->iterBegin = NpyArr_iterBegin; - pc->iterEnd = NpyArr_iterEnd; - pc->iterNext = NpyArr_iterNext; - pc->iterGetValue = NpyArr_iterGetValue; - pc->iterGetName = NpyArr_iterGetName; - return; - } else if (object_is_dataframe_type(obj)) { - if (enc->blkCtxtPassthru) { - pc->pdblock = enc->blkCtxtPassthru; - tc->type = - (pc->pdblock->npyCtxts[0]->columnLabels ? JT_OBJECT : JT_ARRAY); - - pc->iterBegin = PdBlockPassThru_iterBegin; - pc->iterEnd = PdBlockPassThru_iterEnd; - pc->iterNext = PdBlock_iterNextItem; - pc->iterGetName = PdBlock_iterGetName; - pc->iterGetValue = NpyArr_iterGetValue; - - enc->blkCtxtPassthru = NULL; - return; - } + return; + } else if (PyArray_IsScalar(obj, Bool)) { + PyArray_CastScalarToCtype(obj, &(pc->longValue), + PyArray_DescrFromType(NPY_BOOL)); + tc->type = (pc->longValue) ? JT_TRUE : JT_FALSE; + return; + } else if (PyArray_IsScalar(obj, Float) || PyArray_IsScalar(obj, Double)) { + PyArray_CastScalarToCtype(obj, &(pc->doubleValue), + PyArray_DescrFromType(NPY_DOUBLE)); + tc->type = JT_DOUBLE; + return; + } else if (PyArray_Check(obj) && PyArray_CheckScalar(obj)) { + PyErr_Format(PyExc_TypeError, + "%R (0d array) is not JSON serializable at the moment", obj); + goto INVALID; + } else if (object_is_na_type(obj)) { + tc->type = JT_NULL; + return; + } - if (enc->outputFormat == SPLIT) { - tc->type = JT_OBJECT; - pc->iterBegin = DataFrame_iterBegin; - pc->iterEnd = DataFrame_iterEnd; - pc->iterNext = DataFrame_iterNext; - pc->iterGetValue = DataFrame_iterGetValue; - pc->iterGetName = DataFrame_iterGetName; - return; - } +ISITERABLE: - pc->iterBegin = PdBlock_iterBegin; - pc->iterEnd = PdBlock_iterEnd; - pc->iterNext = PdBlock_iterNext; - pc->iterGetName = PdBlock_iterGetName; - pc->iterGetValue = NpyArr_iterGetValue; - - if (enc->outputFormat == VALUES) { - tc->type = JT_ARRAY; - } else if (enc->outputFormat == RECORDS) { - tc->type = JT_ARRAY; - tmpObj = PyObject_GetAttrString(obj, "columns"); - if (!tmpObj) { - goto INVALID; - } - values = get_values(tmpObj); - if (!values) { - Py_DECREF(tmpObj); - goto INVALID; - } - pc->columnLabelsLen = PyObject_Size(tmpObj); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, - pc->columnLabelsLen); - Py_DECREF(tmpObj); - if (!pc->columnLabels) { - goto INVALID; - } - } else if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) { - tc->type = JT_OBJECT; - tmpObj = (enc->outputFormat == INDEX - ? PyObject_GetAttrString(obj, "index") - : PyObject_GetAttrString(obj, "columns")); - if (!tmpObj) { - goto INVALID; - } - values = get_values(tmpObj); - if (!values) { - Py_DECREF(tmpObj); - goto INVALID; - } - pc->rowLabelsLen = PyObject_Size(tmpObj); - pc->rowLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, - pc->rowLabelsLen); - Py_DECREF(tmpObj); - tmpObj = (enc->outputFormat == INDEX - ? PyObject_GetAttrString(obj, "columns") - : PyObject_GetAttrString(obj, "index")); - if (!tmpObj) { - NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); - pc->rowLabels = NULL; - goto INVALID; - } - values = get_values(tmpObj); - if (!values) { - Py_DECREF(tmpObj); - NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); - pc->rowLabels = NULL; - goto INVALID; - } - pc->columnLabelsLen = PyObject_Size(tmpObj); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, - pc->columnLabelsLen); - Py_DECREF(tmpObj); - if (!pc->columnLabels) { - NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); - pc->rowLabels = NULL; - goto INVALID; - } + if (object_is_index_type(obj)) { + if (enc->outputFormat == SPLIT) { + tc->type = JT_OBJECT; + pc->iterBegin = Index_iterBegin; + pc->iterEnd = Index_iterEnd; + pc->iterNext = Index_iterNext; + pc->iterGetValue = Index_iterGetValue; + pc->iterGetName = Index_iterGetName; + return; + } + + pc->newObj = get_values(obj); + if (pc->newObj) { + tc->type = JT_ARRAY; + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + } else { + goto INVALID; + } - if (enc->outputFormat == COLUMNS) { - pc->transpose = 1; - } - } else { - goto INVALID; - } - return; - } else if (PyDict_Check(obj)) { - tc->type = JT_OBJECT; - pc->iterBegin = Dict_iterBegin; - pc->iterEnd = Dict_iterEnd; - pc->iterNext = Dict_iterNext; - pc->iterGetValue = Dict_iterGetValue; - pc->iterGetName = Dict_iterGetName; - pc->dictObj = obj; - Py_INCREF(obj); - - return; - } else if (PyList_Check(obj)) { - tc->type = JT_ARRAY; - pc->iterBegin = List_iterBegin; - pc->iterEnd = List_iterEnd; - pc->iterNext = List_iterNext; - pc->iterGetValue = List_iterGetValue; - pc->iterGetName = List_iterGetName; - return; - } else if (PyTuple_Check(obj)) { - tc->type = JT_ARRAY; - pc->iterBegin = Tuple_iterBegin; - pc->iterEnd = Tuple_iterEnd; - pc->iterNext = Tuple_iterNext; - pc->iterGetValue = Tuple_iterGetValue; - pc->iterGetName = Tuple_iterGetName; - return; - } else if (PyAnySet_Check(obj)) { - tc->type = JT_ARRAY; - pc->iterBegin = Set_iterBegin; - pc->iterEnd = Set_iterEnd; - pc->iterNext = Set_iterNext; - pc->iterGetValue = Set_iterGetValue; - pc->iterGetName = Set_iterGetName; - return; + return; + } else if (object_is_series_type(obj)) { + if (enc->outputFormat == SPLIT) { + tc->type = JT_OBJECT; + pc->iterBegin = Series_iterBegin; + pc->iterEnd = Series_iterEnd; + pc->iterNext = Series_iterNext; + pc->iterGetValue = Series_iterGetValue; + pc->iterGetName = Series_iterGetName; + return; + } + + pc->newObj = get_values(obj); + if (!pc->newObj) { + goto INVALID; + } + + if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) { + tc->type = JT_OBJECT; + tmpObj = PyObject_GetAttrString(obj, "index"); + if (!tmpObj) { + goto INVALID; + } + values = get_values(tmpObj); + Py_DECREF(tmpObj); + if (!values) { + goto INVALID; + } + pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, + pc->columnLabelsLen); + if (!pc->columnLabels) { + goto INVALID; + } + } else { + tc->type = JT_ARRAY; } + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + return; + } else if (PyArray_Check(obj)) { + if (enc->npyCtxtPassthru) { + pc->npyarr = enc->npyCtxtPassthru; + tc->type = (pc->npyarr->columnLabels ? JT_OBJECT : JT_ARRAY); + + pc->iterBegin = NpyArrPassThru_iterBegin; + pc->iterNext = NpyArr_iterNext; + pc->iterEnd = NpyArrPassThru_iterEnd; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + + enc->npyCtxtPassthru = NULL; + return; + } + + tc->type = JT_ARRAY; + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + return; + } else if (object_is_dataframe_type(obj)) { + if (enc->blkCtxtPassthru) { + pc->pdblock = enc->blkCtxtPassthru; + tc->type = + (pc->pdblock->npyCtxts[0]->columnLabels ? JT_OBJECT : JT_ARRAY); + + pc->iterBegin = PdBlockPassThru_iterBegin; + pc->iterEnd = PdBlockPassThru_iterEnd; + pc->iterNext = PdBlock_iterNextItem; + pc->iterGetName = PdBlock_iterGetName; + pc->iterGetValue = NpyArr_iterGetValue; + + enc->blkCtxtPassthru = NULL; + return; + } + + if (enc->outputFormat == SPLIT) { + tc->type = JT_OBJECT; + pc->iterBegin = DataFrame_iterBegin; + pc->iterEnd = DataFrame_iterEnd; + pc->iterNext = DataFrame_iterNext; + pc->iterGetValue = DataFrame_iterGetValue; + pc->iterGetName = DataFrame_iterGetName; + return; + } + + pc->iterBegin = PdBlock_iterBegin; + pc->iterEnd = PdBlock_iterEnd; + pc->iterNext = PdBlock_iterNext; + pc->iterGetName = PdBlock_iterGetName; + pc->iterGetValue = NpyArr_iterGetValue; + + if (enc->outputFormat == VALUES) { + tc->type = JT_ARRAY; + } else if (enc->outputFormat == RECORDS) { + tc->type = JT_ARRAY; + tmpObj = PyObject_GetAttrString(obj, "columns"); + if (!tmpObj) { + goto INVALID; + } + values = get_values(tmpObj); + if (!values) { + Py_DECREF(tmpObj); + goto INVALID; + } + pc->columnLabelsLen = PyObject_Size(tmpObj); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, + pc->columnLabelsLen); + Py_DECREF(tmpObj); + if (!pc->columnLabels) { + goto INVALID; + } + } else if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) { + tc->type = JT_OBJECT; + tmpObj = + (enc->outputFormat == INDEX ? PyObject_GetAttrString(obj, "index") + : PyObject_GetAttrString(obj, "columns")); + if (!tmpObj) { + goto INVALID; + } + values = get_values(tmpObj); + if (!values) { + Py_DECREF(tmpObj); + goto INVALID; + } + pc->rowLabelsLen = PyObject_Size(tmpObj); + pc->rowLabels = + NpyArr_encodeLabels((PyArrayObject *)values, enc, pc->rowLabelsLen); + Py_DECREF(tmpObj); + tmpObj = + (enc->outputFormat == INDEX ? PyObject_GetAttrString(obj, "columns") + : PyObject_GetAttrString(obj, "index")); + if (!tmpObj) { + NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); + pc->rowLabels = NULL; + goto INVALID; + } + values = get_values(tmpObj); + if (!values) { + Py_DECREF(tmpObj); + NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); + pc->rowLabels = NULL; + goto INVALID; + } + pc->columnLabelsLen = PyObject_Size(tmpObj); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, + pc->columnLabelsLen); + Py_DECREF(tmpObj); + if (!pc->columnLabels) { + NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); + pc->rowLabels = NULL; + goto INVALID; + } - toDictFunc = PyObject_GetAttrString(obj, "toDict"); + if (enc->outputFormat == COLUMNS) { + pc->transpose = 1; + } + } else { + goto INVALID; + } + return; + } else if (PyDict_Check(obj)) { + tc->type = JT_OBJECT; + pc->iterBegin = Dict_iterBegin; + pc->iterEnd = Dict_iterEnd; + pc->iterNext = Dict_iterNext; + pc->iterGetValue = Dict_iterGetValue; + pc->iterGetName = Dict_iterGetName; + pc->dictObj = obj; + Py_INCREF(obj); - if (toDictFunc) { - PyObject *tuple = PyTuple_New(0); - PyObject *toDictResult = PyObject_Call(toDictFunc, tuple, NULL); - Py_DECREF(tuple); - Py_DECREF(toDictFunc); + return; + } else if (PyList_Check(obj)) { + tc->type = JT_ARRAY; + pc->iterBegin = List_iterBegin; + pc->iterEnd = List_iterEnd; + pc->iterNext = List_iterNext; + pc->iterGetValue = List_iterGetValue; + pc->iterGetName = List_iterGetName; + return; + } else if (PyTuple_Check(obj)) { + tc->type = JT_ARRAY; + pc->iterBegin = Tuple_iterBegin; + pc->iterEnd = Tuple_iterEnd; + pc->iterNext = Tuple_iterNext; + pc->iterGetValue = Tuple_iterGetValue; + pc->iterGetName = Tuple_iterGetName; + return; + } else if (PyAnySet_Check(obj)) { + tc->type = JT_ARRAY; + pc->iterBegin = Set_iterBegin; + pc->iterEnd = Set_iterEnd; + pc->iterNext = Set_iterNext; + pc->iterGetValue = Set_iterGetValue; + pc->iterGetName = Set_iterGetName; + return; + } - if (toDictResult == NULL) { - PyErr_Clear(); - tc->type = JT_NULL; - return; - } + toDictFunc = PyObject_GetAttrString(obj, "toDict"); - if (!PyDict_Check(toDictResult)) { - Py_DECREF(toDictResult); - tc->type = JT_NULL; - return; - } + if (toDictFunc) { + PyObject *tuple = PyTuple_New(0); + PyObject *toDictResult = PyObject_Call(toDictFunc, tuple, NULL); + Py_DECREF(tuple); + Py_DECREF(toDictFunc); - tc->type = JT_OBJECT; - pc->iterBegin = Dict_iterBegin; - pc->iterEnd = Dict_iterEnd; - pc->iterNext = Dict_iterNext; - pc->iterGetValue = Dict_iterGetValue; - pc->iterGetName = Dict_iterGetName; - pc->dictObj = toDictResult; - return; + if (toDictResult == NULL) { + PyErr_Clear(); + tc->type = JT_NULL; + return; } - PyErr_Clear(); - - if (enc->defaultHandler) { - Object_invokeDefaultHandler(obj, enc); - goto INVALID; + if (!PyDict_Check(toDictResult)) { + Py_DECREF(toDictResult); + tc->type = JT_NULL; + return; } tc->type = JT_OBJECT; - pc->iterBegin = Dir_iterBegin; - pc->iterEnd = Dir_iterEnd; - pc->iterNext = Dir_iterNext; - pc->iterGetValue = Dir_iterGetValue; - pc->iterGetName = Dir_iterGetName; + pc->iterBegin = Dict_iterBegin; + pc->iterEnd = Dict_iterEnd; + pc->iterNext = Dict_iterNext; + pc->iterGetValue = Dict_iterGetValue; + pc->iterGetName = Dict_iterGetName; + pc->dictObj = toDictResult; return; + } + + PyErr_Clear(); + + if (enc->defaultHandler) { + Object_invokeDefaultHandler(obj, enc); + goto INVALID; + } + + tc->type = JT_OBJECT; + pc->iterBegin = Dir_iterBegin; + pc->iterEnd = Dir_iterEnd; + pc->iterNext = Dir_iterNext; + pc->iterGetValue = Dir_iterGetValue; + pc->iterGetName = Dir_iterGetName; + return; INVALID: - tc->type = JT_INVALID; - PyObject_Free(tc->prv); - tc->prv = NULL; - return; + tc->type = JT_INVALID; + PyObject_Free(tc->prv); + tc->prv = NULL; + return; } void Object_endTypeContext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - if (tc->prv) { - Py_XDECREF(GET_TC(tc)->newObj); - GET_TC(tc)->newObj = NULL; - NpyArr_freeLabels(GET_TC(tc)->rowLabels, GET_TC(tc)->rowLabelsLen); - GET_TC(tc)->rowLabels = NULL; - NpyArr_freeLabels(GET_TC(tc)->columnLabels, - GET_TC(tc)->columnLabelsLen); - GET_TC(tc)->columnLabels = NULL; - PyObject_Free(GET_TC(tc)->cStr); - GET_TC(tc)->cStr = NULL; - PyObject_Free(tc->prv); - tc->prv = NULL; - } + if (tc->prv) { + Py_XDECREF(GET_TC(tc)->newObj); + GET_TC(tc)->newObj = NULL; + NpyArr_freeLabels(GET_TC(tc)->rowLabels, GET_TC(tc)->rowLabelsLen); + GET_TC(tc)->rowLabels = NULL; + NpyArr_freeLabels(GET_TC(tc)->columnLabels, GET_TC(tc)->columnLabelsLen); + GET_TC(tc)->columnLabels = NULL; + PyObject_Free(GET_TC(tc)->cStr); + GET_TC(tc)->cStr = NULL; + PyObject_Free(tc->prv); + tc->prv = NULL; + } } const char *Object_getStringValue(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen) { - return GET_TC(tc)->PyTypeToUTF8(obj, tc, _outLen); + return GET_TC(tc)->PyTypeToUTF8(obj, tc, _outLen); } JSINT64 Object_getLongValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->longValue; + return GET_TC(tc)->longValue; } double Object_getDoubleValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->doubleValue; + return GET_TC(tc)->doubleValue; } const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen) { - PyObject *repr = PyObject_Str(obj); - const char *str = PyUnicode_AsUTF8AndSize(repr, (Py_ssize_t *)_outLen); - char *bytes = PyObject_Malloc(*_outLen + 1); - memcpy(bytes, str, *_outLen + 1); - GET_TC(tc)->cStr = bytes; + PyObject *repr = PyObject_Str(obj); + const char *str = PyUnicode_AsUTF8AndSize(repr, (Py_ssize_t *)_outLen); + char *bytes = PyObject_Malloc(*_outLen + 1); + memcpy(bytes, str, *_outLen + 1); + GET_TC(tc)->cStr = bytes; - Py_DECREF(repr); + Py_DECREF(repr); - return GET_TC(tc)->cStr; + return GET_TC(tc)->cStr; } static void Object_releaseObject(JSOBJ _obj) { Py_DECREF((PyObject *)_obj); } void Object_iterBegin(JSOBJ obj, JSONTypeContext *tc) { - GET_TC(tc)->iterBegin(obj, tc); + GET_TC(tc)->iterBegin(obj, tc); } int Object_iterNext(JSOBJ obj, JSONTypeContext *tc) { - return GET_TC(tc)->iterNext(obj, tc); + return GET_TC(tc)->iterNext(obj, tc); } void Object_iterEnd(JSOBJ obj, JSONTypeContext *tc) { - GET_TC(tc)->iterEnd(obj, tc); + GET_TC(tc)->iterEnd(obj, tc); } JSOBJ Object_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { - return GET_TC(tc)->iterGetValue(obj, tc); + return GET_TC(tc)->iterGetValue(obj, tc); } char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { - return GET_TC(tc)->iterGetName(obj, tc, outLen); + return GET_TC(tc)->iterGetName(obj, tc, outLen); } PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args, PyObject *kwargs) { - PyDateTime_IMPORT; - if (PyDateTimeAPI == NULL) { - return NULL; - } - - PandasDateTime_IMPORT; - if (PandasDateTimeAPI == NULL) { - return NULL; - } - - static char *kwlist[] = {"obj", - "ensure_ascii", - "double_precision", - "encode_html_chars", - "orient", - "date_unit", - "iso_dates", - "default_handler", - "indent", - NULL}; - - char buffer[65536]; - char *ret; - PyObject *newobj; - PyObject *oinput = NULL; - PyObject *oensureAscii = NULL; - int idoublePrecision = 10; // default double precision setting - PyObject *oencodeHTMLChars = NULL; - char *sOrient = NULL; - char *sdateFormat = NULL; - PyObject *oisoDates = 0; - PyObject *odefHandler = 0; - int indent = 0; - - PyObjectEncoder pyEncoder = {{ - Object_beginTypeContext, - Object_endTypeContext, - Object_getStringValue, - Object_getLongValue, - NULL, // getIntValue is unused - Object_getDoubleValue, - Object_getBigNumStringValue, - Object_iterBegin, - Object_iterNext, - Object_iterEnd, - Object_iterGetValue, - Object_iterGetName, - Object_releaseObject, - PyObject_Malloc, - PyObject_Realloc, - PyObject_Free, - -1, // recursionMax - idoublePrecision, - 1, // forceAscii - 0, // encodeHTMLChars - 0, // indent - }}; - JSONObjectEncoder *encoder = (JSONObjectEncoder *)&pyEncoder; - - pyEncoder.npyCtxtPassthru = NULL; - pyEncoder.blkCtxtPassthru = NULL; - pyEncoder.npyType = -1; - pyEncoder.npyValue = NULL; - pyEncoder.datetimeIso = 0; - pyEncoder.datetimeUnit = NPY_FR_ms; - pyEncoder.outputFormat = COLUMNS; - pyEncoder.defaultHandler = 0; - - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiOssOOi", kwlist, - &oinput, &oensureAscii, &idoublePrecision, - &oencodeHTMLChars, &sOrient, &sdateFormat, - &oisoDates, &odefHandler, &indent)) { - return NULL; - } + PyDateTime_IMPORT; + if (PyDateTimeAPI == NULL) { + return NULL; + } - if (oensureAscii != NULL && !PyObject_IsTrue(oensureAscii)) { - encoder->forceASCII = 0; - } + PandasDateTime_IMPORT; + if (PandasDateTimeAPI == NULL) { + return NULL; + } + + static char *kwlist[] = {"obj", + "ensure_ascii", + "double_precision", + "encode_html_chars", + "orient", + "date_unit", + "iso_dates", + "default_handler", + "indent", + NULL}; + + char buffer[65536]; + char *ret; + PyObject *newobj; + PyObject *oinput = NULL; + PyObject *oensureAscii = NULL; + int idoublePrecision = 10; // default double precision setting + PyObject *oencodeHTMLChars = NULL; + char *sOrient = NULL; + char *sdateFormat = NULL; + PyObject *oisoDates = 0; + PyObject *odefHandler = 0; + int indent = 0; + + PyObjectEncoder pyEncoder = {{ + Object_beginTypeContext, + Object_endTypeContext, + Object_getStringValue, + Object_getLongValue, + NULL, // getIntValue is unused + Object_getDoubleValue, + Object_getBigNumStringValue, + Object_iterBegin, + Object_iterNext, + Object_iterEnd, + Object_iterGetValue, + Object_iterGetName, + Object_releaseObject, + PyObject_Malloc, + PyObject_Realloc, + PyObject_Free, + -1, // recursionMax + idoublePrecision, + 1, // forceAscii + 0, // encodeHTMLChars + 0, // indent + }}; + JSONObjectEncoder *encoder = (JSONObjectEncoder *)&pyEncoder; + + pyEncoder.npyCtxtPassthru = NULL; + pyEncoder.blkCtxtPassthru = NULL; + pyEncoder.npyType = -1; + pyEncoder.npyValue = NULL; + pyEncoder.datetimeIso = 0; + pyEncoder.datetimeUnit = NPY_FR_ms; + pyEncoder.outputFormat = COLUMNS; + pyEncoder.defaultHandler = 0; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiOssOOi", kwlist, &oinput, + &oensureAscii, &idoublePrecision, + &oencodeHTMLChars, &sOrient, &sdateFormat, + &oisoDates, &odefHandler, &indent)) { + return NULL; + } - if (oencodeHTMLChars != NULL && PyObject_IsTrue(oencodeHTMLChars)) { - encoder->encodeHTMLChars = 1; - } + if (oensureAscii != NULL && !PyObject_IsTrue(oensureAscii)) { + encoder->forceASCII = 0; + } - if (idoublePrecision > JSON_DOUBLE_MAX_DECIMALS || idoublePrecision < 0) { - PyErr_Format( - PyExc_ValueError, - "Invalid value '%d' for option 'double_precision', max is '%u'", - idoublePrecision, JSON_DOUBLE_MAX_DECIMALS); - return NULL; - } - encoder->doublePrecision = idoublePrecision; - - if (sOrient != NULL) { - if (strcmp(sOrient, "records") == 0) { - pyEncoder.outputFormat = RECORDS; - } else if (strcmp(sOrient, "index") == 0) { - pyEncoder.outputFormat = INDEX; - } else if (strcmp(sOrient, "split") == 0) { - pyEncoder.outputFormat = SPLIT; - } else if (strcmp(sOrient, "values") == 0) { - pyEncoder.outputFormat = VALUES; - } else if (strcmp(sOrient, "columns") != 0) { - PyErr_Format(PyExc_ValueError, - "Invalid value '%s' for option 'orient'", sOrient); - return NULL; - } - } + if (oencodeHTMLChars != NULL && PyObject_IsTrue(oencodeHTMLChars)) { + encoder->encodeHTMLChars = 1; + } - if (sdateFormat != NULL) { - if (strcmp(sdateFormat, "s") == 0) { - pyEncoder.datetimeUnit = NPY_FR_s; - } else if (strcmp(sdateFormat, "ms") == 0) { - pyEncoder.datetimeUnit = NPY_FR_ms; - } else if (strcmp(sdateFormat, "us") == 0) { - pyEncoder.datetimeUnit = NPY_FR_us; - } else if (strcmp(sdateFormat, "ns") == 0) { - pyEncoder.datetimeUnit = NPY_FR_ns; - } else { - PyErr_Format(PyExc_ValueError, - "Invalid value '%s' for option 'date_unit'", - sdateFormat); - return NULL; - } + if (idoublePrecision > JSON_DOUBLE_MAX_DECIMALS || idoublePrecision < 0) { + PyErr_Format( + PyExc_ValueError, + "Invalid value '%d' for option 'double_precision', max is '%u'", + idoublePrecision, JSON_DOUBLE_MAX_DECIMALS); + return NULL; + } + encoder->doublePrecision = idoublePrecision; + + if (sOrient != NULL) { + if (strcmp(sOrient, "records") == 0) { + pyEncoder.outputFormat = RECORDS; + } else if (strcmp(sOrient, "index") == 0) { + pyEncoder.outputFormat = INDEX; + } else if (strcmp(sOrient, "split") == 0) { + pyEncoder.outputFormat = SPLIT; + } else if (strcmp(sOrient, "values") == 0) { + pyEncoder.outputFormat = VALUES; + } else if (strcmp(sOrient, "columns") != 0) { + PyErr_Format(PyExc_ValueError, "Invalid value '%s' for option 'orient'", + sOrient); + return NULL; + } + } + + if (sdateFormat != NULL) { + if (strcmp(sdateFormat, "s") == 0) { + pyEncoder.datetimeUnit = NPY_FR_s; + } else if (strcmp(sdateFormat, "ms") == 0) { + pyEncoder.datetimeUnit = NPY_FR_ms; + } else if (strcmp(sdateFormat, "us") == 0) { + pyEncoder.datetimeUnit = NPY_FR_us; + } else if (strcmp(sdateFormat, "ns") == 0) { + pyEncoder.datetimeUnit = NPY_FR_ns; + } else { + PyErr_Format(PyExc_ValueError, + "Invalid value '%s' for option 'date_unit'", sdateFormat); + return NULL; } + } - if (oisoDates != NULL && PyObject_IsTrue(oisoDates)) { - pyEncoder.datetimeIso = 1; - } + if (oisoDates != NULL && PyObject_IsTrue(oisoDates)) { + pyEncoder.datetimeIso = 1; + } - if (odefHandler != NULL && odefHandler != Py_None) { - if (!PyCallable_Check(odefHandler)) { - PyErr_SetString(PyExc_TypeError, "Default handler is not callable"); - return NULL; - } - pyEncoder.defaultHandler = odefHandler; + if (odefHandler != NULL && odefHandler != Py_None) { + if (!PyCallable_Check(odefHandler)) { + PyErr_SetString(PyExc_TypeError, "Default handler is not callable"); + return NULL; } + pyEncoder.defaultHandler = odefHandler; + } - encoder->indent = indent; + encoder->indent = indent; - pyEncoder.originalOutputFormat = pyEncoder.outputFormat; - ret = JSON_EncodeObject(oinput, encoder, buffer, sizeof(buffer)); - if (PyErr_Occurred()) { - return NULL; - } + pyEncoder.originalOutputFormat = pyEncoder.outputFormat; + ret = JSON_EncodeObject(oinput, encoder, buffer, sizeof(buffer)); + if (PyErr_Occurred()) { + return NULL; + } - if (encoder->errorMsg) { - if (ret != buffer) { - encoder->free(ret); - } - PyErr_Format(PyExc_OverflowError, "%s", encoder->errorMsg); - return NULL; + if (encoder->errorMsg) { + if (ret != buffer) { + encoder->free(ret); } + PyErr_Format(PyExc_OverflowError, "%s", encoder->errorMsg); + return NULL; + } - newobj = PyUnicode_FromString(ret); + newobj = PyUnicode_FromString(ret); - if (ret != buffer) { - encoder->free(ret); - } + if (ret != buffer) { + encoder->free(ret); + } - return newobj; + return newobj; } diff --git a/pandas/_libs/src/vendored/ujson/python/ujson.c b/pandas/_libs/src/vendored/ujson/python/ujson.c index 12dbb33f2874f..736089c48e3ee 100644 --- a/pandas/_libs/src/vendored/ujson/python/ujson.c +++ b/pandas/_libs/src/vendored/ujson/python/ujson.c @@ -16,18 +16,19 @@ derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) https://github.com/client9/stringencoders -Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights +reserved. Numeric decoder derived from TCL library https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms @@ -50,10 +51,10 @@ void *initObjToJSON(void); /* JSONToObj */ PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs); -#define ENCODER_HELP_TEXT \ - "Use ensure_ascii=false to output UTF-8. Pass in double_precision to " \ - "alter the maximum digit precision of doubles. Set " \ - "encode_html_chars=True to encode < > & as unicode escape sequences." +#define ENCODER_HELP_TEXT \ + "Use ensure_ascii=false to output UTF-8. Pass in double_precision to " \ + "alter the maximum digit precision of doubles. Set " \ + "encode_html_chars=True to encode < > & as unicode escape sequences." static PyMethodDef ujsonMethods[] = { {"ujson_dumps", (PyCFunction)objToJSON, METH_VARARGS | METH_KEYWORDS, @@ -65,12 +66,12 @@ static PyMethodDef ujsonMethods[] = { }; typedef struct { - PyObject *type_decimal; - PyObject *type_dataframe; - PyObject *type_series; - PyObject *type_index; - PyObject *type_nat; - PyObject *type_na; + PyObject *type_decimal; + PyObject *type_dataframe; + PyObject *type_series; + PyObject *type_index; + PyObject *type_nat; + PyObject *type_na; } modulestate; #define modulestate(o) ((modulestate *)PyModule_GetState(o)) @@ -90,359 +91,356 @@ static struct PyModuleDef moduledef = {.m_base = PyModuleDef_HEAD_INIT, #ifndef PYPY_VERSION /* Used in objToJSON.c */ int object_is_decimal_type(PyObject *obj) { - PyObject *module = PyState_FindModule(&moduledef); - if (module == NULL) - return 0; - modulestate *state = modulestate(module); - if (state == NULL) - return 0; - PyObject *type_decimal = state->type_decimal; - if (type_decimal == NULL) { - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_decimal); - if (result == -1) { - PyErr_Clear(); - return 0; - } - return result; + PyObject *module = PyState_FindModule(&moduledef); + if (module == NULL) + return 0; + modulestate *state = modulestate(module); + if (state == NULL) + return 0; + PyObject *type_decimal = state->type_decimal; + if (type_decimal == NULL) { + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_decimal); + if (result == -1) { + PyErr_Clear(); + return 0; + } + return result; } int object_is_dataframe_type(PyObject *obj) { - PyObject *module = PyState_FindModule(&moduledef); - if (module == NULL) - return 0; - modulestate *state = modulestate(module); - if (state == NULL) - return 0; - PyObject *type_dataframe = state->type_dataframe; - if (type_dataframe == NULL) { - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_dataframe); - if (result == -1) { - PyErr_Clear(); - return 0; - } - return result; + PyObject *module = PyState_FindModule(&moduledef); + if (module == NULL) + return 0; + modulestate *state = modulestate(module); + if (state == NULL) + return 0; + PyObject *type_dataframe = state->type_dataframe; + if (type_dataframe == NULL) { + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_dataframe); + if (result == -1) { + PyErr_Clear(); + return 0; + } + return result; } int object_is_series_type(PyObject *obj) { - PyObject *module = PyState_FindModule(&moduledef); - if (module == NULL) - return 0; - modulestate *state = modulestate(module); - if (state == NULL) - return 0; - PyObject *type_series = state->type_series; - if (type_series == NULL) { - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_series); - if (result == -1) { - PyErr_Clear(); - return 0; - } - return result; + PyObject *module = PyState_FindModule(&moduledef); + if (module == NULL) + return 0; + modulestate *state = modulestate(module); + if (state == NULL) + return 0; + PyObject *type_series = state->type_series; + if (type_series == NULL) { + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_series); + if (result == -1) { + PyErr_Clear(); + return 0; + } + return result; } int object_is_index_type(PyObject *obj) { - PyObject *module = PyState_FindModule(&moduledef); - if (module == NULL) - return 0; - modulestate *state = modulestate(module); - if (state == NULL) - return 0; - PyObject *type_index = state->type_index; - if (type_index == NULL) { - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_index); - if (result == -1) { - PyErr_Clear(); - return 0; - } - return result; + PyObject *module = PyState_FindModule(&moduledef); + if (module == NULL) + return 0; + modulestate *state = modulestate(module); + if (state == NULL) + return 0; + PyObject *type_index = state->type_index; + if (type_index == NULL) { + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_index); + if (result == -1) { + PyErr_Clear(); + return 0; + } + return result; } int object_is_nat_type(PyObject *obj) { - PyObject *module = PyState_FindModule(&moduledef); - if (module == NULL) - return 0; - modulestate *state = modulestate(module); - if (state == NULL) - return 0; - PyObject *type_nat = state->type_nat; - if (type_nat == NULL) { - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_nat); - if (result == -1) { - PyErr_Clear(); - return 0; - } - return result; + PyObject *module = PyState_FindModule(&moduledef); + if (module == NULL) + return 0; + modulestate *state = modulestate(module); + if (state == NULL) + return 0; + PyObject *type_nat = state->type_nat; + if (type_nat == NULL) { + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_nat); + if (result == -1) { + PyErr_Clear(); + return 0; + } + return result; } int object_is_na_type(PyObject *obj) { - PyObject *module = PyState_FindModule(&moduledef); - if (module == NULL) - return 0; - modulestate *state = modulestate(module); - if (state == NULL) - return 0; - PyObject *type_na = state->type_na; - if (type_na == NULL) { - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_na); - if (result == -1) { - PyErr_Clear(); - return 0; - } - return result; + PyObject *module = PyState_FindModule(&moduledef); + if (module == NULL) + return 0; + modulestate *state = modulestate(module); + if (state == NULL) + return 0; + PyObject *type_na = state->type_na; + if (type_na == NULL) { + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_na); + if (result == -1) { + PyErr_Clear(); + return 0; + } + return result; } #else - /* Used in objToJSON.c */ +/* Used in objToJSON.c */ int object_is_decimal_type(PyObject *obj) { - PyObject *module = PyImport_ImportModule("decimal"); - if (module == NULL) { - PyErr_Clear(); - return 0; - } - PyObject *type_decimal = PyObject_GetAttrString(module, "Decimal"); - if (type_decimal == NULL) { - Py_DECREF(module); - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_decimal); - if (result == -1) { - Py_DECREF(module); - Py_DECREF(type_decimal); - PyErr_Clear(); - return 0; - } - return result; + PyObject *module = PyImport_ImportModule("decimal"); + if (module == NULL) { + PyErr_Clear(); + return 0; + } + PyObject *type_decimal = PyObject_GetAttrString(module, "Decimal"); + if (type_decimal == NULL) { + Py_DECREF(module); + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_decimal); + if (result == -1) { + Py_DECREF(module); + Py_DECREF(type_decimal); + PyErr_Clear(); + return 0; + } + return result; } int object_is_dataframe_type(PyObject *obj) { - PyObject *module = PyImport_ImportModule("pandas"); - if (module == NULL) { - PyErr_Clear(); - return 0; - } - PyObject *type_dataframe = PyObject_GetAttrString(module, "DataFrame"); - if (type_dataframe == NULL) { - Py_DECREF(module); - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_dataframe); - if (result == -1) { - Py_DECREF(module); - Py_DECREF(type_dataframe); - PyErr_Clear(); - return 0; - } - return result; + PyObject *module = PyImport_ImportModule("pandas"); + if (module == NULL) { + PyErr_Clear(); + return 0; + } + PyObject *type_dataframe = PyObject_GetAttrString(module, "DataFrame"); + if (type_dataframe == NULL) { + Py_DECREF(module); + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_dataframe); + if (result == -1) { + Py_DECREF(module); + Py_DECREF(type_dataframe); + PyErr_Clear(); + return 0; + } + return result; } int object_is_series_type(PyObject *obj) { - PyObject *module = PyImport_ImportModule("pandas"); - if (module == NULL) { - PyErr_Clear(); - return 0; - } - PyObject *type_series = PyObject_GetAttrString(module, "Series"); - if (type_series == NULL) { - Py_DECREF(module); - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_series); - if (result == -1) { - Py_DECREF(module); - Py_DECREF(type_series); - PyErr_Clear(); - return 0; - } - return result; + PyObject *module = PyImport_ImportModule("pandas"); + if (module == NULL) { + PyErr_Clear(); + return 0; + } + PyObject *type_series = PyObject_GetAttrString(module, "Series"); + if (type_series == NULL) { + Py_DECREF(module); + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_series); + if (result == -1) { + Py_DECREF(module); + Py_DECREF(type_series); + PyErr_Clear(); + return 0; + } + return result; } int object_is_index_type(PyObject *obj) { - PyObject *module = PyImport_ImportModule("pandas"); - if (module == NULL) { - PyErr_Clear(); - return 0; - } - PyObject *type_index = PyObject_GetAttrString(module, "Index"); - if (type_index == NULL) { - Py_DECREF(module); - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_index); - if (result == -1) { - Py_DECREF(module); - Py_DECREF(type_index); - PyErr_Clear(); - return 0; - } - return result; + PyObject *module = PyImport_ImportModule("pandas"); + if (module == NULL) { + PyErr_Clear(); + return 0; + } + PyObject *type_index = PyObject_GetAttrString(module, "Index"); + if (type_index == NULL) { + Py_DECREF(module); + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_index); + if (result == -1) { + Py_DECREF(module); + Py_DECREF(type_index); + PyErr_Clear(); + return 0; + } + return result; } int object_is_nat_type(PyObject *obj) { - PyObject *module = PyImport_ImportModule("pandas._libs.tslibs.nattype"); - if (module == NULL) { - PyErr_Clear(); - return 0; - } - PyObject *type_nat = PyObject_GetAttrString(module, "NaTType"); - if (type_nat == NULL) { - Py_DECREF(module); - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_nat); - if (result == -1) { - Py_DECREF(module); - Py_DECREF(type_nat); - PyErr_Clear(); - return 0; - } - return result; + PyObject *module = PyImport_ImportModule("pandas._libs.tslibs.nattype"); + if (module == NULL) { + PyErr_Clear(); + return 0; + } + PyObject *type_nat = PyObject_GetAttrString(module, "NaTType"); + if (type_nat == NULL) { + Py_DECREF(module); + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_nat); + if (result == -1) { + Py_DECREF(module); + Py_DECREF(type_nat); + PyErr_Clear(); + return 0; + } + return result; } int object_is_na_type(PyObject *obj) { - PyObject *module = PyImport_ImportModule("pandas._libs.missing"); - if (module == NULL) { - PyErr_Clear(); - return 0; - } - PyObject *type_na = PyObject_GetAttrString(module, "NAType"); - if (type_na == NULL) { - Py_DECREF(module); - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_na); - if (result == -1) { - Py_DECREF(module); - Py_DECREF(type_na); - PyErr_Clear(); - return 0; - } - return result; + PyObject *module = PyImport_ImportModule("pandas._libs.missing"); + if (module == NULL) { + PyErr_Clear(); + return 0; + } + PyObject *type_na = PyObject_GetAttrString(module, "NAType"); + if (type_na == NULL) { + Py_DECREF(module); + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_na); + if (result == -1) { + Py_DECREF(module); + Py_DECREF(type_na); + PyErr_Clear(); + return 0; + } + return result; } #endif static int module_traverse(PyObject *m, visitproc visit, void *arg) { - Py_VISIT(modulestate(m)->type_decimal); - Py_VISIT(modulestate(m)->type_dataframe); - Py_VISIT(modulestate(m)->type_series); - Py_VISIT(modulestate(m)->type_index); - Py_VISIT(modulestate(m)->type_nat); - Py_VISIT(modulestate(m)->type_na); - return 0; + Py_VISIT(modulestate(m)->type_decimal); + Py_VISIT(modulestate(m)->type_dataframe); + Py_VISIT(modulestate(m)->type_series); + Py_VISIT(modulestate(m)->type_index); + Py_VISIT(modulestate(m)->type_nat); + Py_VISIT(modulestate(m)->type_na); + return 0; } static int module_clear(PyObject *m) { - Py_CLEAR(modulestate(m)->type_decimal); - Py_CLEAR(modulestate(m)->type_dataframe); - Py_CLEAR(modulestate(m)->type_series); - Py_CLEAR(modulestate(m)->type_index); - Py_CLEAR(modulestate(m)->type_nat); - Py_CLEAR(modulestate(m)->type_na); - return 0; + Py_CLEAR(modulestate(m)->type_decimal); + Py_CLEAR(modulestate(m)->type_dataframe); + Py_CLEAR(modulestate(m)->type_series); + Py_CLEAR(modulestate(m)->type_index); + Py_CLEAR(modulestate(m)->type_nat); + Py_CLEAR(modulestate(m)->type_na); + return 0; } static void module_free(void *module) { module_clear((PyObject *)module); } PyMODINIT_FUNC PyInit_json(void) { - import_array() - PyObject *module; + import_array() PyObject *module; #ifndef PYPY_VERSION - // This function is not supported in PyPy. - if ((module = PyState_FindModule(&moduledef)) != NULL) { - Py_INCREF(module); - return module; - } + // This function is not supported in PyPy. + if ((module = PyState_FindModule(&moduledef)) != NULL) { + Py_INCREF(module); + return module; + } #endif - module = PyModule_Create(&moduledef); - if (module == NULL) { - return NULL; - } + module = PyModule_Create(&moduledef); + if (module == NULL) { + return NULL; + } #ifndef PYPY_VERSION - PyObject *mod_decimal = PyImport_ImportModule("decimal"); - if (mod_decimal) { - PyObject *type_decimal = PyObject_GetAttrString(mod_decimal, "Decimal"); - assert(type_decimal != NULL); - modulestate(module)->type_decimal = type_decimal; - Py_DECREF(mod_decimal); - } - - PyObject *mod_pandas = PyImport_ImportModule("pandas"); - if (mod_pandas) { - PyObject *type_dataframe = - PyObject_GetAttrString(mod_pandas, "DataFrame"); - assert(type_dataframe != NULL); - modulestate(module)->type_dataframe = type_dataframe; - - PyObject *type_series = PyObject_GetAttrString(mod_pandas, "Series"); - assert(type_series != NULL); - modulestate(module)->type_series = type_series; - - PyObject *type_index = PyObject_GetAttrString(mod_pandas, "Index"); - assert(type_index != NULL); - modulestate(module)->type_index = type_index; - - Py_DECREF(mod_pandas); - } - - PyObject *mod_nattype = - PyImport_ImportModule("pandas._libs.tslibs.nattype"); - if (mod_nattype) { - PyObject *type_nat = PyObject_GetAttrString(mod_nattype, "NaTType"); - assert(type_nat != NULL); - modulestate(module)->type_nat = type_nat; - - Py_DECREF(mod_nattype); - } - - PyObject *mod_natype = PyImport_ImportModule("pandas._libs.missing"); - if (mod_natype) { - PyObject *type_na = PyObject_GetAttrString(mod_natype, "NAType"); - assert(type_na != NULL); - modulestate(module)->type_na = type_na; - - Py_DECREF(mod_natype); - } else { - PyErr_Clear(); - } + PyObject *mod_decimal = PyImport_ImportModule("decimal"); + if (mod_decimal) { + PyObject *type_decimal = PyObject_GetAttrString(mod_decimal, "Decimal"); + assert(type_decimal != NULL); + modulestate(module)->type_decimal = type_decimal; + Py_DECREF(mod_decimal); + } + + PyObject *mod_pandas = PyImport_ImportModule("pandas"); + if (mod_pandas) { + PyObject *type_dataframe = PyObject_GetAttrString(mod_pandas, "DataFrame"); + assert(type_dataframe != NULL); + modulestate(module)->type_dataframe = type_dataframe; + + PyObject *type_series = PyObject_GetAttrString(mod_pandas, "Series"); + assert(type_series != NULL); + modulestate(module)->type_series = type_series; + + PyObject *type_index = PyObject_GetAttrString(mod_pandas, "Index"); + assert(type_index != NULL); + modulestate(module)->type_index = type_index; + + Py_DECREF(mod_pandas); + } + + PyObject *mod_nattype = PyImport_ImportModule("pandas._libs.tslibs.nattype"); + if (mod_nattype) { + PyObject *type_nat = PyObject_GetAttrString(mod_nattype, "NaTType"); + assert(type_nat != NULL); + modulestate(module)->type_nat = type_nat; + + Py_DECREF(mod_nattype); + } + + PyObject *mod_natype = PyImport_ImportModule("pandas._libs.missing"); + if (mod_natype) { + PyObject *type_na = PyObject_GetAttrString(mod_natype, "NAType"); + assert(type_na != NULL); + modulestate(module)->type_na = type_na; + + Py_DECREF(mod_natype); + } else { + PyErr_Clear(); + } #endif - /* Not vendored for now - JSONDecodeError = PyErr_NewException("ujson.JSONDecodeError", - PyExc_ValueError, NULL); Py_XINCREF(JSONDecodeError); if - (PyModule_AddObject(module, "JSONDecodeError", JSONDecodeError) < 0) - { - Py_XDECREF(JSONDecodeError); - Py_CLEAR(JSONDecodeError); - Py_DECREF(module); - return NULL; - } - */ - - return module; + /* Not vendored for now + JSONDecodeError = PyErr_NewException("ujson.JSONDecodeError", + PyExc_ValueError, NULL); Py_XINCREF(JSONDecodeError); if + (PyModule_AddObject(module, "JSONDecodeError", JSONDecodeError) < 0) + { + Py_XDECREF(JSONDecodeError); + Py_CLEAR(JSONDecodeError); + Py_DECREF(module); + return NULL; + } + */ + + return module; } From 64833390b6561bc8197155df341f32b96717652e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 15 Sep 2023 15:23:17 -0400 Subject: [PATCH 2/6] fixups --- .pre-commit-config.yaml | 2 +- doc/source/development/contributing_codebase.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index dfa4f0db0c03c..44f3793d6a905 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -74,7 +74,7 @@ repos: rev: v1.3.5 hooks: - id: clang-format - include: ^pandas/_libs/src|^pandas/_libs/include + files: ^pandas/_libs/src|^pandas/_libs/include args: [-i] types_or: [c, c++] - repo: https://github.com/pylint-dev/pylint diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index 41f4b4d5783ea..39743f0f892e7 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -39,7 +39,7 @@ Pre-commit Additionally, :ref:`Continuous Integration ` will run code formatting checks like ``black``, ``ruff``, -``isort``, and ``cpplint`` and more using `pre-commit hooks `_. +``isort``, and ``clang-format`` and more using `pre-commit hooks `_. Any warnings from these checks will cause the :ref:`Continuous Integration ` to fail; therefore, it is helpful to run the check yourself before submitting code. This can be done by installing ``pre-commit`` (which should already have happened if you followed the instructions From 29b835f677a0120b5ae9c98218c6303d5dd6a4ee Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 15 Sep 2023 18:14:43 -0400 Subject: [PATCH 3/6] use conda for clang-format --- .pre-commit-config.yaml | 14 +++++++------- environment.yml | 1 + requirements-dev.txt | 1 + 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 44f3793d6a905..2697239c0e3f9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -70,13 +70,6 @@ repos: - id: fix-encoding-pragma args: [--remove] - id: trailing-whitespace -- repo: https://github.com/pocc/pre-commit-hooks - rev: v1.3.5 - hooks: - - id: clang-format - files: ^pandas/_libs/src|^pandas/_libs/include - args: [-i] - types_or: [c, c++] - repo: https://github.com/pylint-dev/pylint rev: v3.0.0a7 hooks: @@ -160,6 +153,13 @@ repos: types: [pyi] args: [scripts/run_stubtest.py] stages: [manual] + - id: clang-format + name: clang-format + entry: clang-format + language: system + files: ^pandas/_libs/src|^pandas/_libs/include + args: [-i] + types_or: [c, c++] - id: inconsistent-namespace-usage name: 'Check for inconsistent use of pandas namespace' entry: python scripts/check_for_inconsistent_pandas_namespace.py diff --git a/environment.yml b/environment.yml index 8deae839f5408..5498c61493c49 100644 --- a/environment.yml +++ b/environment.yml @@ -74,6 +74,7 @@ dependencies: - cxx-compiler # code checks + - clang-format=15.0.7 - flake8=6.0.0 # run in subprocess over docstring examples - mypy=1.4.1 # pre-commit uses locally installed mypy - tokenize-rt # scripts/check_for_inconsistent_pandas_namespace.py diff --git a/requirements-dev.txt b/requirements-dev.txt index 01e0701bc39a7..9f6e7997fbe59 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -51,6 +51,7 @@ seaborn moto flask asv>=0.5.1 +clang-format==15.0.7 flake8==6.0.0 mypy==1.4.1 tokenize-rt From 1d3b10c13b208dd0cf8681ffd9c55d38a04d6573 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 15 Sep 2023 18:23:59 -0400 Subject: [PATCH 4/6] language: conda --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2697239c0e3f9..c52b612ba6af7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -156,7 +156,7 @@ repos: - id: clang-format name: clang-format entry: clang-format - language: system + language: conda files: ^pandas/_libs/src|^pandas/_libs/include args: [-i] types_or: [c, c++] From 78891f5ac1adcabf18fdfdc877b9dbb929025c80 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 15 Sep 2023 18:29:57 -0400 Subject: [PATCH 5/6] add additional_dependencies --- .pre-commit-config.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c52b612ba6af7..4d709d8c962e2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -157,6 +157,8 @@ repos: name: clang-format entry: clang-format language: conda + additional_dependencies: + - clang-format files: ^pandas/_libs/src|^pandas/_libs/include args: [-i] types_or: [c, c++] From e90e4f79f69b3a0d498b005df8e07e502aa0056f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 21 Oct 2023 23:38:46 -0400 Subject: [PATCH 6/6] try new source --- .pre-commit-config.yaml | 16 +++++++--------- environment.yml | 1 - requirements-dev.txt | 1 - 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0650a55ec54a2..a9a9baac6069a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -114,6 +114,13 @@ repos: rev: v0.6.8 hooks: - id: sphinx-lint +- repo: https://github.com/pre-commit/mirrors-clang-format + rev: ea59a72 + hooks: + - id: clang-format + files: ^pandas/_libs/src|^pandas/_libs/include + args: [-i] + types_or: [c, c++] - repo: local hooks: - id: pyright @@ -153,15 +160,6 @@ repos: types: [pyi] args: [scripts/run_stubtest.py] stages: [manual] - - id: clang-format - name: clang-format - entry: clang-format - language: conda - additional_dependencies: - - clang-format - files: ^pandas/_libs/src|^pandas/_libs/include - args: [-i] - types_or: [c, c++] - id: inconsistent-namespace-usage name: 'Check for inconsistent use of pandas namespace' entry: python scripts/check_for_inconsistent_pandas_namespace.py diff --git a/environment.yml b/environment.yml index a3afe5669b57b..a9648f3298198 100644 --- a/environment.yml +++ b/environment.yml @@ -74,7 +74,6 @@ dependencies: - cxx-compiler # code checks - - clang-format=15.0.7 - flake8=6.0.0 # run in subprocess over docstring examples - mypy=1.4.1 # pre-commit uses locally installed mypy - tokenize-rt # scripts/check_for_inconsistent_pandas_namespace.py diff --git a/requirements-dev.txt b/requirements-dev.txt index 2b75bc7ef104a..6e1a6058dce0e 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -51,7 +51,6 @@ seaborn moto flask asv>=0.5.1 -clang-format==15.0.7 flake8==6.0.0 mypy==1.4.1 tokenize-rt