From dcf908d273a42e73def970f41e3436d76cf9a6c2 Mon Sep 17 00:00:00 2001 From: G-D-Petrov Date: Thu, 31 Oct 2024 15:01:11 +0200 Subject: [PATCH] Refactor to cover some edge cases --- .../arcticdb/version_store/_normalization.py | 29 +++++++++---------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/python/arcticdb/version_store/_normalization.py b/python/arcticdb/version_store/_normalization.py index 8eafe9e015..e871d16eef 100644 --- a/python/arcticdb/version_store/_normalization.py +++ b/python/arcticdb/version_store/_normalization.py @@ -699,30 +699,29 @@ def to_block(a): return a.astype(np.float64) if len(index) == 0 and a.dtype == np.dtype('object') and not IS_PANDAS_TWO else a def gen_blocks(): - start_index = n_ind # Start index of the current block - current_dtype = arrays[start_index].dtype - column_placement = 0 + col_placement_in_block = n_ind + current_dtype = None + + for i in range(n_ind, len(arrays)): + a = to_block(arrays[i]) + + if current_dtype is None: + current_dtype = a.dtype - for i, a in enumerate(arrays[n_ind:], start=n_ind): - a = to_block(a) if a.dtype != current_dtype: - # Yield the current block if dtype changes yield make_block( - values=np.array(arrays[start_index:i], dtype=current_dtype), # Slice from start to current index - placement=slice(column_placement, column_placement + (i - start_index)) + values=np.array(arrays[col_placement_in_block:i], dtype=current_dtype), + placement=slice(col_placement_in_block - n_ind, i - n_ind) ) - # Update for the new block - column_placement += i - start_index - start_index, current_dtype = i, a.dtype + col_placement_in_block, current_dtype = i, a.dtype # Yield the last block if any remain - if start_index < len(arrays): + if col_placement_in_block < len(arrays): yield make_block( - values=np.array(arrays[start_index:], dtype=current_dtype), - placement=slice(column_placement, column_placement + (len(arrays) - start_index)) + values=np.array(arrays[col_placement_in_block:], dtype=current_dtype), + placement=slice(col_placement_in_block - n_ind, len(arrays) - n_ind) ) - if cols is None or len(cols) == 0: return pd.DataFrame(data, index=ind, columns=cols)