diff --git a/python/.asv/results/benchmarks.json b/python/.asv/results/benchmarks.json index 80c4d58b557..e2e8e7ab25e 100644 --- a/python/.asv/results/benchmarks.json +++ b/python/.asv/results/benchmarks.json @@ -746,9 +746,9 @@ "version": "80de9b1982a498c300177d02874a8626152eccb57cd0ba4228a5bb168e7608c8", "warmup_time": -1 }, - "bi_benchmarks.BIBenchmarks.peakmem_query_groupaby_city_count_filter_two_aggregations": { - "code": "class BIBenchmarks:\n def peakmem_query_groupaby_city_count_filter_two_aggregations(self, times_bigger):\n q = QueryBuilder()\n q = self.get_query_groupaby_city_count_filter_two_aggregations(q)\n df = self.lib.read(f\"{self.symbol}{times_bigger}\", query_builder=q)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupaby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupaby_city_count_all(BIBenchmarks.params[0])\n _df = self.get_query_groupaby_city_count_all(_df)\n arctic_df.sort_index(inplace=True)\n self.assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupaby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupaby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = self.get_query_groupaby_city_count_isin_filter(_df)\n arctic_df.sort_index(inplace=True)\n self.assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupaby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupaby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = self.get_query_groupaby_city_count_filter_two_aggregations(_df)\n arctic_df.sort_index(inplace=True)\n self.assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", - "name": "bi_benchmarks.BIBenchmarks.peakmem_query_groupaby_city_count_filter_two_aggregations", + "bi_benchmarks.BIBenchmarks.peakmem_query_groupby_city_count_all": { + "code": "class BIBenchmarks:\n def peakmem_query_groupby_city_count_all(self, times_bigger) -> pd.DataFrame:\n return 
self.query_groupby_city_count_all(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", + "name": "bi_benchmarks.BIBenchmarks.peakmem_query_groupby_city_count_all", "param_names": [ "param1" ], @@ -758,16 +758,70 @@ "10" ] ], - "setup_cache_key": "bi_benchmarks:41", + "setup_cache_key": "bi_benchmarks:61", "timeout": 6000, "type": "peakmemory", "unit": "bytes", - "version": "ff213cbd496202079b9405a8c788e658d986bfd83958b70262dfef80910d46f7" + "version": "576958b39e1560f56e73fa558989d2e101eecf9f5f36f4cc70604777fa4855b2" }, - "bi_benchmarks.BIBenchmarks.time_query_groupaby_city_count_all": { - "code": "class BIBenchmarks:\n def time_query_groupaby_city_count_all(self, times_bigger) -> pd.DataFrame:\n q = QueryBuilder()\n q = self.get_query_groupaby_city_count_all( q)\n df = self.lib.read(f\"{self.symbol}{times_bigger}\", query_builder=q)\n return df.data\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = 
os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupaby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupaby_city_count_all(BIBenchmarks.params[0])\n _df = self.get_query_groupaby_city_count_all(_df)\n arctic_df.sort_index(inplace=True)\n self.assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupaby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupaby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = self.get_query_groupaby_city_count_isin_filter(_df)\n arctic_df.sort_index(inplace=True)\n self.assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupaby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupaby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = self.get_query_groupaby_city_count_filter_two_aggregations(_df)\n arctic_df.sort_index(inplace=True)\n self.assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", + "bi_benchmarks.BIBenchmarks.peakmem_query_groupby_city_count_filter_two_aggregations": { + "code": "class BIBenchmarks:\n def peakmem_query_groupby_city_count_filter_two_aggregations(self, times_bigger):\n return self.query_groupby_city_count_filter_two_aggregations(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating 
N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", + "name": "bi_benchmarks.BIBenchmarks.peakmem_query_groupby_city_count_filter_two_aggregations", + "param_names": [ + "param1" + ], + "params": [ + [ + "1", + "10" + ] + ], + "setup_cache_key": "bi_benchmarks:61", + "timeout": 6000, + "type": "peakmemory", + "unit": "bytes", + "version": "00ae811ef6427d56921273b8d93c7443a1c71ed305edc73cf2375a167813bd53" + }, + "bi_benchmarks.BIBenchmarks.peakmem_query_groupby_city_count_isin_filter": { + "code": "class BIBenchmarks:\n def peakmem_query_groupby_city_count_isin_filter(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_isin_filter(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n 
_df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", + "name": "bi_benchmarks.BIBenchmarks.peakmem_query_groupby_city_count_isin_filter", + "param_names": [ + "param1" + ], + "params": [ + [ + "1", + "10" + ] + ], + "setup_cache_key": "bi_benchmarks:61", + "timeout": 6000, + "type": "peakmemory", + "unit": "bytes", + "version": "2ae348f65721858288f1940833c76de99d61d33fd8e21a5e9ef2958b208c8320" + }, + "bi_benchmarks.BIBenchmarks.peakmem_query_readall": { + "code": "class BIBenchmarks:\n def peakmem_query_readall(self, times_bigger):\n self.lib.read(f\"{self.symbol}{times_bigger}\")\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n 
print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", + "name": "bi_benchmarks.BIBenchmarks.peakmem_query_readall", + "param_names": [ + "param1" + ], + "params": [ + [ + "1", + "10" + ] + ], + "setup_cache_key": "bi_benchmarks:61", + "timeout": 6000, + "type": "peakmemory", + "unit": "bytes", + "version": "45dc0723cbde50cbd213a97e50084ae8457ff69fb12a842d9c48469fcda2caa3" + }, + "bi_benchmarks.BIBenchmarks.time_query_groupby_city_count_all": { + "code": "class BIBenchmarks:\n def time_query_groupby_city_count_all(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_all(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = 
self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", "min_run_count": 2, - "name": "bi_benchmarks.BIBenchmarks.time_query_groupaby_city_count_all", + "name": "bi_benchmarks.BIBenchmarks.time_query_groupby_city_count_all", "number": 2, "param_names": [ "param1" @@ -781,17 +835,17 @@ "repeat": 0, "rounds": 2, "sample_time": 0.01, - "setup_cache_key": "bi_benchmarks:41", + "setup_cache_key": "bi_benchmarks:61", "timeout": 6000, "type": "time", "unit": "seconds", - "version": "4f760b3e1f701aa3c896769a2d05bbfae0cbf5cc8a3e635de4f3e9d32b663506", + "version": "cc034dbad83f8695c4a670878f73e49b8ccb7548eb237cdbaeed0321fe4787ba", "warmup_time": -1 }, - "bi_benchmarks.BIBenchmarks.time_query_groupaby_city_count_filter_two_aggregations": { - "code": "class BIBenchmarks:\n def time_query_groupaby_city_count_filter_two_aggregations(self, times_bigger) -> pd.DataFrame:\n q = QueryBuilder()\n q = self.get_query_groupaby_city_count_filter_two_aggregations(q)\n df = self.lib.read(f\"{self.symbol}{times_bigger}\", query_builder=q)\n return df.data\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupaby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupaby_city_count_all(BIBenchmarks.params[0])\n _df = self.get_query_groupaby_city_count_all(_df)\n arctic_df.sort_index(inplace=True)\n self.assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupaby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupaby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = self.get_query_groupaby_city_count_isin_filter(_df)\n arctic_df.sort_index(inplace=True)\n self.assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupaby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = 
self.time_query_groupaby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = self.get_query_groupaby_city_count_filter_two_aggregations(_df)\n arctic_df.sort_index(inplace=True)\n self.assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", + "bi_benchmarks.BIBenchmarks.time_query_groupby_city_count_filter_two_aggregations": { + "code": "class BIBenchmarks:\n def time_query_groupby_city_count_filter_two_aggregations(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_filter_two_aggregations(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", "min_run_count": 2, - "name": "bi_benchmarks.BIBenchmarks.time_query_groupaby_city_count_filter_two_aggregations", + "name": "bi_benchmarks.BIBenchmarks.time_query_groupby_city_count_filter_two_aggregations", "number": 2, "param_names": [ "param1" @@ -805,17 +859,17 @@ "repeat": 0, "rounds": 2, "sample_time": 0.01, - "setup_cache_key": "bi_benchmarks:41", + "setup_cache_key": "bi_benchmarks:61", "timeout": 6000, "type": "time", "unit": "seconds", 
- "version": "8b96fabe7782128e3c76f03fff29299bc2d757c1638f027ddca60486be4fadb9", + "version": "9cdc08e3b0b8d92ffa8e4c6922e90417d82cdc653f3596ae38b729eac2cf00bb", "warmup_time": -1 }, - "bi_benchmarks.BIBenchmarks.time_query_groupaby_city_count_isin_filter": { - "code": "class BIBenchmarks:\n def time_query_groupaby_city_count_isin_filter(self, times_bigger) -> pd.DataFrame:\n q = QueryBuilder()\n q = self.get_query_groupaby_city_count_isin_filter(q)\n df = self.lib.read(f\"{self.symbol}{times_bigger}\", query_builder=q)\n return df.data\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupaby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupaby_city_count_all(BIBenchmarks.params[0])\n _df = self.get_query_groupaby_city_count_all(_df)\n arctic_df.sort_index(inplace=True)\n self.assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupaby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupaby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = self.get_query_groupaby_city_count_isin_filter(_df)\n arctic_df.sort_index(inplace=True)\n self.assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupaby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupaby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = self.get_query_groupaby_city_count_filter_two_aggregations(_df)\n arctic_df.sort_index(inplace=True)\n self.assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", + "bi_benchmarks.BIBenchmarks.time_query_groupby_city_count_isin_filter": { + "code": "class BIBenchmarks:\n def time_query_groupby_city_count_isin_filter(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_isin_filter(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = 
os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", "min_run_count": 2, - "name": "bi_benchmarks.BIBenchmarks.time_query_groupaby_city_count_isin_filter", + "name": "bi_benchmarks.BIBenchmarks.time_query_groupby_city_count_isin_filter", "number": 2, "param_names": [ "param1" @@ -829,15 +883,15 @@ "repeat": 0, "rounds": 2, "sample_time": 0.01, - "setup_cache_key": "bi_benchmarks:41", + "setup_cache_key": "bi_benchmarks:61", "timeout": 6000, "type": "time", "unit": "seconds", - "version": "6e2b0e5a9d85ffe2246dbb8573bb2209ba5ed7da040ed853950163e370a6ac69", + "version": "79b7c695f5c71eff57c7734047eb6b2d359b077c243444bb3ae2069cdfbc1011", "warmup_time": -1 }, "bi_benchmarks.BIBenchmarks.time_query_readall": { - "code": "class BIBenchmarks:\n def time_query_readall(self, times_bigger):\n self.lib.read(f\"{self.symbol}{times_bigger}\")\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet 
file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupaby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupaby_city_count_all(BIBenchmarks.params[0])\n _df = self.get_query_groupaby_city_count_all(_df)\n arctic_df.sort_index(inplace=True)\n self.assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupaby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupaby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = self.get_query_groupaby_city_count_isin_filter(_df)\n arctic_df.sort_index(inplace=True)\n self.assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupaby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupaby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = self.get_query_groupaby_city_count_filter_two_aggregations(_df)\n arctic_df.sort_index(inplace=True)\n self.assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", + "code": "class BIBenchmarks:\n def time_query_readall(self, times_bigger):\n self.lib.read(f\"{self.symbol}{times_bigger}\")\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n 
_df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", "min_run_count": 2, "name": "bi_benchmarks.BIBenchmarks.time_query_readall", "number": 2, @@ -853,11 +907,11 @@ "repeat": 0, "rounds": 2, "sample_time": 0.01, - "setup_cache_key": "bi_benchmarks:41", + "setup_cache_key": "bi_benchmarks:61", "timeout": 6000, "type": "time", "unit": "seconds", - "version": "1745f826de8878859b2f7e8855e678aa164f8817186c00acf15bd2542bad41e8", + "version": "fc198dfac3e8e832aaa7e0d3355d4038a4acf2ada7cbf9bc3ff34bf0f7c433b8", "warmup_time": -1 }, "list_functions.ListFunctions.peakmem_list_symbols": { diff --git a/python/benchmarks/bi_benchmarks.py b/python/benchmarks/bi_benchmarks.py index 7bdd822dff1..733244abef7 100644 --- a/python/benchmarks/bi_benchmarks.py +++ b/python/benchmarks/bi_benchmarks.py @@ -12,6 +12,26 @@ from .common import * + +def get_query_groupby_city_count_all(q): + return q.groupby("City").agg({"Keyword": "count"}) + + +def get_query_groupby_city_count_isin_filter(q): + return q[q["Keyword"].isin(["bimbo", "twat", "faggot"])].groupby("City").agg({"Keyword": "count"}) + + +def get_query_groupby_city_count_filter_two_aggregations(q): + return q[q["Keyword"] == "faggot" ].groupby("City").agg({"Keyword": "count", "Number of Records" : "sum"}) + +def assert_frame_equal(pandas_df:pd.DataFrame, arctic_df:pd.DataFrame): + arctic_df.sort_index(inplace=True) + pd.testing.assert_frame_equal(pandas_df, + arctic_df, + check_column_type=False, + check_dtype=False) + + class BIBenchmarks: ''' Sample test benchmark for using one opensource BI CSV source. 
@@ -68,43 +88,35 @@ def setup_cache(self):
                 for i in range(1, (BIBenchmarks.params[-1])):
                     dfcum = pd.concat([dfcum, self.df])
                 _df = dfcum
+            print("DF for iteration xSize original ready: ", num)
+            _df.info(verbose=True, memory_usage='deep')
             self.lib.write(f"{self.symbol}{num}", _df)
 
         print("If pandas query produces different dataframe than arctic one stop tests!")
         print("This will mean query problem is there most likely")
 
-        print("Pre-check correctness for query_groupaby_city_count_all")
+        print("Pre-check correctness for query_groupby_city_count_all")
         _df = self.df.copy(deep=True)
-        arctic_df = self.time_query_groupaby_city_count_all(BIBenchmarks.params[0])
-        _df = self.get_query_groupaby_city_count_all(_df)
-        arctic_df.sort_index(inplace=True)
-        self.assert_frame_equal(_df, arctic_df)
+        arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])
+        _df = get_query_groupby_city_count_all(_df)
+        assert_frame_equal(_df, arctic_df)
 
-        print("Pre-check correctness for query_groupaby_city_count_isin_filter")
+        print("Pre-check correctness for query_groupby_city_count_isin_filter")
         _df = self.df.copy(deep=True)
-        arctic_df = self.time_query_groupaby_city_count_isin_filter(BIBenchmarks.params[0])
-        _df = self.get_query_groupaby_city_count_isin_filter(_df)
-        arctic_df.sort_index(inplace=True)
-        self.assert_frame_equal(_df, arctic_df)
+        arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])
+        _df = get_query_groupby_city_count_isin_filter(_df)
+        assert_frame_equal(_df, arctic_df)
 
-        print("Pre-check correctness for query_groupaby_city_count_filter_two_aggregations")
+        print("Pre-check correctness for query_groupby_city_count_filter_two_aggregations")
         _df = self.df.copy(deep=True)
-        arctic_df = self.time_query_groupaby_city_count_filter_two_aggregations(BIBenchmarks.params[0])
-        _df = self.get_query_groupaby_city_count_filter_two_aggregations(_df)
-        arctic_df.sort_index(inplace=True)
-        self.assert_frame_equal(_df, arctic_df)
+        arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])
+        _df = get_query_groupby_city_count_filter_two_aggregations(_df)
+        assert_frame_equal(_df, arctic_df)
 
         print("All pre-checks completed SUCCESSFULLY")
 
         del self.ac
 
-    def assert_frame_equal(self, pandas_df:pd.DataFrame, arctic_df:pd.DataFrame):
-        arctic_df.sort_index(inplace=True)
-        pd.testing.assert_frame_equal(pandas_df,
-                                      arctic_df,
-                                      check_column_type=False,
-                                      check_dtype=False)
-
     def setup(self, num_rows):
         self.ac = Arctic(f"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB")
         self.lib = self.ac.get_library(self.lib_name)
@@ -114,36 +126,43 @@ def teardown(self, num_rows):
 
     def time_query_readall(self, times_bigger):
         self.lib.read(f"{self.symbol}{times_bigger}")
+
+    def peakmem_query_readall(self, times_bigger):
+        self.lib.read(f"{self.symbol}{times_bigger}")
 
-    def get_query_groupaby_city_count_all(self, q):
-        return q.groupby("City").agg({"Keyword": "count"})
-
-    def time_query_groupaby_city_count_all(self, times_bigger) -> pd.DataFrame:
+    def query_groupby_city_count_all(self, times_bigger) -> pd.DataFrame:
         q = QueryBuilder()
-        q = self.get_query_groupaby_city_count_all( q)
+        q = get_query_groupby_city_count_all(q)
         df = self.lib.read(f"{self.symbol}{times_bigger}", query_builder=q)
         return df.data
 
-    def get_query_groupaby_city_count_isin_filter(self, q):
-        return q[q["Keyword"].isin(["bimbo", "twat", "faggot"])].groupby("City").agg({"Keyword": "count"})
+    def time_query_groupby_city_count_all(self, times_bigger) -> pd.DataFrame:
+        return 
self.query_groupby_city_count_all(times_bigger) - def time_query_groupaby_city_count_isin_filter(self, times_bigger) -> pd.DataFrame: + def peakmem_query_groupby_city_count_all(self, times_bigger) -> pd.DataFrame: + return self.query_groupby_city_count_all(times_bigger) + + def query_groupby_city_count_isin_filter(self, times_bigger) -> pd.DataFrame: q = QueryBuilder() - q = self.get_query_groupaby_city_count_isin_filter(q) + q = get_query_groupby_city_count_isin_filter(q) df = self.lib.read(f"{self.symbol}{times_bigger}", query_builder=q) return df.data - def get_query_groupaby_city_count_filter_two_aggregations(self, q): - return q[q["Keyword"] == "faggot" ].groupby("City").agg({"Keyword": "count", "Number of Records" : "sum"}) - - def time_query_groupaby_city_count_filter_two_aggregations(self, times_bigger) -> pd.DataFrame: + def time_query_groupby_city_count_isin_filter(self, times_bigger) -> pd.DataFrame: + return self.query_groupby_city_count_isin_filter(times_bigger) + + def peakmem_query_groupby_city_count_isin_filter(self, times_bigger) -> pd.DataFrame: + return self.query_groupby_city_count_isin_filter(times_bigger) + + def query_groupby_city_count_filter_two_aggregations(self, times_bigger) -> pd.DataFrame: q = QueryBuilder() - q = self.get_query_groupaby_city_count_filter_two_aggregations(q) + q = get_query_groupby_city_count_filter_two_aggregations(q) df = self.lib.read(f"{self.symbol}{times_bigger}", query_builder=q) return df.data - def peakmem_query_groupaby_city_count_filter_two_aggregations(self, times_bigger): - q = QueryBuilder() - q = self.get_query_groupaby_city_count_filter_two_aggregations(q) - df = self.lib.read(f"{self.symbol}{times_bigger}", query_builder=q) + def time_query_groupby_city_count_filter_two_aggregations(self, times_bigger) -> pd.DataFrame: + return self.query_groupby_city_count_filter_two_aggregations(times_bigger) + + def peakmem_query_groupby_city_count_filter_two_aggregations(self, times_bigger): + return self.query_groupby_city_count_filter_two_aggregations(times_bigger)
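
Note on the refactor above: the `get_query_groupby_*` helpers are hoisted to module level because they work unchanged on either a pandas `DataFrame` or an ArcticDB `QueryBuilder` — both expose the same `groupby(...).agg(...)` surface — which is what lets `setup_cache` assert pandas/Arctic equivalence before any timings run. A minimal self-contained sketch of that dual-use idea (the toy data and the `demo` library name are illustrative, not part of this change):

```python
import pandas as pd
from arcticdb import Arctic, QueryBuilder

def get_query_groupby_city_count_all(q):
    # `q` can be a pandas DataFrame or an ArcticDB QueryBuilder:
    # both support groupby("City").agg({...}).
    return q.groupby("City").agg({"Keyword": "count"})

df = pd.DataFrame({"City": ["NY", "NY", "SF"], "Keyword": ["a", "b", "c"]})
expected = get_query_groupby_city_count_all(df)           # eager pandas result

ac = Arctic("lmdb://demo?map_size=1GB")                   # illustrative URI
lib = ac.get_library("demo", create_if_missing=True)
lib.write("sym1", df)
q = get_query_groupby_city_count_all(QueryBuilder())      # same helper, lazy query
actual = lib.read("sym1", query_builder=q).data
actual.sort_index(inplace=True)         # Arctic does not guarantee group order
pd.testing.assert_frame_equal(expected, actual,
                              check_column_type=False,    # relaxations mirror the
                              check_dtype=False)          # PR's assert_frame_equal
```

The relaxed comparison at the end is the same one the new module-level `assert_frame_equal` performs in the pre-checks, since aggregation result dtypes can legitimately differ between pandas and ArcticDB.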