From 19f00cb67424b9afb0c18cf7b85aba547a49f603 Mon Sep 17 00:00:00 2001 From: Georgi Rusev Date: Mon, 25 Nov 2024 20:17:40 +0200 Subject: [PATCH 1/4] upload zipped data for cache purposes --- python/data/CityMaxCapita_1.parquet.gzip | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 python/data/CityMaxCapita_1.parquet.gzip diff --git a/python/data/CityMaxCapita_1.parquet.gzip b/python/data/CityMaxCapita_1.parquet.gzip new file mode 100644 index 0000000000..bc43eb04e9 --- /dev/null +++ b/python/data/CityMaxCapita_1.parquet.gzip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94e20f150cf17cee5d9ec839b8af6fd7d1a7772e5a78a4e186d33229d63618cb +size 94110850 From 14c2cb36f9b3ed42ab5448e7c6a5dcf178f0ec04 Mon Sep 17 00:00:00 2001 From: Georgi Rusev Date: Mon, 25 Nov 2024 20:21:06 +0200 Subject: [PATCH 2/4] asv file + lfs rule for gzip and parquet files --- .gitattributes | 2 + python/.asv/results/benchmarks.json | 206 +++++++++++++++++++++++++++- python/benchmarks/bi_benchmarks.py | 168 +++++++++++++++++++++++ python/benchmarks/common.py | 152 ++++++++++++++++++++ 4 files changed, 522 insertions(+), 6 deletions(-) create mode 100644 .gitattributes create mode 100644 python/benchmarks/bi_benchmarks.py diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000..0508cddce6 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +*.gzip filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text diff --git a/python/.asv/results/benchmarks.json b/python/.asv/results/benchmarks.json index ff9122eb97..e2e8e7ab25 100644 --- a/python/.asv/results/benchmarks.json +++ b/python/.asv/results/benchmarks.json @@ -746,6 +746,174 @@ "version": "80de9b1982a498c300177d02874a8626152eccb57cd0ba4228a5bb168e7608c8", "warmup_time": -1 }, + "bi_benchmarks.BIBenchmarks.peakmem_query_groupby_city_count_all": { + "code": "class BIBenchmarks:\n def peakmem_query_groupby_city_count_all(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_all(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n 
print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", + "name": "bi_benchmarks.BIBenchmarks.peakmem_query_groupby_city_count_all", + "param_names": [ + "param1" + ], + "params": [ + [ + "1", + "10" + ] + ], + "setup_cache_key": "bi_benchmarks:61", + "timeout": 6000, + "type": "peakmemory", + "unit": "bytes", + "version": "576958b39e1560f56e73fa558989d2e101eecf9f5f36f4cc70604777fa4855b2" + }, + "bi_benchmarks.BIBenchmarks.peakmem_query_groupby_city_count_filter_two_aggregations": { + "code": "class BIBenchmarks:\n def peakmem_query_groupby_city_count_filter_two_aggregations(self, times_bigger):\n return self.query_groupby_city_count_filter_two_aggregations(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = 
self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", + "name": "bi_benchmarks.BIBenchmarks.peakmem_query_groupby_city_count_filter_two_aggregations", + "param_names": [ + "param1" + ], + "params": [ + [ + "1", + "10" + ] + ], + "setup_cache_key": "bi_benchmarks:61", + "timeout": 6000, + "type": "peakmemory", + "unit": "bytes", + "version": "00ae811ef6427d56921273b8d93c7443a1c71ed305edc73cf2375a167813bd53" + }, + "bi_benchmarks.BIBenchmarks.peakmem_query_groupby_city_count_isin_filter": { + "code": "class BIBenchmarks:\n def peakmem_query_groupby_city_count_isin_filter(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_isin_filter(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = 
get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", + "name": "bi_benchmarks.BIBenchmarks.peakmem_query_groupby_city_count_isin_filter", + "param_names": [ + "param1" + ], + "params": [ + [ + "1", + "10" + ] + ], + "setup_cache_key": "bi_benchmarks:61", + "timeout": 6000, + "type": "peakmemory", + "unit": "bytes", + "version": "2ae348f65721858288f1940833c76de99d61d33fd8e21a5e9ef2958b208c8320" + }, + "bi_benchmarks.BIBenchmarks.peakmem_query_readall": { + "code": "class BIBenchmarks:\n def peakmem_query_readall(self, times_bigger):\n self.lib.read(f\"{self.symbol}{times_bigger}\")\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", + "name": "bi_benchmarks.BIBenchmarks.peakmem_query_readall", + "param_names": [ + "param1" + ], + "params": [ + [ + "1", + "10" + ] + ], + "setup_cache_key": "bi_benchmarks:61", + "timeout": 6000, + "type": "peakmemory", + "unit": "bytes", + "version": "45dc0723cbde50cbd213a97e50084ae8457ff69fb12a842d9c48469fcda2caa3" + }, + 
"bi_benchmarks.BIBenchmarks.time_query_groupby_city_count_all": { + "code": "class BIBenchmarks:\n def time_query_groupby_city_count_all(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_all(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", + "min_run_count": 2, + "name": "bi_benchmarks.BIBenchmarks.time_query_groupby_city_count_all", + "number": 2, + "param_names": [ + "param1" + ], + "params": [ + [ + "1", + "10" + ] + ], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "setup_cache_key": "bi_benchmarks:61", + "timeout": 6000, + "type": "time", + "unit": "seconds", + "version": "cc034dbad83f8695c4a670878f73e49b8ccb7548eb237cdbaeed0321fe4787ba", + "warmup_time": -1 + }, + "bi_benchmarks.BIBenchmarks.time_query_groupby_city_count_filter_two_aggregations": { + "code": "class BIBenchmarks:\n def time_query_groupby_city_count_filter_two_aggregations(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_filter_two_aggregations(times_bigger)\n\n def setup(self, num_rows):\n self.ac = 
Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", + "min_run_count": 2, + "name": "bi_benchmarks.BIBenchmarks.time_query_groupby_city_count_filter_two_aggregations", + "number": 2, + "param_names": [ + "param1" + ], + "params": [ + [ + "1", + "10" + ] + ], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "setup_cache_key": "bi_benchmarks:61", + "timeout": 6000, + "type": "time", + "unit": "seconds", + "version": "9cdc08e3b0b8d92ffa8e4c6922e90417d82cdc653f3596ae38b729eac2cf00bb", + "warmup_time": -1 + }, + "bi_benchmarks.BIBenchmarks.time_query_groupby_city_count_isin_filter": { + "code": "class BIBenchmarks:\n def time_query_groupby_city_count_isin_filter(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_isin_filter(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = 
pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", + "min_run_count": 2, + "name": "bi_benchmarks.BIBenchmarks.time_query_groupby_city_count_isin_filter", + "number": 2, + "param_names": [ + "param1" + ], + "params": [ + [ + "1", + "10" + ] + ], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "setup_cache_key": "bi_benchmarks:61", + "timeout": 6000, + "type": "time", + "unit": "seconds", + "version": "79b7c695f5c71eff57c7734047eb6b2d359b077c243444bb3ae2069cdfbc1011", + "warmup_time": -1 + }, + "bi_benchmarks.BIBenchmarks.time_query_readall": { + "code": "class BIBenchmarks:\n def time_query_readall(self, times_bigger):\n self.lib.read(f\"{self.symbol}{times_bigger}\")\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = 
Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", + "min_run_count": 2, + "name": "bi_benchmarks.BIBenchmarks.time_query_readall", + "number": 2, + "param_names": [ + "param1" + ], + "params": [ + [ + "1", + "10" + ] + ], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "setup_cache_key": "bi_benchmarks:61", + "timeout": 6000, + "type": "time", + "unit": "seconds", + "version": "fc198dfac3e8e832aaa7e0d3355d4038a4acf2ada7cbf9bc3ff34bf0f7c433b8", + "warmup_time": -1 + }, "list_functions.ListFunctions.peakmem_list_symbols": { "code": "class ListFunctions:\n def peakmem_list_symbols(self, num_symbols):\n self.lib.list_symbols()\n\n def setup(self, num_symbols):\n self.ac = Arctic(\"lmdb://list_functions\")\n self.lib = self.ac[f\"{num_symbols}_num_symbols\"]\n\n def setup_cache(self):\n self.ac = Arctic(\"lmdb://list_functions\")\n \n num_symbols = ListFunctions.params\n for syms in num_symbols:\n lib_name = f\"{syms}_num_symbols\"\n self.ac.delete_library(lib_name)\n lib = self.ac.create_library(lib_name)\n for sym in range(syms):\n lib.write(f\"{sym}_sym\", generate_benchmark_df(ListFunctions.rows))", "name": "list_functions.ListFunctions.peakmem_list_symbols", @@ -1245,7 +1413,7 @@ "warmup_time": -1 }, "resample.Resample.peakmem_resample": { - "code": "class Resample:\n def peakmem_resample(self, num_rows, downsampling_factor, col_type, aggregation):\n if col_type == \"datetime\" and aggregation == \"sum\" or col_type == \"str\" and aggregation in [\"sum\", \"mean\", \"min\", \"max\"]:\n raise SkipNotImplemented(f\"{aggregation} not supported on columns of type {col_type}\")\n else:\n self.lib.read(col_type, date_range=self.date_range, query_builder=self.query_builder)\n\n def setup(self, num_rows, downsampling_factor, col_type, aggregation):\n self.ac = Arctic(self.CONNECTION_STRING)\n 
self.lib = self.ac[self.LIB_NAME]\n self.date_range = (pd.Timestamp(0), pd.Timestamp(num_rows, unit=\"us\"))\n self.query_builder = QueryBuilder().resample(f\"{downsampling_factor}us\").agg({\"col\": aggregation})\n\n def setup_cache(self):\n ac = Arctic(self.CONNECTION_STRING)\n ac.delete_library(self.LIB_NAME)\n lib = ac.create_library(self.LIB_NAME)\n rng = np.random.default_rng()\n col_types = self.params[2]\n rows = max(self.params[0])\n for col_type in col_types:\n if col_type == \"str\":\n num_unique_strings = 100\n unique_strings = random_strings_of_length(num_unique_strings, 10, True)\n sym = col_type\n num_segments = rows // self.ROWS_PER_SEGMENT\n for idx in range(num_segments):\n index = pd.date_range(pd.Timestamp(idx * self.ROWS_PER_SEGMENT, unit=\"us\"), freq=\"us\", periods=self.ROWS_PER_SEGMENT)\n if col_type == \"int\":\n col_data = rng.integers(0, 100_000, self.ROWS_PER_SEGMENT)\n elif col_type == \"bool\":\n col_data = rng.integers(0, 2, self.ROWS_PER_SEGMENT)\n col_data = col_data.astype(bool)\n elif col_type == \"float\":\n col_data = 100_000 * rng.random(self.ROWS_PER_SEGMENT)\n elif col_type == \"datetime\":\n col_data = rng.integers(0, 100_000, self.ROWS_PER_SEGMENT)\n col_data = col_data.astype(\"datetime64[s]\")\n elif col_type == \"str\":\n col_data = np.random.choice(unique_strings, self.ROWS_PER_SEGMENT)\n df = pd.DataFrame({\"col\": col_data}, index=index)\n lib.append(sym, df)", + "code": "class Resample:\n def peakmem_resample(self, num_rows, downsampling_factor, col_type, aggregation):\n if col_type == \"datetime\" and aggregation == \"sum\" or col_type == \"str\" and aggregation in [\"sum\", \"mean\", \"min\", \"max\"]:\n pass\n # Use this when upgrading to ASV 0.6.0 or later\n # raise SkipNotImplemented(f\"{aggregation} not supported on columns of type {col_type}\")\n else:\n self.lib.read(col_type, date_range=self.date_range, query_builder=self.query_builder)\n\n def setup(self, num_rows, downsampling_factor, col_type, aggregation):\n self.ac = Arctic(self.CONNECTION_STRING)\n self.lib = self.ac[self.LIB_NAME]\n self.date_range = (pd.Timestamp(0), pd.Timestamp(num_rows, unit=\"us\"))\n self.query_builder = QueryBuilder().resample(f\"{downsampling_factor}us\").agg({\"col\": aggregation})\n\n def setup_cache(self):\n ac = Arctic(self.CONNECTION_STRING)\n ac.delete_library(self.LIB_NAME)\n lib = ac.create_library(self.LIB_NAME)\n rng = np.random.default_rng()\n col_types = self.params[2]\n rows = max(self.params[0])\n for col_type in col_types:\n if col_type == \"str\":\n num_unique_strings = 100\n unique_strings = random_strings_of_length(num_unique_strings, 10, True)\n sym = col_type\n num_segments = rows // self.ROWS_PER_SEGMENT\n for idx in range(num_segments):\n index = pd.date_range(pd.Timestamp(idx * self.ROWS_PER_SEGMENT, unit=\"us\"), freq=\"us\", periods=self.ROWS_PER_SEGMENT)\n if col_type == \"int\":\n col_data = rng.integers(0, 100_000, self.ROWS_PER_SEGMENT)\n elif col_type == \"bool\":\n col_data = rng.integers(0, 2, self.ROWS_PER_SEGMENT)\n col_data = col_data.astype(bool)\n elif col_type == \"float\":\n col_data = 100_000 * rng.random(self.ROWS_PER_SEGMENT)\n elif col_type == \"datetime\":\n col_data = rng.integers(0, 100_000, self.ROWS_PER_SEGMENT)\n col_data = col_data.astype(\"datetime64[s]\")\n elif col_type == \"str\":\n col_data = np.random.choice(unique_strings, self.ROWS_PER_SEGMENT)\n df = pd.DataFrame({\"col\": col_data}, index=index)\n lib.append(sym, df)", "name": "resample.Resample.peakmem_resample", "param_names": [ 
"num_rows", @@ -1280,13 +1448,13 @@ "'count'" ] ], - "setup_cache_key": "resample:38", + "setup_cache_key": "resample:37", "type": "peakmemory", "unit": "bytes", - "version": "e64300ebb5bd625e1a0f3774aadd035e5738b41295ec2a8ce082d2e9add9b580" + "version": "760c9d62e17a5467f1e93abb258d89057e8fdf9ee67d98ceb376e731157a4d2e" }, "resample.Resample.time_resample": { - "code": "class Resample:\n def time_resample(self, num_rows, downsampling_factor, col_type, aggregation):\n if col_type == \"datetime\" and aggregation == \"sum\" or col_type == \"str\" and aggregation in [\"sum\", \"mean\", \"min\", \"max\"]:\n raise SkipNotImplemented(f\"{aggregation} not supported on columns of type {col_type}\")\n else:\n self.lib.read(col_type, date_range=self.date_range, query_builder=self.query_builder)\n\n def setup(self, num_rows, downsampling_factor, col_type, aggregation):\n self.ac = Arctic(self.CONNECTION_STRING)\n self.lib = self.ac[self.LIB_NAME]\n self.date_range = (pd.Timestamp(0), pd.Timestamp(num_rows, unit=\"us\"))\n self.query_builder = QueryBuilder().resample(f\"{downsampling_factor}us\").agg({\"col\": aggregation})\n\n def setup_cache(self):\n ac = Arctic(self.CONNECTION_STRING)\n ac.delete_library(self.LIB_NAME)\n lib = ac.create_library(self.LIB_NAME)\n rng = np.random.default_rng()\n col_types = self.params[2]\n rows = max(self.params[0])\n for col_type in col_types:\n if col_type == \"str\":\n num_unique_strings = 100\n unique_strings = random_strings_of_length(num_unique_strings, 10, True)\n sym = col_type\n num_segments = rows // self.ROWS_PER_SEGMENT\n for idx in range(num_segments):\n index = pd.date_range(pd.Timestamp(idx * self.ROWS_PER_SEGMENT, unit=\"us\"), freq=\"us\", periods=self.ROWS_PER_SEGMENT)\n if col_type == \"int\":\n col_data = rng.integers(0, 100_000, self.ROWS_PER_SEGMENT)\n elif col_type == \"bool\":\n col_data = rng.integers(0, 2, self.ROWS_PER_SEGMENT)\n col_data = col_data.astype(bool)\n elif col_type == \"float\":\n col_data = 100_000 * rng.random(self.ROWS_PER_SEGMENT)\n elif col_type == \"datetime\":\n col_data = rng.integers(0, 100_000, self.ROWS_PER_SEGMENT)\n col_data = col_data.astype(\"datetime64[s]\")\n elif col_type == \"str\":\n col_data = np.random.choice(unique_strings, self.ROWS_PER_SEGMENT)\n df = pd.DataFrame({\"col\": col_data}, index=index)\n lib.append(sym, df)", + "code": "class Resample:\n def time_resample(self, num_rows, downsampling_factor, col_type, aggregation):\n if col_type == \"datetime\" and aggregation == \"sum\" or col_type == \"str\" and aggregation in [\"sum\", \"mean\", \"min\", \"max\"]:\n pass\n # Use this when upgrading to ASV 0.6.0 or later\n # raise SkipNotImplemented(f\"{aggregation} not supported on columns of type {col_type}\")\n else:\n self.lib.read(col_type, date_range=self.date_range, query_builder=self.query_builder)\n\n def setup(self, num_rows, downsampling_factor, col_type, aggregation):\n self.ac = Arctic(self.CONNECTION_STRING)\n self.lib = self.ac[self.LIB_NAME]\n self.date_range = (pd.Timestamp(0), pd.Timestamp(num_rows, unit=\"us\"))\n self.query_builder = QueryBuilder().resample(f\"{downsampling_factor}us\").agg({\"col\": aggregation})\n\n def setup_cache(self):\n ac = Arctic(self.CONNECTION_STRING)\n ac.delete_library(self.LIB_NAME)\n lib = ac.create_library(self.LIB_NAME)\n rng = np.random.default_rng()\n col_types = self.params[2]\n rows = max(self.params[0])\n for col_type in col_types:\n if col_type == \"str\":\n num_unique_strings = 100\n unique_strings = random_strings_of_length(num_unique_strings, 10, 
True)\n sym = col_type\n num_segments = rows // self.ROWS_PER_SEGMENT\n for idx in range(num_segments):\n index = pd.date_range(pd.Timestamp(idx * self.ROWS_PER_SEGMENT, unit=\"us\"), freq=\"us\", periods=self.ROWS_PER_SEGMENT)\n if col_type == \"int\":\n col_data = rng.integers(0, 100_000, self.ROWS_PER_SEGMENT)\n elif col_type == \"bool\":\n col_data = rng.integers(0, 2, self.ROWS_PER_SEGMENT)\n col_data = col_data.astype(bool)\n elif col_type == \"float\":\n col_data = 100_000 * rng.random(self.ROWS_PER_SEGMENT)\n elif col_type == \"datetime\":\n col_data = rng.integers(0, 100_000, self.ROWS_PER_SEGMENT)\n col_data = col_data.astype(\"datetime64[s]\")\n elif col_type == \"str\":\n col_data = np.random.choice(unique_strings, self.ROWS_PER_SEGMENT)\n df = pd.DataFrame({\"col\": col_data}, index=index)\n lib.append(sym, df)", "min_run_count": 2, "name": "resample.Resample.time_resample", "number": 5, @@ -1326,10 +1494,36 @@ "repeat": 0, "rounds": 2, "sample_time": 0.01, - "setup_cache_key": "resample:38", + "setup_cache_key": "resample:37", + "type": "time", + "unit": "seconds", + "version": "1381d2db90e66cb5cd04febf62398827a3ac9928795eaced908daec35d5c0c31", + "warmup_time": -1 + }, + "resample.ResampleWide.peakmem_resample_wide": { + "code": "class ResampleWide:\n def peakmem_resample_wide(self):\n self.lib.read(self.SYM, query_builder=self.query_builder)\n\n def setup(self):\n self.ac = Arctic(self.CONNECTION_STRING)\n self.lib = self.ac[self.LIB_NAME]\n aggs = dict()\n for col in self.COLS:\n aggs[col] = \"last\"\n self.query_builder = QueryBuilder().resample(\"30us\").agg(aggs)\n\n def setup_cache(self):\n ac = Arctic(self.CONNECTION_STRING)\n ac.delete_library(self.LIB_NAME)\n lib = ac.create_library(self.LIB_NAME)\n rng = np.random.default_rng()\n num_rows = 3000\n index = pd.date_range(pd.Timestamp(0, unit=\"us\"), freq=\"us\", periods=num_rows)\n data = dict()\n for col in self.COLS:\n data[col] = 100 * rng.random(num_rows, dtype=np.float64)\n df = pd.DataFrame(data, index=index)\n lib.write(self.SYM, df)", + "name": "resample.ResampleWide.peakmem_resample_wide", + "param_names": [], + "params": [], + "setup_cache_key": "resample:103", + "type": "peakmemory", + "unit": "bytes", + "version": "53f042192048c92d282637c1bbcee9e52dacec9086c534782de30d7ff67e77eb" + }, + "resample.ResampleWide.time_resample_wide": { + "code": "class ResampleWide:\n def time_resample_wide(self):\n self.lib.read(self.SYM, query_builder=self.query_builder)\n\n def setup(self):\n self.ac = Arctic(self.CONNECTION_STRING)\n self.lib = self.ac[self.LIB_NAME]\n aggs = dict()\n for col in self.COLS:\n aggs[col] = \"last\"\n self.query_builder = QueryBuilder().resample(\"30us\").agg(aggs)\n\n def setup_cache(self):\n ac = Arctic(self.CONNECTION_STRING)\n ac.delete_library(self.LIB_NAME)\n lib = ac.create_library(self.LIB_NAME)\n rng = np.random.default_rng()\n num_rows = 3000\n index = pd.date_range(pd.Timestamp(0, unit=\"us\"), freq=\"us\", periods=num_rows)\n data = dict()\n for col in self.COLS:\n data[col] = 100 * rng.random(num_rows, dtype=np.float64)\n df = pd.DataFrame(data, index=index)\n lib.write(self.SYM, df)", + "min_run_count": 2, + "name": "resample.ResampleWide.time_resample_wide", + "number": 5, + "param_names": [], + "params": [], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "setup_cache_key": "resample:103", "type": "time", "unit": "seconds", - "version": "2d10a27f3668632f382e90783829b4bb08cabb656c02754c00d5953ee42f3794", + "version": 
"ece714f981e8de31ee8296644624bf8f5fb895e6bf48d64a6ae2a9c50c5db7a2", "warmup_time": -1 }, "version": 2, diff --git a/python/benchmarks/bi_benchmarks.py b/python/benchmarks/bi_benchmarks.py new file mode 100644 index 0000000000..733244abef --- /dev/null +++ b/python/benchmarks/bi_benchmarks.py @@ -0,0 +1,168 @@ +""" +Copyright 2023 Man Group Operations Limited + +Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt. + +As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0. +""" +import os +from pathlib import Path +from arcticdb import Arctic +from arcticdb.version_store.processing import QueryBuilder + +from .common import * + + +def get_query_groupby_city_count_all(q): + return q.groupby("City").agg({"Keyword": "count"}) + + +def get_query_groupby_city_count_isin_filter(q): + return q[q["Keyword"].isin(["bimbo", "twat", "faggot"])].groupby("City").agg({"Keyword": "count"}) + + +def get_query_groupby_city_count_filter_two_aggregations(q): + return q[q["Keyword"] == "faggot" ].groupby("City").agg({"Keyword": "count", "Number of Records" : "sum"}) + +def assert_frame_equal(pandas_df:pd.DataFrame, arctic_df:pd.DataFrame): + arctic_df.sort_index(inplace=True) + pd.testing.assert_frame_equal(pandas_df, + arctic_df, + check_column_type=False, + check_dtype=False) + + +class BIBenchmarks: + ''' + Sample test benchmark for using one opensource BI CSV source. + The logic of a test is + - download if parquet file does not exists source in .bz2 format + - convert it to parquet format + - prepare library with it containing several symbols that are constructed based on this DF + - for each query we want to benchmark do a pre-check that this query produces SAME result on Pandas and arcticDB + - run the benchmark tests + ''' + + + number = 2 + timeout = 6000 + LIB_NAME = "BI_benchmark_lib" + # We use dataframe in this file + CITY_BI_FILE = "data/CityMaxCapita_1.csv.bz2" + CITY_BI_FILE2 = "data/CityMaxCapita_1.parquet.gzip" + + #Defines how many times bigger the database is + params = [1, 10] + + def __init__(self): + self.lib_name = BIBenchmarks.LIB_NAME + self.symbol = self.lib_name + + def setup_cache(self): + + file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2) + if (not os.path.exists(file)) : + dfo = download_and_process_city_to_parquet(file) + dff = pd.read_parquet(file) + pd.testing.assert_frame_equal(dfo,dff) + else: + print("Parquet file exists!") + + # read data from bz.2 file + # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE) + # self.df : pd.DataFrame = process_city(abs_path) + + self.df : pd.DataFrame = pd.read_parquet(file) + + self.ac = Arctic(f"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB") + self.ac.delete_library(self.lib_name) + self.lib = self.ac.create_library(self.lib_name) + + print("The procedure is creating N times larger dataframes") + print("by concatenating original DF N times") + for num in BIBenchmarks.params: + _df = self.df.copy(deep=True) + if (num > 1): + # lets create N times bigger DF + dfcum = self.df.copy(deep=True) + for i in range(1, (BIBenchmarks.params[-1])): + dfcum = pd.concat([dfcum, self.df]) + _df = dfcum + print("DF for iterration xSize original ready: ", num) + _df.info(verbose=True,memory_usage='deep') + self.lib.write(f"{self.symbol}{num}", _df) + + print("If pandas query produces different dataframe 
than arctic one stop tests!") + print("This will mean query problem is there most likely") + + print("Pre-check correctness for query_groupby_city_count_all") + _df = self.df.copy(deep=True) + arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0]) + _df = get_query_groupby_city_count_all(_df) + assert_frame_equal(_df, arctic_df) + + print("Pre-check correctness for query_groupby_city_count_isin_filter") + _df = self.df.copy(deep=True) + arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0]) + _df = get_query_groupby_city_count_isin_filter(_df) + assert_frame_equal(_df, arctic_df) + + print("Pre-check correctness for query_groupby_city_count_filter_two_aggregations") + _df = self.df.copy(deep=True) + arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0]) + _df = get_query_groupby_city_count_filter_two_aggregations(_df) + assert_frame_equal(_df, arctic_df) + + print("All pre-checks completed SUCCESSFULLY") + + del self.ac + + def setup(self, num_rows): + self.ac = Arctic(f"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB") + self.lib = self.ac.get_library(self.lib_name) + + def teardown(self, num_rows): + del self.ac + + def time_query_readall(self, times_bigger): + self.lib.read(f"{self.symbol}{times_bigger}") + + def peakmem_query_readall(self, times_bigger): + self.lib.read(f"{self.symbol}{times_bigger}") + + def query_groupby_city_count_all(self, times_bigger) -> pd.DataFrame: + q = QueryBuilder() + q = get_query_groupby_city_count_all( q) + df = self.lib.read(f"{self.symbol}{times_bigger}", query_builder=q) + return df.data + + def time_query_groupby_city_count_all(self, times_bigger) -> pd.DataFrame: + return self.query_groupby_city_count_all(times_bigger) + + def peakmem_query_groupby_city_count_all(self, times_bigger) -> pd.DataFrame: + return self.query_groupby_city_count_all(times_bigger) + + def query_groupby_city_count_isin_filter(self, times_bigger) -> pd.DataFrame: + q = QueryBuilder() + q = get_query_groupby_city_count_isin_filter(q) + df = self.lib.read(f"{self.symbol}{times_bigger}", query_builder=q) + return df.data + + def time_query_groupby_city_count_isin_filter(self, times_bigger) -> pd.DataFrame: + return self.query_groupby_city_count_isin_filter(times_bigger) + + def peakmem_query_groupby_city_count_isin_filter(self, times_bigger) -> pd.DataFrame: + return self.query_groupby_city_count_isin_filter(times_bigger) + + def query_groupby_city_count_filter_two_aggregations(self, times_bigger) -> pd.DataFrame: + q = QueryBuilder() + q = get_query_groupby_city_count_filter_two_aggregations(q) + df = self.lib.read(f"{self.symbol}{times_bigger}", query_builder=q) + return df.data + + def time_query_groupby_city_count_filter_two_aggregations(self, times_bigger) -> pd.DataFrame: + return self.query_groupby_city_count_filter_two_aggregations(times_bigger) + + def peakmem_query_groupby_city_count_filter_two_aggregations(self, times_bigger): + return self.query_groupby_city_count_filter_two_aggregations(times_bigger) + diff --git a/python/benchmarks/common.py b/python/benchmarks/common.py index e538309b27..d5839f2bf0 100644 --- a/python/benchmarks/common.py +++ b/python/benchmarks/common.py @@ -5,9 +5,14 @@ As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0. 
""" +import urllib.parse import pandas as pd import numpy as np +import os +import bz2 +import urllib.request + def generate_pseudo_random_dataframe(n, freq="s", end_timestamp="1/1/2023"): """ @@ -68,3 +73,150 @@ def generate_benchmark_df(n, freq="min", end_timestamp="1/1/2023"): def get_prewritten_lib_name(rows): return f"prewritten_{rows}" + + +def get_filename_from_url(url): + parsed_url = urllib.parse.urlparse(url) + return os.path.basename(parsed_url.path) + + +def download_file(url: str) -> str: + """ + Downloads file from specific location and then saves + it under same name at current directory. + Returns the name of file just saved + """ + print("Downloading file from: ", url) + name = get_filename_from_url(url) + urllib.request.urlretrieve(url, name) + print("File downloaded: ", name) + return name + +def download_and_process_city_to_parquet(save_to_file:str) -> pd.DataFrame : + ''' + Downloads CSV from a location then saves it in gziped parqet + ''' + name = download_file("http://www.cwi.nl/~boncz/PublicBIbenchmark/CityMaxCapita/CityMaxCapita_1.csv.bz2") + name = decompress_bz2_file(name) + df : pd.DataFrame = read_city(name) + location = os.path.join(save_to_file) + directory = os.path.dirname(location) + if not os.path.exists(directory): + os.makedirs(directory) + print("Saving dataframe to gzip/parquet file: " ,location) + df.to_parquet(location, + compression='gzip', + index=True) + return df + +def decompress_bz2_file(name: str) -> str: + """ + Decompresses a bz2 file and saves content in + a text file having same name (without bz.2 extensions) + in current directory. + Returns the name of the saved file + """ + print("Decompressing file: ", name) + nn = name.replace(".bz2", "") + new_name = os.path.basename(nn) + + with bz2.open(name, 'rb') as input_file: + decompressed_data = input_file.read() + + with open(new_name, 'wb') as output_file: + output_file.write(decompressed_data) + + print("Decompressed file: ", new_name) + + return new_name + +def read_city(file1:str): + """ + Data source: + https://github.com/cwida/public_bi_benchmark/blob/master/benchmark/CityMaxCapita/queries/11.sql + + As CSV file contains nulls in int and float we fix those programatically + """ + columns =[ + "City/Admin", + "City/State", + "City", + "Created Date/Time", + "Date Joined", + "FF Ratio", + "Favorites", + "First Link in Tweet", + "Followers", + "Following", + "Gender", + "Influencer?", + "Keyword", + "LPF", + "Language", + "Lat", + "Listed Number", + "Long Domain", + "Long", + "Number of Records", + "Region", + "Short Domain", + "State/Country", + "State", + "Tweet Text", + "Tweets", + "Twitter Client", + "User Bio", + "User Loc", + "Username 1", + "Username" + ] + types = { + "City/Admin" : str, + "City/State" : str, + "City" : str, + "Created Date/Time" : np.float64, + "Date Joined" : np.float64, + "FF Ratio" : np.float64, + "Favorites" : np.int32, + "First Link in Tweet" : str, + "Followers" : np.int32, + "Following" : np.int32, + "Gender" : str, + "Influencer?" 
: pd.Int32Dtype(), + "Keyword" : str, + "LPF" : np.float64, + "Language" : str, + "Lat" : np.float64, + "Listed Number" : pd.Int32Dtype(), + "Long Domain" : str, + "Long" : np.float64, + "Number of Records" : np.int32, + "Region" : str, + "Short Domain" : str, + "State/Country" : str, + "State" : str, + "Tweet Text" : str, + "Tweets" : np.int32, + "Twitter Client" : str, + "User Bio" : str, + "User Loc" : str, + "Username 1" : str, + "Username" : str + } + + df = pd.read_csv(file1, sep="|", + header=None, + dtype=types, + names=columns, + ) + + df["Influencer?"]=df["Influencer?"].fillna(0).astype(np.int32) + df["Listed Number"]=df["Listed Number"].fillna(0).astype(np.int32) + + return df + +def process_city(fileloc:str) -> pd.DataFrame : + # read data from bz.2 file + name = decompress_bz2_file(fileloc) + df : pd.DataFrame = read_city(name) + return df From 61e4f027dfd694fe62d426bed15f76ec8ac78b93 Mon Sep 17 00:00:00 2001 From: Georgi Rusev Date: Mon, 9 Dec 2024 11:32:23 +0200 Subject: [PATCH 3/4] code on comments --- python/.asv/results/benchmarks.json | 48 ++++++++++++++--------------- python/benchmarks/bi_benchmarks.py | 45 +++++++++++++++++++++------ 2 files changed, 60 insertions(+), 33 deletions(-) diff --git a/python/.asv/results/benchmarks.json b/python/.asv/results/benchmarks.json index e2e8e7ab25..1927fb9f10 100644 --- a/python/.asv/results/benchmarks.json +++ b/python/.asv/results/benchmarks.json @@ -747,7 +747,7 @@ "warmup_time": -1 }, "bi_benchmarks.BIBenchmarks.peakmem_query_groupby_city_count_all": { - "code": "class BIBenchmarks:\n def peakmem_query_groupby_city_count_all(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_all(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check 
correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", + "code": "class BIBenchmarks:\n def peakmem_query_groupby_city_count_all(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_all(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n start_time = time.time()\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n print(\"Size of original Dataframe: \", self.df.shape[0])\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n arr = [dfcum]\n for i in range(1, (BIBenchmarks.params[-1])):\n arr.append(dfcum)\n dfcum = pd.concat(arr)\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY. 
Time: \", time.time() - start_time)\n \n del self.ac", "name": "bi_benchmarks.BIBenchmarks.peakmem_query_groupby_city_count_all", "param_names": [ "param1" @@ -758,14 +758,14 @@ "10" ] ], - "setup_cache_key": "bi_benchmarks:61", + "setup_cache_key": "bi_benchmarks:67", "timeout": 6000, "type": "peakmemory", "unit": "bytes", - "version": "576958b39e1560f56e73fa558989d2e101eecf9f5f36f4cc70604777fa4855b2" + "version": "b4bc970aa24d4a74e3062d17e17e617ac36b67dfb351d8762325d4496ebb677d" }, "bi_benchmarks.BIBenchmarks.peakmem_query_groupby_city_count_filter_two_aggregations": { - "code": "class BIBenchmarks:\n def peakmem_query_groupby_city_count_filter_two_aggregations(self, times_bigger):\n return self.query_groupby_city_count_filter_two_aggregations(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", + "code": "class BIBenchmarks:\n def peakmem_query_groupby_city_count_filter_two_aggregations(self, times_bigger):\n return self.query_groupby_city_count_filter_two_aggregations(times_bigger)\n\n def setup(self, num_rows):\n self.ac = 
Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n start_time = time.time()\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n print(\"Size of original Dataframe: \", self.df.shape[0])\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n arr = [dfcum]\n for i in range(1, (BIBenchmarks.params[-1])):\n arr.append(dfcum)\n dfcum = pd.concat(arr)\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY. 
Time: \", time.time() - start_time)\n \n del self.ac", "name": "bi_benchmarks.BIBenchmarks.peakmem_query_groupby_city_count_filter_two_aggregations", "param_names": [ "param1" @@ -776,14 +776,14 @@ "10" ] ], - "setup_cache_key": "bi_benchmarks:61", + "setup_cache_key": "bi_benchmarks:67", "timeout": 6000, "type": "peakmemory", "unit": "bytes", - "version": "00ae811ef6427d56921273b8d93c7443a1c71ed305edc73cf2375a167813bd53" + "version": "8e74c83adabb49c330f8d269d1172f18829de567c7ab02bd00c521ba028ca56e" }, "bi_benchmarks.BIBenchmarks.peakmem_query_groupby_city_count_isin_filter": { - "code": "class BIBenchmarks:\n def peakmem_query_groupby_city_count_isin_filter(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_isin_filter(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", + "code": "class BIBenchmarks:\n def peakmem_query_groupby_city_count_isin_filter(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_isin_filter(times_bigger)\n\n def setup(self, num_rows):\n self.ac = 
Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n start_time = time.time()\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n print(\"Size of original Dataframe: \", self.df.shape[0])\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n arr = [dfcum]\n for i in range(1, (BIBenchmarks.params[-1])):\n arr.append(dfcum)\n dfcum = pd.concat(arr)\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY. 
Time: \", time.time() - start_time)\n \n del self.ac", "name": "bi_benchmarks.BIBenchmarks.peakmem_query_groupby_city_count_isin_filter", "param_names": [ "param1" @@ -794,14 +794,14 @@ "10" ] ], - "setup_cache_key": "bi_benchmarks:61", + "setup_cache_key": "bi_benchmarks:67", "timeout": 6000, "type": "peakmemory", "unit": "bytes", - "version": "2ae348f65721858288f1940833c76de99d61d33fd8e21a5e9ef2958b208c8320" + "version": "813128d0d0581f76a432fdaf555b0865bfe9a5fe7d7c31c21e0a2b44758a279a" }, "bi_benchmarks.BIBenchmarks.peakmem_query_readall": { - "code": "class BIBenchmarks:\n def peakmem_query_readall(self, times_bigger):\n self.lib.read(f\"{self.symbol}{times_bigger}\")\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", + "code": "class BIBenchmarks:\n def peakmem_query_readall(self, times_bigger):\n self.lib.read(f\"{self.symbol}{times_bigger}\")\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n start_time = time.time()\n \n file = 
os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n print(\"Size of original Dataframe: \", self.df.shape[0])\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n arr = [dfcum]\n for i in range(1, (BIBenchmarks.params[-1])):\n arr.append(dfcum)\n dfcum = pd.concat(arr)\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY. 
Time: \", time.time() - start_time)\n \n del self.ac", "name": "bi_benchmarks.BIBenchmarks.peakmem_query_readall", "param_names": [ "param1" @@ -812,14 +812,14 @@ "10" ] ], - "setup_cache_key": "bi_benchmarks:61", + "setup_cache_key": "bi_benchmarks:67", "timeout": 6000, "type": "peakmemory", "unit": "bytes", - "version": "45dc0723cbde50cbd213a97e50084ae8457ff69fb12a842d9c48469fcda2caa3" + "version": "9e5376528cf402528bdba44b37a3fd5566f8bd4ccf1267a837ce7478f9e7a496" }, "bi_benchmarks.BIBenchmarks.time_query_groupby_city_count_all": { - "code": "class BIBenchmarks:\n def time_query_groupby_city_count_all(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_all(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", + "code": "class BIBenchmarks:\n def time_query_groupby_city_count_all(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_all(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def 
setup_cache(self):\n \n start_time = time.time()\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n print(\"Size of original Dataframe: \", self.df.shape[0])\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n arr = [dfcum]\n for i in range(1, (BIBenchmarks.params[-1])):\n arr.append(dfcum)\n dfcum = pd.concat(arr)\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY. 
Time: \", time.time() - start_time)\n \n del self.ac", "min_run_count": 2, "name": "bi_benchmarks.BIBenchmarks.time_query_groupby_city_count_all", "number": 2, @@ -835,15 +835,15 @@ "repeat": 0, "rounds": 2, "sample_time": 0.01, - "setup_cache_key": "bi_benchmarks:61", + "setup_cache_key": "bi_benchmarks:67", "timeout": 6000, "type": "time", "unit": "seconds", - "version": "cc034dbad83f8695c4a670878f73e49b8ccb7548eb237cdbaeed0321fe4787ba", + "version": "8b4efc5635d35ff9b8bc984409f150d1389d8718d4d34b51b628cc81c58f0b44", "warmup_time": -1 }, "bi_benchmarks.BIBenchmarks.time_query_groupby_city_count_filter_two_aggregations": { - "code": "class BIBenchmarks:\n def time_query_groupby_city_count_filter_two_aggregations(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_filter_two_aggregations(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", + "code": "class BIBenchmarks:\n def time_query_groupby_city_count_filter_two_aggregations(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_filter_two_aggregations(times_bigger)\n\n def 
setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n start_time = time.time()\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n print(\"Size of original Dataframe: \", self.df.shape[0])\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n arr = [dfcum]\n for i in range(1, (BIBenchmarks.params[-1])):\n arr.append(dfcum)\n dfcum = pd.concat(arr)\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY. 
Time: \", time.time() - start_time)\n \n del self.ac", "min_run_count": 2, "name": "bi_benchmarks.BIBenchmarks.time_query_groupby_city_count_filter_two_aggregations", "number": 2, @@ -859,15 +859,15 @@ "repeat": 0, "rounds": 2, "sample_time": 0.01, - "setup_cache_key": "bi_benchmarks:61", + "setup_cache_key": "bi_benchmarks:67", "timeout": 6000, "type": "time", "unit": "seconds", - "version": "9cdc08e3b0b8d92ffa8e4c6922e90417d82cdc653f3596ae38b729eac2cf00bb", + "version": "71e40e8b73a50ce93d6a44c0496b392aaf7adcdee3d861e1e1e3684e1c54b190", "warmup_time": -1 }, "bi_benchmarks.BIBenchmarks.time_query_groupby_city_count_isin_filter": { - "code": "class BIBenchmarks:\n def time_query_groupby_city_count_isin_filter(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_isin_filter(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", + "code": "class BIBenchmarks:\n def time_query_groupby_city_count_isin_filter(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_isin_filter(times_bigger)\n\n def setup(self, num_rows):\n self.ac = 
Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n start_time = time.time()\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n print(\"Size of original Dataframe: \", self.df.shape[0])\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n arr = [dfcum]\n for i in range(1, (BIBenchmarks.params[-1])):\n arr.append(dfcum)\n dfcum = pd.concat(arr)\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY. 
Time: \", time.time() - start_time)\n \n del self.ac", "min_run_count": 2, "name": "bi_benchmarks.BIBenchmarks.time_query_groupby_city_count_isin_filter", "number": 2, @@ -883,15 +883,15 @@ "repeat": 0, "rounds": 2, "sample_time": 0.01, - "setup_cache_key": "bi_benchmarks:61", + "setup_cache_key": "bi_benchmarks:67", "timeout": 6000, "type": "time", "unit": "seconds", - "version": "79b7c695f5c71eff57c7734047eb6b2d359b077c243444bb3ae2069cdfbc1011", + "version": "ee200caa54a54a312f56a942deb07ec2d8cc646ddac74fa507cb4a9aae1b6fb6", "warmup_time": -1 }, "bi_benchmarks.BIBenchmarks.time_query_readall": { - "code": "class BIBenchmarks:\n def time_query_readall(self, times_bigger):\n self.lib.read(f\"{self.symbol}{times_bigger}\")\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n for i in range(1, (BIBenchmarks.params[-1])):\n dfcum = pd.concat([dfcum, self.df])\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY\")\n \n del self.ac", + "code": "class BIBenchmarks:\n def time_query_readall(self, times_bigger):\n self.lib.read(f\"{self.symbol}{times_bigger}\")\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n 
start_time = time.time()\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n print(\"Size of original Dataframe: \", self.df.shape[0])\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n arr = [dfcum]\n for i in range(1, (BIBenchmarks.params[-1])):\n arr.append(dfcum)\n dfcum = pd.concat(arr)\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY. Time: \", time.time() - start_time)\n \n del self.ac", "min_run_count": 2, "name": "bi_benchmarks.BIBenchmarks.time_query_readall", "number": 2, @@ -907,11 +907,11 @@ "repeat": 0, "rounds": 2, "sample_time": 0.01, - "setup_cache_key": "bi_benchmarks:61", + "setup_cache_key": "bi_benchmarks:67", "timeout": 6000, "type": "time", "unit": "seconds", - "version": "fc198dfac3e8e832aaa7e0d3355d4038a4acf2ada7cbf9bc3ff34bf0f7c433b8", + "version": "6a74cfd049227f232c1f6c380677782eeaed4423a258cad8932bf6b5cfd556b7", "warmup_time": -1 }, "list_functions.ListFunctions.peakmem_list_symbols": { diff --git a/python/benchmarks/bi_benchmarks.py b/python/benchmarks/bi_benchmarks.py index 733244abef..d52b86d7de 100644 --- a/python/benchmarks/bi_benchmarks.py +++ b/python/benchmarks/bi_benchmarks.py @@ -5,25 +5,28 @@ As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0. 
""" + + import os +import pandas as pd from pathlib import Path +import time from arcticdb import Arctic from arcticdb.version_store.processing import QueryBuilder +from benchmarks.common import download_and_process_city_to_parquet -from .common import * - - -def get_query_groupby_city_count_all(q): +def get_query_groupby_city_count_all(q:QueryBuilder | pd.DataFrame) -> QueryBuilder | pd.DataFrame: return q.groupby("City").agg({"Keyword": "count"}) -def get_query_groupby_city_count_isin_filter(q): +def get_query_groupby_city_count_isin_filter(q:QueryBuilder | pd.DataFrame) -> QueryBuilder | pd.DataFrame: return q[q["Keyword"].isin(["bimbo", "twat", "faggot"])].groupby("City").agg({"Keyword": "count"}) -def get_query_groupby_city_count_filter_two_aggregations(q): +def get_query_groupby_city_count_filter_two_aggregations(q:QueryBuilder | pd.DataFrame) -> QueryBuilder | pd.DataFrame: return q[q["Keyword"] == "faggot" ].groupby("City").agg({"Keyword": "count", "Number of Records" : "sum"}) + def assert_frame_equal(pandas_df:pd.DataFrame, arctic_df:pd.DataFrame): arctic_df.sort_index(inplace=True) pd.testing.assert_frame_equal(pandas_df, @@ -39,7 +42,8 @@ class BIBenchmarks: - download if parquet file does not exists source in .bz2 format - convert it to parquet format - prepare library with it containing several symbols that are constructed based on this DF - - for each query we want to benchmark do a pre-check that this query produces SAME result on Pandas and arcticDB + - for each query we want to benchmark do a pre-check that this query produces + SAME result on Pandas and arcticDB - run the benchmark tests ''' @@ -54,12 +58,16 @@ class BIBenchmarks: #Defines how many times bigger the database is params = [1, 10] + def __init__(self): self.lib_name = BIBenchmarks.LIB_NAME self.symbol = self.lib_name + def setup_cache(self): + start_time = time.time() + file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2) if (not os.path.exists(file)) : dfo = download_and_process_city_to_parquet(file) @@ -80,13 +88,16 @@ def setup_cache(self): print("The procedure is creating N times larger dataframes") print("by concatenating original DF N times") + print("Size of original Dataframe: ", self.df.shape[0]) for num in BIBenchmarks.params: _df = self.df.copy(deep=True) if (num > 1): # lets create N times bigger DF dfcum = self.df.copy(deep=True) + arr = [dfcum] for i in range(1, (BIBenchmarks.params[-1])): - dfcum = pd.concat([dfcum, self.df]) + arr.append(dfcum) + dfcum = pd.concat(arr) _df = dfcum print("DF for iterration xSize original ready: ", num) _df.info(verbose=True,memory_usage='deep') @@ -113,56 +124,72 @@ def setup_cache(self): _df = get_query_groupby_city_count_filter_two_aggregations(_df) assert_frame_equal(_df, arctic_df) - print("All pre-checks completed SUCCESSFULLY") + print("All pre-checks completed SUCCESSFULLY. 
Time: ", time.time() - start_time) del self.ac + def setup(self, num_rows): self.ac = Arctic(f"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB") self.lib = self.ac.get_library(self.lib_name) + def teardown(self, num_rows): del self.ac + def time_query_readall(self, times_bigger): self.lib.read(f"{self.symbol}{times_bigger}") + def peakmem_query_readall(self, times_bigger): self.lib.read(f"{self.symbol}{times_bigger}") + def query_groupby_city_count_all(self, times_bigger) -> pd.DataFrame: q = QueryBuilder() q = get_query_groupby_city_count_all( q) df = self.lib.read(f"{self.symbol}{times_bigger}", query_builder=q) return df.data + def time_query_groupby_city_count_all(self, times_bigger) -> pd.DataFrame: return self.query_groupby_city_count_all(times_bigger) + def peakmem_query_groupby_city_count_all(self, times_bigger) -> pd.DataFrame: return self.query_groupby_city_count_all(times_bigger) + def query_groupby_city_count_isin_filter(self, times_bigger) -> pd.DataFrame: q = QueryBuilder() q = get_query_groupby_city_count_isin_filter(q) df = self.lib.read(f"{self.symbol}{times_bigger}", query_builder=q) return df.data + def time_query_groupby_city_count_isin_filter(self, times_bigger) -> pd.DataFrame: return self.query_groupby_city_count_isin_filter(times_bigger) + def peakmem_query_groupby_city_count_isin_filter(self, times_bigger) -> pd.DataFrame: return self.query_groupby_city_count_isin_filter(times_bigger) + def query_groupby_city_count_filter_two_aggregations(self, times_bigger) -> pd.DataFrame: q = QueryBuilder() q = get_query_groupby_city_count_filter_two_aggregations(q) df = self.lib.read(f"{self.symbol}{times_bigger}", query_builder=q) return df.data + def time_query_groupby_city_count_filter_two_aggregations(self, times_bigger) -> pd.DataFrame: return self.query_groupby_city_count_filter_two_aggregations(times_bigger) + def peakmem_query_groupby_city_count_filter_two_aggregations(self, times_bigger): return self.query_groupby_city_count_filter_two_aggregations(times_bigger) + + + From 04ee45f58564a48df689989c7ea421e2645423a0 Mon Sep 17 00:00:00 2001 From: Georgi Rusev Date: Mon, 9 Dec 2024 14:09:06 +0200 Subject: [PATCH 4/4] comments addressed --- python/.asv/results/benchmarks.json | 48 ++++++++++++++--------------- python/benchmarks/bi_benchmarks.py | 22 +++++-------- 2 files changed, 31 insertions(+), 39 deletions(-) diff --git a/python/.asv/results/benchmarks.json b/python/.asv/results/benchmarks.json index 1927fb9f10..55e00baa1a 100644 --- a/python/.asv/results/benchmarks.json +++ b/python/.asv/results/benchmarks.json @@ -747,7 +747,7 @@ "warmup_time": -1 }, "bi_benchmarks.BIBenchmarks.peakmem_query_groupby_city_count_all": { - "code": "class BIBenchmarks:\n def peakmem_query_groupby_city_count_all(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_all(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n start_time = time.time()\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n 
\n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n print(\"Size of original Dataframe: \", self.df.shape[0])\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n arr = [dfcum]\n for i in range(1, (BIBenchmarks.params[-1])):\n arr.append(dfcum)\n dfcum = pd.concat(arr)\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY. 
Time: \", time.time() - start_time)\n \n del self.ac", + "code": "class BIBenchmarks:\n def peakmem_query_groupby_city_count_all(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_all(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n start_time = time.time()\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n print(\"Size of original Dataframe: \", self.df.shape[0])\n for num in BIBenchmarks.params:\n _df = pd.concat([self.df] * num)\n print(\"DF for iterration xSize original ready: \", num)\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY. 
Time: \", time.time() - start_time)\n \n del self.ac", "name": "bi_benchmarks.BIBenchmarks.peakmem_query_groupby_city_count_all", "param_names": [ "param1" @@ -758,14 +758,14 @@ "10" ] ], - "setup_cache_key": "bi_benchmarks:67", + "setup_cache_key": "bi_benchmarks:68", "timeout": 6000, "type": "peakmemory", "unit": "bytes", - "version": "b4bc970aa24d4a74e3062d17e17e617ac36b67dfb351d8762325d4496ebb677d" + "version": "a6be28bf68c237bc424c84b3af930d32da53053e5ddb11b97910376560ab0918" }, "bi_benchmarks.BIBenchmarks.peakmem_query_groupby_city_count_filter_two_aggregations": { - "code": "class BIBenchmarks:\n def peakmem_query_groupby_city_count_filter_two_aggregations(self, times_bigger):\n return self.query_groupby_city_count_filter_two_aggregations(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n start_time = time.time()\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n print(\"Size of original Dataframe: \", self.df.shape[0])\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n arr = [dfcum]\n for i in range(1, (BIBenchmarks.params[-1])):\n arr.append(dfcum)\n dfcum = pd.concat(arr)\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY. 
Time: \", time.time() - start_time)\n \n del self.ac", + "code": "class BIBenchmarks:\n def peakmem_query_groupby_city_count_filter_two_aggregations(self, times_bigger):\n return self.query_groupby_city_count_filter_two_aggregations(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n start_time = time.time()\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n print(\"Size of original Dataframe: \", self.df.shape[0])\n for num in BIBenchmarks.params:\n _df = pd.concat([self.df] * num)\n print(\"DF for iterration xSize original ready: \", num)\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY. 
Time: \", time.time() - start_time)\n \n del self.ac", "name": "bi_benchmarks.BIBenchmarks.peakmem_query_groupby_city_count_filter_two_aggregations", "param_names": [ "param1" @@ -776,14 +776,14 @@ "10" ] ], - "setup_cache_key": "bi_benchmarks:67", + "setup_cache_key": "bi_benchmarks:68", "timeout": 6000, "type": "peakmemory", "unit": "bytes", - "version": "8e74c83adabb49c330f8d269d1172f18829de567c7ab02bd00c521ba028ca56e" + "version": "9a608927df33b903bb6dd7ec33fea2c8172dd638e0169d8fffddb5069e188e47" }, "bi_benchmarks.BIBenchmarks.peakmem_query_groupby_city_count_isin_filter": { - "code": "class BIBenchmarks:\n def peakmem_query_groupby_city_count_isin_filter(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_isin_filter(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n start_time = time.time()\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n print(\"Size of original Dataframe: \", self.df.shape[0])\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n arr = [dfcum]\n for i in range(1, (BIBenchmarks.params[-1])):\n arr.append(dfcum)\n dfcum = pd.concat(arr)\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY. 
Time: \", time.time() - start_time)\n \n del self.ac", + "code": "class BIBenchmarks:\n def peakmem_query_groupby_city_count_isin_filter(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_isin_filter(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n start_time = time.time()\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n print(\"Size of original Dataframe: \", self.df.shape[0])\n for num in BIBenchmarks.params:\n _df = pd.concat([self.df] * num)\n print(\"DF for iterration xSize original ready: \", num)\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY. 
Time: \", time.time() - start_time)\n \n del self.ac", "name": "bi_benchmarks.BIBenchmarks.peakmem_query_groupby_city_count_isin_filter", "param_names": [ "param1" @@ -794,14 +794,14 @@ "10" ] ], - "setup_cache_key": "bi_benchmarks:67", + "setup_cache_key": "bi_benchmarks:68", "timeout": 6000, "type": "peakmemory", "unit": "bytes", - "version": "813128d0d0581f76a432fdaf555b0865bfe9a5fe7d7c31c21e0a2b44758a279a" + "version": "0ae4a1c3ebcac6600a0636c80e757fb34ef285156f9f01a10285fb6c803e2bf7" }, "bi_benchmarks.BIBenchmarks.peakmem_query_readall": { - "code": "class BIBenchmarks:\n def peakmem_query_readall(self, times_bigger):\n self.lib.read(f\"{self.symbol}{times_bigger}\")\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n start_time = time.time()\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n print(\"Size of original Dataframe: \", self.df.shape[0])\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n arr = [dfcum]\n for i in range(1, (BIBenchmarks.params[-1])):\n arr.append(dfcum)\n dfcum = pd.concat(arr)\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY. 
Time: \", time.time() - start_time)\n \n del self.ac", + "code": "class BIBenchmarks:\n def peakmem_query_readall(self, times_bigger):\n self.lib.read(f\"{self.symbol}{times_bigger}\")\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n start_time = time.time()\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n print(\"Size of original Dataframe: \", self.df.shape[0])\n for num in BIBenchmarks.params:\n _df = pd.concat([self.df] * num)\n print(\"DF for iterration xSize original ready: \", num)\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY. 
Time: \", time.time() - start_time)\n \n del self.ac", "name": "bi_benchmarks.BIBenchmarks.peakmem_query_readall", "param_names": [ "param1" @@ -812,14 +812,14 @@ "10" ] ], - "setup_cache_key": "bi_benchmarks:67", + "setup_cache_key": "bi_benchmarks:68", "timeout": 6000, "type": "peakmemory", "unit": "bytes", - "version": "9e5376528cf402528bdba44b37a3fd5566f8bd4ccf1267a837ce7478f9e7a496" + "version": "2957ec25dedc5ee645e69a28ed4a38ebd27415c53828b3bf6fb4b57f146bfa13" }, "bi_benchmarks.BIBenchmarks.time_query_groupby_city_count_all": { - "code": "class BIBenchmarks:\n def time_query_groupby_city_count_all(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_all(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n start_time = time.time()\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n print(\"Size of original Dataframe: \", self.df.shape[0])\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n arr = [dfcum]\n for i in range(1, (BIBenchmarks.params[-1])):\n arr.append(dfcum)\n dfcum = pd.concat(arr)\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY. 
Time: \", time.time() - start_time)\n \n del self.ac", + "code": "class BIBenchmarks:\n def time_query_groupby_city_count_all(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_all(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n start_time = time.time()\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n print(\"Size of original Dataframe: \", self.df.shape[0])\n for num in BIBenchmarks.params:\n _df = pd.concat([self.df] * num)\n print(\"DF for iterration xSize original ready: \", num)\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY. 
Time: \", time.time() - start_time)\n \n del self.ac", "min_run_count": 2, "name": "bi_benchmarks.BIBenchmarks.time_query_groupby_city_count_all", "number": 2, @@ -835,15 +835,15 @@ "repeat": 0, "rounds": 2, "sample_time": 0.01, - "setup_cache_key": "bi_benchmarks:67", + "setup_cache_key": "bi_benchmarks:68", "timeout": 6000, "type": "time", "unit": "seconds", - "version": "8b4efc5635d35ff9b8bc984409f150d1389d8718d4d34b51b628cc81c58f0b44", + "version": "a7307ac55b614273b8a71fff12b16beeb9a256d49c760415422b54ec023a6126", "warmup_time": -1 }, "bi_benchmarks.BIBenchmarks.time_query_groupby_city_count_filter_two_aggregations": { - "code": "class BIBenchmarks:\n def time_query_groupby_city_count_filter_two_aggregations(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_filter_two_aggregations(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n start_time = time.time()\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n print(\"Size of original Dataframe: \", self.df.shape[0])\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n arr = [dfcum]\n for i in range(1, (BIBenchmarks.params[-1])):\n arr.append(dfcum)\n dfcum = pd.concat(arr)\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY. 
Time: \", time.time() - start_time)\n \n del self.ac", + "code": "class BIBenchmarks:\n def time_query_groupby_city_count_filter_two_aggregations(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_filter_two_aggregations(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n start_time = time.time()\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n print(\"Size of original Dataframe: \", self.df.shape[0])\n for num in BIBenchmarks.params:\n _df = pd.concat([self.df] * num)\n print(\"DF for iterration xSize original ready: \", num)\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY. 
Time: \", time.time() - start_time)\n \n del self.ac", "min_run_count": 2, "name": "bi_benchmarks.BIBenchmarks.time_query_groupby_city_count_filter_two_aggregations", "number": 2, @@ -859,15 +859,15 @@ "repeat": 0, "rounds": 2, "sample_time": 0.01, - "setup_cache_key": "bi_benchmarks:67", + "setup_cache_key": "bi_benchmarks:68", "timeout": 6000, "type": "time", "unit": "seconds", - "version": "71e40e8b73a50ce93d6a44c0496b392aaf7adcdee3d861e1e1e3684e1c54b190", + "version": "5e4efb31734abc4f731146ae302ddef56e66ada452c319e8a0dea8ac9e3e82a4", "warmup_time": -1 }, "bi_benchmarks.BIBenchmarks.time_query_groupby_city_count_isin_filter": { - "code": "class BIBenchmarks:\n def time_query_groupby_city_count_isin_filter(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_isin_filter(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n start_time = time.time()\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n print(\"Size of original Dataframe: \", self.df.shape[0])\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n arr = [dfcum]\n for i in range(1, (BIBenchmarks.params[-1])):\n arr.append(dfcum)\n dfcum = pd.concat(arr)\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY. 
Time: \", time.time() - start_time)\n \n del self.ac", + "code": "class BIBenchmarks:\n def time_query_groupby_city_count_isin_filter(self, times_bigger) -> pd.DataFrame:\n return self.query_groupby_city_count_isin_filter(times_bigger)\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n start_time = time.time()\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n print(\"Size of original Dataframe: \", self.df.shape[0])\n for num in BIBenchmarks.params:\n _df = pd.concat([self.df] * num)\n print(\"DF for iterration xSize original ready: \", num)\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY. 
Time: \", time.time() - start_time)\n \n del self.ac", "min_run_count": 2, "name": "bi_benchmarks.BIBenchmarks.time_query_groupby_city_count_isin_filter", "number": 2, @@ -883,15 +883,15 @@ "repeat": 0, "rounds": 2, "sample_time": 0.01, - "setup_cache_key": "bi_benchmarks:67", + "setup_cache_key": "bi_benchmarks:68", "timeout": 6000, "type": "time", "unit": "seconds", - "version": "ee200caa54a54a312f56a942deb07ec2d8cc646ddac74fa507cb4a9aae1b6fb6", + "version": "89cad46eb5100e61cfa5aecda0a0a34d755bb7e6e60434bb8979176681926006", "warmup_time": -1 }, "bi_benchmarks.BIBenchmarks.time_query_readall": { - "code": "class BIBenchmarks:\n def time_query_readall(self, times_bigger):\n self.lib.read(f\"{self.symbol}{times_bigger}\")\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n start_time = time.time()\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n print(\"Size of original Dataframe: \", self.df.shape[0])\n for num in BIBenchmarks.params:\n _df = self.df.copy(deep=True)\n if (num > 1):\n # lets create N times bigger DF\n dfcum = self.df.copy(deep=True)\n arr = [dfcum]\n for i in range(1, (BIBenchmarks.params[-1])):\n arr.append(dfcum)\n dfcum = pd.concat(arr)\n _df = dfcum\n print(\"DF for iterration xSize original ready: \", num)\n _df.info(verbose=True,memory_usage='deep')\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY. 
Time: \", time.time() - start_time)\n \n del self.ac", + "code": "class BIBenchmarks:\n def time_query_readall(self, times_bigger):\n self.lib.read(f\"{self.symbol}{times_bigger}\")\n\n def setup(self, num_rows):\n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.lib = self.ac.get_library(self.lib_name)\n\n def setup_cache(self):\n \n start_time = time.time()\n \n file = os.path.join(Path(__file__).resolve().parent.parent, BIBenchmarks.CITY_BI_FILE2)\n if (not os.path.exists(file)) :\n dfo = download_and_process_city_to_parquet(file)\n dff = pd.read_parquet(file)\n pd.testing.assert_frame_equal(dfo,dff)\n else:\n print(\"Parquet file exists!\")\n \n # read data from bz.2 file\n # abs_path = os.path.join(Path(__file__).resolve().parent.parent,BIBenchmarks.CITY_BI_FILE)\n # self.df : pd.DataFrame = process_city(abs_path)\n \n self.df : pd.DataFrame = pd.read_parquet(file)\n \n self.ac = Arctic(f\"lmdb://opensource_datasets_{self.lib_name}?map_size=20GB\")\n self.ac.delete_library(self.lib_name)\n self.lib = self.ac.create_library(self.lib_name)\n \n print(\"The procedure is creating N times larger dataframes\")\n print(\"by concatenating original DF N times\")\n print(\"Size of original Dataframe: \", self.df.shape[0])\n for num in BIBenchmarks.params:\n _df = pd.concat([self.df] * num)\n print(\"DF for iterration xSize original ready: \", num)\n self.lib.write(f\"{self.symbol}{num}\", _df)\n \n print(\"If pandas query produces different dataframe than arctic one stop tests!\")\n print(\"This will mean query problem is there most likely\")\n \n print(\"Pre-check correctness for query_groupby_city_count_all\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_all(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_all(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_isin_filter\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_isin_filter(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_isin_filter(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"Pre-check correctness for query_groupby_city_count_filter_two_aggregations\")\n _df = self.df.copy(deep=True)\n arctic_df = self.time_query_groupby_city_count_filter_two_aggregations(BIBenchmarks.params[0])\n _df = get_query_groupby_city_count_filter_two_aggregations(_df)\n assert_frame_equal(_df, arctic_df)\n \n print(\"All pre-checks completed SUCCESSFULLY. 
Time: \", time.time() - start_time)\n \n del self.ac", "min_run_count": 2, "name": "bi_benchmarks.BIBenchmarks.time_query_readall", "number": 2, @@ -907,11 +907,11 @@ "repeat": 0, "rounds": 2, "sample_time": 0.01, - "setup_cache_key": "bi_benchmarks:67", + "setup_cache_key": "bi_benchmarks:68", "timeout": 6000, "type": "time", "unit": "seconds", - "version": "6a74cfd049227f232c1f6c380677782eeaed4423a258cad8932bf6b5cfd556b7", + "version": "c746faf05e4dbb872efa770cbe5ae057dafe3ecc1fb8969d1026db2dee7bfd99", "warmup_time": -1 }, "list_functions.ListFunctions.peakmem_list_symbols": { diff --git a/python/benchmarks/bi_benchmarks.py b/python/benchmarks/bi_benchmarks.py index d52b86d7de..d5fe7a3ffe 100644 --- a/python/benchmarks/bi_benchmarks.py +++ b/python/benchmarks/bi_benchmarks.py @@ -8,6 +8,7 @@ import os +import arcticdb.util.test as test import pandas as pd from pathlib import Path import time @@ -29,10 +30,10 @@ def get_query_groupby_city_count_filter_two_aggregations(q:QueryBuilder | pd.Dat def assert_frame_equal(pandas_df:pd.DataFrame, arctic_df:pd.DataFrame): arctic_df.sort_index(inplace=True) - pd.testing.assert_frame_equal(pandas_df, - arctic_df, - check_column_type=False, - check_dtype=False) + test.assert_frame_equal(pandas_df, + arctic_df, + check_column_type=False, + check_dtype=False) class BIBenchmarks: @@ -90,17 +91,8 @@ def setup_cache(self): print("by concatenating original DF N times") print("Size of original Dataframe: ", self.df.shape[0]) for num in BIBenchmarks.params: - _df = self.df.copy(deep=True) - if (num > 1): - # lets create N times bigger DF - dfcum = self.df.copy(deep=True) - arr = [dfcum] - for i in range(1, (BIBenchmarks.params[-1])): - arr.append(dfcum) - dfcum = pd.concat(arr) - _df = dfcum - print("DF for iterration xSize original ready: ", num) - _df.info(verbose=True,memory_usage='deep') + _df = pd.concat([self.df] * num) + print("DF for iterration xSize original ready: ", num) self.lib.write(f"{self.symbol}{num}", _df) print("If pandas query produces different dataframe than arctic one stop tests!")