Merge pull request awslabs#186 from Bit-Quill/integ-record-verification

Improve bucket query documentation & refactor pagination constant
Bit-Quill · May 13, 2024 · e9abacb · e9abacb
2 parents fd5edf5 + 35b1672
commit e9abacb
Show file tree

Hide file tree

Showing 2 changed files with 29 additions and 9 deletions.
diff --git a/tools/python/influx-migration/README.md b/tools/python/influx-migration/README.md
@@ -146,15 +146,35 @@ After meeting the prerequisites:
 
 2. **Provide Credentials**: Provide host addresses and ports as CLI options.
 
-3. **Verify Data**: Ensure the data is correctly transferred by:
+3. **Verify Data**: To manually verify the data migration, run the following commands on the source instance before migrating and on the destination instance after migrating:
 
-    a. Using the InfluxDB UI and inspecting buckets.
+    - **i**. List all buckets using the Influx CLI, where `<token>` is an operator token and `<host>` is the host of the source or destination instance, e.g., `https://<hostname>:8086` or `http://localhost:8086`.
 
-    b. Listing buckets with `influx bucket list -t <destination token> --host <destination host address> --skip-verify`.
+      ```shell
+      influx bucket list -t <token> --host <host>
+      ```
+    
+      This will help get an idea of the buckets in the source instance, whether you are doing a full migration or migrating just one bucket.
+
+    - **ii**. Start the Influx CLI v1 shell.
+
+      ```shell
+      influx v1 shell -t <token> --host <host> --org <org name>
+      ```
 
-    c. Using `influx v1 shell -t <destination token> --host <destination host address> --skip-verify` and running `SELECT * FROM <migrated bucket>.<retention period>.<measurement name> LIMIT 100` to view contents of a bucket or `SELECT COUNT(*) FROM <migrated bucket>.<retention period>.<measurement name>` to verify the correct number of records have been migrated.
+    - **iii**. Query the number of records in a bucket with the Influx CLI v1 shell.
 
-    d. By running a query using `influx query -t <destination token> --host <destination host address> --skip-verify 'from(bucket: "<migrated bucket>") |> range(start: <desired start>, stop: <desired stop>)'`. Adding `|> count()` to the query is also a way to verify the correct number of records have been migrated.
+      ```sql
+      SELECT COUNT(*) FROM "<bucket name>"."<retention period>"."<measurement name>"
+      ```
+
+      For reference, if a bucket has an infinite retention period then the value of `<retention period>` will be `autogen`.
+    
+    - **iv**. Exit the Influx CLI v1 shell and use the Influx CLI to query the number of records in each table within a bucket using [`count()`](https://docs.influxdata.com/flux/v0/stdlib/universe/count/) and the largest possible range. Note that this will show **different** results compared to the previous step, because records are counted per table and the total number of records is not being queried.
+
+      ```shell
+      influx query 'from(bucket: "<bucket name>") |> range(start: 1680-01-01T00:00:00Z, stop: 2800-01-01T00:00:00Z) |> count()' -t <token> --org <org name> --host <host>
+      ```
 
 ## Example Run
 

diff --git a/tools/python/influx-migration/influx_migration.py b/tools/python/influx-migration/influx_migration.py
@@ -53,7 +53,7 @@
 # The number of seconds to wait before scraping from the /metrics endpoint
 METRICS_SCRAPE_INTERVAL_SECONDS=10
 
-BUCKET_PAGINATION_LIMIT=100
+MAX_BUCKET_PAGINATION_LIMIT=100
 
 script_duration = 0
 
@@ -115,15 +115,15 @@ def report_all_bucket_series_count(host, token, org_name=None, skip_verify=False
         # CSV migration may use an all access token, meaning buckets will be scoped to an organization
         if org_name is not None:
             client.org = org_name
-        buckets = client.buckets_api().find_buckets(limit=BUCKET_PAGINATION_LIMIT)
+        buckets = client.buckets_api().find_buckets(limit=MAX_BUCKET_PAGINATION_LIMIT)
         offset = 0
         while len(buckets.buckets) > 0:
             for bucket in buckets.buckets:
                 if not bucket.name.startswith("_"):
                     report_bucket_series_count(bucket_name=bucket.name, host=host, token=token,
                         org_name=org_name, skip_verify=skip_verify)
-            offset += BUCKET_PAGINATION_LIMIT
-            buckets = client.buckets_api().find_buckets(limit=BUCKET_PAGINATION_LIMIT,
+            offset += MAX_BUCKET_PAGINATION_LIMIT
+            buckets = client.buckets_api().find_buckets(limit=MAX_BUCKET_PAGINATION_LIMIT,
                 offset=offset)
 
 def report_bucket_series_count(bucket_name, host, token, org_name=None, skip_verify=False):