From e56bf26343f22b83b69212232e5c341d8328f86e Mon Sep 17 00:00:00 2001
From: Jonathan
Date: Fri, 3 May 2024 10:39:49 -0700
Subject: [PATCH] physical to base and sample values

---
 README.md                                  | 45 +++++++++++++++++--
 semantic_model_generator/generate_model.py | 18 ++++----
 .../tests/generate_model_test.py           | 16 +++----
 3 files changed, 58 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 41d387d7..7adde515 100644
--- a/README.md
+++ b/README.md
@@ -73,12 +73,12 @@ python
 ```python
 from semantic_model_generator.generate_model import generate_base_semantic_model_from_snowflake
 
-PHYSICAL_TABLES = ['..','..']
+BASE_TABLES = ['..','..']
 SNOWFLAKE_ACCOUNT = ""
 SEMANTIC_MODEL_NAME = ""
 generate_base_semantic_model_from_snowflake(
-    physical_tables=PHYSICAL_TABLES,
+    base_tables=BASE_TABLES,
     snowflake_account=SNOWFLAKE_ACCOUNT,
     semantic_model_name=SEMANTIC_MODEL_NAME
 )
 ```
@@ -99,7 +99,7 @@ This is the script version run on the command line.
 2. Run on your command line.
 ```bash
 python -m semantic_model_generator.generate_model \
-    --physical_tables "['..','..']" \
+    --base_tables "['..','..']" \
     --semantic_model_name "" \
     --snowflake_account=""
 ```
@@ -181,7 +181,7 @@ tables:
     description: A logical table capturing daily sales information across different store locations and product categories.
 
     # The fully qualified name of the underlying physical table.
-    physical_table:
+    base_table:
       database: sales
       schema: public
       table: sd_data
@@ -195,11 +195,19 @@ tables:
         description: The category of the product sold.
         expr: cat
         unique: false
+        data_type: NUMBER
+        sample_values:
+          - '501'
+          - '544'
 
       - name: store_country
        description: The country where the sale took place.
        expr: cntry
        unique: false
+       data_type: TEXT
+       sample_values:
+         - 'USA'
+         - 'GBR'
 
      - name: sales_channel
        synonyms:
@@ -208,6 +216,10 @@ tables:
        description: The channel through which the sale was made.
        expr: chn
        unique: false
+       data_type: TEXT
+       sample_values:
+         - 'FB'
+         - 'GOOGLE'
 
    time_dimensions:
      - name: sale_timestamp
@@ -217,6 +229,11 @@ tables:
        description: The time when the sale occurred. In UTC.
        expr: dt
        unique: false
+       data_type: TIMESTAMP
+       sample_values:
+         - '2016-09-01 07:30:00'
+         - '2016-09-01 14:16:00'
+         - '2016-09-04 09:20:00'
 
    measures:
      - name: sales_amount
@@ -226,11 +243,19 @@ tables:
        description: The total amount of money generated from the sale.
        expr: amt
        default_aggregation: sum
+       data_type: NUMBER
+       sample_values:
+         - '11.650000'
+         - '50.880000'
 
      - name: sales_tax
        description: The sales tax paid for this sale.
        expr: amt * 0.0975
        default_aggregation: sum
+       data_type: NUMBER
+       sample_values:
+         - '51.650000'
+         - '57.800'
 
      - name: units_sold
        synonyms:
@@ -239,11 +264,19 @@ tables:
        description: The number of units sold in the transaction.
        expr: unts
        default_aggregation: sum
+       data_type: NUMBER
+       sample_values:
+         - '1'
+         - '3'
 
      - name: cost
        description: The cost of the product sold.
        expr: cst
        default_aggregation: sum
+       data_type: NUMBER
+       sample_values:
+         - '10'
+         - '33'
 
      - name: profit
        synonyms:
@@ -252,6 +285,10 @@ tables:
        description: The profit generated from a sale.
        expr: amt - cst
        default_aggregation: sum
+       data_type: NUMBER
+       sample_values:
+         - '15'
+         - '37'
 
    # A table can define commonly used filters over it. These filters can then be referenced in user questions directly.
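
Note on the README hunks above: the rename only touches the public argument name, `physical_tables` becoming `base_tables`; the values themselves are unchanged fully qualified table names. A minimal sketch of the updated call from the README snippet, using hypothetical placeholder table names, account, and model name (none of these values come from the patch):

```python
from semantic_model_generator.generate_model import generate_base_semantic_model_from_snowflake

# Hypothetical fully qualified names in {database}.{schema}.{table} form.
BASE_TABLES = ["my_db.my_schema.sd_data"]

generate_base_semantic_model_from_snowflake(
    base_tables=BASE_TABLES,               # renamed from physical_tables=
    snowflake_account="my_account",        # placeholder account identifier
    semantic_model_name="my sales model",  # placeholder model name
)
```
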
diff --git a/semantic_model_generator/generate_model.py b/semantic_model_generator/generate_model.py
index ea590885..ec159080 100644
--- a/semantic_model_generator/generate_model.py
+++ b/semantic_model_generator/generate_model.py
@@ -127,13 +127,13 @@ def _raw_table_to_semantic_context_table(
 
 
 def raw_schema_to_semantic_context(
-    physical_tables: List[str], snowflake_account: str, semantic_model_name: str
+    base_tables: List[str], snowflake_account: str, semantic_model_name: str
 ) -> semantic_model_pb2.SemanticModel:
     """
     Converts a list of fully qualified Snowflake table names into a semantic model.
 
     Parameters:
-    - physical_tables (list[str]): Fully qualified table names to include in the semantic model.
+    - base_tables (list[str]): Fully qualified table names to include in the semantic model.
     - snowflake_account (str): Snowflake account identifier.
     - semantic_model_name (str): A meaningful semantic model name.
 
@@ -155,7 +155,7 @@ def raw_schema_to_semantic_context(
     # For FQN tables, create a new snowflake connection per table in case the db/schema is different.
     table_objects = []
     unique_database_schema: List[str] = []
-    for table in physical_tables:
+    for table in base_tables:
         # Verify this is a valid FQN table. For now, we check that the table follows the following format.
         # {database}.{schema}.{table}
         fqn_table = create_fqn_table(table)
@@ -253,7 +253,7 @@ def _to_snake_case(s: str) -> str:
 
 
 def generate_base_semantic_model_from_snowflake(
-    physical_tables: List[str],
+    base_tables: List[str],
     snowflake_account: str,
     semantic_model_name: str,
     output_yaml_path: Optional[str] = None,
@@ -262,7 +262,7 @@ def generate_base_semantic_model_from_snowflake(
     Generates a base semantic context from specified Snowflake tables and exports it to a YAML file.
 
     Parameters:
-    physical_tables : Fully qualified names of Snowflake tables to include in the semantic context.
+    base_tables : Fully qualified names of Snowflake tables to include in the semantic context.
     snowflake_account: Identifier of the Snowflake account.
     semantic_model_name: The human readable model name. This should be semantically meaningful to an organization.
     output_yaml_path: Path for the output YAML file. If None, defaults to 'semantic_model_generator/output_models/YYYYMMDDHHMMSS_.yaml'.
@@ -280,7 +280,7 @@ def generate_base_semantic_model_from_snowflake(
     else:  # Assume user gives correct path.
         write_path = output_yaml_path
     context = raw_schema_to_semantic_context(
-        physical_tables,
+        base_tables,
         snowflake_account=snowflake_account,
         semantic_model_name=semantic_model_name,
     )
@@ -301,10 +301,10 @@ def generate_base_semantic_model_from_snowflake(
 
     )
     parser.add_argument(
-        "--physical_tables",
+        "--base_tables",
         type=list,
         required=True,
-        help="The list of fully qualified table names all following the format {database_name}.{schema_name}{table_name}",
+        help="The list of fully qualified table names all following the format {database_name}.{schema_name}.{table_name}",
     )
     parser.add_argument(
         "--snowflake_account",
@@ -328,7 +328,7 @@ def generate_base_semantic_model_from_snowflake(
     args = parser.parse_args()
 
     generate_base_semantic_model_from_snowflake(
-        physical_tables=args.physical_tables,
+        base_tables=args.base_tables,
         snowflake_account=args.snowflake_account,
        semantic_model_name=args.semantic_model_name,
        output_yaml_path=args.output_yaml_path,
diff --git a/semantic_model_generator/tests/generate_model_test.py b/semantic_model_generator/tests/generate_model_test.py
index e81b72d8..42b13a0d 100644
--- a/semantic_model_generator/tests/generate_model_test.py
+++ b/semantic_model_generator/tests/generate_model_test.py
@@ -217,11 +217,11 @@ def test_raw_schema_to_semantic_context(
     want_yaml = "name: this is the best semantic model ever\ntables:\n - name: ALIAS\n description: ' '\n base_table:\n database: test_db\n schema: schema_test\n table: ALIAS\n filters:\n - name: ' '\n synonyms:\n - ' '\n description: ' '\n expr: ' '\n dimensions:\n - name: ZIP_CODE\n synonyms:\n - ' '\n description: ' '\n expr: ZIP_CODE\n data_type: TEXT\n time_dimensions:\n - name: BAD_ALIAS\n synonyms:\n - ' '\n description: ' '\n expr: BAD_ALIAS\n data_type: TIMESTAMP\n measures:\n - name: AREA_CODE\n synonyms:\n - ' '\n description: ' '\n expr: AREA_CODE\n data_type: NUMBER\n - name: CBSA\n synonyms:\n - ' '\n description: ' '\n expr: CBSA\n data_type: NUMBER\n"
 
     snowflake_account = "test_account"
-    physical_tables = ["test_db.schema_test.ALIAS"]
+    base_tables = ["test_db.schema_test.ALIAS"]
     semantic_model_name = "this is the best semantic model ever"
 
     semantic_model = raw_schema_to_semantic_context(
-        physical_tables=physical_tables,
+        base_tables=base_tables,
         snowflake_account=snowflake_account,
         semantic_model_name=semantic_model_name,
     )
@@ -251,13 +251,13 @@ def test_generate_base_context_with_placeholder_comments(
     mock_snowflake_connection_env,
 ):
 
-    physical_tables = ["test_db.schema_test.ALIAS"]
+    base_tables = ["test_db.schema_test.ALIAS"]
     snowflake_account = "test_account"
     output_path = "output_model_path.yaml"
     semantic_model_name = "my awesome semantic model"
 
     generate_base_semantic_model_from_snowflake(
-        physical_tables=physical_tables,
+        base_tables=base_tables,
         snowflake_account=snowflake_account,
         output_yaml_path=output_path,
         semantic_model_name=semantic_model_name,
@@ -278,7 +278,7 @@ def test_generate_base_context_with_placeholder_comments_cross_database_cross_sc
     mock_snowflake_connection_env,
 ):
 
-    physical_tables = [
+    base_tables = [
         "test_db.schema_test.ALIAS",
         "a_different_database.a_different_schema.PRODUCTS",
     ]
     snowflake_account = "test_account"
     output_path = "output_model_path.yaml"
     semantic_model_name = "Another Incredible Semantic Model"
 
     generate_base_semantic_model_from_snowflake(
-        physical_tables=physical_tables,
+        base_tables=base_tables,
         snowflake_account=snowflake_account,
         output_yaml_path=output_path,
         semantic_model_name=semantic_model_name,
     )
@@ -310,13 +310,13 @@ def test_generate_base_context_with_placeholder_comments_missing_datatype(
     mock_snowflake_connection_env,
 ):
 
-    physical_tables = ["test_db.schema_test.ALIAS"]
+    base_tables = ["test_db.schema_test.ALIAS"]
     snowflake_account = "test_account"
     output_path = "output_model_path.yaml"
     semantic_model_name = "Another Incredible Semantic Model with new dtypes"
 
     generate_base_semantic_model_from_snowflake(
-        physical_tables=physical_tables,
+        base_tables=base_tables,
         snowflake_account=snowflake_account,
         output_yaml_path=output_path,
         semantic_model_name=semantic_model_name,
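
Beyond the rename, the README example above now records a `data_type` and a handful of `sample_values` for every dimension, time dimension, and measure. A minimal sketch of reading those new fields out of such a YAML document, assuming PyYAML is installed; the fragment below is adapted from the README example in this patch rather than taken from the repository's own code:

```python
import yaml

# A fragment shaped like the updated README example.
fragment = """
dimensions:
  - name: store_country
    description: The country where the sale took place.
    expr: cntry
    unique: false
    data_type: TEXT
    sample_values:
      - 'USA'
      - 'GBR'
"""

doc = yaml.safe_load(fragment)
for dim in doc["dimensions"]:
    # Each column now carries its Snowflake data type and a few example values.
    print(dim["name"], dim["data_type"], dim["sample_values"])
```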