Dataset 2.0 - datapackage validation & readme script updates #840

Open · wants to merge 11 commits into base: develop
26 changes: 23 additions & 3 deletions data-export-meta/README.rst
@@ -2,7 +2,27 @@ Data exports
 ============
 
 This directory contains [Frictionless data](https://frictionlessdata.io/) [data package](https://specs.frictionlessdata.io/data-package/)
-files to describe and validate Project data exports.
+files to describe and validate Project data exports, along with utility scripts for auto-generating portions of dataset readmes, data dictionaries, and lists of member and book changes from previously published versions of the datasets.
 
-They are currently generated and maintained manually; they should be updated
-for deposit with revised data exports as needed.
+Datapackage files are currently generated and maintained manually; they should be updated
+for deposit with revised data exports as needed.
+
+Validation
+^^^^^^^^^^
+
+To validate datapackage files and associated data files, use frictionless:
+
+1. `pip install frictionless`
+2. `frictionless validate vX.X/datapackage.json`
+
+This will report any errors in the datapackage file itself, as well as validation errors where the types or pattern constraints specified in the datapackage do not match the data in the associated CSV files.
+
+
+Scripts
+^^^^^^^
+
+All scripts require pandas (`pip install pandas`).
+
+- `readme_info.py` - use to generate dataset summary information for inclusion in the plain-text readme (number of fields, number of rows, and an optional list of fields with descriptions); can also generate a CSV data dictionary. Takes a path to the datapackage file; resource paths referenced in the datapackage must resolve.
+- `member_changes.py` - for members present in an old version but not in the new one, creates a CSV of changes recording new ids for member ids that changed; requires pandas. Must be updated for each new version, and the new changes should be added to those from previous versions.
+- `book_changes.py` - same as above, but for book ids.
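
The validation step described above can also be run from Python via the frictionless library, which is convenient for scripted checks. A minimal sketch, assuming the same illustrative `vX.X/datapackage.json` path used in the instructions above::

    from frictionless import validate

    # validates the datapackage descriptor and every CSV resource it references,
    # checking the declared types and pattern constraints against the data
    report = validate("vX.X/datapackage.json")

    if report.valid:
        print("datapackage and data files are valid")
    else:
        # the report lists descriptor errors and any failing rows or cells
        print(report)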
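
For `readme_info.py`, the argparse options shown in the diff below support an invocation like `python readme_info.py vX.X/datapackage.json --no-field-list -dd data-dictionary.csv`, where `--no-field-list` suppresses the field list and `-dd`/`--data-dictionary` writes the CSV data dictionary (the path is illustrative). The member and book change scripts are not shown in this section, but the old-versus-new comparison described above can be sketched with pandas roughly as follows; the file and column names here are hypothetical, not the scripts' actual inputs::

    import pandas as pd

    # hypothetical paths and column name -- a sketch of the kind of
    # comparison member_changes.py / book_changes.py are described as doing
    old = pd.read_csv("members_old.csv")
    new = pd.read_csv("members_new.csv")

    # member ids present in the old export but missing from the new one
    missing = old[~old["member_id"].isin(new["member_id"])]
    missing.to_csv("member_changes_sketch.csv", index=False)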
78 changes: 66 additions & 12 deletions data-export-meta/readme_info.py
@@ -8,33 +8,87 @@
 
 import json
 import sys
+import argparse
+import pathlib
+import csv
 
 import pandas as pd
 
 
-def readme_info(df, dp_resource):
+def readme_info(df, dp_resource, field_list=True):
     print("1. Number of fields: %d\n" % len(df.columns))
     print("2. Number of rows: {:,}\n".format(len(df)))
     schema_fields = dp_resource["schema"]["fields"]
 
     assert len(schema_fields) == len(df.columns)
     field_info = {field["name"]: field for field in schema_fields}
 
-    print("3. Field List:")
-    for col in df.columns:
-        print("%s : %s" % (col, field_info[col]["description"]))
+    if field_list:
+        print("3. Field List:")
+        for col in df.columns:
+            print("%s : %s" % (col, field_info[col]["description"]))
 
 
 if __name__ == "__main__":
-    if len(sys.argv) < 2:
-        print("Please provide path to frictionless datapackage file")
-        exit(0)
+    parser = argparse.ArgumentParser(
+        "Generate dataset info readme from datapackage and data files"
+    )
+    parser.add_argument("datapackage", type=pathlib.Path)
+    # flag to determine whether fields should be listed
+    parser.add_argument(
+        "--field-list",
+        help="Generate field list in readme.txt format",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+    )
+    parser.add_argument(
+        "-dd",
+        "--data-dictionary",
+        help="Create a data dictionary in the specified file",
+        type=pathlib.Path,
+    )
 
-    with open(sys.argv[1]) as packagejson:
+    args = parser.parse_args()
+
+    if args.data_dictionary:
+        if args.data_dictionary.exists():
+            print(
+                f"Requested data dictionary file {args.data_dictionary} already exists"
+            )
+            raise SystemExit(1)
+    with args.datapackage.open() as packagejson:
         datapackage = json.load(packagejson)
 
-    csvfile = datapackage["resources"][0]["path"]
-    print("Inspecting %s...\n\n" % csvfile)
+    for resource in datapackage["resources"]:
+        # resource path should be relative to the datapackage file
+        datafile = args.datapackage.parent / resource["path"]
+        print("\n\nInspecting %s...\n\n" % datafile)
+        with datafile.open() as csvfile:
+            df = pd.read_csv(csvfile)
+            readme_info(df, resource, field_list=args.field_list)
 
-    df = pd.read_csv(csvfile)
-    readme_info(df, datapackage["resources"][0])
+    if args.data_dictionary:
+        print(f"\n\nWriting data dictionary to {args.data_dictionary}")
+        with args.data_dictionary.open("w", encoding="utf-8") as csv_datadict:
+            fieldnames = [
+                "Filename",
+                "Variable",
+                "Variable name",
+                "Description",
+                "Type",
+                "Format",
+            ]
+            csvwriter = csv.DictWriter(csv_datadict, fieldnames=fieldnames)
+            csvwriter.writeheader()
+            for resource in datapackage["resources"]:
+                for field in resource["schema"]["fields"]:
+                    csvwriter.writerow(
+                        {
+                            "Filename": resource["path"],
+                            "Variable": field["title"],
+                            "Variable name": field["name"],
+                            "Description": field["description"],
+                            "Type": field["type"],
+                            "Format": field.get("format"),
+                        }
+                    )