From e59209a7fd18aa3e141e28161d373379393bc8ae Mon Sep 17 00:00:00 2001 From: Steve Robinson Date: Mon, 4 Dec 2023 15:00:48 +0000 Subject: [PATCH 1/2] Add optional argument of table schema source This change adds an optional command line argument (-s) to specify a directory containing any table schemas. This allows the definition of the table schema to be pulled into external local files for convenience. --- src/csv2rdf/csvw.clj | 21 ++++++++++---- src/csv2rdf/main.clj | 13 +++++++-- src/csv2rdf/metadata.clj | 8 +++--- src/csv2rdf/metadata/context.clj | 13 +++++---- src/csv2rdf/metadata/schema.clj | 4 +-- src/csv2rdf/metadata/table.clj | 11 +++++-- src/csv2rdf/metadata/types.clj | 35 ++++++++++++++++------- src/csv2rdf/tabular/metadata.clj | 2 +- src/csv2rdf/tabular/processing.clj | 10 +++---- test/csv2rdf/metadata_test.clj | 4 +-- test/csv2rdf/w3c_csvw_suite_test/impl.clj | 2 +- 11 files changed, 81 insertions(+), 42 deletions(-) diff --git a/src/csv2rdf/csvw.clj b/src/csv2rdf/csvw.clj index 9ee503cf..b334f105 100644 --- a/src/csv2rdf/csvw.clj +++ b/src/csv2rdf/csvw.clj @@ -57,10 +57,13 @@ - `:annotated` a custom mode, not part of the standard, which is like `:minimal`, but it also includes RDF data from the CSVW metadata json file." - ([tabular-source metadata-source] (csv->rdf tabular-source metadata-source {})) - ([tabular-source metadata-source {:keys [mode] :as options}] + ([tabular-source metadata-source] (csv->rdf tabular-source metadata-source nil {})) + ([tabular-source + metadata-source + table-schema-source + {:keys [mode] :as options}] (let [mode (or mode :standard) - {:keys [tables] :as metadata} (processing/get-metadata tabular-source metadata-source) + {:keys [tables] :as metadata} (processing/get-metadata tabular-source metadata-source table-schema-source) table-group-dialect (:dialect metadata) output-tables (remove properties/suppress-output? tables) {:keys [statements] :as ctx} (table-group-context mode metadata) @@ -73,8 +76,16 @@ "Run csv->rdf for the given tabular/metadata sources and options then write the resulting statements to the given destination. destination must implement grafter-2.rdf.protocols/ITripleWriteable." - [tabular-source metadata-source destination options] - (gproto/add destination (csv->rdf tabular-source metadata-source options))) + [tabular-source + metadata-source + table-schema-source + destination + options] + (gproto/add destination (csv->rdf + tabular-source + metadata-source + table-schema-source + options))) (defn csv->rdf->file "Run csv->rdf for the given tabular/metadata source and options then write the resulting diff --git a/src/csv2rdf/main.clj b/src/csv2rdf/main.clj index eacf65b3..28958b54 100644 --- a/src/csv2rdf/main.clj +++ b/src/csv2rdf/main.clj @@ -15,6 +15,7 @@ [["-t" "--tabular TABULAR" "Location of the tabular file"] ["-u" "--user-metadata METADATA" "Location of the metadata file"] ["-o" "--output-file OUTPUT" "Output file to write to"] + ["-s" "--table-schema TABLE-SCHEMA" "A directory containing any referenced table schema files"] ["-m" "--mode MODE" "CSVW mode to run" :validate [#(contains? #{:minimal :standard :annotated} %)] :default :standard @@ -51,9 +52,14 @@ :summary summary})) options))) -(defn- write-output [writer {:keys [rdf-format tabular-source metadata-source mode]}] +(defn- write-output [writer {:keys [rdf-format tabular-source metadata-source mode table-schema-source]}] (let [dest (gio/rdf-writer writer :format rdf-format :prefixes nil)] - (csvw/csv->rdf->destination tabular-source metadata-source dest {:mode mode}))) + (csvw/csv->rdf->destination + tabular-source + metadata-source + table-schema-source + dest + {:mode mode}))) (defmulti display-error "Displays an exception in the UI" @@ -71,9 +77,10 @@ (defn- inner-main [args] (let [options (parse-cli-options args) - {:keys [mode tabular user-metadata output-file]} options + {:keys [mode tabular user-metadata output-file table-schema]} options opts {:tabular-source (some-> tabular parse-source) :metadata-source (some-> user-metadata parse-source) + :table-schema-source (some-> table-schema ((fn [t] (URI. (str "file://" t))))) :rdf-format (or (some-> output-file formats/->rdf-format) RDFFormat/TURTLE) :mode mode} output-file (some-> output-file io/file)] diff --git a/src/csv2rdf/metadata.clj b/src/csv2rdf/metadata.clj index 3cd0379c..be2396aa 100644 --- a/src/csv2rdf/metadata.clj +++ b/src/csv2rdf/metadata.clj @@ -7,8 +7,8 @@ [csv2rdf.source :as source] [clojure.spec.alpha :as s])) -(defn parse-metadata-json [base-uri json] - (let [context (make-context base-uri)] +(defn parse-metadata-json [base-uri json table-schema-source] + (let [context (make-context base-uri table-schema-source)] (cond (table-group/looks-like-table-group-json? json) (properties/set-table-group-parent-references (table-group/parse-table-group-json context json)) @@ -18,9 +18,9 @@ :else (make-error context "Expected top-level of metadata document to describe a table or table group")))) -(defn parse-table-group-from-source [source] +(defn parse-table-group-from-source [source table-schema-source] (let [json (source/get-json source)] - (parse-metadata-json (source/->uri source) json))) + (parse-metadata-json (source/->uri source) json table-schema-source))) (s/fdef parse-table-group-from-source :args (s/cat :source (s/and ::source/uriable ::source/json-source))) diff --git a/src/csv2rdf/metadata/context.clj b/src/csv2rdf/metadata/context.clj index 757a05ce..38facad0 100644 --- a/src/csv2rdf/metadata/context.clj +++ b/src/csv2rdf/metadata/context.clj @@ -9,8 +9,14 @@ (defn document-uri ^URI [context] (:document-uri context)) -(defn make-context [metadata-uri] - {:base-uri metadata-uri :document-uri metadata-uri :path [] :language nil}) +(defn make-context + ([metadata-uri] (make-context metadata-uri nil)) + ([metadata-uri table-schema-source] + {:base-uri metadata-uri + :document-uri metadata-uri + :path [] + :language nil + :table-schema-source table-schema-source})) (defn language-code-or-default [{:keys [language] :as context}] (or language "und")) @@ -29,9 +35,6 @@ (defn append-path [context path-element] (update context :path conj path-element)) -(defn resolve-uri [{:keys [^URI base-uri] :as context} ^URI uri] - (util/resolve-uri base-uri uri)) - (defn with-document-uri [context ^URI new-document-uri] (assoc context :document-uri new-document-uri)) diff --git a/src/csv2rdf/metadata/schema.clj b/src/csv2rdf/metadata/schema.clj index 869d18f1..0746e4a7 100644 --- a/src/csv2rdf/metadata/schema.clj +++ b/src/csv2rdf/metadata/schema.clj @@ -1,9 +1,9 @@ (ns csv2rdf.metadata.schema - (:require [csv2rdf.metadata.validator :refer [make-warning make-error invalid chain array-of type-eq strict variant + (:require [csv2rdf.metadata.validator :refer [make-warning make-error array-of type-eq strict variant type-error-message]] [csv2rdf.json :as mjson] [csv2rdf.metadata.context :refer [append-path]] - [csv2rdf.metadata.types :refer [object-of object-property link-property column-reference id]] + [csv2rdf.metadata.types :refer [link-property column-reference id]] [csv2rdf.metadata.inherited :refer [metadata-of]] [csv2rdf.metadata.column :as column] [clojure.string :as string] diff --git a/src/csv2rdf/metadata/table.clj b/src/csv2rdf/metadata/table.clj index 8c35b65f..59eaf2c5 100644 --- a/src/csv2rdf/metadata/table.clj +++ b/src/csv2rdf/metadata/table.clj @@ -1,6 +1,6 @@ (ns csv2rdf.metadata.table (:require [csv2rdf.metadata.validator :refer [array-of bool type-eq strict]] - [csv2rdf.metadata.types :refer [link-property note table-direction object-property id contextual-object]] + [csv2rdf.metadata.types :refer [link-property note table-direction table-schema-object-property id contextual-object]] [csv2rdf.metadata.inherited :refer [metadata-of]] [csv2rdf.metadata.schema :as schema] [csv2rdf.metadata.transformation :as transformation] @@ -20,7 +20,7 @@ :notes (array-of note) :suppressOutput bool :tableDirection table-direction - :tableSchema (object-property schema/schema) + :tableSchema (table-schema-object-property schema/schema) :transformations (array-of transformation/transformation) :id id :type (type-eq "Table")}})) @@ -45,7 +45,12 @@ {:url table-uri :tableSchema schema}) -(defn ^{:metadata-spec "5.4.3"} validate-compatible [validating? {^URI uri1 :url schema1 :tableSchema :as table1} {^URI uri2 :url schema2 :tableSchema :as table2}] +(defn ^{:metadata-spec "5.4.3"} validate-compatible + [validating? + {^URI uri1 :url + schema1 :tableSchema} + {^URI uri2 :url + schema2 :tableSchema}] (when-not (= (.normalize uri1) (.normalize uri2)) (logging/log-warning (format "Table URIs %s and %s not equal after normalisation" uri1 uri2))) (schema/validate-compatible validating? schema1 schema2)) diff --git a/src/csv2rdf/metadata/types.clj b/src/csv2rdf/metadata/types.clj index 789f88d8..ffdee5ed 100644 --- a/src/csv2rdf/metadata/types.clj +++ b/src/csv2rdf/metadata/types.clj @@ -1,9 +1,9 @@ (ns csv2rdf.metadata.types (:require [csv2rdf.metadata.validator :refer [make-warning default-if-invalid variant invalid array-of kvps optional-key - required-key any map-of one-of string invalid? warn-invalid - chain try-parse-with where make-error uri ignore-invalid + required-key map-of one-of string invalid? warn-invalid + chain where make-error uri ignore-invalid type-error-message with-error-handler]] - [csv2rdf.metadata.context :refer [resolve-uri append-path language-code-or-default + [csv2rdf.metadata.context :refer [append-path language-code-or-default base-key language-key id-key update-from-local-context with-document-uri]] [csv2rdf.json-ld :refer [expand-uri-string]] [csv2rdf.json :refer [array? object?] :as mjson] @@ -60,17 +60,20 @@ (def default-uri (URI. "")) -(defn ^{:metadata-spec "6.3"} normalise-link-property - "Normalises a link property URI by resolving it against the current base URI." - [context uri] - (resolve-uri context uri)) - (defn ^{:metadata-spec "5.1.2"} link-property ([context x] (link-property context x warn-invalid)) ([context x error-fn] - (let [v (chain (default-if-invalid (with-error-handler (variant {:string uri}) error-fn) default-uri) normalise-link-property)] + (let [v (chain (default-if-invalid (with-error-handler (variant {:string uri}) error-fn) default-uri) + #(util/resolve-uri (:base-uri %1) %2))] (v context x)))) +(defn ^{:metadata-spec "5.1.2"} link-property-from-table-schema-source + ([context x] (link-property-from-table-schema-source context x warn-invalid)) + ([context x error-fn] + (let [v (chain (default-if-invalid (with-error-handler (variant {:string uri}) error-fn) default-uri) + #(util/resolve-uri (or (:table-schema-source %1) (:base-uri %1)) %2))] + (v context x)))) + (defn id "An id is a link property whose value cannot begin with _:" [context x] @@ -123,7 +126,7 @@ (defn ^{:metadata-spec "5.8.2"} expand-description-object-type-uri "If type is the name of a description object defined in the metadata specification (e.g. Table, Schema), - returns the corresponding id URI for the type. Otherwise returns nil." + returns the corresponding id URI for the type. Otherwise, returns nil." [type] (if (contains? description-object-types type) (util/set-fragment csvw type))) @@ -164,7 +167,7 @@ (URI. expanded) (catch URISyntaxException ex (make-error context (format "Invalid URI '%s'" s))))] - (resolve-uri context uri)))) + (util/resolve-uri (:base-uri context) uri)))) (make-error context (type-error-message #{:string} (mjson/get-json-type x))))) (defn type-one-of [allowed-types] @@ -397,3 +400,13 @@ (variant {:string (chain link-property (linked-object-property object-validator)) :object object-validator :default {}})) + +(defn ^{:metadata-spec "5.1.5"} table-schema-object-property + "Object which may be: + 1. specified in line in the metadata document + 2. referenced through a URI + 3. looked up from a file in the table-schema-source directory" + [object-validator] + (variant {:string (chain link-property-from-table-schema-source (linked-object-property object-validator)) + :object object-validator + :default {}})) \ No newline at end of file diff --git a/src/csv2rdf/tabular/metadata.clj b/src/csv2rdf/tabular/metadata.clj index a668600a..4c0ca4ca 100644 --- a/src/csv2rdf/tabular/metadata.clj +++ b/src/csv2rdf/tabular/metadata.clj @@ -145,7 +145,7 @@ (if-let [metadata-doc (resolve-associated-metadata uri metadata-link)] (do (.close stream) - (meta/parse-metadata-json uri metadata-doc)) + (meta/parse-metadata-json uri metadata-doc nil)) (let [dialect (dialect/get-default-dialect headers) options (dialect/dialect->options dialect) rows (reader/make-row-seq stream options)] diff --git a/src/csv2rdf/tabular/processing.clj b/src/csv2rdf/tabular/processing.clj index fe378766..02c8cc41 100644 --- a/src/csv2rdf/tabular/processing.clj +++ b/src/csv2rdf/tabular/processing.clj @@ -17,8 +17,8 @@ (table/validate-compatible validating? user-table table-metadata) (table/compatibility-merge user-table table-metadata))) -(defn- from-metadata-source [metadata-source] - (let [{:keys [tables] :as user-table-group} (meta/parse-table-group-from-source metadata-source) +(defn- from-metadata-source [metadata-source table-schema-source] + (let [{:keys [tables] :as user-table-group} (meta/parse-table-group-from-source metadata-source table-schema-source) validating? false merged-tables (mapv (fn [table] (validate-merge-table validating? table)) tables) merged-table-group (assoc user-table-group :tables merged-tables)] @@ -27,13 +27,13 @@ (defn ^{:tabular-spec "6.1"} get-metadata "Retrieves and resolves the metadata given either a tabular data source or metadata source. If user metadata is provided, each referenced table definition is validated against the corresponding tabular data file." - [tabular-source metadata-source] + [tabular-source metadata-source table-schema-source] (cond (and (some? tabular-source) (some? metadata-source)) - (from-metadata-source (meta/overriding-metadata tabular-source metadata-source)) + (from-metadata-source (meta/overriding-metadata tabular-source metadata-source) table-schema-source) (some? metadata-source) - (from-metadata-source metadata-source) + (from-metadata-source metadata-source table-schema-source) (some? tabular-source) (from-tabular-source tabular-source) diff --git a/test/csv2rdf/metadata_test.clj b/test/csv2rdf/metadata_test.clj index 4f09f810..f332bb7e 100644 --- a/test/csv2rdf/metadata_test.clj +++ b/test/csv2rdf/metadata_test.clj @@ -15,7 +15,7 @@ "tableSchema" {"columns" [{"name" "col1"} {"name" "col2"}]} "tables" [{"url" "http://example.com/table.csv"}]} - parsed (parse-metadata-json (URI. "http://example.com/metadata.json") json) + parsed (parse-metadata-json (URI. "http://example.com/metadata.json") json nil) table (get-in parsed [:tables 0]) dialect (properties/dialect table) schema (properties/table-schema table)] @@ -28,7 +28,7 @@ "dialect" {"quoteChar" "{"} "tableSchema" {"columns" [{"name" "col1"} {"name" "col2"}]}} - parsed (parse-metadata-json (URI. "http://example.com/metadata.json") json) + parsed (parse-metadata-json (URI. "http://example.com/metadata.json") json nil) table (get-in parsed [:tables 0]) dialect (properties/dialect table) schema (properties/table-schema table)] diff --git a/test/csv2rdf/w3c_csvw_suite_test/impl.clj b/test/csv2rdf/w3c_csvw_suite_test/impl.clj index fea4f4c9..f546a92c 100644 --- a/test/csv2rdf/w3c_csvw_suite_test/impl.clj +++ b/test/csv2rdf/w3c_csvw_suite_test/impl.clj @@ -16,7 +16,7 @@ (logging/with-logger logger (with-open [destination (repo/->connection repo)] (try - (csv->rdf->destination tabular-source metadata-source destination options) + (csv->rdf->destination tabular-source metadata-source nil destination options) {:errors [] :warnings @(:warnings logger) :result (into [] (gio/statements destination))} (catch Exception ex {:errors [(.getMessage ex)] :warnings @(:warnings logger) :result nil})))))) From 1758e013d786469a1056b2c476a24fdc6d58605a Mon Sep 17 00:00:00 2001 From: Steve Robinson Date: Mon, 4 Dec 2023 15:12:50 +0000 Subject: [PATCH 2/2] Include tools.namespace dep in dev This includes the super helpful refresh-all function which refreshes all files in the repl --- deps.edn | 3 ++- src/csv2rdf/csvw.clj | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/deps.edn b/deps.edn index 2717aafa..be95b396 100644 --- a/deps.edn +++ b/deps.edn @@ -15,7 +15,8 @@ org.apache.logging.log4j/log4j-core {:mvn/version "2.20.0"} org.apache.logging.log4j/log4j-slf4j2-impl {:mvn/version "2.20.0"}}} :dev { - :extra-deps {com.clojure-goes-fast/clj-async-profiler {:mvn/version "1.0.4"}} + :extra-deps {com.clojure-goes-fast/clj-async-profiler {:mvn/version "1.0.4"} + org.clojure/tools.namespace {:mvn/version "1.4.4"}} :jvm-opts ["-Djdk.attach.allowAttachSelf" ;; for jdk9+ ] diff --git a/src/csv2rdf/csvw.clj b/src/csv2rdf/csvw.clj index b334f105..5f63d85c 100644 --- a/src/csv2rdf/csvw.clj +++ b/src/csv2rdf/csvw.clj @@ -61,7 +61,7 @@ ([tabular-source metadata-source table-schema-source - {:keys [mode] :as options}] + {:keys [mode]}] (let [mode (or mode :standard) {:keys [tables] :as metadata} (processing/get-metadata tabular-source metadata-source table-schema-source) table-group-dialect (:dialect metadata)