From 6873e01a5b1cca8992c005ee8c449d910951ded9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kate=C5=99ina=20Hron=C3=ADkov=C3=A1?= <56041262+katacek@users.noreply.github.com> Date: Fri, 6 Dec 2024 11:07:00 +0100 Subject: [PATCH] docs: add dataset schema validation (#1304) --- .../DatasetSchemaValidationError.yaml | 53 +++++ .../datasets/PutItemResponseError.yaml | 9 + .../openapi/paths/datasets/datasets.yaml | 1 + .../datasets/datasets@{datasetId}@items.yaml | 31 +++ .../index.md} | 2 +- .../dataset_schema/validation.md | 222 ++++++++++++++++++ sources/platform/monitoring/index.md | 12 +- 7 files changed, 328 insertions(+), 2 deletions(-) create mode 100644 apify-api/openapi/components/schemas/datasets/DatasetSchemaValidationError.yaml create mode 100644 apify-api/openapi/components/schemas/datasets/PutItemResponseError.yaml rename sources/platform/actors/development/actor_definition/{output_schema.md => dataset_schema/index.md} (99%) create mode 100644 sources/platform/actors/development/actor_definition/dataset_schema/validation.md diff --git a/apify-api/openapi/components/schemas/datasets/DatasetSchemaValidationError.yaml b/apify-api/openapi/components/schemas/datasets/DatasetSchemaValidationError.yaml new file mode 100644 index 000000000..0121b9993 --- /dev/null +++ b/apify-api/openapi/components/schemas/datasets/DatasetSchemaValidationError.yaml @@ -0,0 +1,53 @@ +type: object +properties: + error: + type: object + properties: + type: + type: string + description: The type of the error. + example: "schema-validation-error" + message: + type: string + description: A human-readable message describing the error. + example: "Schema validation failed" + data: + type: object + properties: + invalidItems: + type: array + description: A list of invalid items in the received array of items. + items: + type: object + properties: + itemPosition: + type: number + description: The position of the invalid item in the array. + example: 2 + validationErrors: + type: array + description: A complete list of AJV validation error objects for the invalid item. + items: + type: object + properties: + instancePath: + type: string + description: The path to the instance being validated. + schemaPath: + type: string + description: The path to the schema that failed the validation. + keyword: + type: string + description: The validation keyword that caused the error. + message: + type: string + description: A message describing the validation error. + params: + type: object + description: Additional parameters specific to the validation error. + required: + - invalidItems + required: + - type + - message + - data diff --git a/apify-api/openapi/components/schemas/datasets/PutItemResponseError.yaml b/apify-api/openapi/components/schemas/datasets/PutItemResponseError.yaml new file mode 100644 index 000000000..2f6f30ceb --- /dev/null +++ b/apify-api/openapi/components/schemas/datasets/PutItemResponseError.yaml @@ -0,0 +1,9 @@ +title: PutItemResponseError +required: + - error +type: object +properties: + error: + allOf: + - $ref: ./DatasetSchemaValidationError.yaml + - {} diff --git a/apify-api/openapi/paths/datasets/datasets.yaml b/apify-api/openapi/paths/datasets/datasets.yaml index e22976e89..b9f734fe5 100644 --- a/apify-api/openapi/paths/datasets/datasets.yaml +++ b/apify-api/openapi/paths/datasets/datasets.yaml @@ -106,6 +106,7 @@ post: Keep in mind that data stored under unnamed dataset follows [data retention period](https://docs.apify.com/platform/storage#data-retention). 
It creates a dataset with the given name if the parameter name is used. If a dataset with the given name already exists then returns its object. + operationId: datasets_post parameters: - name: name diff --git a/apify-api/openapi/paths/datasets/datasets@{datasetId}@items.yaml b/apify-api/openapi/paths/datasets/datasets@{datasetId}@items.yaml index 1bd4312e9..4cad700ad 100644 --- a/apify-api/openapi/paths/datasets/datasets@{datasetId}@items.yaml +++ b/apify-api/openapi/paths/datasets/datasets@{datasetId}@items.yaml @@ -478,6 +478,10 @@ post: The POST payload is a JSON object or a JSON array of objects to save into the dataset. + If the data you attempt to store in the dataset is invalid (meaning any of the items received by the API fails the validation), the whole request is discarded and the API will return a response with status code 400. + For more information about dataset schema validation, see [Dataset schema](https://docs.apify.com/platform/actors/development/actor-definition/dataset-schema/validation). + + **IMPORTANT:** The limit of request payload size for the dataset is 5 MB. If the array exceeds the size, you'll need to split it into a number of smaller arrays. operationId: dataset_items_post parameters: @@ -523,6 +527,33 @@ post: type: object example: {} example: {} + '400': + description: '' + headers: {} + content: + application/json: + schema: + allOf: + - $ref: >- + ../../components/schemas/datasets/PutItemResponseError.yaml + - example: + error: + type: schema-validation-error + message: Schema validation failed + example: + error: + type: schema-validation-error + message: Schema validation failed + data: + invalidItems: + - itemPosition: 2 + validationErrors: + - instancePath: /1/stringField + schemaPath: /items/properties/stringField/type + keyword: type + params: + type: string + message: 'must be string' deprecated: false x-legacy-doc-urls: - https://docs.apify.com/api/v2#/reference/datasets/item-collection/put-items diff --git a/sources/platform/actors/development/actor_definition/output_schema.md b/sources/platform/actors/development/actor_definition/dataset_schema/index.md similarity index 99% rename from sources/platform/actors/development/actor_definition/output_schema.md rename to sources/platform/actors/development/actor_definition/dataset_schema/index.md index 6834c907f..d4299ed9a 100644 --- a/sources/platform/actors/development/actor_definition/output_schema.md +++ b/sources/platform/actors/development/actor_definition/dataset_schema/index.md @@ -118,7 +118,7 @@ The template above defines the configuration for the default dataset output view The default behavior of the Output tab UI table is to display all fields from `transformation.fields` in the specified order. You can customize the display properties for specific formats or column labels if needed. -![Output tab UI](./images/output-schema-example.png) +![Output tab UI](../images/output-schema-example.png) ## Structure diff --git a/sources/platform/actors/development/actor_definition/dataset_schema/validation.md b/sources/platform/actors/development/actor_definition/dataset_schema/validation.md new file mode 100644 index 000000000..882b614e9 --- /dev/null +++ b/sources/platform/actors/development/actor_definition/dataset_schema/validation.md @@ -0,0 +1,222 @@ +--- +title: Dataset validation +description: Specify the dataset schema within the Actors so you can add monitoring and validation at the field level. 
slug: /actors/development/actor-definition/dataset-schema/validation
---

**Specify the dataset schema within the Actors so you can add monitoring and validation at the field level.**

---

To define a schema for the default dataset of an Actor run, you need to set the `fields` property in the dataset schema.

:::info

The schema describes a single item in the dataset. Be careful not to define the schema as an array; it always needs to be the schema of an object.

Schema configuration is not available for named datasets or dataset views.

:::

You can either do that directly in `.actor.json`:

```json title=".actor.json"
{
    "actorSpecification": 1,
    "storages": {
        "dataset": {
            "actorSpecification": 1,
            "fields": {
                "$schema": "http://json-schema.org/draft-07/schema#",
                "type": "object",
                "properties": {
                    "name": {
                        "type": "string"
                    }
                },
                "required": ["name"]
            },
            "views": {}
        }
    }
}
```

Or in a separate file linked from `.actor.json`:

```json title=".actor.json"
{
    "actorSpecification": 1,
    "storages": {
        "dataset": "./dataset_schema.json"
    }
}
```

```json title="dataset_schema.json"
{
    "actorSpecification": 1,
    "fields": {
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "object",
        "properties": {
            "name": {
                "type": "string"
            }
        },
        "required": ["name"]
    },
    "views": {}
}
```

:::important

The dataset schema needs to be a valid JSON Schema draft-07 document, so the `$schema` line must either be exactly the following value or be omitted entirely:

`"$schema": "http://json-schema.org/draft-07/schema#"`

:::

## Dataset validation

When you define a schema for your default dataset, the schema is always used to validate the data whenever you insert items into the dataset (we use [AJV](https://ajv.js.org/) for the validation).

If the validation succeeds, nothing changes from the current behavior: the data is stored and an empty response with status code `201` is returned.

If the data you attempt to store in the dataset is _invalid_ (meaning any of the items received by the API fails validation), _the entire request is discarded_. The API returns a response with status code `400` and the following JSON body:

```json
{
    "error": {
        "type": "schema-validation-error",
        "message": "Schema validation failed",
        "data": {
            "invalidItems": [{
                "itemPosition": "<position of the invalid item in the received array>",
                "validationErrors": "<list of AJV validation error objects for the invalid item>"
            }]
        }
    }
}
```

The type of the AJV validation error object is documented [here](https://github.com/ajv-validator/ajv/blob/master/lib/types/index.ts#L86).
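If you want to see locally what the platform's validation would report, you can run the same kind of check yourself with AJV. The following is a minimal sketch, not the platform's actual implementation: it compiles the example `fields` schema shown above and collects failures in the same shape as the `invalidItems` array from the error response; the sample items are made up for illustration.

```javascript
import Ajv from 'ajv';

// The example `fields` schema from above.
const fields = {
    $schema: 'http://json-schema.org/draft-07/schema#',
    type: 'object',
    properties: {
        name: { type: 'string' },
    },
    required: ['name'],
};

const ajv = new Ajv({ allErrors: true });
const validate = ajv.compile(fields);

// Hypothetical items; the second one fails because `name` is not a string.
const items = [{ name: 'Sofa' }, { name: 42 }];

// Collect failures in the same shape as the API's `invalidItems` array.
const invalidItems = items
    .map((item, itemPosition) => (validate(item)
        ? null
        : { itemPosition, validationErrors: validate.errors }))
    .filter((result) => result !== null);

console.log(JSON.stringify(invalidItems, null, 2));
```

For the invalid item in this sketch, AJV reports a `type` error with `instancePath` `/name` and the message `must be string`, which mirrors the structure of the `400` response described above.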
If you use the Apify JS client or the Apify SDK and call the `pushData` function, you can access the validation errors in a `try...catch` block like this:

```javascript
try {
    await Actor.pushData(items);
} catch (error) {
    // Rethrow anything that is not a schema validation error.
    if (!error.data?.invalidItems) throw error;
    error.data.invalidItems.forEach((item) => {
        // itemPosition is the index of the invalid item in the pushed array,
        // validationErrors is the list of AJV error objects for that item.
        const { itemPosition, validationErrors } = item;
        console.log(itemPosition, validationErrors);
    });
}
```

## Examples of common types of validation

Optional field (`price` is optional in this case):

```json
{
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "properties": {
        "name": {
            "type": "string"
        },
        "price": {
            "type": "number"
        }
    },
    "required": ["name"]
}
```

Field with multiple types:

```json
{
    "price": {
        "type": ["string", "number"]
    }
}
```

Field with type `any`:

```json
{
    "price": {
        "type": ["string", "number", "object", "array", "boolean"]
    }
}
```

Enabling fields to be `null`:

```json
{
    "name": {
        "type": "string",
        "nullable": true
    }
}
```

Defining the type of objects in an array:

```json
{
    "comments": {
        "type": "array",
        "items": {
            "type": "object",
            "properties": {
                "author_name": {
                    "type": "string"
                }
            }
        }
    }
}
```

Defining specific fields while allowing anything else to be added to the item:

```json
{
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "properties": {
        "name": {
            "type": "string"
        }
    },
    "additionalProperties": true
}
```

See the [JSON Schema reference](https://json-schema.org/understanding-json-schema/reference) for additional options.

You can also use [conversion tools](https://www.liquid-technologies.com/online-json-to-schema-converter) to convert an existing JSON document into its JSON schema.

## Dataset field statistics

When you configure the dataset fields schema, we generate a field list and measure the following statistics:

- **Null count:** how many items in the dataset have the field set to `null`
- **Empty count:** how many items in the dataset have the field `undefined`, meaning that, for example, an empty string is not considered empty
- **Minimum and maximum**
  - For numbers, this is calculated directly
  - For strings, this tracks the string length
  - For arrays, this tracks the number of items in the array
  - For objects, this tracks the number of keys
  - For booleans, this tracks whether the boolean was set to `true`. The minimum is always 0, but the maximum can be either 1 or 0 based on whether at least one item in the dataset has the boolean field set to `true`.

You can use these statistics in [monitoring](../../../../monitoring#alert-configuration).

diff --git a/sources/platform/monitoring/index.md b/sources/platform/monitoring/index.md
index d308d6fce..5d208cc7c 100644
--- a/sources/platform/monitoring/index.md
+++ b/sources/platform/monitoring/index.md
@@ -41,12 +41,22 @@ Currently, the monitoring option offers the following features:

### Alert configuration

-When you set up an alert, you have two choices for how you want the metrics to be evaluated. And depending on your choices, the alerting system will behave differently:
+When you set up an alert, you have four choices for how you want the metrics to be evaluated. Depending on your choice, the alerting system will behave differently:

1. **Alert, when the metric is lower than** - This type of alert is checked after the run finishes. If the metric is lower than the value you set, the alert will be triggered and you will receive a notification.

2. **Alert, when the metric is higher than** - This type of alert is checked both during the run and after the run finishes. During the run, we do periodic checks (approximately every 5 minutes) so that we can notify you as soon as possible if the metric is higher than the value you set. After the run finishes, we do a final check to make sure that the metric does not go over the limit in the last few minutes of the run.

+3. **Alert, when run status is one of following** - This type of alert is checked only after the run finishes. It makes it possible to track the status of your finished runs and send an alert if a run finishes in a state you do not expect. If your Actor runs very often and suddenly starts failing, you will receive a single alert within 1 minute of the first failed run, and then an aggregated alert every 15 minutes.
+
+4. **Alert for dataset field statistics** - If you have a [dataset schema](../actors/development/actor_definition/dataset_schema/validation.md) set up, you can use the field statistics to configure an alert. For example, you can use field statistics to track whether a field is filled in for all records, whether a numeric value is too low or too high (for example, when tracking the price of a product across multiple sources), whether the number of items in an array is too low or too high (for example, to alert on an Instagram Actor when a post has a lot of comments), and many other similar tasks.

   :::important

   Available dataset fields are taken from the last successful build of the monitored Actor. If different versions have different fields, the alert configuration currently displays only the fields from the default version.

   :::

![Metric condition configuration](./images/metric-options.png)

You can get notified by email, Slack, or in Apify Console. If you use Slack, we suggest using Slack notifications instead of email because they are more reliable, and you can also get notified quicker.