-
Notifications
You must be signed in to change notification settings - Fork 94
77 lines (67 loc) · 2.18 KB
/
create-parquet.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# Builds one parquet file from the CSV forecasts stored under data-processed/
# and publishes it as a workflow artifact. Runs only when triggered manually.
name: Create parquet

on:
  workflow_dispatch:

jobs:
  create-parquet:
    # Skip the job unless the repository belongs to the hub organisation.
    if: github.repository_owner == 'european-modelling-hubs'
    runs-on: ubuntu-latest
    env:
      # Makes the workflow token available to every step as GITHUB_PAT.
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
    steps:
      - uses: actions/checkout@v4

      # NOTE(review): install-r: false relies on R already being present on
      # the runner image - confirm this still holds for ubuntu-latest.
      - uses: r-lib/actions/setup-r@v2
        with:
          install-r: false
          use-public-rspm: true

      - name: Install R system requirements
        # -y keeps apt-get from stopping at a confirmation prompt, which
        # cannot be answered in a non-interactive CI shell.
        run: |
          sudo apt-get update
          sudo apt-get install -y libcurl4-openssl-dev

      - uses: r-lib/actions/setup-renv@v2

      - name: Generate parquet file
        run: |
          library(readr)
          library(arrow)
          library(dplyr)
          library(lubridate)

          # Location metadata that gets attached to every forecast row.
          locations <- read_csv(
            here::here("data-locations", "locations_eu.csv")
          ) |>
            dplyr::select(location, location_name, population)

          # Open all CSV files below data-processed/ as one dataset; the
          # sub-directory name (non-hive style) becomes the `model` column.
          raw_forecasts <- arrow::open_dataset(
            here::here("data-processed"),
            format = "csv",
            partitioning = schema(model = string()),
            hive_style = FALSE,
            col_types = schema(
              forecast_date = date32(),
              target = string(),
              target_end_date = date32(),
              location = string(),
              type = string(),
              quantile = float32(),
              value = float32()
            )
          ) |>
            # Ideally, we would handle this with separate() but it's not yet
            # implemented in arrow.
            # `horizon` is the leading number of `target`, `target_variable`
            # its last two words; `target` itself is dropped (.keep = "unused").
            dplyr::mutate(
              horizon = as.integer(gsub("^(\\d+) .*", "\\1", target)),
              target_variable = gsub(".* (\\w+ \\w+)$", "\\1", target),
              .keep = "unused"
            ) |>
            dplyr::filter(
              forecast_date >= ymd("2021-03-08")
            ) |>
            dplyr::rename(prediction = value) |>
            # `location` is the only column shared by both tables; naming it
            # explicitly guards against accidental joins on future columns.
            dplyr::left_join(locations, by = "location") |>
            # set forecast date to corresponding submission date
            dplyr::mutate(
              forecast_date = ceiling_date(forecast_date, "week", week_start = 3)
            )

          arrow::write_parquet(raw_forecasts, "covid19-forecast-hub-europe.parquet")
        shell: Rscript {0}

      # v3 of the artifact actions is deprecated and no longer runs on
      # github.com, so v4 is required for this step to work.
      - uses: actions/upload-artifact@v4
        with:
          name: parquet-dataset
          path: covid19-forecast-hub-europe.parquet
          # The artifact is the whole point of this job: fail if it is missing.
          if-no-files-found: error