# ---------------------------------------------------------
# TUM - Technical University of Munich
#
# Authors: Aydin Uzun
# Date: 2022
# Purpose: Create the daily ppb error histogram.
# Run: Run this script on the cluster, or on your local PC if you have the
#      histogram matrices and scaling factors available locally.
# For the cluster:
# 1. Load the correct dependencies to compile R libraries:
#      module unload intel-mpi
#      module unload intel
#      module load gcc
# 2. Load your desired R module:
#      module load r/3.6.3-gcc11-mkl
# 3. Run the script:
#      Rscript combine_ppb_error_histograms.R
# ---------------------------------------------------------
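# A minimal batch-submission sketch (hypothetical: it assumes a SLURM
# scheduler; adapt the job name, walltime and any account/partition options
# to your cluster):
#   #!/bin/bash
#   #SBATCH --job-name=combine_ppb_hist
#   #SBATCH --time=01:00:00
#   module unload intel-mpi
#   module unload intel
#   module load gcc
#   module load r/3.6.3-gcc11-mkl
#   Rscript combine_ppb_error_histograms.R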
rm(list = ls())
############################ Day under investigation HAM_XXXXXXXX
# The working directory is
# "/dss/dsstumfs01/pn69ki/pn69ki-dss-0000/STILT_Ham/HAM_large/HAM_XXXXXXXX/by-id",
# XXXXXXXX being the date, e.g. 20210905.
# Upload this script to the working directory and submit the job.
Date <- "20210905"
# Note that the scaling factors for the corresponding day must be included in
# the working directory. The scaling factors for each day can be found on
# '/Volumes/esm/11-Thesis/03-Scientific-Internship/2021 FP Aydin Uzun/Data/Scaling_factors'
# e.g. scales <- read.csv(file = 'HAM_20210905_scaling.csv')
scales <- read.csv(file = paste("HAM_", Date, "_scaling.csv", sep = ""))
############################ Day under investigation HAM_XXXXXXXX
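# Optional sanity check (a lightweight guard, assuming the CSV stores the 13
# vertical scaling factors in a column named 'x', as used for vert_pres below):
# stopifnot("x" %in% names(scales), length(scales$x) == 13)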
# Subscripts 1, 2, 3 correspond to the different inventories used.
# e.g. mb_hist_ppb_errors_1 contains the histogram counts of the bins created
# after the error simulations were run using the particle trajectory files for
# this specific day and this specific instrument location.
# So there are different distributions corresponding to different instruments,
# different datetimes and different particle release heights.
# The goal is to combine them into a daily distribution.
mb_hist_ppb_errors_1 <- readRDS("parallel_mb_hist_ppb_errors_1_counts.rds") # int [1:20,1:507]
# 507 is the number of different combinations of timepoints and particle release heights
# there are 39 timepoints and 13 particle release heights
mc_hist_ppb_errors_1 <- readRDS("parallel_mc_hist_ppb_errors_1_counts.rds") # int [1:20,1:507]
md_hist_ppb_errors_1 <- readRDS("parallel_md_hist_ppb_errors_1_counts.rds") # int [1:20,1:507]
me_hist_ppb_errors_1 <- readRDS("parallel_me_hist_ppb_errors_1_counts.rds") # int [1:20,1:507]
mb_hist_ppb_errors_2 <- readRDS("parallel_mb_hist_ppb_errors_2_counts.rds") # int [1:20,1:507]
mc_hist_ppb_errors_2 <- readRDS("parallel_mc_hist_ppb_errors_2_counts.rds") # int [1:20,1:507]
md_hist_ppb_errors_2 <- readRDS("parallel_md_hist_ppb_errors_2_counts.rds") # int [1:20,1:507]
me_hist_ppb_errors_2 <- readRDS("parallel_me_hist_ppb_errors_2_counts.rds") # int [1:20,1:507]
mb_hist_ppb_errors_3 <- readRDS("parallel_mb_hist_ppb_errors_3_counts.rds") # int [1:20,1:507]
mc_hist_ppb_errors_3 <- readRDS("parallel_mc_hist_ppb_errors_3_counts.rds") # int [1:20,1:507]
md_hist_ppb_errors_3 <- readRDS("parallel_md_hist_ppb_errors_3_counts.rds") # int [1:20,1:507]
me_hist_ppb_errors_3 <- readRDS("parallel_me_hist_ppb_errors_3_counts.rds") # int [1:20,1:507]
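# The 12 readRDS calls above could equivalently be written as a loop; a sketch
# (kept commented out so the explicit variables above remain authoritative;
# 'hists' is a hypothetical name):
# hists <- list()
# for (inst in c("mb", "mc", "md", "me")) {
#   for (inv in 1:3) {
#     hists[[paste0(inst, "_", inv)]] <-
#       readRDS(paste0("parallel_", inst, "_hist_ppb_errors_", inv, "_counts.rds"))
#   }
# }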
vert_pres <- scales$x # vertical scaling factors for the 13 atm. layers
# Remember that these are in increasing order of altitude:
# the first element corresponds to the 1st atm. layer (20m), the last element
# to the 13th atm. layer (2220m).
# Read the names of the trajectory files related to "mb"
file_names_mb <- readRDS("parallel_mb_particle_files.rds") # chr [1:507]
# e.g. "mb_202109050630_1060m" "mb_202109050630_1244m" "mb_202109050630_1432m" .....
file_names_mb_list <- strsplit(file_names_mb, split = "_") # list of 507
# e.g. chr [1:3] "mb" "202109050630" "1060m"
#      chr [1:3] "mb" "202109050630" "1244m" .....
file_names_mb_matrix <- matrix(unlist(file_names_mb_list), nrow = length(file_names_mb_list), ncol = 3, byrow = TRUE)
# e.g.
# V1 V2 V3
# 1 mb 202109050630 1060m
# 2 mb 202109050630 1244m
# 3 mb 202109050630 1432m
# .....
# 507 .............
file_dates_mb_matrix<-file_names_mb_matrix[,2:3]
# e.g.
# [,1] [,2]
#[1,] "202109050630" "1060m"
#[2,] "202109050630" "1244m"
#[3,] "202109050630" "1432m"
#[4,] "202109050630" "1623m"
#[5,] "202109050630" "1818m"
#[6,] "202109050630" "186m"
#[7,] "202109050630" "2017m"
#[8,] "202109050630" "20m"
#[9,] "202109050630" "2220m"
#[10,] "202109050630" "355m"
#[11,] "202109050630" "527m"
#[12,] "202109050630" "701m"
#[13,] "202109050630" "879m"
#[14,] "202109050645" "1060m"
#[15,] "202109050645" "1244m"
#[16,] "202109050645" "1432m"
#[17,] "202109050645" "1623m"
#[18,] "202109050645" "1818m"
#[19,] "202109050645" "186m"
#[20,] "202109050645" "2017m"
# .....
# Note that the order of the particle release heights is always the same:
# each datetime starts with 1060m and ends with 879m, then the next datetime follows.
# Now do the same for the other instruments
file_names_mc <- readRDS("parallel_mc_particle_files.rds")
file_names_mc_list <- strsplit(file_names_mc, split = "_")
file_names_mc_matrix <- matrix(unlist(file_names_mc_list),nrow=length(file_names_mc_list),ncol=3,byrow=TRUE)
file_dates_mc_matrix<-file_names_mc_matrix[,2:3]
file_names_md <- readRDS("parallel_md_particle_files.rds")
file_names_md_list <- strsplit(file_names_md, split = "_")
file_names_md_matrix <- matrix(unlist(file_names_md_list),nrow=length(file_names_md_list),ncol=3,byrow=TRUE)
file_dates_md_matrix<-file_names_md_matrix[,2:3]
file_names_me <- readRDS("parallel_me_particle_files.rds")
file_names_me_list <- strsplit(file_names_me, split = "_")
file_names_me_matrix <- matrix(unlist(file_names_me_list),nrow=length(file_names_me_list),ncol=3,byrow=TRUE)
file_dates_me_matrix<-file_names_me_matrix[,2:3]
# The matrices built from the mb, mc, md and me file names are expected to be
# identical, because for each datetime / release-height pair there should be
# 4 particle trajectory files, one per instrument.
# Check this before proceeding. identical() also verifies the ordering, which
# the code below relies on (the same column indices and layer weights are
# applied to all four instruments); setequal() would ignore order and pairing.
check_1 <- identical(file_dates_mb_matrix, file_dates_mc_matrix)
check_2 <- identical(file_dates_mb_matrix, file_dates_md_matrix)
check_3 <- identical(file_dates_mb_matrix, file_dates_me_matrix)
if (check_1 && check_2 && check_3) { # if the name matrices match, i.e.
# if there is an equal number of trajectory files for each instrument,
# we can do simple matrix operations and then take the average
unique_times<-unique(file_names_mb_matrix[,2]) # chr [1:39]
# e.g.
# 202109050630 202109050645 202109050700 202109050715 .... 202109051545 202109051600
# Determine the order of the particle release heights. The ordering is the
# same for every datetime (see the matrix above), so the first datetime is
# sufficient.
mytime <- unique_times[1]
# mytime is "202109050630"
idx_mytime <- which(file_names_mb_matrix[,2] == mytime)
# the row indices of file_names_mb_matrix corresponding to this specific
# datetime, e.g. idx_mytime = 1 2 3 4 5 .... 13
meters_of_idx_mytime <- strsplit(file_names_mb_matrix[idx_mytime,3], split = "m")
# the particle release heights of these rows
# e.g. list of 13
# chr "1060" chr "1244" chr "1432" ....
meters_of_idx_mytime <- matrix(unlist(meters_of_idx_mytime), nrow = 13, ncol = 1, byrow = TRUE)
# now chr [1:13] "1060" "1244" "1432" ...
meters_of_idx_mytime_num <- as.numeric(meters_of_idx_mytime)
# now num [1:13] 1060 1244 1432 1623 1818 186 2017 20 2220 355 527 701 879
# We want the vertical pressure scaling factors in this same order.
# Remember vert_pres is in increasing order, i.e. its first element is the
# scaling factor for the first atm. layer (20m), etc.
sorted_vert_pres <- sort(meters_of_idx_mytime_num, decreasing = FALSE, index.return = TRUE)
# $x
# [1] 20 186 355 527 701 879 1060 1244 1432 1623 1818 2017 2220
# vert_pres[1] corresponds to the 1st atm. layer, vert_pres[2] to the 2nd atm. layer, etc.
vert_pres_sorted <- vert_pres[match(meters_of_idx_mytime_num, sorted_vert_pres$x)]
# now vert_pres_sorted[1] corresponds to the 7th atm. layer (1060m),
# vert_pres_sorted[2] to the 8th atm. layer (1244m), etc.
# normalize so the layer weights sum to 1
vert_pres_sorted <- vert_pres_sorted / sum(vert_pres_sorted)
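# A toy illustration of the match()-based reordering above (illustrative
# values, not the real scaling factors):
#   heights <- c(1060, 20, 186)      # heights in file order
#   sorted  <- sort(heights)         # 20 186 1060
#   weights <- c(0.5, 0.3, 0.2)      # one weight per layer, increasing altitude
#   weights[match(heights, sorted)]  # 0.2 0.5 0.3, i.e. weights in file order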
# initialize
# These will collect, column by column, the vertically averaged distribution
# (weighted with the pressure scaling factors) for each timepoint.
mb_avg_alts_1 <- c()
mc_avg_alts_1 <- c()
md_avg_alts_1 <- c()
me_avg_alts_1 <- c()
mb_avg_alts_2 <- c()
mc_avg_alts_2 <- c()
md_avg_alts_2 <- c()
me_avg_alts_2 <- c()
mb_avg_alts_3 <- c()
mc_avg_alts_3 <- c()
md_avg_alts_3 <- c()
me_avg_alts_3 <- c()
# for each timepoint
for (i in 1:(length(file_names_mb_list)/13)) { # length(file_names_mb_list)/13 = 39, the number of timepoints for one day
my_idx <- ((i-1)*13) + seq(1,13) # in each iteration pick the columns corresponding to one timepoint
# e.g. mb_hist_ppb_errors_1[,my_idx] is of size [1:20,1:13];
# the 13 columns are the atm. layers, and the particle release heights are
# matched, i.e. vert_pres_sorted[1] and column 1 correspond to the same atm. layer, etc.
# Weighted sum over the 13 layers: %*% multiplies each column by its layer
# weight and sums across columns. (Note: rowSums(M * vert_pres_sorted) would
# be wrong here, because R recycles the length-13 vector down the 20 rows of
# M, not across its columns.)
mb_avg_of_alts_1 <- as.vector(mb_hist_ppb_errors_1[,my_idx] %*% vert_pres_sorted)
# mb_avg_of_alts_1 num [1:20]: averaged using the pressure scaling factors;
# represents the histogram counts, i.e. the distribution for one timepoint.
# Then concatenate them; in the end we store one distribution per timepoint.
mb_avg_alts_1 <- cbind(mb_avg_alts_1, mb_avg_of_alts_1)
# Do the same for the others: there are 4 instruments (mb, mc, md, me) and
# 3 available inventories for the same area (represented by the subscripts),
# so this operation needs to be done 12 times.
mc_avg_of_alts_1 <- as.vector(mc_hist_ppb_errors_1[,my_idx] %*% vert_pres_sorted)
mc_avg_alts_1 <- cbind(mc_avg_alts_1, mc_avg_of_alts_1)
md_avg_of_alts_1 <- as.vector(md_hist_ppb_errors_1[,my_idx] %*% vert_pres_sorted)
md_avg_alts_1 <- cbind(md_avg_alts_1, md_avg_of_alts_1)
me_avg_of_alts_1 <- as.vector(me_hist_ppb_errors_1[,my_idx] %*% vert_pres_sorted)
me_avg_alts_1 <- cbind(me_avg_alts_1, me_avg_of_alts_1)
mb_avg_of_alts_2 <- as.vector(mb_hist_ppb_errors_2[,my_idx] %*% vert_pres_sorted)
mb_avg_alts_2 <- cbind(mb_avg_alts_2, mb_avg_of_alts_2)
mc_avg_of_alts_2 <- as.vector(mc_hist_ppb_errors_2[,my_idx] %*% vert_pres_sorted)
mc_avg_alts_2 <- cbind(mc_avg_alts_2, mc_avg_of_alts_2)
md_avg_of_alts_2 <- as.vector(md_hist_ppb_errors_2[,my_idx] %*% vert_pres_sorted)
md_avg_alts_2 <- cbind(md_avg_alts_2, md_avg_of_alts_2)
me_avg_of_alts_2 <- as.vector(me_hist_ppb_errors_2[,my_idx] %*% vert_pres_sorted)
me_avg_alts_2 <- cbind(me_avg_alts_2, me_avg_of_alts_2)
mb_avg_of_alts_3 <- as.vector(mb_hist_ppb_errors_3[,my_idx] %*% vert_pres_sorted)
mb_avg_alts_3 <- cbind(mb_avg_alts_3, mb_avg_of_alts_3)
mc_avg_of_alts_3 <- as.vector(mc_hist_ppb_errors_3[,my_idx] %*% vert_pres_sorted)
mc_avg_alts_3 <- cbind(mc_avg_alts_3, mc_avg_of_alts_3)
md_avg_of_alts_3 <- as.vector(md_hist_ppb_errors_3[,my_idx] %*% vert_pres_sorted)
md_avg_alts_3 <- cbind(md_avg_alts_3, md_avg_of_alts_3)
me_avg_of_alts_3 <- as.vector(me_hist_ppb_errors_3[,my_idx] %*% vert_pres_sorted)
me_avg_alts_3 <- cbind(me_avg_alts_3, me_avg_of_alts_3)
}
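# Optional shape check (a sketch; assumes 39 timepoints x 13 release heights
# = 507 trajectory files per instrument, as described above):
# stopifnot(ncol(mb_avg_alts_1) == length(file_names_mb_list)/13)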
# mb_avg_alts_1 num [1:20,1:39], 39 being the number of timepoints.
# We need the mean over all 39 timepoints' distributions.
# mb_daily_avg_1 num [1:20]: the daily ppb error distribution of all particle
# trajectory files' error simulations related to instrument mb
mb_daily_avg_1 <- rowMeans(mb_avg_alts_1)
# Do the same for the others: again 4 instruments (mb, mc, md, me) and 3
# available inventories (the subscripts), so 12 operations in total.
mc_daily_avg_1<-rowMeans(mc_avg_alts_1)
md_daily_avg_1<-rowMeans(md_avg_alts_1)
me_daily_avg_1<-rowMeans(me_avg_alts_1)
mb_daily_avg_2<-rowMeans(mb_avg_alts_2)
mc_daily_avg_2<-rowMeans(mc_avg_alts_2)
md_daily_avg_2<-rowMeans(md_avg_alts_2)
me_daily_avg_2<-rowMeans(me_avg_alts_2)
mb_daily_avg_3<-rowMeans(mb_avg_alts_3)
mc_daily_avg_3<-rowMeans(mc_avg_alts_3)
md_daily_avg_3<-rowMeans(md_avg_alts_3)
me_daily_avg_3<-rowMeans(me_avg_alts_3)
# Now average the 4 distributions for one day, i.e. take the average over all
# instruments' error simulations.
# daily_avg_1 num [1:20]: ppb error distribution of all particle trajectory
# files' error simulations when inventory 1 is used
# (remember, the subscript denotes the inventory used)
daily_avg_1 <- (mb_daily_avg_1 + mc_daily_avg_1 + md_daily_avg_1 + me_daily_avg_1)/4
daily_avg_2 <- (mb_daily_avg_2 + mc_daily_avg_2 + md_daily_avg_2 + me_daily_avg_2)/4
daily_avg_3 <- (mb_daily_avg_3 + mc_daily_avg_3 + md_daily_avg_3 + me_daily_avg_3)/4
# save
saveRDS(daily_avg_1,paste(Date,"_daily_city_and_river_smooth.rds",sep=""))
saveRDS(daily_avg_2,paste(Date,"_daily_corrected_elev_bothInstruments_city_and_river_smooth.rds",sep=""))
saveRDS(daily_avg_3,paste(Date,"_daily_corrected_raw_bothInstruments_city_and_river_smooth.rds",sep=""))
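# Optional quick look at a saved result (a sketch; plots the raw combined
# counts per bin - the bin edges are defined upstream and not stored here):
# barplot(daily_avg_1, main = paste(Date, "daily ppb error, inventory 1"))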
# The results for the chosen days can be found at:
# "/Volumes/esm/11-Thesis/03-Scientific-Internship/2021 FP Aydin Uzun/Data/Daily histograms"
}