generated from bluegreen-labs/R_project_template
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
69 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
# compile a random dataset | ||
# balancing the different sheets / handwritten styles | ||
# the characters represented, the uncertainty of the | ||
# citizen science transcriptions and empty values | ||
# | ||
# ideally a subset of the full dataset (>350K) values | ||
# is made and restricted to a couple of thousand values | ||
# to fit within the repo for demo purposes, only to train | ||
# the full model on a separate data directory later | ||
|
||
library(dplyr) | ||
library(tidyr) | ||
library(readr) | ||
|
||
# read in the majority votes for the Jungle Weather | ||
# citizen science programme | ||
df <- readr::read_csv("data-raw/climate_data_majority_vote.csv") | ||
|
||
# only retain values with absolute concensus between | ||
# volunteers, can be relaxed to grow the dataset | ||
df <- df |> | ||
filter( | ||
consensus == 1 | ||
) |> | ||
select( | ||
filename, | ||
final_value, | ||
folder, | ||
col | ||
) | ||
|
||
# how many images are retained | ||
message(sprintf("number of retained images: %s", nrow(df))) | ||
|
||
# too much snot in my head just do a simple | ||
# random sampling for now with 10k samples | ||
df <- df |> | ||
sample_n(size = 10000) |> | ||
mutate( | ||
filename = gsub(".png","_dl.png", filename) | ||
) | ||
|
||
readr::write_csv(df, "data/tmp/labels.csv") | ||
|
||
files <- list.files("/data/scratch/zooniverse/format_1_batch_1_dl/","*.png") | ||
|
||
# copy all files over to tmp directory | ||
file.copy( | ||
file.path("/data/scratch/zooniverse/format_1_batch_1_dl/",df$filename), | ||
"data/tmp/" | ||
) | ||
|
||
# split out individual characters | ||
# run some statistics on the prevalence of the characters | ||
# present in the dataset | ||
char_stats <- df |> | ||
rowwise() |> | ||
mutate( | ||
character = strsplit(as.character(final_value[1]),"") | ||
) |> | ||
unnest(cols = "character") |> | ||
ungroup() |> | ||
group_by(character) |> | ||
summarize( | ||
n = n() | ||
) |> | ||
arrange( | ||
n | ||
) |