Skip to content

Commit

Permalink
label selection script
Browse files Browse the repository at this point in the history
  • Loading branch information
khufkens committed Oct 10, 2023
1 parent f972e22 commit bb169b1
Showing 1 changed file with 69 additions and 0 deletions.
69 changes: 69 additions & 0 deletions data-raw/00_select_ml_training_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# compile a random dataset
# balancing the different sheets / handwritten styles
# the characters represented, the uncertainty of the
# citizen science transcriptions and empty values
#
# ideally a subset of the full dataset (>350K values)
# is made and restricted to a couple of thousand values
# to fit within the repo for demo purposes; the full
# model is then trained on a separate data directory later

library(dplyr)
library(tidyr)
library(readr)

# Load the majority votes of the Jungle Weather citizen science
# programme and keep only the values on which the volunteers reached
# absolute consensus (consensus == 1); this criterion can be relaxed
# to grow the dataset. Only the columns listed in select() are kept.
df <- readr::read_csv("data-raw/climate_data_majority_vote.csv") |>
  filter(consensus == 1) |>
  select(filename, final_value, folder, col)

# report how many images are retained
message(sprintf("number of retained images: %s", nrow(df)))

# Draw a simple random sample of (at most) 10k labelled images; no
# balancing across sheets or characters is applied for now.
# A fixed seed keeps the selection reproducible between runs.
set.seed(1)

df <- df |>
  # slice_sample() supersedes sample_n() and truncates to the number
  # of available rows instead of erroring when fewer than 10k remain
  slice_sample(n = 10000) |>
  mutate(
    # only rewrite the literal ".png" extension; as an unanchored
    # regex the "." would match any character anywhere in the name
    filename = sub("\\.png$", "_dl.png", filename)
  )

# make sure the output directory exists before writing the labels
dir.create("data/tmp", recursive = TRUE, showWarnings = FALSE)
readr::write_csv(df, "data/tmp/labels.csv")

# Copy the sampled images from the scratch storage into the temporary
# data directory.
src_dir <- "/data/scratch/zooniverse/format_1_batch_1_dl/"
dest_dir <- "data/tmp/"

# list.files() takes a regular expression, not a shell glob, so match
# the ".png" extension explicitly
files <- list.files(src_dir, pattern = "\\.png$")

# file.copy() only returns FALSE for files it cannot copy, so report
# sampled images that are absent from the source directory up front
is_available <- df$filename %in% files
if (!all(is_available)) {
  warning(
    sprintf(
      "%d of %d sampled images not found in %s",
      sum(!is_available), length(is_available), src_dir
    ),
    call. = FALSE
  )
}

# make sure the destination exists, then copy the available files;
# the result is assigned so that the per-file logical vector is not
# auto-printed at top level
dir.create(dest_dir, recursive = TRUE, showWarnings = FALSE)
copied <- file.copy(
  file.path(src_dir, df$filename[is_available]),
  dest_dir
)

# Character prevalence statistics: break every transcribed value up
# into its individual characters and tally how often each character
# occurs in the dataset, listed from the rarest to the most common.
char_stats <- df |>
  mutate(character = strsplit(as.character(final_value), "")) |>
  unnest(cols = "character") |>
  count(character) |>
  arrange(n)

0 comments on commit bb169b1

Please sign in to comment.