-
Notifications
You must be signed in to change notification settings - Fork 0
/
Word level one-hot encoding example.R
52 lines (42 loc) · 1.58 KB
/
Word level one-hot encoding example.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# Non-keras solution: manual word-level one-hot encoding.
#
samples <- c("The cat sat on the mat.", "The dog ate my homework.")

# Build a word -> integer-index vocabulary. Indices start at 2, so slot 1
# is deliberately left unused (mirrors Keras' convention of reserving an
# index). NOTE(review): splitting on " " keeps punctuation attached, so
# "mat." and "mat" would be distinct tokens.
token_index <- list()
for (sample in samples) {
  for (word in strsplit(sample, " ")[[1]]) {
    if (!word %in% names(token_index)) {
      token_index[[word]] <- length(token_index) + 2
    }
  }
}

# Only the first max_length words of each sample are encoded.
max_length <- 10

# results[i, j, k] == 1 when word j of sample i has vocabulary index k.
# The third dimension is sized by the largest assigned index.
results <- array(0, dim = c(length(samples),
                            max_length,
                            max(as.integer(token_index))))
for (i in seq_along(samples)) {
  words <- head(strsplit(samples[[i]], " ")[[1]], n = max_length)
  for (j in seq_along(words)) {
    results[i, j, token_index[[words[[j]]]]] <- 1
  }
}
# Keras solution: tokenizer-based one-hot encoding.
#
library(keras)

samples <- c("The cat sat on the mat.", "The dog ate my homework.")

# Configure a tokenizer restricted to the 1000 most frequent words,
# then learn the vocabulary from the samples.
tokenizer <- text_tokenizer(num_words = 1000)
tokenizer <- fit_text_tokenizer(tokenizer, samples)

# Each sample becomes a sequence of integer word indices.
sequences <- texts_to_sequences(tokenizer, samples)

# Binary mode yields a one-hot document-term matrix.
one_hot_results <- texts_to_matrix(tokenizer, samples, mode = "binary")
dim(one_hot_results)

# Inspect the learned vocabulary.
word_index <- tokenizer$word_index
cat("Found", length(word_index), "unique tokens.\n")
# Adding a new sample.
# Refitting the tokenizer may reassign token indexes, so every sample
# (old and new) has to be re-encoded afterwards.
new_sample <- "Ala ma kota"
tokenizer <- fit_text_tokenizer(tokenizer, new_sample)

# Encode the new sample, then rebuild the one-hot matrix for ALL samples.
sequences <- texts_to_sequences(tokenizer, new_sample)
all_samples <- c(samples, new_sample)
one_hot_results <- texts_to_matrix(tokenizer, all_samples, mode = "binary")
dim(one_hot_results)

# The vocabulary now includes the new sample's words.
word_index <- tokenizer$word_index
cat("Found", length(word_index), "unique tokens.\n")