-
Notifications
You must be signed in to change notification settings - Fork 1
/
convert_excel.R
139 lines (106 loc) · 3.92 KB
/
convert_excel.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#' Simple script for grabbing table data out of an NZ stats excel sheet.
#'
#' format seems to be:
#' <header junk>
#' <table header>
#' <outer header>
#' <inner header>
#' <inner table rows>
#' ...
#' <outer header>
#' <inner header>
#' <inner table rows>
#' ...
#' <footer junk>
#'
#' 1. Find inner tables (complete rows)
#' 2. Find headers and apply to each table.
#' 3. Remove headers
#' 4. Repeat from 2.
#'
library(readxl)
library(dplyr)
library(reshape2)
sheets <- excel_sheets("2013-census-electorate-tables.xls")
all <- list()
# ignore the first sheet of contents
for (sheet_num in 2:length(sheets)) {
sheet <- as.data.frame(read_excel("2013-census-electorate-tables.xls", sheet=sheet_num))
#' remove all blank rows
blanks <- apply(sheet, 1, function(x) { all(is.na(x)) })
sheet <- sheet[!blanks,]
#' grab the variable names - first two rows contain this
inner_var <- sheet[1,1]
level_var <- str_trim(unlist(str_split(sheet[2,1], ",")))
level_var <- str_trim(gsub("(By|and)(.*)", "\\2", level_var))
#' doesn't seem plausible to assign these to columns unfortunately
#' save the data columns, as we'll be adding more
data_cols <- 2:ncol(sheet)
# Righto, we repeat from here...
level <- 0
names(sheet)[1] <- paste0("level", level)
while (TRUE) {
#' We assume that a table is a contiguous block with no
#' missing entries which are all the same size
table_rows <- apply(sheet, 1, function(x) { all(!is.na(x)) })
tr.rle <- rle(table_rows)
if (sum(tr.rle$values) == 1) {
# only one group left - break out
break
}
table_entries <- tr.rle$values
table_entries[table_entries] <- 1:sum(table_entries)
tr.rle$values <- table_entries
table_rows <- inverse.rle(tr.rle)
# pull out header of each repeated table from the sheet.
# this will be the rows just before each table starts
hr.rle <- rle(table_rows > 0)
header_rows <- cumsum(hr.rle$lengths)[!hr.rle$value]
header_rows <- header_rows[-length(header_rows)]
# grab the table names
table_names <- sheet[header_rows,1]
# apply the table names to each of the tables beneath them
sheet <- cbind(sheet, NA)
sheet[table_rows > 0, ncol(sheet)] <- table_names[table_rows[table_rows>0]]
# now remove the header rows and any blanks
sheet <- sheet[-header_rows,]
# and name the column
level <- level + 1
names(sheet)[length(names(sheet))] <- paste0("level", level)
}
# names of the columns can now be found from the top
table_rows <- apply(sheet, 1, function(x) { all(!is.na(x)) })
name_rows <- rle(table_rows)$length[1]
# column names are then above the first header row, skipping blanks
data_col_names <- rep(NA, length(data_cols))
for (i in rev(seq_len(name_rows))) {
needed_cols <- is.na(data_col_names)
data_col_names[needed_cols] <- as.character(sheet[i,data_cols[needed_cols]])
if (!any(is.na(data_col_names))) {
break;
}
}
# fill in the column names
names(sheet)[data_cols] <- data_col_names
# and kill any rows that aren't complete
complete <- apply(sheet, 1, function(x) { all(!is.na(x)) })
sheet <- sheet[complete,]
# convert fields to numeric
for (i in data_cols) {
sheet[,i] <- as.numeric(sheet[,i])
}
# convert to long format
sheet <- melt(sheet, value.name="count")
# kill anything with 'total' or 'median' in it
data_cols <- (1:ncol(sheet))[names(sheet) != "count"]
for (i in data_cols) {
kill_rows <- grepl("^(total|median).*", sheet[,i], ignore.case=TRUE)
sheet <- sheet[!kill_rows,]
}
names(sheet)[names(sheet) == "variable"] <- inner_var
# can infer other variables with knowledge of the table only
# unfortuantely. We'll add them to the filename
csv_name <- paste0(inner_var, " by ", paste(level_var, collapse=", "), ".csv")
write.csv(sheet, file.path("csv", csv_name), row.names=FALSE)
all[[sheet_num]] <- sheet
}