-
Notifications
You must be signed in to change notification settings - Fork 0
/
piechart.R
149 lines (135 loc) · 4.48 KB
/
piechart.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# Load the raw survey responses; text columns are kept as character vectors
# rather than being converted to factors.
df <- read.csv(file = "survey.csv", stringsAsFactors = FALSE)
# Report whether any Country entry is missing.
anyNA(df$Country)
# Clean the Age column: ages outside the 15-60 range (and missing ages) are
# treated as invalid and replaced by the median of the valid ages.
age_col <- df$Age
# Row mask of valid ages. Both bounds must hold at once; the earlier test
# joined them with `||`, which is TRUE for every number, so outliers were
# included when the median was computed.
age_ok <- !is.na(age_col) & age_col >= 15 & age_col <= 60
# age1 contains only the age values between 15 and 60 (inclusive)
age1 <- age_col[age_ok]
median_age1 <- median(age1)
# Replace every age that is <15, >60 or missing with the median value
df$Age[!age_ok] <- median_age1
# Show the raw, free-text gender answers before they are normalised.
unique(df$Gender)

# dealing with gender
# Map one free-text gender answer onto one of the reporting categories
# "Female", "Male", "Trans-Female" or "Other".
#
# x: a single survey response (character string, possibly NA).
# Returns a single character string; a missing answer stays NA_character_.
normalize_gender <- function(x) {
  if (is.na(x)) {
    return(NA_character_)
  }
  # "fluid" starts with "f", so it must be caught before the female rule.
  if (x == "fluid") {
    return("Other")
  }
  first_char <- tolower(substr(x, 1, 1))
  if (grepl("trans", x, ignore.case = TRUE)) {
    # Answers such as "Trans-female" / "Trans woman" carry the f/w marker at
    # character 7; answers such as "Female (trans)" carry it at the start.
    if (tolower(substr(x, 7, 7)) %in% c("f", "w") || first_char == "f") {
      return("Trans-Female")
    }
    # There is no separate category for other trans answers, so they are
    # counted as "Other" instead of being left unassigned.
    return("Other")
  }
  # The female rule has to run before the male rule: the word "female"
  # contains the substring "male", so testing for "male" first would label
  # "Female", "female" and "Cis Female" as "Male".
  if (first_char %in% c("f", "w") || grepl("female", x, ignore.case = TRUE)) {
    return("Female")
  }
  if (first_char == "m" || substr(x, 1, 3) == "Guy" ||
      grepl("male", x, fixed = TRUE)) {
    return("Male")
  }
  if (tolower(substr(x, 1, 3)) == "cis") {
    # "Cis <word>": the first letter of <word> (character 5) decides.
    fifth_char <- tolower(substr(x, 5, 5))
    if (fifth_char == "f") {
      return("Female")
    }
    if (fifth_char == "m") {
      return("Male")
    }
    # Unrecognised "cis ..." answers fall through to "Other".
  }
  "Other"
}

# Normalise every answer; vapply guarantees one string per row, so the
# result always has exactly nrow(df) entries (also for an empty data frame).
g <- vapply(as.character(df$Gender), normalize_gender, character(1),
            USE.NAMES = FALSE)
# replacing gender
df$Gender <- g
# Drop the columns `comments` and `no_employees`.
df <- subset(df, select = -c(comments, no_employees))
# Drop rows with NA for self_employed. The rows are selected by the value
# itself; the earlier code removed rows 1-18 by position, which only matches
# this intent if exactly those rows hold the NAs (presumably true for the
# author's copy of survey.csv - verify against the data).
df <- df[!is.na(df$self_employed), ]
# NA here are replaced by a new category - Not Known
df$work_interfere[is.na(df$work_interfere)] <- "Not Known"
# pre-analysis graphs
# Pie chart of respondents per country: one slice per distinct Country
# value, sized by the number of rows reporting it.
country_counts <- as.data.frame(table(df$Country))
p <- country_counts[["Freq"]]
pie(p, labels = country_counts[["Var1"]])
# pie chart for all the attributes

# Draw one pie chart per selected column on an n_row x n_col plotting grid.
#
# data:         data frame holding the columns to plot.
# columns:      integer positions of the columns to plot.
# n_row, n_col: dimensions of the grid set up with layout().
# pause:        seconds to wait after each chart.
# Returns NULL invisibly; called for its plotting side effect.
plot_attribute_pies <- function(data, columns, n_row, n_col, pause = 0.5) {
  grid <- matrix(seq_len(n_row * n_col), nrow = n_row, ncol = n_col,
                 byrow = TRUE)
  layout(grid, widths = rep.int(1, ncol(grid)),
         heights = rep.int(1, nrow(grid)))
  for (i in columns) {
    # Tabulate once and reuse the result for both sizes and labels.
    counts <- table(data[[i]])
    slices <- as.vector(counts)
    if (length(slices) == 0) {
      # pie() cannot draw a column without any non-missing value.
      warning("no values to plot for column ", names(data)[i], call. = FALSE)
      next
    }
    pct <- round(slices / sum(slices) * 100, 2)
    # Label format "<value> <pct>%", the same as the age-band chart.
    # The earlier sep = ":" produced labels such as "Yes 50.3:%".
    lbls <- paste0(names(counts), " ", pct, "%")
    pie(slices, labels = lbls, col = rainbow(length(lbls)),
        main = names(data)[i], radius = 1, cex = 0.7)
    Sys.sleep(pause)
  }
  invisible(NULL)
}

# pie chart for the attributes 6 to 14 (3 x 3 grid)
plot_attribute_pies(df, 6:14, n_row = 3, n_col = 3)
# pie chart for the attributes 15 to 23 (3 x 3 grid)
plot_attribute_pies(df, 15:23, n_row = 3, n_col = 3)
# pie chart for the attributes 24 & 25 (2 x 2 grid, two cells left unused
# by this call)
plot_attribute_pies(df, 24:25, n_row = 2, n_col = 2)
# Bar chart of the gender split, as a percentage of respondents
# (drawn with barplot(), not as a pie chart).
gender_counts <- table(df$Gender)
slices <- as.vector(gender_counts) / sum(gender_counts) * 100
# Take the bar labels from the table itself so they always line up with the
# counts, whichever categories are present. A fixed label vector fails or
# mislabels the bars as soon as a category is missing or added.
lbls <- names(gender_counts)
# Keep the short display label for the longest category name.
lbls[lbls == "Trans-Female"] <- "Trans-F"
barplot(slices, main = "Gender", names.arg = lbls)
# Pie chart of respondents per age band.
# Bands are [15, 30), [30, 45) and [45, 60]: right = FALSE closes each band
# on the left, and include.lowest = TRUE closes the last band at 60.
lbls <- c("Age(15-30)", "Age(30-45)", "Age(45-60)")
age_band <- cut(df$Age, breaks = c(15, 30, 45, 60), labels = lbls,
                right = FALSE, include.lowest = TRUE)
# table() on a factor keeps all three bands, including empty ones.
slices <- as.vector(table(age_band))
# Rows that fall in no band (age out of range or missing) are reported with
# one "NI" line each instead of stopping the script.
for (k in seq_len(sum(is.na(age_band)))) {
  print("NI")
}
pct <- round(slices / sum(slices) * 100, 2)
# add percents and the % sign to the labels
lbls <- paste0(lbls, " ", pct, "%")
pie(slices, labels = lbls, col = rainbow(length(lbls)),
    main = "Age in a range", radius = 0.90, cex = 0.5)
# Back to a single plot per page.
par(mfrow = c(1, 1))