-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathEDA_absenteeism_.R
368 lines (260 loc) · 19.3 KB
/
EDA_absenteeism_.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
## Info about the dataset: https://archive.ics.uci.edu/ml/datasets/Absenteeism+at+work
# load libraries
library('readr')
library('plyr')
library('dplyr')
library('ggplot2')
library('gridExtra')
#download file to current directory
download.file('https://archive.ics.uci.edu/ml/machine-learning-databases/00445/Absenteeism_at_work_AAA.zip',
destfile = 'data.zip')
#unzip file
unzip('data.zip', exdir = './dataset')
#store dataframe in df variable
df <- read_delim('./dataset/Absenteeism_at_work.csv', delim = ";", locale=locale(decimal_mark = "."))
##################################
### DATA PREPARATION/CLEANING ###
##################################
#get all column names to lower case
colnames(df) <- tolower(colnames(df))
#replace spaces in column names with "_"
names(df) <- gsub(" ", "_", names(df))
#check missing values
missing <- is.na(df)
sum(missing)
# no missing values in the dataframe
#output a summary of all features
summary(df)
#based on the summary and on features names, convert some features to categorical
colnames(df) #get column names for indices
factors_indices <- c(1:5,11,13,15,16)
#loop to convert attributes to factors
for (i in factors_indices)
{
df[[i]] <- as.factor(df[[i]])
}
#check summary again
summary(df)
#change some levels of categorical features for better understanding
levels(df$day_of_the_week) = c("Mon","Tue","Wed","Thu","Fri") #changes the levels of "Days of the week" attribute
levels(df$seasons) = c('Summer', 'Autumn', 'Winter', 'Spring')
levels(df$education) = c('High school','Graduate','Postgraduate','Master or Doctor')
levels(df$social_drinker) = c('Nondrinker','Drinker')
levels(df$social_smoker) = c('Nonsmoker','Smoker')
df$disciplinary_failure <- as.logical(df$disciplinary_failure)
#check summary again
summary(df)
#check levels of some features
levels(df$id) #employees from #1 to #36
levels(df$reason_for_absence) #reasons for absence from #0 to #28 (reason #0 has no info on the dataset; to be check later)
levels(df$month_of_absence) #month of absence from #0 to #12 (month #zero doesn't make sense; to be check later)
levels(df$hit_target) # several levels of hit_target feature from "81" to "100"
#investigate month #zero
month_zero <- df[df$month_of_absence == 0,]
month_zero
# employees id 4,8 and 35 - just 3 records. No absent time for these records.
employees_month_zero <- df[df$id == 4 | df$id == 8 | df$id == 35,]
employees_month_zero
# employees 4 and 35 have just one record of absence (in month zero) with no reason and no absence time
# employee 8 has two records of absence (one of them is disciplinary failure; other is month zero)
# Conclusions:
# - employees 4 and 35 never missed a day of work;
# - employee 8 had just one record and it was a disciplinary failure (no absence time);
# investigate reason for absence #zero
reason_zero <- df[df$reason_for_absence == 0,]
# all those occurences have zero time of absence
# all those occurences are due to the disciplinary failure
# it seems that disciplinary failure doesn't count for absence time
# however, I will imput the most common value of absent time in the dataset
# mode function definition
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
#get the absence time mode
absence_mode <- Mode(df$absenteeism_time_in_hours) # 8 hours; seems to be daily work journey
#We store disciplinary failure indices
fail_indices <- which(df$disciplinary_failure==T)
#We set absenteeism hours caused by disciplinary failure to 8 hours
df$absenteeism_time_in_hours[fail_indices] <- absence_mode
#I will mutate my dataframe adding a column with the reason for absence written in full.
#"reasons_full.txt" is another dataset that contains the reasons for absence written in full
reasons_full <- read_delim('reasons_full.txt', delim='\t',col_names = "Reason")
#Drop any string that occurs up to (and including) the first space in order to have only the
#reasons written in full
reasons_full <- sub(".*? ", "", reasons_full$Reason)
#Add a column to our "reasons_full" dataframe with the ordered list of reasons so we can join it later
#to my main dataframe
reasons_full <- data.frame(reason_for_absence = c(0:28), reason_full = reasons_full)
#convert reason_for_absence as factor in reasons_full dataframe
reasons_full$reason_for_absence <- as.factor(reasons_full$reason_for_absence)
#Join both datasets
df <- inner_join(df,reasons_full,by='reason_for_absence')
#########################
#### VISUALISATION 1 ###
########################
#exploring reason for absence to understand what is most common
reason_chart <- ggplot(df, aes(reason_full)) + labs(x = "Reason for absence", y = "Occurences")
reason_chart <- reason_chart + geom_bar(stat="count", fill="black")
reason_chart
#impossible to read horizontal axis
reason_chart <- reason_chart + coord_flip() #rotating bar chart
reason_chart
#plot total of hours per reasons of absence instead
#this time grouping the dataset by reason for absence so we can reorder in descent fashion
absence_by_reason <- group_by(df,reason_full)
#summarise the total number of hours per reason
reason_total_hours <- summarise(absence_by_reason,absenteeism_time_in_hours=sum(absenteeism_time_in_hours))
reason_total_hours
#plot
reason_hours_plot <- ggplot(reason_total_hours, aes(x=reorder(reason_full,absenteeism_time_in_hours), y=absenteeism_time_in_hours)) + labs(x = "Reason for absence", y = "Total absenteeism in hours")
reason_hours_plot <- reason_hours_plot + geom_bar(stat="identity", fill="black") + coord_flip()
reason_hours_plot
#plot only the top 10 reasons
reason_total_hours <- reason_total_hours[order(-reason_total_hours$absenteeism_time_in_hours),] #reorder
absence_by_reason_top10 <- reason_total_hours[1:10,] #subset only top 10
reason_hours_top10_plot1 <- ggplot(absence_by_reason_top10, aes(x=reorder(reason_full,absenteeism_time_in_hours), y=absenteeism_time_in_hours)) + labs(x = "Reason for absence", y = "Total absenteeism in hours")
reason_hours_top10_plot1 <- reason_hours_top10_plot1 + geom_bar(stat="identity", fill="black") + coord_flip()
reason_hours_top10_plot1
#some reasons are too descriptive so I will abreviate the reasons (levels)
levels(absence_by_reason_top10$reason_full)[levels(absence_by_reason_top10$reason_full)=="Diseases of the musculoskeletal system and connective tissue "] <- "Muscuskeletal diseases"
levels(absence_by_reason_top10$reason_full)[levels(absence_by_reason_top10$reason_full)=="Injury, poisoning and certain other consequences of external causes "] <- "Injury, poisoning or external causes"
levels(absence_by_reason_top10$reason_full)[levels(absence_by_reason_top10$reason_full)=="Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified "] <- "Nonclassified symptoms"
#plot with abreviated reasons
reason_hours_top10_plot2 <- ggplot(absence_by_reason_top10, aes(x=reorder(reason_full,absenteeism_time_in_hours), y=absenteeism_time_in_hours)) + labs(x = "Reason for absence", y = "Total absenteeism in hours")
reason_hours_top10_plot2 <- reason_hours_top10_plot2 + geom_bar(stat="identity", fill="darkgrey",width = 0.75) + coord_flip()
reason_hours_top10_plot2
#remove default grey background and grid lines to use grey colour in bars
reason_hours_top10_plot3 <- reason_hours_top10_plot2 + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),panel.grid.minor = element_blank(), axis.line = element_line(colour = "darkgrey"), axis.ticks = element_line(colour = "darkgrey"))
reason_hours_top10_plot3
#remove y axis label and add title
reason_hours_top10_plot4 <- reason_hours_top10_plot3 + theme(axis.title.y = element_blank()) + ggtitle("What are the main reasons for absence?")
reason_hours_top10_plot4
#remove axis lines
reason_hours_top10_plot5 <- reason_hours_top10_plot4 + theme(axis.text.x=element_blank(), axis.ticks=element_blank(), axis.line = element_blank())
reason_hours_top10_plot5
#align title and horizontal axis title
reason_hours_top10_plot6 <- reason_hours_top10_plot5 + theme(plot.title=element_text(hjust = 0.3), axis.title.x=element_text(hjust = 0.075, vjust=2.12) )
reason_hours_top10_plot6
#highlight main reasons for absence and add values as text labels
reason_hours_top10_plot7 <- reason_hours_top10_plot6 + geom_bar(data=subset(absence_by_reason_top10, absenteeism_time_in_hours>500), aes(x=reason_full, y=absenteeism_time_in_hours), fill="darkred", stat="identity",width = 0.75)
reason_hours_top10_plot7 <- reason_hours_top10_plot7 + geom_text(aes(label=absence_by_reason_top10$absenteeism_time_in_hours),hjust=1.1, color="white")
reason_hours_top10_plot7
#highlight disciplinary failure and add values as text labels
reason_hours_top10_plot8 <- reason_hours_top10_plot7 + geom_bar(data=subset(absence_by_reason_top10, absenteeism_time_in_hours==320), aes(x=reason_full, y=absenteeism_time_in_hours), fill="darksalmon", stat="identity",width = 0.75)
reason_hours_top10_plot8 <- reason_hours_top10_plot8 + geom_text(aes(label=absence_by_reason_top10$absenteeism_time_in_hours),hjust=1.1, color="white")
reason_hours_top10_plot8
#########################
#### VISUALISATION 2 ###
########################
# seasonal behaviour of absenteeism
season_absence <- ggplot(df, aes(x=seasons,y=absenteeism_time_in_hours)) + geom_bar(stat = "identity",width = 0.5,fill="black") + labs(x = "Seasons", y = "Total absenteeism in hours")
season_absence
#focusing on top 2 reasons from previous visualisation
#abreviate reasons for absence in main dataframe as well
levels(df$reason_full)[levels(df$reason_full)=="Diseases of the musculoskeletal system and connective tissue "] <- "Muscuskeletal diseases"
levels(df$reason_full)[levels(df$reason_full)=="Injury, poisoning and certain other consequences of external causes "] <- "Injury, poisoning or external causes"
levels(df$reason_full)[levels(df$reason_full)=="Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified "] <- "Nonclassified symptoms"
#filter top 2 reasons in our dataset
df_top2_reasons <- filter(df, reason_full %in% c("Muscuskeletal diseases","Injury, poisoning or external causes" ))
#plot barchart with total absence of top 2 reasons
top2_reasons_plot <- ggplot(df_top2_reasons, aes(x=seasons, y=absenteeism_time_in_hours)) + labs(x = "Seasons", y = "Total absenteeism in hours")
top2_reasons_plot <- top2_reasons_plot + geom_bar(stat="identity", fill="black", width=0.75) + facet_wrap(df_top2_reasons$reason_full)
top2_reasons_plot
#remove ggplot defaults and apply aesthetics operaions
top2_reasons_plot2 <- top2_reasons_plot + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), axis.line = element_line("grey"),axis.line.x= element_blank(),
axis.ticks.x = element_blank(),axis.title.x = element_blank(),
axis.ticks.y = element_line(colour = "darkgrey"), axis.text.y = element_text(colour="black"),
axis.text.x=element_text(color=c("black","black","red","black")), strip.background = element_rect(color="white", fill="white", size=1),
strip.text=element_text(size=11))
#add title with center alignment
top2_reasons_plot3 <- top2_reasons_plot2 + ggtitle("When preventive measures should be adopted?") + theme(plot.title = element_text(hjust = 0.5)) + ylim(0,500)
top2_reasons_plot3
#boxplot to better understand main reasons of absence
top2_reasons_boxplot <- ggplot(df_top2_reasons, aes(x=seasons, y=absenteeism_time_in_hours)) + labs(x = "Seasons", y = "Absenteeism in hours") + facet_wrap(df_top2_reasons$reason_full)
top2_reasons_boxplot <- top2_reasons_boxplot + geom_boxplot(varwidth = F) #varwidth=T means it is proportional to occurences
top2_reasons_boxplot
#apply improvement of aesthetic operations
top2_reasons_boxplot2 <- top2_reasons_boxplot + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), axis.line = element_line("grey"),
axis.ticks.x = element_blank(),axis.line.x= element_blank(),
axis.ticks.y = element_line(colour = "darkgrey"), axis.text.y = element_text(colour="black"),
strip.background = element_rect(color="white", fill="white", size=1),
strip.text.x = element_blank())
#plot two plots above in same plot
gridplot <- grid.arrange(top2_reasons_plot3, top2_reasons_boxplot2, nrow = 2)
#########################
#### VISUALISATION 3 ###
########################
#assessing impact if preventive measures are adopted
#if preventive measures are adopted it is expected a reduction of 50% of absence in top2 reasons for absence
df$absenteeism_time_expect <- df$absenteeism_time_in_hours
df <- within(df, absenteeism_time_expect[reason_full == 'Muscuskeletal diseases' | reason_full == 'Injury, poisoning or external causes'] <- df$absenteeism_time_in_hours[reason_full == 'Muscuskeletal diseases' | reason_full == 'Injury, poisoning or external causes']*0.5)
#subset dataframe to remove month zero
df <- df[df$month_of_absence!=0,]
#plot absenteeism by month
#group my month
absence_by_month <- group_by(df,month_of_absence)
#summarise the total number of hours per month (actual and expected)
month_total_hours <- summarise(absence_by_month,absenteeism_time_in_hours=sum(absenteeism_time_in_hours), absenteeism_time_expect=sum(absenteeism_time_expect))
month_total_hours
#change levels of months
#define grey color to use
grey = "grey38"
#define size of point
point_size=3
#define normal linethickness
normal_thick = 0.8
#define bold linethickness
bold_thick = 1.2
#start plot iterations
absence_by_month_plot1 <- ggplot(month_total_hours, aes(x=month_of_absence)) + labs(x = "Month", y = "Total absenteeism in hours") #group=1 because month is a factor (not a continuous data)
absence_by_month_plot1 <- absence_by_month_plot1 + geom_line(aes(y=absenteeism_time_in_hours, group=1),color = grey) + geom_line(aes(y=absenteeism_time_expect, group=1),color = "darkred") + ylim(0,800)
absence_by_month_plot1
#change levels of months
levels(month_total_hours$month_of_absence) = c("Und","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec")
#create vector of x axis labels to focus
winter_months_colour <- ifelse( month_total_hours$month_of_absence %in% c("Apr","May","Jun"), "darkred", grey )
#remove ggplot defaults and apply aesthetics operaions
absence_by_month_plot2 <- absence_by_month_plot1 + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), axis.line = element_line(grey),axis.line.x= element_line(colour = grey),
axis.ticks.x = element_blank(),axis.text.x=element_text(color=winter_months_colour),
axis.ticks.y = element_line(colour = grey ), axis.text.y = element_text(colour=grey ))
absence_by_month_plot2
#subset winter months and non-winter months to edit for different focus
month_total_hours_jan_feb <- month_total_hours[1:4,]
month_total_hours_mar_jun <- month_total_hours[4:6,]
month_total_hours_jul_dec <- month_total_hours[6:12,]
#plot by layers to customize subsets
#plot current absenteeism time in hours
absence_by_month_plot3 <- ggplot(month_total_hours, aes(x=month_of_absence,y=absenteeism_time_in_hours)) + geom_line(group=1, color=grey, size=normal_thick) + labs(x = "Month", y = "Total absenteeism in hours") + ylim(0,800) #group=1 because month is a factor (not a continuous data)
absence_by_month_plot3 <- absence_by_month_plot3 + geom_point(group=1, color=grey,size=point_size)
#plot 1st subset of expected time in hours
absence_by_month_plot3 <- absence_by_month_plot3 + geom_line(data=month_total_hours_jan_feb,aes(y=absenteeism_time_expect, group=1),color = "darkred",linetype="dashed",size=normal_thick) +
geom_point(data=month_total_hours_jan_feb,aes(y=absenteeism_time_expect, group=1),size=point_size,color = "darkred",size=point_size)
#plot 3rd subset of expected time in hours
absence_by_month_plot3 <- absence_by_month_plot3 + geom_line(data=month_total_hours_jul_dec,group=1,aes(y=absenteeism_time_expect),color = "darkred",linetype="dashed",size=normal_thick) +
geom_point(data=month_total_hours_jul_dec,aes(y=absenteeism_time_expect, group=1),size=point_size,color = "darkred",size=point_size)
#plot 2nd subset of expected time in hours
absence_by_month_plot3 <- absence_by_month_plot3 + geom_line(data=month_total_hours_mar_jun,group=1,aes(y=absenteeism_time_expect),color = "darkred",linetype="dashed",size=bold_thick) +
geom_point(data=month_total_hours_mar_jun,group=1,aes(y=absenteeism_time_expect),color = "darkred",size=point_size)
absence_by_month_plot3
#remove ggplot defaults and apply aesthetics operaions
absence_by_month_plot4 <- absence_by_month_plot3 + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), axis.line = element_line(grey),axis.line.x= element_line(colour = grey),
axis.ticks.x = element_blank(),axis.text.x=element_text(color=winter_months_colour),
axis.ticks.y = element_line(colour = grey ), axis.text.y = element_text(colour=grey ))
#add title
absence_by_month_plot5 <- absence_by_month_plot4 + ggtitle("What is the impact if preventive measures are adopted?")
#add annotations
absence_by_month_plot6 <- absence_by_month_plot5 + annotate(geom="text", x=5, y=250, label="Expected Absence", color="darkred") +
annotate(geom="text", x=5.5, y=200, label="(preventive measures are more effective in Apr-Jun months)", color="darkred", size=3) +
annotate(geom="text", x=5.1, y=600, label="Current Absence", color=grey)
sum_current_abs <- sum(month_total_hours$absenteeism_time_in_hours)
sum_expect_abs <- round(sum(month_total_hours$absenteeism_time_expect),0)
relative_drecrease <- round((sum_current_abs-sum_expect_abs)/sum_current_abs*100,1)
absence_by_month_plot7 <- absence_by_month_plot6 + annotate(geom="text", x=11.5, y=250, label="Total of 4658 hrs", color="darkred",size=3.5) +
annotate(geom="text", x=11.5, y=200, label="(less 14.4%)", color="darkred", size=3) +
annotate(geom="text", x=11.5, y=600, label="Total of 5444 hrs", color=grey,size=3.5)
absence_by_month_plot7