IBM HR Analytics
In this case, we will analyze a data set on Human Resource Analytics. The IBM HR Analytics Employee Attrition & Performance data set is a fictional data set created by IBM data scientists.
knitr::opts_chunk$set(
message = FALSE,
warning = FALSE,
tidy=FALSE, # display code as typed
size="small") # slightly smaller font for code
options(digits = 3)
# default figure size
knitr::opts_chunk$set(
fig.width=6.75,
fig.height=6.75,
fig.align = "center"
)
library(tidyverse) # Load ggplot2, dplyr, and all the other tidyverse packages
library(mosaic)
library(ggthemes)
library(lubridate)
library(fivethirtyeight)
library(here)
library(skimr)
library(janitor)
library(vroom)
library(tidyquant)
library(rvest) # scrape websites
library(purrr)
library(lubridate) #to handle dates
library(kableExtra)
First let us load the data.
hr_dataset <- read_csv(here::here("data", "datasets_1067_1925_WA_Fn-UseC_-HR-Employee-Attrition.csv"))
glimpse(hr_dataset)
## Rows: 1,470
## Columns: 35
## $ Age <dbl> 41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35...
## $ Attrition <chr> "Yes", "No", "Yes", "No", "No", "No", "No"...
## $ BusinessTravel <chr> "Travel_Rarely", "Travel_Frequently", "Tra...
## $ DailyRate <dbl> 1102, 279, 1373, 1392, 591, 1005, 1324, 13...
## $ Department <chr> "Sales", "Research & Development", "Resear...
## $ DistanceFromHome <dbl> 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 2...
## $ Education <dbl> 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, ...
## $ EducationField <chr> "Life Sciences", "Life Sciences", "Other",...
## $ EmployeeCount <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ EmployeeNumber <dbl> 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, ...
## $ EnvironmentSatisfaction <dbl> 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, ...
## $ Gender <chr> "Female", "Male", "Male", "Female", "Male"...
## $ HourlyRate <dbl> 94, 61, 92, 56, 40, 79, 81, 67, 44, 94, 84...
## $ JobInvolvement <dbl> 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3, 3, ...
## $ JobLevel <dbl> 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, ...
## $ JobRole <chr> "Sales Executive", "Research Scientist", "...
## $ JobSatisfaction <dbl> 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, ...
## $ MaritalStatus <chr> "Single", "Married", "Single", "Married", ...
## $ MonthlyIncome <dbl> 5993, 5130, 2090, 2909, 3468, 3068, 2670, ...
## $ MonthlyRate <dbl> 19479, 24907, 2396, 23159, 16632, 11864, 9...
## $ NumCompaniesWorked <dbl> 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, ...
## $ Over18 <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y...
## $ OverTime <chr> "Yes", "No", "Yes", "Yes", "No", "No", "Ye...
## $ PercentSalaryHike <dbl> 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13...
## $ PerformanceRating <dbl> 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, ...
## $ RelationshipSatisfaction <dbl> 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4, 3, ...
## $ StandardHours <dbl> 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80...
## $ StockOptionLevel <dbl> 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, ...
## $ TotalWorkingYears <dbl> 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5...
## $ TrainingTimesLastYear <dbl> 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, ...
## $ WorkLifeBalance <dbl> 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, ...
## $ YearsAtCompany <dbl> 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2,...
## $ YearsInCurrentRole <dbl> 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, ...
## $ YearsSinceLastPromotion <dbl> 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, ...
## $ YearsWithCurrManager <dbl> 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, ...
I am going to clean the data set, as variable names are in capital letters, some variables are not really necessary, and some variables, e.g., education are given as a number rather than a more useful description
hr_cleaned <- hr_dataset %>%
clean_names() %>%
mutate(
education = case_when(
education == 1 ~ "Below College",
education == 2 ~ "College",
education == 3 ~ "Bachelor",
education == 4 ~ "Master",
education == 5 ~ "Doctor"
),
environment_satisfaction = case_when(
environment_satisfaction == 1 ~ "Low",
environment_satisfaction == 2 ~ "Medium",
environment_satisfaction == 3 ~ "High",
environment_satisfaction == 4 ~ "Very High"
),
job_satisfaction = case_when(
job_satisfaction == 1 ~ "Low",
job_satisfaction == 2 ~ "Medium",
job_satisfaction == 3 ~ "High",
job_satisfaction == 4 ~ "Very High"
),
performance_rating = case_when(
performance_rating == 1 ~ "Low",
performance_rating == 2 ~ "Good",
performance_rating == 3 ~ "Excellent",
performance_rating == 4 ~ "Outstanding"
),
work_life_balance = case_when(
work_life_balance == 1 ~ "Bad",
work_life_balance == 2 ~ "Good",
work_life_balance == 3 ~ "Better",
work_life_balance == 4 ~ "Best"
)
) %>%
select(age, attrition, daily_rate, department,
distance_from_home, education,
gender, job_role,environment_satisfaction,
job_satisfaction, marital_status,
monthly_income, num_companies_worked, percent_salary_hike,
performance_rating, total_working_years,
work_life_balance, years_at_company,
years_since_last_promotion)
How often do people leave the company ?
leave <- hr_cleaned%>%
group_by(attrition)%>%
count()%>%
ungroup()%>%
mutate(pct_job_satisfaction=n/sum(n))
ggplot(leave,aes(x=attrition,y=n))+
geom_col(fill="light blue")+
labs(title="The attrition rate of IBM is 16.1%",
x="Attrtion",
y="Number of people")+
theme_clean()+
NULL

There are 1470 observations in the dataset, and 237 employees left the company. Therefore, the attrition rate is 16.1%.
How are age, years_at_company, monthly_income and years_since_last_promotion distributed?
First of all, let’s take a look at the summary statistics.
summary(hr_cleaned)
## age attrition daily_rate department
## Min. :18.0 Length:1470 Min. : 102 Length:1470
## 1st Qu.:30.0 Class :character 1st Qu.: 465 Class :character
## Median :36.0 Mode :character Median : 802 Mode :character
## Mean :36.9 Mean : 802
## 3rd Qu.:43.0 3rd Qu.:1157
## Max. :60.0 Max. :1499
## distance_from_home education gender job_role
## Min. : 1.00 Length:1470 Length:1470 Length:1470
## 1st Qu.: 2.00 Class :character Class :character Class :character
## Median : 7.00 Mode :character Mode :character Mode :character
## Mean : 9.19
## 3rd Qu.:14.00
## Max. :29.00
## environment_satisfaction job_satisfaction marital_status monthly_income
## Length:1470 Length:1470 Length:1470 Min. : 1009
## Class :character Class :character Class :character 1st Qu.: 2911
## Mode :character Mode :character Mode :character Median : 4919
## Mean : 6503
## 3rd Qu.: 8379
## Max. :19999
## num_companies_worked percent_salary_hike performance_rating
## Min. :0.00 Min. :11.0 Length:1470
## 1st Qu.:1.00 1st Qu.:12.0 Class :character
## Median :2.00 Median :14.0 Mode :character
## Mean :2.69 Mean :15.2
## 3rd Qu.:4.00 3rd Qu.:18.0
## Max. :9.00 Max. :25.0
## total_working_years work_life_balance years_at_company
## Min. : 0.0 Length:1470 Min. : 0
## 1st Qu.: 6.0 Class :character 1st Qu.: 3
## Median :10.0 Mode :character Median : 5
## Mean :11.3 Mean : 7
## 3rd Qu.:15.0 3rd Qu.: 9
## Max. :40.0 Max. :40
## years_since_last_promotion
## Min. : 0.00
## 1st Qu.: 0.00
## Median : 1.00
## Mean : 2.19
## 3rd Qu.: 3.00
## Max. :15.00
Age <- hr_cleaned %>%
summarise(min = min(age),max = max(age), median=median(age), mean=mean(age), sd = sd(age))
Monthly_income <- hr_cleaned %>%
summarise(min = min(monthly_income),max = max(monthly_income), median=median(monthly_income), mean=mean(monthly_income), sd = sd(monthly_income))
Year_since_last_promotion <- hr_cleaned %>%
summarise(min = min(years_since_last_promotion),max = max(years_since_last_promotion), median=median(years_since_last_promotion), mean=mean(years_since_last_promotion), sd = sd(years_since_last_promotion))
stats_basic<-bind_rows(Age,Monthly_income,Year_since_last_promotion)
Factors<-c("Age","Monthly Income","Year Since Last Promotion")
stats_all<-bind_cols(Factors,stats_basic)
stats_all%>%
kbl()%>%
kable_styling()
| …1 | min | max | median | mean | sd |
|---|---|---|---|---|---|
| Age | 18 | 60 | 36 | 36.92 | 9.13 |
| Monthly Income | 1009 | 19999 | 4919 | 6502.93 | 4707.96 |
| Year Since Last Promotion | 0 | 15 | 1 | 2.19 | 3.22 |
Employees have an average age of nearly 37 in this company, and their average monthly income is 6502. And the average year since their last promotion is 2.19 years. Compared with the other 2 variables, the mean and median of employees’ age are close, and this value is nearly in the middle of total spread. Therefore, the distribution of age is closer to normal just by looking at summary statistics.
Here are the distributions of the age, monthly income of the employees and years since their last promotion. And the graphs below confirmed our guessing.
ggplot(hr_cleaned,aes(x=age))+
geom_density(fill="light blue",color="blue")+
labs(title = "Distribution of Age",
x="Age",
y="Density")+
theme_clean()+
NULL

ggplot(hr_cleaned,aes(x=monthly_income))+
geom_density(fill="light blue",color="blue")+
labs(title = "Distribution of Monthly Income",
x="Monthly Income",
y="Density")+
theme_clean()+
NULL

ggplot(hr_cleaned,aes(x=years_since_last_promotion))+
geom_density(fill="light blue",color="blue")+
labs(title = "Distribution of Years Since Last Promotion",
x="Years",
y="Density")+
theme_clean()+
NULL

The distribution of monthly income and years since last promotion are obviously not symmetrical and typically right skewed. While half of the employees would get promoted within one year, there are also people who haven’t get a promotion in 15 years. Meanwhile, the majority of people get a monthly payment lower than 5000, but someone could get more than 10000 a month as well. This distribution might reflect the staff level of this company.
How are job_satisfaction and work_life_balance distributed?
Here are the distribution of employees’ evaluation on job satisfaction and work-life balance.
job_satisfy<- hr_cleaned %>%
group_by(job_satisfaction)%>%
count()%>%
ungroup()%>%
mutate(pct_job_satisfaction=n/sum(n))%>%
mutate(edu_level=factor(job_satisfaction,ordered = TRUE,levels=c("Low","Medium","High","Very High")))
job_satisfy%>%
kbl()%>%
kable_styling()
| job_satisfaction | n | pct_job_satisfaction | edu_level |
|---|---|---|---|
| High | 442 | 0.301 | High |
| Low | 289 | 0.197 | Low |
| Medium | 280 | 0.190 | Medium |
| Very High | 459 | 0.312 | Very High |
ggplot(job_satisfy,aes(x=edu_level,y=n))+
geom_col(fill="light blue")+
labs(title = "Employee's Job Satisfaction",
x="Job Satisfaction",
y="Frequency")+
theme_clean()+
NULL

w_l_balance<- hr_cleaned %>%
group_by(work_life_balance) %>%
count() %>%
ungroup() %>%
mutate(pct_work_life_balance = n/sum(n))%>%
mutate(wl_level=factor(work_life_balance,ordered = TRUE,levels = c("Bad","Good","Better","Best") ))
w_l_balance%>%
kbl()%>%
kable_styling()
| work_life_balance | n | pct_work_life_balance | wl_level |
|---|---|---|---|
| Bad | 80 | 0.054 | Bad |
| Best | 153 | 0.104 | Best |
| Better | 893 | 0.607 | Better |
| Good | 344 | 0.234 | Good |
ggplot(w_l_balance,aes(x=wl_level,y=n))+
geom_col(fill="light blue")+
labs(title = "Employees' Work-Life Balance",
x="Work-life Balance",
y="Frequency")+
theme_clean()+
NULL

According to the tables and graphs above, over 60% of the 1470 employees are highly satisfied with their job, there are still 19.7% of employees who reported low job satisfaction. Meanwhile, over 70% of the employees regarded their work_life balance as better than “Good”, and 10% of employees have assessed their work-life balance as “Best”.
Besides,the number of people who have low job satisfaction are much bigger than those who have bad work-life balance, which suggests that other variables apart from work-life balance may be the main reason that lead to low job satisfaction.
Is there any relationship between monthly income and education? Monthly income and gender?
Let’s take a look at the distribution of monthly income in different educational levels.
hr_dataset %>%
select(Education, MonthlyIncome) %>%
cor()
## Education MonthlyIncome
## Education 1.000 0.095
## MonthlyIncome 0.095 1.000
mi_edu<-hr_cleaned%>%
group_by(education)%>%
summarise(average_income=mean(monthly_income),std_income=sd(monthly_income),min_income=min(monthly_income),max_income=max(monthly_income))
mi_edu%>%
kbl()%>%
kable_styling()
| education | average_income | std_income | min_income | max_income |
|---|---|---|---|---|
| Bachelor | 6517 | 4817 | 1081 | 19926 |
| Below College | 5641 | 4484 | 1009 | 19973 |
| College | 6227 | 4525 | 1051 | 19613 |
| Doctor | 8278 | 5061 | 2127 | 19586 |
| Master | 6832 | 4657 | 1359 | 19999 |
edu_income<- hr_cleaned%>%
mutate(edu_level=factor(education,ordered = TRUE,levels = c("Below College","College","Bachelor","Master","Doctor")))
ggplot(edu_income,aes(x=edu_level,y=monthly_income,color=education))+
geom_boxplot(fill="light blue", color="blue")+
labs(title = "Monthly Income Spread",
subtitle="in different education levels",
x="Education Level",
y="Monthly Income")+
theme_clean()+
theme(legend.position="none")+
NULL

The correlation between education level and monthly income is 0.095, which shows that the 2 variable are positively correlated. However, this correlation is also close to zero, so let’s check the income distribution across different education levels.
The average monthly income moves up as the years of education goes up, which means those who with higher educational level may have a higher income level in general. And The variance becomes greater as the education level increases as well. Meanwhile, each group has some people whose monthly income is more than 15000, which means that the educational background would not bring certain ceiling in their income.
Now let’s look at the monthly income distribution in different gender groups.
mi_gender<-hr_cleaned%>%
group_by(gender)%>%
summarise(average_income=mean(monthly_income),std_income=sd(monthly_income),min_income=min(monthly_income),max_income=max(monthly_income))
mi_gender%>%
kbl()%>%
kable_styling()
| gender | average_income | std_income | min_income | max_income |
|---|---|---|---|---|
| Female | 6687 | 4696 | 1129 | 19973 |
| Male | 6381 | 4715 | 1009 | 19999 |
ggplot(hr_cleaned,aes(x=gender,y=monthly_income,color=gender))+
geom_boxplot(fill="light blue", color="blue")+
labs(title = "Monthly Income Spread",
subtitle="in gender groups",
x="Gender",
y="Monthly Income")+
theme_clean()+
theme(legend.position="none")+
NULL

The overall distribution is quite similar between gender groups. The average monthly payment of female employees is 6687, which is 300 more than that of male employees. What’s more, the variance of monthly income in male group are higher than that in female group.
Monthly Income Distribution in Different Job Roles
hr_cleaned%>%
group_by(job_role)%>%
arrange(monthly_income)
## # A tibble: 1,470 x 19
## # Groups: job_role [9]
## age attrition daily_rate department distance_from_h~ education gender
## <dbl> <chr> <dbl> <chr> <dbl> <chr> <chr>
## 1 20 Yes 1362 Research ~ 10 Below Co~ Male
## 2 18 No 287 Research ~ 5 College Male
## 3 28 No 1144 Sales 10 Below Co~ Male
## 4 30 Yes 945 Sales 9 Bachelor Male
## 5 29 Yes 746 Sales 24 Bachelor Male
## 6 19 Yes 303 Research ~ 2 Bachelor Male
## 7 25 Yes 599 Sales 24 Below Co~ Male
## 8 31 No 1276 Research ~ 2 Below Co~ Female
## 9 18 No 812 Sales 10 Bachelor Female
## 10 23 No 373 Research ~ 1 College Male
## # ... with 1,460 more rows, and 12 more variables: job_role <chr>,
## # environment_satisfaction <chr>, job_satisfaction <chr>,
## # marital_status <chr>, monthly_income <dbl>, num_companies_worked <dbl>,
## # percent_salary_hike <dbl>, performance_rating <chr>,
## # total_working_years <dbl>, work_life_balance <chr>, years_at_company <dbl>,
## # years_since_last_promotion <dbl>
ggplot(hr_cleaned,aes(x=reorder(job_role,-monthly_income),y=monthly_income))+
geom_boxplot(fill="light blue", color="blue")+
labs(title = "Monthly Income Distribution",
subtitle="in different job roles",
x="Job Role",
y="Monthly Income")+
theme_clean()+
theme(axis.text.x = element_text(angle=45,hjust=1,vjust=1))+
NULL

The average monthly income of managers and research directors are more than 15000, which is more than 3 times of research scientists’, laboratory technicians’ and sales representatives’ average income. Meanwhile, income has a right skewed distribution job roles apart from manager and research director.
Calculate and plot a bar chart of the mean income by education level.
mean_income<-hr_cleaned%>%
group_by(education)%>%
summarise(average_income=mean(monthly_income))%>%
mutate(edu_level=factor(education,ordered = TRUE,levels = c("Below College","College","Bachelor","Master","Doctor")))
mean_income%>%
kbl()%>%
kable_styling()
| education | average_income | edu_level |
|---|---|---|
| Bachelor | 6517 | Bachelor |
| Below College | 5641 | Below College |
| College | 6227 | College |
| Doctor | 8278 | Doctor |
| Master | 6832 | Master |
ggplot(mean_income,aes(x=edu_level,y=average_income))+
geom_col(fill="light blue")+
labs(title = "Average Monthly Income in Different Education Level",
x="Education Level",
y="Monthly Income")+
theme_clean()+
NULL

Monthly Income in Different Education Levels
ggplot(edu_income,aes(x=monthly_income))+
geom_density(fill="light blue", color="blue")+
facet_wrap(~edu_level)+
labs(title = "Monthly Income Distributions",
subtitle="in different education levels",
x="Monthly Income",
y="Density")+
theme_clean()+
NULL

The right tail of income distribution becomes fatter as the education level increases, which also supports the positive correlation between monthly income and education level.
Monthly Income Distribution in Ages
ggplot(hr_cleaned,aes(x=age, y=monthly_income,color=age))+
geom_col(fill="light blue")+
facet_wrap(~reorder(job_role,-monthly_income))+
labs(title = "Monthly Income Distribution in Ages",
subtitle="across job roles",
x="Age",
y="Monthly Income")+
theme_clean()+
theme(legend.position="none")+
NULL

Employees’ monthly income usually increases with age and and comes to the peak between their thirties to forties. While in senior roles such as managers and research directors, this peak usually appears around their fifties.
The spread of monthly income varies a lot among different job roles as well.Typically, sales executives and manager have wider spread of income than that of HRs and sales representatives.