IBM HR Analytics

In this case, we will analyze a data set on Human Resource Analytics. The IBM HR Analytics Employee Attrition & Performance data set is a fictional data set created by IBM data scientists.

knitr::opts_chunk$set(
  message = FALSE, 
  warning = FALSE, 
  tidy=FALSE,     # display code as typed
  size="small")   # slightly smaller font for code
options(digits = 3)

# default figure size
knitr::opts_chunk$set(
  fig.width=6.75, 
  fig.height=6.75,
  fig.align = "center"
)

library(tidyverse)  # Load ggplot2, dplyr, and all the other tidyverse packages
library(mosaic)
library(ggthemes)
library(lubridate)
library(fivethirtyeight)
library(here)
library(skimr)
library(janitor)
library(vroom)
library(tidyquant)
library(rvest)    # scrape websites
library(purrr)  
library(lubridate) #to handle dates
library(kableExtra)

First let us load the data.

hr_dataset <- read_csv(here::here("data", "datasets_1067_1925_WA_Fn-UseC_-HR-Employee-Attrition.csv"))
glimpse(hr_dataset)

## Rows: 1,470
## Columns: 35
## $ Age                      <dbl> 41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35...
## $ Attrition                <chr> "Yes", "No", "Yes", "No", "No", "No", "No"...
## $ BusinessTravel           <chr> "Travel_Rarely", "Travel_Frequently", "Tra...
## $ DailyRate                <dbl> 1102, 279, 1373, 1392, 591, 1005, 1324, 13...
## $ Department               <chr> "Sales", "Research & Development", "Resear...
## $ DistanceFromHome         <dbl> 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 2...
## $ Education                <dbl> 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, ...
## $ EducationField           <chr> "Life Sciences", "Life Sciences", "Other",...
## $ EmployeeCount            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ EmployeeNumber           <dbl> 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, ...
## $ EnvironmentSatisfaction  <dbl> 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, ...
## $ Gender                   <chr> "Female", "Male", "Male", "Female", "Male"...
## $ HourlyRate               <dbl> 94, 61, 92, 56, 40, 79, 81, 67, 44, 94, 84...
## $ JobInvolvement           <dbl> 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3, 3, ...
## $ JobLevel                 <dbl> 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, ...
## $ JobRole                  <chr> "Sales Executive", "Research Scientist", "...
## $ JobSatisfaction          <dbl> 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, ...
## $ MaritalStatus            <chr> "Single", "Married", "Single", "Married", ...
## $ MonthlyIncome            <dbl> 5993, 5130, 2090, 2909, 3468, 3068, 2670, ...
## $ MonthlyRate              <dbl> 19479, 24907, 2396, 23159, 16632, 11864, 9...
## $ NumCompaniesWorked       <dbl> 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, ...
## $ Over18                   <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y...
## $ OverTime                 <chr> "Yes", "No", "Yes", "Yes", "No", "No", "Ye...
## $ PercentSalaryHike        <dbl> 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13...
## $ PerformanceRating        <dbl> 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, ...
## $ RelationshipSatisfaction <dbl> 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4, 3, ...
## $ StandardHours            <dbl> 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80...
## $ StockOptionLevel         <dbl> 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, ...
## $ TotalWorkingYears        <dbl> 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5...
## $ TrainingTimesLastYear    <dbl> 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, ...
## $ WorkLifeBalance          <dbl> 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, ...
## $ YearsAtCompany           <dbl> 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2,...
## $ YearsInCurrentRole       <dbl> 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, ...
## $ YearsSinceLastPromotion  <dbl> 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, ...
## $ YearsWithCurrManager     <dbl> 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, ...

I am going to clean the data set, as variable names are in capital letters, some variables are not really necessary, and some variables, e.g., education are given as a number rather than a more useful description

hr_cleaned <- hr_dataset %>% 
  clean_names() %>% 
  mutate(
    education = case_when(
      education == 1 ~ "Below College",
      education == 2 ~ "College",
      education == 3 ~ "Bachelor",
      education == 4 ~ "Master",
      education == 5 ~ "Doctor"
    ),
    environment_satisfaction = case_when(
      environment_satisfaction == 1 ~ "Low",
      environment_satisfaction == 2 ~ "Medium",
      environment_satisfaction == 3 ~ "High",
      environment_satisfaction == 4 ~ "Very High"
    ),
    job_satisfaction = case_when(
      job_satisfaction == 1 ~ "Low",
      job_satisfaction == 2 ~ "Medium",
      job_satisfaction == 3 ~ "High",
      job_satisfaction == 4 ~ "Very High"
    ),
    performance_rating = case_when(
      performance_rating == 1 ~ "Low",
      performance_rating == 2 ~ "Good",
      performance_rating == 3 ~ "Excellent",
      performance_rating == 4 ~ "Outstanding"
    ),
    work_life_balance = case_when(
      work_life_balance == 1 ~ "Bad",
      work_life_balance == 2 ~ "Good",
      work_life_balance == 3 ~ "Better",
      work_life_balance == 4 ~ "Best"
    )
  ) %>% 
  select(age, attrition, daily_rate, department,
         distance_from_home, education,
         gender, job_role,environment_satisfaction,
         job_satisfaction, marital_status,
         monthly_income, num_companies_worked, percent_salary_hike,
         performance_rating, total_working_years,
         work_life_balance, years_at_company,
         years_since_last_promotion)

How often do people leave the company ?

leave <- hr_cleaned%>%
  group_by(attrition)%>%
  count()%>%
  ungroup()%>%
  mutate(pct_job_satisfaction=n/sum(n))
ggplot(leave,aes(x=attrition,y=n))+
  geom_col(fill="light blue")+
  labs(title="The attrition rate of IBM is 16.1%",
       x="Attrtion",
       y="Number of people")+
  theme_clean()+
  NULL

There are 1470 observations in the dataset, and 237 employees left the company. Therefore, the attrition rate is 16.1%.

How are `age`, `years_at_company`, `monthly_income` and `years_since_last_promotion` distributed?

First of all, let’s take a look at the summary statistics.

summary(hr_cleaned)

##       age        attrition           daily_rate    department       
##  Min.   :18.0   Length:1470        Min.   : 102   Length:1470       
##  1st Qu.:30.0   Class :character   1st Qu.: 465   Class :character  
##  Median :36.0   Mode  :character   Median : 802   Mode  :character  
##  Mean   :36.9                      Mean   : 802                     
##  3rd Qu.:43.0                      3rd Qu.:1157                     
##  Max.   :60.0                      Max.   :1499                     
##  distance_from_home  education            gender            job_role        
##  Min.   : 1.00      Length:1470        Length:1470        Length:1470       
##  1st Qu.: 2.00      Class :character   Class :character   Class :character  
##  Median : 7.00      Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 9.19                                                              
##  3rd Qu.:14.00                                                              
##  Max.   :29.00                                                              
##  environment_satisfaction job_satisfaction   marital_status     monthly_income 
##  Length:1470              Length:1470        Length:1470        Min.   : 1009  
##  Class :character         Class :character   Class :character   1st Qu.: 2911  
##  Mode  :character         Mode  :character   Mode  :character   Median : 4919  
##                                                                 Mean   : 6503  
##                                                                 3rd Qu.: 8379  
##                                                                 Max.   :19999  
##  num_companies_worked percent_salary_hike performance_rating
##  Min.   :0.00         Min.   :11.0        Length:1470       
##  1st Qu.:1.00         1st Qu.:12.0        Class :character  
##  Median :2.00         Median :14.0        Mode  :character  
##  Mean   :2.69         Mean   :15.2                          
##  3rd Qu.:4.00         3rd Qu.:18.0                          
##  Max.   :9.00         Max.   :25.0                          
##  total_working_years work_life_balance  years_at_company
##  Min.   : 0.0        Length:1470        Min.   : 0      
##  1st Qu.: 6.0        Class :character   1st Qu.: 3      
##  Median :10.0        Mode  :character   Median : 5      
##  Mean   :11.3                           Mean   : 7      
##  3rd Qu.:15.0                           3rd Qu.: 9      
##  Max.   :40.0                           Max.   :40      
##  years_since_last_promotion
##  Min.   : 0.00             
##  1st Qu.: 0.00             
##  Median : 1.00             
##  Mean   : 2.19             
##  3rd Qu.: 3.00             
##  Max.   :15.00

Age <- hr_cleaned %>% 
  summarise(min = min(age),max = max(age), median=median(age), mean=mean(age), sd = sd(age))
Monthly_income <- hr_cleaned %>% 
  summarise(min = min(monthly_income),max = max(monthly_income), median=median(monthly_income), mean=mean(monthly_income), sd = sd(monthly_income))
Year_since_last_promotion <- hr_cleaned %>% 
  summarise(min = min(years_since_last_promotion),max = max(years_since_last_promotion), median=median(years_since_last_promotion), mean=mean(years_since_last_promotion), sd = sd(years_since_last_promotion))
stats_basic<-bind_rows(Age,Monthly_income,Year_since_last_promotion)
Factors<-c("Age","Monthly Income","Year Since Last Promotion")
stats_all<-bind_cols(Factors,stats_basic)

stats_all%>%
  kbl()%>%
  kable_styling()

…1	min	max	median	mean	sd
Age	18	60	36	36.92	9.13
Monthly Income	1009	19999	4919	6502.93	4707.96
Year Since Last Promotion	0	15	1	2.19	3.22

Employees have an average age of nearly 37 in this company, and their average monthly income is 6502. And the average year since their last promotion is 2.19 years. Compared with the other 2 variables, the mean and median of employees’ age are close, and this value is nearly in the middle of total spread. Therefore, the distribution of age is closer to normal just by looking at summary statistics.

Here are the distributions of the age, monthly income of the employees and years since their last promotion. And the graphs below confirmed our guessing.

ggplot(hr_cleaned,aes(x=age))+
  geom_density(fill="light blue",color="blue")+
  labs(title = "Distribution of Age",
       x="Age",
       y="Density")+
  theme_clean()+
  NULL

ggplot(hr_cleaned,aes(x=monthly_income))+
  geom_density(fill="light blue",color="blue")+
  labs(title = "Distribution of Monthly Income",
       x="Monthly Income",
       y="Density")+
  theme_clean()+
  NULL

ggplot(hr_cleaned,aes(x=years_since_last_promotion))+
  geom_density(fill="light blue",color="blue")+
  labs(title = "Distribution of Years Since Last Promotion",
       x="Years",
       y="Density")+
  theme_clean()+
  NULL

The distribution of monthly income and years since last promotion are obviously not symmetrical and typically right skewed. While half of the employees would get promoted within one year, there are also people who haven’t get a promotion in 15 years. Meanwhile, the majority of people get a monthly payment lower than 5000, but someone could get more than 10000 a month as well. This distribution might reflect the staff level of this company.

How are `job_satisfaction` and `work_life_balance` distributed?

Here are the distribution of employees’ evaluation on job satisfaction and work-life balance.

job_satisfy<- hr_cleaned %>%
  group_by(job_satisfaction)%>%
  count()%>%
  ungroup()%>%
  mutate(pct_job_satisfaction=n/sum(n))%>%
  mutate(edu_level=factor(job_satisfaction,ordered = TRUE,levels=c("Low","Medium","High","Very High")))
job_satisfy%>%
  kbl()%>%
  kable_styling()

job_satisfaction	n	pct_job_satisfaction	edu_level
High	442	0.301	High
Low	289	0.197	Low
Medium	280	0.190	Medium
Very High	459	0.312	Very High

ggplot(job_satisfy,aes(x=edu_level,y=n))+
  geom_col(fill="light blue")+
  labs(title = "Employee's Job Satisfaction",
       x="Job Satisfaction",
       y="Frequency")+
  theme_clean()+
  NULL

w_l_balance<- hr_cleaned %>%
  group_by(work_life_balance) %>%
  count() %>%
  ungroup() %>%
  mutate(pct_work_life_balance = n/sum(n))%>%
  mutate(wl_level=factor(work_life_balance,ordered = TRUE,levels = c("Bad","Good","Better","Best") ))
w_l_balance%>%
  kbl()%>%
  kable_styling()

work_life_balance	n	pct_work_life_balance	wl_level
Bad	80	0.054	Bad
Best	153	0.104	Best
Better	893	0.607	Better
Good	344	0.234	Good

ggplot(w_l_balance,aes(x=wl_level,y=n))+
  geom_col(fill="light blue")+
  labs(title = "Employees' Work-Life Balance",
       x="Work-life Balance",
       y="Frequency")+
  theme_clean()+
  NULL

According to the tables and graphs above, over 60% of the 1470 employees are highly satisfied with their job, there are still 19.7% of employees who reported low job satisfaction. Meanwhile, over 70% of the employees regarded their work_life balance as better than “Good”, and 10% of employees have assessed their work-life balance as “Best”.

Besides,the number of people who have low job satisfaction are much bigger than those who have bad work-life balance, which suggests that other variables apart from work-life balance may be the main reason that lead to low job satisfaction.

Is there any relationship between monthly income and education? Monthly income and gender?

Let’s take a look at the distribution of monthly income in different educational levels.

hr_dataset %>% 
  select(Education, MonthlyIncome) %>% 
  cor()

##               Education MonthlyIncome
## Education         1.000         0.095
## MonthlyIncome     0.095         1.000

mi_edu<-hr_cleaned%>%
  group_by(education)%>%
  summarise(average_income=mean(monthly_income),std_income=sd(monthly_income),min_income=min(monthly_income),max_income=max(monthly_income))
mi_edu%>%
  kbl()%>%
  kable_styling()

education	average_income	std_income	min_income	max_income
Bachelor	6517	4817	1081	19926
Below College	5641	4484	1009	19973
College	6227	4525	1051	19613
Doctor	8278	5061	2127	19586
Master	6832	4657	1359	19999

edu_income<- hr_cleaned%>%
  mutate(edu_level=factor(education,ordered = TRUE,levels = c("Below College","College","Bachelor","Master","Doctor")))

ggplot(edu_income,aes(x=edu_level,y=monthly_income,color=education))+
  geom_boxplot(fill="light blue", color="blue")+
  labs(title = "Monthly Income Spread",
       subtitle="in different education levels",
       x="Education Level",
       y="Monthly Income")+
  theme_clean()+
  theme(legend.position="none")+
  NULL

The correlation between education level and monthly income is 0.095, which shows that the 2 variable are positively correlated. However, this correlation is also close to zero, so let’s check the income distribution across different education levels.

The average monthly income moves up as the years of education goes up, which means those who with higher educational level may have a higher income level in general. And The variance becomes greater as the education level increases as well. Meanwhile, each group has some people whose monthly income is more than 15000, which means that the educational background would not bring certain ceiling in their income.

Now let’s look at the monthly income distribution in different gender groups.

mi_gender<-hr_cleaned%>%
  group_by(gender)%>%
  summarise(average_income=mean(monthly_income),std_income=sd(monthly_income),min_income=min(monthly_income),max_income=max(monthly_income))
mi_gender%>%
  kbl()%>%
  kable_styling()

gender	average_income	std_income	min_income	max_income
Female	6687	4696	1129	19973
Male	6381	4715	1009	19999

ggplot(hr_cleaned,aes(x=gender,y=monthly_income,color=gender))+
  geom_boxplot(fill="light blue", color="blue")+
  labs(title = "Monthly Income Spread",
       subtitle="in gender groups",
       x="Gender",
       y="Monthly Income")+
  theme_clean()+
  theme(legend.position="none")+
  NULL

The overall distribution is quite similar between gender groups. The average monthly payment of female employees is 6687, which is 300 more than that of male employees. What’s more, the variance of monthly income in male group are higher than that in female group.

Monthly Income Distribution in Different Job Roles

hr_cleaned%>%
  group_by(job_role)%>%
  arrange(monthly_income)

## # A tibble: 1,470 x 19
## # Groups:   job_role [9]
##      age attrition daily_rate department distance_from_h~ education gender
##    <dbl> <chr>          <dbl> <chr>                 <dbl> <chr>     <chr> 
##  1    20 Yes             1362 Research ~               10 Below Co~ Male  
##  2    18 No               287 Research ~                5 College   Male  
##  3    28 No              1144 Sales                    10 Below Co~ Male  
##  4    30 Yes              945 Sales                     9 Bachelor  Male  
##  5    29 Yes              746 Sales                    24 Bachelor  Male  
##  6    19 Yes              303 Research ~                2 Bachelor  Male  
##  7    25 Yes              599 Sales                    24 Below Co~ Male  
##  8    31 No              1276 Research ~                2 Below Co~ Female
##  9    18 No               812 Sales                    10 Bachelor  Female
## 10    23 No               373 Research ~                1 College   Male  
## # ... with 1,460 more rows, and 12 more variables: job_role <chr>,
## #   environment_satisfaction <chr>, job_satisfaction <chr>,
## #   marital_status <chr>, monthly_income <dbl>, num_companies_worked <dbl>,
## #   percent_salary_hike <dbl>, performance_rating <chr>,
## #   total_working_years <dbl>, work_life_balance <chr>, years_at_company <dbl>,
## #   years_since_last_promotion <dbl>

ggplot(hr_cleaned,aes(x=reorder(job_role,-monthly_income),y=monthly_income))+
  geom_boxplot(fill="light blue", color="blue")+
  labs(title = "Monthly Income Distribution",
       subtitle="in different job roles",
       x="Job Role",
       y="Monthly Income")+
  theme_clean()+
  theme(axis.text.x = element_text(angle=45,hjust=1,vjust=1))+
  NULL

The average monthly income of managers and research directors are more than 15000, which is more than 3 times of research scientists’, laboratory technicians’ and sales representatives’ average income. Meanwhile, income has a right skewed distribution job roles apart from manager and research director.

Calculate and plot a bar chart of the mean income by education level.

mean_income<-hr_cleaned%>%
  group_by(education)%>%
  summarise(average_income=mean(monthly_income))%>%
  mutate(edu_level=factor(education,ordered = TRUE,levels = c("Below College","College","Bachelor","Master","Doctor")))

mean_income%>%
  kbl()%>%
  kable_styling()

education	average_income	edu_level
Bachelor	6517	Bachelor
Below College	5641	Below College
College	6227	College
Doctor	8278	Doctor
Master	6832	Master

ggplot(mean_income,aes(x=edu_level,y=average_income))+
  geom_col(fill="light blue")+
  labs(title = "Average Monthly Income in Different Education Level",
       x="Education Level",
       y="Monthly Income")+
  theme_clean()+
  NULL

Monthly Income in Different Education Levels

ggplot(edu_income,aes(x=monthly_income))+
  geom_density(fill="light blue", color="blue")+
  facet_wrap(~edu_level)+
  labs(title = "Monthly Income Distributions",
       subtitle="in different education levels",
       x="Monthly Income",
       y="Density")+
  theme_clean()+
  NULL

The right tail of income distribution becomes fatter as the education level increases, which also supports the positive correlation between monthly income and education level.

Monthly Income Distribution in Ages

ggplot(hr_cleaned,aes(x=age, y=monthly_income,color=age))+
  geom_col(fill="light blue")+
  facet_wrap(~reorder(job_role,-monthly_income))+
  labs(title = "Monthly Income Distribution in Ages",
       subtitle="across job roles",
       x="Age",
       y="Monthly Income")+
  theme_clean()+
  theme(legend.position="none")+
  NULL

Employees’ monthly income usually increases with age and and comes to the peak between their thirties to forties. While in senior roles such as managers and research directors, this peak usually appears around their fifties.

The spread of monthly income varies a lot among different job roles as well.Typically, sales executives and manager have wider spread of income than that of HRs and sales representatives.