Importing important libraries
require('ggplot2')
require('dplyr')
require('tidyr')
The first step is to import the data and view it
employee_data <-read.csv('C:/Users/97798/Documents/sem 4 materials/SEM 4/CT127-3-2-PFDA/Assignment - Individual/employee_attrition.csv')
head(employee_data)
## EmployeeID recorddate_key birthdate_key orighiredate_key terminationdate_key
## 1 1318 12/31/2006 0:00 1/3/1954 8/28/1989 1/1/1900
## 2 1318 12/31/2007 0:00 1/3/1954 8/28/1989 1/1/1900
## 3 1318 12/31/2008 0:00 1/3/1954 8/28/1989 1/1/1900
## 4 1318 12/31/2009 0:00 1/3/1954 8/28/1989 1/1/1900
## 5 1318 12/31/2010 0:00 1/3/1954 8/28/1989 1/1/1900
## 6 1318 12/31/2011 0:00 1/3/1954 8/28/1989 1/1/1900
## age length_of_service city_name department_name job_title store_name
## 1 52 17 Vancouver Executive CEO 35
## 2 53 18 Vancouver Executive CEO 35
## 3 54 19 Vancouver Executive CEO 35
## 4 55 20 Vancouver Executive CEO 35
## 5 56 21 Vancouver Executive CEO 35
## 6 57 22 Vancouver Executive CEO 35
## gender_short gender_full termreason_desc termtype_desc STATUS_YEAR STATUS
## 1 M Male Not Applicable Not Applicable 2006 ACTIVE
## 2 M Male Not Applicable Not Applicable 2007 ACTIVE
## 3 M Male Not Applicable Not Applicable 2008 ACTIVE
## 4 M Male Not Applicable Not Applicable 2009 ACTIVE
## 5 M Male Not Applicable Not Applicable 2010 ACTIVE
## 6 M Male Not Applicable Not Applicable 2011 ACTIVE
## BUSINESS_UNIT
## 1 HEADOFFICE
## 2 HEADOFFICE
## 3 HEADOFFICE
## 4 HEADOFFICE
## 5 HEADOFFICE
## 6 HEADOFFICE
The first step is to check for missing values. there is not missing values
missing_data = colSums(is.na(employee_data))
print(missing_data)
## EmployeeID recorddate_key birthdate_key orighiredate_key
## 0 0 0 0
## terminationdate_key age length_of_service city_name
## 0 0 0 0
## department_name job_title store_name gender_short
## 0 0 0 0
## gender_full termreason_desc termtype_desc STATUS_YEAR
## 0 0 0 0
## STATUS BUSINESS_UNIT
## 0 0
There is no missing data Now understanding data structure
str(employee_data)
## 'data.frame': 49653 obs. of 18 variables:
## $ EmployeeID : int 1318 1318 1318 1318 1318 1318 1318 1318 1318 1318 ...
## $ recorddate_key : chr "12/31/2006 0:00" "12/31/2007 0:00" "12/31/2008 0:00" "12/31/2009 0:00" ...
## $ birthdate_key : chr "1/3/1954" "1/3/1954" "1/3/1954" "1/3/1954" ...
## $ orighiredate_key : chr "8/28/1989" "8/28/1989" "8/28/1989" "8/28/1989" ...
## $ terminationdate_key: chr "1/1/1900" "1/1/1900" "1/1/1900" "1/1/1900" ...
## $ age : int 52 53 54 55 56 57 58 59 60 61 ...
## $ length_of_service : int 17 18 19 20 21 22 23 24 25 26 ...
## $ city_name : chr "Vancouver" "Vancouver" "Vancouver" "Vancouver" ...
## $ department_name : chr "Executive" "Executive" "Executive" "Executive" ...
## $ job_title : chr "CEO" "CEO" "CEO" "CEO" ...
## $ store_name : int 35 35 35 35 35 35 35 35 35 35 ...
## $ gender_short : chr "M" "M" "M" "M" ...
## $ gender_full : chr "Male" "Male" "Male" "Male" ...
## $ termreason_desc : chr "Not Applicable" "Not Applicable" "Not Applicable" "Not Applicable" ...
## $ termtype_desc : chr "Not Applicable" "Not Applicable" "Not Applicable" "Not Applicable" ...
## $ STATUS_YEAR : int 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 ...
## $ STATUS : chr "ACTIVE" "ACTIVE" "ACTIVE" "ACTIVE" ...
## $ BUSINESS_UNIT : chr "HEADOFFICE" "HEADOFFICE" "HEADOFFICE" "HEADOFFICE" ...
The data stores and updates employee data in the same file. Each year a new record is added for example 1318 the ceo has been working from 2006 to 2015 and thier data is updated each year.
convert all date columns to date
cols = colnames(employee_data)
for (col in cols){
if(length(grep("date",col))){
employee_data[[col]] = as.Date(employee_data[[col]], format = "%m/%d/%Y")
}
}
str(employee_data)
## 'data.frame': 49653 obs. of 18 variables:
## $ EmployeeID : int 1318 1318 1318 1318 1318 1318 1318 1318 1318 1318 ...
## $ recorddate_key : Date, format: "2006-12-31" "2007-12-31" ...
## $ birthdate_key : Date, format: "1954-01-03" "1954-01-03" ...
## $ orighiredate_key : Date, format: "1989-08-28" "1989-08-28" ...
## $ terminationdate_key: Date, format: "1900-01-01" "1900-01-01" ...
## $ age : int 52 53 54 55 56 57 58 59 60 61 ...
## $ length_of_service : int 17 18 19 20 21 22 23 24 25 26 ...
## $ city_name : chr "Vancouver" "Vancouver" "Vancouver" "Vancouver" ...
## $ department_name : chr "Executive" "Executive" "Executive" "Executive" ...
## $ job_title : chr "CEO" "CEO" "CEO" "CEO" ...
## $ store_name : int 35 35 35 35 35 35 35 35 35 35 ...
## $ gender_short : chr "M" "M" "M" "M" ...
## $ gender_full : chr "Male" "Male" "Male" "Male" ...
## $ termreason_desc : chr "Not Applicable" "Not Applicable" "Not Applicable" "Not Applicable" ...
## $ termtype_desc : chr "Not Applicable" "Not Applicable" "Not Applicable" "Not Applicable" ...
## $ STATUS_YEAR : int 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 ...
## $ STATUS : chr "ACTIVE" "ACTIVE" "ACTIVE" "ACTIVE" ...
## $ BUSINESS_UNIT : chr "HEADOFFICE" "HEADOFFICE" "HEADOFFICE" "HEADOFFICE" ...
The data stores and updates employee data in the same file. Each year a new record is added for example 1318 the ceo has been working from 2006 to 2015 and thier data is updated each year. Firstly i will segreate create a new data with one row for one employee by fetching their latest record. First, group the records by employeeID, arrange them based on status and then recorddate in descending order. If the employee has terminated status that will be their latest record. If they have not yet been terminated then their latest recorddate will be their latest record.
employee_latest_record <- employee_data
employee_latest_record <- as.data.frame( employee_latest_record %>%
group_by(EmployeeID) %>%
arrange(desc(STATUS), desc(recorddate_key)) %>%
slice(1) %>%
ungroup())
Now, getting the summary for Numerical vairables
numerical_columns <- c("age", "length_of_service")
summary(employee_latest_record[,numerical_columns])
## age length_of_service
## Min. :19.00 Min. : 0.00
## 1st Qu.:32.00 1st Qu.: 7.00
## Median :45.00 Median :13.00
## Mean :44.74 Mean :12.84
## 3rd Qu.:58.00 3rd Qu.:19.00
## Max. :65.00 Max. :26.00
Then, getting all the unique values for each Categorical variables
categorical_columns <- c('city_name','department_name','job_title','store_name','gender_full','termreason_desc', 'termtype_desc','STATUS', 'BUSINESS_UNIT')
for ( category in categorical_columns){
# unique_values <- toString(unique(employee_latest_record[[category]]))
print(paste('The unique vairables in ',category,'column are:') )
print(unique(employee_latest_record[category]))
}
## [1] "The unique vairables in city_name column are:"
## city_name
## 1 Vancouver
## 9 Terrace
## 11 Nanaimo
## 16 Nelson
## 17 Kelowna
## 19 Victoria
## 25 Kamloops
## 33 Fort St John
## 35 Surrey
## 45 Vernon
## 49 Quesnel
## 51 Chilliwack
## 59 Dawson Creek
## 71 Squamish
## 72 New Westminster
## 82 Port Coquitlam
## 84 Cortes Island
## 85 Burnaby
## 107 Bella Bella
## 109 Cranbrook
## 111 Williams Lake
## 119 Trail
## 127 Prince George
## 129 Richmond
## 194 Grand Forks
## 195 West Vancouver
## 211 Abbotsford
## 228 Aldergrove
## 279 Langley
## 305 North Vancouver
## 343 White Rock
## 399 New Westminister
## 434 Fort Nelson
## 453 Haney
## 639 Valemount
## 683 Ocean Falls
## 713 Pitt Meadows
## 799 Princeton
## 1320 Dease Lake
## 3249 Blue River
## [1] "The unique vairables in department_name column are:"
## department_name
## 1 Executive
## 9 Store Management
## 10 Meats
## 14 Recruitment
## 15 Training
## 18 Labor Relations
## 20 HR Technology
## 21 Employee Records
## 23 Compensation
## 24 Legal
## 28 Produce
## 30 Accounts Receiveable
## 32 Bakery
## 34 Information Technology
## 36 Accounts Payable
## 37 Audit
## 38 Accounting
## 40 Investment
## 41 Dairy
## 47 Processed Foods
## 53 Customer Service
## [1] "The unique vairables in job_title column are:"
## job_title
## 1 CEO
## 2 VP Stores
## 3 Legal Counsel
## 4 VP Human Resources
## 5 VP Finance
## 6 Exec Assistant, VP Stores
## 7 Exec Assistant, Legal Counsel
## 8 CHief Information Officer
## 9 Store Manager
## 10 Meats Manager
## 12 Exec Assistant, Human Resources
## 13 Exec Assistant, Finance
## 14 Director, Recruitment
## 15 Director, Training
## 18 Director, Labor Relations
## 20 Director, HR Technology
## 21 Director, Employee Records
## 23 Director, Compensation
## 24 Corporate Lawyer
## 28 Produce Manager
## 30 Director, Accounts Receivable
## 32 Bakery Manager
## 34 Systems Analyst
## 36 Director, Accounts Payable
## 37 Director, Audit
## 38 Director, Accounting
## 40 Director, Investments
## 41 Dairy Person
## 42 Recruiter
## 47 Processed Foods Manager
## 53 Customer Service Manager
## 60 Trainer
## 67 Meat Cutter
## 87 Labor Relations Analyst
## 96 Dairy Manager
## 100 HRIS Analyst
## 114 Benefits Admin
## 130 Compensation Analyst
## 134 Accounts Receiveable Clerk
## 144 Accounts Payable Clerk
## 149 Baker
## 160 Auditor
## 167 Accounting Clerk
## 185 Investment Analyst
## 384 Produce Clerk
## 759 Shelf Stocker
## 1057 Cashier
## [1] "The unique vairables in store_name column are:"
## store_name
## 1 35
## 9 32
## 11 18
## 16 19
## 17 16
## 19 37
## 25 15
## 33 12
## 35 31
## 45 36
## 49 28
## 51 6
## 59 9
## 71 30
## 72 21
## 73 46
## 82 25
## 84 7
## 85 5
## 107 3
## 109 8
## 111 40
## 119 33
## 127 26
## 129 29
## 190 41
## 194 13
## 195 38
## 209 42
## 211 1
## 228 2
## 241 43
## 263 44
## 279 17
## 289 45
## 305 22
## 343 39
## 399 20
## 434 11
## 453 14
## 639 34
## 683 23
## 713 24
## 799 27
## 1320 10
## 3249 4
## [1] "The unique vairables in gender_full column are:"
## gender_full
## 1 Male
## 2 Female
## [1] "The unique vairables in termreason_desc column are:"
## termreason_desc
## 1 Not Applicable
## 15 Retirement
## 33 Resignaton
## 149 Layoff
## [1] "The unique vairables in termtype_desc column are:"
## termtype_desc
## 1 Not Applicable
## 15 Voluntary
## 149 Involuntary
## [1] "The unique vairables in STATUS column are:"
## STATUS
## 1 ACTIVE
## 15 TERMINATED
## [1] "The unique vairables in BUSINESS_UNIT column are:"
## BUSINESS_UNIT
## 1 HEADOFFICE
## 9 STORES
Getting descriptive statistics to understand the employee population
Total number of employees
total_employee <- length(employee_latest_record$EmployeeID)
print(paste('There are ',total_employee,'employees in the company'))
## [1] "There are 6284 employees in the company"
Total number of employees by categories Gender
male_employee_count <- nrow(employee_latest_record[employee_latest_record$gender_full == "Male",])
print(paste('There are ',male_employee_count,'male employees in the company'))
## [1] "There are 3006 male employees in the company"
female_employee_count <- nrow(employee_latest_record[employee_latest_record$gender_full != "Male",])
print(paste('There are ',female_employee_count,'female employees in the company'))
## [1] "There are 3278 female employees in the company"
gender_count_df <- employee_latest_record %>%
group_by(gender_short) %>%
summarise(count = n())
gender_count_df <- gender_count_df %>% mutate(percentage = ((count/total_employee)*100))
gender_count_df <- gender_count_df %>%
rename(
Gender = gender_short,
Count = count,
Percentage = percentage
)
plot3 <- ggplot(gender_count_df, aes(x = "", y = Count, fill = Gender)) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y") +
labs(title = "Gender Division", fill = "Gender") +
geom_text(aes(label = sprintf("%.1f%%", Percentage)), position = position_stack(vjust = 0.5))
show(plot3)
department_name
department_employee_count <- employee_latest_record %>% group_by(department_name,STATUS) %>% count() %>% rename(counts = n) %>% arrange(desc(counts))
department_employee_count <- department_employee_count %>%
arrange(desc(counts)) %>%
mutate(department_name = reorder(department_name, counts))
department_employee_count
## # A tibble: 29 × 3
## # Groups: department_name, STATUS [29]
## department_name STATUS counts
## <fct> <chr> <int>
## 1 Customer Service ACTIVE 928
## 2 Meats ACTIVE 875
## 3 Dairy ACTIVE 842
## 4 Bakery ACTIVE 786
## 5 Produce ACTIVE 707
## 6 Processed Foods ACTIVE 646
## 7 Meats TERMINATED 377
## 8 Produce TERMINATED 353
## 9 Customer Service TERMINATED 262
## 10 Dairy TERMINATED 191
## # ℹ 19 more rows
department_employee_count <- employee_latest_record %>%
group_by(department_name, STATUS) %>%
count() %>%
rename(counts = n) %>%
arrange(desc(counts)) %>%
mutate(department_name = reorder(department_name, counts))
# Filter for the top 7 departments
top7_departments <- department_employee_count %>%
group_by(department_name) %>%
summarize(total_counts = sum(counts)) %>%
top_n(7, wt = total_counts) %>%
pull(department_name)
top7_departments
## [1] Bakery Customer Service Dairy Meats
## [5] Processed Foods Produce Store Management
## 21 Levels: Accounting Accounts Payable Accounts Receiveable Audit ... Training
library(ggplot2)
ggplot(department_employee_count[1:10,], aes(x = reorder(department_name, counts), y = counts,
fill = STATUS )) +
geom_bar(stat = "identity", position="stack", aes(fill=STATUS)) +
scale_y_continuous(
trans = "log10",
breaks = (c(1,10, 100, 1000)), # Specify the log-transformed breaks
labels = c(1,10, 100, 1000) # Set the desired labels
)+
xlab("Department") +
ylab("Number of Employees") +
labs(title="Top 10 Department By Employees Count (Logarithmic Scale)") +
geom_text(aes(label = counts), position = position_dodge(width = 0.8), vjust = -0.5, size = 3) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, size = 10),
axis.title.x = element_text(size = 12),
panel.grid.major = element_blank(), # Remove major gridlines
panel.grid.minor = element_blank(), # Remove minor gridlines
panel.background = element_blank() ,
panel.border = element_blank(), # Remove panel border
axis.line = element_line(color = "black")
)
employee_latest_record$Age_Group <- cut(employee_latest_record$age, breaks = c(18, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, max(employee_latest_record$age) + 1),
labels = c("19-23", "24-27", "28-32", "32-36", "36-40", "40-44", "44-48", "48-52", "52-56", "56-60", "60+"))
age_count_df <- employee_latest_record %>% group_by(Age_Group) %>% summarize(Count = n())
ggplot(age_count_df, aes(x=reorder(Age_Group,Count), y=Count))+
geom_bar(stat="identity",fill="lightgreen")+
geom_text(aes(label=Count) ,vjust = -0.5, size = 3)
There is no outliers
# Assuming you have a variable 'age' in your dataset
ggplot(employee_latest_record, aes(y = age)) +
geom_boxplot() +
geom_text(data = subset(employee_latest_record, age > quantile(age, 0.75) + 1.5 * IQR(age) | age < quantile(age, 0.25) - 1.5 * IQR(age)),
aes(label = age), hjust = -0.5, vjust = 0.5, color = "red") +
ylab("Age") +
labs(title = "Boxplot of Age with Outliers Highlighted")
Average Length of service
average_length_of_service <- mean(employee_latest_record$length_of_service)
print(paste("The average length of service is ", round(average_length_of_service,2), " years."))
## [1] "The average length of service is 12.84 years."
Attrition Rate
IGNORE THIS
length_of_service_over_departments <- employee_latest_record %>% group_by(department_name) %>% summarize(Count= n(),Mean = mean(length_of_service),Min = min(length_of_service),Max = max(length_of_service),OutlierCount = length(boxplot.stats(length_of_service)$out))
# Assuming employee_latest_record is a data frame with 'length_of_service' and 'department' variables
ggplot(employee_latest_record, aes(x = department_name, y = length_of_service)) +
geom_boxplot() +
geom_text(data = subset(employee_latest_record, length_of_service > quantile(length_of_service, 0.75) + 1.5 * IQR(length_of_service) | length_of_service < quantile(length_of_service, 0.25) - 1.5 * IQR(length_of_service)),
aes(label = sprintf("%.2f", length_of_service)), hjust = -0.5, vjust = 0.5, color = "red") +
ylab("Length of Service") +
labs(title = "Boxplot of Length of Service by Department") +
facet_wrap(~ department_name, 6)
#since we have 37 job titles, i will only check for top 5 jobs with most employee
length_of_service_over_job<- employee_latest_record %>% group_by(job_title) %>% summarize(Count= n(),Mean = mean(length_of_service),Min = min(length_of_service),Max = max(length_of_service),OutlierCount = length(boxplot.stats(length_of_service)$out)) %>%
arrange(desc(Count)) %>% top_n(5)
## Selecting by OutlierCount
hiring_in_jobs_after_2006 <- employee_latest_record %>% group_by(job_title) %>% filter(orighiredate_key >="2006-01-01") %>% summarize(count = n())
hiring_in_jobs_since_2006 <- employee_latest_record %>% group_by(job_title) %>% summarize(count = n())
top5_departments <- length_of_service_over_job %>%
top_n(5, wt = Count) %>%
pull(job_title)
# Filter the original data for the top 5 departments
filtered_data <- employee_latest_record %>%
filter(job_title %in% top5_departments, orighiredate_key >="2006-01-01")
# Create box plots
ggplot(filtered_data, aes(x = job_title, y = length_of_service)) +
geom_boxplot() +
ylab("Length of Service") +
labs(title = "Boxplot of Length of Service in Top 5 Departments")
Business unit
business_unit_count <- employee_latest_record %>% group_by(BUSINESS_UNIT) %>% count() %>% rename(counts = n)
business_unit_count
## # A tibble: 2 × 2
## # Groups: BUSINESS_UNIT [2]
## BUSINESS_UNIT counts
## <chr> <int>
## 1 HEADOFFICE 80
## 2 STORES 6204
departments in head office
department_head_office <- employee_latest_record %>% filter(BUSINESS_UNIT == "HEADOFFICE") %>% group_by(department_name) %>% count() %>% rename(counts = n)
head(department_head_office,n=14)
## # A tibble: 14 × 2
## # Groups: department_name [14]
## department_name counts
## <chr> <int>
## 1 Accounting 6
## 2 Accounts Payable 4
## 3 Accounts Receiveable 5
## 4 Audit 4
## 5 Compensation 4
## 6 Employee Records 6
## 7 Executive 10
## 8 HR Technology 9
## 9 Information Technology 5
## 10 Investment 4
## 11 Labor Relations 6
## 12 Legal 3
## 13 Recruitment 9
## 14 Training 5
sum(department_head_office$counts)
## [1] 80
headoffice store name 14 active employees there
head_office_store_name <- unique(employee_latest_record[employee_latest_record$BUSINESS_UNIT == "HEADOFFICE",]$store_name)
print(paste("The head office corresponds to the store name:",head_office_store_name))
## [1] "The head office corresponds to the store name: 35"
store management department has store manager
store without store management
store_manager_data <- employee_latest_record[employee_latest_record$job_title == "Store Manager", ]
unique_store_names <- unique(store_manager_data$store_name)
all_store_names <- unique(employee_latest_record$store_name)
store_names_without_management <- setdiff(all_store_names,unique_store_names)
print(paste("There are total",length(all_store_names), "stores but, only",length(unique_store_names)," stores had a store manager"))
## [1] "There are total 46 stores but, only 35 stores had a store manager"
active_store_manager <- store_manager_data %>% filter(STATUS == "ACTIVE")
print(paste("In 2015, there are only", count(active_store_manager)," active store managers."))
## [1] "In 2015, there are only 4 active store managers."
is there relation to stores with employee count
store_employee_count <- employee_latest_record %>% filter(STATUS=="ACTIVE") %>%group_by(store_name) %>% count() %>% rename(counts = n) %>% arrange(desc(counts))
print(paste("The mean number of active employees in stores with managers are"))
## [1] "The mean number of active employees in stores with managers are"
store_employee_count_without_manager <- employee_latest_record %>% filter(store_name %in% store_names_without_management) %>% group_by(store_name) %>% count()
store_employee_count_without_manager_status <- employee_latest_record %>% filter(store_name %in% store_names_without_management) %>% group_by(store_name, STATUS) %>%
summarise(counts = n(), .groups = "drop") %>%
pivot_wider(names_from = STATUS, values_from = counts, names_prefix = "count_")
store_employee_count_without_manager
## # A tibble: 11 × 2
## # Groups: store_name [11]
## store_name n
## <int> <int>
## 1 4 1
## 2 7 6
## 3 10 2
## 4 11 47
## 5 22 81
## 6 23 7
## 7 24 9
## 8 34 5
## 9 35 221
## 10 40 72
## 11 43 311
store_employee_count_without_manager
## # A tibble: 11 × 2
## # Groups: store_name [11]
## store_name n
## <int> <int>
## 1 4 1
## 2 7 6
## 3 10 2
## 4 11 47
## 5 22 81
## 6 23 7
## 7 24 9
## 8 34 5
## 9 35 221
## 10 40 72
## 11 43 311
store_employee_count_without_manager_status
## # A tibble: 11 × 3
## store_name count_TERMINATED count_ACTIVE
## <int> <int> <int>
## 1 4 1 NA
## 2 7 6 NA
## 3 10 2 NA
## 4 11 47 NA
## 5 22 12 69
## 6 23 7 NA
## 7 24 9 NA
## 8 34 5 NA
## 9 35 207 14
## 10 40 10 62
## 11 43 24 287
stores with manager with varying jobttile
store_job_count_without_manager <- employee_latest_record[(employee_latest_record$store_name %in% store_names_without_management),] %>% group_by(store_name) %>% summarise(unique_job = n_distinct(job_title))
store_job_count_without_manager
## # A tibble: 11 × 2
## store_name unique_job
## <int> <int>
## 1 4 1
## 2 7 2
## 3 10 1
## 4 11 11
## 5 22 11
## 6 23 2
## 7 24 4
## 8 34 1
## 9 35 36
## 10 40 11
## 11 43 11
store_job_count <- employee_latest_record %>% group_by(store_name) %>% summarise(unique_job = n_distinct(job_title))
store_job_count
## # A tibble: 46 × 2
## store_name unique_job
## <int> <int>
## 1 1 12
## 2 2 10
## 3 3 6
## 4 4 1
## 5 5 12
## 6 6 12
## 7 7 2
## 8 8 12
## 9 9 6
## 10 10 1
## # ℹ 36 more rows
store_category_by_jobs <- c(unique(store_job_count$unique_job))
for (job_count in store_category_by_jobs){
store_with_job_count = store_job_count[store_job_count$unique_job == job_count,]
store_name<- store_with_job_count$store_name[1]
job_titles_for_store <- unique(employee_latest_record[employee_latest_record$store_name == store_name,]$job_title)
print(paste("the job titles for store", store_name," with ",job_count,"job titles are: "))
print(job_titles_for_store)
}
## [1] "the job titles for store 1 with 12 job titles are: "
## [1] "Store Manager" "Meats Manager"
## [3] "Produce Manager" "Bakery Manager"
## [5] "Dairy Person" "Processed Foods Manager"
## [7] "Customer Service Manager" "Meat Cutter"
## [9] "Produce Clerk" "Shelf Stocker"
## [11] "Baker" "Cashier"
## [1] "the job titles for store 2 with 10 job titles are: "
## [1] "Store Manager" "Meats Manager"
## [3] "Produce Manager" "Dairy Person"
## [5] "Processed Foods Manager" "Meat Cutter"
## [7] "Produce Clerk" "Baker"
## [9] "Shelf Stocker" "Cashier"
## [1] "the job titles for store 3 with 6 job titles are: "
## [1] "Store Manager" "Meat Cutter" "Baker" "Cashier"
## [5] "Dairy Person" "Produce Clerk"
## [1] "the job titles for store 4 with 1 job titles are: "
## [1] "Dairy Person"
## [1] "the job titles for store 7 with 2 job titles are: "
## [1] "Dairy Person" "Baker"
## [1] "the job titles for store 11 with 11 job titles are: "
## [1] "Meats Manager" "Produce Manager"
## [3] "Meat Cutter" "Bakery Manager"
## [5] "Dairy Person" "Processed Foods Manager"
## [7] "Customer Service Manager" "Produce Clerk"
## [9] "Baker" "Shelf Stocker"
## [11] "Cashier"
## [1] "the job titles for store 14 with 7 job titles are: "
## [1] "Store Manager" "Meat Cutter" "Produce Clerk" "Baker"
## [5] "Dairy Person" "Shelf Stocker" "Cashier"
## [1] "the job titles for store 18 with 13 job titles are: "
## [1] "Store Manager" "Meats Manager"
## [3] "Produce Manager" "Bakery Manager"
## [5] "Dairy Manager" "Processed Foods Manager"
## [7] "Customer Service Manager" "Meat Cutter"
## [9] "Produce Clerk" "Baker"
## [11] "Dairy Person" "Shelf Stocker"
## [13] "Cashier"
## [1] "the job titles for store 24 with 4 job titles are: "
## [1] "Meat Cutter" "Produce Clerk" "Dairy Person" "Baker"
## [1] "the job titles for store 35 with 36 job titles are: "
## [1] "CEO" "VP Stores"
## [3] "Legal Counsel" "VP Human Resources"
## [5] "VP Finance" "Exec Assistant, VP Stores"
## [7] "Exec Assistant, Legal Counsel" "CHief Information Officer"
## [9] "Exec Assistant, Human Resources" "Exec Assistant, Finance"
## [11] "Director, Recruitment" "Director, Training"
## [13] "Director, Labor Relations" "Director, HR Technology"
## [15] "Director, Employee Records" "Director, Compensation"
## [17] "Corporate Lawyer" "Director, Accounts Receivable"
## [19] "Systems Analyst" "Director, Accounts Payable"
## [21] "Director, Audit" "Director, Accounting"
## [23] "Director, Investments" "Recruiter"
## [25] "Trainer" "Dairy Person"
## [27] "Labor Relations Analyst" "HRIS Analyst"
## [29] "Benefits Admin" "Compensation Analyst"
## [31] "Accounts Receiveable Clerk" "Accounts Payable Clerk"
## [33] "Auditor" "Accounting Clerk"
## [35] "Investment Analyst" "Meat Cutter"
## [1] "the job titles for store 37 with 9 job titles are: "
## [1] "Store Manager" "Meats Manager"
## [3] "Produce Manager" "Bakery Manager"
## [5] "Dairy Person" "Processed Foods Manager"
## [7] "Customer Service Manager" "Meat Cutter"
## [9] "Produce Clerk"
the number of unique jobs can be used to categorize the stores like all stores with 1 job title is dariy person and so on….
stores with manager and tehir top 3 active and terminated job titles
store_job_summary <- employee_latest_record %>%
group_by(store_name, job_title, STATUS) %>%
summarize(count = n(), .groups = "drop") %>%
arrange(store_name, STATUS,desc(count))
# To get the top 3 job titles in each store by count:
top_3_job_titles <- store_job_summary %>%
group_by(store_name,STATUS) %>%
top_n(3, wt = count)
top_3_job_titles_store_with_manger <- top_3_job_titles[!(top_3_job_titles$store_name %in% store_names_without_management),]
# Now you have a summary of the top 3 job titles in each store by count.
Overall Attrition rate of the company
count_status <- employee_latest_record %>% count(STATUS)
# Get the number of employee of 'TERMINATED' STATUS
terminated_count = count_status[count_status$STATUS == 'TERMINATED', 'n']
# Calculate the percentage of 'TERMINATED' status
terminated_count_percent <- (terminated_count / sum(count_status$n)) * 100
# Print the percentage
print(paste('The overall attrition rate of company is :', round(terminated_count_percent,2), '%'))
## [1] "The overall attrition rate of company is : 23.63 %"
Attrition Rate by gender
count_status_by_gender <- employee_latest_record %>% group_by(gender_full) %>% count(STATUS)
# Get the number of Male employee of 'TERMINATED' STATUS
terminated_count_male = count_status_by_gender[count_status_by_gender$STATUS == 'TERMINATED' & count_status_by_gender$gender_full =='Male', 'n']
# Calculate the percentage of 'TERMINATED' status
terminated_count_percent_male <- (terminated_count_male / sum(count_status_by_gender[count_status_by_gender$gender_full =='Male','n']) * 100)
# Print the percentage
print(paste('The overall attrition rate of Male in the company is :', round(terminated_count_percent_male,2), '%'))
## [1] "The overall attrition rate of Male in the company is : 18.96 %"
# Get the number of employee of Female 'TERMINATED' STATUS
terminated_count_female = count_status_by_gender[count_status_by_gender$STATUS == 'TERMINATED' & count_status_by_gender$gender_full =='Female', 'n']
# Calculate the percentage of 'TERMINATED' status
terminated_count_percent_female <- (terminated_count_female / sum(count_status_by_gender[count_status_by_gender$gender_full =='Female','n']) * 100)
# Print the percentage
print(paste('The overall attrition rate of Female in the company is :', round(terminated_count_percent_female,2), '%'))
## [1] "The overall attrition rate of Female in the company is : 27.91 %"
Termination Reason Distribution
terminated_employees = employee_latest_record %>% filter(STATUS == 'TERMINATED')
dim(terminated_employees)
## [1] 1485 19
terminated_employees_reason = terminated_employees %>% group_by(termreason_desc) %>% count()
for (i in c(1:3)){
print(paste('There were',terminated_employees_reason[i,][2],terminated_employees_reason[i,][1]))
}
## [1] "There were 215 Layoff"
## [1] "There were 385 Resignaton"
## [1] "There were 885 Retirement"
terminated_employees_type = terminated_employees %>% group_by(termtype_desc) %>% count()
for (i in c(1:2)){
print(paste('There were',terminated_employees_type[i,][2],terminated_employees_type[i,][1]))
}
## [1] "There were 215 Involuntary"
## [1] "There were 1270 Voluntary"
There are 215 layoff and 215 involuntary exits.
No need to verify but still
count(terminated_employees[terminated_employees$termreason_desc =='Layoff' & terminated_employees$termtype_desc == 'Involuntary' ,])
## n
## 1 215
lets see the layoff over years
terminated_employees_layoff = terminated_employees %>% filter(termreason_desc =='Layoff') %>% group_by(STATUS_YEAR) %>% count()
terminated_employees_layoff
## # A tibble: 2 × 2
## # Groups: STATUS_YEAR [2]
## STATUS_YEAR n
## <int> <int>
## 1 2014 142
## 2 2015 73
It is found that there were layoffs only in 2014 and 2015. lets check the same for resignation
resignation_by_year= terminated_employees %>% filter(termreason_desc =='Resignaton') %>% group_by(STATUS_YEAR) %>% count()
resignation_by_year
## # A tibble: 10 × 2
## # Groups: STATUS_YEAR [10]
## STATUS_YEAR n
## <int> <int>
## 1 2006 12
## 2 2007 25
## 3 2008 26
## 4 2009 18
## 5 2010 29
## 6 2011 69
## 7 2012 76
## 8 2013 49
## 9 2014 55
## 10 2015 26
lets visualize this
resignaton_plot = ggplot(resignation_by_year, aes(x=STATUS_YEAR, y=n))+geom_line()+scale_x_continuous(breaks=seq(2006,2016,1))
resignaton_plot
we can see peak in 2011 to 2013. lets analyse this for different department.
terminated_employees_resignation <- terminated_employees[terminated_employees$termreason_desc == 'Resignaton',]
resignation_by_department <- terminated_employees_resignation %>% group_by(STATUS_YEAR, department_name) %>% summarise(counts = n()) %>% arrange(STATUS_YEAR, desc(counts)) %>% group_by(STATUS_YEAR)
## `summarise()` has grouped output by 'STATUS_YEAR'. You can override using the
## `.groups` argument.
p <- ggplot(resignation_by_department, aes(x=STATUS_YEAR, y= counts)) +geom_line(aes(color=department_name))+scale_x_continuous(breaks=seq(2006,2016,1))
p
In 2014 alone there were 55 resignaton of which 54 were customer_service
terminated_employees_resignation[terminated_employees_resignation$STATUS_YEAR == 2014,]%>% group_by(department_name) %>% summarise(counts = n())
## # A tibble: 2 × 2
## department_name counts
## <chr> <int>
## 1 Customer Service 54
## 2 Processed Foods 1
department with most resignation
department_resignation = terminated_employees %>% filter( termreason_desc =='Resignaton') %>% group_by(department_name) %>% summarise(resignaton_count = n()) %>% arrange(desc(resignaton_count))
job with most resgination
job_resignation = terminated_employees %>% filter( termreason_desc =='Resignaton') %>% group_by(job_title) %>% summarise(resignaton_count = n()) %>% arrange(desc(resignaton_count))
age group with most resgination
age_resignation = terminated_employees %>% filter( termreason_desc =='Resignaton') %>% group_by(age) %>% summarise(resignaton_count = n()) %>% arrange(desc(resignaton_count)) %>% top_n(3,resignaton_count)
The top 3 age of resgintion were 30,21,23 which is nearly 43% of all resignation and top 7 age is under 30 with 63% of all resgination
department wise customer what has more resignation and is there any relation to length of service
lets try to find the department and job titles each of the top 3 age work in
department_resignation_age_21 <- terminated_employees %>% filter(termreason_desc =='Resignaton' & age==21) %>% group_by(department_name) %>% summarize(count = n())
department_resignation_age_23 <- terminated_employees %>% filter(termreason_desc =='Resignaton' & age==23) %>% group_by(department_name) %>% summarize(count = n())
department_resignation_age_30 <- terminated_employees %>% filter(termreason_desc =='Resignaton' & age==30) %>% group_by(department_name) %>% summarize(count = n())
age when they joined ## creating join_age feature
terminated_employees['join_age'] = as.integer( difftime(terminated_employees$orighiredate_key, terminated_employees$birthdate_key, units = "days")/365.25)
#age_21_terminated<- terminated_employees %>% filter(age==21 & join_age == 21 ) %>% group_by(department_name) %>% summarize(count = n())
most of the age 21 employees who resgined are involved in customer service 58 out of 60 most of the age 23 employees who resgined are involved in customer service 30 out of 33
filtering which city has most termiantion
city_terminated <- terminated_employees %>% group_by(city_name)%>% summarize(count = n())%>% arrange(desc(count))
p <- ggplot(employee_data, aes(x = age, , fill = "Age Distribution")) +
geom_histogram( breaks= seq(from=18, to=70,by=10),color = "black", alpha = 0.7) +
labs(fill = NULL) + # Remove legend title
theme_minimal() # Apply a minimal theme
p
p <- ggplot(employee_data, aes(x = length_of_service, , fill = "lenght of service Distribution")) +
geom_histogram(bins=5,color = "black", alpha = 0.7) +
labs(fill = NULL) + # Remove legend title
scale_x_continuous(breaks = seq(0, max(employee_data$length_of_service),5))+
scale_y_continuous(breaks = seq(0, max(table(cut(employee_data$length_of_service, breaks = 5)))+5000, by = 2500))+
theme_minimal() # Apply a minimal theme
p
trying to see if age is a significant factor
employee_latest_record$Age_Group <- cut(employee_latest_record$age, breaks = c(18, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, max(employee_latest_record$age) + 1),
labels = c("19-23", "24-27", "28-32", "32-36", "36-40", "40-44", "44-48", "48-52", "52-56", "56-60", "60+"))
# Calculate attrition rates for each age group
attrition_rates <- employee_latest_record %>%
group_by(Age_Group) %>%
summarize(Attrition_Rate = mean(STATUS == "TERMINATED"))
# Perform a statistical test (e.g., ANOVA) to determine if the differences in attrition rates are significant
# Install the "stats" package if not already installed
# install.packages("stats")
library(stats)
# Analysis of Variance (ANOVA)
anova_model <- aov(Attrition_Rate ~ Age_Group, data = attrition_rates)
# Check for significance
summary(anova_model)
## Df Sum Sq Mean Sq
## Age_Group 10 0.3271 0.03271