Importing important libraries

require('ggplot2')
require('dplyr')
require('tidyr')

The first step is to import the data and view it

employee_data <-read.csv('C:/Users/97798/Documents/sem 4 materials/SEM 4/CT127-3-2-PFDA/Assignment - Individual/employee_attrition.csv')
head(employee_data)
##   EmployeeID  recorddate_key birthdate_key orighiredate_key terminationdate_key
## 1       1318 12/31/2006 0:00      1/3/1954        8/28/1989            1/1/1900
## 2       1318 12/31/2007 0:00      1/3/1954        8/28/1989            1/1/1900
## 3       1318 12/31/2008 0:00      1/3/1954        8/28/1989            1/1/1900
## 4       1318 12/31/2009 0:00      1/3/1954        8/28/1989            1/1/1900
## 5       1318 12/31/2010 0:00      1/3/1954        8/28/1989            1/1/1900
## 6       1318 12/31/2011 0:00      1/3/1954        8/28/1989            1/1/1900
##   age length_of_service city_name department_name job_title store_name
## 1  52                17 Vancouver       Executive       CEO         35
## 2  53                18 Vancouver       Executive       CEO         35
## 3  54                19 Vancouver       Executive       CEO         35
## 4  55                20 Vancouver       Executive       CEO         35
## 5  56                21 Vancouver       Executive       CEO         35
## 6  57                22 Vancouver       Executive       CEO         35
##   gender_short gender_full termreason_desc  termtype_desc STATUS_YEAR STATUS
## 1            M        Male  Not Applicable Not Applicable        2006 ACTIVE
## 2            M        Male  Not Applicable Not Applicable        2007 ACTIVE
## 3            M        Male  Not Applicable Not Applicable        2008 ACTIVE
## 4            M        Male  Not Applicable Not Applicable        2009 ACTIVE
## 5            M        Male  Not Applicable Not Applicable        2010 ACTIVE
## 6            M        Male  Not Applicable Not Applicable        2011 ACTIVE
##   BUSINESS_UNIT
## 1    HEADOFFICE
## 2    HEADOFFICE
## 3    HEADOFFICE
## 4    HEADOFFICE
## 5    HEADOFFICE
## 6    HEADOFFICE

The first step is to check for missing values. there is not missing values

missing_data = colSums(is.na(employee_data))
print(missing_data)
##          EmployeeID      recorddate_key       birthdate_key    orighiredate_key 
##                   0                   0                   0                   0 
## terminationdate_key                 age   length_of_service           city_name 
##                   0                   0                   0                   0 
##     department_name           job_title          store_name        gender_short 
##                   0                   0                   0                   0 
##         gender_full     termreason_desc       termtype_desc         STATUS_YEAR 
##                   0                   0                   0                   0 
##              STATUS       BUSINESS_UNIT 
##                   0                   0

There is no missing data Now understanding data structure

str(employee_data)
## 'data.frame':    49653 obs. of  18 variables:
##  $ EmployeeID         : int  1318 1318 1318 1318 1318 1318 1318 1318 1318 1318 ...
##  $ recorddate_key     : chr  "12/31/2006 0:00" "12/31/2007 0:00" "12/31/2008 0:00" "12/31/2009 0:00" ...
##  $ birthdate_key      : chr  "1/3/1954" "1/3/1954" "1/3/1954" "1/3/1954" ...
##  $ orighiredate_key   : chr  "8/28/1989" "8/28/1989" "8/28/1989" "8/28/1989" ...
##  $ terminationdate_key: chr  "1/1/1900" "1/1/1900" "1/1/1900" "1/1/1900" ...
##  $ age                : int  52 53 54 55 56 57 58 59 60 61 ...
##  $ length_of_service  : int  17 18 19 20 21 22 23 24 25 26 ...
##  $ city_name          : chr  "Vancouver" "Vancouver" "Vancouver" "Vancouver" ...
##  $ department_name    : chr  "Executive" "Executive" "Executive" "Executive" ...
##  $ job_title          : chr  "CEO" "CEO" "CEO" "CEO" ...
##  $ store_name         : int  35 35 35 35 35 35 35 35 35 35 ...
##  $ gender_short       : chr  "M" "M" "M" "M" ...
##  $ gender_full        : chr  "Male" "Male" "Male" "Male" ...
##  $ termreason_desc    : chr  "Not Applicable" "Not Applicable" "Not Applicable" "Not Applicable" ...
##  $ termtype_desc      : chr  "Not Applicable" "Not Applicable" "Not Applicable" "Not Applicable" ...
##  $ STATUS_YEAR        : int  2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 ...
##  $ STATUS             : chr  "ACTIVE" "ACTIVE" "ACTIVE" "ACTIVE" ...
##  $ BUSINESS_UNIT      : chr  "HEADOFFICE" "HEADOFFICE" "HEADOFFICE" "HEADOFFICE" ...

The data stores and updates employee data in the same file. Each year a new record is added for example 1318 the ceo has been working from 2006 to 2015 and thier data is updated each year.

convert all date columns to date

cols = colnames(employee_data)
for (col in cols){
  if(length(grep("date",col))){
   employee_data[[col]] = as.Date(employee_data[[col]], format = "%m/%d/%Y")
  }
}
str(employee_data)
## 'data.frame':    49653 obs. of  18 variables:
##  $ EmployeeID         : int  1318 1318 1318 1318 1318 1318 1318 1318 1318 1318 ...
##  $ recorddate_key     : Date, format: "2006-12-31" "2007-12-31" ...
##  $ birthdate_key      : Date, format: "1954-01-03" "1954-01-03" ...
##  $ orighiredate_key   : Date, format: "1989-08-28" "1989-08-28" ...
##  $ terminationdate_key: Date, format: "1900-01-01" "1900-01-01" ...
##  $ age                : int  52 53 54 55 56 57 58 59 60 61 ...
##  $ length_of_service  : int  17 18 19 20 21 22 23 24 25 26 ...
##  $ city_name          : chr  "Vancouver" "Vancouver" "Vancouver" "Vancouver" ...
##  $ department_name    : chr  "Executive" "Executive" "Executive" "Executive" ...
##  $ job_title          : chr  "CEO" "CEO" "CEO" "CEO" ...
##  $ store_name         : int  35 35 35 35 35 35 35 35 35 35 ...
##  $ gender_short       : chr  "M" "M" "M" "M" ...
##  $ gender_full        : chr  "Male" "Male" "Male" "Male" ...
##  $ termreason_desc    : chr  "Not Applicable" "Not Applicable" "Not Applicable" "Not Applicable" ...
##  $ termtype_desc      : chr  "Not Applicable" "Not Applicable" "Not Applicable" "Not Applicable" ...
##  $ STATUS_YEAR        : int  2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 ...
##  $ STATUS             : chr  "ACTIVE" "ACTIVE" "ACTIVE" "ACTIVE" ...
##  $ BUSINESS_UNIT      : chr  "HEADOFFICE" "HEADOFFICE" "HEADOFFICE" "HEADOFFICE" ...

The data stores and updates employee data in the same file. Each year a new record is added for example 1318 the ceo has been working from 2006 to 2015 and thier data is updated each year. Firstly i will segreate create a new data with one row for one employee by fetching their latest record. First, group the records by employeeID, arrange them based on status and then recorddate in descending order. If the employee has terminated status that will be their latest record. If they have not yet been terminated then their latest recorddate will be their latest record.

employee_latest_record <- employee_data
employee_latest_record <- as.data.frame( employee_latest_record %>%
  group_by(EmployeeID) %>%
  arrange(desc(STATUS), desc(recorddate_key)) %>%
  slice(1) %>%
  ungroup())

Now, getting the summary for Numerical vairables

numerical_columns <- c("age", "length_of_service")
summary(employee_latest_record[,numerical_columns])
##       age        length_of_service
##  Min.   :19.00   Min.   : 0.00    
##  1st Qu.:32.00   1st Qu.: 7.00    
##  Median :45.00   Median :13.00    
##  Mean   :44.74   Mean   :12.84    
##  3rd Qu.:58.00   3rd Qu.:19.00    
##  Max.   :65.00   Max.   :26.00

Then, getting all the unique values for each Categorical variables

categorical_columns <- c('city_name','department_name','job_title','store_name','gender_full','termreason_desc', 'termtype_desc','STATUS', 'BUSINESS_UNIT')
for ( category in categorical_columns){
#  unique_values <- toString(unique(employee_latest_record[[category]]))
  print(paste('The unique vairables in ',category,'column are:') )
  print(unique(employee_latest_record[category]))
}
## [1] "The unique vairables in  city_name column are:"
##             city_name
## 1           Vancouver
## 9             Terrace
## 11            Nanaimo
## 16             Nelson
## 17            Kelowna
## 19           Victoria
## 25           Kamloops
## 33       Fort St John
## 35             Surrey
## 45             Vernon
## 49            Quesnel
## 51         Chilliwack
## 59       Dawson Creek
## 71           Squamish
## 72    New Westminster
## 82     Port Coquitlam
## 84      Cortes Island
## 85            Burnaby
## 107       Bella Bella
## 109         Cranbrook
## 111     Williams Lake
## 119             Trail
## 127     Prince George
## 129          Richmond
## 194       Grand Forks
## 195    West Vancouver
## 211        Abbotsford
## 228        Aldergrove
## 279           Langley
## 305   North Vancouver
## 343        White Rock
## 399  New Westminister
## 434       Fort Nelson
## 453             Haney
## 639         Valemount
## 683       Ocean Falls
## 713      Pitt Meadows
## 799         Princeton
## 1320       Dease Lake
## 3249       Blue River
## [1] "The unique vairables in  department_name column are:"
##           department_name
## 1               Executive
## 9        Store Management
## 10                  Meats
## 14            Recruitment
## 15               Training
## 18        Labor Relations
## 20          HR Technology
## 21       Employee Records
## 23           Compensation
## 24                  Legal
## 28                Produce
## 30   Accounts Receiveable
## 32                 Bakery
## 34 Information Technology
## 36       Accounts Payable
## 37                  Audit
## 38             Accounting
## 40             Investment
## 41                  Dairy
## 47        Processed Foods
## 53       Customer Service
## [1] "The unique vairables in  job_title column are:"
##                            job_title
## 1                                CEO
## 2                          VP Stores
## 3                      Legal Counsel
## 4                 VP Human Resources
## 5                         VP Finance
## 6          Exec Assistant, VP Stores
## 7      Exec Assistant, Legal Counsel
## 8          CHief Information Officer
## 9                      Store Manager
## 10                     Meats Manager
## 12   Exec Assistant, Human Resources
## 13           Exec Assistant, Finance
## 14             Director, Recruitment
## 15                Director, Training
## 18         Director, Labor Relations
## 20           Director, HR Technology
## 21        Director, Employee Records
## 23            Director, Compensation
## 24                  Corporate Lawyer
## 28                   Produce Manager
## 30     Director, Accounts Receivable
## 32                    Bakery Manager
## 34                   Systems Analyst
## 36        Director, Accounts Payable
## 37                   Director, Audit
## 38              Director, Accounting
## 40             Director, Investments
## 41                      Dairy Person
## 42                         Recruiter
## 47           Processed Foods Manager
## 53          Customer Service Manager
## 60                           Trainer
## 67                       Meat Cutter
## 87           Labor Relations Analyst
## 96                     Dairy Manager
## 100                     HRIS Analyst
## 114                   Benefits Admin
## 130             Compensation Analyst
## 134       Accounts Receiveable Clerk
## 144           Accounts Payable Clerk
## 149                            Baker
## 160                          Auditor
## 167                 Accounting Clerk
## 185               Investment Analyst
## 384                    Produce Clerk
## 759                    Shelf Stocker
## 1057                         Cashier
## [1] "The unique vairables in  store_name column are:"
##      store_name
## 1            35
## 9            32
## 11           18
## 16           19
## 17           16
## 19           37
## 25           15
## 33           12
## 35           31
## 45           36
## 49           28
## 51            6
## 59            9
## 71           30
## 72           21
## 73           46
## 82           25
## 84            7
## 85            5
## 107           3
## 109           8
## 111          40
## 119          33
## 127          26
## 129          29
## 190          41
## 194          13
## 195          38
## 209          42
## 211           1
## 228           2
## 241          43
## 263          44
## 279          17
## 289          45
## 305          22
## 343          39
## 399          20
## 434          11
## 453          14
## 639          34
## 683          23
## 713          24
## 799          27
## 1320         10
## 3249          4
## [1] "The unique vairables in  gender_full column are:"
##   gender_full
## 1        Male
## 2      Female
## [1] "The unique vairables in  termreason_desc column are:"
##     termreason_desc
## 1    Not Applicable
## 15       Retirement
## 33       Resignaton
## 149          Layoff
## [1] "The unique vairables in  termtype_desc column are:"
##      termtype_desc
## 1   Not Applicable
## 15       Voluntary
## 149    Involuntary
## [1] "The unique vairables in  STATUS column are:"
##        STATUS
## 1      ACTIVE
## 15 TERMINATED
## [1] "The unique vairables in  BUSINESS_UNIT column are:"
##   BUSINESS_UNIT
## 1    HEADOFFICE
## 9        STORES

Getting descriptive statistics to understand the employee population

Total number of employees

total_employee <- length(employee_latest_record$EmployeeID)
print(paste('There are ',total_employee,'employees in the company'))
## [1] "There are  6284 employees in the company"

Total number of employees by categories Gender

male_employee_count <- nrow(employee_latest_record[employee_latest_record$gender_full == "Male",])
print(paste('There are ',male_employee_count,'male employees in the company'))
## [1] "There are  3006 male employees in the company"
female_employee_count <- nrow(employee_latest_record[employee_latest_record$gender_full != "Male",])
print(paste('There are ',female_employee_count,'female employees in the company'))
## [1] "There are  3278 female employees in the company"
gender_count_df <- employee_latest_record %>% 
  group_by(gender_short) %>% 
  summarise(count = n())
gender_count_df <- gender_count_df %>% mutate(percentage = ((count/total_employee)*100))
gender_count_df <- gender_count_df %>%
  rename(
    Gender = gender_short,
    Count = count,
    Percentage = percentage
  )
plot3 <- ggplot(gender_count_df, aes(x = "", y = Count, fill = Gender)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y") +
  labs(title = "Gender Division", fill = "Gender") +
  geom_text(aes(label = sprintf("%.1f%%", Percentage)), position = position_stack(vjust = 0.5)) 
show(plot3)

department_name

department_employee_count <- employee_latest_record %>% group_by(department_name,STATUS) %>% count() %>% rename(counts = n) %>% arrange(desc(counts))
department_employee_count <- department_employee_count %>%
  arrange(desc(counts)) %>%
  mutate(department_name = reorder(department_name, counts))
department_employee_count
## # A tibble: 29 × 3
## # Groups:   department_name, STATUS [29]
##    department_name  STATUS     counts
##    <fct>            <chr>       <int>
##  1 Customer Service ACTIVE        928
##  2 Meats            ACTIVE        875
##  3 Dairy            ACTIVE        842
##  4 Bakery           ACTIVE        786
##  5 Produce          ACTIVE        707
##  6 Processed Foods  ACTIVE        646
##  7 Meats            TERMINATED    377
##  8 Produce          TERMINATED    353
##  9 Customer Service TERMINATED    262
## 10 Dairy            TERMINATED    191
## # ℹ 19 more rows
department_employee_count <- employee_latest_record %>%
  group_by(department_name, STATUS) %>%
  count() %>%
  rename(counts = n) %>%
  arrange(desc(counts)) %>%
  mutate(department_name = reorder(department_name, counts))

# Filter for the top 7 departments
top7_departments <- department_employee_count %>%
  group_by(department_name) %>%
  summarize(total_counts = sum(counts)) %>%
  top_n(7, wt = total_counts) %>%
  pull(department_name)
top7_departments
## [1] Bakery           Customer Service Dairy            Meats           
## [5] Processed Foods  Produce          Store Management
## 21 Levels: Accounting Accounts Payable Accounts Receiveable Audit ... Training
library(ggplot2)


ggplot(department_employee_count[1:10,], aes(x = reorder(department_name, counts), y = counts,
                                         fill = STATUS     )) +
  geom_bar(stat = "identity", position="stack", aes(fill=STATUS)) +
   scale_y_continuous(
    trans = "log10", 
    breaks = (c(1,10, 100, 1000)),  # Specify the log-transformed breaks
    labels = c(1,10, 100, 1000)  # Set the desired labels
  )+
  xlab("Department") +
  ylab("Number of Employees") +
  labs(title="Top 10 Department By Employees Count (Logarithmic Scale)") +
  geom_text(aes(label = counts), position = position_dodge(width = 0.8), vjust = -0.5, size = 3) + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, size = 10),
    axis.title.x = element_text(size = 12),
    panel.grid.major = element_blank(),  # Remove major gridlines
    panel.grid.minor = element_blank(),  # Remove minor gridlines
    panel.background = element_blank() ,
    panel.border = element_blank(),      # Remove panel border
    axis.line = element_line(color = "black") 
  ) 

employee_latest_record$Age_Group <- cut(employee_latest_record$age, breaks = c(18, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, max(employee_latest_record$age) + 1),
                      labels = c("19-23", "24-27", "28-32", "32-36", "36-40", "40-44", "44-48", "48-52", "52-56", "56-60", "60+"))

age_count_df <- employee_latest_record %>% group_by(Age_Group) %>% summarize(Count = n())

ggplot(age_count_df, aes(x=reorder(Age_Group,Count), y=Count))+
  geom_bar(stat="identity",fill="lightgreen")+
  geom_text(aes(label=Count) ,vjust = -0.5, size = 3)

There is no outliers

# Assuming you have a variable 'age' in your dataset

ggplot(employee_latest_record, aes(y = age)) +
  geom_boxplot() +
  geom_text(data = subset(employee_latest_record, age > quantile(age, 0.75) + 1.5 * IQR(age) | age < quantile(age, 0.25) - 1.5 * IQR(age)),
            aes(label = age), hjust = -0.5, vjust = 0.5, color = "red") +
  ylab("Age") +
  labs(title = "Boxplot of Age with Outliers Highlighted")

Average Length of service

average_length_of_service <- mean(employee_latest_record$length_of_service)
print(paste("The average length of service is ", round(average_length_of_service,2), " years."))
## [1] "The average length of service is  12.84  years."

Attrition Rate

IGNORE THIS

length_of_service_over_departments <- employee_latest_record %>% group_by(department_name) %>% summarize(Count= n(),Mean = mean(length_of_service),Min = min(length_of_service),Max = max(length_of_service),OutlierCount = length(boxplot.stats(length_of_service)$out))


# Assuming employee_latest_record is a data frame with 'length_of_service' and 'department' variables

ggplot(employee_latest_record, aes(x = department_name, y = length_of_service)) +
  geom_boxplot() +
  geom_text(data = subset(employee_latest_record, length_of_service > quantile(length_of_service, 0.75) + 1.5 * IQR(length_of_service) | length_of_service < quantile(length_of_service, 0.25) - 1.5 * IQR(length_of_service)),
            aes(label = sprintf("%.2f", length_of_service)), hjust = -0.5, vjust = 0.5, color = "red") +
  ylab("Length of Service") +
  labs(title = "Boxplot of Length of Service by Department") +
  facet_wrap(~ department_name, 6)

#since we have 37 job titles, i will only check for top 5 jobs with most employee

length_of_service_over_job<- employee_latest_record %>% group_by(job_title) %>% summarize(Count= n(),Mean = mean(length_of_service),Min = min(length_of_service),Max = max(length_of_service),OutlierCount = length(boxplot.stats(length_of_service)$out)) %>%
  arrange(desc(Count)) %>% top_n(5)
## Selecting by OutlierCount
hiring_in_jobs_after_2006 <- employee_latest_record %>% group_by(job_title) %>% filter(orighiredate_key >="2006-01-01") %>% summarize(count = n())
hiring_in_jobs_since_2006 <- employee_latest_record %>% group_by(job_title) %>% summarize(count = n())




top5_departments <- length_of_service_over_job %>%
  top_n(5, wt = Count) %>%
  pull(job_title)

# Filter the original data for the top 5 departments
filtered_data <- employee_latest_record %>%
  filter(job_title %in% top5_departments, orighiredate_key >="2006-01-01")

# Create box plots
ggplot(filtered_data, aes(x = job_title, y = length_of_service)) +
  geom_boxplot() +
  ylab("Length of Service") +
  labs(title = "Boxplot of Length of Service in Top 5 Departments")

Business unit

business_unit_count <- employee_latest_record %>% group_by(BUSINESS_UNIT) %>% count() %>% rename(counts = n)
business_unit_count
## # A tibble: 2 × 2
## # Groups:   BUSINESS_UNIT [2]
##   BUSINESS_UNIT counts
##   <chr>          <int>
## 1 HEADOFFICE        80
## 2 STORES          6204

departments in head office

department_head_office <- employee_latest_record %>% filter(BUSINESS_UNIT == "HEADOFFICE") %>% group_by(department_name) %>% count() %>% rename(counts = n)
head(department_head_office,n=14)
## # A tibble: 14 × 2
## # Groups:   department_name [14]
##    department_name        counts
##    <chr>                   <int>
##  1 Accounting                  6
##  2 Accounts Payable            4
##  3 Accounts Receiveable        5
##  4 Audit                       4
##  5 Compensation                4
##  6 Employee Records            6
##  7 Executive                  10
##  8 HR Technology               9
##  9 Information Technology      5
## 10 Investment                  4
## 11 Labor Relations             6
## 12 Legal                       3
## 13 Recruitment                 9
## 14 Training                    5
sum(department_head_office$counts)
## [1] 80

headoffice store name 14 active employees there

head_office_store_name <- unique(employee_latest_record[employee_latest_record$BUSINESS_UNIT == "HEADOFFICE",]$store_name)
print(paste("The head office corresponds to the store name:",head_office_store_name))
## [1] "The head office corresponds to the store name: 35"

store management department has store manager

store without store management

store_manager_data <- employee_latest_record[employee_latest_record$job_title == "Store Manager", ]
unique_store_names <- unique(store_manager_data$store_name)
all_store_names <- unique(employee_latest_record$store_name)
store_names_without_management <- setdiff(all_store_names,unique_store_names)
print(paste("There are total",length(all_store_names), "stores but, only",length(unique_store_names)," stores had a store manager"))
## [1] "There are total 46 stores but, only 35  stores had a store manager"
active_store_manager <- store_manager_data %>% filter(STATUS == "ACTIVE")
print(paste("In 2015, there are only", count(active_store_manager)," active store managers."))
## [1] "In 2015, there are only 4  active store managers."

is there relation to stores with employee count

store_employee_count <- employee_latest_record %>% filter(STATUS=="ACTIVE") %>%group_by(store_name) %>% count() %>% rename(counts = n) %>% arrange(desc(counts))
print(paste("The mean number of active employees in stores with managers are"))
## [1] "The mean number of active employees in stores with managers are"
store_employee_count_without_manager <- employee_latest_record %>% filter(store_name %in% store_names_without_management) %>% group_by(store_name) %>% count()

store_employee_count_without_manager_status <- employee_latest_record %>% filter(store_name %in% store_names_without_management) %>% group_by(store_name, STATUS) %>%
  summarise(counts = n(), .groups = "drop") %>%
  pivot_wider(names_from = STATUS, values_from = counts, names_prefix = "count_")
store_employee_count_without_manager
## # A tibble: 11 × 2
## # Groups:   store_name [11]
##    store_name     n
##         <int> <int>
##  1          4     1
##  2          7     6
##  3         10     2
##  4         11    47
##  5         22    81
##  6         23     7
##  7         24     9
##  8         34     5
##  9         35   221
## 10         40    72
## 11         43   311
store_employee_count_without_manager
## # A tibble: 11 × 2
## # Groups:   store_name [11]
##    store_name     n
##         <int> <int>
##  1          4     1
##  2          7     6
##  3         10     2
##  4         11    47
##  5         22    81
##  6         23     7
##  7         24     9
##  8         34     5
##  9         35   221
## 10         40    72
## 11         43   311
store_employee_count_without_manager_status
## # A tibble: 11 × 3
##    store_name count_TERMINATED count_ACTIVE
##         <int>            <int>        <int>
##  1          4                1           NA
##  2          7                6           NA
##  3         10                2           NA
##  4         11               47           NA
##  5         22               12           69
##  6         23                7           NA
##  7         24                9           NA
##  8         34                5           NA
##  9         35              207           14
## 10         40               10           62
## 11         43               24          287

stores with manager with varying jobttile

store_job_count_without_manager <- employee_latest_record[(employee_latest_record$store_name %in% store_names_without_management),] %>% group_by(store_name) %>% summarise(unique_job = n_distinct(job_title))
store_job_count_without_manager
## # A tibble: 11 × 2
##    store_name unique_job
##         <int>      <int>
##  1          4          1
##  2          7          2
##  3         10          1
##  4         11         11
##  5         22         11
##  6         23          2
##  7         24          4
##  8         34          1
##  9         35         36
## 10         40         11
## 11         43         11
store_job_count <- employee_latest_record %>% group_by(store_name) %>% summarise(unique_job = n_distinct(job_title))
store_job_count
## # A tibble: 46 × 2
##    store_name unique_job
##         <int>      <int>
##  1          1         12
##  2          2         10
##  3          3          6
##  4          4          1
##  5          5         12
##  6          6         12
##  7          7          2
##  8          8         12
##  9          9          6
## 10         10          1
## # ℹ 36 more rows
store_category_by_jobs <- c(unique(store_job_count$unique_job))

for (job_count in store_category_by_jobs){
  store_with_job_count = store_job_count[store_job_count$unique_job == job_count,]
  store_name<- store_with_job_count$store_name[1]
  job_titles_for_store <- unique(employee_latest_record[employee_latest_record$store_name == store_name,]$job_title)
  print(paste("the job titles for store", store_name," with ",job_count,"job titles are: "))
  print(job_titles_for_store)
  
}
## [1] "the job titles for store 1  with  12 job titles are: "
##  [1] "Store Manager"            "Meats Manager"           
##  [3] "Produce Manager"          "Bakery Manager"          
##  [5] "Dairy Person"             "Processed Foods Manager" 
##  [7] "Customer Service Manager" "Meat Cutter"             
##  [9] "Produce Clerk"            "Shelf Stocker"           
## [11] "Baker"                    "Cashier"                 
## [1] "the job titles for store 2  with  10 job titles are: "
##  [1] "Store Manager"           "Meats Manager"          
##  [3] "Produce Manager"         "Dairy Person"           
##  [5] "Processed Foods Manager" "Meat Cutter"            
##  [7] "Produce Clerk"           "Baker"                  
##  [9] "Shelf Stocker"           "Cashier"                
## [1] "the job titles for store 3  with  6 job titles are: "
## [1] "Store Manager" "Meat Cutter"   "Baker"         "Cashier"      
## [5] "Dairy Person"  "Produce Clerk"
## [1] "the job titles for store 4  with  1 job titles are: "
## [1] "Dairy Person"
## [1] "the job titles for store 7  with  2 job titles are: "
## [1] "Dairy Person" "Baker"       
## [1] "the job titles for store 11  with  11 job titles are: "
##  [1] "Meats Manager"            "Produce Manager"         
##  [3] "Meat Cutter"              "Bakery Manager"          
##  [5] "Dairy Person"             "Processed Foods Manager" 
##  [7] "Customer Service Manager" "Produce Clerk"           
##  [9] "Baker"                    "Shelf Stocker"           
## [11] "Cashier"                 
## [1] "the job titles for store 14  with  7 job titles are: "
## [1] "Store Manager" "Meat Cutter"   "Produce Clerk" "Baker"        
## [5] "Dairy Person"  "Shelf Stocker" "Cashier"      
## [1] "the job titles for store 18  with  13 job titles are: "
##  [1] "Store Manager"            "Meats Manager"           
##  [3] "Produce Manager"          "Bakery Manager"          
##  [5] "Dairy Manager"            "Processed Foods Manager" 
##  [7] "Customer Service Manager" "Meat Cutter"             
##  [9] "Produce Clerk"            "Baker"                   
## [11] "Dairy Person"             "Shelf Stocker"           
## [13] "Cashier"                 
## [1] "the job titles for store 24  with  4 job titles are: "
## [1] "Meat Cutter"   "Produce Clerk" "Dairy Person"  "Baker"        
## [1] "the job titles for store 35  with  36 job titles are: "
##  [1] "CEO"                             "VP Stores"                      
##  [3] "Legal Counsel"                   "VP Human Resources"             
##  [5] "VP Finance"                      "Exec Assistant, VP Stores"      
##  [7] "Exec Assistant, Legal Counsel"   "CHief Information Officer"      
##  [9] "Exec Assistant, Human Resources" "Exec Assistant, Finance"        
## [11] "Director, Recruitment"           "Director, Training"             
## [13] "Director, Labor Relations"       "Director, HR Technology"        
## [15] "Director, Employee Records"      "Director, Compensation"         
## [17] "Corporate Lawyer"                "Director, Accounts Receivable"  
## [19] "Systems Analyst"                 "Director, Accounts Payable"     
## [21] "Director, Audit"                 "Director, Accounting"           
## [23] "Director, Investments"           "Recruiter"                      
## [25] "Trainer"                         "Dairy Person"                   
## [27] "Labor Relations Analyst"         "HRIS Analyst"                   
## [29] "Benefits Admin"                  "Compensation Analyst"           
## [31] "Accounts Receiveable Clerk"      "Accounts Payable Clerk"         
## [33] "Auditor"                         "Accounting Clerk"               
## [35] "Investment Analyst"              "Meat Cutter"                    
## [1] "the job titles for store 37  with  9 job titles are: "
## [1] "Store Manager"            "Meats Manager"           
## [3] "Produce Manager"          "Bakery Manager"          
## [5] "Dairy Person"             "Processed Foods Manager" 
## [7] "Customer Service Manager" "Meat Cutter"             
## [9] "Produce Clerk"

the number of unique jobs can be used to categorize the stores like all stores with 1 job title is dariy person and so on….

stores with manager and tehir top 3 active and terminated job titles

store_job_summary <- employee_latest_record %>%
  group_by(store_name, job_title, STATUS) %>%
  summarize(count = n(), .groups = "drop") %>%
  arrange(store_name, STATUS,desc(count))

# To get the top 3 job titles in each store by count:
top_3_job_titles <- store_job_summary %>%
  group_by(store_name,STATUS) %>%
  top_n(3, wt = count)


top_3_job_titles_store_with_manger <- top_3_job_titles[!(top_3_job_titles$store_name %in% store_names_without_management),]
# Now you have a summary of the top 3 job titles in each store by count.

Overall Attrition rate of the company

count_status <- employee_latest_record %>% count(STATUS)

# Get the number of employee of 'TERMINATED' STATUS
terminated_count = count_status[count_status$STATUS == 'TERMINATED', 'n']
# Calculate the percentage of 'TERMINATED' status
terminated_count_percent <- (terminated_count / sum(count_status$n)) * 100

# Print the percentage
print(paste('The overall attrition rate of company is :', round(terminated_count_percent,2), '%'))
## [1] "The overall attrition rate of company is : 23.63 %"

Attrition Rate by gender

count_status_by_gender <- employee_latest_record %>% group_by(gender_full) %>% count(STATUS)



# Get the number of Male employee of 'TERMINATED' STATUS
terminated_count_male = count_status_by_gender[count_status_by_gender$STATUS == 'TERMINATED' & count_status_by_gender$gender_full =='Male', 'n']
# Calculate the percentage of 'TERMINATED' status
terminated_count_percent_male <- (terminated_count_male / sum(count_status_by_gender[count_status_by_gender$gender_full =='Male','n']) * 100)

# Print the percentage
print(paste('The overall attrition rate of Male in the company is :', round(terminated_count_percent_male,2), '%'))
## [1] "The overall attrition rate of Male in the company is : 18.96 %"
# Get the number of employee of Female 'TERMINATED' STATUS
terminated_count_female = count_status_by_gender[count_status_by_gender$STATUS == 'TERMINATED' & count_status_by_gender$gender_full =='Female', 'n']
# Calculate the percentage of 'TERMINATED' status
terminated_count_percent_female <- (terminated_count_female / sum(count_status_by_gender[count_status_by_gender$gender_full =='Female','n']) * 100)

# Print the percentage
print(paste('The overall attrition rate of Female in the company is :', round(terminated_count_percent_female,2), '%'))
## [1] "The overall attrition rate of Female in the company is : 27.91 %"

Termination Reason Distribution

Termination Reason Distribution

terminated_employees = employee_latest_record %>% filter(STATUS == 'TERMINATED')
dim(terminated_employees)
## [1] 1485   19
terminated_employees_reason = terminated_employees %>% group_by(termreason_desc) %>% count()
for (i in c(1:3)){
  print(paste('There were',terminated_employees_reason[i,][2],terminated_employees_reason[i,][1]))
}
## [1] "There were 215 Layoff"
## [1] "There were 385 Resignaton"
## [1] "There were 885 Retirement"
terminated_employees_type = terminated_employees %>% group_by(termtype_desc) %>% count()

for (i in c(1:2)){
  print(paste('There were',terminated_employees_type[i,][2],terminated_employees_type[i,][1]))
}
## [1] "There were 215 Involuntary"
## [1] "There were 1270 Voluntary"

There are 215 layoff and 215 involuntary exits.

No need to verify but still

count(terminated_employees[terminated_employees$termreason_desc =='Layoff' & terminated_employees$termtype_desc == 'Involuntary' ,])
##     n
## 1 215

lets see the layoff over years

terminated_employees_layoff = terminated_employees %>% filter(termreason_desc =='Layoff') %>% group_by(STATUS_YEAR) %>% count()
terminated_employees_layoff
## # A tibble: 2 × 2
## # Groups:   STATUS_YEAR [2]
##   STATUS_YEAR     n
##         <int> <int>
## 1        2014   142
## 2        2015    73

It is found that there were layoffs only in 2014 and 2015. lets check the same for resignation

resignation_by_year= terminated_employees %>% filter(termreason_desc =='Resignaton') %>% group_by(STATUS_YEAR) %>% count()
resignation_by_year
## # A tibble: 10 × 2
## # Groups:   STATUS_YEAR [10]
##    STATUS_YEAR     n
##          <int> <int>
##  1        2006    12
##  2        2007    25
##  3        2008    26
##  4        2009    18
##  5        2010    29
##  6        2011    69
##  7        2012    76
##  8        2013    49
##  9        2014    55
## 10        2015    26

lets visualize this

resignaton_plot = ggplot(resignation_by_year, aes(x=STATUS_YEAR, y=n))+geom_line()+scale_x_continuous(breaks=seq(2006,2016,1))
resignaton_plot

we can see peak in 2011 to 2013. lets analyse this for different department.

terminated_employees_resignation <- terminated_employees[terminated_employees$termreason_desc == 'Resignaton',]
resignation_by_department <- terminated_employees_resignation %>% group_by(STATUS_YEAR, department_name) %>% summarise(counts = n()) %>% arrange(STATUS_YEAR, desc(counts)) %>% group_by(STATUS_YEAR) 
## `summarise()` has grouped output by 'STATUS_YEAR'. You can override using the
## `.groups` argument.
p <- ggplot(resignation_by_department, aes(x=STATUS_YEAR, y= counts)) +geom_line(aes(color=department_name))+scale_x_continuous(breaks=seq(2006,2016,1))
p

In 2014 alone there were 55 resignaton of which 54 were customer_service

terminated_employees_resignation[terminated_employees_resignation$STATUS_YEAR == 2014,]%>% group_by(department_name) %>% summarise(counts = n())
## # A tibble: 2 × 2
##   department_name  counts
##   <chr>             <int>
## 1 Customer Service     54
## 2 Processed Foods       1

department with most resignation

department_resignation = terminated_employees %>% filter( termreason_desc =='Resignaton') %>% group_by(department_name) %>% summarise(resignaton_count = n()) %>% arrange(desc(resignaton_count))

job with most resgination

job_resignation = terminated_employees %>% filter( termreason_desc =='Resignaton') %>% group_by(job_title) %>% summarise(resignaton_count = n()) %>% arrange(desc(resignaton_count))

age group with most resgination

age_resignation = terminated_employees %>% filter( termreason_desc =='Resignaton') %>% group_by(age) %>% summarise(resignaton_count = n()) %>% arrange(desc(resignaton_count)) %>% top_n(3,resignaton_count)

The top 3 age of resgintion were 30,21,23 which is nearly 43% of all resignation and top 7 age is under 30 with 63% of all resgination

department wise customer what has more resignation and is there any relation to length of service

this is irrelevant below

lets try to find the department and job titles each of the top 3 age work in

department_resignation_age_21 <- terminated_employees %>% filter(termreason_desc =='Resignaton' & age==21) %>% group_by(department_name) %>% summarize(count = n())
department_resignation_age_23 <- terminated_employees %>% filter(termreason_desc =='Resignaton' & age==23) %>% group_by(department_name) %>% summarize(count = n())
department_resignation_age_30 <- terminated_employees %>% filter(termreason_desc =='Resignaton' & age==30) %>% group_by(department_name) %>% summarize(count = n())

age when they joined ## creating join_age feature

terminated_employees['join_age'] = as.integer( difftime(terminated_employees$orighiredate_key, terminated_employees$birthdate_key, units = "days")/365.25)
#age_21_terminated<- terminated_employees %>% filter(age==21 & join_age == 21 ) %>% group_by(department_name) %>% summarize(count = n())

most of the age 21 employees who resgined are involved in customer service 58 out of 60 most of the age 23 employees who resgined are involved in customer service 30 out of 33

filtering which city has most termiantion

city_terminated <- terminated_employees %>% group_by(city_name)%>% summarize(count = n())%>% arrange(desc(count))
p <- ggplot(employee_data, aes(x = age, , fill = "Age Distribution")) +
  geom_histogram( breaks= seq(from=18, to=70,by=10),color = "black", alpha = 0.7) +
  labs(fill = NULL) +  # Remove legend title
  theme_minimal()  # Apply a minimal theme
p

p <- ggplot(employee_data, aes(x = length_of_service, , fill = "lenght of service Distribution")) +
  geom_histogram(bins=5,color = "black", alpha = 0.7) +
  labs(fill = NULL) +  # Remove legend title
  scale_x_continuous(breaks = seq(0, max(employee_data$length_of_service),5))+
  scale_y_continuous(breaks = seq(0, max(table(cut(employee_data$length_of_service, breaks = 5)))+5000, by = 2500))+
  theme_minimal()  # Apply a minimal theme
p

trying to see if age is a significant factor

employee_latest_record$Age_Group <- cut(employee_latest_record$age, breaks = c(18, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, max(employee_latest_record$age) + 1),
                      labels = c("19-23", "24-27", "28-32", "32-36", "36-40", "40-44", "44-48", "48-52", "52-56", "56-60", "60+"))

# Calculate attrition rates for each age group
attrition_rates <- employee_latest_record %>%
  group_by(Age_Group) %>%
  summarize(Attrition_Rate = mean(STATUS == "TERMINATED"))

# Perform a statistical test (e.g., ANOVA) to determine if the differences in attrition rates are significant
# Install the "stats" package if not already installed
# install.packages("stats")
library(stats)

# Analysis of Variance (ANOVA)
anova_model <- aov(Attrition_Rate ~ Age_Group, data = attrition_rates)

# Check for significance
summary(anova_model)
##             Df Sum Sq Mean Sq
## Age_Group   10 0.3271 0.03271