# Promotion prediction pipeline ----
# Reads a training and a test CSV chosen interactively, cleans and imputes
# them, one-hot encodes the factor columns, trains an RSNNS multi-layer
# perceptron and writes a submission CSV.

# Loading the library files ----
suppressPackageStartupMessages(library(mice))
suppressPackageStartupMessages(library(VIM))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(missForest))
library(dummies)
library(RSNNS)

# Configuration ----
# Where the workspace image is written at the end of the run.
workspace_file <- "D:/ML/R Materials/Projects/Project - 7/Promotion.RData"
# Region / education values that are kept; everything else is merged into
# the corresponding fallback value.
kept_regions <- c("region_2", "region_22", "region_7")
fallback_region <- "region_1"
kept_education <- c("Bachelor's", "Below Secondary", "Master's & above")
fallback_education <- "Below Secondary"
# Training counts at or above this value are merged into one level.
max_trainings <- 4
# Positions of the numeric columns that get standardised.
numeric_cols <- c(8, 10, 13)
# Columns expanded into dummy variables.
factor_cols <- c(
  "department", "region", "education", "gender", "recruitment_channel",
  "no_of_trainings", "previous_year_rating", "KPIs_met..80.", "awards_won."
)
target_col <- "is_promoted"
# Predictions below this value are labelled 0, all others 1.
promotion_threshold <- .26

# Helpers ----

# Convert a vector to a factor via its character representation.
to_factor <- function(x) {
  as.factor(as.character(x))
}

# Draw the VIM missing-data aggregation plot for `data` and return the
# object produced by aggr().
plot_missing <- function(data) {
  aggr(
    data,
    col = c("grey", "pink"), numbers = TRUE, sortVars = TRUE,
    labels = names(data), cex.axis = .5, gap = 3,
    ylab = c("Missing data", "Pattern"), cex.lab = .8, plot = TRUE,
    border = par("fg"), only.miss = TRUE, cex.numbers = .8
  )
}

# Merge every training count >= `cap` into the level `cap` and return a
# factor. Using one rule for both data sets keeps their levels aligned even
# if one of them contains a count the other does not.
cap_trainings <- function(x, cap = max_trainings) {
  factor(pmin(as.numeric(as.character(x)), cap))
}

# Replace every value of `x` that is not in `keep` (including NA) with
# `other` and return a factor. A logical mask is used instead of negative
# row indices because `x[-integer(0)]` selects nothing, which would silently
# skip the replacement when no value matches `keep`.
collapse_levels <- function(x, keep, other) {
  x <- as.character(x)
  x[!(x %in% keep)] <- other
  factor(x)
}

# Reading the files ----
#setwd("D:/ML/R Materials/Projects/Project - 7")
getwd()
#load("D:/ML/R Materials/Projects/Project - 7/Promotion.RData")
# stringsAsFactors is set explicitly: the imputation and level handling
# below expect factor columns, which read.csv() only produced by default
# before R 4.0.0.
train_raw <- read.csv(file.choose(), stringsAsFactors = TRUE)
test_raw <- read.csv(file.choose(), stringsAsFactors = TRUE)

# Plotting the missing-value patterns of both data sets ----
mp_raw_train <- plot_missing(train_raw)
mp_raw_test <- plot_missing(test_raw)

# Cleansing no_of_trainings ----
train_raw$no_of_trainings <- cap_trainings(train_raw$no_of_trainings)
test_raw$no_of_trainings <- cap_trainings(test_raw$no_of_trainings)

# Cleansing previous_year_rating ----
train_raw$previous_year_rating <- to_factor(train_raw$previous_year_rating)
test_raw$previous_year_rating <- to_factor(test_raw$previous_year_rating)
# Remember which ratings are missing so only those are overwritten after
# imputation.
rating_missing_train <- which(is.na(train_raw$previous_year_rating))
rating_missing_test <- which(is.na(test_raw$previous_year_rating))

# Cleansing KPIs_met..80. ----
train_raw$KPIs_met..80. <- to_factor(train_raw$KPIs_met..80.)
test_raw$KPIs_met..80. <- to_factor(test_raw$KPIs_met..80.)

# Cleansing awards_won. ----
train_raw$awards_won. <- to_factor(train_raw$awards_won.)
test_raw$awards_won. <- to_factor(test_raw$awards_won.)

# Cleansing is_promoted (training data only) ----
train_raw$is_promoted <- to_factor(train_raw$is_promoted)

# Cleansing region ----
train_raw$region <- collapse_levels(
  train_raw$region, kept_regions, fallback_region
)
test_raw$region <- collapse_levels(
  test_raw$region, kept_regions, fallback_region
)
train_raw <- droplevels(train_raw)
test_raw <- droplevels(test_raw)

# Imputing missing values ----
# The target is excluded from the training predictors. Only the missing
# previous_year_rating entries are copied back from the imputed data.
set.seed(300)
train_imp <- missForest(
  train_raw[setdiff(names(train_raw), target_col)],
  maxiter = 10, ntree = 100, variablewise = TRUE, verbose = TRUE,
  mtry = floor(sqrt(ncol(train_raw))), replace = TRUE
)
train_raw$previous_year_rating[rating_missing_train] <-
  train_imp$ximp$previous_year_rating[rating_missing_train]

set.seed(300)
test_imp <- missForest(
  test_raw,
  maxiter = 10, ntree = 100, variablewise = TRUE, verbose = TRUE,
  mtry = floor(sqrt(ncol(test_raw))), replace = TRUE
)
test_raw$previous_year_rating[rating_missing_test] <-
  test_imp$ximp$previous_year_rating[rating_missing_test]
rm(rating_missing_train, rating_missing_test)

# Cleansing education ----
# Employees below secondary level may not mention their education level, so
# every value outside the three known levels becomes "Below Secondary".
train_raw$education <- collapse_levels(
  train_raw$education, kept_education, fallback_education
)
test_raw$education <- collapse_levels(
  test_raw$education, kept_education, fallback_education
)

# Scaling the numeric columns ----
# The test set is standardised with the training means and standard
# deviations so that a given raw value maps to the same model input in both
# data sets.
train_scaled <- scale(train_raw[, numeric_cols], center = TRUE, scale = TRUE)
test_raw[, numeric_cols] <- scale(
  test_raw[, numeric_cols],
  center = attr(train_scaled, "scaled:center"),
  scale = attr(train_scaled, "scaled:scale")
)
train_raw[, numeric_cols] <- train_scaled
rm(train_scaled)

# Creating the final data sets (first column dropped) ----
final_train <- train_raw[, -1]
final_test <- test_raw[, -1]

# Creating dummy variables for the factor columns ----
# The target is kept out of the encoding and appended afterwards as the
# last column.
new_train_DS <- dummy.data.frame(
  final_train[, setdiff(names(final_train), target_col), drop = FALSE],
  names = factor_cols
)
new_train_DS[[target_col]] <- final_train[[target_col]]

new_test_DS <- dummy.data.frame(final_test, names = factor_cols)

# The network is fed by column position, so both encodings must agree.
input_cols <- setdiff(colnames(new_train_DS), target_col)
if (!identical(input_cols, colnames(new_test_DS))) {
  stop(
    "Training and test data produce different dummy columns; ",
    "check the factor levels of both files.",
    call. = FALSE
  )
}

### begin model code
# MLP model ----
set.seed(123)

# Generating the multi-layer perceptron
model_RSNNS <- mlp(
  new_train_DS[, input_cols, drop = FALSE],
  as.numeric(as.character(new_train_DS[[target_col]])),
  size = 90,
  initFunc = "Randomize_Weights",
  initFuncParams = c(-0.3, 0.3),
  #learnFunc = "BackpropBatch",
  learnFuncParams = c(0.001),
  maxit = 2700,
  updateFunc = "Topological_Order",
  inputsTest = NULL,
  targetsTest = NULL,
  linOut = FALSE
)
rm(input_cols)

# Generating the error plot
plotIterativeError(model_RSNNS)

# Generating predictions and the submission file ----
pred_RSNNS <- predict(model_RSNNS, new_test_DS)

mlp_sub <- data.frame(employee_id = test_raw$employee_id)
mlp_sub$is_promoted <- ifelse(pred_RSNNS[, 1] < promotion_threshold, 0, 1)

# Colons are not allowed in Windows file names, so the ones in the
# timestamp are replaced.
write.csv(
  mlp_sub,
  file = gsub(":", "_", paste0("Submission_", Sys.time(), ".csv")),
  row.names = FALSE
)

# Saving the workspace ----
# A missing target directory must not abort the run after the submission
# has already been written.
if (dir.exists(dirname(workspace_file))) {
  save.image(workspace_file)
} else {
  warning(
    "Workspace not saved: directory '", dirname(workspace_file),
    "' does not exist.",
    call. = FALSE
  )
}