# Employee promotion prediction.
#
# Reads the train/test CSVs, converts categorical attributes to factors,
# merges sparse region levels, imputes missing education / previous-year
# rating, trains five cross-validated H2O base learners (GBM, random forest,
# deep learning, GLM, naive Bayes), stacks them into an ensemble, reports
# validation metrics and writes a submission file.
#
# NOTE: the script no longer clears the global environment or changes the
# working directory; all files are addressed relative to `data_dir`.

library(forcats) # loaded by the original script; retained for compatibility
library(caret)
library(h2o)

# Directory that holds the input CSVs and receives the submission file.
data_dir <- "E:/WNS"
target <- "is_promoted"

# ---- Helpers ----------------------------------------------------------------

#' Merge several levels of a factor into one level.
#'
#' @param attr A factor.
#' @param newlevel Name of the level that replaces all of `levels`.
#' @param levels Character vector of existing levels to merge. Levels that are
#'   not present in `attr` are ignored.
#' @return `attr` with every level listed in `levels` renamed to `newlevel`;
#'   all other levels (and NA entries) are left unchanged.
catatt_reducelevels <- function(attr, newlevel, levels) {
  stopifnot(is.factor(attr), length(newlevel) == 1L)
  # Assigning the same name to several levels merges them into one level.
  levels(attr)[levels(attr) %in% levels] <- newlevel
  attr
}

#' Replace missing entries of a factor with an explicit level.
#'
#' Both NA entries and a blank ("") level are treated as missing. Assigning a
#' value that is not an existing level to a factor only yields NA (with a
#' warning), so the fill level is registered before the assignment.
#'
#' @param attr A factor.
#' @param fill Value used for missing entries; coerced to character.
#' @return `attr` with no NA entries and no blank level.
fill_missing_level <- function(attr, fill) {
  stopifnot(is.factor(attr), length(fill) == 1L)
  fill <- as.character(fill)
  # Rename a blank level to the fill level (merges if `fill` already exists).
  levels(attr)[levels(attr) == ""] <- fill
  if (!fill %in% levels(attr)) {
    levels(attr) <- c(levels(attr), fill)
  }
  attr[is.na(attr)] <- fill
  attr
}

# ---- Load data --------------------------------------------------------------

MNC <- read.csv(file.path(data_dir, "train_LZdllcl.csv"), header = TRUE)
test <- read.csv(file.path(data_dir, "test_2umaH9m.csv"), header = TRUE)

# Keep the test ids for the submission file, then drop the id column from
# both data sets so it is not used as a predictor.
test_ids <- test$employee_id
MNC <- MNC[, setdiff(names(MNC), "employee_id")]
test <- test[, setdiff(names(test), "employee_id")]

# ---- Convert the categorical attributes to factors --------------------------

str(MNC)
cat_attr <- c(
  "department", "region", "education", "gender", "recruitment_channel",
  "no_of_trainings", "previous_year_rating", "KPIs_met..80.", "awards_won."
)
# The target exists only in the training data.
MNC[c(cat_attr, target)] <- lapply(MNC[c(cat_attr, target)], factor)
test[cat_attr] <- lapply(test[cat_attr], factor)

# Check the distribution of the input data and the missing-value counts.
summary(MNC)
summary(test)
colSums(is.na(MNC))
colSums(is.na(test))

# ---- Merge region levels ----------------------------------------------------

levels(MNC$region)
table(MNC$region, MNC$is_promoted)

# Groups are applied in this order. The order matters: the original
# "region_2" is merged into the "region_1" group before the label "region_2"
# is reused for a different group. ("region_27" was originally listed in two
# groups; it ends up in the "region_4" group, which is applied first, so the
# redundant second mention has been removed.)
region_groups <- list(
  region_3 = c("region_3", "region_7", "region_22", "region_23", "region_28"),
  region_4 = c("region_4", "region_17", "region_27"),
  region_1 = c("region_1", "region_2", "region_8", "region_13", "region_30"),
  region_2 = c(
    "region_10", "region_14", "region_15", "region_16", "region_19",
    "region_26"
  ),
  region_5 = c(
    "region_5", "region_6", "region_11", "region_20", "region_21",
    "region_31", "region_32"
  )
)
for (group_name in names(region_groups)) {
  members <- region_groups[[group_name]]
  MNC$region <- catatt_reducelevels(MNC$region, group_name, members)
  test$region <- catatt_reducelevels(test$region, group_name, members)
}

# ---- Impute missing education and previous-year rating ----------------------

MNC$education <- fill_missing_level(MNC$education, "Other")
test$education <- fill_missing_level(test$education, "Other")
MNC$previous_year_rating <- fill_missing_level(MNC$previous_year_rating, 0)
test$previous_year_rating <- fill_missing_level(test$previous_year_rating, 0)

# ---- Train / validation split -----------------------------------------------

set.seed(123)
train_RowIDs <- createDataPartition(MNC$is_promoted, p = 0.7, list = FALSE)
train <- MNC[train_RowIDs, ]
validation <- MNC[-train_RowIDs, ]

# ---- Train the base learners and the stacked ensemble -----------------------

# Start H2O on the local machine using all available cores and 1 gigabyte of
# memory.
h2o.init(nthreads = -1, max_mem_size = "1g")

# Import the local R training data frame to the H2O cloud.
train.hex <- as.h2o(x = train, destination_frame = "train.hex")
predictors <- setdiff(names(train.hex), target)

# Number of CV folds (to generate level-one data for stacking). Every base
# learner must use the same folds and keep its CV predictions so the ensemble
# can be trained on them.
nfolds <- 7

# Train & cross-validate a GBM.
my_gbm <- h2o.gbm(
  x = predictors,
  y = target,
  training_frame = train.hex,
  distribution = "bernoulli",
  max_depth = 3,
  min_rows = 2,
  learn_rate = 0.2,
  nfolds = nfolds,
  fold_assignment = "Stratified",
  keep_cross_validation_predictions = TRUE,
  seed = 1
)

# Train & cross-validate a random forest.
my_rf <- h2o.randomForest(
  x = predictors,
  y = target,
  training_frame = train.hex,
  nfolds = nfolds,
  max_depth = 3,
  min_rows = 2,
  fold_assignment = "Stratified",
  keep_cross_validation_predictions = TRUE,
  seed = 1
)

# Train & cross-validate a deep neural network.
my_dl <- h2o.deeplearning(
  x = predictors,
  y = target,
  training_frame = train.hex,
  l1 = 0.001,
  l2 = 0.001,
  hidden = c(200, 200, 200),
  nfolds = nfolds,
  fold_assignment = "Stratified",
  keep_cross_validation_predictions = TRUE,
  seed = 1
)

# Train & cross-validate a binomial GLM.
my_glm <- h2o.glm(
  x = predictors,
  y = target,
  training_frame = train.hex,
  family = "binomial",
  nfolds = nfolds,
  fold_assignment = "Stratified",
  keep_cross_validation_predictions = TRUE,
  seed = 1
)

# Train & cross-validate a naive Bayes classifier.
my_nb <- h2o.naiveBayes(
  x = predictors,
  y = target,
  training_frame = train.hex,
  nfolds = nfolds,
  fold_assignment = "Stratified",
  keep_cross_validation_predictions = TRUE,
  seed = 1
)

# Train a stacked ensemble on top of the five base learners above.
base_models <- list(
  my_gbm@model_id, my_rf@model_id, my_dl@model_id,
  my_glm@model_id, my_nb@model_id
)
ensemble <- h2o.stackedEnsemble(
  x = predictors,
  y = target,
  training_frame = train.hex,
  base_models = base_models
)

# ---- Evaluate on the validation set -----------------------------------------

# Import the local R validation data frame to the H2O cloud.
validation.hex <- as.h2o(x = validation, destination_frame = "validation.hex")

# Predict on the held-out validation data (target column withheld).
predict.hex <- h2o.predict(
  ensemble,
  newdata = validation.hex[, setdiff(names(validation.hex), target)]
)
data_validation <- h2o.cbind(validation.hex[, target], predict.hex)

# Copy actuals and predictions from H2O to R.
pred_validation <- as.data.frame(data_validation)

# Confusion matrix (rows = actual, columns = predicted). Both factors are
# pinned to the levels 0/1 so the table is always 2 x 2, even if the ensemble
# predicts a single class; otherwise cm[2, 2] would be out of bounds.
class_levels <- c("0", "1")
cm <- table(
  factor(pred_validation$is_promoted, levels = class_levels),
  factor(pred_validation$predict, levels = class_levels)
)
conf_Matrix <- cm
conf_Matrix

Accuracy <- (cm[1, 1] + cm[2, 2]) / sum(cm)
Accuracy

# Precision is NaN when no positive class is predicted; F1 is then NaN too.
recall_Validation <- cm[2, 2] / sum(cm[2, ])
precision_Validation <- cm[2, 2] / sum(cm[, 2])
F1_Validation <- (2 * precision_Validation * recall_Validation) /
  (precision_Validation + recall_Validation)
F1_Validation

# ---- Predict on the test set ------------------------------------------------

# Import the local R test data frame to the H2O cloud.
test.hex <- as.h2o(x = test, destination_frame = "test.hex")

# Predict on the test data.
predict_test.hex <- h2o.predict(ensemble, newdata = test.hex)

# Copy predictions from H2O to R before the cluster is shut down.
pred_test <- as.data.frame(predict_test.hex)
summary(pred_test$predict)

# Shut down H2O.
h2o.shutdown(prompt = FALSE)

# Distribution of the predicted classes on the test set.
table(pred_test$predict)

# ---- Write the submission file ----------------------------------------------

output <- data.frame(
  employee_id = test_ids,
  is_promoted = pred_test$predict
)
summary(output)
write.csv(
  output,
  file = file.path(data_dir, "Samplesubmission1.csv"),
  row.names = FALSE
)