# Employee promotion classifier (XGBoost) ----
#
# Reads the labelled training CSV and the unlabelled competition CSV, builds a
# down-sampled training set plus a labelled hold-out set, fits an XGBoost
# binary classifier, scores the unlabelled file, and reports the hold-out
# confusion matrix, feature importance and F1 score.

library(data.table) # setDT()
library(xgboost)    # xgb.DMatrix(), xgb.cv(), xgb.train(), xgb.importance()
library(caret)      # confusionMatrix(), posPredValue(), sensitivity()

# Load data ----
train <- read.csv("D:/Users/gry7/Downloads/train_LZdllcl.csv")
test <- read.csv("D:/Users/gry7/Downloads/test_2umaH9m.csv")

inputData <- train

feature_cols <- c(
  "department", "education", "recruitment_channel", "no_of_trainings",
  "age", "previous_year_rating", "length_of_service", "KPIs_met..80.",
  "awards_won.", "avg_training_score"
)
target_col <- "is_promoted"

# Train / hold-out split ----
# Share of the positive rows used for training. The remainder is held out so
# that precision, recall and F1 are defined on the hold-out set (holding out
# no positives makes every one of those metrics meaningless).
ones_train_frac <- 0.8
# Number of negative rows sampled per positive training row.
zeros_per_one <- 5

input_ones <- inputData[which(inputData$is_promoted == 1), ]  # all 1's
input_zeros <- inputData[which(inputData$is_promoted == 0), ] # all 0's

set.seed(100) # for repeatability of samples
n_train_ones <- floor(ones_train_frac * nrow(input_ones))
# Capped so sample.int() cannot be asked for more rows than exist.
n_train_zeros <- min(zeros_per_one * n_train_ones, nrow(input_zeros))
input_ones_training_rows <- sample.int(nrow(input_ones), n_train_ones)
input_zeros_training_rows <- sample.int(nrow(input_zeros), n_train_zeros)

training_ones <- input_ones[input_ones_training_rows, ]
training_zeros <- input_zeros[input_zeros_training_rows, ]
trainingData <- rbind(training_ones, training_zeros)

# setdiff() rather than negative indexing: x[-integer(0), ] selects zero rows,
# not all of them.
test_ones <- input_ones[
  setdiff(seq_len(nrow(input_ones)), input_ones_training_rows),
]
test_zeros <- input_zeros[
  setdiff(seq_len(nrow(input_zeros)), input_zeros_training_rows),
]
testData <- rbind(test_ones, test_zeros)

trainingData <- trainingData[, c(feature_cols, target_col)]
testData <- testData[, c(feature_cols, target_col)]
testtest <- test[, feature_cols]

setDT(trainingData)
setDT(testData)
setDT(testtest)

labels <- as.numeric(trainingData$is_promoted)
ts_label <- as.numeric(testData$is_promoted)

# Feature encoding ----
# Categorical levels are fixed once from the full labelled file so that all
# three design matrices get the same dummy columns in the same order. A value
# that never occurs in that file is encoded as NA.
categorical_cols <- Filter(
  function(col) is.character(col) || is.factor(col),
  train[feature_cols]
)
factor_levels <- lapply(
  categorical_cols,
  function(col) sort(unique(as.character(col)))
)

# One-hot encode `cols` of `df` into a numeric matrix without dropping rows.
# model.matrix() alone omits rows containing NA, which would desynchronise the
# matrix from the label vector and from the submission rows; going through
# model.frame(na.action = na.pass) keeps every row and leaves NA in place for
# xgboost to treat as missing.
encode_features <- function(df, cols, level_map) {
  feats <- as.data.frame(df)[, cols, drop = FALSE]
  for (nm in names(level_map)) {
    feats[[nm]] <- factor(as.character(feats[[nm]]), levels = level_map[[nm]])
  }
  frame <- model.frame(~ . + 0, data = feats, na.action = na.pass)
  model.matrix(~ . + 0, data = frame)
}

new_tr <- encode_features(trainingData, feature_cols, factor_levels)
new_ts <- encode_features(testData, feature_cols, factor_levels)
new_tst <- encode_features(testtest, feature_cols, factor_levels)

dtrain <- xgb.DMatrix(data = new_tr, label = labels)
dtest <- xgb.DMatrix(data = new_ts, label = ts_label)
ftest <- xgb.DMatrix(data = new_tst)

# Model ----
params <- list(
  booster = "gbtree",
  objective = "binary:logistic",
  eta = 0.08,
  gamma = 0.1,
  max_depth = 6,
  min_child_weight = 1,
  subsample = 1,
  colsample_bytree = 1
)

xgbcv <- xgb.cv(
  params = params,
  data = dtrain,
  nrounds = 600,
  nfold = 5,
  showsd = TRUE,
  early_stopping_rounds = 20,
  stratified = TRUE,
  metrics = "logloss",
  print_every_n = 10,
  maximize = FALSE
)

# NOTE(review): 155 is presumably the best iteration from an earlier xgb.cv
# run; confirm it is still appropriate for the current split.
final_nrounds <- 155

xgb1 <- xgb.train(
  params = c(params, list(eval_metric = "logloss")),
  data = dtrain,
  nrounds = final_nrounds,
  # Early stopping monitors the LAST watchlist entry, so the hold-out set must
  # come last; otherwise training loss is monitored and stopping never fires.
  watchlist = list(train = dtrain, val = dtest),
  print_every_n = 10,
  early_stopping_rounds = 10,
  maximize = FALSE
)

# Predictions for the unlabelled competition file ----
xgbpred <- predict(xgb1, ftest)
xgbpred <- ifelse(xgbpred > 0.5, 1, 0)
predictions <- data.frame(predictions = xgbpred)
#write.csv(predictions,"D:/Users/gry7/Downloads/predictions.csv",row.names=F)

# Hold-out evaluation ----
# Metrics must use predictions for the labelled hold-out rows (dtest); the
# competition-file predictions above have no labels to compare against.
holdout_pred <- ifelse(predict(xgb1, dtest) > 0.5, 1, 0)
# Explicit levels keep both factors aligned even if one class is never
# predicted.
pred_class <- factor(holdout_pred, levels = c(0, 1))
true_class <- factor(ts_label, levels = c(0, 1))

print(confusionMatrix(pred_class, true_class, positive = "1"))

# Variable importance plot (top 20 features at most).
mat <- xgb.importance(feature_names = colnames(new_tr), model = xgb1)
xgb.plot.importance(importance_matrix = head(mat, 20))

precision <- posPredValue(pred_class, true_class, positive = "1")
recall <- sensitivity(pred_class, true_class, positive = "1")
f1_score <- (2 * precision * recall) / (precision + recall)
print(f1_score)