setwd("~/Hackathons/WNS 2019") # LOAD LIBRARIES ---------------------------------------------------------- load_libraries <- function(library_list){ # get all installed packages installed_libraries <- installed.packages()[,1] # identify packages in the list and not installed missing_libraries <- setdiff(library_list, installed_libraries) # if list is non-empty, install all new packages if (length(missing_libraries) > 0) install.packages(missing_libraries) # load libraries in package_list lapply(library_list, require, character.only = TRUE) } library_list <- c('tidyverse', 'lubridate', 'skimr', 'Information', 'smbinning', 'xgboost', 'Matrix', 'caret', 'corrplot', 'ModelMetrics', 'unbalanced', 'pROC', 'data.table') load_libraries(library_list) #----- FUNCTIONS ------- #check for missing values in all columns missing <- function(data){ na_count <-sapply(data, function(y) sum(length(which(is.na(y))))) na_count <- data.frame(na_count) na_count <- setDT(na_count, keep.rownames = TRUE)[na_count > 0] return(na_count) } #function to standardize column names standardize_column_names <- function(input_data){ #convert to lowercase colnames(input_data) <- tolower(colnames(input_data)) #replace punctuation characters or spaces with underscore colnames(input_data) <- gsub('([[:punct:]])|\\s+','_',colnames(input_data)) #replace double underscore with single underscore while(any(grepl("__",colnames(input_data),fixed = TRUE)) == TRUE){ colnames(input_data) <- gsub("__","_",colnames(input_data),fixed = TRUE) } return(input_data) } #function to calculate ctr ctr_calc <- function(data, field){ data <- data %>% group_by_(field) %>% summarise(clicks = sum(is_click), impressions = n()) %>% ungroup() %>% mutate(ctr = (clicks/impressions)*100, has_click = ifelse(clicks > 0, 1, 0)) # rename column names colnames(data) <- c(field, paste0(field, "_", setdiff(names(data), field))) # return output return(data) } # function to calculate CTR before impression time ctr_features <- function(data, field){ # self join data and filter entries where impression time new is after current impression time data <- data %>% dplyr::select(impression_id_actual = impression_id, impression_time_actual = impression_time, is_click_actual = is_click, field) %>% inner_join(data, by = field) %>% filter(impression_time < impression_time_actual) # calculate ctr ctr_df <- data %>% group_by(impression_id_actual) %>% summarise(clicks = sum(is_click), impressions = n()) %>% ungroup() %>% mutate(ctr = (clicks/impressions)*100, has_click = ifelse(clicks > 0, 1, 0)) # rename column names colnames(ctr_df) <- c("impression_id", paste0(field, "_", setdiff(names(ctr_df), "impression_id_actual"))) # return out data frame return(ctr_df) } # calculate f1 score f1_score <- function(y, predicted, positive.class = "1") { # convert both predicted and y variable to factors with same levels. y <- as.factor(y) predicted <- factor(as.character(round(predicted)), labels = unique(as.character(y))) t <- table(y, predicted) #calculate precision and recall precision <- t[4] / (t[3] + t[4]) recall <- t[4] / (t[4] + t[2]) # calculate f1 score f1 <- (2 * precision * recall) / (precision + recall) # return f1 score return(f1) } # create user features based on type {'lasthour', 'currentperiod', 'today', 'last24hour', 'last7days', 'alltime'} user_session <- function(data, type){ # filter data as per type if(type == "lasthour"){ data <- data %>% filter(server_time >= impression_time - hours(1)) } if(type == "currentperiod"){ data <- data %>% filter((impression_doy == view_doy) & (impression_period == view_period)) } if(type == "today"){ data <- data %>% filter(view_doy == impression_doy) } if(type == "last24hour"){ data <- data %>% filter(server_time >= impression_time - hours(24)) } if(type == "last7days"){ data <- data %>% filter(server_time >= impression_time - days(7)) } if(type == "previous24hour"){ data <- data %>% filter(server_time >= impression_time - days(7+1)) } if(type == "previous7days"){ data <- data %>% filter(server_time >= impression_time - days(7+7)) } if(type == "previous30days"){ data <- data %>% filter(server_time >= impression_time - days(30+7)) } if(type == "alltime"){ data <- data } # generate output data frame session <- data %>% group_by(impression_id, session_id) %>% summarize( min_server_time = min(server_time), max_server_time = max(server_time), session_pageviews = n() ) %>% ungroup() %>% mutate(session_time = as.numeric(as.duration(interval(min_server_time, max_server_time)))) %>% dplyr::select(-min_server_time, -max_server_time) %>% group_by(impression_id) %>% summarise( session_count = n(), page_count = sum(session_pageviews), total_session_time = sum(session_time), max_session_time = max(session_time), avg_session_time = mean(session_time), bounce_rate = sum(session_time == 0) / session_count, avg_page_per_session = sum(session_pageviews) / session_count ) %>% ungroup() item <- data %>% group_by(impression_id) %>% summarize( unique_item_count = n_distinct(item_id), avg_item_price = mean(item_price), max_item_price = max(item_price), min_item_price = min(item_price), unique_product_type = n_distinct(product_type), unique_category_1 = n_distinct(category_1), unique_category_2 = n_distinct(category_2), unique_cagegory_3 = n_distinct(category_3) ) %>% ungroup() output <- session %>% left_join(item, by = "impression_id") output[is.na(output)] <- 0 # reset column names colnames(output) <- c("impression_id", paste0(type, "_", setdiff(names(output), "impression_id"))) # return output data frame return(output) } #----- READING RAW DATA ------ train <- read_csv("train.csv") %>% standardize_column_names() view_log <- read_csv("view_log.csv") %>% standardize_column_names() item_data <- read_csv("item_data.csv") %>% standardize_column_names() # read test data test <- read_csv("test.csv") %>% standardize_column_names() #joining view_log and item_data view_log_item <- view_log %>% left_join(item_data, by = c("item_id")) #checking for missing value na <- missing(view_log_item) #dealing with missing value - since percentage of missing values are low deleting the observations instead of performing any treatment view_log_item <- view_log %>% left_join(item_data, by = c("item_id")) # create summmary statistics using skimr library skim_with(integer = list(hist = NULL)) # train summary_stat_train <- skim(train) summary_stat_view_log_item <- skim(view_log_item) # test summary_stat_test <- skim(test) #----- FEATURE ENGINEERING ----- # dependent variable retrieved and stored dv <- "is_click" id_var <- "impression_id" #different time periods #training data train <- train %>% mutate( impression_date = lubridate::date(impression_time), impression_doy = lubridate::yday(impression_time), impression_hod = lubridate::hour(impression_time), impression_dow = lubridate::wday(impression_time), impression_period = case_when( impression_hod >= 0 & impression_hod < 6 ~ "midnight", impression_hod >= 6 & impression_hod < 12 ~ "morning", impression_hod >= 12 & impression_hod < 16 ~ "afternoon", impression_hod >= 16 & impression_hod <= 23 ~ "evening" ) ) #test_data test <- test %>% mutate( impression_date = lubridate::date(impression_time), impression_doy = lubridate::yday(impression_time), impression_hod = lubridate::hour(impression_time), impression_dow = lubridate::wday(impression_time), impression_period = case_when( impression_hod >= 0 & impression_hod < 6 ~ "midnight", impression_hod >= 6 & impression_hod < 12 ~ "morning", impression_hod >= 12 & impression_hod < 16 ~ "afternoon", impression_hod >= 16 & impression_hod <= 23 ~ "evening" ) ) #view_log data view_log_item <- view_log_item %>% mutate(view_date = lubridate::date(server_time), view_doy = lubridate::yday(server_time), view_hod = lubridate::hour(server_time), view_dow = lubridate::wday(server_time), view_period = case_when( view_hod >= 0 & view_hod < 6 ~ "midnight", view_hod >= 6 & view_hod < 12 ~ "morning", view_hod >= 12 & view_hod < 16 ~ "afternoon", view_hod >= 16 & view_hod <= 23 ~ "evening" ) ) #merge train and view_log_item merged_data <- train %>% left_join(view_log_item, by = "user_id") %>% dplyr::filter(impression_time > (server_time + days(7))) #merge test and view_log_item merged_data_test <- test %>% left_join(view_log_item, by = "user_id") %>% dplyr::filter(impression_time > (server_time + days(7))) train <- train %>% mutate(app_code_os = paste0(app_code, "_", os_version) ,app_code_4g = paste0(app_code, "_", is_4g) ,app_code_hod = paste0(app_code, "_", impression_hod) ,app_code_period = paste0(app_code, "_", impression_period) ,app_code_dow = paste0(app_code, "_", impression_dow) ,app_code_user = paste0(app_code, "_", user_id) ,user_hod = paste0(user_id, "_", impression_hod) ,user_period = paste0(user_id, "_", impression_period) ,user_dow = paste0(user_id, "_", impression_dow) ) test <- test %>% mutate(app_code_os = paste0(app_code, "_", os_version) ,app_code_4g = paste0(app_code, "_", is_4g) ,app_code_hod = paste0(app_code, "_", impression_hod) ,app_code_period = paste0(app_code, "_", impression_period) ,app_code_dow = paste0(app_code, "_", impression_dow) ,app_code_user = paste0(app_code, "_", user_id) ,user_hod = paste0(user_id, "_", impression_hod) ,user_period = paste0(user_id, "_", impression_period) ,user_dow = paste0(user_id, "_", impression_dow) ) train_final <- train %>% left_join(ctr_features(train, 'user_id'), by = "impression_id") %>% left_join(ctr_features(train, 'app_code_user'), by = "impression_id") %>% left_join(ctr_features(train, 'user_hod'), by = "impression_id") %>% left_join(ctr_features(train, 'user_period'), by = "impression_id") %>% left_join(ctr_features(train, 'user_dow'), by = "impression_id") %>% left_join(ctr_calc(train, 'app_code'), by = "app_code") %>% left_join(ctr_calc(train, "app_code_os"), by = "app_code_os") %>% left_join(ctr_calc(train, "app_code_4g"), by = "app_code_4g") %>% left_join(ctr_calc(train, "app_code_hod"), by = "app_code_hod") %>% left_join(ctr_calc(train, "app_code_period"), by = "app_code_period") %>% left_join(ctr_calc(train, "app_code_dow"), by = "app_code_dow") test_final <- test %>% left_join(ctr_calc(train, 'user_id'), by = "user_id") %>% left_join(ctr_calc(train, 'app_code_user'), by = "app_code_user") %>% left_join(ctr_calc(train, 'user_hod'), by = "user_hod") %>% left_join(ctr_calc(train, 'user_period'), by = "user_period") %>% left_join(ctr_calc(train, 'user_dow'), by = "user_dow") %>% left_join(ctr_calc(train, 'app_code'), by = "app_code") %>% left_join(ctr_calc(train, "app_code_os"), by = "app_code_os") %>% left_join(ctr_calc(train, "app_code_4g"), by = "app_code_4g") %>% left_join(ctr_calc(train, "app_code_hod"), by = "app_code_hod") %>% left_join(ctr_calc(train, "app_code_period"), by = "app_code_period") %>% left_join(ctr_calc(train, "app_code_dow"), by = "app_code_dow") # create user features based on type {'lasthour', 'currentperiod', 'today', 'last24hour', 'thisweek', 'last7days', 'alltime'} all <- user_session(merged_data, type = "alltime") system.time(previous24hours <- user_session(merged_data, type = "previous24hours")) previous7days <- user_session(merged_data, type = "previous7days") previous30days <- user_session(merged_data, type = "previous30days") train_final <- train_final %>% left_join(all, by = "impression_id") %>% left_join(previous24hours, by = "impression_id") %>% left_join(previous7days, by = "impression_id") %>% left_join(previous30days, by = "impression_id") # # test all_test <- user_session(merged_data_test, type = "alltime") previous24hours_test <- user_session(merged_data_test, type = "previous24hours") previous7days_test <- user_session(merged_data_test, type = "previous7days") previous30days_test <- user_session(merged_data_test, type = "previous30days") #test test_final <- test_final %>% left_join(all_test, by = "impression_id") %>% left_join(previous24hours_test, by = "impression_id") %>% left_join(previous7days_test, by = "impression_id") %>% left_join(previous30days_test, by = "impression_id") #replacing missing values with 0 train_final[is.na(train_final)] <- 0 test_final[is.na(test_final)] <- 0 # variables of importance iv_list <- names( train_final %>% select(-dv) %>% select(-id_var) %>% select(-contains("_impressions")) %>% select(-contains("_clicks")) %>% select(-c(impression_time, impression_date, user_id, app_code, impression_date)) %>% select(-c(app_code_os, app_code_4g, app_code_hod, app_code_period, app_code_dow, app_code_user)) %>% select(-c(user_hod, user_period, user_period, user_dow)) ) #CTR calculation ctr_check <- lapply( iv_list, function(field) { ctr_calc(train_final, field) }) # INFORMATION VALUE 2 IV2 <- create_infotables( data = train_final %>% select(iv_list, dv), y = dv, parallel = TRUE ) #check correlation plot cor_matrix <- cor(train_final %>% select_if(is.numeric)) # MODEL DATA SELECTION ------------------------------------------------------------------- set.seed(2808) train_final <- train_final %>% mutate_if(is.character, as.factor) test_final <- test_final %>% mutate_if(is.character, as.factor) #training and validation set creation train_index <- sample(1:nrow(train_final), 0.7 * nrow(train_final)) val_index <- setdiff(1:nrow(train_final), train_index) train_train <- train_final[train_index,] train_val <- train_final[val_index,] # variable list to be included in model # iv_list_final <- iv_list iv_list_final <- setdiff(iv_list, c('app_code')) train_train <- train_train %>% select(iv_list_final, dv) table(train_train %>% pull(dv)) train_val <- train_val %>% select(iv_list_final, dv) table(train_val %>% pull(dv)) test_test <- test_final %>% select(iv_list_final) # select model data model_data_train <- train_train model_data_val <- train_val model_data_test <- test_test dim(model_data_train) table(model_data_train$is_click) #-------- MODEL DATA BUILDING ------------ dtrain <- xgb.DMatrix(data = data.matrix(model_data_train %>% select(-dv)), label = as.numeric(as.character(model_data_train %>% pull(dv)))) dval <- xgb.DMatrix(data = data.matrix(model_data_val %>% select(-dv)), label = model_data_val %>% pull(dv)) dtest <- xgb.DMatrix(data = data.matrix(model_data_test)) #-------- HYPER PARAMETER TUNING ----------- search_grid <- expand.grid(eta = c(0.1, 0.2) ,max_depth = c(3, 4) ,subsample = c(0.8, 0.9, 1.0) ,colsample_bytree = c(0.8, 0.9, 1.0) ) nrounds = 1000 scale_pos_weight = round(sum(model_data_train$is_click == 0)/sum(model_data_train$is_click == 1),0) system.time(auc_error_hyperparameter <- apply(search_grid, 1, function(parameter_list){ #extract current parameters current_eta <- parameter_list[["eta"]] current_depth <- parameter_list[["max_depth"]] current_subsample <- parameter_list[["subsample"]] current_colsample <- parameter_list[["colsample_bytree"]] # print parameters print(c(current_eta, current_depth, current_subsample, current_colsample)) # create parameter list params <- list(booster = "gbtree" ,objective = "binary:logistic" ,eval_metric = 'auc' ,eta = current_eta ,max_depth = current_depth ,subsample = current_subsample ,colsample_bytree = current_colsample ,scale_pos_weight = scale_pos_weight ) # fit cross validation model with current parameters xg.fit.current <- xgb.cv(params = params ,data = dtrain ,nrounds = nrounds ,nfold = 5 ,showsd = TRUE ,verbose = TRUE ,stratified = TRUE ,maximize = TRUE ,print_every_n = 10 ,early_stopping_rounds = 10) # obtain cross-validated nrounds current_nrounds <- min(xg.fit.current$evaluation_log[xg.fit.current$evaluation_log$test_auc_mean == max(xg.fit.current$evaluation_log$test_auc_mean)]$iter) #obtain optimum test_score and train_score xg_validation_scores <- as.data.frame(xg.fit.current$evaluation_log) test_score <- tail(xg_validation_scores$test_auc_mean, 1) train_score <- tail(xg_validation_scores$train_auc_mean,1) #create output vector output <- return(c(test_score, train_score, current_nrounds, current_eta, current_depth, current_subsample, current_colsample)) })) # select the iteration with optimum test auc best_auc <- auc_error_hyperparameter[,which(auc_error_hyperparameter[1,] == max(auc_error_hyperparameter[1,]))] # set final parameters final_params <- list(booster = "gbtree" ,objective = "binary:logistic" ,eval_metric = "auc" ,eta = best_auc[4] ,max_depth = best_auc[5] ,subsample = best_auc[6] ,colsample_bytree = best_auc[7] ,scale_pos_weight = scale_pos_weight ) #--------------- MODEL FITTING BASED ON OPTIMAL HYPERPARAMETERS -------------------- # nrounds cross validation with final parameters xg.fit.cv <- xgb.cv(params = final_params ,data = dtrain ,nrounds = nrounds ,nfold = 5 ,showsd = TRUE ,verbose = TRUE ,stratified = TRUE ,maximize = TRUE ,print_every_n = 10 ,early_stopping_rounds = 10 ,prediction = T) nrounds_final <- xg.fit.cv$evaluation_log[xg.fit.cv$evaluation_log$test_auc_mean == max(xg.fit.cv$evaluation_log$test_auc_mean)]$iter # final model xg.fit <- xgb.train(params = final_params ,data = dtrain ,nrounds = nrounds_final ,nfold = 5 ,stratified = TRUE ,print_every_n = 10 ,watchlist = list(train = dtrain, val = dval)) #-------- MODEL VALIDATION ---------- # variable importance var_importance <- xgb.importance(feature_names = colnames(model_data_train), model = xg.fit) # plot variable importance xgb.plot.importance(importance_matrix = var_importance[1:25]) # ROC CURVE for validation data train_val$pred_prob <- predict(xg.fit, dval) #build roc object roc_val <- pROC::roc(response = as.numeric(as.character(train_val %>% pull(dv))), predictor = train_val %>% pull(pred_prob), print.auc = TRUE, print.auc.col = "red", # arguments for ci - not necessary, but if used, gives confidence interval for AUC ci = TRUE, boot.n = 100, ci.alpha = 0.9, stratified = FALSE, # arguments for plot - not necessary, but if used, plots the ROC curve and adds some aesthetics plot = TRUE, auc.polygon = TRUE, auc.polygon.col = "white", max.auc.polygon = TRUE, grid = TRUE, show.thres = TRUE, main = "ROC CURVE") # get all roc metrics roc_all <- pROC::coords( roc_val, "all", ret = c("threshold", "specificity", "sensitivity", "accuracy", "tn", "tp", "fn", "fp", "npv", "ppv", "1-specificity", "1-sensitivity", "1-accuracy", "1-npv", "1-ppv"), transpose = FALSE ) #add best threshold in the plot plot(ci(roc_val, of = "thresholds", thresholds = "best")) # get the best threshold, where true positive rate(tpr) and 1 - false positive rate(fpr) overlap cutoff_best <- coords(roc_val, x = "best")[1] # for getting best f1 score based threshold, let us create f1 score in roc_all roc_all$f1 <- (2 * roc_all$ppv * roc_all$sensitivity)/(roc_all$ppv + roc_all$sensitivity) # pick the threshold for best f1 score cutoff_best_f1 <- roc_all[which(roc_all$f1 == max(roc_all$f1, na.rm = TRUE)),]$threshold # validation prediction based on best cutoff obtained train_val$pred <- ifelse(predict(xg.fit, dval) > cutoff_best, 1, 0) auc(train_val %>% pull(dv), train_val$pred) table(train_val %>% pull(dv), train_val$pred) # create final cutoff for test data set cutoff_final <- cutoff_best #------ output file -------- # output file test$pred <- ifelse(predict(xg.fit, dtest) > cutoff_final, 1, 0) output <- data.frame(impression_id = test %>% pull(id_var), is_click = test$pred) write.csv(output, "output_1_2_alldata.csv", row.names = FALSE) write.csv( data.frame(parameter = c('test_auc','train_auc', 'nrounds','eta', 'max_depth', 'subsample', 'colsample_bytree'), value = best_auc), "best_hyperparameter_1_2.csv", row.names = FALSE ) write.csv( auc_error_hyperparameter, "all_hyperparameter_1_2.csv", row.names = FALSE )