# -*- coding: utf-8 -*-
"""
Created on Sat Nov 17 09:03:58 2018

@author: Sri.Venkatesh
"""
import os

import pandas
# sklearn.cross_validation was removed; train_test_split now lives in model_selection
from sklearn.model_selection import train_test_split

os.chdir('C:/Users/sri.venkatesh/Desktop/Work/Kaggle/AV/AmEx')

# Stack the given train data and the leaderboard (test) data with a flag, so the
# one-hot encoding below sees every categorical level present in either file.
Given_Train_Data = pandas.read_csv("./train.csv")
Given_Train_Data['Test_Train_Flag'] = 'Given_Train_Data'

LB_Data = pandas.read_csv("./test.csv")
LB_Data['Test_Train_Flag'] = 'Given_LB_Data'

Entire_Data = pandas.concat([Given_Train_Data, LB_Data], ignore_index=True)
Entire_Data = Entire_Data[Given_Train_Data.columns.tolist()]

user_hist_data = pandas.read_csv('./historical_user_logs.csv')

# Processing categorical variables: fill missing values with an explicit
# "unknown" level before one-hot encoding.
Entire_Data['gender'] = Entire_Data['gender'].fillna('Null')
Entire_Data['user_group_id'] = Entire_Data['user_group_id'].fillna(999)
Entire_Data['user_depth'] = Entire_Data['user_depth'].fillna(999)
Entire_Data['city_development_index'] = Entire_Data['city_development_index'].fillna(999)

product_dummies = pandas.get_dummies(Entire_Data['product'], prefix='product')
campaign_id_dummies = pandas.get_dummies(Entire_Data['campaign_id'], prefix='campaign_id')
user_group_id_dummies = pandas.get_dummies(Entire_Data['user_group_id'], prefix='user_group_id')
user_depth_dummies = pandas.get_dummies(Entire_Data['user_depth'], prefix='user_depth')
gender_dummies = pandas.get_dummies(Entire_Data['gender'], prefix='gender')
city_development_index_dummies = pandas.get_dummies(Entire_Data['city_development_index'], prefix='city_development_index')

# Join the encoded columns back onto the stacked frame
Entire_Data = Entire_Data.join(product_dummies)
Entire_Data = Entire_Data.join(campaign_id_dummies)
Entire_Data = Entire_Data.join(user_group_id_dummies)
Entire_Data = Entire_Data.join(user_depth_dummies)
Entire_Data = Entire_Data.join(gender_dummies)
Entire_Data = Entire_Data.join(city_development_index_dummies)
del product_dummies, campaign_id_dummies, user_group_id_dummies, user_depth_dummies, gender_dummies, city_development_index_dummies

# Move the flag and target columns to the front so the feature matrix can be
# sliced positionally (iloc[:, 15:]) further down.
cols = Entire_Data.columns.tolist()
cols = cols[15:16] + cols[14:15] + cols[:14] + cols[16:]
Entire_Data = Entire_Data[cols]

# Aggregate the historical logs: one row per (user_id, product) holding the
# count of each action ('view' / 'interest').
user_hist_data_agg2 = user_hist_data.pivot_table(values='DateTime', index=['user_id', 'product'],
                                                 columns=['action'], aggfunc='size',
                                                 fill_value=0).reset_index()

Entire_Data = pandas.merge(Entire_Data, user_hist_data_agg2, on=['user_id', 'product'], how='left')
Entire_Data['interest'] = Entire_Data['interest'].fillna(0)
Entire_Data['view'] = Entire_Data['view'].fillna(0)
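# Illustration only (a toy frame, not part of the pipeline): a minimal sketch
# of what the pivot above does to the long action log.
_toy_logs = pandas.DataFrame({'user_id': [1, 1, 2],
                              'product': ['A', 'A', 'B'],
                              'action': ['view', 'view', 'interest'],
                              'DateTime': ['d1', 'd2', 'd3']})
print(_toy_logs.pivot_table(values='DateTime', index=['user_id', 'product'],
                            columns=['action'], aggfunc='size', fill_value=0).reset_index())
# -> (user 1, product A): view=2, interest=0; (user 2, product B): view=0, interest=1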
# Interaction features: per-product view/interest counts from the historical
# logs (zero when the impression's product differs or the user has no history).
for prod in 'ABCDEFGHIJ':
    Entire_Data['product_%s_view' % prod] = Entire_Data['product_%s' % prod] * Entire_Data['view']
for prod in 'ABCDEFGHIJ':
    Entire_Data['product_%s_interest' % prod] = Entire_Data['product_%s' % prod] * Entire_Data['interest']

# Drop one reference level per encoded variable, plus the raw counts now that
# the interactions are built.
Entire_Data = Entire_Data.drop(['product_J', 'campaign_id_414149', 'user_group_id_999.0',
                                'user_depth_999.0', 'gender_Null', 'city_development_index_999.0',
                                'view', 'interest'], axis=1)

#Entire_Data.head(5).to_csv('train_all_cols_sample.csv', index=None)

X = Entire_Data[Entire_Data['Test_Train_Flag'] == 'Given_Train_Data'].iloc[:, 15:]#.values
y = Entire_Data[Entire_Data['Test_Train_Flag'] == 'Given_Train_Data'].iloc[:, 1].values

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
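# Side note (a sketch, not used in the run above): the split is unstratified;
# with a rare positive class one might instead keep the click rate comparable
# across the two splits:
#   X_train, X_test, y_train, y_test = train_test_split(
#       X, y, test_size=0.25, random_state=0, stratify=y)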
# Fitting a basket of classifiers to the Training set
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

LR_classifier = LogisticRegression(random_state=1)
KNN_classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
SVM_classifier = SVC(kernel='linear', random_state=0)
SVMK_classifier = SVC(kernel='rbf', random_state=0)
NB_classifier = GaussianNB()
DT_classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
RF_classifier = RandomForestClassifier(n_estimators=500, criterion='entropy', random_state=0)

LR_classifier.fit(X_train, y_train)
#KNN_classifier.fit(X_train, y_train)
#SVM_classifier.fit(X_train, y_train)
#SVMK_classifier.fit(X_train, y_train)
NB_classifier.fit(X_train, y_train)
DT_classifier.fit(X_train, y_train)
RF_classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = LR_classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm1 = confusion_matrix(y_test, LR_classifier.predict(X_test))
#cm2 = confusion_matrix(y_test, KNN_classifier.predict(X_test))
#cm3 = confusion_matrix(y_test, SVM_classifier.predict(X_test))
#cm4 = confusion_matrix(y_test, SVMK_classifier.predict(X_test))
cm5 = confusion_matrix(y_test, NB_classifier.predict(X_test))
cm6 = confusion_matrix(y_test, DT_classifier.predict(X_test))
cm7 = confusion_matrix(y_test, RF_classifier.predict(X_test))

from sklearn.metrics import classification_report
print(classification_report(y_test, LR_classifier.predict(X_test)))
#print(classification_report(y_test, KNN_classifier.predict(X_test)))
#print(classification_report(y_test, SVM_classifier.predict(X_test)))
#print(classification_report(y_test, SVMK_classifier.predict(X_test)))
print(classification_report(y_test, NB_classifier.predict(X_test)))
print(classification_report(y_test, DT_classifier.predict(X_test)))
print(classification_report(y_test, RF_classifier.predict(X_test)))

# Train vs. test AUC for each fitted model. Note these are computed on hard 0/1
# predictions; a probability-based variant is sketched at the end of the file.
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_train, LR_classifier.predict(X_train)))
print(roc_auc_score(y_test, LR_classifier.predict(X_test)))
#print(roc_auc_score(y_train, KNN_classifier.predict(X_train)))
#print(roc_auc_score(y_test, KNN_classifier.predict(X_test)))
#print(roc_auc_score(y_train, SVM_classifier.predict(X_train)))
#print(roc_auc_score(y_test, SVM_classifier.predict(X_test)))
#print(roc_auc_score(y_train, SVMK_classifier.predict(X_train)))
#print(roc_auc_score(y_test, SVMK_classifier.predict(X_test)))
print(roc_auc_score(y_train, NB_classifier.predict(X_train)))
print(roc_auc_score(y_test, NB_classifier.predict(X_test)))
print(roc_auc_score(y_train, DT_classifier.predict(X_train)))
print(roc_auc_score(y_test, DT_classifier.predict(X_test)))
print(roc_auc_score(y_train, RF_classifier.predict(X_train)))
print(roc_auc_score(y_test, RF_classifier.predict(X_test)))

from sklearn.ensemble import GradientBoostingClassifier
GB_classifier = GradientBoostingClassifier(random_state=1)
GB_classifier.fit(X_train, y_train)
cm10 = confusion_matrix(y_test, GB_classifier.predict(X_test))
print(classification_report(y_test, GB_classifier.predict(X_test)))

import xgboost as xgb

xgb_params = {
    'eta': 0.1,
    'max_depth': 10,
    'subsample': 0.6,
    'colsample_bytree': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'verbosity': 0,  # the old 'silent' flag is deprecated in recent xgboost
}

dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(X_test)

#cv_output = xgb.cv(xgb_params, dtrain, num_boost_round=300, early_stopping_rounds=20,
#                   verbose_eval=True, show_stdv=False)
#cv_output[['train-auc-mean', 'test-auc-mean']].plot()

# Grid search over tree depth and child weight. sklearn.grid_search was removed;
# GridSearchCV now lives in model_selection (and the old iid= argument is gone).
from sklearn.model_selection import GridSearchCV
from xgboost.sklearn import XGBClassifier

param_test1 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2),
}
gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
                                                min_child_weight=1, gamma=0, subsample=0.8,
                                                colsample_bytree=0.8, objective='binary:logistic',
                                                nthread=4, scale_pos_weight=1, seed=27),
                        param_grid=param_test1, scoring='roc_auc', n_jobs=4, cv=5)
gsearch1.fit(X_train, y_train)
# grid_scores_ was removed from GridSearchCV; cv_results_ is its replacement
print(gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_)

num_boost_rounds = 250
model = xgb.train(xgb_params, dtrain, num_boost_round=num_boost_rounds)
y_predict = model.predict(dtest)

#import lightgbm as lgb

# Leaderboard data predictions
X_LB = Entire_Data[Entire_Data['Test_Train_Flag'] == 'Given_LB_Data'].iloc[:, 15:].values
X_LB = sc.transform(X_LB)
X_LB = xgb.DMatrix(X_LB)
y_pred_df_lb = pandas.DataFrame(model.predict(X_LB))

LB_Data = pandas.read_csv("./test.csv")
LB_Data['Test_Train_Flag'] = 'Given_LB_Data'
LB_Data = pandas.concat([LB_Data[['session_id']], y_pred_df_lb], axis=1)
LB_Data.to_csv('test15.csv', index=None)
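# ---- Optional follow-ups (sketches, not part of the original run) ----

# 1) Probability-based AUC for the booster: under binary:logistic, predict()
# already returns probabilities, so the held-out AUC is directly comparable to
# the leaderboard metric.
print(roc_auc_score(y_test, model.predict(dtest)))

# 2) Folding the tuned depth/child-weight back into the final booster.
# 'tuned_params' and 'tuned_model' are hypothetical names: the original script
# fits the grid search but never consumes gsearch1.best_params_.
tuned_params = dict(xgb_params, **gsearch1.best_params_)
tuned_model = xgb.train(tuned_params, dtrain, num_boost_round=num_boost_rounds)
print(roc_auc_score(y_test, tuned_model.predict(dtest)))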