{
"cells": [
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV, KFold\n",
"from sklearn.metrics import roc_auc_score\n",
"import xgboost\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"%matplotlib inline\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"from sklearn.preprocessing import StandardScaler,LabelEncoder\n",
"from sklearn import metrics"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"train_data = pd.read_csv('train_amex/train.csv')\n",
"historic_data = pd.read_csv('train_amex/historical_user_logs.csv')\n",
"test_data = pd.read_csv('test_LNMuIYp/test.csv')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"train_copy = train_data.copy(deep=True)\n",
"test_copy = test_data.copy(deep=True)\n",
"historic_copy = historic_data.copy(deep=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# A helper function for writing predictions to a file\n",
"def write_to_submission_file(predicted_labels, out_file,\n",
" target='is_click', index_label=\"session_id\"):\n",
" test_index = test_combined['session_id']\n",
" predicted_df = pd.DataFrame(predicted_labels,\n",
" index = test_index,\n",
" columns=[target])\n",
" predicted_df.to_csv(out_file, index_label=index_label)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def call_CV(X,y):\n",
" params={\n",
" 'random_state':17,\n",
" 'n_estimators':200,\n",
" 'objective': 'binary:logistic',\n",
" 'learning_rate': 0.01,\n",
" 'max_depth':7,\n",
" }\n",
" \n",
" xgb = xgboost.XGBClassifier(**params)\n",
" kfold = KFold(n_splits=5)\n",
" cv_scores = cross_val_score(xgb, X, y, cv=kfold, \n",
" scoring='roc_auc', n_jobs=-1)\n",
" return(cv_scores.mean())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Check the data is loaded**"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"session_id 0\n",
"DateTime 0\n",
"user_id 0\n",
"product 0\n",
"campaign_id 0\n",
"webpage_id 0\n",
"product_category_1 0\n",
"product_category_2 365854\n",
"user_group_id 18243\n",
"gender 18243\n",
"age_level 18243\n",
"user_depth 18243\n",
"city_development_index 125129\n",
"var_1 0\n",
"is_click 0\n",
"dtype: int64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_data.head(3)\n",
"test_data.head(3)\n",
"historic_data.head(3)\n",
"train_data.apply(lambda x: sum(x.isnull()))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Number of clicks by Gender** :
\n",
"\n",
"0 - Male
\n",
"1 - Female"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"#train_copy.groupby('gender',as_index=False).sum()['is_click'].plot.bar()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"******"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Number of clicks and non clicks for each gender** \n",
"
\n",
"\n",
"Approximately 7% of men and women have clicked on ads ."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"clicks_gender = train_copy.groupby(['gender','is_click']).count()['session_id']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"% of time a male clicks and % of time a female clicks"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"clicks_gender.groupby(level=0).apply(lambda x : x / x.sum())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"*****"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Clicks by webpage_id**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.figure(figsize=(12,8))\n",
"train_copy.groupby(['webpage_id','is_click']).count()['session_id'].groupby(level=0).apply(lambda x : (x / x.sum())*100).plot.barh()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"*******"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Were few ad campaigns more successful than others ?**
\n",
"\n",
"Some ad campaigns have more clicks as seen in the graph below"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.figure(figsize=(12,8))\n",
"train_copy.groupby(['campaign_id','is_click']).count()['session_id'].groupby(level=0). \\\n",
"apply(lambda x : (x / x.sum())*100).plot.barh(cmap='viridis')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_sorted = train_copy.sort_values('DateTime')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_sorted['hour'] = pd.to_datetime(train_sorted['DateTime']).apply(lambda x: x.hour)\n",
"train_sorted['day_of_week'] = pd.to_datetime(train_sorted['DateTime']).apply(lambda x: x.dayofweek)\n",
"\n",
"#---------------------------------------------------------\n",
"\n",
"test_data['hour'] = pd.to_datetime(test_data['DateTime']).apply(lambda x: x.hour)\n",
"test_data['day_of_week'] = pd.to_datetime(test_data['DateTime']).apply(lambda x: x.dayofweek)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#train_sorted.groupby(['day_of_week']).sum()['is_click'].plot()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"*****"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#train_sorted.groupby(['user_group_id','is_click'],as_index=False).count()[['session_id','user_group_id','is_click']]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"*****"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Clicks based on Products? "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Historic Data"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" DateTime | \n",
" user_id | \n",
" product | \n",
" action | \n",
" user_prod | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2017-05-28 15:44 | \n",
" 704 | \n",
" B | \n",
" view | \n",
" 704_B | \n",
"
\n",
" \n",
" 1 | \n",
" 2017-05-29 07:08 | \n",
" 499679 | \n",
" F | \n",
" view | \n",
" 499679_F | \n",
"
\n",
" \n",
" 2 | \n",
" 2017-05-29 07:10 | \n",
" 499679 | \n",
" G | \n",
" view | \n",
" 499679_G | \n",
"
\n",
" \n",
" 3 | \n",
" 2017-05-29 07:10 | \n",
" 499679 | \n",
" G | \n",
" view | \n",
" 499679_G | \n",
"
\n",
" \n",
" 4 | \n",
" 2017-05-29 07:10 | \n",
" 499679 | \n",
" G | \n",
" view | \n",
" 499679_G | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" DateTime user_id product action user_prod\n",
"0 2017-05-28 15:44 704 B view 704_B\n",
"1 2017-05-29 07:08 499679 F view 499679_F\n",
"2 2017-05-29 07:10 499679 G view 499679_G\n",
"3 2017-05-29 07:10 499679 G view 499679_G\n",
"4 2017-05-29 07:10 499679 G view 499679_G"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"historic_data['user_prod'] = historic_data['user_id'].astype('str') + '_' + historic_data['product'].astype('str')\n",
"historic_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"int_by_prod = historic_data[historic_data['action'] == 'interest'].groupby('user_prod',as_index=False).count()[['action','user_prod']]\n",
"total_action = historic_data.groupby('user_prod',as_index=False).count()[['action','user_prod']]\n",
"total_action.columns = ['total_action','user_prod']\n",
"total_user_action = historic_data.groupby('user_id',as_index=False).count()[['action','user_id']]\n",
"total_user_action.columns = ['total_user_action','user_id']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Number of times a user has expressed interest in any product in the past"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prepare the data for the model"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"train_copy['user_prod'] = train_copy['user_id'].astype('str') + '_' + train_copy['product'].astype('str')\n",
"test_copy['user_prod'] = test_copy['user_id'].astype('str') + '_' + test_copy['product'].astype('str')\n",
"\n",
"train_combined = train_copy.sort_values('DateTime')\n",
"\n",
"train_combined = pd.merge(train_copy,int_by_prod,how='left',on='user_prod').\\\n",
" merge(total_action,on='user_prod',how='left').merge(total_user_action,on='user_id',how='left')\n",
" \n",
"test_combined = pd.merge(test_copy,int_by_prod,how='left',on='user_prod').\\\n",
" merge(total_action,on='user_prod',how='left').merge(total_user_action,on='user_id',how='left')"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {},
"outputs": [],
"source": [
"pcat = [3,7,1]\n",
"camp_id = [405490,359520]\n",
"train_combined['hour'] = pd.to_datetime(train_combined['DateTime']).apply(lambda x: x.hour)\n",
"train_combined['day_of_week'] = pd.to_datetime(train_combined['DateTime']).apply(lambda x: x.dayofweek)\n",
"train_combined['pcat'] = train_combined['product_category_1'].\\\n",
" apply(lambda x: 1 if x in pcat else 0)\n",
"train_combined['campid'] = train_combined['campaign_id'].\\\n",
" apply(lambda x: 1 if x in camp_id else 0)\n",
"\n",
"#---------------------------------------------------------\n",
"\n",
"test_combined['hour'] = pd.to_datetime(test_combined['DateTime']).apply(lambda x: x.hour)\n",
"test_combined['day_of_week'] = pd.to_datetime(test_combined['DateTime']).apply(lambda x: x.dayofweek)\n",
"test_combined['pcat'] = test_combined['product_category_1']\\\n",
" .apply(lambda x: 1 if x in pcat else 0)\n",
"test_combined['campid'] = test_combined['campaign_id'].\\\n",
" apply(lambda x: 1 if x in camp_id else 0)"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['session_id', 'DateTime', 'user_id', 'product', 'campaign_id',\n",
" 'webpage_id', 'product_category_1', 'product_category_2',\n",
" 'user_group_id', 'gender', 'age_level', 'user_depth',\n",
" 'city_development_index', 'var_1', 'is_click', 'user_prod', 'action',\n",
" 'total_action', 'total_user_action', 'hour', 'day_of_week', 'camp82320',\n",
" 'camp98970', 'camp105960', 'camp118601', 'camp359520', 'camp360936',\n",
" 'camp396664', 'camp404347', 'camp405490', 'pcat', 'campid'],\n",
" dtype='object')"
]
},
"execution_count": 118,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_combined.columns"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [],
"source": [
"train_combined['action'] = train_combined['action'].fillna(0)\n",
"train_combined['total_action'] = train_combined['total_action'].fillna(0)\n",
"train_combined['total_user_action'] = train_combined['total_user_action'].fillna(0)\n",
"\n",
"X_train = train_combined[['campaign_id','webpage_id','user_group_id','total_user_action','total_action',\n",
" 'var_1','product_category_1','pcat','campid'\n",
" ,'product','age_level','action','user_depth','hour','day_of_week']]\n",
"\n",
"#X_train['action'] = StandardScaler().fit_transform(X_train[['action']])\n",
"#X_train['total_action'] = StandardScaler().fit_transform(X_train[['total_action']])\n",
"#X_train['total_user_action'] = StandardScaler().fit_transform(X_train[['total_user_action']])\n",
"\n",
"X_train['age_level'] = X_train['age_level'].fillna(0)\n",
"\n",
"X_train_dummies = pd.get_dummies(X_train,columns=['age_level','campaign_id','webpage_id','user_group_id',\n",
" 'product','user_depth'],drop_first=True)\n",
"y_train = train_combined['is_click']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {},
"outputs": [],
"source": [
"X_test = test_combined[['campaign_id','webpage_id','user_group_id','total_user_action','total_action',\n",
" 'var_1','product_category_1','pcat','campid'\n",
" ,'product','age_level','action','user_depth','hour','day_of_week']]\n",
"\n",
"X_test['action'] = X_test['action'].fillna(0)\n",
"X_test['total_action'] = X_test['total_action'].fillna(0)\n",
"X_test['total_user_action'] = X_test['total_user_action'].fillna(0)\n",
"\n",
"#X_test['action'] = StandardScaler().fit_transform(X_test[['action']])\n",
"#X_test['total_action'] = StandardScaler().fit_transform(X_test[['total_action']])\n",
"#X_test['total_user_action'] = StandardScaler().fit_transform(X_test[['total_user_action']])\n",
"\n",
"X_test_dummies = pd.get_dummies(X_test,columns=['age_level','campaign_id','webpage_id','user_group_id',\n",
" 'product','user_depth'],drop_first=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Logistic Regression"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"params={\n",
" 'random_state':17,\n",
" 'n_estimators':200,\n",
" 'objective': 'binary:logistic',\n",
" 'learning_rate': 0.01,\n",
" 'max_depth':7,\n",
"}\n",
"xgb = xgboost.XGBClassifier(**params)\n",
"xgb.fit(X_train_dummies,y_train)\n",
"logit_test_pred = xgb.predict_proba(X_test_dummies)[:, 1]\n",
"write_to_submission_file(logit_test_pred, '1118_xg_5658CV.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [],
"source": [
"import lightgbm as lgb"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.662960208971848"
]
},
"execution_count": 122,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = lgb.LGBMClassifier(boosting='dart',num_leaves=150,\n",
" learning_rate=0.05, n_estimators=150, max_depth=15,\n",
" metric='auc',is_training_metric=True,\n",
" max_bin = 700, bagging_fraction = 0.8,verbose=-1,\n",
" bagging_freq = 5, feature_fraction = 1.0)\n",
"## model training and prediction\n",
"model.fit(X_train_dummies,y_train)\n",
"train_preds = model.predict_proba(X_train_dummies)[:,1]\n",
"metrics.roc_auc_score(y_train,train_preds)\n",
"#63.10 default\n",
"#63.48 200 estimators"
]
},
{
"cell_type": "code",
"execution_count": 124,
"metadata": {},
"outputs": [],
"source": [
"pred1 = model.predict_proba(X_test_dummies)[:,1]\n",
"write_to_submission_file(pred1, '1118_lgbm_10.csv')"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(101518, 11)"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test.shape"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" session_id | \n",
" DateTime | \n",
" user_id | \n",
" product | \n",
" campaign_id | \n",
" webpage_id | \n",
" product_category_1 | \n",
" product_category_2 | \n",
" user_group_id | \n",
" gender | \n",
" age_level | \n",
" user_depth | \n",
" city_development_index | \n",
" var_1 | \n",
" is_click | \n",
" user_prod | \n",
" action | \n",
" total_action | \n",
" total_user_action | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 140690 | \n",
" 2017-07-02 00:00 | \n",
" 858557 | \n",
" C | \n",
" 359520 | \n",
" 13787 | \n",
" 4 | \n",
" NaN | \n",
" 10.0 | \n",
" Female | \n",
" 4.0 | \n",
" 3.0 | \n",
" 3.0 | \n",
" 0 | \n",
" 0 | \n",
" 858557_C | \n",
" NaN | \n",
" 5 | \n",
" 627 | \n",
"
\n",
" \n",
" 1 | \n",
" 252642 | \n",
" 2017-07-06 22:09 | \n",
" 858557 | \n",
" I | \n",
" 396664 | \n",
" 51181 | \n",
" 1 | \n",
" NaN | \n",
" 10.0 | \n",
" Female | \n",
" 4.0 | \n",
" 3.0 | \n",
" 3.0 | \n",
" 0 | \n",
" 0 | \n",
" 858557_I | \n",
" NaN | \n",
" 6 | \n",
" 627 | \n",
"
\n",
" \n",
" 2 | \n",
" 333291 | \n",
" 2017-07-02 00:00 | \n",
" 243253 | \n",
" C | \n",
" 105960 | \n",
" 11085 | \n",
" 5 | \n",
" NaN | \n",
" 8.0 | \n",
" Female | \n",
" 2.0 | \n",
" 2.0 | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
" 243253_C | \n",
" NaN | \n",
" 11 | \n",
" 62 | \n",
"
\n",
" \n",
" 3 | \n",
" 129781 | \n",
" 2017-07-02 00:00 | \n",
" 243253 | \n",
" C | \n",
" 359520 | \n",
" 13787 | \n",
" 4 | \n",
" NaN | \n",
" 8.0 | \n",
" Female | \n",
" 2.0 | \n",
" 2.0 | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
" 243253_C | \n",
" NaN | \n",
" 11 | \n",
" 62 | \n",
"
\n",
" \n",
" 4 | \n",
" 21726 | \n",
" 2017-07-02 00:02 | \n",
" 243253 | \n",
" C | \n",
" 360936 | \n",
" 13787 | \n",
" 5 | \n",
" NaN | \n",
" 8.0 | \n",
" Female | \n",
" 2.0 | \n",
" 2.0 | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
" 243253_C | \n",
" NaN | \n",
" 11 | \n",
" 62 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" session_id DateTime user_id product campaign_id webpage_id \\\n",
"0 140690 2017-07-02 00:00 858557 C 359520 13787 \n",
"1 252642 2017-07-06 22:09 858557 I 396664 51181 \n",
"2 333291 2017-07-02 00:00 243253 C 105960 11085 \n",
"3 129781 2017-07-02 00:00 243253 C 359520 13787 \n",
"4 21726 2017-07-02 00:02 243253 C 360936 13787 \n",
"\n",
" product_category_1 product_category_2 user_group_id gender age_level \\\n",
"0 4 NaN 10.0 Female 4.0 \n",
"1 1 NaN 10.0 Female 4.0 \n",
"2 5 NaN 8.0 Female 2.0 \n",
"3 4 NaN 8.0 Female 2.0 \n",
"4 5 NaN 8.0 Female 2.0 \n",
"\n",
" user_depth city_development_index var_1 is_click user_prod action \\\n",
"0 3.0 3.0 0 0 858557_C NaN \n",
"1 3.0 3.0 0 0 858557_I NaN \n",
"2 2.0 NaN 0 0 243253_C NaN \n",
"3 2.0 NaN 0 0 243253_C NaN \n",
"4 2.0 NaN 0 0 243253_C NaN \n",
"\n",
" total_action total_user_action \n",
"0 5 627 \n",
"1 6 627 \n",
"2 11 62 \n",
"3 11 62 \n",
"4 11 62 "
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_combined.head()"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"model = lgb.LGBMClassifier(boosting='dart',num_leaves=150,\\n learning_rate=0.05, n_estimators=17000, max_depth=15,\\n metric='auc',is_training_metric=True,\\n max_bin = 700, bagging_fraction = 0.8,verbose=-1,\\n bagging_freq = 5, feature_fraction = 1.0)\\n## model training and prediction\\nmodel.fit(X_train_dummies,y_train)\\npred1 = model.predict_proba(X_test_dummies)[:,1]\""
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'''model = lgb.LGBMClassifier(boosting='dart',num_leaves=150,\n",
" learning_rate=0.05, n_estimators=17000, max_depth=15,\n",
" metric='auc',is_training_metric=True,\n",
" max_bin = 700, bagging_fraction = 0.8,verbose=-1,\n",
" bagging_freq = 5, feature_fraction = 1.0)\n",
"## model training and prediction\n",
"model.fit(X_train_dummies,y_train)\n",
"pred1 = model.predict_proba(X_test_dummies)[:,1]'''"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(463291, 16)"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_copy.shape"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(463291, 19)"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_combined.shape"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(128858, 51)"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_dummies.shape"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"campaign_id\n",
"414149 1710\n",
"105960 1762\n",
"82320 1776\n",
"396664 1809\n",
"118601 1896\n",
"404347 2235\n",
"360936 2346\n",
"98970 2694\n",
"359520 6340\n",
"405490 8763\n",
"Name: session_id, dtype: int64"
]
},
"execution_count": 116,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_combined[train_combined['is_click'] == 1].groupby(['campaign_id']).count()['session_id'].sort_values()"
]
},
{
"cell_type": "code",
"execution_count": 125,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['session_id', 'DateTime', 'user_id', 'product', 'campaign_id',\n",
" 'webpage_id', 'product_category_1', 'product_category_2',\n",
" 'user_group_id', 'gender', 'age_level', 'user_depth',\n",
" 'city_development_index', 'var_1', 'is_click', 'user_prod', 'action',\n",
" 'total_action', 'total_user_action', 'hour', 'day_of_week', 'camp82320',\n",
" 'camp98970', 'camp105960', 'camp118601', 'camp359520', 'camp360936',\n",
" 'camp396664', 'camp404347', 'camp405490', 'pcat', 'campid'],\n",
" dtype='object')"
]
},
"execution_count": 125,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}