{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np # linear algebra\n",
"import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
"\n",
"# Input data files are available in the \"../input/\" directory.\n",
"# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n",
"\n",
"import os\n",
"#for dirname, _, filenames in os.walk('/Desktop'): #walk('/kaggle/input'):\n",
" #for filename in filenames:\n",
" #print(os.path.join(dirname, filename)) #how to get this"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" impression_id | \n",
" impression_time | \n",
" user_id | \n",
" app_code | \n",
" os_version | \n",
" is_4G | \n",
" is_click | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" c4ca4238a0b923820dcc509a6f75849b | \n",
" 2018-11-15 00:00:00 | \n",
" 87862 | \n",
" 422 | \n",
" old | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" impression_id impression_time user_id app_code \\\n",
"0 c4ca4238a0b923820dcc509a6f75849b 2018-11-15 00:00:00 87862 422 \n",
"\n",
" os_version is_4G is_click \n",
"0 old 0 0 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train = pd.read_csv('train.csv')\n",
"test = pd.read_csv('test.csv')\n",
"train.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.preprocessing import LabelEncoder\n",
"from tqdm import tqdm_notebook # what is this?\n",
"\n",
"cats = ['user_id','app_code','os_version','is_4G']\n",
"contin = [i for i in train.columns if i not in cats and i not in ['is_click','impression_id','impression_time']]\n",
"contin"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" impression_time | \n",
" user_id | \n",
" app_code | \n",
" os_version | \n",
" is_4G | \n",
" target | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" c4ca4238a0b923820dcc509a6f75849b | \n",
" 2018-11-15 00:00:00 | \n",
" 87862 | \n",
" 422 | \n",
" old | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id impression_time user_id app_code \\\n",
"0 c4ca4238a0b923820dcc509a6f75849b 2018-11-15 00:00:00 87862 422 \n",
"\n",
" os_version is_4G target \n",
"0 old 0 0 "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"IDTEST = test['impression_id']\n",
"train.rename(columns={'impression_id':'id','is_click':'target'},inplace=True)\n",
"test.rename(columns={'impression_id':'id'},inplace=True)\n",
"train.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"for c in cats:\n",
" _ = pd.concat([train[c],test[c]],0)\n",
" le = LabelEncoder()\n",
" le.fit(_)\n",
" train[c] = le.transform(train[c])\n",
" test[c] = le.transform(test[c])"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\M Phanidhar\\Anaconda3\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
" from ._conv import register_converters as _register_converters\n"
]
}
],
"source": [
"\"\"\"\n",
"Tensorflow implementation of DeepFM [1]\n",
"Reference:\n",
"[1] DeepFM: A Factorization-Machine based Neural Network for CTR Prediction,\n",
" Huifeng Guo\u0003, Ruiming Tang, Yunming Yey, Zhenguo Li, Xiuqiang He.\n",
"\"\"\"\n",
"\n",
"import numpy as np\n",
"import tensorflow as tf\n",
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"from sklearn.metrics import roc_auc_score\n",
"from time import time\n",
"from tensorflow.contrib.layers.python.layers import batch_norm as batch_norm\n",
"#from yellowfin import YFOptimizer\n",
"\n",
"\n",
"class DeepFM(BaseEstimator, TransformerMixin):\n",
" def __init__(self, feature_size, field_size,\n",
" embedding_size=8, dropout_fm=[1.0, 1.0],\n",
" deep_layers=[32, 32], dropout_deep=[0.5, 0.5, 0.5],\n",
" deep_layers_activation=tf.nn.relu,\n",
" epoch=10, batch_size=256,\n",
" learning_rate=0.001, optimizer_type=\"adam\",\n",
" batch_norm=0, batch_norm_decay=0.995,\n",
" verbose=False, random_seed=2016,\n",
" use_fm=True, use_deep=True,\n",
" loss_type=\"logloss\", eval_metric=roc_auc_score,\n",
" l2_reg=0.0, greater_is_better=True):\n",
" assert (use_fm or use_deep)\n",
" assert loss_type in [\"logloss\", \"mse\"], \\\n",
" \"loss_type can be either 'logloss' for classification task or 'mse' for regression task\"\n",
"\n",
" self.feature_size = feature_size # denote as M, size of the feature dictionary\n",
" self.field_size = field_size # denote as F, size of the feature fields\n",
" self.embedding_size = embedding_size # denote as K, size of the feature embedding\n",
"\n",
" self.dropout_fm = dropout_fm\n",
" self.deep_layers = deep_layers\n",
" self.dropout_deep = dropout_deep\n",
" self.deep_layers_activation = deep_layers_activation\n",
" self.use_fm = use_fm\n",
" self.use_deep = use_deep\n",
" self.l2_reg = l2_reg\n",
"\n",
" self.epoch = epoch\n",
" self.batch_size = batch_size\n",
" self.learning_rate = learning_rate\n",
" self.optimizer_type = optimizer_type\n",
"\n",
" self.batch_norm = batch_norm\n",
" self.batch_norm_decay = batch_norm_decay\n",
"\n",
" self.verbose = verbose\n",
" self.random_seed = random_seed\n",
" self.loss_type = loss_type\n",
" self.eval_metric = roc_auc_score\n",
" self.greater_is_better = greater_is_better\n",
" self.train_result, self.valid_result = [], []\n",
"\n",
" self._init_graph()\n",
"\n",
"\n",
" def _init_graph(self):\n",
" self.graph = tf.Graph()\n",
" with self.graph.as_default():\n",
"\n",
" tf.set_random_seed(self.random_seed)\n",
"\n",
" self.feat_index = tf.placeholder(tf.int32, shape=[None, None],\n",
" name=\"feat_index\") # None * F\n",
" self.feat_value = tf.placeholder(tf.float32, shape=[None, None],\n",
" name=\"feat_value\") # None * F\n",
" self.label = tf.placeholder(tf.float32, shape=[None, 1], name=\"label\") # None * 1\n",
" self.dropout_keep_fm = tf.placeholder(tf.float32, shape=[None], name=\"dropout_keep_fm\")\n",
" self.dropout_keep_deep = tf.placeholder(tf.float32, shape=[None], name=\"dropout_keep_deep\")\n",
" self.train_phase = tf.placeholder(tf.bool, name=\"train_phase\")\n",
"\n",
" self.weights = self._initialize_weights()\n",
"\n",
" # model\n",
" self.embeddings = tf.nn.embedding_lookup(self.weights[\"feature_embeddings\"],\n",
" self.feat_index) # None * F * K\n",
" feat_value = tf.reshape(self.feat_value, shape=[-1, self.field_size, 1])\n",
" self.embeddings = tf.multiply(self.embeddings, feat_value)\n",
"\n",
" # ---------- first order term ----------\n",
" self.y_first_order = tf.nn.embedding_lookup(self.weights[\"feature_bias\"], self.feat_index) # None * F * 1\n",
" self.y_first_order = tf.reduce_sum(tf.multiply(self.y_first_order, feat_value), 2) # None * F\n",
" self.y_first_order = tf.nn.dropout(self.y_first_order, self.dropout_keep_fm[0]) # None * F\n",
"\n",
" # ---------- second order term ---------------\n",
" # sum_square part\n",
" self.summed_features_emb = tf.reduce_sum(self.embeddings, 1) # None * K\n",
" self.summed_features_emb_square = tf.square(self.summed_features_emb) # None * K\n",
"\n",
" # square_sum part\n",
" self.squared_features_emb = tf.square(self.embeddings)\n",
" self.squared_sum_features_emb = tf.reduce_sum(self.squared_features_emb, 1) # None * K\n",
"\n",
" # second order\n",
" self.y_second_order = 0.5 * tf.subtract(self.summed_features_emb_square, self.squared_sum_features_emb) # None * K\n",
" self.y_second_order = tf.nn.dropout(self.y_second_order, self.dropout_keep_fm[1]) # None * K\n",
"\n",
" # ---------- Deep component ----------\n",
" self.y_deep = tf.reshape(self.embeddings, shape=[-1, self.field_size * self.embedding_size]) # None * (F*K)\n",
" self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])\n",
" for i in range(0, len(self.deep_layers)):\n",
" self.y_deep = tf.add(tf.matmul(self.y_deep, self.weights[\"layer_%d\" %i]), self.weights[\"bias_%d\"%i]) # None * layer[i] * 1\n",
" if self.batch_norm:\n",
" self.y_deep = self.batch_norm_layer(self.y_deep, train_phase=self.train_phase, scope_bn=\"bn_%d\" %i) # None * layer[i] * 1\n",
" self.y_deep = self.deep_layers_activation(self.y_deep)\n",
" self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[1+i]) # dropout at each Deep layer\n",
"\n",
" # ---------- DeepFM ----------\n",
" if self.use_fm and self.use_deep:\n",
" concat_input = tf.concat([self.y_first_order, self.y_second_order, self.y_deep], axis=1)\n",
" elif self.use_fm:\n",
" concat_input = tf.concat([self.y_first_order, self.y_second_order], axis=1)\n",
" elif self.use_deep:\n",
" concat_input = self.y_deep\n",
" self.out = tf.add(tf.matmul(concat_input, self.weights[\"concat_projection\"]), self.weights[\"concat_bias\"])\n",
"\n",
" # loss\n",
" if self.loss_type == \"logloss\":\n",
" self.out = tf.nn.sigmoid(self.out)\n",
" self.loss = tf.losses.log_loss(self.label, self.out)\n",
" elif self.loss_type == \"mse\":\n",
" self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))\n",
" # l2 regularization on weights\n",
" if self.l2_reg > 0:\n",
" self.loss += tf.contrib.layers.l2_regularizer(\n",
" self.l2_reg)(self.weights[\"concat_projection\"])\n",
" if self.use_deep:\n",
" for i in range(len(self.deep_layers)):\n",
" self.loss += tf.contrib.layers.l2_regularizer(\n",
" self.l2_reg)(self.weights[\"layer_%d\"%i])\n",
"\n",
" # optimizer\n",
" if self.optimizer_type == \"adam\":\n",
" self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999,\n",
" epsilon=1e-8).minimize(self.loss)\n",
" elif self.optimizer_type == \"adagrad\":\n",
" self.optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate,\n",
" initial_accumulator_value=1e-8).minimize(self.loss)\n",
" elif self.optimizer_type == \"gd\":\n",
" self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(self.loss)\n",
" elif self.optimizer_type == \"momentum\":\n",
" self.optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=0.95).minimize(\n",
" self.loss)\n",
" elif self.optimizer_type == \"yellowfin\":\n",
" self.optimizer = YFOptimizer(learning_rate=self.learning_rate, momentum=0.0).minimize(\n",
" self.loss)\n",
"\n",
" # init\n",
" self.saver = tf.train.Saver()\n",
" init = tf.global_variables_initializer()\n",
" self.sess = self._init_session()\n",
" self.sess.run(init)\n",
"\n",
" # number of params\n",
" total_parameters = 0\n",
" for variable in self.weights.values():\n",
" shape = variable.get_shape()\n",
" variable_parameters = 1\n",
" for dim in shape:\n",
" variable_parameters *= dim.value\n",
" total_parameters += variable_parameters\n",
" if self.verbose > 0:\n",
" print(\"#params: %d\" % total_parameters)\n",
"\n",
"\n",
" def _init_session(self):\n",
" config = tf.ConfigProto(device_count={\"gpu\": 0})\n",
" config.gpu_options.allow_growth = True\n",
" return tf.Session(config=config)\n",
"\n",
"\n",
" def _initialize_weights(self):\n",
" weights = dict()\n",
"\n",
" # embeddings\n",
" weights[\"feature_embeddings\"] = tf.Variable(\n",
" tf.random_normal([self.feature_size, self.embedding_size], 0.0, 0.01),\n",
" name=\"feature_embeddings\") # feature_size * K\n",
" weights[\"feature_bias\"] = tf.Variable(\n",
" tf.random_uniform([self.feature_size, 1], 0.0, 1.0), name=\"feature_bias\") # feature_size * 1\n",
"\n",
" # deep layers\n",
" num_layer = len(self.deep_layers)\n",
" input_size = self.field_size * self.embedding_size\n",
" glorot = np.sqrt(2.0 / (input_size + self.deep_layers[0]))\n",
" weights[\"layer_0\"] = tf.Variable(\n",
" np.random.normal(loc=0, scale=glorot, size=(input_size, self.deep_layers[0])), dtype=np.float32)\n",
" weights[\"bias_0\"] = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(1, self.deep_layers[0])),\n",
" dtype=np.float32) # 1 * layers[0]\n",
" for i in range(1, num_layer):\n",
" glorot = np.sqrt(2.0 / (self.deep_layers[i-1] + self.deep_layers[i]))\n",
" weights[\"layer_%d\" % i] = tf.Variable(\n",
" np.random.normal(loc=0, scale=glorot, size=(self.deep_layers[i-1], self.deep_layers[i])),\n",
" dtype=np.float32) # layers[i-1] * layers[i]\n",
" weights[\"bias_%d\" % i] = tf.Variable(\n",
" np.random.normal(loc=0, scale=glorot, size=(1, self.deep_layers[i])),\n",
" dtype=np.float32) # 1 * layer[i]\n",
"\n",
" # final concat projection layer\n",
" if self.use_fm and self.use_deep:\n",
" input_size = self.field_size + self.embedding_size + self.deep_layers[-1]\n",
" elif self.use_fm:\n",
" input_size = self.field_size + self.embedding_size\n",
" elif self.use_deep:\n",
" input_size = self.deep_layers[-1]\n",
" glorot = np.sqrt(2.0 / (input_size + 1))\n",
" weights[\"concat_projection\"] = tf.Variable(\n",
" np.random.normal(loc=0, scale=glorot, size=(input_size, 1)),\n",
" dtype=np.float32) # layers[i-1]*layers[i]\n",
" weights[\"concat_bias\"] = tf.Variable(tf.constant(0.01), dtype=np.float32)\n",
"\n",
" return weights\n",
"\n",
"\n",
" def batch_norm_layer(self, x, train_phase, scope_bn):\n",
" bn_train = batch_norm(x, decay=self.batch_norm_decay, center=True, scale=True, updates_collections=None,\n",
" is_training=True, reuse=None, trainable=True, scope=scope_bn)\n",
" bn_inference = batch_norm(x, decay=self.batch_norm_decay, center=True, scale=True, updates_collections=None,\n",
" is_training=False, reuse=True, trainable=True, scope=scope_bn)\n",
" z = tf.cond(train_phase, lambda: bn_train, lambda: bn_inference)\n",
" return z\n",
"\n",
"\n",
" def get_batch(self, Xi, Xv, y, batch_size, index):\n",
" start = index * batch_size\n",
" end = (index+1) * batch_size\n",
" end = end if end < len(y) else len(y)\n",
" return Xi[start:end], Xv[start:end], [[y_] for y_ in y[start:end]]\n",
"\n",
"\n",
" # shuffle three lists simutaneously\n",
" def shuffle_in_unison_scary(self, a, b, c):\n",
" rng_state = np.random.get_state()\n",
" np.random.shuffle(a)\n",
" np.random.set_state(rng_state)\n",
" np.random.shuffle(b)\n",
" np.random.set_state(rng_state)\n",
" np.random.shuffle(c)\n",
"\n",
"\n",
" def fit_on_batch(self, Xi, Xv, y):\n",
" feed_dict = {self.feat_index: Xi,\n",
" self.feat_value: Xv,\n",
" self.label: y,\n",
" self.dropout_keep_fm: self.dropout_fm,\n",
" self.dropout_keep_deep: self.dropout_deep,\n",
" self.train_phase: True}\n",
" loss, opt = self.sess.run((self.loss, self.optimizer), feed_dict=feed_dict)\n",
" return loss\n",
"\n",
"\n",
" def fit(self, Xi_train, Xv_train, y_train,\n",
" Xi_valid=None, Xv_valid=None, y_valid=None,\n",
" early_stopping=False, refit=False):\n",
" \"\"\"\n",
" :param Xi_train: [[ind1_1, ind1_2, ...], [ind2_1, ind2_2, ...], ..., [indi_1, indi_2, ..., indi_j, ...], ...]\n",
" indi_j is the feature index of feature field j of sample i in the training set\n",
" :param Xv_train: [[val1_1, val1_2, ...], [val2_1, val2_2, ...], ..., [vali_1, vali_2, ..., vali_j, ...], ...]\n",
" vali_j is the feature value of feature field j of sample i in the training set\n",
" vali_j can be either binary (1/0, for binary/categorical features) or float (e.g., 10.24, for numerical features)\n",
" :param y_train: label of each sample in the training set\n",
" :param Xi_valid: list of list of feature indices of each sample in the validation set\n",
" :param Xv_valid: list of list of feature values of each sample in the validation set\n",
" :param y_valid: label of each sample in the validation set\n",
" :param early_stopping: perform early stopping or not\n",
" :param refit: refit the model on the train+valid dataset or not\n",
" :return: None\n",
" \"\"\"\n",
" has_valid = Xv_valid is not None\n",
" for epoch in range(self.epoch):\n",
" t1 = time()\n",
" self.shuffle_in_unison_scary(Xi_train, Xv_train, y_train)\n",
" total_batch = int(len(y_train) / self.batch_size)\n",
" for i in range(total_batch):\n",
" Xi_batch, Xv_batch, y_batch = self.get_batch(Xi_train, Xv_train, y_train, self.batch_size, i)\n",
" self.fit_on_batch(Xi_batch, Xv_batch, y_batch)\n",
"\n",
" # evaluate training and validation datasets\n",
" train_result = self.evaluate(Xi_train, Xv_train, y_train)\n",
" self.train_result.append(train_result)\n",
" if has_valid:\n",
" valid_result = self.evaluate(Xi_valid, Xv_valid, y_valid)\n",
" self.valid_result.append(valid_result)\n",
" if self.verbose > 0 and epoch % self.verbose == 0:\n",
" if has_valid:\n",
" print(\"[%d] train-result=%.4f, valid-result=%.4f [%.1f s]\"\n",
" % (epoch + 1, train_result, valid_result, time() - t1))\n",
" else:\n",
" print(\"[%d] train-result=%.4f [%.1f s]\"\n",
" % (epoch + 1, train_result, time() - t1))\n",
" if has_valid and early_stopping and self.training_termination(self.valid_result):\n",
" break\n",
"\n",
" # fit a few more epoch on train+valid until result reaches the best_train_score\n",
" if has_valid and refit:\n",
" if self.greater_is_better:\n",
" best_valid_score = max(self.valid_result)\n",
" else:\n",
" best_valid_score = min(self.valid_result)\n",
" best_epoch = self.valid_result.index(best_valid_score)\n",
" best_train_score = self.train_result[best_epoch]\n",
" Xi_train = Xi_train + Xi_valid\n",
" Xv_train = Xv_train + Xv_valid\n",
" y_train = y_train + y_valid\n",
" for epoch in range(100):\n",
" self.shuffle_in_unison_scary(Xi_train, Xv_train, y_train)\n",
" total_batch = int(len(y_train) / self.batch_size)\n",
" for i in range(total_batch):\n",
" Xi_batch, Xv_batch, y_batch = self.get_batch(Xi_train, Xv_train, y_train,\n",
" self.batch_size, i)\n",
" self.fit_on_batch(Xi_batch, Xv_batch, y_batch)\n",
" # check\n",
" train_result = self.evaluate(Xi_train, Xv_train, y_train)\n",
" if abs(train_result - best_train_score) < 0.001 or \\\n",
" (self.greater_is_better and train_result > best_train_score) or \\\n",
" ((not self.greater_is_better) and train_result < best_train_score):\n",
" break\n",
"\n",
"\n",
" def training_termination(self, valid_result):\n",
" if len(valid_result) > 5:\n",
" if self.greater_is_better:\n",
" if valid_result[-1] < valid_result[-2] and \\\n",
" valid_result[-2] < valid_result[-3] and \\\n",
" valid_result[-3] < valid_result[-4] and \\\n",
" valid_result[-4] < valid_result[-5]:\n",
" return True\n",
" else:\n",
" if valid_result[-1] > valid_result[-2] and \\\n",
" valid_result[-2] > valid_result[-3] and \\\n",
" valid_result[-3] > valid_result[-4] and \\\n",
" valid_result[-4] > valid_result[-5]:\n",
" return True\n",
" return False\n",
"\n",
"\n",
" def predict(self, Xi, Xv):\n",
" \"\"\"\n",
" :param Xi: list of list of feature indices of each sample in the dataset\n",
" :param Xv: list of list of feature values of each sample in the dataset\n",
" :return: predicted probability of each sample\n",
" \"\"\"\n",
" # dummy y\n",
" dummy_y = [1] * len(Xi)\n",
" batch_index = 0\n",
" Xi_batch, Xv_batch, y_batch = self.get_batch(Xi, Xv, dummy_y, self.batch_size, batch_index)\n",
" y_pred = None\n",
" while len(Xi_batch) > 0:\n",
" num_batch = len(y_batch)\n",
" feed_dict = {self.feat_index: Xi_batch,\n",
" self.feat_value: Xv_batch,\n",
" self.label: y_batch,\n",
" self.dropout_keep_fm: [1.0] * len(self.dropout_fm),\n",
" self.dropout_keep_deep: [1.0] * len(self.dropout_deep),\n",
" self.train_phase: False}\n",
" batch_out = self.sess.run(self.out, feed_dict=feed_dict)\n",
"\n",
" if batch_index == 0:\n",
" y_pred = np.reshape(batch_out, (num_batch,))\n",
" else:\n",
" y_pred = np.concatenate((y_pred, np.reshape(batch_out, (num_batch,))))\n",
"\n",
" batch_index += 1\n",
" Xi_batch, Xv_batch, y_batch = self.get_batch(Xi, Xv, dummy_y, self.batch_size, batch_index)\n",
"\n",
" return y_pred\n",
"\n",
"\n",
" def evaluate(self, Xi, Xv, y):\n",
" \"\"\"\n",
" :param Xi: list of list of feature indices of each sample in the dataset\n",
" :param Xv: list of list of feature values of each sample in the dataset\n",
" :param y: label of each sample in the dataset\n",
" :return: metric of the evaluation\n",
" \"\"\"\n",
" y_pred = self.predict(Xi, Xv)\n",
" return self.eval_metric(y, y_pred)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"\n",
"class FeatureDictionary(object):\n",
" def __init__(self, trainfile=None, testfile=None,\n",
" dfTrain=None, dfTest=None, numeric_cols=[], ignore_cols=[]):\n",
" assert not ((trainfile is None) and (dfTrain is None)), \"trainfile or dfTrain at least one is set\"\n",
" assert not ((trainfile is not None) and (dfTrain is not None)), \"only one can be set\"\n",
" assert not ((testfile is None) and (dfTest is None)), \"testfile or dfTest at least one is set\"\n",
" assert not ((testfile is not None) and (dfTest is not None)), \"only one can be set\"\n",
" self.trainfile = trainfile\n",
" self.testfile = testfile\n",
" self.dfTrain = dfTrain\n",
" self.dfTest = dfTest\n",
" self.numeric_cols = numeric_cols\n",
" self.ignore_cols = ignore_cols\n",
" self.gen_feat_dict()\n",
"\n",
" def gen_feat_dict(self):\n",
" if self.dfTrain is None:\n",
" dfTrain = pd.read_csv(self.trainfile)\n",
" else:\n",
" dfTrain = self.dfTrain\n",
" if self.dfTest is None:\n",
" dfTest = pd.read_csv(self.testfile)\n",
" else:\n",
" dfTest = self.dfTest\n",
" df = pd.concat([dfTrain, dfTest])\n",
" self.feat_dict = {}\n",
" tc = 0\n",
" for col in df.columns:\n",
" if col in self.ignore_cols:\n",
" continue\n",
" if col in self.numeric_cols:\n",
" # map to a single index\n",
" self.feat_dict[col] = tc\n",
" tc += 1\n",
" else:\n",
" us = df[col].unique()\n",
" self.feat_dict[col] = dict(zip(us, range(tc, len(us)+tc)))\n",
" tc += len(us)\n",
" self.feat_dim = tc\n",
"\n",
"\n",
"class DataParser(object):\n",
" def __init__(self, feat_dict):\n",
" self.feat_dict = feat_dict\n",
"\n",
" def parse(self, infile=None, df=None, has_label=False):\n",
" assert not ((infile is None) and (df is None)), \"infile or df at least one is set\"\n",
" assert not ((infile is not None) and (df is not None)), \"only one can be set\"\n",
" if infile is None:\n",
" dfi = df.copy()\n",
" else:\n",
" dfi = pd.read_csv(infile)\n",
" if has_label:\n",
" y = dfi[\"target\"].values.tolist()\n",
" dfi.drop([\"id\", \"target\"], axis=1, inplace=True)\n",
" else:\n",
" ids = dfi[\"id\"].values.tolist()\n",
" dfi.drop([\"id\"], axis=1, inplace=True)\n",
" # dfi for feature index\n",
" # dfv for feature value which can be either binary (1/0) or float (e.g., 10.24)\n",
" dfv = dfi.copy()\n",
" for col in dfi.columns:\n",
" if col in self.feat_dict.ignore_cols:\n",
" dfi.drop(col, axis=1, inplace=True)\n",
" dfv.drop(col, axis=1, inplace=True)\n",
" continue\n",
" if col in self.feat_dict.numeric_cols:\n",
" dfi[col] = self.feat_dict.feat_dict[col]\n",
" else:\n",
" dfi[col] = dfi[col].map(self.feat_dict.feat_dict[col])\n",
" dfv[col] = 1.\n",
"\n",
" # list of list of feature indices of each sample in the dataset\n",
" Xi = dfi.values.tolist()\n",
" # list of list of feature values of each sample in the dataset\n",
" Xv = dfv.values.tolist()\n",
" if has_label:\n",
" return Xi, Xv, y\n",
" else:\n",
" return Xi, Xv, ids\n",
"# set the path-to-files\n",
"TRAIN_FILE = None\n",
"TEST_FILE = None\n",
"\n",
"SUB_DIR = \".\"\n",
"\n",
"\n",
"NUM_SPLITS = 5\n",
"RANDOM_SEED = 2017\n",
"\n",
"# types of columns of the dataset dataframe\n",
"CATEGORICAL_COLS = cats\n",
"\n",
"NUMERIC_COLS = contin\n",
"\n",
"IGNORE_COLS = [\n",
" \"id\", \"target\"]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import tensorflow as tf\n",
"from matplotlib import pyplot as plt\n",
"from sklearn.metrics import make_scorer\n",
"from sklearn.model_selection import StratifiedKFold\n",
"\n",
"def _load_data():\n",
"\n",
" dfTrain = train\n",
" dfTest = test\n",
"\n",
" def preprocess(df):\n",
" cols = [c for c in df.columns if c not in [\"id\", \"target\"]]\n",
" return df\n",
"\n",
" dfTrain = preprocess(dfTrain)\n",
" dfTest = preprocess(dfTest)\n",
"\n",
" cols = [c for c in dfTrain.columns if c not in [\"id\", \"target\"]]\n",
" cols = [c for c in cols if (not c in IGNORE_COLS)]\n",
"\n",
" X_train = dfTrain[cols].values\n",
" y_train = dfTrain[\"target\"].values\n",
" X_test = dfTest[cols].values\n",
" ids_test = dfTest[\"id\"].values\n",
" cat_features_indices = [i for i,c in enumerate(cols) if c in CATEGORICAL_COLS]\n",
"\n",
" return dfTrain, dfTest, X_train, y_train, X_test, ids_test, cat_features_indices\n",
"\n",
"\n",
"def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):\n",
" fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,\n",
" numeric_cols=NUMERIC_COLS,\n",
" ignore_cols=IGNORE_COLS)\n",
" data_parser = DataParser(feat_dict=fd)\n",
" Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)\n",
" Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)\n",
"\n",
" dfm_params[\"feature_size\"] = fd.feat_dim\n",
" dfm_params[\"field_size\"] = len(Xi_train[0])\n",
"\n",
" y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)\n",
" y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)\n",
" _get = lambda x, l: [x[i] for i in l]\n",
" gini_results_cv = np.zeros(len(folds), dtype=float)\n",
" gini_results_epoch_train = np.zeros((len(folds), dfm_params[\"epoch\"]), dtype=float)\n",
" gini_results_epoch_valid = np.zeros((len(folds), dfm_params[\"epoch\"]), dtype=float)\n",
" for i, (train_idx, valid_idx) in enumerate(folds):\n",
" Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)\n",
" Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)\n",
"\n",
" dfm = DeepFM(**dfm_params)\n",
" dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)\n",
"\n",
" y_train_meta[valid_idx,0] = dfm.predict(Xi_valid_, Xv_valid_)\n",
" y_test_meta[:,0] += dfm.predict(Xi_test, Xv_test)\n",
"\n",
"# gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])\n",
"# gini_results_epoch_train[i] = dfm.train_result\n",
"# gini_results_epoch_valid[i] = dfm.valid_result\n",
"\n",
" y_test_meta /= float(len(folds))\n",
"\n",
" # save result\n",
" if dfm_params[\"use_fm\"] and dfm_params[\"use_deep\"]:\n",
" clf_str = \"DeepFM\"\n",
" elif dfm_params[\"use_fm\"]:\n",
" clf_str = \"FM\"\n",
" elif dfm_params[\"use_deep\"]:\n",
" clf_str = \"DNN\"\n",
" #print(\"%s: %.5f (%.5f)\"%(clf_str, gini_results_cv.mean(), gini_results_cv.std()))\n",
" filename = \"DeepFM_Try0.csv\"\n",
" _make_submission(ids_test, y_test_meta, filename)\n",
"\n",
" #_plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)\n",
"\n",
" return y_train_meta, y_test_meta\n",
"\n",
"\n",
"def _make_submission(IDTEST, y_pred, filename=\"DeepFM_submission.csv\"):\n",
" pd.DataFrame({\"impression_id\": IDTEST, \"is_click\": y_pred.flatten()}).to_csv(\n",
" os.path.join(SUB_DIR, filename), index=False, float_format=\"%.5f\")\n",
"\n",
"\n",
"def _plot_fig(train_results, valid_results, model_name):\n",
" colors = [\"red\", \"blue\", \"green\"]\n",
" xs = np.arange(1, train_results.shape[1]+1)\n",
" plt.figure()\n",
" legends = []\n",
" for i in range(train_results.shape[0]):\n",
" plt.plot(xs, train_results[i], color=colors[i], linestyle=\"solid\", marker=\"o\")\n",
" plt.plot(xs, valid_results[i], color=colors[i], linestyle=\"dashed\", marker=\"o\")\n",
" legends.append(\"train-%d\"%(i+1))\n",
" legends.append(\"valid-%d\"%(i+1))\n",
" plt.xlabel(\"Epoch\")\n",
" plt.ylabel(\"Normalized Gini\")\n",
" plt.title(\"%s\"%model_name)\n",
" plt.legend(legends)\n",
" plt.savefig(\"./fig/%s.png\"%model_name)\n",
" plt.close()\n",
"\n",
"\n",
"# load data\n",
"dfTrain, dfTest, X_train, y_train, X_test, ids_test, cat_features_indices = _load_data()\n",
"\n",
"# folds\n",
"from sklearn.model_selection import KFold\n",
"folds = list(KFold(n_splits=NUM_SPLITS, shuffle=True,\n",
" random_state=RANDOM_SEED).split(X_train, y_train))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"#params: 4446857\n",
"[1] train-result=0.4761, valid-result=0.4788 [161.9 s]\n",
"[2] train-result=0.5770, valid-result=0.5414 [146.4 s]\n",
"[3] train-result=0.6777, valid-result=0.6211 [137.3 s]\n",
"[4] train-result=0.7521, valid-result=0.6872 [133.5 s]\n",
"[5] train-result=0.7805, valid-result=0.6963 [130.9 s]\n",
"[6] train-result=0.8091, valid-result=0.7067 [135.0 s]\n",
"[7] train-result=0.8646, valid-result=0.7177 [146.8 s]\n",
"[8] train-result=0.8942, valid-result=0.7244 [146.2 s]\n",
"[9] train-result=0.9304, valid-result=0.7260 [146.9 s]\n",
"[10] train-result=0.9639, valid-result=0.7148 [146.9 s]\n",
"#params: 4446857\n",
"[1] train-result=0.4920, valid-result=0.4830 [148.7 s]\n",
"[2] train-result=0.5625, valid-result=0.5055 [146.2 s]\n",
"[3] train-result=0.6721, valid-result=0.6210 [131.2 s]\n",
"[4] train-result=0.7490, valid-result=0.6774 [140.8 s]\n",
"[5] train-result=0.7772, valid-result=0.6863 [163.5 s]\n",
"[6] train-result=0.7939, valid-result=0.6922 [144.4 s]\n",
"[7] train-result=0.8599, valid-result=0.7033 [158.0 s]\n",
"[8] train-result=0.8878, valid-result=0.7174 [138.8 s]\n",
"[9] train-result=0.9242, valid-result=0.7237 [142.2 s]\n",
"[10] train-result=0.9615, valid-result=0.7186 [132.6 s]\n",
"#params: 4446857\n",
"[1] train-result=0.4545, valid-result=0.4269 [160.2 s]\n",
"[2] train-result=0.5811, valid-result=0.5222 [137.8 s]\n",
"[3] train-result=0.6845, valid-result=0.6250 [136.5 s]\n",
"[4] train-result=0.7499, valid-result=0.6694 [139.1 s]\n",
"[5] train-result=0.7863, valid-result=0.6766 [132.4 s]\n",
"[6] train-result=0.8415, valid-result=0.6936 [133.7 s]\n",
"[7] train-result=0.8752, valid-result=0.7031 [132.0 s]\n",
"[8] train-result=0.9036, valid-result=0.7064 [136.3 s]\n",
"[9] train-result=0.9369, valid-result=0.7062 [131.7 s]\n",
"[10] train-result=0.9644, valid-result=0.6984 [133.0 s]\n",
"#params: 4446857\n",
"[1] train-result=0.5337, valid-result=0.5305 [133.1 s]\n",
"[2] train-result=0.6263, valid-result=0.5730 [131.1 s]\n",
"[3] train-result=0.6889, valid-result=0.6278 [132.0 s]\n",
"[4] train-result=0.7488, valid-result=0.6877 [131.5 s]\n",
"[5] train-result=0.7752, valid-result=0.6941 [131.1 s]\n",
"[6] train-result=0.7959, valid-result=0.6978 [133.4 s]\n",
"[7] train-result=0.8168, valid-result=0.7003 [142.5 s]\n",
"[8] train-result=0.8525, valid-result=0.7096 [168.3 s]\n",
"[9] train-result=0.8908, valid-result=0.7177 [169.7 s]\n",
"[10] train-result=0.9238, valid-result=0.7233 [170.0 s]\n",
"#params: 4446857\n",
"[1] train-result=0.4569, valid-result=0.4647 [173.9 s]\n",
"[2] train-result=0.5456, valid-result=0.5116 [176.4 s]\n",
"[3] train-result=0.6373, valid-result=0.5485 [175.8 s]\n",
"[4] train-result=0.7453, valid-result=0.6617 [177.0 s]\n",
"[5] train-result=0.7811, valid-result=0.6813 [192.5 s]\n",
"[6] train-result=0.8290, valid-result=0.6964 [199.0 s]\n",
"[7] train-result=0.8706, valid-result=0.7075 [200.3 s]\n",
"[8] train-result=0.9043, valid-result=0.7136 [207.0 s]\n",
"[9] train-result=0.9208, valid-result=0.7178 [200.6 s]\n",
"[10] train-result=0.9467, valid-result=0.7165 [1046.8 s]\n"
]
}
],
"source": [
"# ------------------ DeepFM Model ------------------\n",
"# params\n",
"k = 0.5\n",
"dfm_params = {\n",
" \"use_fm\": True,\n",
" \"use_deep\": True,\n",
" \"embedding_size\": 32,\n",
" \"dropout_fm\": [k,k],\n",
" \"deep_layers\": [32, 32,32,32,32],\n",
" \"dropout_deep\": [k/5, k/5, k/5,k/5, k/5, k/5],\n",
" \"deep_layers_activation\": tf.nn.relu,\n",
" \"epoch\": 10,\n",
" \"batch_size\": 128,\n",
" \"learning_rate\": 0.001,\n",
" \"optimizer_type\": \"adam\",\n",
" \"batch_norm\": 1,\n",
" \"batch_norm_decay\": 0.995,\n",
" \"l2_reg\": 0.01,\n",
" \"verbose\": True,\n",
" \"eval_metric\": roc_auc_score,\n",
" \"random_seed\": RANDOM_SEED\n",
"}\n",
"\n",
"# # ------------------ FM Model ------------------\n",
"fm_params = dfm_params.copy()\n",
"y_train_fm, y_test_fm = _run_base_model_dfm(dfTrain, dfTest, folds, fm_params)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}