# coding: utf-8

# In[1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Importing the train and test datasets

# In[41]:

# Read the train and test CSV files (train first, then test) in one expression
# so both frames are always loaded the same way.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ("train_LZdllcl.csv", "test_2umaH9m.csv")
)


# In[42]:

# (rows, columns) of each frame.
(train.shape, test.shape)


# Missing values

# In[43]:

# Number of missing values per column in the training frame.
train.isnull().sum()


# In[44]:

# Number of missing values per column in the test frame.
test.isnull().sum()


# In[45]:

# Assign the filled column back rather than calling fillna(inplace=True) on
# `train.education`: attribute access yields an intermediate Series, and under
# pandas copy-on-write an in-place fill on it never reaches the frame.
train["education"] = train["education"].fillna("not_mentioned")


# In[46]:

test["education"] = test["education"].fillna("not_mentioned")


# In[47]:

# A missing previous-year rating is encoded as 0.
train["previous_year_rating"] = train["previous_year_rating"].fillna(0)
test["previous_year_rating"] = test["previous_year_rating"].fillna(0)


# Categorical variables

# In[48]:

# Encode gender as an integer flag: "m" -> 0, "f" -> 1.
# `.ix` was removed in pandas 1.0, so an explicit mapping replaces the masked
# assignments. Any value other than "m"/"f" becomes NaN and makes the int cast
# further down fail loudly. Note that re-running this cell on an already
# encoded column would also produce NaN; reload the data first.
GENDER_CODES = {"m": 0, "f": 1}
train["gender"] = train["gender"].map(GENDER_CODES)


# In[49]:

test["gender"] = test["gender"].map(GENDER_CODES)


# In[50]:

train.dtypes


# In[51]:

test.dtypes


# In[52]:

# Make sure the flag is stored with an integer dtype in both frames.
train["gender"] = train["gender"].astype(int)
test["gender"] = test["gender"].astype(int)


# In[53]:

test.dtypes


# In[54]:

# Columns that are expanded into one indicator column per category.
ONE_HOT_COLUMNS = ["department", "region", "education", "recruitment_channel"]

df_train = pd.get_dummies(train, columns=ONE_HOT_COLUMNS, prefix=ONE_HOT_COLUMNS)


# In[55]:

df_test = pd.get_dummies(test, columns=ONE_HOT_COLUMNS, prefix=ONE_HOT_COLUMNS)

# The two frames are encoded independently, so a category that occurs in only
# one of them would give the frames different (or differently ordered) feature
# columns. Align the test frame to the training columns (minus the target):
# indicators missing from test are added as all-zero columns, and indicators
# unseen in training are dropped.
train_feature_columns = [col for col in df_train.columns if col != "is_promoted"]
df_test = df_test.reindex(columns=train_feature_columns, fill_value=0)


# In[56]:

df_train.head()


# In[57]:

df_test.head()


# In[58]:

# Training feature matrix: every column except the identifier and the target.
x_train = df_train.drop(columns=["employee_id", "is_promoted"]).values


# In[59]:

# Target vector. `.ix` was removed in pandas 1.0; plain column selection
# returns the same Series.
y_train = df_train["is_promoted"]


# In[61]:

# Test feature matrix: every column except the identifier.
x_test = df_test.drop(columns=["employee_id"]).values


# In[65]:

# Class balance of the target.
df_train["is_promoted"].value_counts()


# In[67]:

# Number of positive (is_promoted == 1) training rows.
(y_train == 1).sum()


# # Model training

# After different iterations and parameter tuning, I selected LightGBM as the classifier with n_estimators=85
# and a probability threshold of 0.25

# In[68]:

import lightgbm as lgb


# In[191]:

# Hyper-parameters of the classifier.
N_ESTIMATORS = 85
MIN_CHILD_SAMPLES = 20

model = lgb.LGBMClassifier(
    n_estimators=N_ESTIMATORS,
    min_child_samples=MIN_CHILD_SAMPLES,
)
model.fit(x_train, y_train)


# In[192]:

# Keep only column 1 of the per-class probability matrix.
pred_prob = model.predict_proba(x_test)[:, 1]


# In[198]:

# Rows whose probability is strictly above the threshold are labelled 1, all others 0.
PROMOTION_THRESHOLD = 0.25
y_pred = (pred_prob > PROMOTION_THRESHOLD).astype(int)


# In[199]:

# Pair each test employee id with its predicted label.
# `.ix` was removed in pandas 1.0; plain column selection is equivalent here.
Submission = pd.DataFrame(
    {
        "employee_id": df_test["employee_id"].values,
        "is_promoted": y_pred,
    }
)


# In[200]:

Submission.head()


# In[201]:

Submission.shape


# In[202]:

# Distribution of the predicted labels.
Submission["is_promoted"].value_counts()


# In[203]:

# index=False keeps the file to the two declared columns; without it pandas
# writes the row index as an extra unnamed first column.
Submission.to_csv("Submission15.csv", index=False)


# In[ ]: