{
"cells": [
{
"cell_type": "code",
"execution_count": 150,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import pylab as plt"
]
},
{
"cell_type": "code",
"execution_count": 151,
"metadata": {},
"outputs": [],
"source": [
"df=pd.read_csv(\"C:\\\\Users\\\\DELL\\\\Downloads\\\\train_LZdllcl.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 152,
"metadata": {},
"outputs": [],
"source": [
"dftest=pd.read_csv(\"C:\\\\Users\\\\DELL\\\\Downloads\\\\test_2umaH9m.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" employee_id | \n",
" department | \n",
" region | \n",
" education | \n",
" gender | \n",
" recruitment_channel | \n",
" no_of_trainings | \n",
" age | \n",
" previous_year_rating | \n",
" length_of_service | \n",
" KPIs_met >80% | \n",
" awards_won? | \n",
" avg_training_score | \n",
" is_promoted | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 65438 | \n",
" Sales & Marketing | \n",
" region_7 | \n",
" Master's & above | \n",
" f | \n",
" sourcing | \n",
" 1 | \n",
" 35 | \n",
" 5.0 | \n",
" 8 | \n",
" 1 | \n",
" 0 | \n",
" 49 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 65141 | \n",
" Operations | \n",
" region_22 | \n",
" Bachelor's | \n",
" m | \n",
" other | \n",
" 1 | \n",
" 30 | \n",
" 5.0 | \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
" 60 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 7513 | \n",
" Sales & Marketing | \n",
" region_19 | \n",
" Bachelor's | \n",
" m | \n",
" sourcing | \n",
" 1 | \n",
" 34 | \n",
" 3.0 | \n",
" 7 | \n",
" 0 | \n",
" 0 | \n",
" 50 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 2542 | \n",
" Sales & Marketing | \n",
" region_23 | \n",
" Bachelor's | \n",
" m | \n",
" other | \n",
" 2 | \n",
" 39 | \n",
" 1.0 | \n",
" 10 | \n",
" 0 | \n",
" 0 | \n",
" 50 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 48945 | \n",
" Technology | \n",
" region_26 | \n",
" Bachelor's | \n",
" m | \n",
" other | \n",
" 1 | \n",
" 45 | \n",
" 3.0 | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 73 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" employee_id department region education gender \\\n",
"0 65438 Sales & Marketing region_7 Master's & above f \n",
"1 65141 Operations region_22 Bachelor's m \n",
"2 7513 Sales & Marketing region_19 Bachelor's m \n",
"3 2542 Sales & Marketing region_23 Bachelor's m \n",
"4 48945 Technology region_26 Bachelor's m \n",
"\n",
" recruitment_channel no_of_trainings age previous_year_rating \\\n",
"0 sourcing 1 35 5.0 \n",
"1 other 1 30 5.0 \n",
"2 sourcing 1 34 3.0 \n",
"3 other 2 39 1.0 \n",
"4 other 1 45 3.0 \n",
"\n",
" length_of_service KPIs_met >80% awards_won? avg_training_score \\\n",
"0 8 1 0 49 \n",
"1 4 0 0 60 \n",
"2 7 0 0 50 \n",
"3 10 0 0 50 \n",
"4 2 0 0 73 \n",
"\n",
" is_promoted \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 153,
"metadata": {},
"outputs": [],
"source": [
"df.drop(axis=1, columns='employee_id', level=None, inplace=True, errors='raise')\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 154,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" department | \n",
" region | \n",
" education | \n",
" gender | \n",
" recruitment_channel | \n",
" no_of_trainings | \n",
" age | \n",
" previous_year_rating | \n",
" length_of_service | \n",
" KPIs_met >80% | \n",
" awards_won? | \n",
" avg_training_score | \n",
" is_promoted | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Sales & Marketing | \n",
" region_7 | \n",
" Master's & above | \n",
" f | \n",
" sourcing | \n",
" 1 | \n",
" 35 | \n",
" 5.0 | \n",
" 8 | \n",
" 1 | \n",
" 0 | \n",
" 49 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" Operations | \n",
" region_22 | \n",
" Bachelor's | \n",
" m | \n",
" other | \n",
" 1 | \n",
" 30 | \n",
" 5.0 | \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
" 60 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" Sales & Marketing | \n",
" region_19 | \n",
" Bachelor's | \n",
" m | \n",
" sourcing | \n",
" 1 | \n",
" 34 | \n",
" 3.0 | \n",
" 7 | \n",
" 0 | \n",
" 0 | \n",
" 50 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" Sales & Marketing | \n",
" region_23 | \n",
" Bachelor's | \n",
" m | \n",
" other | \n",
" 2 | \n",
" 39 | \n",
" 1.0 | \n",
" 10 | \n",
" 0 | \n",
" 0 | \n",
" 50 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" Technology | \n",
" region_26 | \n",
" Bachelor's | \n",
" m | \n",
" other | \n",
" 1 | \n",
" 45 | \n",
" 3.0 | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 73 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" department region education gender recruitment_channel \\\n",
"0 Sales & Marketing region_7 Master's & above f sourcing \n",
"1 Operations region_22 Bachelor's m other \n",
"2 Sales & Marketing region_19 Bachelor's m sourcing \n",
"3 Sales & Marketing region_23 Bachelor's m other \n",
"4 Technology region_26 Bachelor's m other \n",
"\n",
" no_of_trainings age previous_year_rating length_of_service \\\n",
"0 1 35 5.0 8 \n",
"1 1 30 5.0 4 \n",
"2 1 34 3.0 7 \n",
"3 2 39 1.0 10 \n",
"4 1 45 3.0 2 \n",
"\n",
" KPIs_met >80% awards_won? avg_training_score is_promoted \n",
"0 1 0 49 0 \n",
"1 0 0 60 0 \n",
"2 0 0 50 0 \n",
"3 0 0 50 0 \n",
"4 0 0 73 0 "
]
},
"execution_count": 154,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 155,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"department object\n",
"region object\n",
"education object\n",
"gender object\n",
"recruitment_channel object\n",
"no_of_trainings int64\n",
"age int64\n",
"previous_year_rating float64\n",
"length_of_service int64\n",
"KPIs_met >80% int64\n",
"awards_won? int64\n",
"avg_training_score int64\n",
"is_promoted int64\n",
"dtype: object"
]
},
"execution_count": 155,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 156,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['department', 'region', 'education', 'gender', 'recruitment_channel'], dtype='object')"
]
},
"execution_count": 156,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"objvar=df.dtypes.loc[df.dtypes==\"object\"].index\n",
"objvar"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"department 9\n",
"region 34\n",
"education 4\n",
"gender 2\n",
"recruitment_channel 3\n",
"dtype: int64"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[objvar].apply(lambda x:len(x.unique()))"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Sales & Marketing' 'Operations' 'Technology' 'Analytics' 'R&D'\n",
" 'Procurement' 'Finance' 'HR' 'Legal']\n",
"['region_7' 'region_22' 'region_19' 'region_23' 'region_26' 'region_2'\n",
" 'region_20' 'region_34' 'region_1' 'region_4' 'region_29' 'region_31'\n",
" 'region_15' 'region_14' 'region_11' 'region_5' 'region_28' 'region_17'\n",
" 'region_13' 'region_16' 'region_25' 'region_10' 'region_27' 'region_30'\n",
" 'region_12' 'region_21' 'region_8' 'region_32' 'region_6' 'region_33'\n",
" 'region_24' 'region_3' 'region_9' 'region_18']\n",
"[\"Master's & above\" \"Bachelor's\" nan 'Below Secondary']\n",
"['f' 'm']\n",
"['sourcing' 'other' 'referred']\n"
]
}
],
"source": [
"for var in objvar:\n",
" print(df[var].unique())"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sales & Marketing 16840\n",
"Operations 11348\n",
"Procurement 7138\n",
"Technology 7138\n",
"Analytics 5352\n",
"Finance 2536\n",
"HR 2418\n",
"Legal 1039\n",
"R&D 999\n",
"Name: department, dtype: int64\n",
"region_2 12343\n",
"region_22 6428\n",
"region_7 4843\n",
"region_15 2808\n",
"region_13 2648\n",
"region_26 2260\n",
"region_31 1935\n",
"region_4 1703\n",
"region_27 1659\n",
"region_16 1465\n",
"region_28 1318\n",
"region_11 1315\n",
"region_23 1175\n",
"region_29 994\n",
"region_32 945\n",
"region_19 874\n",
"region_20 850\n",
"region_14 827\n",
"region_25 819\n",
"region_17 796\n",
"region_5 766\n",
"region_6 690\n",
"region_30 657\n",
"region_8 655\n",
"region_10 648\n",
"region_1 610\n",
"region_24 508\n",
"region_12 500\n",
"region_9 420\n",
"region_21 411\n",
"region_3 346\n",
"region_34 292\n",
"region_33 269\n",
"region_18 31\n",
"Name: region, dtype: int64\n",
"Bachelor's 36669\n",
"Master's & above 14925\n",
"Below Secondary 805\n",
"Name: education, dtype: int64\n",
"m 38496\n",
"f 16312\n",
"Name: gender, dtype: int64\n",
"other 30446\n",
"sourcing 23220\n",
"referred 1142\n",
"Name: recruitment_channel, dtype: int64\n"
]
}
],
"source": [
"for var in objvar:\n",
" print(df[var].value_counts())"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 0.91483\n",
"1 0.08517\n",
"Name: is_promoted, dtype: float64"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['is_promoted'].value_counts()/df.shape[0]"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 0.977224\n",
"1 0.022776\n",
"Name: awards_won?, dtype: float64"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dftest['awards_won?'].value_counts()/dftest.shape[0]"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"nrows=dftest.index"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"count=0\n",
"n=0"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"for n in nrows:\n",
" if dftest.iloc[n,12]>95:\n",
" count=count+1"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"varn=df.columns"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"department\n",
"Analytics 0.095665\n",
"Finance 0.081230\n",
"HR 0.056245\n",
"Legal 0.051011\n",
"Operations 0.090148\n",
"Procurement 0.096386\n",
"R&D 0.069069\n",
"Sales & Marketing 0.072031\n",
"Technology 0.107593\n",
"All 0.085170\n",
"dtype: float64\n",
"region\n",
"region_1 0.095082\n",
"region_10 0.078704\n",
"region_11 0.056274\n",
"region_12 0.066000\n",
"region_13 0.086858\n",
"region_14 0.074970\n",
"region_15 0.079060\n",
"region_16 0.069625\n",
"region_17 0.136935\n",
"region_18 0.032258\n",
"region_19 0.060641\n",
"region_2 0.080126\n",
"region_20 0.057647\n",
"region_21 0.043796\n",
"region_22 0.114188\n",
"region_23 0.116596\n",
"region_24 0.035433\n",
"region_25 0.125763\n",
"region_26 0.063274\n",
"region_27 0.078963\n",
"region_28 0.116844\n",
"region_29 0.043260\n",
"region_3 0.106936\n",
"region_30 0.089802\n",
"region_31 0.056848\n",
"region_32 0.042328\n",
"region_33 0.037175\n",
"region_34 0.027397\n",
"region_4 0.144451\n",
"region_5 0.045692\n",
"region_6 0.046377\n",
"region_7 0.106546\n",
"region_8 0.080916\n",
"region_9 0.019048\n",
"All 0.085170\n",
"dtype: float64\n",
"education\n",
"Bachelor's 0.082031\n",
"Below Secondary 0.083230\n",
"Master's & above 0.098559\n",
"All 0.086757\n",
"dtype: float64\n",
"gender\n",
"f 0.089934\n",
"m 0.083151\n",
"All 0.085170\n",
"dtype: float64\n",
"recruitment_channel\n",
"other 0.083952\n",
"referred 0.120841\n",
"sourcing 0.085013\n",
"All 0.085170\n",
"dtype: float64\n",
"no_of_trainings\n",
"1 0.088107\n",
"2 0.075748\n",
"3 0.068694\n",
"4 0.055556\n",
"5 0.023438\n",
"6 0.045455\n",
"7 0.000000\n",
"8 0.000000\n",
"9 0.000000\n",
"10 0.000000\n",
"All 0.085170\n",
"dtype: float64\n",
"age\n",
"20 0.035398\n",
"21 0.051020\n",
"22 0.077922\n",
"23 0.079439\n",
"24 0.082840\n",
"25 0.053118\n",
"26 0.079126\n",
"27 0.092324\n",
"28 0.097871\n",
"29 0.086344\n",
"30 0.088404\n",
"31 0.087719\n",
"32 0.084041\n",
"33 0.099377\n",
"34 0.092978\n",
"35 0.104390\n",
"36 0.082638\n",
"37 0.084988\n",
"38 0.095684\n",
"39 0.094985\n",
"40 0.076368\n",
"41 0.080683\n",
"42 0.079199\n",
"43 0.077621\n",
"44 0.082645\n",
"45 0.063158\n",
"46 0.061693\n",
"47 0.048474\n",
"48 0.078995\n",
"49 0.074830\n",
"50 0.059501\n",
"51 0.087404\n",
"52 0.079772\n",
"53 0.038462\n",
"54 0.076677\n",
"55 0.061224\n",
"56 0.079545\n",
"57 0.021008\n",
"58 0.075117\n",
"59 0.081340\n",
"60 0.064516\n",
"All 0.085170\n",
"dtype: float64\n",
"previous_year_rating\n",
"1.0 0.014141\n",
"2.0 0.042840\n",
"3.0 0.072779\n",
"4.0 0.079376\n",
"5.0 0.163615\n",
"All 0.085412\n",
"dtype: float64\n",
"length_of_service\n",
"1 0.082912\n",
"2 0.089019\n",
"3 0.086592\n",
"4 0.087478\n",
"5 0.081447\n",
"6 0.084706\n",
"7 0.083589\n",
"8 0.093306\n",
"9 0.087105\n",
"10 0.093023\n",
"11 0.104803\n",
"12 0.079345\n",
"13 0.078603\n",
"14 0.052823\n",
"15 0.072513\n",
"16 0.074818\n",
"17 0.060185\n",
"18 0.063776\n",
"19 0.097264\n",
"20 0.078125\n",
"21 0.051282\n",
"22 0.098361\n",
"23 0.107692\n",
"24 0.000000\n",
"25 0.039216\n",
"26 0.000000\n",
"27 0.027778\n",
"28 0.066667\n",
"29 0.100000\n",
"30 0.000000\n",
"31 0.000000\n",
"32 0.200000\n",
"33 0.000000\n",
"34 0.250000\n",
"37 0.000000\n",
"All 0.085170\n",
"dtype: float64\n",
"KPIs_met >80%\n",
"0 0.039587\n",
"1 0.169094\n",
"All 0.085170\n",
"dtype: float64\n",
"awards_won?\n",
"0 0.076749\n",
"1 0.440157\n",
"All 0.085170\n",
"dtype: float64\n",
"avg_training_score\n",
"39 0.000000\n",
"40 0.000000\n",
"41 0.038462\n",
"42 0.032258\n",
"43 0.039773\n",
"44 0.029851\n",
"45 0.045521\n",
"46 0.031690\n",
"47 0.045246\n",
"48 0.039393\n",
"49 0.037300\n",
"50 0.041237\n",
"51 0.043886\n",
"52 0.038793\n",
"53 0.033233\n",
"54 0.049147\n",
"55 0.053899\n",
"56 0.064486\n",
"57 0.057063\n",
"58 0.068493\n",
"59 0.070252\n",
"60 0.064037\n",
"61 0.077701\n",
"62 0.080690\n",
"63 0.092067\n",
"64 0.114407\n",
"65 0.108514\n",
"66 0.091379\n",
"67 0.101648\n",
"68 0.082353\n",
" ... \n",
"71 0.120267\n",
"72 0.118621\n",
"73 0.112811\n",
"74 0.170901\n",
"75 0.121588\n",
"76 0.091085\n",
"77 0.093257\n",
"78 0.094319\n",
"79 0.072414\n",
"80 0.099502\n",
"81 0.109801\n",
"82 0.104354\n",
"83 0.106010\n",
"84 0.102740\n",
"85 0.097948\n",
"86 0.131579\n",
"87 0.128244\n",
"88 0.184685\n",
"89 0.279070\n",
"90 0.427027\n",
"91 0.649573\n",
"92 0.818182\n",
"93 0.892857\n",
"94 0.969231\n",
"95 1.000000\n",
"96 0.979167\n",
"97 1.000000\n",
"98 1.000000\n",
"99 1.000000\n",
"All 0.085170\n",
"Length: 62, dtype: float64\n"
]
},
{
"ename": "ValueError",
"evalue": "Duplicated level name: \"is_promoted\", assigned to level 1, is already used for level 0.",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mvar\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mvarn\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mctab1\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcrosstab\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mvar\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"is_promoted\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mmargins\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mctab1\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m/\u001b[0m\u001b[0mctab1\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"All\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\reshape\\pivot.py\u001b[0m in \u001b[0;36mcrosstab\u001b[1;34m(index, columns, values, rownames, colnames, aggfunc, margins, margins_name, dropna, normalize)\u001b[0m\n\u001b[0;32m 474\u001b[0m table = df.pivot_table('__dummy__', index=rownames, columns=colnames,\n\u001b[0;32m 475\u001b[0m \u001b[0mmargins\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmargins\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmargins_name\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmargins_name\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 476\u001b[1;33m dropna=dropna, **kwargs)\n\u001b[0m\u001b[0;32m 477\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 478\u001b[0m \u001b[1;31m# Post-process\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\frame.py\u001b[0m in \u001b[0;36mpivot_table\u001b[1;34m(self, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name)\u001b[0m\n\u001b[0;32m 5298\u001b[0m \u001b[0maggfunc\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maggfunc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfill_value\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mfill_value\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5299\u001b[0m \u001b[0mmargins\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmargins\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdropna\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdropna\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 5300\u001b[1;33m margins_name=margins_name)\n\u001b[0m\u001b[0;32m 5301\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5302\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mstack\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlevel\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m-\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdropna\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\reshape\\pivot.py\u001b[0m in \u001b[0;36mpivot_table\u001b[1;34m(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name)\u001b[0m\n\u001b[0;32m 81\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 82\u001b[0m \u001b[0mgrouped\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkeys\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mobserved\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdropna\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 83\u001b[1;33m \u001b[0magged\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mgrouped\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0magg\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0maggfunc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 84\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 85\u001b[0m \u001b[0mtable\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0magged\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\groupby\\groupby.py\u001b[0m in \u001b[0;36maggregate\u001b[1;34m(self, arg, *args, **kwargs)\u001b[0m\n\u001b[0;32m 4656\u001b[0m axis=''))\n\u001b[0;32m 4657\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0maggregate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0marg\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 4658\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mDataFrameGroupBy\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maggregate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marg\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4659\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4660\u001b[0m \u001b[0magg\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0maggregate\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\groupby\\groupby.py\u001b[0m in \u001b[0;36maggregate\u001b[1;34m(self, arg, *args, **kwargs)\u001b[0m\n\u001b[0;32m 4095\u001b[0m \u001b[1;31m# grouper specific aggregations\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4096\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgrouper\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnkeys\u001b[0m \u001b[1;33m>\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 4097\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_python_agg_general\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marg\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4098\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4099\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\groupby\\groupby.py\u001b[0m in \u001b[0;36m_python_agg_general\u001b[1;34m(self, func, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1086\u001b[0m \u001b[0moutput\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_try_cast\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mmask\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1087\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1088\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_wrap_aggregated_output\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0moutput\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1089\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1090\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_wrap_applied_output\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\groupby\\groupby.py\u001b[0m in \u001b[0;36m_wrap_aggregated_output\u001b[1;34m(self, output, names)\u001b[0m\n\u001b[0;32m 4728\u001b[0m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_consolidate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4729\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 4730\u001b[1;33m \u001b[0mindex\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgrouper\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mresult_index\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4731\u001b[0m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mDataFrame\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0moutput\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0moutput_keys\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4732\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mpandas\\_libs\\properties.pyx\u001b[0m in \u001b[0;36mpandas._libs.properties.CachedProperty.__get__\u001b[1;34m()\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\groupby\\groupby.py\u001b[0m in \u001b[0;36mresult_index\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 2379\u001b[0m \u001b[0mlabels\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mlabels\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2380\u001b[0m \u001b[0mverify_integrity\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2381\u001b[1;33m names=self.names)\n\u001b[0m\u001b[0;32m 2382\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2383\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\indexes\\multi.py\u001b[0m in \u001b[0;36m__new__\u001b[1;34m(cls, levels, labels, sortorder, names, dtype, copy, name, verify_integrity, _set_identity)\u001b[0m\n\u001b[0;32m 230\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mnames\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 231\u001b[0m \u001b[1;31m# handles name validation\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 232\u001b[1;33m \u001b[0mresult\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_set_names\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnames\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 233\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 234\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0msortorder\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\indexes\\multi.py\u001b[0m in \u001b[0;36m_set_names\u001b[1;34m(self, names, level, validate)\u001b[0m\n\u001b[0;32m 693\u001b[0m \u001b[1;34m'Duplicated level name: \"{}\", assigned to '\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 694\u001b[0m \u001b[1;34m'level {}, is already used for level '\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 695\u001b[1;33m '{}.'.format(name, l, used[name]))\n\u001b[0m\u001b[0;32m 696\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 697\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlevels\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0ml\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrename\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minplace\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mValueError\u001b[0m: Duplicated level name: \"is_promoted\", assigned to level 1, is already used for level 0."
]
}
],
"source": [
"for var in varn:\n",
" ctab1 = pd.crosstab(df[var],df[\"is_promoted\"],margins=True)\n",
" print(ctab1[1]/ctab1[\"All\"])"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"is_promoted 0 1 All\n",
"department \n",
"Analytics 4840 512 5352\n",
"Finance 2330 206 2536\n",
"HR 2282 136 2418\n",
"Legal 986 53 1039\n",
"Operations 10325 1023 11348\n",
"Procurement 6450 688 7138\n",
"R&D 930 69 999\n",
"Sales & Marketing 15627 1213 16840\n",
"Technology 6370 768 7138\n",
"All 50140 4668 54808\n",
"is_promoted 0 1 All\n",
"region \n",
"region_1 552 58 610\n",
"region_10 597 51 648\n",
"region_11 1241 74 1315\n",
"region_12 467 33 500\n",
"region_13 2418 230 2648\n",
"region_14 765 62 827\n",
"region_15 2586 222 2808\n",
"region_16 1363 102 1465\n",
"region_17 687 109 796\n",
"region_18 30 1 31\n",
"region_19 821 53 874\n",
"region_2 11354 989 12343\n",
"region_20 801 49 850\n",
"region_21 393 18 411\n",
"region_22 5694 734 6428\n",
"region_23 1038 137 1175\n",
"region_24 490 18 508\n",
"region_25 716 103 819\n",
"region_26 2117 143 2260\n",
"region_27 1528 131 1659\n",
"region_28 1164 154 1318\n",
"region_29 951 43 994\n",
"region_3 309 37 346\n",
"region_30 598 59 657\n",
"region_31 1825 110 1935\n",
"region_32 905 40 945\n",
"region_33 259 10 269\n",
"region_34 284 8 292\n",
"region_4 1457 246 1703\n",
"region_5 731 35 766\n",
"region_6 658 32 690\n",
"region_7 4327 516 4843\n",
"region_8 602 53 655\n",
"region_9 412 8 420\n",
"All 50140 4668 54808\n",
"is_promoted 0 1 All\n",
"education \n",
"Bachelor's 33661 3008 36669\n",
"Below Secondary 738 67 805\n",
"Master's & above 13454 1471 14925\n",
"All 47853 4546 52399\n",
"is_promoted 0 1 All\n",
"gender \n",
"f 14845 1467 16312\n",
"m 35295 3201 38496\n",
"All 50140 4668 54808\n",
"is_promoted 0 1 All\n",
"recruitment_channel \n",
"other 27890 2556 30446\n",
"referred 1004 138 1142\n",
"sourcing 21246 1974 23220\n",
"All 50140 4668 54808\n",
"is_promoted 0 1 All\n",
"no_of_trainings \n",
"1 40468 3910 44378\n",
"2 7382 605 7987\n",
"3 1654 122 1776\n",
"4 442 26 468\n",
"5 125 3 128\n",
"6 42 2 44\n",
"7 12 0 12\n",
"8 5 0 5\n",
"9 5 0 5\n",
"10 5 0 5\n",
"All 50140 4668 54808\n",
"is_promoted 0 1 All\n",
"age \n",
"20 109 4 113\n",
"21 93 5 98\n",
"22 213 18 231\n",
"23 394 34 428\n",
"24 775 70 845\n",
"25 1230 69 1299\n",
"26 1897 163 2060\n",
"27 2566 261 2827\n",
"28 2839 308 3147\n",
"29 3111 294 3405\n",
"30 3341 324 3665\n",
"31 3224 310 3534\n",
"32 3237 297 3534\n",
"33 2891 319 3210\n",
"34 2790 286 3076\n",
"35 2428 283 2711\n",
"36 2309 208 2517\n",
"37 1981 184 2165\n",
"38 1739 184 1923\n",
"39 1534 161 1695\n",
"40 1536 127 1663\n",
"41 1185 104 1289\n",
"42 1058 91 1149\n",
"43 915 77 992\n",
"44 777 70 847\n",
"45 712 48 760\n",
"46 654 43 697\n",
"47 530 27 557\n",
"48 513 44 557\n",
"49 408 33 441\n",
"50 490 31 521\n",
"51 355 34 389\n",
"52 323 28 351\n",
"53 350 14 364\n",
"54 289 24 313\n",
"55 276 18 294\n",
"56 243 21 264\n",
"57 233 5 238\n",
"58 197 16 213\n",
"59 192 17 209\n",
"60 203 14 217\n",
"All 50140 4668 54808\n",
"is_promoted 0 1 All\n",
"previous_year_rating \n",
"1.0 6135 88 6223\n",
"2.0 4044 181 4225\n",
"3.0 17263 1355 18618\n",
"4.0 9093 784 9877\n",
"5.0 9820 1921 11741\n",
"All 46355 4329 50684\n",
"is_promoted 0 1 All\n",
"length_of_service \n",
"1 4170 377 4547\n",
"2 6089 595 6684\n",
"3 6424 609 7033\n",
"4 6238 598 6836\n",
"5 5357 475 5832\n",
"6 4333 401 4734\n",
"7 5087 464 5551\n",
"8 2614 269 2883\n",
"9 2400 229 2629\n",
"10 1989 204 2193\n",
"11 820 96 916\n",
"12 731 63 794\n",
"13 633 54 687\n",
"14 520 29 549\n",
"15 550 43 593\n",
"16 507 41 548\n",
"17 406 26 432\n",
"18 367 25 392\n",
"19 297 32 329\n",
"20 118 10 128\n",
"21 74 4 78\n",
"22 55 6 61\n",
"23 58 7 65\n",
"24 70 0 70\n",
"25 49 2 51\n",
"26 41 0 41\n",
"27 35 1 36\n",
"28 28 2 30\n",
"29 27 3 30\n",
"30 12 0 12\n",
"31 20 0 20\n",
"32 8 2 10\n",
"33 9 0 9\n",
"34 3 1 4\n",
"37 1 0 1\n",
"All 50140 4668 54808\n",
"is_promoted 0 1 All\n",
"KPIs_met >80% \n",
"0 34111 1406 35517\n",
"1 16029 3262 19291\n",
"All 50140 4668 54808\n",
"is_promoted 0 1 All\n",
"awards_won? \n",
"0 49429 4109 53538\n",
"1 711 559 1270\n",
"All 50140 4668 54808\n",
"is_promoted 0 1 All\n",
"avg_training_score \n",
"39 2 0 2\n",
"40 5 0 5\n",
"41 25 1 26\n",
"42 60 2 62\n",
"43 169 7 176\n",
"44 325 10 335\n",
"45 650 31 681\n",
"46 1100 36 1136\n",
"47 1667 79 1746\n",
"48 2341 96 2437\n",
"49 2581 100 2681\n",
"50 2604 112 2716\n",
"51 2244 103 2347\n",
"52 1784 72 1856\n",
"53 1280 44 1324\n",
"54 948 49 997\n",
"55 825 47 872\n",
"56 1001 69 1070\n",
"57 1355 82 1437\n",
"58 1768 130 1898\n",
"59 1919 145 2064\n",
"60 2017 138 2155\n",
"61 1733 146 1879\n",
"62 1333 117 1450\n",
"63 927 94 1021\n",
"64 627 81 708\n",
"65 534 65 599\n",
"66 527 53 580\n",
"67 654 74 728\n",
"68 858 77 935\n",
"... ... ... ...\n",
"71 790 108 898\n",
"72 639 86 725\n",
"73 464 59 523\n",
"74 359 74 433\n",
"75 354 49 403\n",
"76 469 47 516\n",
"77 632 65 697\n",
"78 845 88 933\n",
"79 1076 84 1160\n",
"80 1086 120 1206\n",
"81 1208 149 1357\n",
"82 1296 151 1447\n",
"83 1071 127 1198\n",
"84 1048 120 1168\n",
"85 967 105 1072\n",
"86 792 120 912\n",
"87 571 84 655\n",
"88 362 82 444\n",
"89 217 84 301\n",
"90 106 79 185\n",
"91 41 76 117\n",
"92 18 81 99\n",
"93 9 75 84\n",
"94 2 63 65\n",
"95 0 45 45\n",
"96 1 47 48\n",
"97 0 49 49\n",
"98 0 37 37\n",
"99 0 35 35\n",
"All 50140 4668 54808\n",
"\n",
"[62 rows x 3 columns]\n"
]
},
{
"ename": "ValueError",
"evalue": "Duplicated level name: \"is_promoted\", assigned to level 1, is already used for level 0.",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mvar\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mvarn\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mctab1\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcrosstab\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mvar\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"is_promoted\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mmargins\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mctab1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\reshape\\pivot.py\u001b[0m in \u001b[0;36mcrosstab\u001b[1;34m(index, columns, values, rownames, colnames, aggfunc, margins, margins_name, dropna, normalize)\u001b[0m\n\u001b[0;32m 474\u001b[0m table = df.pivot_table('__dummy__', index=rownames, columns=colnames,\n\u001b[0;32m 475\u001b[0m \u001b[0mmargins\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmargins\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmargins_name\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmargins_name\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 476\u001b[1;33m dropna=dropna, **kwargs)\n\u001b[0m\u001b[0;32m 477\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 478\u001b[0m \u001b[1;31m# Post-process\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\frame.py\u001b[0m in \u001b[0;36mpivot_table\u001b[1;34m(self, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name)\u001b[0m\n\u001b[0;32m 5298\u001b[0m \u001b[0maggfunc\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maggfunc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfill_value\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mfill_value\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5299\u001b[0m \u001b[0mmargins\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmargins\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdropna\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdropna\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 5300\u001b[1;33m margins_name=margins_name)\n\u001b[0m\u001b[0;32m 5301\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5302\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mstack\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlevel\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m-\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdropna\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\reshape\\pivot.py\u001b[0m in \u001b[0;36mpivot_table\u001b[1;34m(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name)\u001b[0m\n\u001b[0;32m 81\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 82\u001b[0m \u001b[0mgrouped\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkeys\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mobserved\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdropna\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 83\u001b[1;33m \u001b[0magged\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mgrouped\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0magg\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0maggfunc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 84\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 85\u001b[0m \u001b[0mtable\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0magged\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\groupby\\groupby.py\u001b[0m in \u001b[0;36maggregate\u001b[1;34m(self, arg, *args, **kwargs)\u001b[0m\n\u001b[0;32m 4656\u001b[0m axis=''))\n\u001b[0;32m 4657\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0maggregate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0marg\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 4658\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mDataFrameGroupBy\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maggregate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marg\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4659\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4660\u001b[0m \u001b[0magg\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0maggregate\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\groupby\\groupby.py\u001b[0m in \u001b[0;36maggregate\u001b[1;34m(self, arg, *args, **kwargs)\u001b[0m\n\u001b[0;32m 4095\u001b[0m \u001b[1;31m# grouper specific aggregations\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4096\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgrouper\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnkeys\u001b[0m \u001b[1;33m>\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 4097\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_python_agg_general\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marg\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4098\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4099\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\groupby\\groupby.py\u001b[0m in \u001b[0;36m_python_agg_general\u001b[1;34m(self, func, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1086\u001b[0m \u001b[0moutput\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_try_cast\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mmask\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1087\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1088\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_wrap_aggregated_output\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0moutput\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1089\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1090\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_wrap_applied_output\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\groupby\\groupby.py\u001b[0m in \u001b[0;36m_wrap_aggregated_output\u001b[1;34m(self, output, names)\u001b[0m\n\u001b[0;32m 4728\u001b[0m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_consolidate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4729\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 4730\u001b[1;33m \u001b[0mindex\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgrouper\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mresult_index\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4731\u001b[0m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mDataFrame\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0moutput\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0moutput_keys\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4732\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mpandas\\_libs\\properties.pyx\u001b[0m in \u001b[0;36mpandas._libs.properties.CachedProperty.__get__\u001b[1;34m()\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\groupby\\groupby.py\u001b[0m in \u001b[0;36mresult_index\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 2379\u001b[0m \u001b[0mlabels\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mlabels\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2380\u001b[0m \u001b[0mverify_integrity\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2381\u001b[1;33m names=self.names)\n\u001b[0m\u001b[0;32m 2382\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2383\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\indexes\\multi.py\u001b[0m in \u001b[0;36m__new__\u001b[1;34m(cls, levels, labels, sortorder, names, dtype, copy, name, verify_integrity, _set_identity)\u001b[0m\n\u001b[0;32m 230\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mnames\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 231\u001b[0m \u001b[1;31m# handles name validation\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 232\u001b[1;33m \u001b[0mresult\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_set_names\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnames\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 233\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 234\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0msortorder\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\indexes\\multi.py\u001b[0m in \u001b[0;36m_set_names\u001b[1;34m(self, names, level, validate)\u001b[0m\n\u001b[0;32m 693\u001b[0m \u001b[1;34m'Duplicated level name: \"{}\", assigned to '\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 694\u001b[0m \u001b[1;34m'level {}, is already used for level '\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 695\u001b[1;33m '{}.'.format(name, l, used[name]))\n\u001b[0m\u001b[0;32m 696\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 697\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlevels\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0ml\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrename\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minplace\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mValueError\u001b[0m: Duplicated level name: \"is_promoted\", assigned to level 1, is already used for level 0."
]
}
],
"source": [
"for var in varn:\n",
" ctab1 = pd.crosstab(df[var],df[\"is_promoted\"],margins=True)\n",
" print(ctab1)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
" X_train, X_test, Y_train, Y_test = train_test_split(df_new, labels, test_size=0.2,random_state=0)"
]
},
{
"cell_type": "code",
"execution_count": 157,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"department 0\n",
"region 0\n",
"education 2409\n",
"gender 0\n",
"recruitment_channel 0\n",
"no_of_trainings 0\n",
"age 0\n",
"previous_year_rating 4124\n",
"length_of_service 0\n",
"KPIs_met >80% 0\n",
"awards_won? 0\n",
"avg_training_score 0\n",
"is_promoted 0\n",
"dtype: int64"
]
},
"execution_count": 157,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.apply(lambda x:sum(x.isnull()))"
]
},
{
"cell_type": "code",
"execution_count": 158,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(54808, 13)"
]
},
"execution_count": 158,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"is_promoted 0 1 All\n",
"education \n",
"Bachelor's 33661 3008 36669\n",
"Below Secondary 738 67 805\n",
"Master's & above 13454 1471 14925\n",
"All 47853 4546 52399\n"
]
}
],
"source": [
"print(pd.crosstab(df[\"education\"],df[\"is_promoted\"],margins=True))"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"from scipy.stats import mode"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import LabelEncoder\n"
]
},
{
"cell_type": "code",
"execution_count": 159,
"metadata": {},
"outputs": [],
"source": [
"le=LabelEncoder()"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['education', 'gender', 'recruitment_channel'], dtype='object')"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"var_test=df.dtypes.loc[df.dtypes==\"object\"].index\n",
"var_test"
]
},
{
"cell_type": "code",
"execution_count": 160,
"metadata": {},
"outputs": [],
"source": [
"df[\"department\"] =le.fit_transform(df[\"department\"])"
]
},
{
"cell_type": "code",
"execution_count": 161,
"metadata": {},
"outputs": [],
"source": [
"dftest[\"department\"] =le.fit_transform(dftest[\"department\"])"
]
},
{
"cell_type": "code",
"execution_count": 164,
"metadata": {},
"outputs": [],
"source": [
"df[\"region\"]=le.fit_transform(df[\"region\"])"
]
},
{
"cell_type": "code",
"execution_count": 163,
"metadata": {},
"outputs": [],
"source": [
"dftest[\"region\"]=le.fit_transform(dftest[\"region\"])"
]
},
{
"cell_type": "code",
"execution_count": 166,
"metadata": {},
"outputs": [],
"source": [
"df[\"gender\"]=le.fit_transform(df[\"gender\"])"
]
},
{
"cell_type": "code",
"execution_count": 168,
"metadata": {},
"outputs": [],
"source": [
"dftest[\"gender\"]=le.fit_transform(dftest[\"gender\"])"
]
},
{
"cell_type": "code",
"execution_count": 170,
"metadata": {},
"outputs": [],
"source": [
"df[\"recruitment_channel\"]=le.fit_transform(df[\"recruitment_channel\"])"
]
},
{
"cell_type": "code",
"execution_count": 171,
"metadata": {},
"outputs": [],
"source": [
"dftest[\"recruitment_channel\"]=le.fit_transform(dftest[\"recruitment_channel\"])"
]
},
{
"cell_type": "code",
"execution_count": 172,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"department int64\n",
"region int64\n",
"education object\n",
"gender int64\n",
"recruitment_channel int64\n",
"no_of_trainings int64\n",
"age int64\n",
"previous_year_rating float64\n",
"length_of_service int64\n",
"KPIs_met >80% int64\n",
"awards_won? int64\n",
"avg_training_score int64\n",
"is_promoted int64\n",
"dtype: object"
]
},
"execution_count": 172,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 173,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"employee_id int64\n",
"department int64\n",
"region int64\n",
"education object\n",
"gender int64\n",
"recruitment_channel int64\n",
"no_of_trainings int64\n",
"age int64\n",
"previous_year_rating float64\n",
"length_of_service int64\n",
"KPIs_met >80% int64\n",
"awards_won? int64\n",
"avg_training_score int64\n",
"dtype: object"
]
},
"execution_count": 173,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dftest.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 174,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Bachelor's 36669\n",
"Master's & above 14925\n",
"Below Secondary 805\n",
"Name: education, dtype: int64\n"
]
}
],
"source": [
" print(df[\"education\"].value_counts())"
]
},
{
"cell_type": "code",
"execution_count": 175,
"metadata": {},
"outputs": [],
"source": [
"replace_map = {'education': {\"Bachelor's\": 2, \"Master's & above\": 3, \"Below Secondary\": 1}}"
]
},
{
"cell_type": "code",
"execution_count": 176,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" department | \n",
" region | \n",
" education | \n",
" gender | \n",
" recruitment_channel | \n",
" no_of_trainings | \n",
" age | \n",
" previous_year_rating | \n",
" length_of_service | \n",
" KPIs_met >80% | \n",
" awards_won? | \n",
" avg_training_score | \n",
" is_promoted | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 7 | \n",
" 31 | \n",
" 3.0 | \n",
" 0 | \n",
" 2 | \n",
" 1 | \n",
" 35 | \n",
" 5.0 | \n",
" 8 | \n",
" 1 | \n",
" 0 | \n",
" 49 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 4 | \n",
" 14 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 30 | \n",
" 5.0 | \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
" 60 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 7 | \n",
" 10 | \n",
" 2.0 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 34 | \n",
" 3.0 | \n",
" 7 | \n",
" 0 | \n",
" 0 | \n",
" 50 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 7 | \n",
" 15 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 2 | \n",
" 39 | \n",
" 1.0 | \n",
" 10 | \n",
" 0 | \n",
" 0 | \n",
" 50 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 8 | \n",
" 18 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 45 | \n",
" 3.0 | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 73 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" department region education gender recruitment_channel \\\n",
"0 7 31 3.0 0 2 \n",
"1 4 14 2.0 1 0 \n",
"2 7 10 2.0 1 2 \n",
"3 7 15 2.0 1 0 \n",
"4 8 18 2.0 1 0 \n",
"\n",
" no_of_trainings age previous_year_rating length_of_service \\\n",
"0 1 35 5.0 8 \n",
"1 1 30 5.0 4 \n",
"2 1 34 3.0 7 \n",
"3 2 39 1.0 10 \n",
"4 1 45 3.0 2 \n",
"\n",
" KPIs_met >80% awards_won? avg_training_score is_promoted \n",
"0 1 0 49 0 \n",
"1 0 0 60 0 \n",
"2 0 0 50 0 \n",
"3 0 0 50 0 \n",
"4 0 0 73 0 "
]
},
"execution_count": 176,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.replace(replace_map, inplace=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 213,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" department | \n",
" region | \n",
" education | \n",
" gender | \n",
" recruitment_channel | \n",
" no_of_trainings | \n",
" age | \n",
" previous_year_rating | \n",
" length_of_service | \n",
" KPIs_met >80% | \n",
" awards_won? | \n",
" avg_training_score | \n",
" is_promoted | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 7 | \n",
" 31 | \n",
" 3.0 | \n",
" 0 | \n",
" 2 | \n",
" 1 | \n",
" 35 | \n",
" 5.0 | \n",
" 8 | \n",
" 1 | \n",
" 0 | \n",
" 49 | \n",
" NO | \n",
"
\n",
" \n",
" 1 | \n",
" 4 | \n",
" 14 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 30 | \n",
" 5.0 | \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
" 60 | \n",
" NO | \n",
"
\n",
" \n",
" 2 | \n",
" 7 | \n",
" 10 | \n",
" 2.0 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 34 | \n",
" 3.0 | \n",
" 7 | \n",
" 0 | \n",
" 0 | \n",
" 50 | \n",
" NO | \n",
"
\n",
" \n",
" 3 | \n",
" 7 | \n",
" 15 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 2 | \n",
" 39 | \n",
" 1.0 | \n",
" 10 | \n",
" 0 | \n",
" 0 | \n",
" 50 | \n",
" NO | \n",
"
\n",
" \n",
" 4 | \n",
" 8 | \n",
" 18 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 45 | \n",
" 3.0 | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 73 | \n",
" NO | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" department region education gender recruitment_channel \\\n",
"0 7 31 3.0 0 2 \n",
"1 4 14 2.0 1 0 \n",
"2 7 10 2.0 1 2 \n",
"3 7 15 2.0 1 0 \n",
"4 8 18 2.0 1 0 \n",
"\n",
" no_of_trainings age previous_year_rating length_of_service \\\n",
"0 1 35 5.0 8 \n",
"1 1 30 5.0 4 \n",
"2 1 34 3.0 7 \n",
"3 2 39 1.0 10 \n",
"4 1 45 3.0 2 \n",
"\n",
" KPIs_met >80% awards_won? avg_training_score is_promoted \n",
"0 1 0 49 NO \n",
"1 0 0 60 NO \n",
"2 0 0 50 NO \n",
"3 0 0 50 NO \n",
"4 0 0 73 NO "
]
},
"execution_count": 213,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 177,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" employee_id | \n",
" department | \n",
" region | \n",
" education | \n",
" gender | \n",
" recruitment_channel | \n",
" no_of_trainings | \n",
" age | \n",
" previous_year_rating | \n",
" length_of_service | \n",
" KPIs_met >80% | \n",
" awards_won? | \n",
" avg_training_score | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 8724 | \n",
" 8 | \n",
" 18 | \n",
" 2.0 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 24 | \n",
" NaN | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 77 | \n",
"
\n",
" \n",
" 1 | \n",
" 74430 | \n",
" 2 | \n",
" 28 | \n",
" 2.0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 31 | \n",
" 3.0 | \n",
" 5 | \n",
" 0 | \n",
" 0 | \n",
" 51 | \n",
"
\n",
" \n",
" 2 | \n",
" 72255 | \n",
" 7 | \n",
" 4 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 31 | \n",
" 1.0 | \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
" 47 | \n",
"
\n",
" \n",
" 3 | \n",
" 38562 | \n",
" 5 | \n",
" 11 | \n",
" 2.0 | \n",
" 0 | \n",
" 0 | \n",
" 3 | \n",
" 31 | \n",
" 2.0 | \n",
" 9 | \n",
" 0 | \n",
" 0 | \n",
" 65 | \n",
"
\n",
" \n",
" 4 | \n",
" 64486 | \n",
" 1 | \n",
" 21 | \n",
" 2.0 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 30 | \n",
" 4.0 | \n",
" 7 | \n",
" 0 | \n",
" 0 | \n",
" 61 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" employee_id department region education gender recruitment_channel \\\n",
"0 8724 8 18 2.0 1 2 \n",
"1 74430 2 28 2.0 0 0 \n",
"2 72255 7 4 2.0 1 0 \n",
"3 38562 5 11 2.0 0 0 \n",
"4 64486 1 21 2.0 1 2 \n",
"\n",
" no_of_trainings age previous_year_rating length_of_service \\\n",
"0 1 24 NaN 1 \n",
"1 1 31 3.0 5 \n",
"2 1 31 1.0 4 \n",
"3 3 31 2.0 9 \n",
"4 1 30 4.0 7 \n",
"\n",
" KPIs_met >80% awards_won? avg_training_score \n",
"0 1 0 77 \n",
"1 0 0 51 \n",
"2 0 0 47 \n",
"3 0 0 65 \n",
"4 0 0 61 "
]
},
"execution_count": 177,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dftest.replace(replace_map, inplace=True)\n",
"dftest.head()"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" department | \n",
" region | \n",
" education | \n",
" gender | \n",
" recruitment_channel | \n",
" no_of_trainings | \n",
" age | \n",
" previous_year_rating | \n",
" length_of_service | \n",
" KPIs_met >80% | \n",
" awards_won? | \n",
" avg_training_score | \n",
" is_promoted | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 7 | \n",
" 31 | \n",
" 3.0 | \n",
" 0 | \n",
" 2 | \n",
" 1 | \n",
" 35 | \n",
" 5.0 | \n",
" 8 | \n",
" 1 | \n",
" 0 | \n",
" 49 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 4 | \n",
" 14 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 30 | \n",
" 5.0 | \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
" 60 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 7 | \n",
" 10 | \n",
" 2.0 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 34 | \n",
" 3.0 | \n",
" 7 | \n",
" 0 | \n",
" 0 | \n",
" 50 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 7 | \n",
" 15 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 2 | \n",
" 39 | \n",
" 1.0 | \n",
" 10 | \n",
" 0 | \n",
" 0 | \n",
" 50 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 8 | \n",
" 18 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 45 | \n",
" 3.0 | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 73 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" department region education gender recruitment_channel \\\n",
"0 7 31 3.0 0 2 \n",
"1 4 14 2.0 1 0 \n",
"2 7 10 2.0 1 2 \n",
"3 7 15 2.0 1 0 \n",
"4 8 18 2.0 1 0 \n",
"\n",
" no_of_trainings age previous_year_rating length_of_service \\\n",
"0 1 35 5.0 8 \n",
"1 1 30 5.0 4 \n",
"2 1 34 3.0 7 \n",
"3 2 39 1.0 10 \n",
"4 1 45 3.0 2 \n",
"\n",
" KPIs_met >80% awards_won? avg_training_score is_promoted \n",
"0 1 0 49 0 \n",
"1 0 0 60 0 \n",
"2 0 0 50 0 \n",
"3 0 0 50 0 \n",
"4 0 0 73 0 "
]
},
"execution_count": 123,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 178,
"metadata": {},
"outputs": [],
"source": [
"df_new=df.fillna(method=\"ffill\")"
]
},
{
"cell_type": "code",
"execution_count": 179,
"metadata": {},
"outputs": [],
"source": [
"dftest=dftest.fillna(method=\"ffill\")"
]
},
{
"cell_type": "code",
"execution_count": 180,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"department 0\n",
"region 0\n",
"education 0\n",
"gender 0\n",
"recruitment_channel 0\n",
"no_of_trainings 0\n",
"age 0\n",
"previous_year_rating 0\n",
"length_of_service 0\n",
"KPIs_met >80% 0\n",
"awards_won? 0\n",
"avg_training_score 0\n",
"is_promoted 0\n",
"dtype: int64"
]
},
"execution_count": 180,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_new.apply(lambda x:sum(x.isnull()))"
]
},
{
"cell_type": "code",
"execution_count": 181,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"employee_id 0\n",
"department 0\n",
"region 0\n",
"education 0\n",
"gender 0\n",
"recruitment_channel 0\n",
"no_of_trainings 0\n",
"age 0\n",
"previous_year_rating 1\n",
"length_of_service 0\n",
"KPIs_met >80% 0\n",
"awards_won? 0\n",
"avg_training_score 0\n",
"dtype: int64"
]
},
"execution_count": 181,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dftest.apply(lambda x:sum(x.isnull()))"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"import xgboost as xgb"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"regr = xgb.XGBRegressor(colsample_bytree=0.2,\n",
" gamma=0.0,\n",
" learning_rate=0.05,\n",
" max_depth=6,\n",
" min_child_weight=1.5,\n",
" n_estimators=7200,\n",
" reg_alpha=0.9,\n",
" reg_lambda=0.6,\n",
" subsample=0.2,\n",
" seed=42,\n",
" silent=1)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting xgboost\n",
" Downloading https://files.pythonhosted.org/packages/03/b8/0fcc6d3f28f45c5d1ef33fecca4b3bbcf8c4f53bebdc9b9146dad8fda64a/xgboost-0.80-py2.py3-none-win_amd64.whl (7.1MB)\n",
"Requirement already satisfied: numpy in c:\\users\\dell\\anaconda3\\lib\\site-packages (from xgboost) (1.14.3)\n",
"Requirement already satisfied: scipy in c:\\users\\dell\\anaconda3\\lib\\site-packages (from xgboost) (1.1.0)\n",
"Installing collected packages: xgboost\n",
"Successfully installed xgboost-0.80\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"distributed 1.21.8 requires msgpack, which is not installed.\n",
"You are using pip version 10.0.1, however version 18.0 is available.\n",
"You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n"
]
}
],
"source": [
"import sys\n",
"!{sys.executable} -m pip install xgboost"
]
},
{
"cell_type": "code",
"execution_count": 192,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
" colsample_bytree=0.2, gamma=0.0, learning_rate=0.05,\n",
" max_delta_step=0, max_depth=6, min_child_weight=1.5, missing=None,\n",
" n_estimators=7200, n_jobs=1, nthread=None, objective='reg:linear',\n",
" random_state=0, reg_alpha=0.9, reg_lambda=0.6, scale_pos_weight=1,\n",
" seed=42, silent=1, subsample=0.2)"
]
},
"execution_count": 192,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"regr.fit(df_new, labels)"
]
},
{
"cell_type": "code",
"execution_count": 182,
"metadata": {},
"outputs": [],
"source": [
"labels = np.array(df_new['is_promoted'])"
]
},
{
"cell_type": "code",
"execution_count": 218,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dtype('O')"
]
},
"execution_count": 218,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"labels.dtype"
]
},
{
"cell_type": "code",
"execution_count": 191,
"metadata": {},
"outputs": [],
"source": [
"df_new= df_new.drop('is_promoted', axis = 1)\n",
"# Saving feature names for later use\n"
]
},
{
"cell_type": "code",
"execution_count": 185,
"metadata": {},
"outputs": [],
"source": [
"dftest= dftest.drop('employee_id', axis = 1)"
]
},
{
"cell_type": "code",
"execution_count": 184,
"metadata": {},
"outputs": [],
"source": [
"empid=dftest[\"employee_id\"]"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
"dfsample=dfsample.drop('e',axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 149,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['department',\n",
" 'region',\n",
" 'education',\n",
" 'gender',\n",
" 'recruitment_channel',\n",
" 'no_of_trainings',\n",
" 'age',\n",
" 'previous_year_rating',\n",
" 'length_of_service',\n",
" 'KPIs_met >80%',\n",
" 'awards_won?',\n",
" 'avg_training_score']"
]
},
"execution_count": 149,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_list = list(df_new.columns)\n",
"df_list"
]
},
{
"cell_type": "code",
"execution_count": 151,
"metadata": {},
"outputs": [],
"source": [
"# Convert to numpy ar\n",
"df_new= np.array(df_new)"
]
},
{
"cell_type": "code",
"execution_count": 221,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(54808, 12)"
]
},
"execution_count": 221,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_new.shape"
]
},
{
"cell_type": "code",
"execution_count": 222,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(54808,)"
]
},
"execution_count": 222,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"labels.shape"
]
},
{
"cell_type": "code",
"execution_count": 223,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"department 0\n",
"region 0\n",
"education 0\n",
"gender 0\n",
"recruitment_channel 0\n",
"no_of_trainings 0\n",
"age 0\n",
"previous_year_rating 0\n",
"length_of_service 0\n",
"KPIs_met >80% 0\n",
"awards_won? 0\n",
"avg_training_score 0\n",
"dtype: int64"
]
},
"execution_count": 223,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dftest.apply(lambda x:sum(x.isnull()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 186,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" department | \n",
" region | \n",
" education | \n",
" gender | \n",
" recruitment_channel | \n",
" no_of_trainings | \n",
" age | \n",
" previous_year_rating | \n",
" length_of_service | \n",
" KPIs_met >80% | \n",
" awards_won? | \n",
" avg_training_score | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 8 | \n",
" 18 | \n",
" 2.0 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 24 | \n",
" NaN | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 77 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" department region education gender recruitment_channel \\\n",
"0 8 18 2.0 1 2 \n",
"\n",
" no_of_trainings age previous_year_rating length_of_service \\\n",
"0 1 24 NaN 1 \n",
"\n",
" KPIs_met >80% awards_won? avg_training_score \n",
"0 1 0 77 "
]
},
"execution_count": 186,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dftest.loc[dftest[\"previous_year_rating\"].isnull()==True ,:]"
]
},
{
"cell_type": "code",
"execution_count": 186,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"department 8.0\n",
"region 18.0\n",
"education 2.0\n",
"gender 1.0\n",
"recruitment_channel 2.0\n",
"no_of_trainings 1.0\n",
"age 24.0\n",
"previous_year_rating NaN\n",
"length_of_service 1.0\n",
"KPIs_met >80% 1.0\n",
"awards_won? 0.0\n",
"avg_training_score 77.0\n",
"Name: 0, dtype: float64"
]
},
"execution_count": 186,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dftest.iloc[0,]"
]
},
{
"cell_type": "code",
"execution_count": 187,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'col' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mdftest\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mcol\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdftest\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mcol\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmode\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0minplace\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;31mNameError\u001b[0m: name 'col' is not defined"
]
}
],
"source": [
" dftest[col].fillna(dftest[col].mode()[0],inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 188,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3.0"
]
},
"execution_count": 188,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dftest[\"previous_year_rating\"].mode()[0]"
]
},
{
"cell_type": "code",
"execution_count": 188,
"metadata": {},
"outputs": [],
"source": [
"dftest.iloc[0,7]=3"
]
},
{
"cell_type": "code",
"execution_count": 189,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"department 8.0\n",
"region 18.0\n",
"education 2.0\n",
"gender 1.0\n",
"recruitment_channel 2.0\n",
"no_of_trainings 1.0\n",
"age 24.0\n",
"previous_year_rating 3.0\n",
"length_of_service 1.0\n",
"KPIs_met >80% 1.0\n",
"awards_won? 0.0\n",
"avg_training_score 77.0\n",
"Name: 0, dtype: float64"
]
},
"execution_count": 189,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dftest.iloc[0,:]"
]
},
{
"cell_type": "code",
"execution_count": 194,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestRegressor"
]
},
{
"cell_type": "code",
"execution_count": 204,
"metadata": {},
"outputs": [],
"source": [
"rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)\n",
"# Train the model on training data\n"
]
},
{
"cell_type": "code",
"execution_count": 224,
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "could not convert string to float: 'NO'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mrf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf_new\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlabels\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\forest.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[0;32m 274\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 275\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"dtype\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m!=\u001b[0m \u001b[0mDOUBLE\u001b[0m \u001b[1;32mor\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mflags\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcontiguous\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 276\u001b[1;33m \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mascontiguousarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mDOUBLE\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 277\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 278\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mexpanded_class_weight\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\numpy\\core\\numeric.py\u001b[0m in \u001b[0;36mascontiguousarray\u001b[1;34m(a, dtype)\u001b[0m\n\u001b[0;32m 579\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 580\u001b[0m \"\"\"\n\u001b[1;32m--> 581\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0marray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'C'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mndmin\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 582\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 583\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mValueError\u001b[0m: could not convert string to float: 'NO'"
]
}
],
"source": [
"rf.fit(df_new, labels)"
]
},
{
"cell_type": "code",
"execution_count": 193,
"metadata": {},
"outputs": [],
"source": [
"predictions = regr.predict(dftest)"
]
},
{
"cell_type": "code",
"execution_count": 194,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.16467479, 0.04793093, -0.02251524, ..., 0.02110964,\n",
" 0.00211576, 1.0844402 ], dtype=float32)"
]
},
"execution_count": 194,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predictions"
]
},
{
"cell_type": "code",
"execution_count": 197,
"metadata": {},
"outputs": [],
"source": [
"for n in nrows:\n",
" dfsample.iloc[n,1] = [0 if dfsample.iloc[n,0] <= 0.3 else 1] "
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
" dfsample.iloc[0,1] = [0 if dfsample.iloc[0,0] <= 0. else 1] "
]
},
{
"cell_type": "code",
"execution_count": 196,
"metadata": {},
"outputs": [],
"source": [
"dfsample['is_promoted'] =1\n"
]
},
{
"cell_type": "code",
"execution_count": 247,
"metadata": {},
"outputs": [],
"source": [
"nrows=dfsample.index"
]
},
{
"cell_type": "code",
"execution_count": 248,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RangeIndex(start=0, stop=23490, step=1)"
]
},
"execution_count": 248,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nrows"
]
},
{
"cell_type": "code",
"execution_count": 236,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0.115, 0. , 0. , ..., 0. , 0. , 0.897])"
]
},
"execution_count": 236,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predictions"
]
},
{
"cell_type": "code",
"execution_count": 240,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 240,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predictions_1"
]
},
{
"cell_type": "code",
"execution_count": 195,
"metadata": {},
"outputs": [],
"source": [
"dfsample=pd.DataFrame(predictions)\n"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" is_promoted | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0.136297 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 0.047931 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" -0.022515 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" -0.022045 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0.023418 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 is_promoted\n",
"0 0.136297 0\n",
"1 0.047931 0\n",
"2 -0.022515 0\n",
"3 -0.022045 0\n",
"4 0.023418 0"
]
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfsample.head()"
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {},
"outputs": [
{
"ename": "PermissionError",
"evalue": "[Errno 13] Permission denied: 'C:\\\\Users\\\\DELL\\\\Downloads\\\\file2.csv'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mPermissionError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mdfsample\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"C:\\\\Users\\\\DELL\\\\Downloads\\\\file2.csv\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msep\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\",\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\frame.py\u001b[0m in \u001b[0;36mto_csv\u001b[1;34m(self, path_or_buf, sep, na_rep, float_format, columns, header, index, index_label, mode, encoding, compression, quoting, quotechar, line_terminator, chunksize, tupleize_cols, date_format, doublequote, escapechar, decimal)\u001b[0m\n\u001b[0;32m 1743\u001b[0m \u001b[0mdoublequote\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdoublequote\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1744\u001b[0m escapechar=escapechar, decimal=decimal)\n\u001b[1;32m-> 1745\u001b[1;33m \u001b[0mformatter\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msave\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1746\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1747\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mpath_or_buf\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\io\\formats\\csvs.py\u001b[0m in \u001b[0;36msave\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 134\u001b[0m f, handles = _get_handle(self.path_or_buf, self.mode,\n\u001b[0;32m 135\u001b[0m \u001b[0mencoding\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mencoding\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 136\u001b[1;33m compression=None)\n\u001b[0m\u001b[0;32m 137\u001b[0m \u001b[0mclose\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mTrue\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcompression\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m \u001b[1;32melse\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 138\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\io\\common.py\u001b[0m in \u001b[0;36m_get_handle\u001b[1;34m(path_or_buf, mode, encoding, compression, memory_map, is_text)\u001b[0m\n\u001b[0;32m 398\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0mencoding\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 399\u001b[0m \u001b[1;31m# Python 3 and encoding\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 400\u001b[1;33m \u001b[0mf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpath_or_buf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mencoding\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mencoding\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 401\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0mis_text\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 402\u001b[0m \u001b[1;31m# Python 3 and no explicit encoding\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mPermissionError\u001b[0m: [Errno 13] Permission denied: 'C:\\\\Users\\\\DELL\\\\Downloads\\\\file2.csv'"
]
}
],
"source": [
"dfsample.to_csv(\"C:\\\\Users\\\\DELL\\\\Downloads\\\\file2.csv\", sep=\",\")"
]
},
{
"cell_type": "code",
"execution_count": 146,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" department | \n",
" region | \n",
" education | \n",
" gender | \n",
" recruitment_channel | \n",
" no_of_trainings | \n",
" age | \n",
" previous_year_rating | \n",
" length_of_service | \n",
" KPIs_met >80% | \n",
" awards_won? | \n",
" avg_training_score | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 7 | \n",
" 31 | \n",
" 3.0 | \n",
" 0 | \n",
" 2 | \n",
" 1 | \n",
" 35 | \n",
" 5.0 | \n",
" 8 | \n",
" 1 | \n",
" 0 | \n",
" 49 | \n",
"
\n",
" \n",
" 1 | \n",
" 4 | \n",
" 14 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 30 | \n",
" 5.0 | \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
" 60 | \n",
"
\n",
" \n",
" 2 | \n",
" 7 | \n",
" 10 | \n",
" 2.0 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 34 | \n",
" 3.0 | \n",
" 7 | \n",
" 0 | \n",
" 0 | \n",
" 50 | \n",
"
\n",
" \n",
" 3 | \n",
" 7 | \n",
" 15 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 2 | \n",
" 39 | \n",
" 1.0 | \n",
" 10 | \n",
" 0 | \n",
" 0 | \n",
" 50 | \n",
"
\n",
" \n",
" 4 | \n",
" 8 | \n",
" 18 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 45 | \n",
" 3.0 | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 73 | \n",
"
\n",
" \n",
" 5 | \n",
" 0 | \n",
" 11 | \n",
" 2.0 | \n",
" 1 | \n",
" 2 | \n",
" 2 | \n",
" 31 | \n",
" 3.0 | \n",
" 7 | \n",
" 0 | \n",
" 0 | \n",
" 85 | \n",
"
\n",
" \n",
" 6 | \n",
" 4 | \n",
" 12 | \n",
" 2.0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 31 | \n",
" 3.0 | \n",
" 5 | \n",
" 0 | \n",
" 0 | \n",
" 59 | \n",
"
\n",
" \n",
" 7 | \n",
" 4 | \n",
" 27 | \n",
" 3.0 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 33 | \n",
" 3.0 | \n",
" 6 | \n",
" 0 | \n",
" 0 | \n",
" 63 | \n",
"
\n",
" \n",
" 8 | \n",
" 0 | \n",
" 12 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 28 | \n",
" 4.0 | \n",
" 5 | \n",
" 0 | \n",
" 0 | \n",
" 83 | \n",
"
\n",
" \n",
" 9 | \n",
" 7 | \n",
" 0 | \n",
" 3.0 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 32 | \n",
" 5.0 | \n",
" 5 | \n",
" 1 | \n",
" 0 | \n",
" 54 | \n",
"
\n",
" \n",
" 10 | \n",
" 8 | \n",
" 15 | \n",
" 3.0 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 30 | \n",
" 5.0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 77 | \n",
"
\n",
" \n",
" 11 | \n",
" 7 | \n",
" 31 | \n",
" 2.0 | \n",
" 0 | \n",
" 2 | \n",
" 1 | \n",
" 35 | \n",
" 5.0 | \n",
" 3 | \n",
" 1 | \n",
" 0 | \n",
" 50 | \n",
"
\n",
" \n",
" 12 | \n",
" 7 | \n",
" 28 | \n",
" 2.0 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 49 | \n",
" 5.0 | \n",
" 5 | \n",
" 1 | \n",
" 0 | \n",
" 49 | \n",
"
\n",
" \n",
" 13 | \n",
" 8 | \n",
" 21 | \n",
" 3.0 | \n",
" 1 | \n",
" 0 | \n",
" 2 | \n",
" 39 | \n",
" 3.0 | \n",
" 16 | \n",
" 0 | \n",
" 0 | \n",
" 80 | \n",
"
\n",
" \n",
" 14 | \n",
" 6 | \n",
" 11 | \n",
" 3.0 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 37 | \n",
" 3.0 | \n",
" 7 | \n",
" 0 | \n",
" 0 | \n",
" 84 | \n",
"
\n",
" \n",
" 15 | \n",
" 4 | \n",
" 31 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 37 | \n",
" 1.0 | \n",
" 10 | \n",
" 0 | \n",
" 0 | \n",
" 60 | \n",
"
\n",
" \n",
" 16 | \n",
" 8 | \n",
" 11 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 38 | \n",
" 3.0 | \n",
" 5 | \n",
" 1 | \n",
" 0 | \n",
" 77 | \n",
"
\n",
" \n",
" 17 | \n",
" 7 | \n",
" 24 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 34 | \n",
" 1.0 | \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
" 51 | \n",
"
\n",
" \n",
" 18 | \n",
" 7 | \n",
" 24 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 34 | \n",
" 5.0 | \n",
" 8 | \n",
" 1 | \n",
" 0 | \n",
" 46 | \n",
"
\n",
" \n",
" 19 | \n",
" 4 | \n",
" 6 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 37 | \n",
" 3.0 | \n",
" 9 | \n",
" 0 | \n",
" 0 | \n",
" 59 | \n",
"
\n",
" \n",
" 20 | \n",
" 5 | \n",
" 5 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 35 | \n",
" 3.0 | \n",
" 7 | \n",
" 0 | \n",
" 0 | \n",
" 75 | \n",
"
\n",
" \n",
" 21 | \n",
" 4 | \n",
" 6 | \n",
" 2.0 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 41 | \n",
" 4.0 | \n",
" 11 | \n",
" 0 | \n",
" 0 | \n",
" 57 | \n",
"
\n",
" \n",
" 22 | \n",
" 1 | \n",
" 2 | \n",
" 2.0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 28 | \n",
" 3.0 | \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
" 63 | \n",
"
\n",
" \n",
" 23 | \n",
" 5 | \n",
" 29 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 27 | \n",
" 3.0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 70 | \n",
"
\n",
" \n",
" 24 | \n",
" 1 | \n",
" 14 | \n",
" 3.0 | \n",
" 1 | \n",
" 2 | \n",
" 2 | \n",
" 39 | \n",
" 5.0 | \n",
" 7 | \n",
" 0 | \n",
" 0 | \n",
" 59 | \n",
"
\n",
" \n",
" 25 | \n",
" 8 | \n",
" 14 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 27 | \n",
" 5.0 | \n",
" 3 | \n",
" 1 | \n",
" 0 | \n",
" 83 | \n",
"
\n",
" \n",
" 26 | \n",
" 5 | \n",
" 18 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 31 | \n",
" 5.0 | \n",
" 4 | \n",
" 1 | \n",
" 0 | \n",
" 68 | \n",
"
\n",
" \n",
" 27 | \n",
" 8 | \n",
" 18 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 33 | \n",
" 4.0 | \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
" 79 | \n",
"
\n",
" \n",
" 28 | \n",
" 0 | \n",
" 29 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 29 | \n",
" 5.0 | \n",
" 3 | \n",
" 1 | \n",
" 0 | \n",
" 80 | \n",
"
\n",
" \n",
" 29 | \n",
" 7 | \n",
" 28 | \n",
" 2.0 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 26 | \n",
" 5.0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 44 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 54778 | \n",
" 5 | \n",
" 11 | \n",
" 2.0 | \n",
" 0 | \n",
" 2 | \n",
" 1 | \n",
" 30 | \n",
" 3.0 | \n",
" 3 | \n",
" 0 | \n",
" 0 | \n",
" 70 | \n",
"
\n",
" \n",
" 54779 | \n",
" 8 | \n",
" 23 | \n",
" 2.0 | \n",
" 0 | \n",
" 2 | \n",
" 1 | \n",
" 34 | \n",
" 3.0 | \n",
" 3 | \n",
" 0 | \n",
" 0 | \n",
" 86 | \n",
"
\n",
" \n",
" 54780 | \n",
" 4 | \n",
" 10 | \n",
" 2.0 | \n",
" 0 | \n",
" 2 | \n",
" 1 | \n",
" 39 | \n",
" 5.0 | \n",
" 7 | \n",
" 0 | \n",
" 0 | \n",
" 60 | \n",
"
\n",
" \n",
" 54781 | \n",
" 2 | \n",
" 19 | \n",
" 2.0 | \n",
" 0 | \n",
" 2 | \n",
" 1 | \n",
" 38 | \n",
" 5.0 | \n",
" 3 | \n",
" 1 | \n",
" 0 | \n",
" 50 | \n",
"
\n",
" \n",
" 54782 | \n",
" 4 | \n",
" 11 | \n",
" 1.0 | \n",
" 1 | \n",
" 2 | \n",
" 2 | \n",
" 26 | \n",
" 3.0 | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 60 | \n",
"
\n",
" \n",
" 54783 | \n",
" 5 | \n",
" 11 | \n",
" 1.0 | \n",
" 1 | \n",
" 0 | \n",
" 3 | \n",
" 23 | \n",
" 3.0 | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 71 | \n",
"
\n",
" \n",
" 54784 | \n",
" 7 | \n",
" 23 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 39 | \n",
" 5.0 | \n",
" 9 | \n",
" 0 | \n",
" 0 | \n",
" 48 | \n",
"
\n",
" \n",
" 54785 | \n",
" 8 | \n",
" 18 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 33 | \n",
" 4.0 | \n",
" 7 | \n",
" 1 | \n",
" 0 | \n",
" 77 | \n",
"
\n",
" \n",
" 54786 | \n",
" 7 | \n",
" 24 | \n",
" 2.0 | \n",
" 0 | \n",
" 2 | \n",
" 2 | \n",
" 31 | \n",
" 1.0 | \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
" 51 | \n",
"
\n",
" \n",
" 54787 | \n",
" 8 | \n",
" 19 | \n",
" 2.0 | \n",
" 1 | \n",
" 2 | \n",
" 4 | \n",
" 26 | \n",
" 3.0 | \n",
" 3 | \n",
" 0 | \n",
" 0 | \n",
" 84 | \n",
"
\n",
" \n",
" 54788 | \n",
" 7 | \n",
" 4 | \n",
" 2.0 | \n",
" 0 | \n",
" 2 | \n",
" 2 | \n",
" 28 | \n",
" 3.0 | \n",
" 3 | \n",
" 0 | \n",
" 0 | \n",
" 48 | \n",
"
\n",
" \n",
" 54789 | \n",
" 7 | \n",
" 31 | \n",
" 3.0 | \n",
" 1 | \n",
" 0 | \n",
" 3 | \n",
" 31 | \n",
" 3.0 | \n",
" 9 | \n",
" 0 | \n",
" 0 | \n",
" 50 | \n",
"
\n",
" \n",
" 54790 | \n",
" 2 | \n",
" 11 | \n",
" 2.0 | \n",
" 0 | \n",
" 2 | \n",
" 1 | \n",
" 48 | \n",
" 5.0 | \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
" 49 | \n",
"
\n",
" \n",
" 54791 | \n",
" 8 | \n",
" 2 | \n",
" 2.0 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 32 | \n",
" 1.0 | \n",
" 3 | \n",
" 1 | \n",
" 0 | \n",
" 82 | \n",
"
\n",
" \n",
" 54792 | \n",
" 7 | \n",
" 5 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 59 | \n",
" 3.0 | \n",
" 11 | \n",
" 0 | \n",
" 0 | \n",
" 65 | \n",
"
\n",
" \n",
" 54793 | \n",
" 8 | \n",
" 11 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 33 | \n",
" 1.0 | \n",
" 9 | \n",
" 0 | \n",
" 0 | \n",
" 79 | \n",
"
\n",
" \n",
" 54794 | \n",
" 4 | \n",
" 7 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 37 | \n",
" 4.0 | \n",
" 7 | \n",
" 1 | \n",
" 0 | \n",
" 54 | \n",
"
\n",
" \n",
" 54795 | \n",
" 5 | \n",
" 6 | \n",
" 3.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 35 | \n",
" 3.0 | \n",
" 7 | \n",
" 0 | \n",
" 0 | \n",
" 70 | \n",
"
\n",
" \n",
" 54796 | \n",
" 7 | \n",
" 17 | \n",
" 3.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 34 | \n",
" 3.0 | \n",
" 7 | \n",
" 0 | \n",
" 0 | \n",
" 60 | \n",
"
\n",
" \n",
" 54797 | \n",
" 7 | \n",
" 2 | \n",
" 2.0 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 28 | \n",
" 5.0 | \n",
" 3 | \n",
" 1 | \n",
" 1 | \n",
" 47 | \n",
"
\n",
" \n",
" 54798 | \n",
" 7 | \n",
" 11 | \n",
" 3.0 | \n",
" 0 | \n",
" 0 | \n",
" 2 | \n",
" 40 | \n",
" 5.0 | \n",
" 4 | \n",
" 1 | \n",
" 0 | \n",
" 51 | \n",
"
\n",
" \n",
" 54799 | \n",
" 5 | \n",
" 11 | \n",
" 3.0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 50 | \n",
" 5.0 | \n",
" 6 | \n",
" 1 | \n",
" 1 | \n",
" 67 | \n",
"
\n",
" \n",
" 54800 | \n",
" 2 | \n",
" 2 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 2 | \n",
" 34 | \n",
" 5.0 | \n",
" 3 | \n",
" 0 | \n",
" 0 | \n",
" 52 | \n",
"
\n",
" \n",
" 54801 | \n",
" 8 | \n",
" 18 | \n",
" 2.0 | \n",
" 0 | \n",
" 2 | \n",
" 1 | \n",
" 31 | \n",
" 5.0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 78 | \n",
"
\n",
" \n",
" 54802 | \n",
" 7 | \n",
" 5 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 2 | \n",
" 31 | \n",
" 1.0 | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 49 | \n",
"
\n",
" \n",
" 54803 | \n",
" 8 | \n",
" 5 | \n",
" 2.0 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 48 | \n",
" 3.0 | \n",
" 17 | \n",
" 0 | \n",
" 0 | \n",
" 78 | \n",
"
\n",
" \n",
" 54804 | \n",
" 4 | \n",
" 19 | \n",
" 3.0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 37 | \n",
" 2.0 | \n",
" 6 | \n",
" 0 | \n",
" 0 | \n",
" 56 | \n",
"
\n",
" \n",
" 54805 | \n",
" 0 | \n",
" 0 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 27 | \n",
" 5.0 | \n",
" 3 | \n",
" 1 | \n",
" 0 | \n",
" 79 | \n",
"
\n",
" \n",
" 54806 | \n",
" 7 | \n",
" 33 | \n",
" 2.0 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 29 | \n",
" 1.0 | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 45 | \n",
"
\n",
" \n",
" 54807 | \n",
" 2 | \n",
" 14 | \n",
" 2.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 27 | \n",
" 1.0 | \n",
" 5 | \n",
" 0 | \n",
" 0 | \n",
" 49 | \n",
"
\n",
" \n",
"
\n",
"
54808 rows × 12 columns
\n",
"
"
],
"text/plain": [
" department region education gender recruitment_channel \\\n",
"0 7 31 3.0 0 2 \n",
"1 4 14 2.0 1 0 \n",
"2 7 10 2.0 1 2 \n",
"3 7 15 2.0 1 0 \n",
"4 8 18 2.0 1 0 \n",
"5 0 11 2.0 1 2 \n",
"6 4 12 2.0 0 0 \n",
"7 4 27 3.0 1 2 \n",
"8 0 12 2.0 1 0 \n",
"9 7 0 3.0 1 2 \n",
"10 8 15 3.0 1 2 \n",
"11 7 31 2.0 0 2 \n",
"12 7 28 2.0 1 2 \n",
"13 8 21 3.0 1 0 \n",
"14 6 11 3.0 1 2 \n",
"15 4 31 2.0 1 0 \n",
"16 8 11 2.0 1 0 \n",
"17 7 24 2.0 1 0 \n",
"18 7 24 2.0 1 0 \n",
"19 4 6 2.0 1 0 \n",
"20 5 5 2.0 1 0 \n",
"21 4 6 2.0 1 2 \n",
"22 1 2 2.0 0 0 \n",
"23 5 29 2.0 1 0 \n",
"24 1 14 3.0 1 2 \n",
"25 8 14 2.0 1 0 \n",
"26 5 18 2.0 1 0 \n",
"27 8 18 2.0 1 0 \n",
"28 0 29 2.0 1 0 \n",
"29 7 28 2.0 1 2 \n",
"... ... ... ... ... ... \n",
"54778 5 11 2.0 0 2 \n",
"54779 8 23 2.0 0 2 \n",
"54780 4 10 2.0 0 2 \n",
"54781 2 19 2.0 0 2 \n",
"54782 4 11 1.0 1 2 \n",
"54783 5 11 1.0 1 0 \n",
"54784 7 23 2.0 1 0 \n",
"54785 8 18 2.0 1 0 \n",
"54786 7 24 2.0 0 2 \n",
"54787 8 19 2.0 1 2 \n",
"54788 7 4 2.0 0 2 \n",
"54789 7 31 3.0 1 0 \n",
"54790 2 11 2.0 0 2 \n",
"54791 8 2 2.0 1 2 \n",
"54792 7 5 2.0 1 0 \n",
"54793 8 11 2.0 1 0 \n",
"54794 4 7 2.0 1 0 \n",
"54795 5 6 3.0 1 0 \n",
"54796 7 17 3.0 1 0 \n",
"54797 7 2 2.0 1 2 \n",
"54798 7 11 3.0 0 0 \n",
"54799 5 11 3.0 0 0 \n",
"54800 2 2 2.0 1 0 \n",
"54801 8 18 2.0 0 2 \n",
"54802 7 5 2.0 1 0 \n",
"54803 8 5 2.0 1 2 \n",
"54804 4 19 3.0 0 0 \n",
"54805 0 0 2.0 1 0 \n",
"54806 7 33 2.0 1 2 \n",
"54807 2 14 2.0 1 0 \n",
"\n",
" no_of_trainings age previous_year_rating length_of_service \\\n",
"0 1 35 5.0 8 \n",
"1 1 30 5.0 4 \n",
"2 1 34 3.0 7 \n",
"3 2 39 1.0 10 \n",
"4 1 45 3.0 2 \n",
"5 2 31 3.0 7 \n",
"6 1 31 3.0 5 \n",
"7 1 33 3.0 6 \n",
"8 1 28 4.0 5 \n",
"9 1 32 5.0 5 \n",
"10 1 30 5.0 1 \n",
"11 1 35 5.0 3 \n",
"12 1 49 5.0 5 \n",
"13 2 39 3.0 16 \n",
"14 1 37 3.0 7 \n",
"15 1 37 1.0 10 \n",
"16 1 38 3.0 5 \n",
"17 1 34 1.0 4 \n",
"18 1 34 5.0 8 \n",
"19 1 37 3.0 9 \n",
"20 1 35 3.0 7 \n",
"21 1 41 4.0 11 \n",
"22 1 28 3.0 4 \n",
"23 1 27 3.0 1 \n",
"24 2 39 5.0 7 \n",
"25 1 27 5.0 3 \n",
"26 1 31 5.0 4 \n",
"27 1 33 4.0 4 \n",
"28 1 29 5.0 3 \n",
"29 1 26 5.0 1 \n",
"... ... ... ... ... \n",
"54778 1 30 3.0 3 \n",
"54779 1 34 3.0 3 \n",
"54780 1 39 5.0 7 \n",
"54781 1 38 5.0 3 \n",
"54782 2 26 3.0 2 \n",
"54783 3 23 3.0 2 \n",
"54784 1 39 5.0 9 \n",
"54785 1 33 4.0 7 \n",
"54786 2 31 1.0 4 \n",
"54787 4 26 3.0 3 \n",
"54788 2 28 3.0 3 \n",
"54789 3 31 3.0 9 \n",
"54790 1 48 5.0 4 \n",
"54791 1 32 1.0 3 \n",
"54792 1 59 3.0 11 \n",
"54793 1 33 1.0 9 \n",
"54794 1 37 4.0 7 \n",
"54795 1 35 3.0 7 \n",
"54796 1 34 3.0 7 \n",
"54797 1 28 5.0 3 \n",
"54798 2 40 5.0 4 \n",
"54799 1 50 5.0 6 \n",
"54800 2 34 5.0 3 \n",
"54801 1 31 5.0 1 \n",
"54802 2 31 1.0 2 \n",
"54803 1 48 3.0 17 \n",
"54804 1 37 2.0 6 \n",
"54805 1 27 5.0 3 \n",
"54806 1 29 1.0 2 \n",
"54807 1 27 1.0 5 \n",
"\n",
" KPIs_met >80% awards_won? avg_training_score \n",
"0 1 0 49 \n",
"1 0 0 60 \n",
"2 0 0 50 \n",
"3 0 0 50 \n",
"4 0 0 73 \n",
"5 0 0 85 \n",
"6 0 0 59 \n",
"7 0 0 63 \n",
"8 0 0 83 \n",
"9 1 0 54 \n",
"10 0 0 77 \n",
"11 1 0 50 \n",
"12 1 0 49 \n",
"13 0 0 80 \n",
"14 0 0 84 \n",
"15 0 0 60 \n",
"16 1 0 77 \n",
"17 0 0 51 \n",
"18 1 0 46 \n",
"19 0 0 59 \n",
"20 0 0 75 \n",
"21 0 0 57 \n",
"22 0 0 63 \n",
"23 0 0 70 \n",
"24 0 0 59 \n",
"25 1 0 83 \n",
"26 1 0 68 \n",
"27 0 0 79 \n",
"28 1 0 80 \n",
"29 0 0 44 \n",
"... ... ... ... \n",
"54778 0 0 70 \n",
"54779 0 0 86 \n",
"54780 0 0 60 \n",
"54781 1 0 50 \n",
"54782 0 0 60 \n",
"54783 0 0 71 \n",
"54784 0 0 48 \n",
"54785 1 0 77 \n",
"54786 0 0 51 \n",
"54787 0 0 84 \n",
"54788 0 0 48 \n",
"54789 0 0 50 \n",
"54790 0 0 49 \n",
"54791 1 0 82 \n",
"54792 0 0 65 \n",
"54793 0 0 79 \n",
"54794 1 0 54 \n",
"54795 0 0 70 \n",
"54796 0 0 60 \n",
"54797 1 1 47 \n",
"54798 1 0 51 \n",
"54799 1 1 67 \n",
"54800 0 0 52 \n",
"54801 0 0 78 \n",
"54802 0 0 49 \n",
"54803 0 0 78 \n",
"54804 0 0 56 \n",
"54805 1 0 79 \n",
"54806 0 0 45 \n",
"54807 0 0 49 \n",
"\n",
"[54808 rows x 12 columns]"
]
},
"execution_count": 146,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_new"
]
},
{
"cell_type": "code",
"execution_count": 147,
"metadata": {},
"outputs": [],
"source": [
"df_new=df_new.drop('region', axis = 1)"
]
},
{
"cell_type": "code",
"execution_count": 261,
"metadata": {},
"outputs": [],
"source": [
"dftest=dftest.drop('region',axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [],
"source": [
"df_file=pd.read_csv(\"C:\\\\Users\\\\DELL\\\\Downloads\\\\file2.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"640 62916\n",
"1130 72095\n",
"1157 17135\n",
"1330 18640\n",
"1486 14136\n",
"3062 38555\n",
"3979 42881\n",
"4121 69522\n",
"5917 34892\n",
"6005 39801\n",
"6195 62166\n",
"7178 3502\n",
"9089 2176\n",
"9239 25359\n",
"9854 50642\n",
"10039 5255\n",
"10149 42492\n",
"10997 2465\n",
"11089 31241\n",
"11266 39673\n",
"12761 12401\n",
"14128 31436\n",
"14162 71746\n",
"14757 39706\n",
"15224 45996\n",
"16410 72553\n",
"16416 63080\n",
"16842 61444\n",
"16876 31194\n",
"17933 20262\n",
"18416 61704\n",
"21491 31396\n",
"Name: employee_id, dtype: int64"
]
},
"execution_count": 120,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"employe1=dftest[((dftest[\"awards_won?\"] ==1) & (dftest.avg_training_score>=91)&(dftest.avg_training_score<95))].employee_id\n",
"employe1"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"262 70734\n",
"293 62212\n",
"350 39224\n",
"925 66787\n",
"1574 14583\n",
"1575 969\n",
"1874 6694\n",
"1881 72083\n",
"1904 27916\n",
"2083 41835\n",
"2120 5338\n",
"2181 37900\n",
"2324 56535\n",
"3005 49203\n",
"3194 61090\n",
"3210 53816\n",
"3384 48968\n",
"3408 61292\n",
"3506 44692\n",
"3616 51040\n",
"3674 23944\n",
"3847 23629\n",
"3954 17063\n",
"3973 29021\n",
"4050 30927\n",
"4301 77489\n",
"4450 57419\n",
"4480 63689\n",
"4583 41351\n",
"4727 52013\n",
" ... \n",
"15766 74737\n",
"15862 60234\n",
"15931 29007\n",
"16231 48092\n",
"16335 71122\n",
"16361 5101\n",
"16688 23977\n",
"17081 36346\n",
"17340 30703\n",
"17787 9297\n",
"17983 16784\n",
"18327 12333\n",
"18857 49832\n",
"19404 44255\n",
"19422 48585\n",
"19549 40230\n",
"19899 51810\n",
"20608 39679\n",
"20930 38754\n",
"21401 72802\n",
"21421 285\n",
"21463 24251\n",
"22109 68246\n",
"22148 40960\n",
"22344 40720\n",
"22470 1214\n",
"22522 65032\n",
"22677 64586\n",
"22974 28659\n",
"23188 35139\n",
"Name: employee_id, Length: 99, dtype: int64"
]
},
"execution_count": 121,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"employe2=dftest[(dftest.avg_training_score>=95)].employee_id\n",
"employe2"
]
},
{
"cell_type": "code",
"execution_count": 141,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 70734\n",
"1 62212\n",
"2 39224\n",
"3 62916\n",
"4 72095\n",
"5 17135\n",
"6 18640\n",
"7 14136\n",
"8 969\n",
"9 6694\n",
"10 41835\n",
"11 56535\n",
"12 49203\n",
"13 38555\n",
"14 61090\n",
"15 61292\n",
"16 44692\n",
"17 23944\n",
"18 17063\n",
"19 29021\n",
"20 42881\n",
"21 69522\n",
"22 63689\n",
"23 75993\n",
"24 34892\n",
"25 39801\n",
"26 62166\n",
"27 3502\n",
"28 2176\n",
"29 25359\n",
" ... \n",
"35 31241\n",
"36 39673\n",
"37 56902\n",
"38 17769\n",
"39 9775\n",
"40 12401\n",
"41 19558\n",
"42 31436\n",
"43 71746\n",
"44 39910\n",
"45 76959\n",
"46 52727\n",
"47 39706\n",
"48 4132\n",
"49 45996\n",
"50 74737\n",
"51 60234\n",
"52 71122\n",
"53 72553\n",
"54 63080\n",
"55 61444\n",
"56 31194\n",
"57 30703\n",
"58 20262\n",
"59 61704\n",
"60 49832\n",
"61 51810\n",
"62 285\n",
"63 31396\n",
"64 35139\n",
"Name: employee_id, Length: 65, dtype: int32"
]
},
"execution_count": 141,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"emp=df_file[(df_file['employee_id'].isin(employe1))|(df_file['employee_id'].isin(employe2))].employee_id.astype(int)\n",
"emp"
]
},
{
"cell_type": "code",
"execution_count": 143,
"metadata": {},
"outputs": [],
"source": [
"df_file.loc[df_file[\"employee_id\"].isin(emp),\"is_promoted\"]=1"
]
},
{
"cell_type": "code",
"execution_count": 234,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"262 1\n",
"293 1\n",
"350 1\n",
"640 1\n",
"1130 1\n",
"1157 1\n",
"1330 1\n",
"1486 1\n",
"1575 1\n",
"1874 1\n",
"2083 1\n",
"2324 1\n",
"3005 1\n",
"3062 1\n",
"3194 1\n",
"3408 1\n",
"3506 1\n",
"3674 1\n",
"3954 1\n",
"3973 1\n",
"3979 1\n",
"4121 1\n",
"4480 1\n",
"5091 1\n",
"5917 1\n",
"6005 1\n",
"6195 1\n",
"7178 1\n",
"9089 1\n",
"9239 1\n",
" ..\n",
"11089 1\n",
"11266 1\n",
"11335 1\n",
"11990 1\n",
"12526 1\n",
"12761 1\n",
"13989 1\n",
"14128 1\n",
"14162 1\n",
"14201 1\n",
"14558 1\n",
"14749 1\n",
"14757 1\n",
"14965 1\n",
"15224 1\n",
"15766 1\n",
"15862 1\n",
"16335 1\n",
"16410 1\n",
"16416 1\n",
"16842 1\n",
"16876 1\n",
"17340 1\n",
"17933 1\n",
"18416 1\n",
"18857 1\n",
"19899 1\n",
"21421 1\n",
"21491 1\n",
"23188 1\n",
"Name: is_promoted, Length: 65, dtype: int64"
]
},
"execution_count": 234,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfsample.loc[dfsample[\"employee_id\"].isin(emp),\"is_promoted\"]"
]
},
{
"cell_type": "code",
"execution_count": 149,
"metadata": {},
"outputs": [],
"source": [
"df_file.to_csv(\"C:\\\\Users\\\\DELL\\\\Downloads\\\\file3.csv\", sep=\",\")"
]
},
{
"cell_type": "code",
"execution_count": 200,
"metadata": {},
"outputs": [],
"source": [
"dfsample[\"employee_id\"]=empid"
]
},
{
"cell_type": "code",
"execution_count": 204,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" is_promoted | \n",
" employee_id | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 8724 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 74430 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 72255 | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 38562 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 64486 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" is_promoted employee_id\n",
"0 0 8724\n",
"1 0 74430\n",
"2 0 72255\n",
"3 0 38562\n",
"4 0 64486"
]
},
"execution_count": 204,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfsample.head()"
]
},
{
"cell_type": "code",
"execution_count": 203,
"metadata": {},
"outputs": [],
"source": [
"dfsample=dfsample.drop(0,axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 207,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"262 70734\n",
"293 62212\n",
"350 39224\n",
"925 66787\n",
"1574 14583\n",
"1575 969\n",
"1874 6694\n",
"1881 72083\n",
"1904 27916\n",
"2083 41835\n",
"2120 5338\n",
"2181 37900\n",
"2324 56535\n",
"3005 49203\n",
"3194 61090\n",
"3210 53816\n",
"3384 48968\n",
"3408 61292\n",
"3506 44692\n",
"3616 51040\n",
"3674 23944\n",
"3847 23629\n",
"3954 17063\n",
"3973 29021\n",
"4050 30927\n",
"4301 77489\n",
"4450 57419\n",
"4480 63689\n",
"4583 41351\n",
"4727 52013\n",
" ... \n",
"15766 74737\n",
"15862 60234\n",
"15931 29007\n",
"16231 48092\n",
"16335 71122\n",
"16361 5101\n",
"16688 23977\n",
"17081 36346\n",
"17340 30703\n",
"17787 9297\n",
"17983 16784\n",
"18327 12333\n",
"18857 49832\n",
"19404 44255\n",
"19422 48585\n",
"19549 40230\n",
"19899 51810\n",
"20608 39679\n",
"20930 38754\n",
"21401 72802\n",
"21421 285\n",
"21463 24251\n",
"22109 68246\n",
"22148 40960\n",
"22344 40720\n",
"22470 1214\n",
"22522 65032\n",
"22677 64586\n",
"22974 28659\n",
"23188 35139\n",
"Name: employee_id, Length: 99, dtype: int64"
]
},
"execution_count": 207,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"employe2"
]
},
{
"cell_type": "code",
"execution_count": 208,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 70734\n",
"1 62212\n",
"2 39224\n",
"3 62916\n",
"4 72095\n",
"5 17135\n",
"6 18640\n",
"7 14136\n",
"8 969\n",
"9 6694\n",
"10 41835\n",
"11 56535\n",
"12 49203\n",
"13 38555\n",
"14 61090\n",
"15 61292\n",
"16 44692\n",
"17 23944\n",
"18 17063\n",
"19 29021\n",
"20 42881\n",
"21 69522\n",
"22 63689\n",
"23 75993\n",
"24 34892\n",
"25 39801\n",
"26 62166\n",
"27 3502\n",
"28 2176\n",
"29 25359\n",
" ... \n",
"35 31241\n",
"36 39673\n",
"37 56902\n",
"38 17769\n",
"39 9775\n",
"40 12401\n",
"41 19558\n",
"42 31436\n",
"43 71746\n",
"44 39910\n",
"45 76959\n",
"46 52727\n",
"47 39706\n",
"48 4132\n",
"49 45996\n",
"50 74737\n",
"51 60234\n",
"52 71122\n",
"53 72553\n",
"54 63080\n",
"55 61444\n",
"56 31194\n",
"57 30703\n",
"58 20262\n",
"59 61704\n",
"60 49832\n",
"61 51810\n",
"62 285\n",
"63 31396\n",
"64 35139\n",
"Name: employee_id, Length: 65, dtype: int32"
]
},
"execution_count": 208,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"emp"
]
},
{
"cell_type": "code",
"execution_count": 220,
"metadata": {},
"outputs": [],
"source": [
"for n in nrows:\n",
" if dfsample.iloc[n,1] in emp:\n",
" dfsample.iloc[n,0]=1"
]
},
{
"cell_type": "code",
"execution_count": 215,
"metadata": {},
"outputs": [],
"source": [
"nrows=dfsample.index"
]
},
{
"cell_type": "code",
"execution_count": 217,
"metadata": {},
"outputs": [],
"source": [
"n=0"
]
},
{
"cell_type": "code",
"execution_count": 233,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" is_promoted | \n",
" employee_id | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 8724 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 74430 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 72255 | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 38562 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 64486 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" is_promoted employee_id\n",
"0 0 8724\n",
"1 0 74430\n",
"2 0 72255\n",
"3 0 38562\n",
"4 0 64486"
]
},
"execution_count": 233,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfsample.head()"
]
},
{
"cell_type": "code",
"execution_count": 230,
"metadata": {},
"outputs": [],
"source": [
"for n in nrows:\n",
" if dfsample.iloc[n,1] in emp:\n",
" dfsample.iloc[n,0]=1\n",
" else:\n",
" dfsample.iloc[n,0]=dfsample.iloc[n,0]\n",
" \n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 235,
"metadata": {},
"outputs": [],
"source": [
"dfsample.iloc[0,0]=0"
]
},
{
"cell_type": "code",
"execution_count": 236,
"metadata": {},
"outputs": [],
"source": [
"dfsample.to_csv(\"C:\\\\Users\\\\DELL\\\\Downloads\\\\file4.csv\", sep=\",\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}