{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load the training tokens (columns: id, Doc_ID, Sent_ID, Word, tag).\n",
"df = pd.read_csv('train.csv')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" Doc_ID | \n",
" Sent_ID | \n",
" Word | \n",
" tag | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" Obesity | \n",
" O | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
" in | \n",
" O | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 1 | \n",
" 1 | \n",
" Low- | \n",
" O | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" 1 | \n",
" 1 | \n",
" and | \n",
" O | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" 1 | \n",
" 1 | \n",
" Middle-Income | \n",
" O | \n",
"
\n",
" \n",
" 5 | \n",
" 6 | \n",
" 1 | \n",
" 1 | \n",
" Countries | \n",
" O | \n",
"
\n",
" \n",
" 6 | \n",
" 7 | \n",
" 1 | \n",
" 1 | \n",
" : | \n",
" O | \n",
"
\n",
" \n",
" 7 | \n",
" 8 | \n",
" 1 | \n",
" 1 | \n",
" Burden | \n",
" O | \n",
"
\n",
" \n",
" 8 | \n",
" 9 | \n",
" 1 | \n",
" 1 | \n",
" , | \n",
" O | \n",
"
\n",
" \n",
" 9 | \n",
" 10 | \n",
" 1 | \n",
" 1 | \n",
" Drivers | \n",
" O | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id Doc_ID Sent_ID Word tag\n",
"0 1 1 1 Obesity O\n",
"1 2 1 1 in O\n",
"2 3 1 1 Low- O\n",
"3 4 1 1 and O\n",
"4 5 1 1 Middle-Income O\n",
"5 6 1 1 Countries O\n",
"6 7 1 1 : O\n",
"7 8 1 1 Burden O\n",
"8 9 1 1 , O\n",
"9 10 1 1 Drivers O"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Some tokens are read as NaN; replace them with the literal string 'none' so\n",
"# the vectorizer only ever receives text. Series.fillna does this directly,\n",
"# without reshaping to 2-D or fitting an imputer, and is safe to re-run.\n",
"df['Word'] = df['Word'].fillna('none')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
"\n",
"# Each row holds one token, so every row is its own \"document\".\n",
"# min_df=4 ignores vocabulary terms that appear in fewer than 4 rows.\n",
"vec = CountVectorizer(min_df=4)\n",
"X = vec.fit_transform(df['Word'])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<4543833x39385 sparse matrix of type ''\n",
"\twith 3796863 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X = X.tocsr()\n",
"X"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Hold out 20% of the rows for validation. A fixed random_state keeps the split,\n",
"# and therefore the reported accuracy, identical across kernel restarts.\n",
"x_train, x_test, y_train, y_test = train_test_split(\n",
"    X, df['tag'], test_size=0.2, random_state=42\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\users\\tejas shanbhag\\appdata\\local\\continuum\\anaconda3\\envs\\try\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
" FutureWarning)\n",
"c:\\users\\tejas shanbhag\\appdata\\local\\continuum\\anaconda3\\envs\\try\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:460: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
" \"this warning.\", FutureWarning)\n"
]
}
],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.svm import LinearSVC\n",
"\n",
"# Fit a logistic-regression tagger with default settings (fit returns the\n",
"# estimator itself), then label the hold-out rows.\n",
"clf = LogisticRegression().fit(x_train, y_train)\n",
"pred = clf.predict(x_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Mean accuracy of the fitted classifier `clf` on the hold-out split.\n",
"print(clf.score(x_test, y_test))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9832355268182054\n"
]
}
],
"source": [
"from sklearn.metrics import accuracy_score\n",
"\n",
"# Fraction of hold-out rows whose predicted tag equals the true tag.\n",
"score = accuracy_score(y_true=y_test, y_pred=pred)\n",
"print(score)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"df_test = pd.read_csv('test.csv')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Apply the same missing-token rule used for the training words: NaN becomes\n",
"# the literal string 'none'. A constant fill needs no fitted imputer.\n",
"df_test['Word'] = df_test['Word'].fillna('none')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"X_test = vec.transform(df_test['Word'])"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<2994463x39385 sparse matrix of type ''\n",
"\twith 2462667 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"predictions = clf.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" Sent_ID | \n",
" tag | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 4543834 | \n",
" 191283 | \n",
" O | \n",
"
\n",
" \n",
" 1 | \n",
" 4543835 | \n",
" 191283 | \n",
" O | \n",
"
\n",
" \n",
" 2 | \n",
" 4543836 | \n",
" 191283 | \n",
" O | \n",
"
\n",
" \n",
" 3 | \n",
" 4543837 | \n",
" 191283 | \n",
" O | \n",
"
\n",
" \n",
" 4 | \n",
" 4543838 | \n",
" 191283 | \n",
" O | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id Sent_ID tag\n",
"0 4543834 191283 O\n",
"1 4543835 191283 O\n",
"2 4543836 191283 O\n",
"3 4543837 191283 O\n",
"4 4543838 191283 O"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Start from the provided submission template and overwrite its tag column\n",
"# with the model's predictions (assigned positionally, row for row).\n",
"sub = pd.read_csv('sample_submission.csv')\n",
"sub['tag'] = predictions\n",
"sub.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"O 2965126\n",
"I-indications 16027\n",
"B-indications 13310\n",
"Name: tag, dtype: int64"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sub['tag'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" Sent_ID | \n",
" tag | \n",
"
\n",
" \n",
" \n",
" \n",
" 561 | \n",
" 4544395 | \n",
" 191309 | \n",
" I-indications | \n",
"
\n",
" \n",
" 566 | \n",
" 4544400 | \n",
" 191309 | \n",
" I-indications | \n",
"
\n",
" \n",
" 591 | \n",
" 4544425 | \n",
" 191310 | \n",
" I-indications | \n",
"
\n",
" \n",
" 709 | \n",
" 4544543 | \n",
" 191317 | \n",
" I-indications | \n",
"
\n",
" \n",
" 800 | \n",
" 4544634 | \n",
" 191320 | \n",
" I-indications | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id Sent_ID tag\n",
"561 4544395 191309 I-indications\n",
"566 4544400 191309 I-indications\n",
"591 4544425 191310 I-indications\n",
"709 4544543 191317 I-indications\n",
"800 4544634 191320 I-indications"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Peek at the first rows predicted as the inside of an indication span.\n",
"sub[sub['tag'] == 'I-indications'].head()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# Write the submission file without the DataFrame index column.\n",
"sub.to_csv('submission1.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}