{
"cells": [
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import argparse\n",
"import os\n",
"import numpy as np\n",
"from anago.utils import load_data_and_labels, load_glove, filter_embeddings\n",
"import json"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Reading the dataset and separating out each sentence and its tags into different lists"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# data_train=pd.read_csv('train.csv').replace(np.nan, ' ', regex=True)\n",
"# data_test=pd.read_csv('test.csv').replace(np.nan, ' ', regex=True)\n",
"# list_Word=[]\n",
"# list_tag =[]\n",
"# for i in data_train.Sent_ID.unique():\n",
"# list_Word.append(list(data_train[data_train.Sent_ID==i]['Word']))\n",
"# list_tag.append(list(data_train[data_train.Sent_ID==i]['tag']))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Saving these lists as JSON so the data can be reloaded quickly"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# with open('list_Word.json','w') as f:\n",
"# json.dump(list_Word,f)\n",
"# with open('list_tag.json','w') as f:\n",
"# json.dump(list_tag,f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Loading data from the saved JSON files"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Reload the cached sentences and their tag sequences from the JSON files\n",
"with open('list_Word.json', 'r') as words_file:\n",
"    list_Word = json.load(words_file)\n",
"\n",
"with open('list_tag.json', 'r') as tags_file:\n",
"    list_tag = json.load(tags_file)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Many sentences contain only the \"other\" tag; remove them."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Drop sentences whose tags are all identical (a single distinct tag value)\n",
"filter_list_Word = []\n",
"filter_list_tag = []\n",
"for idx, words in enumerate(list_Word):\n",
"    tags = list_tag[idx]\n",
"    if len(set(tags)) == 1:\n",
"        continue\n",
"    filter_list_Word.append(words)\n",
"    filter_list_tag.append(tags)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(191282, 39804)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(list_tag),len(filter_list_tag)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Replace empty-string tokens with a single space in the filtered sentences\n",
"for sentence in filter_list_Word:\n",
"    for position, token in enumerate(sentence):\n",
"        if token == '':\n",
"            # Write back into the sentence being scanned. Indexing list_Word with a\n",
"            # position taken from filter_list_Word would modify an unrelated sentence.\n",
"            sentence[position] = \" \""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Splitting data into train and test"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Hold out 20% of the filtered sentences for evaluation, with a fixed random_state\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    filter_list_Word,\n",
"    filter_list_tag,\n",
"    test_size=0.20,\n",
"    random_state=42,\n",
")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Now training the anago sequence model with its default parameters\n",
"['word_embedding_dim=100', 'char_embedding_dim=25', 'word_lstm_size=100', 'char_lstm_size=25', 'fc_dim=100', 'dropout=0.5', 'embeddings=None', 'use_char=True', 'use_crf=True', 'initial_vocab=None', \"optimizer='adam'\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/15\n",
"996/996 [==============================] - 448s 450ms/step - loss: 3.9476\n",
"Epoch 2/15\n",
"996/996 [==============================] - 441s 443ms/step - loss: 3.8395\n",
"Epoch 3/15\n",
"237/996 [======>.......................] - ETA: 5:32 - loss: 3.7177"
]
}
],
"source": [
"import anago\n",
"model = anago.Sequence()\n",
"model.fit(X_train, y_train, epochs=15)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.7909443970825045"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# calculating score on test data\n",
"model.score(X_test,y_test)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# saving model\n",
"model.save('ner_model','weight_file','params_file')"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# loading model\n",
"model_two=anago.Sequence()\n",
"model_two=model_two.load('ner_model','weight_file','params_file')"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.7909443970825045"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# measuring score from loaded model\n",
"model_two.score(X_test,y_test)"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [],
"source": [
"# function to predict from model\n",
"def predict(model, X):\n",
"    \"\"\"Return the decoded predictions for every sentence in X.\n",
"\n",
"    model -- object exposing `model.p` (with transform / inverse_transform) and\n",
"             `model.model.predict`; presumably an anago Sequence (TODO confirm).\n",
"    X     -- sized collection of sentences; len() is taken of each sentence.\n",
"\n",
"    Returns whatever `model.p.inverse_transform` produces for the raw model\n",
"    output and the per-sentence lengths.\n",
"    \"\"\"\n",
"    # Materialise the lengths as a list: a bare map() is a one-shot iterator in\n",
"    # Python 3 and would be exhausted after a single pass.\n",
"    lengths = [len(sentence) for sentence in X]\n",
"    encoded = model.p.transform(X)\n",
"    scores = model.model.predict(encoded)\n",
"    return model.p.inverse_transform(scores, lengths)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"y_predict=predict(model,X_test)"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(7961, 7961)"
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(y_predict),len(X_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Now using the test data for prediction"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"data_test=pd.read_csv('test.csv').replace(np.nan, ' ', regex=True)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>id</th>\n",
"      <th>Doc_ID</th>\n",
"      <th>Sent_ID</th>\n",
"      <th>Word</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>4543834</td>\n",
"      <td>30001</td>\n",
"      <td>191283</td>\n",
"      <td>CCCVA</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>1</th>\n",
"      <td>4543835</td>\n",
"      <td>30001</td>\n",
"      <td>191283</td>\n",
"      <td>,</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>2</th>\n",
"      <td>4543836</td>\n",
"      <td>30001</td>\n",
"      <td>191283</td>\n",
"      <td>MANOVA</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>3</th>\n",
"      <td>4543837</td>\n",
"      <td>30001</td>\n",
"      <td>191283</td>\n",
"      <td>,</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>4</th>\n",
"      <td>4543838</td>\n",
"      <td>30001</td>\n",
"      <td>191283</td>\n",
"      <td>my</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id Doc_ID Sent_ID Word\n",
"0 4543834 30001 191283 CCCVA\n",
"1 4543835 30001 191283 ,\n",
"2 4543836 30001 191283 MANOVA\n",
"3 4543837 30001 191283 ,\n",
"4 4543838 30001 191283 my"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_test.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Separating out sentences"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"# Collect the words of each sentence in a single pass over the frame.\n",
"# sort=False keeps sentences in order of first appearance, matching\n",
"# data_test.Sent_ID.unique(); the previous loop rescanned the whole frame\n",
"# once per sentence id.\n",
"list_test_Word = (\n",
"    data_test.groupby('Sent_ID', sort=False)['Word']\n",
"    .apply(list)\n",
"    .tolist()\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Predicting 20000 sentences at a time to avoid memory error"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1\n",
"2\n",
"3\n"
]
}
],
"source": [
"import itertools\n",
"\n",
"# Predict in chunks of n sentences and flatten the per-sentence results into one list\n",
"y_outputs = []\n",
"n = 20000\n",
"for count, start in enumerate(range(0, len(list_test_Word), n), 1):\n",
"    print(count)\n",
"    chunk_tags = predict(model, list_test_Word[start:start + n])\n",
"    y_outputs.extend(itertools.chain.from_iterable(chunk_tags))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(list_test_Word)//20000"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data_test.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(list_test_Word[0:10000])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data_test['tag']= np.array(y_outputs).reshape(-1,1)"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [],
"source": [
"df=data_test.drop(['Doc_ID','Word'],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [],
"source": [
"df.to_csv('output.csv',index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print('done')"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>id</th>\n",
"      <th>Sent_ID</th>\n",
"      <th>tag</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>4543834</td>\n",
"      <td>191283</td>\n",
"      <td>B-indications</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>1</th>\n",
"      <td>4543835</td>\n",
"      <td>191283</td>\n",
"      <td>I-indications</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>2</th>\n",
"      <td>4543836</td>\n",
"      <td>191283</td>\n",
"      <td>I-indications</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>3</th>\n",
"      <td>4543837</td>\n",
"      <td>191283</td>\n",
"      <td>O</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>4</th>\n",
"      <td>4543838</td>\n",
"      <td>191283</td>\n",
"      <td>O</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id Sent_ID tag\n",
"0 4543834 191283 B-indications\n",
"1 4543835 191283 I-indications\n",
"2 4543836 191283 I-indications\n",
"3 4543837 191283 O\n",
"4 4543838 191283 O"
]
},
"execution_count": 119,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (spell2)",
"language": "python",
"name": "spell"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}