import urllib2
from BeautifulSoup import BeautifulSoup
import time
start = time.time()

import tkFileDialog
from html.parser import HTMLParser
import pandas as pd

# Location of the labelled training data.
train_path_string = r'C:/Users/ROHIT/Desktop/Graphs/Analytics_Vidhya/train/train.csv'
train_df = pd.read_csv(train_path_string)

# NOTE: this is a second name for the same frame, not a copy.
feature_df = train_df

# Integer code assigned to each category name found in the Tag column.
tags = dict(
    clinicalTrials=0,
    conferences=1,
    forum=2,
    guidelines=3,
    news=4,
    others=5,
    profile=6,
    publication=7,
    thesis=8,
)

# Replace the textual tags with their integer codes; an unknown tag raises
# KeyError.
train_df['Tag'] = [tags[item] for item in train_df['Tag']]
def _keyword_flags(texts, keyword):
    """Return a list holding 1 where ``keyword`` occurs in a text, else 0."""
    return [int(keyword in text) for text in texts]


def buildFeatures(df):
    """Append keyword and length features derived from ``Domain`` and ``Url``.

    The frame is modified in place and the very same object is returned.
    Ten columns are added, in this order: ``isOrg``, ``isArticle``,
    ``isPDF``, ``isBLOG``, ``isNEWS``, ``hasGuide``, ``isEDU``,
    ``isClinical``, ``how_long`` and ``isForum``.

    Domain keywords are matched against the lower-cased domain. URL keywords
    are matched against the URL exactly as stored, i.e. case-sensitively.
    ``how_long`` is the character length of the URL.

    Args:
        df: pandas DataFrame with string columns ``Domain`` and ``Url``.

    Returns:
        ``df`` itself, with the feature columns added.

    Raises:
        AttributeError: if a ``Domain`` cell has no ``lower`` method
            (e.g. a NaN read from an empty CSV field).
        TypeError: if a ``Url`` cell does not support ``in`` / ``len``
            (e.g. NaN).
    """
    # Read each column once; iterrows() would build a whole Series per row
    # just to pick out these two values.
    domains = [domain.lower() for domain in df['Domain']]
    urls = list(df['Url'])

    # Every feature is computed before ``df`` is touched, so a bad cell
    # raises without leaving the frame half-updated.
    new_columns = [
        ('isOrg', _keyword_flags(domains, ".org")),
        ('isArticle', _keyword_flags(urls, "article")),
        ('isPDF', _keyword_flags(urls, "pdf")),
        ('isBLOG', _keyword_flags(domains, "blog")),
        ('isNEWS', _keyword_flags(urls, "news")),
        ('hasGuide', _keyword_flags(urls, "guide")),
        ('isEDU', _keyword_flags(domains, ".edu")),
        ('isClinical', _keyword_flags(domains, "clinical")),
        ('how_long', [len(url) for url in urls]),
        ('isForum', _keyword_flags(urls, "forum")),
    ]
    for column_name, values in new_columns:
        df[column_name] = values
    return df

# Add the engineered columns to the training frame. buildFeatures returns
# the frame it was given, so feature_df and train_df remain the same object.
feature_df = buildFeatures(train_df)

# The identifier and raw text columns are not model inputs.
# `inplace` has to be a real bool: pandas validates it and raises ValueError
# for the integer 1 on current versions.
feature_df.drop(['Webpage_id', 'Domain', 'Url'], axis=1, inplace=True)

# Single-argument parenthesised print behaves the same on Python 2 and 3.
print(feature_df.columns)

X_train = feature_df[['isOrg', 'isArticle', 'isPDF', 'isBLOG', 'isNEWS', 'hasGuide', 'isEDU', 'isClinical', 'how_long', 'isForum']]  # Features
y_train = feature_df['Tag']  # Labels


from sklearn.ensemble import RandomForestClassifier

# Build a 200-tree random forest and train it on the training features and
# labels. fit() returns the estimator itself, so clf is the fitted model.
clf = RandomForestClassifier(n_estimators=200).fit(X_train, y_train)

# Names of the engineered columns the classifier was trained on, in the
# order used for the training matrix.
FEATURE_COLUMNS = ['isOrg', 'isArticle', 'isPDF', 'isBLOG', 'isNEWS', 'hasGuide', 'isEDU', 'isClinical', 'how_long', 'isForum']

test_path_string = r'C:/Users/ROHIT/Desktop/Graphs/Analytics_Vidhya/test_nvPHrOx.csv'
test_df = pd.read_csv(test_path_string)

# buildFeatures mutates and returns its argument, so from here on feature_df
# and test_df are two names for the same frame.
feature_df = buildFeatures(test_df)
print(test_df.columns)

# Remember the identifying columns before they are dropped below.
webPageIDList = test_df['Webpage_id'].tolist()
DomainList = test_df['Domain'].tolist()
UrlList = test_df['Url'].tolist()

# `inplace` has to be a real bool: pandas validates it and raises ValueError
# for the integer 1 on current versions.
feature_df.drop(['Webpage_id', 'Domain', 'Url'], axis=1, inplace=True)
print(test_df.columns)

X_test = feature_df[FEATURE_COLUMNS]  # Features
y_pred = clf.predict(X_test)

# Inverse of the training-time encoding: integer code -> category name.
tags = {0 : 'clinicalTrials',1 : 'conferences', 2 : 'forum', 3 : 'guidelines',4: 'news', 5 :'others', 6 :'profile', 7: 'publication',8: 'thesis'}
tagList = [tags[int(item)] for item in y_pred]

# Reduce the frame to the submission layout: put the page id back, attach
# the predicted tag, remove the engineered columns and index by page id.
test_df['Webpage_id'] = webPageIDList
test_df['Tag'] = tagList
test_df.drop(FEATURE_COLUMNS, axis=1, inplace=True)
test_df.set_index(['Webpage_id'], inplace=True)
test_df.to_csv('Test.csv')


# Report the wall-clock time since `start` was taken at the top of the script.
# A single pre-formatted argument prints the same text on Python 2 and 3.
print('It took %s seconds.' % (time.time() - start))