import numpy as np
import pandas as pd
import operator
import matplotlib.pyplot as plt
from IPython.display import Image
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import preprocessing
from sklearn import neighbors, tree, naive_bayes
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from pandas.plotting import scatter_matrix
def KNN(inX, dataSet, labels, k=3, measure='Minkowski', measure_D=2):
    '''K-Nearest Neighbour classification using Minkowski distance or Cosine similarity.'''
    # distance calculation
    if measure == 'Minkowski':
        dataSetSize = dataSet.shape[0]
        diffMat = np.tile(inX, (dataSetSize, 1)) - dataSet
        # absolute value keeps the metric valid for odd values of measure_D
        powDiffMat = np.abs(diffMat) ** measure_D
        powDistances = powDiffMat.sum(axis=1)
        distances = powDistances ** (1.0 / measure_D)
    elif measure == 'Cosine':
        # vector of norms of each row
        D_norm = np.array([np.linalg.norm(dataSet[i]) for i in range(len(dataSet))])
        # norm of the query vector ||X||
        data_norm = np.linalg.norm(inX)
        # similarity, turned into a distance
        sim = np.dot(dataSet, inX) / (D_norm * data_norm)
        distances = 1 - sim
    sortedDistIndices = distances.argsort()
    # voting with the lowest distances
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndices[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
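''' A minimal sanity check of KNN on toy data (illustrative arrays of my own, not
part of the assignment data): cluster 'a' lies along the x axis and cluster 'b'
along the y axis, so both metrics should recover the obvious labels. '''
toy_X = np.array([[1.0, 0.1], [0.9, 0.0], [0.1, 1.0], [0.0, 0.9]])
toy_y = np.array(['a', 'a', 'b', 'b'])
print(KNN(np.array([2.0, 0.1]), toy_X, toy_y, k=3))                    # expect 'a'
print(KNN(np.array([0.1, 2.0]), toy_X, toy_y, k=3, measure='Cosine'))  # expect 'b'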
# min-max normalization to [0, 1] for numeric columns; constant columns yield NaN (handled below with nan_to_num)
minmax_norm = lambda x: (x - x.min()) / (x.max() - x.min()) if (x.dtypes == np.float64 or x.dtypes == np.int64) else x
Label_set = lambda x: 'hockey' if x == 1 else 'microsoft'
# training data
trainData = np.genfromtxt("trainMatrixModified.txt", delimiter='\t' , dtype=float)
trainLabel= np.genfromtxt('trainClasses.txt', delimiter='\t' , dtype=float)
# testing data
testData = np.genfromtxt("testMatrixModified.txt", delimiter='\t' , dtype=float)
testLabel = np.genfromtxt('testClasses.txt', delimiter='\t' , dtype=float )
# transposing to document-by-term matrices (the input files are term-by-document)
trainDataPD = pd.DataFrame(trainData).transpose()
testDataPD = pd.DataFrame(testData).transpose()
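# quick sanity check: rows should now be documents and columns terms
print(trainDataPD.shape, testDataPD.shape)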
# data transformation
trainDataNorm = np.asarray(trainDataPD.apply(minmax_norm))
testDataNorm = np.asarray(testDataPD.apply(minmax_norm))
# label creation
# training
trainLabels = []
for label in trainLabel:
    trainLabels.append(Label_set(label[1]))
trainLabels = np.asarray(trainLabels)
# testing
testLabels = []
for label in testLabel:
    testLabels.append(Label_set(label[1]))
testLabels = np.asarray(testLabels)
# replacing NaNs from constant columns and extracting the numeric classes
trainDataNorm = np.nan_to_num(trainDataNorm)
trainLabel = trainLabel[:, 1]
testDataNorm = np.nan_to_num(testDataNorm)
testLabel = testLabel[:, 1]
def KNN_Test(trainDataNorm, trainLabels, testDataNorm, testLabels, k=3, measure='Minkowski', measure_D=2):
    numTestVecs = len(testLabels)
    ClassError = 0.0
    for i in range(numTestVecs):
        PredClass = KNN(testDataNorm[i, :], trainDataNorm, trainLabels, k, measure, measure_D)
        if PredClass != testLabels[i]:
            ClassError += 1.0
    print('%s accuracy is: %f with %d errors' % (measure, 1 - ClassError / numTestVecs, ClassError))
    return 1 - ClassError / numTestVecs
''' Testing for a range of values of k '''
ki = np.zeros(20)
CosData = np.zeros(20)
MinData = np.zeros(20)
for k in range(1, 21):
    ki[k-1] = k
    # CosData[k-1] = KNN_Test(trainDataNorm, trainLabels, testDataNorm, testLabels, k, 'Cosine')
    CosData[k-1] = KNN_Test(trainData.T, trainLabels, testData.T, testLabels, k, 'Cosine')
    MinData[k-1] = KNN_Test(trainDataNorm, trainLabels, testDataNorm, testLabels, k, 'Minkowski', 2)
limit = max(CosData.max(),MinData.max())+.05
inflimit = min(CosData.min(),MinData.min())-.05
fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(1,1,1)
ax.set_title("Cosine vs Minkowski")
# plot against explicit k values so the x axis starts at 1, not 0
ax.plot(ki, CosData, label='Cosine')
ax.plot(ki, MinData, label='Minkowski')
ax.set_xticks(np.arange(1,21,1)) # x axis values
ax.legend(('Cosine','Minkowski'),loc='upper center', ncol=2, bbox_to_anchor=(0.5, -0.15))
ax.set_ylabel('Accuracy')
ax.set_xlabel('k')
ax.axis([1, 20, inflimit, limit])
ax.grid()
fig.savefig('knn.png')
Image(filename='knn.png')
''' TFxIDF '''
# Number of Documents
N = len(trainDataPD[0])
# total term frequency across the collection (computed for reference; not used below)
tf = trainData.sum(axis=1)
# document frequency: number of documents containing each term (as a column vector)
DF = np.array([(trainData != 0).sum(1)]).T
# idf
IDFmat = np.ones(np.shape(trainData),dtype=float) * N
np.set_printoptions(precision=2,suppress=True, linewidth=420)
IDF = np.log2(np.divide(IDFmat,DF))
#tfxidf
train_tfidf = (trainData * IDF).T
# idf for testing: tile the training idf across the test matrix's 200 document columns
IDF = np.ones(np.shape(testData), dtype=float) * IDF[:, 0:200]
# tfxidf testing
test_tfidf = (testData * IDF).T
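''' As a rough cross-check (my own addition, not part of the assignment): sklearn's
TfidfTransformer computes a related weighting. It will not match the matrix above
exactly -- sklearn uses natural log, adds 1 to the idf values, and L2-normalizes
rows unless told otherwise. '''
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_sk = TfidfTransformer(norm=None, smooth_idf=False).fit_transform(trainData.T)
print(tfidf_sk.shape)  # docs x terms, same orientation as train_tfidf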
''' testing with tfxidf'''
CosTFData = np.zeros(20)
MinTFData = np.zeros(20)
for k in range(1, 21):
    CosTFData[k-1] = KNN_Test(train_tfidf, trainLabels, test_tfidf, testLabels, k, 'Cosine')
    MinTFData[k-1] = KNN_Test(train_tfidf, trainLabels, test_tfidf, testLabels, k, 'Minkowski', 2)
# plot using tfidf data
limit = max(CosData.max(), MinData.max(), CosTFData.max(), MinTFData.max()) + .05
minlimit = min(CosData.min(), MinData.min(), CosTFData.min(), MinTFData.min()) - .05
fig = plt.figure(figsize=(10,8))
ax = fig.add_subplot(1, 1, 1)
ax.set_title("KNN comparison Min-max vs tf/idf data")
# plot against explicit k values so the x axis starts at 1, not 0
ax.plot(ki, CosData, label='Cosine (min-max)')
ax.plot(ki, MinData, label='Minkowski (min-max)')
ax.plot(ki, CosTFData, label='Cosine (tf/idf)')
ax.plot(ki, MinTFData, label='Minkowski (tf/idf)')
ax.set_xticks(np.arange(1,21,1)) # x axis values
ax.legend(('Cosine (min-max)','Minkowski (min-max)','Cosine (tf/idf)','Minkowski (tf/idf)'),
loc='upper center', ncol=2, bbox_to_anchor=(0.5, -0.15))
ax.set_ylabel('Accuracy')
ax.set_xlabel('k')
ax.axis([1, 20, minlimit, limit])
ax.grid()
fig.savefig('tfxidf.png')
Image(filename='tfxidf.png')
'''
Cosine similarity proves to be the best regardless of how the data is normalized. Within cosine, tf-idf weighting performs better than min-max normalization. '''
''' Rocchio '''
cat = ['microsoft','hockey']
''' Build two prototype vectors, one for microsoft and one for hockey; each is the sum of the tf-idf vectors of the training documents of that class. '''
mic = np.zeros(train_tfidf.shape[1])
hoc = np.zeros(train_tfidf.shape[1])
for i in range(len(trainLabel)):
    if trainLabel[i] == 0:
        mic = mic + train_tfidf[i, :]
    else:
        hoc = hoc + train_tfidf[i, :]
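''' Equivalent prototype computation with boolean masks (an illustrative
alternative to the loop above, assuming the numeric labels are exactly 0 and 1): '''
mic = train_tfidf[trainLabel == 0].sum(axis=0)
hoc = train_tfidf[trainLabel == 1].sum(axis=0)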
docCat = []
error = 0
# prototype norms (scalars; computed once instead of per test document)
mic_norm = np.linalg.norm(mic)
hoc_norm = np.linalg.norm(hoc)
for i in range(len(test_tfidf)):
    doc = test_tfidf[i]
    clasActual = testLabels[i]
    # query norm ||X||
    data_norm = np.linalg.norm(doc)
    # cosine similarity to each class prototype
    micSim = np.dot(doc, mic) / (mic_norm * data_norm)
    hocSim = np.dot(doc, hoc) / (hoc_norm * data_norm)
    if micSim < hocSim:
        clas = cat[1]
    else:
        clas = cat[0]
    print('Predicted class: %s. Similarity to microsoft: %0.2f, to hockey: %0.2f' % (clas, micSim, hocSim))
    if clasActual != clas:
        error += 1
print('Accuracy: %0.2f%%' % (100 * (1 - error / len(testLabels))))
''' The highest accuracy with KNN, 98.5%, is achieved for k equal to 1, 2, 4, 6 or 16; of these, k = 4 would be the most efficient choice. The Rocchio classifier reaches 100%, better than the best KNN result. Moreover, the time required to run KNN is much greater than the time needed to train and test with the Rocchio method. For this particular task the Rocchio algorithm is the better choice. '''
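''' A rough timing sketch supporting the efficiency claim above (illustrative only;
absolute numbers depend on the machine). KNN scans all 800 training vectors for
every query, while Rocchio compares each query against just two prototypes. '''
import time
start = time.perf_counter()
KNN_Test(train_tfidf, trainLabels, test_tfidf, testLabels, 4, 'Cosine')
print('KNN prediction time: %0.2f s' % (time.perf_counter() - start))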
# loading data
bankData = np.genfromtxt('bank_data.csv', delimiter=',', names=True,
                         dtype=('U7', int, float, int, 'U6', 'U10',
                                'U3', 'U3', 'U3', 'U3', 'U3', 'U3'))
# attribute names from the file header, dropping the id (first) and the pep label (last)
names = list(bankData.dtype.names[1:-1])
#instances & labels
instances = pd.DataFrame(bankData[names])
labels = bankData['pep']  # 1-D label array (avoids sklearn's column-vector warning)
# transforming categorial attributes
bank_numeric = pd.get_dummies(instances)
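''' A tiny illustration (toy frame of my own) of what get_dummies does: each
categorical column becomes one indicator column per category (0/1 or boolean,
depending on the pandas version), while numeric columns pass through unchanged. '''
demo = pd.DataFrame({'age': [25, 40], 'region': ['TOWN', 'RURAL']})
print(pd.get_dummies(demo))  # columns: age, region_RURAL, region_TOWN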
bank_numeric.describe(include='all')
# Creating training and testing sets
trainBank, testBank, trainBankLabel, testBankLabel = train_test_split(bank_numeric,labels, test_size=0.2, random_state=33)
# normalizing values
minmax_sk = preprocessing.MinMaxScaler().fit(trainBank)
trainBankNorm = minmax_sk.transform(trainBank)
testBankNorm = minmax_sk.transform(testBank)
trainBankNorm[0:5]
testBankNorm[0:5]
trainingDataUni = np.zeros(20)
testingDataUni = np.zeros(20)
trainingDataWei = np.zeros(20)
testingDataWei = np.zeros(20)
for k in range(1, 21):
    # fitting the classifiers
    knnUni = neighbors.KNeighborsClassifier(k, weights='uniform')
    knnWei = neighbors.KNeighborsClassifier(k, weights='distance')
    knnUni.fit(trainBankNorm, trainBankLabel)
    knnWei.fit(trainBankNorm, trainBankLabel)
    # testing the classifiers
    KnnTestUni = knnUni.predict(testBankNorm)
    KnnTestWei = knnWei.predict(testBankNorm)
    # print(classification_report(testBankLabel, KnnTestUni))
    # print(classification_report(testBankLabel, KnnTestWei))
    knncmUni = confusion_matrix(testBankLabel, KnnTestUni)
    knncmWei = confusion_matrix(testBankLabel, KnnTestWei)
    trainingDataUni[k-1] = knnUni.score(trainBankNorm, trainBankLabel)
    testingDataUni[k-1] = knnUni.score(testBankNorm, testBankLabel)
    trainingDataWei[k-1] = knnWei.score(trainBankNorm, trainBankLabel)
    testingDataWei[k-1] = knnWei.score(testBankNorm, testBankLabel)
# the values below reflect the last iteration only (k = 20)
print('Uniform weight confusion matrix')
print(confusion_matrix(testBankLabel, KnnTestUni))
print('Euclidean (distance) weight confusion matrix')
print(confusion_matrix(testBankLabel, KnnTestWei))
print('Uniform weight accuracy: %0.2f' % knnUni.score(testBankNorm, testBankLabel))
print('Euclidean dist. accuracy: %0.2f' % knnWei.score(testBankNorm, testBankLabel))
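''' An alternative way to pick k (a sketch, not part of the original workflow):
GridSearchCV cross-validates over a parameter grid on the training set instead of
reading the best k off the test curve. '''
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(neighbors.KNeighborsClassifier(),
                    {'n_neighbors': list(range(1, 21)), 'weights': ['uniform', 'distance']},
                    cv=5)
grid.fit(trainBankNorm, trainBankLabel)
print(grid.best_params_, grid.best_score_)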
''' Comparison using different k and weights'''
fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(1,1,1)
ax.set_title("KNN Classification with Sklearn")
# plot against explicit k values so the x axis starts at 1, not 0
ax.plot(ki, trainingDataUni, label='Training uniform dist.')
ax.plot(ki, testingDataUni, label='Testing uniform dist.')
ax.plot(ki, trainingDataWei, label='Training weighted dist.')
ax.plot(ki, testingDataWei, label='Testing weighted dist.')
ax.set_xticks(np.arange(1,21,1)) # x axis values
ax.legend(('Training uniform', 'Testing uniform', 'Training weighted', 'Testing weighted'), loc='upper center', ncol=2,
          bbox_to_anchor=(0.5, -0.15))
ax.set_ylabel('Accuracy')
ax.set_xlabel('k')
ax.axis([1, 20, .6, 1])
ax.grid()
fig.savefig("Knn_class_sklearn.png")
Image(filename='Knn_class_sklearn.png')
treeclf = tree.DecisionTreeClassifier(criterion='entropy', min_samples_split=int(round(len(trainBankLabel) ** .5)))
treeclf = treeclf.fit(trainBank, trainBankLabel)
treeTest = treeclf.predict(testBank)
print(classification_report(testBankLabel, treeTest))
print('Confusion matrix')
print(confusion_matrix(testBankLabel, treeTest))
print('Accuracy', treeclf.score(testBank, testBankLabel) * 100)
nb = naive_bayes.GaussianNB()
nb = nb.fit(trainBank, trainBankLabel)
nbTest = nb.predict(testBank)
print(classification_report(testBankLabel, nbTest))
print('Confusion matrix')
print(confusion_matrix(testBankLabel, nbTest))
print('Accuracy', nb.score(testBank, testBankLabel) * 100)
''' For this experiment I tested values of k from 1 to 20 and two weighting schemes: uniform and (Euclidean) distance-weighted. With distance weighting the classifier shows overfitting for every k, as expected from part 1, since each training point is its own zero-distance neighbour and training accuracy stays at 1.0. The uniform scheme has the same problem for every value of k except 17 and 18. Even so, the accuracy is not as good as that observed with other algorithms such as Rocchio using tf-idf weights. '''
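''' The census data itself is loaded here. The original filename is not preserved
in this notebook, so the path below is an assumption -- point it at the actual
census CSV (missing values are assumed to be marked with '?'). '''
censusData = pd.read_csv('census.csv', na_values='?')  # hypothetical filename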
censusData.describe(include='all')
''' Age has 198 missing values, which will be filled with the mean (about 38); workclass has over 500 missing values, so those rows are dropped. '''
agemean = censusData.age.mean()
censusData['age'] = censusData['age'].fillna(agemean)
censusData.dropna(axis=0, inplace=True)
censusData.describe(include='all')
scatter_matrix(censusData[['age','hours-per-week','education']], alpha=0.2, figsize=(9, 9), diagonal='kde')
Image(filename='P3-scatter.png')
censusData['income'].value_counts().plot(kind='bar', color = 'lightblue', title='income')
Image(filename='P3-income.png')
censusData['workclass'].value_counts().plot(kind='bar', color = 'lightblue', title='Work Class')
Image(filename='P3-wk.png')
censusData['marital-status'].value_counts().plot(kind='bar', color = 'lightblue', title='Marital Status')
Image(filename='P3-ms.png')
censusData['race'].value_counts().plot(kind='bar', color = 'lightblue', title='Race')
Image(filename='P3-race.png')
censusData['sex'].value_counts().plot(kind='bar', color = 'lightblue', title='Sex')
Image(filename='P3-sex.png')
# cross-tabulation of education and race
edu_race = pd.crosstab(censusData['education'], censusData['race'])
#plt.show(edu_race.plot(kind="bar", title= 'Education by Race'))
Image(filename='P3-education-by-race.png')
# workclass income
work_income = pd.crosstab(censusData['workclass'], censusData['income'])
#plt.show(work_income.plot(kind="bar", title= 'Income by Work Class'))
Image(filename='P3-income-by-wk.png')
# workclass race
work_race = pd.crosstab(censusData['workclass'], censusData['race'])
#plt.show(work_race.plot(kind="bar", title= 'Race by Work Class'))
Image(filename='P3-race-by-wk.png')
# race income
race_income = pd.crosstab(censusData['race'], censusData['income'])
#plt.show(race_income.plot(kind="bar", title = 'Income by Race'))
Image(filename='P3-Income-by-race.png')
''' Income classes'''
# data
low = censusData[censusData['income']=='<=50K']
hig = censusData[censusData['income']=='>50K']
# as percentage
race_income = pd.crosstab(censusData['race'], censusData['income'])
print('Distribution of income <=50K:', race_income['<=50K'] / race_income['<=50K'].sum())
print('Distribution of income >50K:', race_income['>50K'] / race_income['>50K'].sum())
#histogram
low['race'].value_counts().plot(kind='bar', color = 'lightblue' , title = 'Income<=50k by Race')
hig['race'].value_counts().plot(kind='bar', color = 'blue', title= 'Income >50k by Race')
Image(filename='P3-Income-less-50k.png')
Image(filename='P3-Income-greater-50k-race.png')
# as percentage
work_income = pd.crosstab(censusData['workclass'], censusData['income'])
print('Distribution of income <=50K:', work_income['<=50K'] / work_income['<=50K'].sum())
print('Distribution of income >50K:', work_income['>50K'] / work_income['>50K'].sum())
#histogram
low['workclass'].value_counts().plot(kind='bar', color = 'lightblue')
hig['workclass'].value_counts().plot(kind='bar', color = 'blue', title= 'Income >50k by Work Class')
Image(filename='P3-Income-less-50k-wk.png')
Image(filename='P3-Income-greater-50k-wk.png')
# as percentage
marital_income = pd.crosstab(censusData['marital-status'], censusData['income'])
print('Distribution of income <=50K:', marital_income['<=50K'] / marital_income['<=50K'].sum())
print('Distribution of income >50K:', marital_income['>50K'] / marital_income['>50K'].sum())
# histogram
low['marital-status'].value_counts().plot(kind='bar', color = 'lightblue')
hig['marital-status'].value_counts().plot(kind='bar', color = 'blue', title= 'Income >50k by Marital Status')
Image(filename='P3-Income-less-50k-marital.png')
Image(filename='P3-Income-greater-50k-ms.png')
# as percentage
sex_income = pd.crosstab(censusData['sex'], censusData['income'])
print('Distribution of income <=50K:', sex_income['<=50K'] / sex_income['<=50K'].sum())
print('Distribution of income >50K:', sex_income['>50K'] / sex_income['>50K'].sum())
# histogram
low['sex'].value_counts().plot(kind='bar', color = 'lightblue')
hig['sex'].value_counts().plot(kind='bar', color = 'blue', title= 'Income >50k by Gender')
Image(filename='P3-Income-less-50k-sex.png')
Image(filename='P3-Income-greater-50k-sex.png')
''' Exploratory data analysis:
The data was reduced because of missing values in the attribute workclass, and the analysis was performed on the reduced dataset. 75% of the sample has an income at or below 50K. About the same proportion work in the private sector; the rest are split roughly equally between self-employment and the public sector. In terms of marital status, about 50 percent of the sample are married and the remainder are single. Demographically, about 84% of the people in the sample are white and 10% black; the remainder are Native Americans and Latinos. About 66% of the sample are males and the rest females. In terms of education, race shows a significant difference: whites hold higher degrees more often than the distribution of the other races would suggest.
Having an income above 50K is more likely for the self-employed or public-sector workers than for those in the private sector, but race and gender matter as well: white males have a higher probability of earning such an income, and 91% of the self-employed are white. 86% of the people with incomes greater than 50K are married, in contrast to only 38% married among those earning 50K or less.
'''
''' Predictive Modeling and Model Evaluation '''
# converting variables
CensusNumeric = pd.get_dummies(censusData)
# Extracting class
incClass = CensusNumeric['income_>50K']
CensusNumeric.pop('income_<=50K')
CensusNumeric.pop('income_>50K')
# decision tree
treeclf = tree.DecisionTreeClassifier(criterion='entropy', min_samples_split=int(round(len(incClass) ** 0.5)))
# linear discriminant analysis
lda = LDA()
lda = lda.fit(CensusNumeric, incClass)
#naive bayes
nbclf = naive_bayes.GaussianNB()
# cross validation
tree_scores = cross_val_score(treeclf, CensusNumeric, incClass, cv=10)
lda_scores = cross_val_score(lda, CensusNumeric, incClass, cv=10)
nb_scores = cross_val_score(nbclf, CensusNumeric, incClass, cv=10)
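''' A sketch (my own addition) of getting an aggregate confusion matrix from the
same 10-fold setup via cross_val_predict, which collects the out-of-fold
prediction for every instance. '''
from sklearn.model_selection import cross_val_predict
tree_pred = cross_val_predict(treeclf, CensusNumeric, incClass, cv=10)
print(confusion_matrix(incClass, tree_pred))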
# average accuracy (mean +/- two standard deviations across the 10 folds)
print("Decision tree overall accuracy: %0.2f (+/- %0.2f)" % (tree_scores.mean(), tree_scores.std() * 2))
print("LDA overall accuracy: %0.2f (+/- %0.2f)" % (lda_scores.mean(), lda_scores.std() * 2))
print("Naive Bayes overall accuracy: %0.2f (+/- %0.2f)" % (nb_scores.mean(), nb_scores.std() * 2))
Image(filename='tree.png')