import numpy as np
import pandas as pd
import operator
import matplotlib.pyplot as plt
from IPython.display import Image
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import preprocessing
from sklearn import neighbors, tree, naive_bayes
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from pandas.plotting import scatter_matrix
def KNN(inX, dataSet, labels, k=3, measure='Minkowski', measure_D=2):
    '''K-Nearest Neighbour classification using Minkowski distance or Cosine similarity.'''
    # distance calculation
    if measure == 'Minkowski':
        dataSetSize = dataSet.shape[0]
        diffMat = np.tile(inX, (dataSetSize, 1)) - dataSet
        # absolute value keeps the metric valid for odd values of measure_D
        powDiffMat = np.abs(diffMat) ** measure_D
        powDistances = powDiffMat.sum(axis=1)
        distances = powDistances ** (1.0 / measure_D)
    elif measure == 'Cosine':
        # vector of norms of each row
        D_norm = np.array([np.linalg.norm(dataSet[i]) for i in range(len(dataSet))])
        # norm of the query vector ||X||
        data_norm = np.linalg.norm(inX)
        # similarity, turned into a distance
        sim = np.dot(dataSet, inX) / (D_norm * data_norm)
        distances = 1 - sim
    sortedDistIndices = distances.argsort()
    # voting with the lowest distances
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndices[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
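''' A minimal sanity check of KNN on toy data (illustrative arrays of my own, not
part of the assignment data): cluster 'a' lies along the x axis and cluster 'b'
along the y axis, so both metrics should recover the obvious labels. '''
toy_X = np.array([[1.0, 0.1], [0.9, 0.0], [0.1, 1.0], [0.0, 0.9]])
toy_y = np.array(['a', 'a', 'b', 'b'])
print(KNN(np.array([2.0, 0.1]), toy_X, toy_y, k=3))                    # expect 'a'
print(KNN(np.array([0.1, 2.0]), toy_X, toy_y, k=3, measure='Cosine'))  # expect 'b'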
# min-max normalization to [0, 1] for numeric columns; constant columns yield NaN (handled below with nan_to_num)
minmax_norm = lambda x: (x - x.min()) / (x.max() - x.min()) if (x.dtypes == np.float64 or x.dtypes == np.int64) else x
Label_set = lambda x: 'hockey' if x == 1 else 'microsoft'
# training data
trainData = np.genfromtxt("trainMatrixModified.txt", delimiter='\t' , dtype=float)
trainLabel= np.genfromtxt('trainClasses.txt', delimiter='\t' , dtype=float)
# testing data
testData = np.genfromtxt("testMatrixModified.txt", delimiter='\t' , dtype=float)
testLabel = np.genfromtxt('testClasses.txt', delimiter='\t' , dtype=float )
# transposing to document-by-term matrices (the input files are term-by-document)
trainDataPD = pd.DataFrame(trainData).transpose()
testDataPD = pd.DataFrame(testData).transpose()
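# quick sanity check: rows should now be documents and columns terms
print(trainDataPD.shape, testDataPD.shape)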
# data transformation
trainDataNorm = np.asarray(trainDataPD.apply(minmax_norm))
testDataNorm = np.asarray(testDataPD.apply(minmax_norm))
# label creation
# training
trainLabels = []
for label in trainLabel:
    trainLabels.append(Label_set(label[1]))
trainLabels = np.asarray(trainLabels)
# testing
testLabels = []
for label in testLabel:
    testLabels.append(Label_set(label[1]))
testLabels = np.asarray(testLabels)
# replacing NaNs from constant columns and extracting the numeric classes
trainDataNorm = np.nan_to_num(trainDataNorm)
trainLabel = trainLabel[:, 1]
testDataNorm = np.nan_to_num(testDataNorm)
testLabel = testLabel[:, 1]
def KNN_Test(trainDataNorm, trainLabels, testDataNorm, testLabels, k=3, measure='Minkowski', measure_D=2):
    numTestVecs = len(testLabels)
    ClassError = 0.0
    for i in range(numTestVecs):
        PredClass = KNN(testDataNorm[i, :], trainDataNorm, trainLabels, k, measure, measure_D)
        if PredClass != testLabels[i]:
            ClassError += 1.0
    print('%s accuracy is: %f with %d errors' % (measure, 1 - ClassError / numTestVecs, ClassError))
    return 1 - ClassError / numTestVecs
''' Testing for a range of values of k '''
ki = np.zeros(20)
CosData = np.zeros(20)
MinData = np.zeros(20)
for k in range(1, 21):
    ki[k-1] = k
    # CosData[k-1] = KNN_Test(trainDataNorm, trainLabels, testDataNorm, testLabels, k, 'Cosine')
    CosData[k-1] = KNN_Test(trainData.T, trainLabels, testData.T, testLabels, k, 'Cosine')
    MinData[k-1] = KNN_Test(trainDataNorm, trainLabels, testDataNorm, testLabels, k, 'Minkowski', 2)
limit = max(CosData.max(),MinData.max())+.05
inflimit = min(CosData.min(),MinData.min())-.05
fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(1,1,1)
ax.set_title("Cosine vs Minkowski")
# plot against explicit k values so the x axis starts at 1, not 0
ax.plot(ki, CosData, label='Cosine')
ax.plot(ki, MinData, label='Minkowski')
ax.set_xticks(np.arange(1,21,1)) # x axis values
ax.legend(('Cosine','Minkowski'),loc='upper center', ncol=2, bbox_to_anchor=(0.5, -0.15))
ax.set_ylabel('Accuracy')
ax.set_xlabel('k')
ax.axis([1, 20, inflimit, limit])
ax.grid()
fig.savefig('knn.png')
Image(filename='knn.png')
''' TFxIDF '''
# Number of Documents
N = len(trainDataPD[0])
# total term frequency across the collection (computed for reference; not used below)
tf = trainData.sum(axis=1)
# document frequency: number of documents containing each term (as a column vector)
DF = np.array([(trainData != 0).sum(1)]).T
# idf
IDFmat = np.ones(np.shape(trainData),dtype=float) * N
np.set_printoptions(precision=2,suppress=True, linewidth=420)
IDF = np.log2(np.divide(IDFmat,DF))
#tfxidf
train_tfidf = (trainData * IDF).T
# idf for testing: tile the training idf across the test matrix's 200 document columns
IDF = np.ones(np.shape(testData), dtype=float) * IDF[:, 0:200]
# tfxidf testing
test_tfidf = (testData * IDF).T
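''' As a rough cross-check (my own addition, not part of the assignment): sklearn's
TfidfTransformer computes a related weighting. It will not match the matrix above
exactly -- sklearn uses natural log, adds 1 to the idf values, and L2-normalizes
rows unless told otherwise. '''
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_sk = TfidfTransformer(norm=None, smooth_idf=False).fit_transform(trainData.T)
print(tfidf_sk.shape)  # docs x terms, same orientation as train_tfidf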
''' testing with tfxidf'''
CosTFData = np.zeros(20)
MinTFData = np.zeros(20)
for k in range(1, 21):
    CosTFData[k-1] = KNN_Test(train_tfidf, trainLabels, test_tfidf, testLabels, k, 'Cosine')
    MinTFData[k-1] = KNN_Test(train_tfidf, trainLabels, test_tfidf, testLabels, k, 'Minkowski', 2)
# plot using tfidf data
limit = max(CosData.max(), MinData.max(), CosTFData.max(), MinTFData.max()) + .05
minlimit = min(CosData.min(), MinData.min(), CosTFData.min(), MinTFData.min()) - .05
fig = plt.figure(figsize=(10,8))
ax = fig.add_subplot(1, 1, 1)
ax.set_title("KNN comparison Min-max vs tf/idf data")
# plot against explicit k values so the x axis starts at 1, not 0
ax.plot(ki, CosData, label='Cosine (min-max)')
ax.plot(ki, MinData, label='Minkowski (min-max)')
ax.plot(ki, CosTFData, label='Cosine (tf/idf)')
ax.plot(ki, MinTFData, label='Minkowski (tf/idf)')
ax.set_xticks(np.arange(1,21,1)) # x axis values
ax.legend(('Cosine (min-max)','Minkowski (min-max)','Cosine (tf/idf)','Minkowski (tf/idf)'),
loc='upper center', ncol=2, bbox_to_anchor=(0.5, -0.15))
ax.set_ylabel('Accuracy')
ax.set_xlabel('k')
ax.axis([1, 20, minlimit, limit])
ax.grid()
fig.savefig('tfxidf.png')
Image(filename='tfxidf.png')
'''
Cosine similarity proves to be the best regardless of how the data is normalized. Within cosine, tf-idf weighting performs better than min-max normalization. '''
''' Rocchio '''
cat = ['microsoft','hockey']
''' Build two prototype vectors, one for microsoft and one for hockey; each is the sum of the tf-idf vectors of the training documents of that class. '''
mic = np.zeros(train_tfidf.shape[1])
hoc = np.zeros(train_tfidf.shape[1])
for i in range(len(trainLabel)):
    if trainLabel[i] == 0:
        mic = mic + train_tfidf[i, :]
    else:
        hoc = hoc + train_tfidf[i, :]
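''' Equivalent prototype computation with boolean masks (an illustrative
alternative to the loop above, assuming the numeric labels are exactly 0 and 1): '''
mic = train_tfidf[trainLabel == 0].sum(axis=0)
hoc = train_tfidf[trainLabel == 1].sum(axis=0)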
docCat = []
error = 0
# prototype norms (scalars; computed once instead of per test document)
mic_norm = np.linalg.norm(mic)
hoc_norm = np.linalg.norm(hoc)
for i in range(len(test_tfidf)):
    doc = test_tfidf[i]
    clasActual = testLabels[i]
    # query norm ||X||
    data_norm = np.linalg.norm(doc)
    # cosine similarity to each class prototype
    micSim = np.dot(doc, mic) / (mic_norm * data_norm)
    hocSim = np.dot(doc, hoc) / (hoc_norm * data_norm)
    if micSim < hocSim:
        clas = cat[1]
    else:
        clas = cat[0]
    print('Predicted class: %s. Similarity to microsoft: %0.2f, to hockey: %0.2f' % (clas, micSim, hocSim))
    if clasActual != clas:
        error += 1
print('Accuracy: %0.2f%%' % (100 * (1 - error / len(testLabels))))
''' The highest accuracy with KNN, 98.5%, is achieved for k equal to 1, 2, 4, 6 or 16; of these, k = 4 would be the most efficient choice. The Rocchio classifier reaches 100%, better than the best KNN result. Moreover, the time required to run KNN is much greater than the time needed to train and test with the Rocchio method. For this particular task the Rocchio algorithm is the better choice. '''
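''' A rough timing sketch supporting the efficiency claim above (illustrative only;
absolute numbers depend on the machine). KNN scans all 800 training vectors for
every query, while Rocchio compares each query against just two prototypes. '''
import time
start = time.perf_counter()
KNN_Test(train_tfidf, trainLabels, test_tfidf, testLabels, 4, 'Cosine')
print('KNN prediction time: %0.2f s' % (time.perf_counter() - start))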
# loading data
bankData = np.genfromtxt('bank_data.csv', delimiter=',', names=True,
                         dtype=('U7', int, float, int, 'U6', 'U10',
                                'U3', 'U3', 'U3', 'U3', 'U3', 'U3'))
# attribute names from the file header, dropping the id (first) and the pep label (last)
names = list(bankData.dtype.names[1:-1])
#instances & labels
instances = pd.DataFrame(bankData[names])
labels = bankData['pep']  # 1-D label array (avoids sklearn's column-vector warning)
# transforming categorial attributes
bank_numeric = pd.get_dummies(instances)
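''' A tiny illustration (toy frame of my own) of what get_dummies does: each
categorical column becomes one indicator column per category (0/1 or boolean,
depending on the pandas version), while numeric columns pass through unchanged. '''
demo = pd.DataFrame({'age': [25, 40], 'region': ['TOWN', 'RURAL']})
print(pd.get_dummies(demo))  # columns: age, region_RURAL, region_TOWN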
bank_numeric.describe(include='all')
# Creating training and testing sets
trainBank, testBank, trainBankLabel, testBankLabel = train_test_split(bank_numeric,labels, test_size=0.2, random_state=33)
# normalizing values
minmax_sk = preprocessing.MinMaxScaler().fit(trainBank)
trainBankNorm = minmax_sk.transform(trainBank)
testBankNorm = minmax_sk.transform(testBank)
trainBankNorm[0:5]
testBankNorm[0:5]
trainingDataUni = np.zeros(20)
testingDataUni = np.zeros(20)
trainingDataWei = np.zeros(20)
testingDataWei = np.zeros(20)
for k in range(1, 21):
    # fitting the classifiers
    knnUni = neighbors.KNeighborsClassifier(k, weights='uniform')
    knnWei = neighbors.KNeighborsClassifier(k, weights='distance')
    knnUni.fit(trainBankNorm, trainBankLabel)
    knnWei.fit(trainBankNorm, trainBankLabel)
    # testing the classifiers
    KnnTestUni = knnUni.predict(testBankNorm)
    KnnTestWei = knnWei.predict(testBankNorm)
    # print(classification_report(testBankLabel, KnnTestUni))
    # print(classification_report(testBankLabel, KnnTestWei))
    knncmUni = confusion_matrix(testBankLabel, KnnTestUni)
    knncmWei = confusion_matrix(testBankLabel, KnnTestWei)
    trainingDataUni[k-1] = knnUni.score(trainBankNorm, trainBankLabel)
    testingDataUni[k-1] = knnUni.score(testBankNorm, testBankLabel)
    trainingDataWei[k-1] = knnWei.score(trainBankNorm, trainBankLabel)
    testingDataWei[k-1] = knnWei.score(testBankNorm, testBankLabel)
# the values below reflect the last iteration only (k = 20)
print('Uniform weight confusion matrix')
print(confusion_matrix(testBankLabel, KnnTestUni))
print('Euclidean (distance) weight confusion matrix')
print(confusion_matrix(testBankLabel, KnnTestWei))
print('Uniform weight accuracy: %0.2f' % knnUni.score(testBankNorm, testBankLabel))
print('Euclidean dist. accuracy: %0.2f' % knnWei.score(testBankNorm, testBankLabel))
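''' An alternative way to pick k (a sketch, not part of the original workflow):
GridSearchCV cross-validates over a parameter grid on the training set instead of
reading the best k off the test curve. '''
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(neighbors.KNeighborsClassifier(),
                    {'n_neighbors': list(range(1, 21)), 'weights': ['uniform', 'distance']},
                    cv=5)
grid.fit(trainBankNorm, trainBankLabel)
print(grid.best_params_, grid.best_score_)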
''' Comparison using different k and weights'''
fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(1,1,1)
ax.set_title("KNN Classification with Sklearn")
# plot against explicit k values so the x axis starts at 1, not 0
ax.plot(ki, trainingDataUni, label='Training uniform dist.')
ax.plot(ki, testingDataUni, label='Testing uniform dist.')
ax.plot(ki, trainingDataWei, label='Training weighted dist.')
ax.plot(ki, testingDataWei, label='Testing weighted dist.')
ax.set_xticks(np.arange(1,21,1)) # x axis values
ax.legend(('Training uniform', 'Testing uniform', 'Training weighted', 'Testing weighted'), loc='upper center', ncol=2,
          bbox_to_anchor=(0.5, -0.15))
ax.set_ylabel('Accuracy')
ax.set_xlabel('k')
ax.axis([1, 20, .6, 1])
ax.grid()
fig.savefig("Knn_class_sklearn.png")
Image(filename='Knn_class_sklearn.png')
treeclf = tree.DecisionTreeClassifier(criterion='entropy', min_samples_split=int(round(len(trainBankLabel) ** .5)))
treeclf = treeclf.fit(trainBank, trainBankLabel)
treeTest = treeclf.predict(testBank)
print(classification_report(testBankLabel, treeTest))
print('Confusion matrix')
print(confusion_matrix(testBankLabel, treeTest))
print('Accuracy', treeclf.score(testBank, testBankLabel) * 100)
nb = naive_bayes.GaussianNB()
nb = nb.fit(trainBank, trainBankLabel)
nbTest = nb.predict(testBank)
print(classification_report(testBankLabel, nbTest))
print('Confusion matrix')
print(confusion_matrix(testBankLabel, nbTest))
print('Accuracy', nb.score(testBank, testBankLabel) * 100)
''' For this experiment I tested values of k from 1 to 20 and two weighting schemes: uniform and (Euclidean) distance-weighted. With distance weighting the classifier shows overfitting for every k, as expected from part 1, since each training point is its own zero-distance neighbour and training accuracy stays at 1.0. The uniform scheme has the same problem for every value of k except 17 and 18. Even so, the accuracy is not as good as that observed with other algorithms such as Rocchio using tf-idf weights. '''
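''' The census data itself is loaded here. The original filename is not preserved
in this notebook, so the path below is an assumption -- point it at the actual
census CSV (missing values are assumed to be marked with '?'). '''
censusData = pd.read_csv('census.csv', na_values='?')  # hypothetical filename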
censusData.describe(include='all')
''' Age has 198 missing values, which will be filled with the mean (about 38); workclass has over 500 missing values, so those rows are dropped. '''
agemean = censusData.age.mean()
censusData['age'] = censusData['age'].fillna(agemean)
censusData.dropna(axis=0, inplace=True)
censusData.describe(include='all')
scatter_matrix(censusData[['age','hours-per-week','education']], alpha=0.2, figsize=(9, 9), diagonal='kde')
Image(filename='P3-scatter.png')
censusData['income'].value_counts().plot(kind='bar', color = 'lightblue', title='income')
Image(filename='P3-income.png')
censusData['workclass'].value_counts().plot(kind='bar', color = 'lightblue', title='Work Class')
Image(filename='P3-wk.png')
censusData['marital-status'].value_counts().plot(kind='bar', color = 'lightblue', title='Marital Status')
Image(filename='P3-ms.png')
censusData['race'].value_counts().plot(kind='bar', color = 'lightblue', title='Race')
Image(filename='P3-race.png')
censusData['sex'].value_counts().plot(kind='bar', color = 'lightblue', title='Sex')
Image(filename='P3-sex.png')
# cross-tabulation of education and race
edu_race = pd.crosstab(censusData['education'], censusData['race'])
#plt.show(edu_race.plot(kind="bar", title= 'Education by Race'))
Image(filename='P3-education-by-race.png')
# workclass income
work_income = pd.crosstab(censusData['workclass'], censusData['income'])
#plt.show(work_income.plot(kind="bar", title= 'Income by Work Class'))
Image(filename='P3-income-by-wk.png')
# workclass race
work_race = pd.crosstab(censusData['workclass'], censusData['race'])
#plt.show(work_race.plot(kind="bar", title= 'Race by Work Class'))
Image(filename='P3-race-by-wk.png')
# race income
race_income = pd.crosstab(censusData['race'], censusData['income'])
#plt.show(race_income.plot(kind="bar", title = 'Income by Race'))
Image(filename='P3-Income-by-race.png')
''' Income classes'''
# data
low = censusData[censusData['income']=='<=50K']
hig = censusData[censusData['income']=='>50K']
# as percentage
race_income = pd.crosstab(censusData['race'], censusData['income'])
print('Distribution of income <=50K:', race_income['<=50K'] / race_income['<=50K'].sum())
print('Distribution of income >50K:', race_income['>50K'] / race_income['>50K'].sum())
#histogram
low['race'].value_counts().plot(kind='bar', color = 'lightblue' , title = 'Income<=50k by Race')
hig['race'].value_counts().plot(kind='bar', color = 'blue', title= 'Income >50k by Race')
Image(filename='P3-Income-less-50k.png')
Image(filename='P3-Income-greater-50k-race.png')
# as percentage
work_income = pd.crosstab(censusData['workclass'], censusData['income'])
print('Distribution of income <=50K:', work_income['<=50K'] / work_income['<=50K'].sum())
print('Distribution of income >50K:', work_income['>50K'] / work_income['>50K'].sum())
#histogram
low['workclass'].value_counts().plot(kind='bar', color = 'lightblue')
hig['workclass'].value_counts().plot(kind='bar', color = 'blue', title= 'Income >50k by Work Class')
Image(filename='P3-Income-less-50k-wk.png')
Image(filename='P3-Income-greater-50k-wk.png')
# as percentage
marital_income = pd.crosstab(censusData['marital-status'], censusData['income'])
print('Distribution of income <=50K:', marital_income['<=50K'] / marital_income['<=50K'].sum())
print('Distribution of income >50K:', marital_income['>50K'] / marital_income['>50K'].sum())
# histogram
low['marital-status'].value_counts().plot(kind='bar', color = 'lightblue')
hig['marital-status'].value_counts().plot(kind='bar', color = 'blue', title= 'Income >50k by Marital Status')
Image(filename='P3-Income-less-50k-marital.png')
Image(filename='P3-Income-greater-50k-ms.png')
# as percentage
sex_income = pd.crosstab(censusData['sex'], censusData['income'])
print('Distribution of income <=50K:', sex_income['<=50K'] / sex_income['<=50K'].sum())
print('Distribution of income >50K:', sex_income['>50K'] / sex_income['>50K'].sum())
# histogram
low['sex'].value_counts().plot(kind='bar', color = 'lightblue')
hig['sex'].value_counts().plot(kind='bar', color = 'blue', title= 'Income >50k by Gender')
Image(filename='P3-Income-less-50k-sex.png')
Image(filename='P3-Income-greater-50k-sex.png')
''' Exploratory data analysis:
The data was reduced because of missing values in the attribute workclass, and the analysis was performed on the reduced dataset. 75% of the sample has an income at or below 50K. About the same proportion work in the private sector; the rest are split roughly equally between self-employment and the public sector. In terms of marital status, about 50 percent of the sample are married and the remainder are single. Demographically, about 84% of the people in the sample are white and 10% black; the remainder are Native Americans and Latinos. About 66% of the sample are males and the rest females. In terms of education, race shows a significant difference: whites hold higher degrees more often than the distribution of the other races would suggest.
Having an income above 50K is more likely for the self-employed or public-sector workers than for those in the private sector, but race and gender matter as well: white males have a higher probability of earning such an income, and 91% of the self-employed are white. 86% of the people with incomes greater than 50K are married, in contrast to only 38% married among those earning 50K or less.
'''
''' Predictive Modeling and Model Evaluation '''
# converting variables
CensusNumeric = pd.get_dummies(censusData)
# Extracting class
incClass = CensusNumeric['income_>50K']
CensusNumeric.pop('income_<=50K')
CensusNumeric.pop('income_>50K')
# decision tree
treeclf = tree.DecisionTreeClassifier(criterion='entropy', min_samples_split=int(round(len(incClass) ** 0.5)))
# linear discriminant analysis
lda = LDA()
lda = lda.fit(CensusNumeric, incClass)
#naive bayes
nbclf = naive_bayes.GaussianNB()
# cross validation
tree_scores = cross_val_score(treeclf, CensusNumeric, incClass, cv=10)
lda_scores = cross_val_score(lda, CensusNumeric, incClass, cv=10)
nb_scores = cross_val_score(nbclf, CensusNumeric, incClass, cv=10)
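''' A sketch (my own addition) of getting an aggregate confusion matrix from the
same 10-fold setup via cross_val_predict, which collects the out-of-fold
prediction for every instance. '''
from sklearn.model_selection import cross_val_predict
tree_pred = cross_val_predict(treeclf, CensusNumeric, incClass, cv=10)
print(confusion_matrix(incClass, tree_pred))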
# average accuracy (mean +/- two standard deviations across the 10 folds)
print("Decision tree overall accuracy: %0.2f (+/- %0.2f)" % (tree_scores.mean(), tree_scores.std() * 2))
print("LDA overall accuracy: %0.2f (+/- %0.2f)" % (lda_scores.mean(), lda_scores.std() * 2))
print("Naive Bayes overall accuracy: %0.2f (+/- %0.2f)" % (nb_scores.mean(), nb_scores.std() * 2))
Image(filename='tree.png')