import mysql.connector
import operator
from IPython.display import Image
import matplotlib.pyplot as plt
import pandas as pan
from pandas.plotting import scatter_matrix
import numpy as np
import pylab as pl
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import decomposition
from sklearn.model_selection import KFold, train_test_split
from sklearn import neighbors, tree, naive_bayes
from sklearn.cluster import KMeans
from sklearn.metrics import completeness_score, homogeneity_score
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import confusion_matrix
''' ====================================== S T O P W O R D S ======================================'''
import nltk
from nltk import word_tokenize,sent_tokenize,stem
from nltk.stem import WordNetLemmatizer
import string
stop_words =set(('a',
'about','above','after','again','against','all','am','an','and','any','are','as','at','be','because','been',
'before','being','below','between','both','but','by','can','did','do','does','doing','don','down','during',
'each','few','for','from','further','had','has','have','having','he','her','here','hers','herself','him',
'himself','his','how','i','if','in','into','is','it','its','itself','just','me','more','most','my','myself','no',
'nor','not','now','of','off','on','once','only','or','other','our','ours','ourselves','out','over',
'own','s','same','she','should','so','some','such','t','than','that','the',
'their','theirs','them','themselves','then','there','these','they','this','those','through','to',
'too','under','until','up','very','was','we','were','what','when','where','which',
'while','who','whom','why','will','with','you','your','yours','yourself','yourselves','``','...',"''","'s",
'!','"', '#','$','%','&',"'",'(',')','*','+',',','-','.','/',':',';','<','=','>','?','@','[','\\',']','^',
'_','`','{','|','}','~'))
punctuation = set(string.punctuation)
stop_words.update(punctuation)
print(stop_words)
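# Optional sketch, not part of the original pipeline: NLTK also ships a curated English
# stop-word list that could extend the manual set above. It assumes the 'stopwords'
# corpus is installed (nltk.download('stopwords')); nltk_stopwords is just an illustrative alias.
from nltk.corpus import stopwords as nltk_stopwords
print (len(set(nltk_stopwords.words('english')) - stop_words), 'NLTK stop words missing from the manual list')
# stop_words.update(nltk_stopwords.words('english'))  # uncomment to merge them in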
wnl = WordNetLemmatizer()
snowball = stem.snowball.EnglishStemmer()
# Database connection to MySQL
cnx = mysql.connector.connect(user='root', password='root',
host='localhost', port='8889',
database='Kiva')
cursor = cnx.cursor()
# Currency
cursor.execute("""SELECT `DISBURSAL_CURRENCY`, count(*)
FROM LOAN
GROUP BY `DISBURSAL_CURRENCY` ORDER BY count(*) DESC""")
Currency= cursor.fetchall()
CuP = pan.DataFrame(Currency, columns=['Currency','Count'])
%matplotlib inline
CuP.plot(x='Currency', y='Count', kind='bar', color = 'green', title = 'Currency Distribution', figsize=(10,6))
print (CuP.head(5))
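# Alternative sketch (assumption: the same cnx connection works with pandas): the query can be
# read straight into a DataFrame. pandas officially recommends a SQLAlchemy engine for
# non-SQLite databases, so this may emit a warning. CuP_alt is an illustrative name.
CuP_alt = pan.read_sql_query(
    """SELECT `DISBURSAL_CURRENCY` AS `Currency`, COUNT(*) AS `Count`
       FROM LOAN
       GROUP BY `DISBURSAL_CURRENCY` ORDER BY COUNT(*) DESC""", cnx)
print (CuP_alt.head(5))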
''' ====================================== T E A M D A T A ======================================'''
cursor.execute("""SELECT `teams_id`, `teams_loan_count`, `teams_loaned_amount`, `teams_member_count`,
`teams_membership_type`, `teams_category`,`teams_shortname`,`teams_loan_because`
FROM `TEAM` WHERE `teams_loan_because` IS NOT NULL""")
team_because = cursor.fetchall()
team_becausePan = pan.DataFrame(team_because)
len(team_because)
# get the number of rows in the resultset
numrows = int(cursor.rowcount)
teamBe = []
key = 0 # the field teams_id has repeated values, so this counter serves as a unique key.
for x in team_because:
#x = team_because[0]
temp = [key]
for i in x:
temp.append(i)
teamBe.append(temp)
key+=1
len(teamBe)
print ('Done!')
teamBePan = pan.DataFrame(teamBe, columns=['team_key','teams_id', 'teams_loan_count', 'teams_loaned_amount',
'teams_member_count','teams_membership_type',
'teams_category','teams_shortname','teams_loan_because'])
teamBePan.describe()
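# Equivalent pandas-only sketch (illustrative names teamBePan_alt / team_key): the surrogate key
# can be added directly to a DataFrame built from the raw result set, replacing the manual
# accumulation loop above; the final print just checks both tables have the same shape.
teamBePan_alt = pan.DataFrame(team_because,
                              columns=['teams_id', 'teams_loan_count', 'teams_loaned_amount',
                                       'teams_member_count', 'teams_membership_type',
                                       'teams_category', 'teams_shortname', 'teams_loan_because'])
teamBePan_alt.insert(0, 'team_key', range(len(teamBePan_alt)))
print (teamBePan_alt.shape == teamBePan.shape)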
%matplotlib inline
teamBePan['teams_category'].value_counts().plot(kind='bar',
color = 'green',
title = 'Team category distribution',
figsize=(10,6))
%matplotlib inline
teamBePan['teams_membership_type'].value_counts().plot(kind='bar',
color = 'green',
title = 'Team membership type distribution',
figsize=(10,6))
print ('Team membership type distribution: \nOpen:\t %0.2f \nClosed:\t %0.2f' % (
    teamBePan['teams_membership_type'].value_counts(normalize=True).iloc[0],
    teamBePan['teams_membership_type'].value_counts(normalize=True).iloc[1]))
%matplotlib inline
teamBePan['teams_loaned_amount'].value_counts().plot(kind='hist',
color = 'green',
title = 'Team loaned amount distribution',
figsize=(10,6))
%matplotlib inline
teamBePan['teams_loan_count'].value_counts().plot(kind='hist',
color = 'green',
title = 'Team loan count distribution',
figsize=(10,6))
team_type_amount = pan.crosstab(teamBePan['teams_loan_count'],
teamBePan['teams_membership_type'][teamBePan['teams_membership_type']=='open'])
%matplotlib inline
team_type_amount.plot(kind="hist", title = 'Loan count distribution for open teams', color = 'green', figsize=(10,6))
plt.show()
team_type_amount = pan.crosstab(teamBePan['teams_loan_count'],
teamBePan['teams_membership_type'][teamBePan['teams_membership_type']=='closed'])
%matplotlib inline
team_type_amount.plot(kind="hist", title = 'Loan count distribution for closed teams', color = 'green', figsize=(10,6))
plt.show()
''' clustering '''
cttt =[]
for i in teamBePan.iterrows():
if i[1]['teams_category'] not in cttt:
cttt.append(i[1]['teams_category'])
cat = {'Families':0,
'Common Interest':1,
'Sports Groups':2,
'Businesses':3,
'Businesses - Internal Groups':4,
'Colleges/Universities':5,
'Clubs':6,
'Religious Congregations':7,
'Local Area':8,
'Youth Groups':9,
'Alumni Groups':10,
'Other':11,
'Field Partner Fans':12,
'Schools':13,
'Friends':14,
'Events':15,
'Memorials':16}
category = np.asarray(teamBePan['teams_category'][:1000])
category_code = []
for i in category:
category_code.append(cat[i])
teamLabel = []
for i in category:
if i not in teamLabel:
teamLabel.append(i)
print ('Done!')
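# Equivalent pandas sketch (category_code_alt is an illustrative name): map the category names
# straight to their integer codes for the same first 1000 teams; the print checks it matches
# the loop-built category_code.
category_code_alt = teamBePan['teams_category'][:1000].map(cat).tolist()
print (category_code_alt == category_code)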
''' ====================================== T E A M D A T A NL ======================================'''
#i =0
tokens =[] # creating tokens from loan because
for reason in teamBePan.iterrows():
tokens.append(nltk.word_tokenize(reason[1][-1]))
#i+=1
#if i == 10:
# break
do_reg = len(tokens[1])
print (tokens[1], do_reg)
print ('------------------------------------------')
# removing stopwords
for i in range(2):
for team in tokens:
for word in team:
if word in stop_words:
while word in team:
team.remove(word)
do_reg_reduce = len(tokens[1])
print (tokens[1],do_reg_reduce)
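# Non-mutating sketch of the same stop-word filtering (tokens_filtered is an illustrative name):
# each token list is rebuilt in a single pass, which avoids calling list.remove repeatedly
# inside nested loops.
tokens_filtered = [[word for word in team if word not in stop_words] for team in tokens]
print (len(tokens_filtered[1]))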
''' using wordnet to stem words '''
wnl = WordNetLemmatizer()
snowball = stem.snowball.EnglishStemmer()
#save stem words for each team
team_stem_words = []
i = 0
for team in tokens:
temp = []
for word in team:
# stemming words
temp.append(wnl.lemmatize(snowball.stem(word)))
team_stem_words.append(temp)
i +=1
print (team_stem_words[1])
# including vector with stem words to teams data
teamBePan['loan_bec_vector'] = team_stem_words
# spread the stemmed word lists into a pandas DataFrame (one column per token position)
tsw = pan.DataFrame(team_stem_words)
# concatenate team data with tsw column-wise (rows align on the shared integer index)
teamData = pan.concat([teamBePan, tsw], axis=1)
len(teamData[0][0])
teamData.head()
team_loan_bec = teamData.pop('teams_loan_because')
teambecause = teamBePan['teams_loan_because'][0:1000]
''' freq vector ok '''
countvectorTeam = CountVectorizer()  # decode_error='ignore'
teamdata = countvectorTeam.fit_transform(teambecause)
team_frec_vector_pan = pan.DataFrame(teamdata.toarray(), columns=countvectorTeam.get_feature_names())
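# Hedged alternative sketch, not used in the clustering below: TF-IDF weighting down-weights
# words that appear in most team descriptions, which can sometimes separate clusters better
# than raw counts. tfidfTeam / team_tfidf are illustrative names.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfTeam = TfidfVectorizer(stop_words='english')
team_tfidf = tfidfTeam.fit_transform(teambecause)
print (team_tfidf.shape)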
''' kmeans '''
teamkmeans = KMeans(n_clusters=17, max_iter=500, verbose=1) # initialization
teamkmeans.fit(team_frec_vector_pan)
print ('Centroids:\n', teamkmeans.cluster_centers_)
cteam = teamkmeans.predict(team_frec_vector_pan)
print (completeness_score(category_code,cteam))
print (homogeneity_score(category_code,cteam))
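# Complementary check (an addition, not part of the original evaluation): the V-measure is the
# harmonic mean of homogeneity and completeness, and the adjusted Rand index corrects the
# cluster/label agreement for chance.
from sklearn.metrics import v_measure_score, adjusted_rand_score
print (v_measure_score(category_code, cteam))
print (adjusted_rand_score(category_code, cteam))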
''' PCA '''
# setting PCA from sklearn decomposition package
teamPCA = decomposition.PCA(n_components=len(team_frec_vector_pan.columns))
# reducing dimensionality
TeamDTtrans = teamPCA.fit(team_frec_vector_pan).transform(team_frec_vector_pan)
# variance captured by component
print ('Variance ratio: \n', teamPCA.explained_variance_ratio_ , '\n')
# variance captured at a few fixed cut-offs; the sketch after these prints computes the exact count needed for 90%
print ('\nVariance Captured with 250 components %0.3f' % sum(teamPCA.explained_variance_ratio_[:250]))
print ('Variance Captured with 50 components %0.3f' % sum(teamPCA.explained_variance_ratio_[:50]))
print ('Variance Captured with 12 components %0.3f \n' % sum(teamPCA.explained_variance_ratio_[:12]))
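# Sketch (cumvar / n_90 are illustrative names): instead of checking fixed cut-offs by hand,
# compute directly how many leading components are needed to reach 90% of the variance.
cumvar = np.cumsum(teamPCA.explained_variance_ratio_)
n_90 = int(np.argmax(cumvar >= 0.90)) + 1
print ('Components needed for 90%% of the variance: %d' % n_90)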
np.set_printoptions(precision=4,suppress=True)
print ('\nData in lower dimension:\n', TeamDTtrans)
TeamkmeansPCA = KMeans(n_clusters=17, max_iter=500, verbose=1) # initialization
TeamkmeansPCA.fit(TeamDTtrans)
print ('Centroids with PCA:\n', TeamkmeansPCA.cluster_centers_)
TeamcPCA = TeamkmeansPCA.predict(TeamDTtrans)
print (completeness_score(category_code,TeamcPCA))
print (homogeneity_score(category_code,TeamcPCA))
plt.figure(figsize=(12, 12))
# note: TeamDTtrans columns are 0-indexed, so column k holds principal component k+1
plt.subplot(221)
plt.scatter(TeamDTtrans[:, 1], TeamDTtrans[:, 2], c=TeamcPCA)
plt.ylabel('PC3')
plt.xlabel('PC2')
plt.title('KMeans')
plt.subplot(222)
plt.scatter(TeamDTtrans[:, 1], TeamDTtrans[:, 2], c=category_code)
plt.xlabel('PC2')
plt.title('Actual')
plt.subplot(223)
plt.scatter(TeamDTtrans[:, 2], TeamDTtrans[:, 3], c=TeamcPCA)
plt.ylabel('PC4')
plt.xlabel('PC3')
plt.subplot(224)
plt.scatter(TeamDTtrans[:, 2], TeamDTtrans[:, 3], c=category_code)
plt.xlabel('PC3')
#plt.savefig("TeamkmeansPCA1.png")  # if saving, call savefig before plt.show()
plt.show()
plt.figure(figsize=(12, 12))
plt.subplot(221)
plt.scatter(TeamDTtrans[:, 1], TeamDTtrans[:, 3], c=TeamcPCA)
plt.ylabel('PC4')
plt.xlabel('PC2')
plt.title('KMeans')
plt.subplot(222)
plt.scatter(TeamDTtrans[:, 1], TeamDTtrans[:, 3], c=category_code)
plt.xlabel('PC2')
plt.title('Actual')
plt.subplot(223)
plt.scatter(TeamDTtrans[:, 2], TeamDTtrans[:, 4], c=TeamcPCA)
plt.ylabel('PC5')
plt.xlabel('PC3')
plt.subplot(224)
plt.scatter(TeamDTtrans[:, 2], TeamDTtrans[:, 4], c=category_code)
plt.xlabel('PC3')
#plt.savefig("TeamkmeansPCA2.png")  # if saving, call savefig before plt.show()
plt.show()
''' ====================================== L O A N D A T A ======================================'''
cursor.execute("""SELECT `ID`, `STATUS`, `FUNDED_AMOUNT`, `PAID_AMOUNT`, `ACTIVITY`,
`SECTOR`, `LOANUSE`,`COUNTRY`,`TOWN`, `GEOPAIRS`, `PARTNER_ID`,
`DISBURSAL_AMOUNT`,`DISBURSAL_CURRENCY`,`DISBURSAL_DATE`, `LOAN_AMOUNT`,
`NONPAYMENT`,`CURRENCY_EXCHANGE`,`POSTED_DATE`,`FUNDED_DATE`,`PAID_DATE`,
`REFUNDED_DATE`,`JOURNAL_ENTRIES`,`JOURNAL_BULK_ENTRIES`
FROM `LOAN` WHERE `LOANUSE` IS NOT NULL""")
loan = cursor.fetchall()
loan_pan = pan.DataFrame(loan, columns= ['ID','STATUS', 'FUNDED_AMOUNT', 'PAID_AMOUNT', 'ACTIVITY',
'SECTOR', 'LOANUSE','COUNTRY','TOWN', 'GEOPAIRS', 'PARTNER_ID',
'DISBURSAL_AMOUNT','DISBURSAL_CURRENCY','DISBURSAL_DATE','LOAN_AMOUNT',
'NONPAYMENT','CURRENCY_EXCHANGE','POSTED_DATE','FUNDED_DATE','PAID_DATE',
'REFUNDED_DATE','JOURNAL_ENTRIES','JOURNAL_BULK_ENTRIES'])
loan_pan.set_index('ID',inplace=True)
loan_pan.describe(include='all')
loanuse = loan_pan['LOANUSE'] # loan use data
loan_tokens =[] # creating tokens
for use in loanuse:
loan_tokens.append(word_tokenize(use))
do_reg = len(loan_tokens[1])
print (loan_tokens[1], do_reg)
print ('------------------------------------------')
# removing stopwords
for i in range(2):
for loan in loan_tokens:
for word in loan:
if word in stop_words:
while word in loan:
loan.remove(word)
do_reg_reduce = len(loan_tokens[1])
print (loan_tokens[1],do_reg_reduce)
loan_stem_words = []
i = 0
for loan in loan_tokens:
temp = []
for word in loan:
# stemming words
temp.append(wnl.lemmatize(snowball.stem(word)))
loan_stem_words.append(temp)
i +=1
print (loan_stem_words[1])
# including vector with stem words to loan data
loan_pan['loan_use_vector'] = loan_stem_words
len(loan_stem_words)
# spread the stemmed word lists into a pandas DataFrame (one column per token position)
lsw = pan.DataFrame(loan_stem_words)
# concatenate loan data with lsw column-wise (reset the ID index so rows align by position)
loanData = pan.concat([loan_pan.reset_index(), lsw], axis=1)
len(loanData[0][0])
loanData.describe()
%matplotlib inline
loan_pan['COUNTRY'].value_counts().plot(kind='bar',
color = 'green',
title = 'Loan Distribution by Country',
figsize=(20,6))
%matplotlib inline
loan_pan['PARTNER_ID'].value_counts().plot(kind='bar',
color = 'green',
title = 'Loan Distribution by Partner',
figsize=(20,6))
%matplotlib inline
loan_pan['DISBURSAL_CURRENCY'].value_counts().plot(kind='bar',
color = 'green',
title = 'Loan Distribution by Currency',
figsize=(20,6))
loan_pan['DISBURSAL_CURRENCY'].value_counts(normalize=True)
%matplotlib inline
loan_pan['NONPAYMENT'].value_counts().plot(kind='bar',
color = 'green',
title = 'Loan Distribution by Nonpayment Type',
figsize=(20,6))
print ('Partner %0.2f' % (loan_pan['NONPAYMENT'].value_counts(normalize=True).iloc[0]))
print ('Lender %0.2f' % (loan_pan['NONPAYMENT'].value_counts(normalize=True).iloc[1]))
%matplotlib inline
loan_pan['LOAN_AMOUNT'].plot(kind='hist',color = 'green',title = 'Loan Amount Distribution',figsize=(20,6))
%matplotlib inline
loan_pan['DISBURSAL_AMOUNT'].value_counts().plot(kind='hist',
color = 'green',
title = 'Disbursal Amount Distribution',
figsize=(20,6))
''' freq vector ok '''
countvector = CountVectorizer()  # decode_error='ignore'
data = countvector.fit_transform(loanuse)
frec_vector_pan = pan.DataFrame(data.toarray(), columns=countvector.get_feature_names())
print (data.shape)
print (loanuse.shape)
print (frec_vector_pan.shape)
print (loan_pan.shape)
''' Clustering '''
sec = {'Food':0,
'Agriculture':1,
'Clothing':2,
'Construction':3,
'Health':4,
'Services':5,
'Retail':6,
'Arts':7,
'Housing':8,
'Transportation':9,
'Manufacturing':10,
'Entertainment':11}
sector = np.asarray(loan_pan['SECTOR'][:1000])
sector_code = []
for i in sector:
sector_code.append(sec[i])
label = []
for i in sector:
if i not in label:
label.append(i)
''' kmeans
kmeans = KMeans(n_clusters=12, max_iter=20, verbose=1) # initialization
kmeans.fit(frec_vector_pan)
#print ('Centoids: \n:', kmeans.cluster_centers_)
c = kmeans.predict(frec_vector_pan)
print (completeness_score(sector_code,c))
print (homogeneity_score(sector_code,c))
'''
''' PCA '''
# setting PCA from sklearn decomposition package
pca = decomposition.PCA(n_components=len(frec_vector_pan.columns))
# reducing dimensionality
DTtrans = pca.fit(frec_vector_pan).transform(frec_vector_pan)
# variance captured by component
print ('Variance ratio: \n', pca.explained_variance_ratio_ , '\n')
# we need roughly the first 300 components to capture 90% of the variance
print ('\nVariance Captured with 300 components %0.3f' % sum(pca.explained_variance_ratio_[:300]))
print ('Variance Captured with 50 components %0.3f' % sum(pca.explained_variance_ratio_[:50]))
print ('Variance Captured with 12 components %0.3f \n' % sum(pca.explained_variance_ratio_[:12]))
np.set_printoptions(precision=4,suppress=True)
print ('\nData in lower dimension:\n', DTtrans)
kmeansPCA = KMeans(n_clusters=12, max_iter=500, verbose=1) # initialization
kmeansPCA.fit(DTtrans)
print ('Centroids with PCA:\n', kmeansPCA.cluster_centers_)
cPCA = kmeansPCA.predict(DTtrans)
print (completeness_score(sector_code,cPCA[:1000]))  # compare against the 1000 labelled sectors
print (homogeneity_score(sector_code,cPCA[:1000]))
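# Hedged sketch (an assumption, not the code that produced the images loaded below) of how a
# figure like kmeansPCA1.png could be generated for the loan data, mirroring the team plots
# above. Only the first 1000 loans are drawn so the colouring by sector_code lines up, and
# savefig is called before show so the written file is not blank; the output name is illustrative.
plt.figure(figsize=(12, 6))
plt.subplot(121)
plt.scatter(DTtrans[:1000, 0], DTtrans[:1000, 1], c=cPCA[:1000])
plt.ylabel('PC2')
plt.xlabel('PC1')
plt.title('KMeans')
plt.subplot(122)
plt.scatter(DTtrans[:1000, 0], DTtrans[:1000, 1], c=sector_code)
plt.xlabel('PC1')
plt.title('Actual')
plt.savefig('loan_kmeansPCA_sketch.png')
plt.show()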
Image(filename='kmeansPCA1.png')
Image(filename='kmeansPCA2.png')