Team and Loan Clusters

In [11]:
import mysql.connector
import operator
from IPython.display import Image
import matplotlib.pyplot as plt
import pandas as pan
from pandas.tools.plotting import scatter_matrix
import numpy as np
import pylab as pl
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import decomposition
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
from sklearn import neighbors, tree, naive_bayes 
from sklearn.cluster import KMeans
from sklearn.metrics import completeness_score, homogeneity_score
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import confusion_matrix

NL

In [12]:
''' ====================================== S T O P  W O R D S  ======================================'''
import nltk
from nltk import word_tokenize,sent_tokenize,stem
from nltk.stem import WordNetLemmatizer
import string

stop_words =set(('a',
     'about','above','after','again','against','all','am','an','and','any','are','as','at','be','because','been',
     'before','being','below','between','both','but','by','can','did','do','does','doing','don','down','during',
     'each','few','for','from','further','had','has','have','having','he','her','here','hers','herself','him',
     'himself','his','how','i','if','in','into','is','it','its','itself','just','me','more','most','my','myself','no',
     'nor','not','now','of','off','on','once','only','or','other','our','ours','ourselves','out','over',
     'own','s','same','she','should','so','some','such','t','than','that','the',
     'their','theirs','them','themselves','then','there','these','they','this','those','through','to',
     'too','under','until','up','very','was','we','were','what','when','where','which',
     'while','who','whom','why','will','with','you','your','yours','yourself','yourselves','``','...',"''","'s",
     '!','"',     '#','$','%','&',"'",'(',')','*','+',',','-','.','/',':',';','<','=','>','?','@','[','\\',']','^',
     '_','`','{','|','}','~'))

punctutation = set(string.punctuation)

stop_words.update(punctutation)
print(stop_words)

wnl = WordNetLemmatizer()
snowball = stem.snowball.EnglishStemmer()
{'with', 'had', '*', 'down', 'during', 'has', 'myself', '[', 'any', 'those', 'into', 'i', 'once', 'was', 'its', '(', 'but', 'that', 'their', '"', 'hers', "'", 'how', '|', 'my', '%', 'above', 'an', 'ours', '`', 'our', "''", 'why', 'from', 'herself', 'not', 'ourselves', 'yourself', 'yourselves', 'at', 'when', 'theirs', 'doing', '\\', 'to', 'while', 'where', 'and', 'against', 'did', 'they', 'it', 'some', 'between', 's', 'a', 'over', 'can', 'such', '~', ',', 'or', 'whom', 'what', 'himself', 'don', 'having', 'all', 'on', 'very', '+', 'as', '``', '/', '#', 'up', ')', '=', 'too', 'about', '_', 'being', 'here', 'until', 'just', 'out', 'were', 'if', 'your', 'am', 'itself', 'there', 'which', 'her', 'own', 'his', 'he', 'further', 'under', '.', 'then', 'do', 'both', 'more', 'most', 'is', 'other', 'for', 'she', '{', ':', 'through', 'we', 'will', 'after', '$', '?', 'off', 'because', 'them', 'each', 'few', '!', 'again', 'been', 't', 'so', 'before', 'be', 'no', '^', 'by', 'in', 'only', 'now', '&', 'nor', 'same', 'themselves', 'should', 'of', 'you', '-', 'me', ';', 'have', '}', '<', 'does', 'than', ']', '>', '@', 'are', 'this', 'him', 'who', '...', 'yours', 'the', 'below', "'s", 'these'}

Database connection

In [13]:
# Database connection to MySQL
cnx = mysql.connector.connect(user='root', password='root',
                              host='localhost', port='8889',
                              database='Kiva')              
cursor = cnx.cursor() # cursos
In [14]:
# Currency

cursor.execute("""SELECT `DISBURSAL_CURRENCY`, count(*) 
                FROM LOAN 
                GROUP BY `DISBURSAL_CURRENCY` ORDER BY count(*) DESC""")

Currency= cursor.fetchall()

CuP = pan.DataFrame(Currency, columns=['Currency','Count'])
%matplotlib inline
CuP.plot(x=CuP['Currency'],kind='bar', color = 'green', title = 'Currency Distribution',figsize=(10,6))
print (CuP.head(5))
  Currency  Count
0      USD  40169
1      PEN  19158
2      PHP  14446
3      XOF  10414
4      GHS   7732

Team data

In [15]:
''' ====================================== T E A M  D A T A ======================================'''

cursor.execute("""SELECT `teams_id`, `teams_loan_count`, `teams_loaned_amount`, `teams_member_count`, 
               `teams_membership_type`, `teams_category`,`teams_shortname`,`teams_loan_because` 
               FROM `TEAM` WHERE `teams_loan_because` IS NOT NULL""")

team_because = cursor.fetchall()
team_becausePan = pan.DataFrame(team_because)

len(team_because)
# get the number of rows in the resultset
numrows = int(cursor.rowcount)

teamBe = []
key = 0 # the field teams_id has repited values, this would work as unique key.
for x in team_because:
    #x = team_because[0]
    temp = [key]
    for i in x:
        temp.append(i)
    teamBe.append(temp)
    key+=1
len(teamBe)

print ('Done!')
Done!
In [16]:
teamBePan = pan.DataFrame(teamBe, columns=['team_key','teams_id', 'teams_loan_count', 'teams_loaned_amount', 
                                           'teams_member_count','teams_membership_type', 
                                           'teams_category','teams_shortname','teams_loan_because'])
In [17]:
teamBePan.describe()
Out[17]:
team_key teams_id teams_loan_count teams_loaned_amount teams_member_count
count 8523.000000 8523.000000 8523.000000 8.523000e+03 8523.000000
mean 4261.000000 5925.817553 926.399742 2.965701e+04 41.284524
std 2460.522505 3426.694116 13809.199834 4.563748e+05 738.087237
min 0.000000 2.000000 0.000000 0.000000e+00 0.000000
25% 2130.500000 2938.500000 4.000000 1.000000e+02 3.000000
50% 4261.000000 5919.000000 31.000000 9.000000e+02 5.000000
75% 6391.500000 8880.500000 146.000000 4.300000e+03 11.000000
max 8522.000000 11885.000000 934363.000000 2.711678e+07 50244.000000
In [62]:
%matplotlib inline
teamBePan['teams_category'].value_counts().plot(kind='bar', 
                                                color = 'green', 
                                                title = 'Team category distribution',
                                                figsize=(10,6))
Out[62]:
<matplotlib.axes._subplots.AxesSubplot at 0x11ea66b38>
In [7]:
%matplotlib inline
teamBePan['teams_membership_type'].value_counts().plot(kind='bar', 
                                                color = 'green', 
                                                title = 'Team category distribution',
                                                figsize=(10,6))


print ('Team category distribution: \nOpen:\t %0.2f \nClosed:\t %0.2f' % (
        teamBePan['teams_membership_type'].value_counts('open')[0],
        teamBePan['teams_membership_type'].value_counts('open')[1]))
Team category distribution: 
Open:	 0.76 
Closed:	 0.24
In [8]:
%matplotlib inline
teamBePan['teams_loaned_amount'].value_counts().plot(kind='hist', 
                                                color = 'green', 
                                                title = 'Team category distribution',
                                                figsize=(10,6))
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x11d286048>
In [65]:
%matplotlib inline
teamBePan['teams_loan_count'].value_counts().plot(kind='hist', 
                                                color = 'green', 
                                                title = 'Team category distribution',
                                                figsize=(10,6))
Out[65]:
<matplotlib.axes._subplots.AxesSubplot at 0x1342381d0>
In [77]:
team_type_amount = pan.crosstab(teamBePan['teams_loan_count'], 
                                teamBePan['teams_membership_type'][teamBePan['teams_membership_type']=='open'])
%matplotlib inline
plt.show(team_type_amount.plot(kind="hist", title = 'Outcome type to sex',color = 'green',figsize=(10,6)))
In [78]:
team_type_amount = pan.crosstab(teamBePan['teams_loan_count'], 
                                teamBePan['teams_membership_type'][teamBePan['teams_membership_type']=='closed'])
%matplotlib inline
plt.show(team_type_amount.plot(kind="hist", title = 'Outcome type to sex',color = 'green',figsize=(10,6)))
In [21]:
''' clustering '''
cttt =[]
for i in teamBePan.iterrows():
    if i[1]['teams_category'] not in cttt:
        cttt.append(i[1]['teams_category'])

cat = {'Families':0,
 'Common Interest':1,
 'Sports Groups':2,
 'Businesses':3,
 'Businesses - Internal Groups':4,
 'Colleges/Universities':5,
 'Clubs':6,
 'Religious Congregations':7,
 'Local Area':8,
 'Youth Groups':9,
 'Alumni Groups':10,
 'Other':11,
 'Field Partner Fans':12,
 'Schools':13,
 'Friends':14,
 'Events':15,
 'Memorials':16}
 
category = np.asarray(teamBePan['teams_category'][:1000])
category_code = []
for i in category:
    category_code.append(cat[i])

teamLabel = []
for i in category:
    if i not in teamLabel:
        teamLabel.append(i)

print ('Done!')
Done!
In [12]:
''' ====================================== T E A M  D A T A NL ======================================'''
#i =0 
tokens =[] # creating tokens from loan because
for reason in teamBePan.iterrows():
    tokens.append(nltk.word_tokenize(reason[1][-1]))
    #i+=1
    #if i == 10:
    #    break
do_reg = len(tokens[1])
print (tokens[1], do_reg)
print ('------------------------------------------')

# removing stopwords
for i in range(2):
    for team in tokens:
        for word in team:
            if word in stop_words: 
                while word in team:
                    team.remove(word)
            do_reg_reduce = len(tokens[1])
print (tokens[1],do_reg_reduce)
['We', 'believe', 'the', 'children', 'are', 'our', 'future', '.', 'Teach', 'them', 'well', 'and', 'let', 'them', 'lead', 'the', 'way', '.', 'Show', 'them', 'all', 'the', 'beauty', 'we', 'possess', 'inside', '.', 'You', "'ve", 'got', 'to', 'give', 'them', 'a', 'sense', 'of', 'pride', '.', 'To', 'make', 'it', 'easier', '-', 'let', 'the', 'children', "'s", 'laughter', 'remind', 'us', 'how', 'we', 'used', 'to', 'be', '.'] 56
------------------------------------------
['We', 'believe', 'children', 'future', 'Teach', 'well', 'let', 'lead', 'way', 'Show', 'beauty', 'possess', 'inside', 'You', "'ve", 'got', 'give', 'sense', 'pride', 'To', 'make', 'easier', 'let', 'children', 'laughter', 'remind', 'us', 'used'] 28
In [13]:
''' using wordnet to stem words '''
wnl = WordNetLemmatizer()
snowball = stem.snowball.EnglishStemmer()

#save stem words for each team
team_stem_words = []
i = 0
for team in tokens:
    temp = []
    for word in team:
        # stemming words
        temp.append(wnl.lemmatize(snowball.stem(word)))
    team_stem_words.append(temp)
    i +=1

print (team_stem_words[1])
['we', 'believ', 'child', 'futur', 'teach', 'well', 'let', 'lead', 'way', 'show', 'beauti', 'posse', 'insid', 'you', 've', 'got', 'give', 'sen', 'pride', 'to', 'make', 'easier', 'let', 'child', 'laughter', 'remind', 'u', 'use']
In [14]:
# including vector with stem words to teams data
teamBePan['loan_bec_vector'] = team_stem_words

# sparsing words in panda dadaframe
tsw = pan.DataFrame(team_stem_words)

# concat of team data with tsw by axis (team_key)
teamData = pan.concat([teamBePan, tsw], axis=1)
len(teamData[0][0])

teamData.head()
team_loan_bec = teamData .pop('teams_loan_because')
In [25]:
teambecause = teamBePan['teams_loan_because'][0:1000]
''' freq vector ok '''
countvectorTeam =CountVectorizer() # decode_error='ignore')
teamdata = countvectorTeam.fit_transform(teambecause)
team_frec_vector_pan = pan.DataFrame(teamdata.toarray(), columns=countvectorTeam.get_feature_names())


''' kmeans '''
teamkmeans = KMeans(n_clusters=17, max_iter=500, verbose=1) # initialization
teamkmeans.fit(team_frec_vector_pan)
print ('Centoids: \n:', teamkmeans.cluster_centers_)
cteam = teamkmeans.predict(team_frec_vector_pan)

print (completeness_score(category_code,cteam))
print (homogeneity_score(category_code,cteam))

''' PCA '''
# setting PCA from sklearn decomposition package
teamPCA = decomposition.PCA(n_components=len(team_frec_vector_pan.columns))
# reducing dimensionality 
TeamDTtrans = teamPCA.fit(team_frec_vector_pan).transform(team_frec_vector_pan)

# variance captured by component
print ('Variance ratio: \n', teamPCA.explained_variance_ratio_ , '\n')

# we need to take the first 300 component to get 90% of the variance
print ('\nVariance Captured with 250 components %0.3f' % sum(teamPCA.explained_variance_ratio_[:250]))
print ('Variance Captured with 50 components %0.3f' % sum(teamPCA.explained_variance_ratio_[:50]))
print ('Variance Captured with 12 components %0.3f \n' % sum(teamPCA.explained_variance_ratio_[:11]))


np.set_printoptions(precision=4,suppress=True)
print ('\nData in lower dimension:\n', TeamDTtrans)

TeamkmeansPCA = KMeans(n_clusters=17, max_iter=500, verbose=1) # initialization
TeamkmeansPCA.fit(TeamDTtrans)
print ('Centoids with PCA: \n:', TeamkmeansPCA.cluster_centers_)
TeamcPCA = TeamkmeansPCA.predict(TeamDTtrans)

print (completeness_score(category_code,TeamcPCA))
print (homogeneity_score(category_code,TeamcPCA))
Initialization complete

: [[ 0.      0.      0.     ...,  0.      0.      0.    ]
 [ 0.0021 -0.     -0.     ...,  0.0021  0.0021  0.0021]
 [-0.     -0.      0.0079 ..., -0.     -0.     -0.    ]
 ..., 
 [ 0.      0.      0.     ...,  0.      0.      0.    ]
 [-0.     -0.     -0.     ..., -0.     -0.     -0.    ]
 [ 0.      0.      0.     ...,  0.      0.      0.    ]]


completeness_score 0.0466395807697
homogeneity_score  0.0313911425056


Variance Captured with 250 components 0.908
Variance Captured with 50 components 0.655
Variance Captured with 12 components 0.436 


Data in lower dimension:
 [[-1.6015 -0.1753  0.5332 ..., -0.     -0.      0.    ]
 [ 3.3826  1.3434 -0.1131 ...,  0.      0.     -0.    ]
 [-0.0834 -0.1772  1.1174 ...,  0.     -0.     -0.    ]
 ..., 
 [ 1.6843 -1.065   0.7969 ...,  0.      0.      0.    ]
 [-0.7434 -0.0154 -0.6323 ..., -0.      0.     -0.    ]
 [ 0.4519 -0.2567  0.2685 ...,  0.      0.     -0.    ]]

Centoids with PCA: 
: [[  2.0575  -1.0234   0.2513 ...,   0.       0.      -0.    ]
 [ 15.6384   4.1047  -4.9156 ...,   0.       0.      -0.    ]
 [ -1.1177  -0.0207   0.8399 ...,  -0.      -0.       0.    ]
 ..., 
 [ -1.2161  -0.446   -0.4249 ...,  -0.      -0.       0.    ]
 [  4.322   -0.4512  -1.527  ...,   0.       0.      -0.    ]
 [  5.7859   1.4534   1.9359 ...,   0.       0.      -0.    ]]


completeness_score 0.0438386383352
homogeneity_score  0.035863857614


In [26]:
plt.figure(figsize=(12, 12))
plt.subplot(221)
plt.scatter(TeamDTtrans[:, 1], TeamDTtrans[:, 2], c= TeamcPCA)
plt.ylabel('PC1')
plt.xlabel('PC2') 
plt.title('KMeans')
plt.subplot(222)
plt.scatter(TeamDTtrans[:, 1], TeamDTtrans[:, 2], c= category_code)
plt.xlabel('PC2') 
plt.title('Actual')
plt.subplot(223)
plt.scatter(TeamDTtrans[:, 2], TeamDTtrans[:, 3], c= TeamcPCA)
plt.ylabel('PC1')
plt.xlabel('PC3') 
plt.subplot(224)
plt.scatter(TeamDTtrans[:, 2], TeamDTtrans[:, 3], c= category_code)
plt.xlabel('PC3') 
plt.show()
#plt.savefig("TeamkmeansPCA1.png")

plt.figure(figsize=(12, 12))
plt.subplot(221)
plt.scatter(TeamDTtrans[:, 1], TeamDTtrans[:, 3], c= TeamcPCA)
plt.ylabel('PC1')
plt.xlabel('PC3') 
plt.title('KMeans')
plt.subplot(222)
plt.scatter(TeamDTtrans[:, 1], TeamDTtrans[:, 3], c= category_code)
plt.xlabel('PC3') 
plt.title('Actual')
plt.subplot(223)
plt.scatter(TeamDTtrans[:, 2], TeamDTtrans[:, 4], c= TeamcPCA)
plt.ylabel('PC2')
plt.xlabel('PC4') 
plt.subplot(224)
plt.scatter(TeamDTtrans[:, 2], TeamDTtrans[:, 4], c= category_code)
plt.xlabel('PC4') 
plt.show()
#plt.savefig("TeamkmeansPCA2.png")

Loan data

In [6]:
''' ====================================== L O A N  D A T A ======================================'''

cursor.execute("""SELECT `ID`, `STATUS`, `FUNDED_AMOUNT`, `PAID_AMOUNT`, `ACTIVITY`, 
               `SECTOR`, `LOANUSE`,`COUNTRY`,`TOWN`, 'GEOPAIRS', `PARTNER_ID`,
               `DISBURSAL_AMOUNT`,`DISBURSAL_CURRENCY`,`DISBURSAL_DATE`, `LOAN_AMOUNT`,
               `NONPAYMENT`,`CURRENCY_EXCHANGE`,`POSTED_DATE`,`FUNDED_DATE`,`PAID_DATE`,
               `REFUNDED_DATE`,`JOURNAL_ENTRIES`,`JOURNAL_BULK_ENTRIES`
               FROM `LOAN` WHERE `LOANUSE` IS NOT NULL""")

loan = cursor.fetchall()
loan_pan = pan.DataFrame(loan, columns= ['ID','STATUS', 'FUNDED_AMOUNT', 'PAID_AMOUNT', 'ACTIVITY', 
               'SECTOR', 'LOANUSE','COUNTRY','TOWN', 'GEOPAIRS', 'PARTNER_ID',
               'DISBURSAL_AMOUNT','DISBURSAL_CURRENCY','DISBURSAL_DATE','LOAN_AMOUNT',
               'NONPAYMENT','CURRENCY_EXCHANGE','POSTED_DATE','FUNDED_DATE','PAID_DATE',
               'REFUNDED_DATE','JOURNAL_ENTRIES','JOURNAL_BULK_ENTRIES'])

loan_pan.set_index('ID',inplace=True)
loan_pan.describe(include='all')
/Users/Julio/anaconda/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
Out[6]:
STATUS FUNDED_AMOUNT PAID_AMOUNT ACTIVITY SECTOR LOANUSE COUNTRY TOWN GEOPAIRS PARTNER_ID ... DISBURSAL_DATE LOAN_AMOUNT NONPAYMENT CURRENCY_EXCHANGE POSTED_DATE FUNDED_DATE PAID_DATE REFUNDED_DATE JOURNAL_ENTRIES JOURNAL_BULK_ENTRIES
count 165444 165444.000000 161563 165444 165444 165444 165444 161417 165444 165444.000000 ... 165444 165444.000000 165444 165444 165444 165396 96005 1 165189.000000 165189.000000
unique 5 NaN 13775 148 16 117914 53 7537 1 NaN ... 65332 NaN 2 3 74770 151685 76185 1 NaN NaN
top paid NaN 0.00 Retail Food To purchase additional products to sell Peru Managua GEOPAIRS NaN ... 2009-04-20 07:00:00 NaN partner partner 2007-10-01 07:00:04 2005-03-31 06:27:55 2008-04-24 03:04:05 2009-12-01 07:03:29 NaN NaN
freq 95818 NaN 12089 12413 48272 963 20260 3493 165444 NaN ... 708 NaN 134886 95391 65 202 183 1 NaN NaN
first NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 2005-04-14 05:27:55 NaN NaN NaN 2005-04-15 17:00:00 2005-03-31 06:27:55 NaN NaN NaN NaN
last NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 2010-02-25 08:00:00 NaN NaN NaN 2010-01-29 01:00:12 2010-01-29 00:53:37 NaN NaN NaN NaN
mean NaN 703.726397 NaN NaN NaN NaN NaN NaN NaN 71.610333 ... NaN 703.787445 NaN NaN NaN NaN NaN NaN 1.339550 0.987481
std NaN 625.568673 NaN NaN NaN NaN NaN NaN NaN 37.800782 ... NaN 625.607570 NaN NaN NaN NaN NaN NaN 0.666729 0.162614
min NaN 0.000000 NaN NaN NaN NaN NaN NaN NaN 1.000000 ... NaN 25.000000 NaN NaN NaN NaN NaN NaN 1.000000 0.000000
25% NaN 325.000000 NaN NaN NaN NaN NaN NaN NaN 43.000000 ... NaN 325.000000 NaN NaN NaN NaN NaN NaN NaN NaN
50% NaN 550.000000 NaN NaN NaN NaN NaN NaN NaN 72.000000 ... NaN 550.000000 NaN NaN NaN NaN NaN NaN NaN NaN
75% NaN 925.000000 NaN NaN NaN NaN NaN NaN NaN 102.000000 ... NaN 925.000000 NaN NaN NaN NaN NaN NaN NaN NaN
max NaN 10000.000000 NaN NaN NaN NaN NaN NaN NaN 159.000000 ... NaN 10000.000000 NaN NaN NaN NaN NaN NaN 19.000000 19.000000

13 rows × 22 columns

In [7]:
loanuse = loan_pan['LOANUSE'] # loan use data
loan_tokens =[] # creating tokens
for use in loanuse:
    loan_tokens.append(word_tokenize(use))
do_reg = len(loan_tokens[1])
print (loan_tokens[1], do_reg)
print ('------------------------------------------')
# removing stopwords
for i in range(2):
    for loan in loan_tokens:
        for word in loan:
            if word in stop_words: 
                while word in loan:
                    loan.remove(word)
            do_reg_reduce = len(loan_tokens[1])
print (loan_tokens[1],do_reg_reduce)
['Buying', 'more', 'produce', 'each', 'time', 'for', 'greater', 'profit'] 8
------------------------------------------
['Buying', 'produce', 'time', 'greater', 'profit'] 5
In [8]:
loan_stem_words = []
i = 0
for loan in loan_tokens:
    temp = []
    for word in loan:
        # stemming words
        temp.append(wnl.lemmatize(snowball.stem(word)))
    loan_stem_words.append(temp)
    i +=1
print (loan_stem_words[1])
['buy', 'produc', 'time', 'greater', 'profit']
In [9]:
# including vector with stem words to loan data
loan_pan['loan_use_vector'] = loan_stem_words
len(loan_stem_words)

# sparsing words in panda dadaframe
lsw = pan.DataFrame(loan_stem_words)

# concat of team data with tsw by axis (team_key)
loanData = pan.concat([loan_pan, lsw], axis=1)
len(loanData[0][0])

loanData.describe()
/Users/Julio/anaconda/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
Out[9]:
FUNDED_AMOUNT PARTNER_ID LOAN_AMOUNT JOURNAL_ENTRIES JOURNAL_BULK_ENTRIES
count 165444.000000 165444.000000 165444.000000 165189.000000 165189.000000
mean 703.726397 71.610333 703.787445 1.339550 0.987481
std 625.568673 37.800782 625.607570 0.666729 0.162614
min 0.000000 1.000000 25.000000 1.000000 0.000000
25% NaN NaN NaN NaN NaN
50% NaN NaN NaN NaN NaN
75% NaN NaN NaN NaN NaN
max 10000.000000 159.000000 10000.000000 19.000000 19.000000
In [20]:
%matplotlib inline
loan_pan['COUNTRY'].value_counts().plot(kind='bar', 
                                                color = 'green', 
                                                title = 'Loan Distribution by Country',
                                                figsize=(20,6))
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x12a328518>
In [31]:
%matplotlib inline
loan_pan['PARTNER_ID'].value_counts().plot(kind='bar', 
                                                color = 'green', 
                                                title = 'Loan Distribution by Partner',
                                                figsize=(20,6))
Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x13603ca20>
In [81]:
%matplotlib inline
loan_pan['DISBURSAL_CURRENCY'].value_counts().plot(kind='bar', 
                                                color = 'green', 
                                                title = 'Loan Distribution by Currency',
                                                figsize=(20,6))
loan_pan['DISBURSAL_CURRENCY'].value_counts('open')
Out[81]:
USD    0.242789
PEN    0.115773
PHP    0.087317
XOF    0.062946
GHS    0.046735
NIO    0.046560
TZS    0.040793
NGN    0.038400
TJS    0.038091
KHR    0.035184
UGX    0.023833
MXN    0.019795
VND    0.017994
PYG    0.015782
KES    0.015280
WST    0.014972
AZN    0.014609
PKR    0.014392
BOB    0.013908
DOP    0.011811
IDR    0.009042
SDG    0.008728
SLL    0.007924
UAH    0.007858
MNT    0.007707
AFN    0.006576
MZN    0.005428
HNL    0.004848
GTQ    0.004582
NPR    0.003995
RWF    0.003518
BAM    0.003482
LRD    0.002702
CRC    0.001626
MDL    0.001094
XAF    0.000882
KGS    0.000822
LBP    0.000568
BGN    0.000387
IQD    0.000381
AMD    0.000248
CDF    0.000230
HTG    0.000212
COP    0.000121
CLP    0.000054
JOD    0.000024
Name: DISBURSAL_CURRENCY, dtype: float64
In [36]:
%matplotlib inline
loan_pan['NONPAYMENT'].value_counts().plot(kind='bar', 
                                                color = 'green', 
                                                title = 'Loan Distribution by Country',
                                                figsize=(20,6))
print ('Partner %0.2f' % (loan_pan['NONPAYMENT'].value_counts('open')[0]))
print ('lender %0.2f' % (loan_pan['NONPAYMENT'].value_counts('open')[1]))
Partner 0.82
lender 0.18
In [33]:
%matplotlib inline
loan_pan['LOAN_AMOUNT'].plot(kind='hist',color = 'green',title = 'Loan Distribution by Country',figsize=(20,6))
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x1378c4c18>
In [39]:
%matplotlib inline
loan_pan['DISBURSAL_AMOUNT'].value_counts().plot(kind='hist', 
                                                color = 'green', 
                                                title = 'Loan Distribution by Country',
                                                figsize=(20,6))
Out[39]:
<matplotlib.axes._subplots.AxesSubplot at 0x13a553278>
In [10]:
''' freq vector ok '''
countvector =CountVectorizer() # decode_error='ignore')
data = countvector.fit_transform(loanuse)
frec_vector_pan = pan.DataFrame(data.toarray(), columns=countvector.get_feature_names())

print (data.shape)
print (loanuse.shape)
print (frec_vector_pan.shape)
print (loan_pan.shape)
(165444, 17135)
(165444,)
(165444, 17135)
(165444, 23)
In [11]:
''' Clustering ''' 

sec = {'Food':0,
 'Agriculture':1,
 'Clothing':2,
 'Construction':3,
 'Health':4,
 'Services':5,
 'Retail':6,
 'Arts':7,
 'Housing':8,
 'Transportation':9,
 'Manufacturing':10,
 'Entertainment':11}
sector = np.asarray(loan_pan['SECTOR'][:1000])
sector_code = []
for i in sector:
    sector_code.append(sec[i])
label = []
for i in sector:
    if i not in label:
        label.append(i)
In [8]:
''' kmeans 

kmeans = KMeans(n_clusters=12, max_iter=20, verbose=1) # initialization
kmeans.fit(frec_vector_pan)
#print ('Centoids: \n:', kmeans.cluster_centers_)
c = kmeans.predict(frec_vector_pan)

print (completeness_score(sector_code,c))
print (homogeneity_score(sector_code,c)) 
'''
Out[8]:
" kmeans \n\nkmeans = KMeans(n_clusters=12, max_iter=20, verbose=1) # initialization\nkmeans.fit(frec_vector_pan)\n#print ('Centoids: \n:', kmeans.cluster_centers_)\nc = kmeans.predict(frec_vector_pan)\n\nprint (completeness_score(sector_code,c))\nprint (homogeneity_score(sector_code,c)) \n"
In [ ]:
''' PCA '''
# setting PCA from sklearn decomposition package
pca = decomposition.PCA(n_components=len(frec_vector_pan.columns))
# reducing dimensionality 
DTtrans = pca.fit(frec_vector_pan).transform(frec_vector_pan)

# variance captured by component
print ('Variance ratio: \n', pca.explained_variance_ratio_ , '\n')

# we need to take the first 300 component to get 90% of the variance
print ('\nVariance Captured with 300 components %0.3f' % sum(pca.explained_variance_ratio_[:300]))
print ('Variance Captured with 50 components %0.3f' % sum(pca.explained_variance_ratio_[:50]))
print ('Variance Captured with 12 components %0.3f \n' % sum(pca.explained_variance_ratio_[:11]))


np.set_printoptions(precision=4,suppress=True)
print ('\nData in lower dimension:\n', DTtrans)

kmeansPCA = KMeans(n_clusters=12, max_iter=500, verbose=1) # initialization
kmeansPCA.fit(DTtrans)
print ('Centoids with PCA: \n:', kmeansPCA.cluster_centers_)
cPCA = kmeansPCA.predict(DTtrans)

print (completeness_score(sector_code,cPCA))
print (homogeneity_score(sector_code,cPCA))
In [9]:
Image(filename='kmeansPCA1.png')
Out[9]:
In [10]:
Image(filename='kmeansPCA2.png')
Out[10]: