import mysql.connector
import operator
from IPython.display import Image
import matplotlib.pyplot as plt
import pandas as pan
from pandas.plotting import scatter_matrix
import numpy as np
import pylab as pl
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import decomposition
from sklearn.model_selection import KFold, train_test_split
from sklearn import neighbors, tree, naive_bayes
from sklearn.cluster import KMeans
from sklearn.metrics import completeness_score, homogeneity_score
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import confusion_matrix
''' ====================================== S T O P W O R D S ======================================'''
import nltk
from nltk import word_tokenize,sent_tokenize,stem
from nltk.stem import WordNetLemmatizer
import string
stop_words =set(('a',
'about','above','after','again','against','all','am','an','and','any','are','as','at','be','because','been',
'before','being','below','between','both','but','by','can','did','do','does','doing','don','down','during',
'each','few','for','from','further','had','has','have','having','he','her','here','hers','herself','him',
'himself','his','how','i','if','in','into','is','it','its','itself','just','me','more','most','my','myself','no',
'nor','not','now','of','off','on','once','only','or','other','our','ours','ourselves','out','over',
'own','s','same','she','should','so','some','such','t','than','that','the',
'their','theirs','them','themselves','then','there','these','they','this','those','through','to',
'too','under','until','up','very','was','we','were','what','when','where','which',
'while','who','whom','why','will','with','you','your','yours','yourself','yourselves','``','...',"''","'s",
'!','"', '#','$','%','&',"'",'(',')','*','+',',','-','.','/',':',';','<','=','>','?','@','[','\\',']','^',
'_','`','{','|','}','~'))
punctuation = set(string.punctuation)
stop_words.update(punctuation)
print(stop_words)
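# Optional sketch, not part of the original pipeline: NLTK also ships a curated English
# stop-word list that could extend the manual set above. It assumes the 'stopwords'
# corpus is installed (nltk.download('stopwords')); nltk_stopwords is just an illustrative alias.
from nltk.corpus import stopwords as nltk_stopwords
print (len(set(nltk_stopwords.words('english')) - stop_words), 'NLTK stop words missing from the manual list')
# stop_words.update(nltk_stopwords.words('english'))  # uncomment to merge them in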
wnl = WordNetLemmatizer()
snowball = stem.snowball.EnglishStemmer()
# Database connection to MySQL
cnx = mysql.connector.connect(user='root', password='root',
host='localhost', port='8889',
database='Kiva')
cursor = cnx.cursor()
# Currency
cursor.execute("""SELECT `DISBURSAL_CURRENCY`, count(*)
FROM LOAN
GROUP BY `DISBURSAL_CURRENCY` ORDER BY count(*) DESC""")
Currency= cursor.fetchall()
CuP = pan.DataFrame(Currency, columns=['Currency','Count'])
%matplotlib inline
CuP.plot(x='Currency', y='Count', kind='bar', color = 'green', title = 'Currency Distribution', figsize=(10,6))
print (CuP.head(5))
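# Alternative sketch (assumption: the same cnx connection works with pandas): the query can be
# read straight into a DataFrame. pandas officially recommends a SQLAlchemy engine for
# non-SQLite databases, so this may emit a warning. CuP_alt is an illustrative name.
CuP_alt = pan.read_sql_query(
    """SELECT `DISBURSAL_CURRENCY` AS `Currency`, COUNT(*) AS `Count`
       FROM LOAN
       GROUP BY `DISBURSAL_CURRENCY` ORDER BY COUNT(*) DESC""", cnx)
print (CuP_alt.head(5))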
''' ====================================== T E A M D A T A ======================================'''
cursor.execute("""SELECT `teams_id`, `teams_loan_count`, `teams_loaned_amount`, `teams_member_count`,
`teams_membership_type`, `teams_category`,`teams_shortname`,`teams_loan_because`
FROM `TEAM` WHERE `teams_loan_because` IS NOT NULL""")
team_because = cursor.fetchall()
team_becausePan = pan.DataFrame(team_because)
len(team_because)
# get the number of rows in the resultset
numrows = int(cursor.rowcount)
teamBe = []
key = 0 # the field teams_id has repeated values, so this counter serves as a unique key.
for x in team_because:
#x = team_because[0]
temp = [key]
for i in x:
temp.append(i)
teamBe.append(temp)
key+=1
len(teamBe)
print ('Done!')
teamBePan = pan.DataFrame(teamBe, columns=['team_key','teams_id', 'teams_loan_count', 'teams_loaned_amount',
'teams_member_count','teams_membership_type',
'teams_category','teams_shortname','teams_loan_because'])
teamBePan.describe()
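# Equivalent pandas-only sketch (illustrative names teamBePan_alt / team_key): the surrogate key
# can be added directly to a DataFrame built from the raw result set, replacing the manual
# accumulation loop above; the final print just checks both tables have the same shape.
teamBePan_alt = pan.DataFrame(team_because,
                              columns=['teams_id', 'teams_loan_count', 'teams_loaned_amount',
                                       'teams_member_count', 'teams_membership_type',
                                       'teams_category', 'teams_shortname', 'teams_loan_because'])
teamBePan_alt.insert(0, 'team_key', range(len(teamBePan_alt)))
print (teamBePan_alt.shape == teamBePan.shape)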
%matplotlib inline
teamBePan['teams_category'].value_counts().plot(kind='bar',
color = 'green',
title = 'Team category distribution',
figsize=(10,6))
%matplotlib inline
teamBePan['teams_membership_type'].value_counts().plot(kind='bar',
color = 'green',
title = 'Team membership type distribution',
figsize=(10,6))
print ('Team membership type distribution: \nOpen:\t %0.2f \nClosed:\t %0.2f' % (
    teamBePan['teams_membership_type'].value_counts(normalize=True).iloc[0],
    teamBePan['teams_membership_type'].value_counts(normalize=True).iloc[1]))
%matplotlib inline
teamBePan['teams_loaned_amount'].value_counts().plot(kind='hist',
color = 'green',
title = 'Team loaned amount distribution',
figsize=(10,6))
%matplotlib inline
teamBePan['teams_loan_count'].value_counts().plot(kind='hist',
color = 'green',
title = 'Team loan count distribution',
figsize=(10,6))
team_type_amount = pan.crosstab(teamBePan['teams_loan_count'],
teamBePan['teams_membership_type'][teamBePan['teams_membership_type']=='open'])
%matplotlib inline
team_type_amount.plot(kind="hist", title = 'Loan count distribution for open teams', color = 'green', figsize=(10,6))
plt.show()
team_type_amount = pan.crosstab(teamBePan['teams_loan_count'],
teamBePan['teams_membership_type'][teamBePan['teams_membership_type']=='closed'])
%matplotlib inline
team_type_amount.plot(kind="hist", title = 'Loan count distribution for closed teams', color = 'green', figsize=(10,6))
plt.show()
''' clustering '''
cttt =[]
for i in teamBePan.iterrows():
if i[1]['teams_category'] not in cttt:
cttt.append(i[1]['teams_category'])
cat = {'Families':0,
'Common Interest':1,
'Sports Groups':2,
'Businesses':3,
'Businesses - Internal Groups':4,
'Colleges/Universities':5,
'Clubs':6,
'Religious Congregations':7,
'Local Area':8,
'Youth Groups':9,
'Alumni Groups':10,
'Other':11,
'Field Partner Fans':12,
'Schools':13,
'Friends':14,
'Events':15,
'Memorials':16}
category = np.asarray(teamBePan['teams_category'][:1000])
category_code = []
for i in category:
category_code.append(cat[i])
teamLabel = []
for i in category:
if i not in teamLabel:
teamLabel.append(i)
print ('Done!')
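# Equivalent pandas sketch (category_code_alt is an illustrative name): map the category names
# straight to their integer codes for the same first 1000 teams; the print checks it matches
# the loop-built category_code.
category_code_alt = teamBePan['teams_category'][:1000].map(cat).tolist()
print (category_code_alt == category_code)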
''' ====================================== T E A M D A T A NL ======================================'''
#i =0
tokens =[] # creating tokens from loan because
for reason in teamBePan.iterrows():
tokens.append(nltk.word_tokenize(reason[1][-1]))
#i+=1
#if i == 10:
# break
do_reg = len(tokens[1])
print (tokens[1], do_reg)
print ('------------------------------------------')
# removing stopwords
for i in range(2):
for team in tokens:
for word in team:
if word in stop_words:
while word in team:
team.remove(word)
do_reg_reduce = len(tokens[1])
print (tokens[1],do_reg_reduce)
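# Non-mutating sketch of the same stop-word filtering (tokens_filtered is an illustrative name):
# each token list is rebuilt in a single pass, which avoids calling list.remove repeatedly
# inside nested loops.
tokens_filtered = [[word for word in team if word not in stop_words] for team in tokens]
print (len(tokens_filtered[1]))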
''' using wordnet to stem words '''
wnl = WordNetLemmatizer()
snowball = stem.snowball.EnglishStemmer()
#save stem words for each team
team_stem_words = []
i = 0
for team in tokens:
temp = []
for word in team:
# stemming words
temp.append(wnl.lemmatize(snowball.stem(word)))
team_stem_words.append(temp)
i +=1
print (team_stem_words[1])
# including vector with stem words to teams data
teamBePan['loan_bec_vector'] = team_stem_words
# spread the stemmed word lists into a pandas DataFrame (one column per token position)
tsw = pan.DataFrame(team_stem_words)
# concatenate team data with tsw column-wise (rows align on the shared integer index)
teamData = pan.concat([teamBePan, tsw], axis=1)
len(teamData[0][0])
teamData.head()
team_loan_bec = teamData.pop('teams_loan_because')
teambecause = teamBePan['teams_loan_because'][0:1000]
''' freq vector ok '''
countvectorTeam = CountVectorizer()  # decode_error='ignore'
teamdata = countvectorTeam.fit_transform(teambecause)
team_frec_vector_pan = pan.DataFrame(teamdata.toarray(), columns=countvectorTeam.get_feature_names())
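# Hedged alternative sketch, not used in the clustering below: TF-IDF weighting down-weights
# words that appear in most team descriptions, which can sometimes separate clusters better
# than raw counts. tfidfTeam / team_tfidf are illustrative names.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfTeam = TfidfVectorizer(stop_words='english')
team_tfidf = tfidfTeam.fit_transform(teambecause)
print (team_tfidf.shape)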
''' kmeans '''
teamkmeans = KMeans(n_clusters=17, max_iter=500, verbose=1) # initialization
teamkmeans.fit(team_frec_vector_pan)
print ('Centroids:\n', teamkmeans.cluster_centers_)
cteam = teamkmeans.predict(team_frec_vector_pan)
print (completeness_score(category_code,cteam))
print (homogeneity_score(category_code,cteam))
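# Complementary check (an addition, not part of the original evaluation): the V-measure is the
# harmonic mean of homogeneity and completeness, and the adjusted Rand index corrects the
# cluster/label agreement for chance.
from sklearn.metrics import v_measure_score, adjusted_rand_score
print (v_measure_score(category_code, cteam))
print (adjusted_rand_score(category_code, cteam))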
''' PCA '''
# setting PCA from sklearn decomposition package
teamPCA = decomposition.PCA(n_components=len(team_frec_vector_pan.columns))
# reducing dimensionality
TeamDTtrans = teamPCA.fit(team_frec_vector_pan).transform(team_frec_vector_pan)
# variance captured by component
print ('Variance ratio: \n', teamPCA.explained_variance_ratio_ , '\n')
# variance captured at a few fixed cut-offs; the sketch after these prints computes the exact count needed for 90%
print ('\nVariance Captured with 250 components %0.3f' % sum(teamPCA.explained_variance_ratio_[:250]))
print ('Variance Captured with 50 components %0.3f' % sum(teamPCA.explained_variance_ratio_[:50]))
print ('Variance Captured with 12 components %0.3f \n' % sum(teamPCA.explained_variance_ratio_[:12]))
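# Sketch (cumvar / n_90 are illustrative names): instead of checking fixed cut-offs by hand,
# compute directly how many leading components are needed to reach 90% of the variance.
cumvar = np.cumsum(teamPCA.explained_variance_ratio_)
n_90 = int(np.argmax(cumvar >= 0.90)) + 1
print ('Components needed for 90%% of the variance: %d' % n_90)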
np.set_printoptions(precision=4,suppress=True)
print ('\nData in lower dimension:\n', TeamDTtrans)
TeamkmeansPCA = KMeans(n_clusters=17, max_iter=500, verbose=1) # initialization
TeamkmeansPCA.fit(TeamDTtrans)
print ('Centroids with PCA:\n', TeamkmeansPCA.cluster_centers_)
TeamcPCA = TeamkmeansPCA.predict(TeamDTtrans)
print (completeness_score(category_code,TeamcPCA))
print (homogeneity_score(category_code,TeamcPCA))
plt.figure(figsize=(12, 12))
# note: TeamDTtrans columns are 0-indexed, so column k holds principal component k+1
plt.subplot(221)
plt.scatter(TeamDTtrans[:, 1], TeamDTtrans[:, 2], c=TeamcPCA)
plt.ylabel('PC3')
plt.xlabel('PC2')
plt.title('KMeans')
plt.subplot(222)
plt.scatter(TeamDTtrans[:, 1], TeamDTtrans[:, 2], c=category_code)
plt.xlabel('PC2')
plt.title('Actual')
plt.subplot(223)
plt.scatter(TeamDTtrans[:, 2], TeamDTtrans[:, 3], c=TeamcPCA)
plt.ylabel('PC4')
plt.xlabel('PC3')
plt.subplot(224)
plt.scatter(TeamDTtrans[:, 2], TeamDTtrans[:, 3], c=category_code)
plt.xlabel('PC3')
#plt.savefig("TeamkmeansPCA1.png")  # if saving, call savefig before plt.show()
plt.show()
plt.figure(figsize=(12, 12))
plt.subplot(221)
plt.scatter(TeamDTtrans[:, 1], TeamDTtrans[:, 3], c=TeamcPCA)
plt.ylabel('PC4')
plt.xlabel('PC2')
plt.title('KMeans')
plt.subplot(222)
plt.scatter(TeamDTtrans[:, 1], TeamDTtrans[:, 3], c=category_code)
plt.xlabel('PC2')
plt.title('Actual')
plt.subplot(223)
plt.scatter(TeamDTtrans[:, 2], TeamDTtrans[:, 4], c=TeamcPCA)
plt.ylabel('PC5')
plt.xlabel('PC3')
plt.subplot(224)
plt.scatter(TeamDTtrans[:, 2], TeamDTtrans[:, 4], c=category_code)
plt.xlabel('PC3')
#plt.savefig("TeamkmeansPCA2.png")  # if saving, call savefig before plt.show()
plt.show()
''' ====================================== L O A N D A T A ======================================'''
cursor.execute("""SELECT `ID`, `STATUS`, `FUNDED_AMOUNT`, `PAID_AMOUNT`, `ACTIVITY`,
`SECTOR`, `LOANUSE`,`COUNTRY`,`TOWN`, `GEOPAIRS`, `PARTNER_ID`,
`DISBURSAL_AMOUNT`,`DISBURSAL_CURRENCY`,`DISBURSAL_DATE`, `LOAN_AMOUNT`,
`NONPAYMENT`,`CURRENCY_EXCHANGE`,`POSTED_DATE`,`FUNDED_DATE`,`PAID_DATE`,
`REFUNDED_DATE`,`JOURNAL_ENTRIES`,`JOURNAL_BULK_ENTRIES`
FROM `LOAN` WHERE `LOANUSE` IS NOT NULL""")
loan = cursor.fetchall()
loan_pan = pan.DataFrame(loan, columns= ['ID','STATUS', 'FUNDED_AMOUNT', 'PAID_AMOUNT', 'ACTIVITY',
'SECTOR', 'LOANUSE','COUNTRY','TOWN', 'GEOPAIRS', 'PARTNER_ID',
'DISBURSAL_AMOUNT','DISBURSAL_CURRENCY','DISBURSAL_DATE','LOAN_AMOUNT',
'NONPAYMENT','CURRENCY_EXCHANGE','POSTED_DATE','FUNDED_DATE','PAID_DATE',
'REFUNDED_DATE','JOURNAL_ENTRIES','JOURNAL_BULK_ENTRIES'])
loan_pan.set_index('ID',inplace=True)
loan_pan.describe(include='all')
loanuse = loan_pan['LOANUSE'] # loan use data
loan_tokens =[] # creating tokens
for use in loanuse:
loan_tokens.append(word_tokenize(use))
do_reg = len(loan_tokens[1])
print (loan_tokens[1], do_reg)
print ('------------------------------------------')
# removing stopwords
for i in range(2):
for loan in loan_tokens:
for word in loan:
if word in stop_words:
while word in loan:
loan.remove(word)
do_reg_reduce = len(loan_tokens[1])
print (loan_tokens[1],do_reg_reduce)
loan_stem_words = []
i = 0
for loan in loan_tokens:
temp = []
for word in loan:
# stemming words
temp.append(wnl.lemmatize(snowball.stem(word)))
loan_stem_words.append(temp)
i +=1
print (loan_stem_words[1])
# including vector with stem words to loan data
loan_pan['loan_use_vector'] = loan_stem_words
len(loan_stem_words)
# spread the stemmed word lists into a pandas DataFrame (one column per token position)
lsw = pan.DataFrame(loan_stem_words)
# concatenate loan data with lsw column-wise (reset the ID index so rows align by position)
loanData = pan.concat([loan_pan.reset_index(), lsw], axis=1)
len(loanData[0][0])
loanData.describe()
%matplotlib inline
loan_pan['COUNTRY'].value_counts().plot(kind='bar',
color = 'green',
title = 'Loan Distribution by Country',
figsize=(20,6))
%matplotlib inline
loan_pan['PARTNER_ID'].value_counts().plot(kind='bar',
color = 'green',
title = 'Loan Distribution by Partner',
figsize=(20,6))
%matplotlib inline
loan_pan['DISBURSAL_CURRENCY'].value_counts().plot(kind='bar',
color = 'green',
title = 'Loan Distribution by Currency',
figsize=(20,6))
loan_pan['DISBURSAL_CURRENCY'].value_counts(normalize=True)
%matplotlib inline
loan_pan['NONPAYMENT'].value_counts().plot(kind='bar',
color = 'green',
title = 'Loan Distribution by Nonpayment Type',
figsize=(20,6))
print ('Partner %0.2f' % (loan_pan['NONPAYMENT'].value_counts(normalize=True).iloc[0]))
print ('Lender %0.2f' % (loan_pan['NONPAYMENT'].value_counts(normalize=True).iloc[1]))
%matplotlib inline
loan_pan['LOAN_AMOUNT'].plot(kind='hist',color = 'green',title = 'Loan Amount Distribution',figsize=(20,6))
%matplotlib inline
loan_pan['DISBURSAL_AMOUNT'].value_counts().plot(kind='hist',
color = 'green',
title = 'Disbursal Amount Distribution',
figsize=(20,6))
''' freq vector ok '''
countvector = CountVectorizer()  # decode_error='ignore'
data = countvector.fit_transform(loanuse)
frec_vector_pan = pan.DataFrame(data.toarray(), columns=countvector.get_feature_names())
print (data.shape)
print (loanuse.shape)
print (frec_vector_pan.shape)
print (loan_pan.shape)
''' Clustering '''
sec = {'Food':0,
'Agriculture':1,
'Clothing':2,
'Construction':3,
'Health':4,
'Services':5,
'Retail':6,
'Arts':7,
'Housing':8,
'Transportation':9,
'Manufacturing':10,
'Entertainment':11}
sector = np.asarray(loan_pan['SECTOR'][:1000])
sector_code = []
for i in sector:
sector_code.append(sec[i])
label = []
for i in sector:
if i not in label:
label.append(i)
''' kmeans
kmeans = KMeans(n_clusters=12, max_iter=20, verbose=1) # initialization
kmeans.fit(frec_vector_pan)
#print ('Centoids: \n:', kmeans.cluster_centers_)
c = kmeans.predict(frec_vector_pan)
print (completeness_score(sector_code,c))
print (homogeneity_score(sector_code,c))
'''
''' PCA '''
# setting PCA from sklearn decomposition package
pca = decomposition.PCA(n_components=len(frec_vector_pan.columns))
# reducing dimensionality
DTtrans = pca.fit(frec_vector_pan).transform(frec_vector_pan)
# variance captured by component
print ('Variance ratio: \n', pca.explained_variance_ratio_ , '\n')
# we need roughly the first 300 components to capture 90% of the variance
print ('\nVariance Captured with 300 components %0.3f' % sum(pca.explained_variance_ratio_[:300]))
print ('Variance Captured with 50 components %0.3f' % sum(pca.explained_variance_ratio_[:50]))
print ('Variance Captured with 12 components %0.3f \n' % sum(pca.explained_variance_ratio_[:12]))
np.set_printoptions(precision=4,suppress=True)
print ('\nData in lower dimension:\n', DTtrans)
kmeansPCA = KMeans(n_clusters=12, max_iter=500, verbose=1) # initialization
kmeansPCA.fit(DTtrans)
print ('Centroids with PCA:\n', kmeansPCA.cluster_centers_)
cPCA = kmeansPCA.predict(DTtrans)
print (completeness_score(sector_code,cPCA[:1000]))  # compare against the 1000 labelled sectors
print (homogeneity_score(sector_code,cPCA[:1000]))
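# Hedged sketch (an assumption, not the code that produced the images loaded below) of how a
# figure like kmeansPCA1.png could be generated for the loan data, mirroring the team plots
# above. Only the first 1000 loans are drawn so the colouring by sector_code lines up, and
# savefig is called before show so the written file is not blank; the output name is illustrative.
plt.figure(figsize=(12, 6))
plt.subplot(121)
plt.scatter(DTtrans[:1000, 0], DTtrans[:1000, 1], c=cPCA[:1000])
plt.ylabel('PC2')
plt.xlabel('PC1')
plt.title('KMeans')
plt.subplot(122)
plt.scatter(DTtrans[:1000, 0], DTtrans[:1000, 1], c=sector_code)
plt.xlabel('PC1')
plt.title('Actual')
plt.savefig('loan_kmeansPCA_sketch.png')
plt.show()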
Image(filename='kmeansPCA1.png')
Image(filename='kmeansPCA2.png')