Here we will expand your knowledge of explanatory analysis and show you how to use decision trees to understand the drivers of consumer behavior. We will start by comparing and explaining the differences between logistic regression and decision tree models, and then discuss how decision trees are built and trained.
A logistic regression model learns from the data by finding the linear combination of feature variables that best estimates the logistic probability of an event occurring. Decision trees, as their name suggests, learn from data by growing a tree.
We'll discuss how decision tree models grow and how to build them in more detail below, but first, let's look at the main difference between the two approaches: logistic regression algorithms look for a single best linear boundary in the feature space, while decision tree algorithms repeatedly split the data to find subgroups with a high probability of the event occurring. This is easier to explain with an example. Let's take a look at the following diagram:
This is an example of a decision tree model. As you can see in the diagram, it divides the data according to certain criteria. In this example, the root node is split into its child nodes by the criterion previous < 0.5: if the condition is true, the record moves to the left child node; if not, it moves to the right child node. The left child node is then split into its own children by the criterion age < 61. The tree keeps growing until it finds pure nodes (that is, nodes in which all the data points belong to a single class) or until it meets a stopping criterion, such as the maximum depth of the tree. As you can see in this example, the data is divided into seven partitions. The leftmost node, or the lowest partition, is for those data points with values below 0.5 for the previous variable and values below 61 for the age variable. On the other hand, the rightmost node at the bottom is for those data points with values greater than 0.5 for the previous variable and differing values of the housing variable.
One thing to notice here is that there are many interactions between the different variables. No leaf node in this example tree is reached through a single condition; each partition in this tree is formed by more than one criterion and by interactions between different features. This is the main difference from logistic regression models.
When there is no linear structure in the data, logistic regression models do not perform well, as they try to find linear combinations of the feature variables. Decision tree models, on the other hand, perform better on non-linear datasets, since they simply attempt to partition the data into the purest subgroups they can find.
As a decision tree grows, it needs a rule for splitting a node into child nodes. There are two methods commonly used for this: Gini impurity and entropy information gain. Simply put, Gini impurity measures how impure a partition is, and entropy information gain measures how much information is gained by splitting the data according to the criterion being tested.
Let's take a quick look at the equation for the Gini impurity measure:

$$\text{Gini} = 1 - \sum_{i=1}^{c} p_i^2$$

Here, $c$ represents the class labels, and $p_i$ represents the probability of class label $i$ being chosen. By subtracting the sum of squared probabilities from one, the Gini impurity measure reaches zero when all the records in a partition or node of the tree are pure, that is, when they belong to a single target class.
The equation for calculating entropy looks like this:

$$\text{Entropy} = -\sum_{i=1}^{c} p_i \log_2 p_i$$

As before, $c$ represents the class labels, and $p_i$ represents the probability of class label $i$ being chosen. As the tree grows, the entropy of each possible split is calculated and compared with the entropy before the split. The split that produces the greatest change in entropy, that is, the greatest information gain, is then chosen to grow the tree.
This process is repeated until all nodes are pure, or until the stopping criteria are met.
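To make these two measures concrete, here is a minimal sketch (not part of the original notebook) that computes both quantities from a node's class counts using NumPy, with entropy taken in base 2:

```python
import numpy as np

def gini_impurity(class_counts):
    """Gini impurity: one minus the sum of squared class probabilities."""
    p = np.array(class_counts) / np.sum(class_counts)
    return 1.0 - np.sum(p ** 2)

def entropy(class_counts):
    """Entropy: the negative sum of p_i * log2(p_i) over classes with p_i > 0."""
    p = np.array(class_counts) / np.sum(class_counts)
    p = p[p > 0]  # skip empty classes to avoid log2(0)
    return -np.sum(p * np.log2(p))

# A pure node (all records in one class) scores zero on both measures
print(gini_impurity([50, 0]), entropy([50, 0]))    # both zero

# A perfectly mixed node is maximally impure
print(gini_impurity([25, 25]), entropy([25, 25]))  # 0.5 and 1.0
```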
We will interpret the results through visualizations built with the Python package graphviz. We will begin by taking an in-depth look at the bank marketing dataset using the pandas and matplotlib packages, and then discuss how to build and interpret decision tree models.
So, let's begin:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
df = pd.read_csv('datasets/bank-full.csv', sep=";")
df.head()
age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 58 | management | married | tertiary | no | 2143 | yes | no | unknown | 5 | may | 261 | 1 | -1 | 0 | unknown | no |
1 | 44 | technician | single | secondary | no | 29 | yes | no | unknown | 5 | may | 151 | 1 | -1 | 0 | unknown | no |
2 | 33 | entrepreneur | married | secondary | no | 2 | yes | yes | unknown | 5 | may | 76 | 1 | -1 | 0 | unknown | no |
3 | 47 | blue-collar | married | unknown | no | 1506 | yes | no | unknown | 5 | may | 92 | 1 | -1 | 0 | unknown | no |
4 | 33 | unknown | single | unknown | no | 1 | no | no | unknown | 5 | may | 198 | 1 | -1 | 0 | unknown | no |
df.shape
(45211, 17)
Before we start analyzing the data, we will first encode the output variable, y, which records whether a customer has subscribed to a term deposit, with numerical values. You can use the following code to encode the output variable, y, as zeros and ones:
df['conversion'] = df['y'].apply(lambda x: 0 if x == 'no' else 1)
df.head()
age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | y | conversion | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 58 | management | married | tertiary | no | 2143 | yes | no | unknown | 5 | may | 261 | 1 | -1 | 0 | unknown | no | 0 |
1 | 44 | technician | single | secondary | no | 29 | yes | no | unknown | 5 | may | 151 | 1 | -1 | 0 | unknown | no | 0 |
2 | 33 | entrepreneur | married | secondary | no | 2 | yes | yes | unknown | 5 | may | 76 | 1 | -1 | 0 | unknown | no | 0 |
3 | 47 | blue-collar | married | unknown | no | 1506 | yes | no | unknown | 5 | may | 92 | 1 | -1 | 0 | unknown | no | 0 |
4 | 33 | unknown | single | unknown | no | 1 | no | no | unknown | 5 | may | 198 | 1 | -1 | 0 | unknown | no | 0 |
Let's first take a look at the aggregate conversion rate. The conversion rate is simply the percentage of customers who subscribed to a term deposit. Let's take a look at the following code:
conversion_rate_df = pd.DataFrame(
df.groupby('conversion').count()['y'] / df.shape[0] * 100.0
)
conversion_rate_df
y | |
---|---|
conversion | |
0 | 88.30152 |
1 | 11.69848 |
conversion_rate_df.T
conversion | 0 | 1 |
---|---|---|
y | 88.30152 | 11.69848 |
For ease of viewing, you can transpose the DataFrame using the T attribute of the pandas DataFrame. As you can see, only about 11.7% of customers converted, that is, subscribed to a term deposit. These results show a large imbalance between the conversion group and the non-conversion group, which is common and often seen across marketing datasets.
Conversion Rates by Job

It may be that certain job categories tend to convert more often than others. Let's take a look at the conversion rates of the different job categories. You can achieve this with the following code:
conversion_rate_by_job = df.groupby(
by='job'
)['conversion'].sum() / df.groupby(
by='job'
)['conversion'].count() * 100.0
conversion_rate_by_job
job
admin.           12.202669
blue-collar       7.274969
entrepreneur      8.271688
housemaid         8.790323
management       13.755551
retired          22.791519
self-employed    11.842939
services          8.883004
student          28.678038
technician       11.056996
unemployed       15.502686
unknown          11.805556
Name: conversion, dtype: float64
Let's take a closer look at this code. First, we group by the job column, which contains information about the job category each client belongs to. Then, we sum the conversion column for each job category, which gives us the total number of conversions per category. Finally, we divide these conversion counts by the total number of customers in each job category to get the conversion rate for each one.
As you can see from these results, the student
group tends to convert much more often than the other groups, and the retired
group is next. However, it is a bit difficult to compare these results from the raw output, and we could present these data better using a graph. We can construct a horizontal bar chart using the following code:
ax = conversion_rate_by_job.plot(
kind='barh',
color='skyblue',
grid=True,
figsize=(10, 7),
title='Conversion Rates by Job'
)
ax.set_xlabel('conversion rate (%)')
ax.set_ylabel('Job')
plt.show()
If you look at this code, we are using the plot
function of the pandas DataFrame, and we define the type of this plot as a horizontal bar chart providing barh
as input to the kind
argument. You can adjust the color, size, and title of the chart with the color, figsize, and title arguments, respectively. You can also easily change the x
- and y
-axis labels, using the set_xlabel
and set_ylabel
functions.
As you can see, it is much easier to spot the differences in conversion rates across job categories with a horizontal bar chart. We can easily see that the student and retired groups have the two highest conversion rates, while the blue-collar and entrepreneur groups have the two lowest.
Default Rates by Conversion

Another customer attribute that would be interesting to examine is the default rate, and how it differs between those who subscribed to a term deposit and those who did not. We are going to use the pivot_table function in the pandas library to analyze default rates by conversion. Let's take a look at the following code:
default_by_conversion_df = pd.pivot_table(
df,
values='y',
index='default',
columns='conversion',
aggfunc=len
)
default_by_conversion_df
conversion | 0 | 1 |
---|---|---|
default | ||
no | 39159 | 5237 |
yes | 763 | 52 |
As you can see in this code, we are pivoting the DataFrame, df, using default as the index and conversion as the columns, with the values taken from the y column. By using len as the aggregation function, we count how many clients fall into each cell of the pivot table.
It is a bit difficult to compare how default rates differ between the conversion and non-conversion groups by looking at these raw numbers. One way to visualize this data is with a pie chart. You can use the following code to construct one:
default_by_conversion_df.plot(
kind='pie',
figsize=(15, 7),
startangle=90,
subplots=True,
autopct=lambda x: '%0.1f%%' % x
)
plt.show()
As you can see, these pie charts make it much easier to compare default rates between the conversion and non-conversion groups. Although the overall default rate is low in both groups, the rate in the non-conversion group is about twice as high as in the conversion group.
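As a quick sanity check on that claim (an optional step, not part of the original flow), you can normalize each column of the pivot table to percentages:

```python
# Share of defaulters within each conversion group, in percent;
# the 'yes' row comes out to roughly 1.9% vs. 1.0%
default_by_conversion_df / default_by_conversion_df.sum() * 100.0
```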
Bank Balances by Conversion
Next, we will try to see if there are differences in the distribution of bank balances between the conversion and non-conversion groups. A box plot is typically a good way to visualize the distribution of a variable. Let's take a look at the following code:
ax = df[['conversion', 'balance']].boxplot(
by='conversion',
showfliers=True,
figsize=(10, 7)
)
ax.set_xlabel('Conversion')
ax.set_ylabel('Average Bank Balance')
ax.set_title('Average Bank Balance Distributions by Conversion')
plt.suptitle("")
plt.show()
Due to the large number of outliers, it is quite difficult to identify any difference between the two distributions. Let's build another box plot without the outliers. The only thing you need to change from the previous code is to set the showfliers argument to False in the boxplot function, as you can see in the following code:
ax = df[['conversion', 'balance']].boxplot(
by='conversion',
showfliers=False,
figsize=(10, 7)
)
ax.set_xlabel('Conversion')
ax.set_ylabel('Average Bank Balance')
ax.set_title('Average Bank Balance Distributions by Conversion')
plt.suptitle("")
plt.show()
In these box plots, we can see that the median bank balance is slightly higher for the conversion group than for the non-conversion group. In addition, the bank balances of converted customers seem to vary more than those of non-converted customers.
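If you want to back this visual impression with numbers, a quick groupby summary (an optional check, not in the original notebook) does the job:

```python
# Median (the 50% column) and quartiles of the balance per conversion group
df.groupby('conversion')['balance'].describe()
```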
Conversion Rates by Number of Contacts
Finally, we will see how conversion rates vary according to the number of contacts. Typically, in marketing, a greater number of marketing contacts can result in marketing fatigue, where conversion rates fall as customers are reached more often. Let's see if there is any marketing fatigue in our data. Take a look at the following code:
conversions_by_num_contacts = df.groupby(
by='campaign'
)['conversion'].sum() / df.groupby(
by='campaign'
)['conversion'].count() * 100.0
conversions_by_num_contacts
campaign
1     14.597583
2     11.203519
3     11.193624
4      9.000568
5      7.879819
6      7.126259
7      6.394558
8      5.925926
9      6.422018
10     5.263158
11     7.960199
12     2.580645
13     4.511278
14     4.301075
15     4.761905
16     2.531646
17     8.695652
18     0.000000
19     0.000000
20     2.325581
21     2.857143
22     0.000000
23     0.000000
24     5.000000
25     0.000000
26     0.000000
27     0.000000
28     0.000000
29     6.250000
30     0.000000
31     0.000000
32    11.111111
33     0.000000
34     0.000000
35     0.000000
36     0.000000
37     0.000000
38     0.000000
39     0.000000
41     0.000000
43     0.000000
44     0.000000
46     0.000000
50     0.000000
51     0.000000
55     0.000000
58     0.000000
63     0.000000
Name: conversion, dtype: float64
In this code snippet, you can see that we are grouping by the campaign column (which has information about the number of contacts made during the marketing campaign for this client) and calculating the conversion rates for each number of contacts.
As before, it would be easier to look at a graph, rather than raw numbers. We can graph this data using bar charts, with the following code:
ax = conversions_by_num_contacts.plot(
kind='bar',
figsize=(10, 7),
title='Conversion Rates by Number of Contacts',
grid=True,
color='skyblue'
)
ax.set_xlabel('Number of Contacts')
ax.set_ylabel('Conversion Rate (%)')
plt.show()
There is some noise at higher numbers of contacts, since the sample sizes there are smaller, but you can easily see the general downward trend in this bar chart. As the number of contacts increases, the conversion rate slowly decreases. This suggests that the expected conversion rate declines as a client is contacted more frequently during a given campaign.
There are eight categorical variables in this dataset: job, marital, education, default, housing, loan, contact, and month. Before we start building decision trees, we need to encode these categorical variables as numerical values. We will see how to encode some of them in this section.
Encoding Months
We all know that there can only be 12 unique values for the month variable. Let's take a quick look at what we have in our data set. Take a look at the following code:
df['month'].unique()
array(['may', 'jun', 'jul', 'aug', 'oct', 'nov', 'dec', 'jan', 'feb', 'mar', 'apr', 'sep'], dtype=object)
As expected, we have 12 unique values in the month column, from January to December. Since the values of month have a natural order, we can encode each value with a corresponding number. One way to encode the string values of month as numbers is shown below:
months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
df['month'] = df['month'].apply(
lambda x: months.index(x)+1
)
df.head()
age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | y | conversion | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 58 | management | married | tertiary | no | 2143 | yes | no | unknown | 5 | 5 | 261 | 1 | -1 | 0 | unknown | no | 0 |
1 | 44 | technician | single | secondary | no | 29 | yes | no | unknown | 5 | 5 | 151 | 1 | -1 | 0 | unknown | no | 0 |
2 | 33 | entrepreneur | married | secondary | no | 2 | yes | yes | unknown | 5 | 5 | 76 | 1 | -1 | 0 | unknown | no | 0 |
3 | 47 | blue-collar | married | unknown | no | 1506 | yes | no | unknown | 5 | 5 | 92 | 1 | -1 | 0 | unknown | no | 0 |
4 | 33 | unknown | single | unknown | no | 1 | no | no | unknown | 5 | 5 | 198 | 1 | -1 | 0 | unknown | no | 0 |
df['month'].unique()
array([ 5, 6, 7, 8, 10, 11, 12, 1, 2, 3, 4, 9])
To see how many records we have for each month, we can use the following code:
df.groupby('month').count()['conversion']
month
1      1403
2      2649
3       477
4      2932
5     13766
6      5341
7      6895
8      6247
9       579
10      738
11     3970
12      214
Name: conversion, dtype: int64
Next, let's see how we can encode the different categories in the job column. First, we will look at the unique values of this column, using the following code:
df['job'].unique()
array(['management', 'technician', 'entrepreneur', 'blue-collar', 'unknown', 'retired', 'admin.', 'services', 'self-employed', 'unemployed', 'housemaid', 'student'], dtype=object)
As you can see in this output, there is no natural order for this variable. One job category does not precede another, so we cannot encode this variable the way we did with month.
We are going to create dummy variables for each of the job categories. A dummy variable is a variable that is coded with 1
if a certain record belongs to the category, and with 0
if not. We can easily do this using the following code:
jobs_encoded_df = pd.get_dummies(df['job'])
jobs_encoded_df
admin. | blue-collar | entrepreneur | housemaid | management | retired | self-employed | services | student | technician | unemployed | unknown | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
45206 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
45207 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
45208 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
45209 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
45210 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
45211 rows × 12 columns
jobs_encoded_df.columns = ['job_%s' % x for x in jobs_encoded_df.columns]
jobs_encoded_df
job_admin. | job_blue-collar | job_entrepreneur | job_housemaid | job_management | job_retired | job_self-employed | job_services | job_student | job_technician | job_unemployed | job_unknown | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
45206 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
45207 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
45208 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
45209 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
45210 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
45211 rows × 12 columns
As you can see in this code fragment, the get_dummies function of the pandas package creates one dummy variable per category of the job variable, coding each record with 1 if the record belongs to the corresponding category and 0 if not. Then, we rename the columns by adding a job_ prefix to each one.
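As a side note, pandas can attach the prefix in one step; the following one-liner (an equivalent shortcut to the two steps above, using the same pandas API) produces the same job_-prefixed columns:

```python
# prefix='job' with the default prefix_sep='_' yields 'job_admin.', 'job_blue-collar', ...
jobs_encoded_df = pd.get_dummies(df['job'], prefix='job')
```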
As you can see, the first record (or client) belongs to the management job category, while the second record belongs to the technician job category. Now that we have created dummy variables for each job category, we need to add this data to the existing DataFrame. Take a look at the following code:
df = pd.concat([df, jobs_encoded_df], axis=1)
df.head()
age | job | marital | education | default | balance | housing | loan | contact | day | ... | job_entrepreneur | job_housemaid | job_management | job_retired | job_self-employed | job_services | job_student | job_technician | job_unemployed | job_unknown | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 58 | management | married | tertiary | no | 2143 | yes | no | unknown | 5 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 44 | technician | single | secondary | no | 29 | yes | no | unknown | 5 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 33 | entrepreneur | married | secondary | no | 2 | yes | yes | unknown | 5 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 47 | blue-collar | married | unknown | no | 1506 | yes | no | unknown | 5 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 33 | unknown | single | unknown | no | 1 | no | no | unknown | 5 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
5 rows × 30 columns
Using the concat
function in the pandas package, you can easily add the newly created DataFrame with dummy variables, jobs_encoded_df
, to the original DataFrame, df
. The argument axis=1
tells the concat
function to concatenate the second DataFrame with the first one as columns, not as rows.
As you can see, the newly created dummy variables are added to the original DataFrame as new columns for each record.
Encoding Marital Status

Similar to how we encoded the categorical variable job, we will create dummy variables for each category of the marital variable. As before, we use the following code to encode the marital column:
marital_encoded_df = pd.get_dummies(df['marital'])
marital_encoded_df
divorced | married | single | |
---|---|---|---|
0 | 0 | 1 | 0 |
1 | 0 | 0 | 1 |
2 | 0 | 1 | 0 |
3 | 0 | 1 | 0 |
4 | 0 | 0 | 1 |
... | ... | ... | ... |
45206 | 0 | 1 | 0 |
45207 | 1 | 0 | 0 |
45208 | 0 | 1 | 0 |
45209 | 0 | 1 | 0 |
45210 | 0 | 1 | 0 |
45211 rows × 3 columns
marital_encoded_df.columns = ['marital_%s' % x for x in marital_encoded_df.columns]
marital_encoded_df
marital_divorced | marital_married | marital_single | |
---|---|---|---|
0 | 0 | 1 | 0 |
1 | 0 | 0 | 1 |
2 | 0 | 1 | 0 |
3 | 0 | 1 | 0 |
4 | 0 | 0 | 1 |
... | ... | ... | ... |
45206 | 0 | 1 | 0 |
45207 | 1 | 0 | 0 |
45208 | 0 | 1 | 0 |
45209 | 0 | 1 | 0 |
45210 | 0 | 1 | 0 |
45211 rows × 3 columns
df = pd.concat([df, marital_encoded_df], axis=1)
df.head()
age | job | marital | education | default | balance | housing | loan | contact | day | ... | job_retired | job_self-employed | job_services | job_student | job_technician | job_unemployed | job_unknown | marital_divorced | marital_married | marital_single | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 58 | management | married | tertiary | no | 2143 | yes | no | unknown | 5 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 44 | technician | single | secondary | no | 29 | yes | no | unknown | 5 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
2 | 33 | entrepreneur | married | secondary | no | 2 | yes | yes | unknown | 5 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
3 | 47 | blue-collar | married | unknown | no | 1506 | yes | no | unknown | 5 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
4 | 33 | unknown | single | unknown | no | 1 | no | no | unknown | 5 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
5 rows × 33 columns
Once you've gotten this far, your original DataFrame, df, should contain all the original columns, plus the newly created dummy variables for the job and marital columns.
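A quick sanity check (an optional step, not in the original notebook) confirms that the dummy columns are in place:

```python
# 17 original columns + conversion + 12 job dummies + 3 marital dummies = 33
print(df.shape)  # expected: (45211, 33)
print([col for col in df.columns if col.startswith(('job_', 'marital_'))])
```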
The last two categorical variables that we will encode in this section are housing and loan. The housing variable has two unique values, "yes" and "no", and indicates whether a client has a housing loan. The loan variable also has two unique values, "yes" and "no", and tells us whether a client has a personal loan. We can easily encode these two variables using the following code:
df['housing'] = df['housing'].apply(lambda x: 1 if x == 'yes' else 0)
df.head()
age | job | marital | education | default | balance | housing | loan | contact | day | ... | job_retired | job_self-employed | job_services | job_student | job_technician | job_unemployed | job_unknown | marital_divorced | marital_married | marital_single | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 58 | management | married | tertiary | no | 2143 | 1 | no | unknown | 5 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 44 | technician | single | secondary | no | 29 | 1 | no | unknown | 5 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
2 | 33 | entrepreneur | married | secondary | no | 2 | 1 | yes | unknown | 5 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
3 | 47 | blue-collar | married | unknown | no | 1506 | 1 | no | unknown | 5 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
4 | 33 | unknown | single | unknown | no | 1 | 0 | no | unknown | 5 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
5 rows × 33 columns
df['loan'] = df['loan'].apply(lambda x: 1 if x == 'yes' else 0)
df.head()
age | job | marital | education | default | balance | housing | loan | contact | day | ... | job_retired | job_self-employed | job_services | job_student | job_technician | job_unemployed | job_unknown | marital_divorced | marital_married | marital_single | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 58 | management | married | tertiary | no | 2143 | 1 | 0 | unknown | 5 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 44 | technician | single | secondary | no | 29 | 1 | 0 | unknown | 5 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
2 | 33 | entrepreneur | married | secondary | no | 2 | 1 | 1 | unknown | 5 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
3 | 47 | blue-collar | married | unknown | no | 1506 | 1 | 0 | unknown | 5 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
4 | 33 | unknown | single | unknown | no | 1 | 0 | 0 | unknown | 5 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
5 rows × 33 columns
Now that we have encoded all the categorical variables, we can finally start building decision tree models. We will use the following variables as features in our decision tree models:
features = [
'age',
'balance',
'campaign',
'previous',
'housing',
'job_admin.',
'job_blue-collar',
'job_entrepreneur',
'job_housemaid',
'job_management',
'job_retired',
'job_self-employed',
'job_services',
'job_student',
'job_technician',
'job_unemployed',
'job_unknown',
'marital_divorced',
'marital_married',
'marital_single'
]
To build and train a decision tree model with Python, we will use the tree
module in the scikit-learn package. You can import the required module using the following line of code:
from sklearn import tree
Under the tree module in the sklearn package, there is a class called DecisionTreeClassifier
, which we can use to train a decision tree model. Take a look at the following code:
dt_model = tree.DecisionTreeClassifier(
max_depth=4
)
There are many arguments for the DecisionTreeClassifier class apart from the one we are using here, max_depth. The max_depth argument controls how much a tree can grow; here, we limit it to 4, which means that the maximum depth from the root to a leaf node can be 4.
You can also use the criterion argument to choose between the Gini impurity and entropy information gain measures of split quality. There are many other ways to tune your decision tree model, and we recommend that you take a look at the scikit-learn documentation for more information.
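For example, if you wanted splits to be evaluated with entropy information gain instead of the default Gini impurity, you could instantiate the classifier as follows (an illustrative variant, not the model we train below):

```python
# criterion defaults to 'gini'; 'entropy' switches to information gain
dt_entropy_model = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=4
)
```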
To train this decision tree model, you can use the following code:
dt_model.fit(df[features], df['conversion'])
DecisionTreeClassifier(max_depth=4)
Now that we have trained a decision tree model, we need to extract the knowledge from the model. In this section, we're going to use a package called graphviz
. You can install this package using the following command in your terminal:
!pip3 install graphviz
import graphviz
Now that we have configured our environment with the new package, graphviz, let's take a look at the following code to see how we can visualize the trained decision tree:
dot_data = tree.export_graphviz(
dt_model,
feature_names=features,
class_names=['0', '1'],
filled=True,
rounded=True,
special_characters=True
)
graph = graphviz.Source(dot_data)
graph
As you can see, we first export the trained decision tree model, dt_model
, using the export_graphviz
function in the tree module of the sklearn package.
We can define the feature variables that we use to train this model using the feature_names
argument. Then, we can define the classes (conversion versus non-conversion) that this model is trained to classify, using the class_names argument.
The export_graphviz
function exports the trained decision tree model in a DOT format, which is a graphical description language. Then, you can pass dot_data
to graphviz's Source
class. The graph variable now contains a renderable graph.
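Evaluating graph in a notebook cell renders the tree inline. If you would rather save it to a file, graphviz's render method can do that; the output filename below is just an example:

```python
# Writes decision_tree.png (plus the intermediate DOT source) to the working directory
graph.render('decision_tree', format='png')
```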
Let's take a closer look at this diagram. Each node contains five lines describing the information that node holds. The top line tells us the splitting criterion. The root node, for example, is split into its child nodes based on the value of the previous variable: if the value of previous is less than or equal to 0.5, the record goes to the left child; if it is greater than 0.5, it goes to the right child.
The second line tells us the value of the split-quality measure. Here, we selected Gini impurity as the criterion, so the second line shows the changes in the impurity measure at each node. The third line tells us the total number of records belonging to the given node. For example, there are 45,211 samples at the root node, and 8,257 samples at the right child of the root node.
The fourth line of each node tells us the composition of its records across the two classes. The first element is the number of records in the non-conversion group, and the second is the number of records in the conversion group. For example, at the root node there are 39,922 records in the non-conversion group and 5,289 records in the conversion group. Finally, the fifth line of each node tells us the prediction or classification of that node. For example, if a sample belongs to the leftmost leaf, the classification of this decision tree model will be 0, meaning no conversion. On the other hand, if a sample belongs to the eighth leaf from the left, the classification will be 1, meaning conversion.
Now that we know what each line of each node means, let's discuss how we can extract insights from this tree graph. To understand the clients that belong to each leaf node, we have to traverse the tree. For example, the clients that belong to the eighth leaf node from the left are those with a value of 0 for the previous variable, an age greater than 60.5, a marital_divorced value of 1, and a job_self-employed value of 1. In other words, those who were not contacted before this campaign, are older than 60.5, divorced, and self-employed belong to this node, and they have a high probability of conversion.
Let's look at another example. The clients that belong to the second leaf node from the right are those with a value of 1 for the previous variable, a value of 1 for the housing variable, an age greater than 60.5, and a balance less than or equal to 4,660.5. That is, clients who were contacted before this campaign, have a housing loan, are older than 60.5, and have a bank balance of 4,660.5 or less belong to this node, and 20 of the 29 clients in this node converted and subscribed to a term deposit.
As you will have noticed in these two examples, you can obtain useful insights about who is more or less likely to convert from a trained decision tree model by visualizing the trained tree. Simply follow the nodes down and understand which attributes are highly correlated with your target class. For this exercise, we restricted the tree to a depth of 4, but you can choose to grow a larger or smaller tree than the one used here.
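If you prefer to read the splitting rules as plain text instead of traversing the diagram, scikit-learn's export_text function offers an alternative view of the same trained model (a complementary option, not part of the original exercise):

```python
from sklearn.tree import export_text

# Prints one indented line per split, e.g. '|--- previous <= 0.50'
print(export_text(dt_model, feature_names=features))
```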