Linear models assume that the independent variables, X, take on a linear relationship with the dependent variable, Y. This relationship can be described by the following equation (the equation of a straight line):

Y = β₀ + β₁X₁ + β₂X₂ + ... + βₙXₙ
Here, X specifies the independent variables and β are the coefficients, each indicating the change in Y for a unit change in the corresponding X. If this assumption is not met, the performance of the model may be poor. Linear relationships can be evaluated using scatter plots and residual plots. A scatter plot displays the relationship between an independent variable X and the target Y. The residuals (the error, or loss) are the difference between the real target and the linear estimate of Y obtained from X, that is, e = Y − Ŷ.
Linear models assume that the independent variables X take on a linear relationship with the dependent variable Y. If this assumption is not true, the model may show poor performance. Let's visualize the linear relationships between X and Y. We start by importing the following libraries: pandas, numpy, matplotlib, seaborn, and scikit-learn's LinearRegression.
import pandas as pd
import numpy as np
# for plotting
import matplotlib.pyplot as plt
import seaborn as sns
# for linear regression
from sklearn.linear_model import LinearRegression
Let's import the Boston House Prices dataset from scikit-learn.
# the dataset for the demo
from sklearn.datasets import load_boston
This is how we load the dataset from scikit-learn:
boston_dataset = load_boston()
Then, we create the dataframe with the independent variables as follows
boston = pd.DataFrame(boston_dataset.data,
columns=boston_dataset.feature_names)
boston.head()
| | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.00632 | 18.0 | 2.31 | 0.0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1.0 | 296.0 | 15.3 | 396.90 | 4.98 |
| 1 | 0.02731 | 0.0 | 7.07 | 0.0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2.0 | 242.0 | 17.8 | 396.90 | 9.14 |
| 2 | 0.02729 | 0.0 | 7.07 | 0.0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2.0 | 242.0 | 17.8 | 392.83 | 4.03 |
| 3 | 0.03237 | 0.0 | 2.18 | 0.0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3.0 | 222.0 | 18.7 | 394.63 | 2.94 |
| 4 | 0.06905 | 0.0 | 2.18 | 0.0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3.0 | 222.0 | 18.7 | 396.90 | 5.33 |
The values of y are accessed through boston_dataset.target. Create a new column called MEDV with the attribute we just showed you, and display the boston dataframe again:
# add the target
boston['MEDV'] = boston_dataset.target
boston.head()
| | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | MEDV |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.00632 | 18.0 | 2.31 | 0.0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1.0 | 296.0 | 15.3 | 396.90 | 4.98 | 24.0 |
| 1 | 0.02731 | 0.0 | 7.07 | 0.0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2.0 | 242.0 | 17.8 | 396.90 | 9.14 | 21.6 |
| 2 | 0.02729 | 0.0 | 7.07 | 0.0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2.0 | 242.0 | 17.8 | 392.83 | 4.03 | 34.7 |
| 3 | 0.03237 | 0.0 | 2.18 | 0.0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3.0 | 222.0 | 18.7 | 394.63 | 2.94 | 33.4 |
| 4 | 0.06905 | 0.0 | 2.18 | 0.0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3.0 | 222.0 | 18.7 | 396.90 | 5.33 | 36.2 |
Here is the information about the dataset; familiarize yourself with the variables before continuing with the exercise. The objective is to predict the median house value (the MEDV column in this dataset), and we have variables with features about the houses and neighborhoods. Run the following line:
print(boston_dataset.DESCR)
.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**

    :Number of Instances: 506

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
        - LSTAT    % lower status of the population
        - MEDV     Median value of owner-occupied homes in $1000's

    :Missing Attribute Values: None

    :Creator: Harrison, D. and Rubinfeld, D.L.

This is a copy of UCI ML housing dataset.
https://archive.ics.uci.edu/ml/machine-learning-databases/housing/

This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.

The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics ...', Wiley, 1980. N.B. Various transformations are used in the table on pages 244-261 of the latter.

The Boston house-price data has been used in many machine learning papers that address regression problems.

.. topic:: References

   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
   - Quinlan, R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.
Now let's create a dataframe with a variable x that follows a normal distribution and shows a linear relationship with y. Set a random seed of 29 to ensure reproducibility:
np.random.seed(29)
We define a variable n with the value 200, then create x by drawing n samples with NumPy's randn. Finally, we create y by multiplying x by 10 and adding noise: n more randn samples multiplied by 2.
n = 200
x = np.random.randn(n)
x
array([-0.41748213,  0.7060321 ,  1.9159847 , -2.1417555 ,  0.71905689,
        0.46707262, ...,  0.14873645,  0.33170631,  0.40506245])
y = x * 10 + np.random.randn(n) * 2
y
array([-1.27156123e+00,  7.99059976e+00,  1.98486874e+01, -2.19289028e+01,
        5.57906972e+00, ...,  8.85182834e-01,  3.22166791e+00,  3.47359812e+00])
Now we create a pandas dataframe with the values of x and y:
data = pd.DataFrame([x, y]).T
data.columns = ['x', 'y']
data.head()
| | x | y |
|---|---|---|
| 0 | -0.417482 | -1.271561 |
| 1 | 0.706032 | 7.990600 |
| 2 | 1.915985 | 19.848687 |
| 3 | -2.141755 | -21.928903 |
| 4 | 0.719057 | 5.579070 |
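Since y was generated as a linear function of x plus a little noise, the two columns should be almost perfectly correlated. As a quick extra check (not part of the original exercise), we can compute the Pearson correlation:

# extra check: the Pearson correlation between x and y
# should be very close to 1
data['x'].corr(data['y'])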
Then we create a scatter plot with Seaborn, passing x, y, the data, and order=1 (a first-order, i.e. linear, fit):
sns.lmplot(x="x", y="y", data=data, order=1)
plt.ylabel('Target')
plt.xlabel('Independent variable')
[Figure: scatter plot of the independent variable against the target, with the fitted regression line]
So far we have generated x and y randomly with NumPy, but we have not used the Boston House Prices dataset yet. We know that y in that dataset is MEDV, because it is the variable we want to predict, that is, the price! The values of x are all the other columns of the dataset. Scatter plots only allow us to compare two variables at a time, so we would need a scatter plot for each x variable. Let's look at two of them.
Draw a scatter plot with Seaborn that maps x to the LSTAT column and y to MEDV:
sns.lmplot(x="LSTAT", y="MEDV", data=boston, order=1)
[Figure: scatter plot of LSTAT against MEDV, with the fitted regression line]
Although not perfect, the relationship is quite linear. But notice that it is a negative linear relationship: as LSTAT increases, the MEDV price decreases.
Now draw the relationship between CRIM and MEDV:
sns.lmplot(x="CRIM", y="MEDV", data=boston, order=1)
[Figure: scatter plot of CRIM against MEDV, with the fitted regression line]
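We have only inspected two predictors here. To scan all of them, a simple loop over the columns works; the following is an optional sketch that goes beyond the original exercise:

# optional: plot every predictor against MEDV in one go
for col in boston.columns.drop('MEDV'):
    sns.lmplot(x=col, y='MEDV', data=boston, order=1)
    plt.show()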
As we have already seen, linear relationships can also be assessed by evaluating the residuals. The residuals are the difference between the estimated (predicted) and the real value. If the relationship is linear, the residuals should be normally distributed and centered around zero.
Create the model by instantiating scikit-learn's LinearRegression() and assigning it to the variable linreg:
linreg = LinearRegression()
Let's continue working with the two-column dataframe (x and y) that we created with NumPy. Train the model with scikit-learn's fit method. Remember to pass the values of x as a DataFrame and not as a Series:
# fit the model
linreg.fit(data['x'].to_frame(), data['y'])
LinearRegression()
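As a quick sanity check (an extra step, not part of the original exercise), we can inspect the fitted parameters. Since we generated y as 10 times x plus noise, the slope should be close to 10 and the intercept close to 0:

# extra check: slope should be near 10 and intercept near 0,
# matching the y = x * 10 + noise generating process
print(linreg.intercept_, linreg.coef_)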
Let's get the predictions by calling scikit-learn's predict method, passing the values of x as a dataframe. Assign the result to the variable pred:
pred = linreg.predict(data['x'].to_frame())
pred
array([ -4.31267193,   7.02628565,  19.23761417, -21.71473253,
         7.15773711, ...,   1.40183374,   3.24843922,   3.98877892])
Calculate the residual values, and store them in a variable called error
error = data['y'] - pred
error
0      3.041111
1      0.964314
2      0.611073
3     -0.214170
4     -1.578667
         ...
195   -0.495069
196    1.206319
197   -0.516651
198   -0.026771
199   -0.515181
Name: y, Length: 200, dtype: float64
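Before plotting, we can also verify numerically that the residuals are centered around zero (an extra check, not among the original steps):

# extra check: the mean of the residuals should be close to 0
print(error.mean(), error.std())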
Now plot the predictions against the real values with a Matplotlib scatter plot between pred and y:
plt.scatter(x=pred, y=data['y'])
plt.xlabel('Predictions')
plt.ylabel('Real value')
[Figure: scatter plot of predictions against real values]
Now let's see the distribution of the residuals with another Matplotlib scatter plot, of error against x:
plt.scatter(y=error, x=data['x'])
plt.ylabel('Residuals')
plt.xlabel('Independent variable x')
[Figure: scatter plot of the residuals against the independent variable x]
Let's now plot the distribution of the errors by drawing a histogram with Seaborn's distplot, with bins set to 30:
sns.distplot(error, bins=30)
plt.xlabel('Residuals')
FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
[Figure: histogram of the residuals]
Very well, we have completed the full linear-relationship analysis on the dataset we created with NumPy. Now let's follow the same steps with the Boston Houses dataset, taking into account only one variable/column, LSTAT: train the model, make predictions, and plot the relationship and the residuals.
# call the linear model from sklearn
linreg = LinearRegression()
# fit the model
linreg.fit(boston['LSTAT'].to_frame(), boston['MEDV'])
LinearRegression()
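As an optional extra, we can verify the direction of the relationship from the fitted parameters: the coefficient for LSTAT should be negative, matching the negative linear relationship we saw in the scatter plot:

# optional check: the LSTAT coefficient should be negative
print(linreg.intercept_, linreg.coef_)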
# make the predictions
pred = linreg.predict(boston['LSTAT'].to_frame())
# calculate the residuals
error = boston['MEDV'] - pred
# plot predicted vs real
plt.scatter(x=pred, y=boston['MEDV'])
plt.xlabel('Predictions')
plt.ylabel('MEDV')
[Figure: scatter plot of predictions against MEDV]
# Residuals plot
# if the relationship is linear, the noise should be
# random, centered around zero, and follow a normal distribution
plt.scatter(y=error, x=boston['LSTAT'])
plt.ylabel('Residuals')
plt.xlabel('LSTAT')
[Figure: scatter plot of the residuals against LSTAT]
# plot a histogram of the residuals
# they should follow a gaussian distribution
sns.distplot(error, bins=30)
[Figure: histogram of the residuals]
In this particular case, the residuals are centered around zero, but they are not distributed homogeneously across the LSTAT values: the largest and smallest LSTAT values show higher residuals. Furthermore, we can see in the histogram that the residuals do not follow a strictly Gaussian distribution.
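To complement the visual inspection, we could also run a formal normality test on the residuals. Here is a minimal sketch using the Shapiro-Wilk test, assuming scipy is installed (this goes beyond the original exercise):

# Shapiro-Wilk test: the null hypothesis is that the residuals come
# from a normal distribution; a small p-value (e.g. < 0.05) rejects it
from scipy import stats
stat, p_value = stats.shapiro(error)
print(stat, p_value)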