5. Linear regression#

Warning

Under construction

This notebook comprises various text and code snippets for generating plots and other content for the lectures corresponding to this topic. It is not a coherent set of lecture notes. Students should refer to the actual lecture slides available on Blackboard.

5.1. (notebook preliminaries)#

%matplotlib inline
%config InlineBackend.figure_format = "retina"
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Make fonts bigger for slides
plt.rcParams['font.size'] = 16
import xarray as xr

# Load the data
filepath_in = "../data/central-park-station-data_1869-01-01_2023-09-30.nc"
ds_cp = xr.open_dataset(filepath_in)

# Clean: drop all 0 values of the temperature fields which are (mostly) spurious
for varname in ["temp_avg", "temp_min", "temp_max"]:
    ds_cp[varname] = ds_cp[varname].where(ds_cp[varname] != 0.)

5.2. Motivation#

5.2.1. Schematics#

import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression

# Generate random data
np.random.seed(0)
X = np.random.normal(0, 1, 50).reshape(-1, 1)
y = 3 * X.squeeze() + 4 + np.random.normal(0, 1, 50)

# Fit linear regression model
model = LinearRegression()
model.fit(X, y)

# Generate prediction points (in-sample)
X_pred_in_sample = np.linspace(min(X), max(X), 50).reshape(-1, 1)
y_pred_in_sample = model.predict(X_pred_in_sample)

# Plotting
plt.scatter(X, y, color='blue', label='Data points')
plt.plot(X_pred_in_sample, y_pred_in_sample, 'r-', label='In-sample Prediction')
plt.xlabel('X')
plt.ylabel('y')
plt.xlim(-3, 3)  # Fix axis limits
plt.ylim(min(y) - 2, max(y) + 2)
plt.legend()
plt.title('Linear Regression: In-sample Prediction')
plt.show()
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression

# Generate random data
np.random.seed(0)
X = np.random.normal(0, 1, 50).reshape(-1, 1)
y = 3 * X.squeeze() + 4 + np.random.normal(0, 1, 50)

# Fit linear regression model
model = LinearRegression()
model.fit(X, y)

# Generate prediction points (both in-sample and out-of-sample)
X_pred_in_sample = np.linspace(min(X), max(X), 50).reshape(-1, 1)
X_pred_out_of_sample = np.linspace(max(X), max(X) + 3, 20).reshape(-1, 1)
y_pred_in_sample = model.predict(X_pred_in_sample)
y_pred_out_of_sample = model.predict(X_pred_out_of_sample)

# Plotting
plt.scatter(X, y, color='blue', label='Data points')
plt.plot(X_pred_in_sample, y_pred_in_sample, 'r-', label='In-sample Prediction')
plt.plot(X_pred_out_of_sample, y_pred_out_of_sample, 'r--', label='Out-of-sample Prediction')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.title('Linear Regression: Out-of-sample')
plt.show()
import numpy as np
from bokeh.layouts import row
from bokeh.plotting import figure, show, curdoc
from bokeh.models import ColumnDataSource
import panel as pn

pn.extension()

# Generate synthetic data
np.random.seed(0)
X = np.random.normal(0, 1, 50)
y = 3 * X + 4 + np.random.normal(0, 1, 50)

# Set up initial data source and plot
source = ColumnDataSource(data=dict(x=X, y=y))
p = figure(title='Interactive Linear Regression')
p.scatter('x', 'y', source=source, legend_label='Data Points')
line_source = ColumnDataSource(data=dict(x=[], y=[]))
p.line('x', 'y', source=line_source, color='red', legend_label='Regression Line')

# Extend the regression line beyond data points
x_extended = np.linspace(min(X) - 2, max(X) + 2, 50)

def update_line(event):
    slope = slope_slider.value
    intercept = intercept_slider.value
    y_pred = slope * x_extended + intercept
    line_source.data = dict(x=x_extended, y=y_pred)

# Sliders for slope and intercept
slope_slider = pn.widgets.FloatSlider(name='Slope', start=-10, end=10, value=3)
intercept_slider = pn.widgets.FloatSlider(name='Intercept', start=-10, end=10, value=4)

for w in [slope_slider, intercept_slider]:
    w.param.watch(update_line, 'value')

# Initial update
update_line(None)

pn.Column(
    pn.Row(slope_slider, intercept_slider),
    p
)

5.2.2. Global-mean surface temperature vs. sea-level#

import pandas as pd

path_gmst_nasa = "../data/nasa-gmst-ann.txt"
pd.read_csv(path_gmst_nasa)
FileNotFoundError: [Errno 2] No such file or directory: '../data/nasa-gmst-ann.txt'

5.3. Covariance#

The first step in understanding linear regression is understanding covariance.

5.3.1. General#

import numpy as np

# Number of points
N = 1000

# Mean and standard deviation for X and Y
mean_X, mean_Y = 0, 0
std_X, std_Y = 1, 1

# Specified covariance
cov_XY = 0.5

# Covariance matrix
cov_matrix = [[std_X**2, cov_XY], [cov_XY, std_Y**2]]

# Generate X and Y with the specified covariance
X, Y = np.random.multivariate_normal([mean_X, mean_Y], cov_matrix, N).T

# Verify covariance
computed_cov = np.cov(X, Y)[0, 1]
print("Computed covariance:", computed_cov)
Computed covariance: 0.4884784873002697
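
For reference, the sample covariance can be computed directly from its definition, \(\mathrm{cov}(X, Y) = \frac{1}{N-1}\sum_i (x_i - \bar{x})(y_i - \bar{y})\). A minimal check (reusing the X and Y arrays from the cell above) against np.cov:

# Manual sample covariance; the N-1 divisor matches np.cov's default (ddof=1)
manual_cov = np.sum((X - X.mean()) * (Y - Y.mean())) / (len(X) - 1)
print("Manual covariance:", manual_cov)
print("np.cov agrees:", np.allclose(manual_cov, np.cov(X, Y)[0, 1]))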
import numpy as np
import matplotlib.pyplot as plt

# Define two arrays of 3 points each
x = np.array([2, 4, 6])
y = np.array([1, 3, 5])

# Calculate means
mean_x = np.mean(x)
mean_y = np.mean(y)

# Create the plot
fig, ax = plt.subplots()
ax.scatter(x, y, label='Data Points', color='b')
ax.axhline(mean_y, color='r', linestyle='--', label=f'Mean of Y = {mean_y}')
ax.axvline(mean_x, color='g', linestyle='--', label=f'Mean of X = {mean_x}')

# Annotate individual points
for i, (xi, yi) in enumerate(zip(x, y)):
    ax.annotate(f'({xi}, {yi})', (xi, yi), textcoords="offset points", xytext=(0,10), ha='center')

# Add labels and legend
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.legend()

plt.show()
import numpy as np
import matplotlib.pyplot as plt
import panel as pn
import param

class CovariancePlot(param.Parameterized):
    cov = param.Number(10, bounds=(-50, 50))
    mean_x = param.Number(0, bounds=(-10, 10))
    mean_y = param.Number(0, bounds=(-10, 10))
    
    def plot(self):
        np.random.seed(0)
        N = 1000

        cov_matrix = [[100, self.cov], 
                      [self.cov, 100]]

        X, Y = np.random.multivariate_normal([self.mean_x, self.mean_y], cov_matrix, N).T
        
        fig, ax = plt.subplots()
        ax.scatter(X, Y)
        ax.set_xlim(-50, 50)
        ax.set_ylim(-50, 50)
        ax.set_title(f'Covariance: {self.cov}, Mean_X: {self.mean_x}, Mean_Y: {self.mean_y}')
        plt.xlabel('X')
        plt.ylabel('Y')

        return fig

cov_plot = CovariancePlot()
pn.Row(pn.Param(cov_plot.param, widgets={'cov': pn.widgets.FloatSlider, 'mean_x': pn.widgets.FloatSlider, 'mean_y': pn.widgets.FloatSlider}),
       pn.panel(cov_plot.plot))

5.3.2. Central Park data#

fig, ax = plt.subplots()
ax.scatter(ds_cp["temp_min"], ds_cp["temp_max"], s=1)
ax.set_xlabel("min temp. [F]")
ax.set_ylabel("max temp. [F]")
float(xr.cov(
    ds_cp["temp_min"], 
    ds_cp["temp_max"]))
302.73222593210903
import puffins as pf

precip_snowy = ds_cp["precip"].where(ds_cp["snow_fall"] > 0, drop=True)
snow = ds_cp["snow_fall"].where(ds_cp["snow_fall"] > 0, drop=True)
pf.stats.lin_regress(precip_snowy, snow, "time").sel(parameter="r_value")
<xarray.DataArray ()>
array(0.53028817)
Coordinates:
    parameter  <U9 'r_value'
fig, ax = plt.subplots()
ax.scatter(ds_cp["precip"], ds_cp["snow_fall"].where(ds_cp["snow_fall"] > 0), s=1)

5.4. Correlation#

Covariance provides a dimensional, quantitative measure of how much two variables tend to vary in sync with one another. But covariances cannot be compared directly between pairs of variables that do not share the same physical dimensions.

Correlation provides a dimensionless measure of how closely related two variables are, and one that can be compared directly across variable pairs regardless of their units.

5.4.1. Pearson correlation coefficient (\(r\))#

In essence, the Pearson correlation coefficient is simply the covariance between two variables, normalized by the product of the two variables’ standard deviations.
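
In symbols, \(r = \mathrm{cov}(X, Y) / (\sigma_X \sigma_Y)\). A quick sketch (reusing the X and Y arrays from the covariance section above) confirming this against np.corrcoef:

# Pearson's r by hand: covariance normalized by both standard deviations
# (the ddof choice cancels in the ratio)
r_manual = np.cov(X, Y)[0, 1] / (np.std(X, ddof=1) * np.std(Y, ddof=1))
print("Manual r:", r_manual)
print("np.corrcoef agrees:", np.allclose(r_manual, np.corrcoef(X, Y)[0, 1]))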

# Scatter Plots for Different Pearson Coefficients
fig, axs = plt.subplots(1, 3, figsize=(15, 5))

# Negative Correlation
x1 = np.linspace(0, 10, 100)
y1 = -x1 + np.random.normal(0, 1, 100)
axs[0].scatter(x1, y1)
axs[0].set_title('Negative Correlation')

# No Correlation
y2 = np.random.normal(0, 1, 100)
axs[1].scatter(x1, y2)
axs[1].set_title('No Correlation')

# Positive Correlation
y3 = x1 + np.random.normal(0, 1, 100)
axs[2].scatter(x1, y3)
axs[2].set_title('Positive Correlation')

plt.show()

# Data Distribution and Correlation using Seaborn
sns.jointplot(x=x1, y=y3, kind="scatter", marginal_kws=dict(bins=15))
plt.suptitle('Data Distribution and Positive Correlation')
plt.show()
import panel as pn
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

pn.extension()

def plot_with_correlation(r=0):
    # Create synthetic data
    np.random.seed(42)
    n = 1000
    x = np.random.randn(n)
    noise = np.random.randn(n)
    y = r*x + np.sqrt(1-r**2)*noise

    # Compute and print actual correlation
    actual_corr, _ = pearsonr(x, y)
    
    # Plotting
    fig, ax = plt.subplots()
    ax.scatter(x, y)
    ax.set_title(f"Actual Pearson correlation: {actual_corr:.2f}")
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.xlim([-4, 4])
    plt.ylim([-4, 4])
    plt.grid(True)
    plt.close(fig)
    return fig

correlation_slider = pn.widgets.FloatSlider(name='Correlation Coefficient', start=-1, end=1, step=0.01, value=0)
interactive_plot = pn.interact(plot_with_correlation, r=correlation_slider)
pn.Column("# Interactive Correlation Coefficient Example", interactive_plot).servable()
import numpy as np
import matplotlib.pyplot as plt
import panel as pn
from scipy.stats import pearsonr

pn.extension()

def plot_correlated_data(correlation=0.0, x_scale=1.0, y_scale=1.0, x_offset=0.0, y_offset=0.0):
    np.random.seed(0)
    N = 100
    x = np.random.randn(N)
    x_transformed = x_scale * (x - x_offset)
    noise = np.random.randn(N)
    y = correlation * x_transformed + np.sqrt(1 - correlation ** 2) * noise
    y_transformed = y_scale * (y + y_offset)

    actual_corr, _ = pearsonr(x_transformed, y_transformed)

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.scatter(x_transformed, y_transformed)
    ax.set_title(f"Actual Pearson correlation: {actual_corr:.2f}")
    ax.set_xlim(-4, 4)
    ax.set_ylim(-10, 10)
    ax.grid(True)
    plt.close(fig)
    return fig

correlation_slider = pn.widgets.FloatSlider(start=-1.0, end=1.0, step=0.01, value=0.0, name='Correlation')
x_scale_slider = pn.widgets.FloatSlider(start=0.1, end=5.0, step=0.1, value=1.0, name='x scale')
y_scale_slider = pn.widgets.FloatSlider(start=0.1, end=5.0, step=0.1, value=1.0, name='y scale')
x_offset_slider = pn.widgets.FloatSlider(start=-5.0, end=5.0, step=0.1, value=0.0, name='x offset')
y_offset_slider = pn.widgets.FloatSlider(start=-5.0, end=5.0, step=0.1, value=0.0, name='y offset')

#interactive_plot = pn.interact(plot_correlated_data, correlation=correlation_slider)
interactive_plot = pn.interact(
    plot_correlated_data,
    correlation=correlation_slider,
    x_scale=x_scale_slider,
    y_scale=y_scale_slider,
    x_offset=x_offset_slider,
    y_offset=y_offset_slider,
)

pn.Column("# Correlation coefficient", interactive_plot).servable()

5.4.1.1. Limitations#

Correlation does not imply causation: a third variable could be driving both#
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression

# Generate synthetic data
np.random.seed(42)
Z = np.linspace(0, 10, 100)  # Z is the hidden factor that affects both X and Y
X = 0.5 * Z + 0.5*np.random.normal(size=Z.size)  # X is somewhat influenced by Z
Y = 0.3 * Z + 0.5*np.random.normal(size=Z.size)  # Y is also somewhat influenced by Z

# Fit linear models to get residuals
model_X = LinearRegression()
model_X.fit(Z.reshape(-1, 1), X)
residuals_X = X - model_X.predict(Z.reshape(-1, 1))

model_Y = LinearRegression()
model_Y.fit(Z.reshape(-1, 1), Y)
residuals_Y = Y - model_Y.predict(Z.reshape(-1, 1))


# Plotting
fig, axs = plt.subplots(2, 2, figsize=(10, 5))

# Scatter plot for X vs Y showing the apparent correlation
ax = axs[0,0]
ax.scatter(X, Y, color='red')
ax.set_xlabel('Variable X')
ax.set_ylabel('Variable Y')
ax.set_title('X vs Y')

# Scatter plot for Z vs X
ax = axs[1,0]
ax.scatter(Z, X, color='blue')
ax.set_xlabel('Hidden Factor Z')
ax.set_ylabel('Variable X')
ax.set_title('Z vs X')

# Scatter plot for Z vs Y
ax = axs[0,1]
ax.scatter(Z, Y, color='goldenrod')
ax.set_xlabel('Hidden Factor Z')
ax.set_ylabel('Variable Y')
ax.set_title('Z vs Y')

# Scatter plot for Residuals X vs Residuals Y
ax = axs[1,1]
ax.scatter(residuals_X, residuals_Y, color='blue')
ax.set_xlabel('Residuals of X')
ax.set_ylabel('Residuals of Y')
ax.set_title('Residuals X vs Residuals Y')

plt.tight_layout()
plt.show()
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression

# Generate synthetic data
np.random.seed(42)
Z = np.linspace(0, 10, 100)  # Z is the hidden factor that affects both X and Y
X = 0.5 * Z + 0.5*np.random.normal(size=Z.size)  # X is somewhat influenced by Z
Y = 0.3 * Z + 0.5*np.random.normal(size=Z.size)  # Y is also somewhat influenced by Z

# Fit linear models to get residuals
model_X = LinearRegression()
model_X.fit(Z.reshape(-1, 1), X)
residuals_X = X - model_X.predict(Z.reshape(-1, 1))

model_Y = LinearRegression()
model_Y.fit(Z.reshape(-1, 1), Y)
residuals_Y = Y - model_Y.predict(Z.reshape(-1, 1))

# Plotting
fig, axs = plt.subplots(1, 2, figsize=(10, 4))

# Scatter plot for X vs Y
axs[0].scatter(X, Y, color='red')
axs[0].set_xlabel('Variable X')
axs[0].set_ylabel('Variable Y')
axs[0].set_title('X vs Y')

# Scatter plot for Residuals X vs Residuals Y
axs[1].scatter(residuals_X, residuals_Y, color='blue')
axs[1].set_xlabel('Residuals of X')
axs[1].set_ylabel('Residuals of Y')
axs[1].set_title('Residuals X vs Residuals Y')

plt.tight_layout()
plt.show()
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D

# Generate synthetic data
np.random.seed(42)
Z = np.linspace(0, 10, 100)  # Z is the hidden factor that affects both X and Y
X = 0.5 * Z + np.random.normal(size=Z.size)  # X is somewhat influenced by Z
Y = 0.3 * Z + np.random.normal(size=Z.size)  # Y is also somewhat influenced by Z

# Create a 3D scatter plot
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')

# Scatter plot
scatter = ax.scatter(Z, X, Y)

# Labels and title
ax.set_xlabel('Hidden Factor Z')
ax.set_ylabel('Variable X')
ax.set_zlabel('Variable Y')
ax.set_title('Illustration: Correlation Does Not Imply Causation')

# Draw lines projecting the points onto the Z=0 plane
for zi, xi, yi in zip(Z, X, Y):
    ax.plot([zi, zi], [xi, xi], [0, yi], 'gray', lw=0.5)

plt.show()
Non-monotonic relationships#
import matplotlib.pyplot as plt
import numpy as np

# Generate x values symmetrically around zero
x = np.linspace(-10, 10, 200)

# Compute y values based on x^2
y = x**2

# Compute Pearson's r
r = np.corrcoef(x, y)[0, 1]

# Plot the data
plt.figure(figsize=(4, 2))
plt.scatter(x, y, label=f"Pearson's r: {r:.2f}", s=2)
plt.xlabel('X')
plt.ylabel('Y')
plt.title('')
plt.legend()
#plt.grid(True)
plt.show()
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import pearsonr

# Generate data
np.random.seed(0)
x = np.linspace(-10, 10, 100)
y = np.abs(x)

# Compute correlations
r_all = pearsonr(x, y)[0]
r_pos = pearsonr(x[x > 0], y[x > 0])[0]
r_neg = pearsonr(x[x < 0], y[x < 0])[0]

# Create plot
fig, ax = plt.subplots(figsize=(9.6, 5.4))
ax.scatter(x, y, label='Data Points')
ax.axhline(0, color='gray', lw=0.5)
ax.axvline(0, color='gray', lw=0.5)

# Annotate with correlation values
ax.text(6, 8, f"Pearson r (x > 0): {r_pos:.2f}", fontsize=12)
ax.text(-10, 8, f"Pearson r (x < 0): {r_neg:.2f}", fontsize=12)
ax.text(-2, 4, f"Pearson r (all x): {r_all:.2f}", fontsize=12)

ax.set_xlabel('x')
ax.set_ylabel('|x|')
ax.set_title('Demonstrating Limitations of Pearson Correlation')
plt.tight_layout()
plt.show()
Random chance#
import numpy as np
import matplotlib.pyplot as plt

#np.random.seed(0)

N = 10  # Number of points in each sample
M = 100  # Number of repeated experiments

max_corr = 0
max_x, max_y = None, None

for _ in range(M):
    x_random = np.random.randn(N)
    y_random = np.random.randn(N)
    
    corr = np.abs(np.corrcoef(x_random, y_random)[0, 1])
    
    if corr > max_corr:
        max_corr = corr
        max_x = x_random
        max_y = y_random

# Plot the data points that resulted in the highest magnitude correlation
fig, ax = plt.subplots()
ax.scatter(max_x, max_y, label=f'r = {max_corr:.3f}')
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.legend()
#plt.title('Spurious Correlation from Random Samples')

plt.show()
Sensitivity to outliers#
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats

# Generate independent arrays
x = np.random.rand(20)
y = np.random.rand(20)

# Compute the initial Pearson correlation
r_initial, _ = scipy.stats.pearsonr(x, y)

# Add an outlier
x_outlier = np.append(x, [10])
y_outlier = np.append(y, [10])

# Compute the new Pearson correlation with the outlier
r_outlier, _ = scipy.stats.pearsonr(x_outlier, y_outlier)

# Create the plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

# Plot without outlier
ax1.scatter(x, y, color='b')
ax1.set_title(f'Without Outlier, r = {r_initial:.2f}')
ax1.set_xlabel('X')
ax1.set_ylabel('Y')

# Plot with outlier
ax2.scatter(x_outlier, y_outlier, color='b')
ax2.scatter(10, 10, color='r', label='Outlier')
ax2.set_title(f'With Outlier, r = {r_outlier:.2f}')
ax2.set_xlabel('X')
ax2.set_ylabel('Y')
ax2.legend()

plt.show()

5.4.1.2. Hypothesis test for Pearson’s \(r\)#
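
Under the null hypothesis of zero correlation, the test statistic \(t = r\sqrt{N-2}\big/\sqrt{1-r^2}\) follows a Student's \(t\)-distribution with \(N-2\) degrees of freedom. The cell below computes it by hand and checks the resulting p-value against scipy.stats.pearsonr.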

from scipy import stats
import numpy as np

# Sample data
np.random.seed(0)
x = np.random.randn(30)
y = 2 * x + np.random.randn(30)

# Compute Pearson correlation
r, _ = stats.pearsonr(x, y)

# Compute t-statistic
N = len(x)
t_statistic = (r * np.sqrt(N - 2)) / np.sqrt(1 - r**2)

# Degrees of freedom
df = N - 2

# Compute p-value
p_value = 2 * (1 - stats.t.cdf(np.abs(t_statistic), df))

print(f"r: {r}, t-statistic: {t_statistic}, p-value: {p_value}")
print(stats.pearsonr(x, y))
r: 0.913518817273365, t-statistic: 11.88281496322004, p-value: 1.8771650900362147e-12
PearsonRResult(statistic=0.913518817273365, pvalue=1.8771534139207052e-12)

While Pearson’s correlation coefficient \(r\) itself makes no assumptions about the distributions of the two variables, the hypothesis test for \(r\) assumes that each variable’s population is Gaussian.

As always, concerns about normality relax as the sample size grows, thanks to the central limit theorem. With small samples, however, deviations from normality can distort the test statistic and p-values, increasing the likelihood of false positives (i.e. “Type I errors”).
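
A minimal Monte Carlo sketch of this point (the sample size, trial count, and lognormal distribution are illustrative choices, not from the lecture): draw many independent pairs of small samples from a strongly skewed distribution and count how often the test rejects at the 5% level; any systematic departure from 5% reflects a distorted test statistic.

import numpy as np
from scipy.stats import pearsonr

rng = np.random.default_rng(0)
n_trials = 5000
N = 10  # deliberately small sample size

# x and y are drawn independently, so every rejection at the 5% level
# is a false positive (Type I error)
n_reject = sum(
    pearsonr(rng.lognormal(size=N), rng.lognormal(size=N))[1] < 0.05
    for _ in range(n_trials)
)
print(f"Empirical Type I error rate: {n_reject / n_trials:.3f} (nominal: 0.05)")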

import numpy as np
from scipy.stats import pearsonr, norm, uniform

# Draw two seeds, reused below so the normal and uniform cases
# see identical underlying random sequences
seed = np.random.randint(0, 2**32 - 1)
seed2 = np.random.randint(0, 2**32 - 1)

np.random.seed(seed)

# Number of samples
N = 1000

# Specified correlation
rho = 0.1

# Generate normal-distributed variables with specified correlation
np.random.seed(seed)
normal_x = norm.rvs(size=N)
np.random.seed(seed2)
norm2 = norm.rvs(size=N)
normal_y = rho * normal_x + np.sqrt(1 - rho**2) * norm2
r_normal, p_normal = pearsonr(normal_x, normal_y)


# Generate uniform-distributed variables with specified correlation
np.random.seed(seed)
uniform_x = uniform.rvs(size=N)
np.random.seed(seed2)
unif2 = uniform.rvs(size=N)
uniform_y = rho * uniform_x + np.sqrt(1 - rho**2) * unif2
r_uniform, p_uniform = pearsonr(uniform_x, uniform_y)

print(f"Normal distribution: r = {r_normal:.4f}, p-value = {p_normal:.2e}")
print(f"Uniform distribution: r = {r_uniform:.4f}, p-value = {p_uniform:.2e}")
Normal distribution: r = 0.1326, p-value = 2.61e-05
Uniform distribution: r = 0.1341, p-value = 2.09e-05

5.4.1.3. Central Park data#

fig, ax = plt.subplots()
ax.scatter(ds_cp["temp_min"], ds_cp["temp_max"], s=1)
ax.set_xlabel("min temp. [F]")
ax.set_ylabel("max temp. [F]")
Text(0, 0.5, 'max temp. [F]')
float(xr.corr(
    ds_cp["temp_min"], 
    ds_cp["temp_max"]))
0.9537972358424263
pf.stats.corr_detrended(
    ds_cp["temp_min"], 
    ds_cp["temp_max"])
0.9535715950209205

5.4.2. Spearman’s rank correlation coefficient#
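
Briefly: Spearman's rank correlation \(\rho\) is Pearson's \(r\) computed on the ranks of the data rather than the raw values, which makes it robust to outliers and sensitive to any monotonic relationship, not just linear ones. A minimal sketch using scipy.stats.spearmanr (the cubic example data are illustrative):

import numpy as np
from scipy.stats import pearsonr, spearmanr

# A monotonic but nonlinear relationship: Spearman's rho is exactly 1,
# while Pearson's r falls short of 1 (illustrative data)
rng = np.random.default_rng(0)
x = rng.uniform(0, 10, 100)
y = x**3

print("Pearson r:   ", pearsonr(x, y)[0])
print("Spearman rho:", spearmanr(x, y)[0])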

5.5. Anscombe’s quartet#

import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import pearsonr

# Data for Anscombe's quartet
x1 = np.array([10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5])
y1 = np.array([8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68])
x2 = x1
y2 = np.array([9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74])
x3 = x1
y3 = np.array([7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73])
x4 = np.array([8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8])
y4 = np.array([6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89])



# Define datasets
datasets = {
    "Dataset 1": (x1, y1),
    "Dataset 2": (x2, y2),
    "Dataset 3": (x3, y3),
    "Dataset 4": (x4, y4)
}

# Create 2x2 subplots
fig, axs = plt.subplots(2, 2, figsize=(14, 7.875))
axs = axs.flatten()

# Loop through datasets to plot each
for i, (title, (x, y)) in enumerate(datasets.items()):
    ax = axs[i]
    ax.scatter(x, y)
    
    # Compute summary statistics
    mean_x, mean_y = np.mean(x), np.mean(y)
    var_x, var_y = np.var(x, ddof=1), np.var(y, ddof=1)
    r, _ = pearsonr(x, y)
    
    # Annotate the plot with these statistics
    stats_text = f"Mean(x)={mean_x:.2f}\nVar(x)={var_x:.2f}\nMean(y)={mean_y:.2f}\nVar(y)={var_y:.2f}\nPearson r={r:.2f}"
    ax.text(0.05, 0.95, stats_text, transform=ax.transAxes, verticalalignment='top', fontsize=16)
    ax.set_title(title)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')

# Show plot
plt.tight_layout()
plt.show()

5.6. Linear regression#

Linear regression makes the following assumptions:

  1. Linearity: the independent and dependent variables are linearly related. Applying linear regression to variables with a nonlinear relationship can lead to errant predictions.

  2. Independence: each observation is independent of the others. For time series or spatial data, this assumption is often violated, leading to biased or inefficient parameter estimates.

  3. Homoscedasticity: the variance of the residuals is constant across all levels of the independent variable. When it isn't (heteroscedasticity), parameter estimates are inefficient and conclusions drawn from hypothesis tests can be incorrect.

  4. Normality of errors: the residuals are normally distributed. This is not required to fit the regression itself, but it underpins the standard hypothesis tests on the fitted parameters.

import matplotlib.pyplot as plt
import numpy as np

np.random.seed(0)
X = np.random.rand(50)
Y = 2 + 3 * X + np.random.randn(50)

plt.scatter(X, Y, label='Data Points')
plt.plot(X, 2 + 3 * X, label='Regression Line', color='red')
plt.xlabel('X')
plt.ylabel('Y')
plt.legend()
plt.title('What is Linear Regression?')
plt.show()
../_images/a37f8506f96f667c0bd49d5498e6791063ba6ee5ccdd84ed1e2e33e484d61222.png
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression

# Generate some synthetic data
np.random.seed(0)
X = np.random.rand(50)
Y = 2 * X + 1 + np.random.randn(50)

# Fit a simple linear model
model = LinearRegression()
model.fit(X.reshape(-1, 1), Y)
line_X = np.linspace(min(X), max(X), 100).reshape(-1, 1)
line_y = model.predict(line_X)

# Calculate rise and run for annotation
rise = float(line_y[-1] - line_y[0])
run = float(line_X[-1, 0] - line_X[0, 0])
slope = rise / run

# Create the plot
plt.scatter(X, Y, label='Data Points')
plt.plot(line_X, line_y, label='Regression Line', color='red')

# Annotate slope on the line
mid_point_index = len(line_X) // 2
mid_point_X = line_X[mid_point_index]
mid_point_y = model.predict(mid_point_X.reshape(1, -1))
plt.annotate(
    f'Slope = Rise/Run\n       = {rise:.2f}/{run:.2f}\n       = {slope:.2f}',
    xy=(mid_point_X[0], mid_point_y[0]),
    xytext=(mid_point_X[0] + 0.1, mid_point_y[0] + 0.5),
    arrowprops=dict(facecolor='black', shrink=0.05),
    horizontalalignment='center'
)

plt.xlabel('X')
plt.ylabel('Y')
plt.legend()
plt.title('What is Linear Regression?')
plt.show()

5.6.1. Residuals#

import matplotlib.pyplot as plt
import numpy as np

# Generate synthetic data
np.random.seed(42)
X = np.linspace(0, 10, 100)
y = 2 * X + 1 + np.random.normal(0, 1, 100)

# Fit a simple linear model
coeff = np.polyfit(X, y, 1)
y_pred = np.polyval(coeff, X)

# Calculate residuals
residuals = y - y_pred

# Create residual plot
plt.figure(figsize=(8, 5))
plt.scatter(X, residuals)
plt.axhline(0, color='r', linestyle='--')
plt.xlabel("X")
plt.ylabel("Residuals")
plt.title("Residual Plot to Check Linearity")
plt.show()
import panel as pn
import pandas as pd
from bokeh.models import Span
from bokeh.plotting import figure

# Create DataFrame
df = pd.DataFrame({'X': X, 'residuals': residuals})

# Define the plotting function
def plot_with_smoothing(smoothing):
    p = figure(width=800, height=400, title='Residual Plot with Smoothing Curve')
    p.scatter(df['X'], df['residuals'], color='blue')
    if smoothing:
        p.line(df['X'], df['residuals'].rolling(window=5).mean(), color='green', legend_label='Smoothing Curve')
    hline = Span(location=0, dimension='width', line_color='red', line_dash='dashed')
    p.renderers.extend([hline])
    return p

# Create Panel interactive widget
smoothing_toggle = pn.widgets.Toggle(name='Add Smoothing Curve', value=False)
interact = pn.interact(plot_with_smoothing, smoothing=smoothing_toggle)
pn.Row(interact[1][0], interact[0]).servable()

5.6.2. Homoscedasticity#

import matplotlib.pyplot as plt
import numpy as np

# Generate synthetic data
np.random.seed(42)
X = np.linspace(0, 10, 100)
y = 2 * X + 1 + np.random.normal(0, np.sqrt(X), 100)

# Fit a simple linear model
coeff = np.polyfit(X, y, 1)
y_pred = np.polyval(coeff, X)

# Calculate residuals
residuals = y - y_pred

# Create residual plot
plt.figure(figsize=(8, 5))
plt.scatter(X, residuals)
plt.axhline(0, color='r', linestyle='--')
plt.xlabel("X")
plt.ylabel("Residuals")
plt.title("Residual Plot to Check Homoscedasticity")
plt.show()
import panel as pn
import pandas as pd
from bokeh.models import Span
from bokeh.plotting import figure

# Define the plotting function
def plot_homoscedasticity(constant_variance=True):
    np.random.seed(42)
    if constant_variance:
        y = 2 * X + 1 + np.random.normal(0, 1, 100)
    else:
        y = 2 * X + 1 + np.random.normal(0, np.sqrt(X), 100)

    # Calculate residuals
    coeff = np.polyfit(X, y, 1)
    y_pred = np.polyval(coeff, X)
    residuals = y - y_pred

    # Create Bokeh plot
    p = figure(width=800, height=400, title='Residual Plot to Check Homoscedasticity')
    p.scatter(X, residuals, color='blue')
    hline = Span(location=0, dimension='width', line_color='red', line_dash='dashed')
    p.renderers.extend([hline])
    return p

# Create Panel interactive widget
toggle = pn.widgets.Toggle(name='Constant Variance', value=True)
interact = pn.interact(plot_homoscedasticity, constant_variance=toggle)
pn.Row(interact[1][0], interact[0]).servable()

5.6.3. Least squares#

# Given the above X, Y
from sklearn.metrics import mean_squared_error

np.random.seed(0)
X = np.random.rand(50)
Y = 2 + 3 * X + np.random.randn(50)

beta_0 = np.linspace(0, 4, 50)
beta_1 = np.linspace(0, 5, 50)
mse_values = np.zeros((50, 50))

for i, b0 in enumerate(beta_0):
    for j, b1 in enumerate(beta_1):
        mse_values[i, j] = mean_squared_error(Y, b0 + b1 * X)

plt.contourf(beta_0, beta_1, mse_values.T, levels=20)  # transpose so rows map to the y-axis (Beta 1)
plt.colorbar(label='MSE')
plt.xlabel('Beta 0')
plt.ylabel('Beta 1')
plt.title('Objective of Linear Regression')
plt.show()
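
The minimizing coefficients have a closed form: for simple linear regression, \(\hat{\beta}_1 = \mathrm{cov}(X, Y)/\mathrm{var}(X)\) and \(\hat{\beta}_0 = \bar{Y} - \hat{\beta}_1 \bar{X}\). A quick sketch (reusing the X and Y arrays from the cell above) checking this against np.polyfit:

# Closed-form least-squares solution for simple linear regression
beta_1_hat = np.cov(X, Y)[0, 1] / np.var(X, ddof=1)  # ddof=1 matches np.cov's default
beta_0_hat = Y.mean() - beta_1_hat * X.mean()
print(f"Closed form: slope = {beta_1_hat:.4f}, intercept = {beta_0_hat:.4f}")
print("np.polyfit (slope, intercept):", np.polyfit(X, Y, 1))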
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression

# Generate some synthetic data
np.random.seed(0)
X = np.random.normal(5, 1, 10).reshape(-1, 1)  # Independent variable
y = 2 * X.squeeze() + 1 + np.random.normal(0, 2, 10)  # Dependent variable

# Fit the linear regression model
model = LinearRegression()
model.fit(X, y)
line_X = np.linspace(min(X), max(X), 100).reshape(-1, 1)
line_y = model.predict(line_X)


# Plot the data points
plt.scatter(X, y, color='blue', label='Data Points')

# Plot the regression line
plt.plot(line_X, line_y, color='red', label='Regression Line')

# Calculate the y-values on the regression line
predicted_y = model.predict(X)

# Plot the errors as vertical lines
plt.vlines(X, predicted_y, y, colors='green', linestyles='dashed', label='Errors')

# Add labels and title
plt.xlabel('X')
plt.ylabel('y')
plt.title('Linear Regression with Regression Errors')
plt.legend()
plt.show()
import numpy as np
import holoviews as hv
import panel as pn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

hv.extension('bokeh')

# Generate some synthetic data
np.random.seed(42)
X = np.random.rand(100) * 10
y = 2.5 * X + 10 + np.random.randn(100) * 5

# Fit a least squares linear regression model
X_reshaped = X.reshape(-1, 1)
model = LinearRegression()
model.fit(X_reshaped, y)

# Predict using the model
y_pred = model.predict(X_reshaped)
mse_least_squares = mean_squared_error(y, y_pred)

# Function to calculate the user-defined regression line and MSE
def regression_line(slope, intercept):
    user_y_pred = slope * X + intercept
    mse_user = mean_squared_error(y, user_y_pred)
    
    # Plot the data, least squares line, and user-defined line
    scatter = hv.Scatter((X, y), 'X', 'y')
    ls_line = hv.Curve((X, y_pred), 'X', 'y').opts(color='blue', line_width=2)
    user_line = hv.Curve((X, user_y_pred), 'X', 'y').opts(color='red', line_width=2)
    
    return (scatter * ls_line * user_line).opts(
        title=f'MSE (Least Squares): {mse_least_squares:.2f},\nMSE (User): {mse_user:.2f}'
    ), mse_user

# Create sliders for slope and intercept
slope_slider = pn.widgets.FloatSlider(name='Slope', start=0.0, end=5.0, step=0.1, value=2.5)
intercept_slider = pn.widgets.FloatSlider(name='Intercept', start=0.0, end=20.0, step=1.0, value=10.0)

# Interactive function to update plot based on slider values
@pn.depends(slope=slope_slider, intercept=intercept_slider)
def interactive_regression(slope, intercept):
    return regression_line(slope, intercept)[0]

# Display the plot and sliders
pn.Row(
    pn.Column(slope_slider, intercept_slider),
    interactive_regression
).servable()

5.6.4. Linear regression implementations in Python#

Most of the implementations don’t work if there are any NaN values. So it’s helpful to write a helper function using xarray to create versions of your X and Y arrays with all NaN values dropped and with identical coordinate values:

ds_cp["temp_max"]
<xarray.DataArray 'temp_max' (time: 56520)>
array([29., 27., 35., ..., 65., 63., 66.])
Coordinates:
  * time     (time) datetime64[ns] 1869-01-01 1869-01-02 ... 2023-09-30
ds_cp["temp_max"].dropna("time")
<xarray.DataArray 'temp_max' (time: 56449)>
array([29., 27., 35., ..., 65., 63., 66.])
Coordinates:
  * time     (time) datetime64[ns] 1869-01-01 1869-01-02 ... 2023-09-30
xr.align(ds_cp["temp_min"], ds_cp["temp_max"])
(<xarray.DataArray 'temp_min' (time: 56520)>
 array([19., 21., 27., ..., 56., 59., 57.])
 Coordinates:
   * time     (time) datetime64[ns] 1869-01-01 1869-01-02 ... 2023-09-30,
 <xarray.DataArray 'temp_max' (time: 56520)>
 array([29., 27., 35., ..., 65., 63., 66.])
 Coordinates:
   * time     (time) datetime64[ns] 1869-01-01 1869-01-02 ... 2023-09-30)
def arrs_for_regr(arr1, arr2):
    """Return views of arr1 and arr2 with NaNs dropped and aligned."""
    return xr.align(arr1.dropna(arr1.dims[0]), arr2.dropna(arr2.dims[0]))

5.6.4.1. numpy.polyfit#

tmin_for_regr, tmax_for_regr = arrs_for_regr(ds_cp["temp_min"], ds_cp["temp_max"])
np.polyfit(tmin_for_regr, tmax_for_regr, 1)
array([ 1.05064541, 12.35569893])

5.6.4.2. scipy.stats.linregress#

scipy.stats.linregress(tmin_for_regr, tmax_for_regr)
LinregressResult(slope=1.050645409563477, intercept=12.355698929858043, rvalue=0.9537972358424263, pvalue=0.0, stderr=0.0013936741129371801, intercept_stderr=0.06941543775966094)

5.6.4.3. scikit-learn#

from sklearn.linear_model import LinearRegression

skl_lsm = LinearRegression().fit(tmin_for_regr.values.reshape(-1, 1), tmax_for_regr)
print(skl_lsm.coef_, skl_lsm.intercept_)
[1.05064541] 12.35569892985805
# Predict using the fitted coefficients (rounded from the fit above)
tmax_ls_predic = 12.35 + 1.05 * ds_cp["temp_min"]
plt.scatter(ds_cp["temp_min"], ds_cp["temp_max"], s=1, color="coral", label="actual")
plt.plot(ds_cp["temp_min"], tmax_ls_predic, color="black", linewidth=2, label="lin. regr.")
plt.xlabel("min temp [F]")
plt.ylabel("max temp [F]")
plt.legend(labelcolor="linecolor")