Lab 05: linear regressions and pandas#

CCNY EAS 42000/A4200, Fall 2025, 2025/10/29, Prof. Spencer Hill

SUMMARY: We perform linear regressions using different Python packages. We also introduce the pandas package for tabular data analysis.

Preliminaries#

Import needed Python packages#

from matplotlib import pyplot as plt  # for plotting
import numpy as np  # for working with arrays of numerical values
import pandas as pd  # for reading CSV or Excel files and subsequent analyses
import scipy  # for various scientific calculations
import xarray as xr  # for loading data and subsequent analyses

Load the Central Park data into this python session#


!pip install pooch

# The above command installs the needed `pooch` 3rd-party package if it's not already installed.


import hashlib  # for verifying that the Central Park file is not corrupted
import pathlib  # for constructing paths to the dataset's location on disk
import sys  # for checking if this is a Google Colab session or not
import pooch  # for downloading the dataset from the web, if needed


# Replace "../data" as needed to point to the correct directory for you.
# This can be an *absolute path* or a *relative path*.  One dot, `.`, means
# "this directory", while two dots, `..`, means "go up one directory."
LOCAL_DATA_DIR = "../data"  # If you're in Colab: just ignore this.

# The URL where the dataset can be downloaded from.
DATA_URL = (
    "https://spencerahill.github.io/25f-stat-methods-course/_downloads/"
    "91803b82950d49961a65355c075439b3/central-park-station-data_1869-01-01_2023-09-30.nc"
)

# This HASH_HEX stores a "hash" which we use to verify that the data you end up
# with has not been altered or corrupted compared to the one at the above URL.
HASH_HEX = "85237a4bae1202030a36f330764fd5bd0c2c4fa484b3ae34a05db49fe7721eee"


def create_data_path(
    colab_dir="/content/data", 
    local_dir=LOCAL_DATA_DIR,
    filename="central-park-station-data_1869-01-01_2023-09-30.nc",
):
    """Set the path for the data, whether on colab or a local Jupyter session."""
    is_this_a_colab = "google.colab" in sys.modules
    if is_this_a_colab:
        data_dir = colab_dir 
    else: 
        data_dir = local_dir

    DATA_DIR = pathlib.Path(data_dir)
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    return DATA_DIR / filename


def sha256sum(path: pathlib.Path) -> str:
    """Get the hash of the file at the specified path."""
    return hashlib.sha256(path.read_bytes()).hexdigest()


DATA_PATH = create_data_path()
# Determine if we'll need to download the data, which we'll do if either (a) 
# the data can't be found, or (b) it appears corrupted/modified from the
# "master" file at the above URL.
need_fetch = (not DATA_PATH.exists()) or (sha256sum(DATA_PATH) != HASH_HEX)

# Download the data if needed.
if need_fetch:
    fetched_data = pooch.retrieve(
        url=DATA_URL, 
        known_hash=f"sha256:{HASH_HEX}",
        path=DATA_PATH.parents[0], 
        fname=DATA_PATH.name,
    )
    print(f"\nDownloaded and verified: {fetched_data}")
else:
    print(f"\nVerified existing file at {DATA_PATH}")
Verified existing file at ../data/central-park-station-data_1869-01-01_2023-09-30.nc
import xarray as xr

# The `DATA_PATH` variable was created by the setup cell just above.
ds_cp = xr.open_dataset(DATA_PATH)
ds_cp
<xarray.Dataset> Size: 5MB
Dimensions:        (time: 56520)
Coordinates:
  * time           (time) datetime64[ns] 452kB 1869-01-01 ... 2023-09-30
Data variables:
    temp_max       (time) int64 452kB ...
    temp_min       (time) int64 452kB ...
    temp_avg       (time) float64 452kB ...
    temp_anom      (time) float64 452kB ...
    heat_deg_days  (time) int64 452kB ...
    cool_deg_days  (time) int64 452kB ...
    precip         (time) float64 452kB ...
    snow_fall      (time) float64 452kB ...
    snow_depth     (time) int64 452kB ...

Load the data again, but from an .xlsx spreadsheet file using pandas#
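The spreadsheet version of the dataset isn't reproduced here, so as a hedged sketch: pandas reads .xlsx files via pd.read_excel, which requires the openpyxl package. The filename below is an assumption; point it at whichever spreadsheet copy of the data you have.

# Hypothetical spreadsheet path: adjust to wherever your .xlsx copy lives.
# `index_col=0` uses the first column (the dates) as the index, and
# `parse_dates=True` converts that column to datetimes.
df_cp_xlsx = pd.read_excel(
    "../data/central-park-station-data_1869-01-01_2023-09-30.xlsx",
    index_col=0,
    parse_dates=True,
)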

Alternative: convert the xr.Dataset to a pd.DataFrame using the to_pandas method#

help(ds_cp.to_pandas)
Help on method to_pandas in module xarray.core.dataset:

to_pandas() -> 'pd.Series | pd.DataFrame' method of xarray.core.dataset.Dataset instance
    Convert this dataset into a pandas object without changing the number of dimensions.

    The type of the returned object depends on the number of Dataset
    dimensions:

    * 0D -> `pandas.Series`
    * 1D -> `pandas.DataFrame`

    Only works for Datasets with 1 or fewer dimensions.

Regarding the last line of the help message: our dataset indeed has just one dimension, time, so we can go ahead and use this method to get a pd.DataFrame.

# df is the conventional variable name used for a pandas.DataFrame
# ds is the conventional variable name used for an xarray.Dataset
df_cp = ds_cp.to_pandas()   
df_cp
temp_max temp_min temp_avg temp_anom heat_deg_days cool_deg_days precip snow_fall snow_depth
time
1869-01-01 29 19 24.0 -11.2 41 0 0.75 9.0 0
1869-01-02 27 21 24.0 -11.0 41 0 0.03 0.0 0
1869-01-03 35 27 31.0 -3.8 34 0 0.00 0.0 0
1869-01-04 37 34 35.5 0.8 29 0 0.18 0.0 0
1869-01-05 43 37 40.0 5.5 25 0 0.05 0.0 0
... ... ... ... ... ... ... ... ... ...
2023-09-26 60 54 57.0 -8.5 8 0 0.20 0.0 0
2023-09-27 64 50 57.0 -8.1 8 0 0.00 0.0 0
2023-09-28 65 56 60.5 -4.2 4 0 0.38 0.0 0
2023-09-29 63 59 61.0 -3.3 4 0 5.48 0.0 0
2023-09-30 66 57 61.5 -2.4 3 0 0.04 0.0 0

56520 rows × 9 columns
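Incidentally, the conversion also works in the other direction: pandas objects have a to_xarray method. A quick sketch (an aside, not part of the lab's workflow):

# Round-trip the DataFrame back to an xarray.Dataset.
ds_roundtrip = df_cp.to_xarray()
print(type(ds_roundtrip))  # <class 'xarray.core.dataset.Dataset'>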

Some basics on pandas#

Basic usage and terminology#

The standard way of importing pandas is to abbreviate it as pd: import pandas as pd.

Typically, pd.Series objects are used to represent a single, 1-D variable.

Typically, pd.DataFrame objects are used to represent tabular data: either a single 2-D variable, or multiple 1-D variables that all share the same index.

In our case, the DataFrame is a collection of nine 1-D time series.

Each column in a pd.DataFrame object is itself a pd.Series, analogous to how each variable in an xr.Dataset is itself an xr.DataArray. (See the short sketch below.)
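To make the column-as-Series relationship concrete, here's a minimal sketch with toy data (not from the Central Park dataset):

# Two 1-D variables sharing the same index, bundled into a DataFrame.
s = pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"], name="var1")
df_toy = pd.DataFrame({"var1": s, "var2": s ** 2})
print(type(df_toy["var1"]))  # each column is itself a pd.Series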

Accessing variables#

For more, see the pandas doc page on indexing and selecting data.

Use square brackets and the name of the column to grab that variable:

print(type(df_cp["temp_max"]))
df_cp["temp_max"]
<class 'pandas.core.series.Series'>
time
1869-01-01    29
1869-01-02    27
1869-01-03    35
1869-01-04    37
1869-01-05    43
              ..
2023-09-26    60
2023-09-27    64
2023-09-28    65
2023-09-29    63
2023-09-30    66
Name: temp_max, Length: 56520, dtype: int64

To grab an individual row, there are two ways: loc and iloc. Important: unlike normal methods and functions, loc and iloc don't use parentheses. Instead, they use square brackets. (That's because they're technically "accessors" rather than methods.)

loc is for label-based indexing. Give it the actual value of the index you want to grab. In this case, because the Index is a datetime object, you can specify the date as a string:

df_cp.loc["1989-04-22"]
temp_max         55.0
temp_min         36.0
temp_avg         45.5
temp_anom       -10.7
heat_deg_days    19.0
cool_deg_days     0.0
precip            0.0
snow_fall         0.0
snow_depth        0.0
Name: 1989-04-22 00:00:00, dtype: float64

You can also use slicing to grab multiple values:

df_cp.loc["1989-04-22":"1989-04-24"]
temp_max temp_min temp_avg temp_anom heat_deg_days cool_deg_days precip snow_fall snow_depth
time
1989-04-22 55 36 45.5 -10.7 19 0 0.0 0.0 0
1989-04-23 58 34 46.0 -10.5 19 0 0.0 0.0 0
1989-04-24 62 37 49.5 -7.4 15 0 0.0 0.0 0

Notice that it includes both endpoints.

You can also use iloc, which grabs rows by integer position: 0 is the first row of data, 1 the second row, and so on:

df_cp.iloc[0]
temp_max         29.00
temp_min         19.00
temp_avg         24.00
temp_anom       -11.20
heat_deg_days    41.00
cool_deg_days     0.00
precip            0.75
snow_fall         9.00
snow_depth        0.00
Name: 1869-01-01 00:00:00, dtype: float64

Just like with .loc, you can use slicing:

df_cp.iloc[1000:1002]
temp_max temp_min temp_avg temp_anom heat_deg_days cool_deg_days precip snow_fall snow_depth
time
1871-09-28 60 47 53.5 -11.2 11 0 0.0 0.0 0
1871-09-29 56 44 50.0 -14.3 15 0 0.0 0.0 0

Notice that, unlike .loc, it does not include the right endpoint. (This is the same behavior as when indexing lists, numpy arrays, and most other Python sequences.)
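To make the two conventions concrete, compare the row counts from the two slices above:

# loc slicing includes both endpoints; iloc follows Python's usual
# half-open convention and excludes the right endpoint.
print(len(df_cp.loc["1989-04-22":"1989-04-24"]))  # 3 rows
print(len(df_cp.iloc[1000:1002]))  # 2 rows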

Compute the linear regression#

Using scipy.stats.linregress#

help(scipy.stats.linregress)
Help on function linregress in module scipy.stats._stats_py:

linregress(
    x,
    y,
    alternative='two-sided',
    *,
    axis=0,
    nan_policy='propagate',
    keepdims=False
)
    Calculate a linear least-squares regression for two sets of measurements.

    Parameters
    ----------
    x, y : array_like
        Two sets of measurements.  Both arrays should have the same length N.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis. Default is 'two-sided'.
        The following options are available:

        * 'two-sided': the slope of the regression line is nonzero
        * 'less': the slope of the regression line is less than zero
        * 'greater':  the slope of the regression line is greater than zero

        .. versionadded:: 1.7.0
    axis : int or None, default: 0
        If an int, the axis of the input along which to compute the statistic.
        The statistic of each axis-slice (e.g. row) of the input will appear in a
        corresponding element of the output.
        If ``None``, the input will be raveled before computing the statistic.
    nan_policy : {'propagate', 'omit', 'raise'}
        Defines how to handle input NaNs.

        - ``propagate``: if a NaN is present in the axis slice (e.g. row) along
          which the  statistic is computed, the corresponding entry of the output
          will be NaN.
        - ``omit``: NaNs will be omitted when performing the calculation.
          If insufficient data remains in the axis slice along which the
          statistic is computed, the corresponding entry of the output will be
          NaN.
        - ``raise``: if a NaN is present, a ``ValueError`` will be raised.
    keepdims : bool, default: False
        If this is set to True, the axes which are reduced are left
        in the result as dimensions with size one. With this option,
        the result will broadcast correctly against the input array.

    Returns
    -------
    result : ``LinregressResult`` instance
        The return value is an object with the following attributes:

        slope : float
            Slope of the regression line.
        intercept : float
            Intercept of the regression line.
        rvalue : float
            The Pearson correlation coefficient. The square of ``rvalue``
            is equal to the coefficient of determination.
        pvalue : float
            The p-value for a hypothesis test whose null hypothesis is
            that the slope is zero, using Wald Test with t-distribution of
            the test statistic. See `alternative` above for alternative
            hypotheses.
        stderr : float
            Standard error of the estimated slope (gradient), under the
            assumption of residual normality.
        intercept_stderr : float
            Standard error of the estimated intercept, under the assumption
            of residual normality.

    See Also
    --------

    :func:`scipy.optimize.curve_fit`
        Use non-linear least squares to fit a function to data.
    :func:`scipy.optimize.leastsq`
        Minimize the sum of squares of a set of equations.


    Notes
    -----
    For compatibility with older versions of SciPy, the return value acts
    like a ``namedtuple`` of length 5, with fields ``slope``, ``intercept``,
    ``rvalue``, ``pvalue`` and ``stderr``, so one can continue to write::

        slope, intercept, r, p, se = linregress(x, y)

    With that style, however, the standard error of the intercept is not
    available.  To have access to all the computed values, including the
    standard error of the intercept, use the return value as an object
    with attributes, e.g.::

        result = linregress(x, y)
        print(result.intercept, result.intercept_stderr)

    Beginning in SciPy 1.9, ``np.matrix`` inputs (not recommended for new
    code) are converted to ``np.ndarray`` before the calculation is performed. In
    this case, the output will be a scalar or ``np.ndarray`` of appropriate shape
    rather than a 2D ``np.matrix``. Similarly, while masked elements of masked
    arrays are ignored, the output will be a scalar or ``np.ndarray`` rather than a
    masked array with ``mask=False``.

    Examples
    --------
    >>> import numpy as np
    >>> import matplotlib.pyplot as plt
    >>> from scipy import stats
    >>> rng = np.random.default_rng()

    Generate some data:

    >>> x = rng.random(10)
    >>> y = 1.6*x + rng.random(10)

    Perform the linear regression:

    >>> res = stats.linregress(x, y)

    Coefficient of determination (R-squared):

    >>> print(f"R-squared: {res.rvalue**2:.6f}")
    R-squared: 0.717533

    Plot the data along with the fitted line:

    >>> plt.plot(x, y, 'o', label='original data')
    >>> plt.plot(x, res.intercept + res.slope*x, 'r', label='fitted line')
    >>> plt.legend()
    >>> plt.show()

    Calculate 95% confidence interval on slope and intercept:

    >>> # Two-sided inverse Students t-distribution
    >>> # p - probability, df - degrees of freedom
    >>> from scipy.stats import t
    >>> tinv = lambda p, df: abs(t.ppf(p/2, df))

    >>> ts = tinv(0.05, len(x)-2)
    >>> print(f"slope (95%): {res.slope:.6f} +/- {ts*res.stderr:.6f}")
    slope (95%): 1.453392 +/- 0.743465
    >>> print(f"intercept (95%): {res.intercept:.6f}"
    ...       f" +/- {ts*res.intercept_stderr:.6f}")
    intercept (95%): 0.616950 +/- 0.544475

The inputs to scipy.stats.linregress can be pd.Series, xr.DataArray, numpy arrays, or even plain old lists:

scipy.stats.linregress(df_cp["snow_depth"], df_cp["temp_max"])
LinregressResult(slope=np.float64(-3.598971141084667), intercept=np.float64(62.157608768059596), rvalue=np.float64(-0.233427342412877), pvalue=np.float64(0.0), stderr=np.float64(0.06306184810935893), intercept_stderr=np.float64(0.07799165142915197))
scipy.stats.linregress(ds_cp["snow_depth"], ds_cp["temp_max"])
LinregressResult(slope=np.float64(-3.598971141084667), intercept=np.float64(62.157608768059596), rvalue=np.float64(-0.233427342412877), pvalue=np.float64(0.0), stderr=np.float64(0.06306184810935893), intercept_stderr=np.float64(0.07799165142915197))
scipy.stats.linregress(ds_cp["snow_depth"].values, ds_cp["temp_max"].values)
LinregressResult(slope=np.float64(-3.598971141084667), intercept=np.float64(62.157608768059596), rvalue=np.float64(-0.233427342412877), pvalue=np.float64(0.0), stderr=np.float64(0.06306184810935893), intercept_stderr=np.float64(0.07799165142915197))
scipy.stats.linregress(list(ds_cp["snow_depth"].values), list(ds_cp["temp_max"].values))
LinregressResult(slope=np.float64(-3.598971141084667), intercept=np.float64(62.157608768059596), rvalue=np.float64(-0.233427342412877), pvalue=np.float64(0.0), stderr=np.float64(0.06306184810935893), intercept_stderr=np.float64(0.07799165142915197))
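Beyond printing the full result, the individual attributes are handy for reporting and plotting. A hedged sketch building on the call above (the axis labels assume the station data's units of degrees Fahrenheit and inches):

result = scipy.stats.linregress(df_cp["snow_depth"], df_cp["temp_max"])
print(f"slope: {result.slope:.2f} deg F per inch of snow depth")
print(f"r squared: {result.rvalue ** 2:.3f}")

# Overlay the fitted line on a scatter plot of the raw data.
plt.scatter(df_cp["snow_depth"], df_cp["temp_max"], s=1, label="daily obs")
x_fit = np.array([df_cp["snow_depth"].min(), df_cp["snow_depth"].max()])
plt.plot(x_fit, result.intercept + result.slope * x_fit, "r", label="fitted line")
plt.xlabel("snow depth [in]")
plt.ylabel("daily max temperature [deg F]")
plt.legend()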

Performing linear regressions#