Linear Models

statsmodels

import statsmodels.api as sm
from patsy import dmatrices


# build the design matrices from a formula; @i references the Python
# variable i (the year being modelled)
y, X = dmatrices('life_expectancy ~ log_gdp_pcap',
                 data=df_cleaned.query("year == @i"),
                 return_type='dataframe')
res = sm.OLS(y, X).fit()

# access the coefficient and p-value of the slope
res.params.log_gdp_pcap
res.pvalues.log_gdp_pcap
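
For a quick overview of the whole fit, the results object can also print a full regression table (standard statsmodels API):

res.summary()   # coefficients, standard errors, R-squared, AIC, and more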

Stepwise Regression

import itertools

import numpy as np

# forward stepwise selection: greedily add the feature that lowers the AIC the most
features_chosen = []
remaining_features = list(X.columns.values)
best_AIC = np.inf

for i in range(1, len(X.columns) + 1):
    add_feature = False

    # try each remaining feature on top of the ones already chosen
    for combo in itertools.combinations(remaining_features, 1):
        X_with_ones = sm.add_constant(
            X[list(combo) + features_chosen])
        model = sm.OLS(endog=forratedat['emp'],
                       exog=X_with_ones).fit()
        this_AIC = model.aic

        if this_AIC < best_AIC:
            add_feature = True
            best_AIC = this_AIC
            best_feature = combo[0]

    if add_feature:
        features_chosen.append(best_feature)
        remaining_features.remove(best_feature)
    else:
        # no remaining feature improves the AIC, so stop early
        break
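
After the loop ends, features_chosen holds the selected predictors. A minimal sketch of refitting the final model on just those columns, reusing the same response series as above:

X_final = sm.add_constant(X[features_chosen])
final_model = sm.OLS(endog=forratedat['emp'], exog=X_final).fit()
print(features_chosen, final_model.aic)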

Lasso

Pipeline

Because the Lasso penalty depends on the scale of the coefficients, the features should be standardized before fitting the model. Building a pipeline takes care of this in one step.

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV

lasso_pip = make_pipeline(StandardScaler(), LassoCV(cv=5)).fit(X_in, y_in)
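
Because the scaler is a pipeline step, data passed to predict is standardized with the training statistics automatically; X_new below is a hypothetical feature matrix with the same columns as X_in:

y_pred = lasso_pip.predict(X_new)   # X_new: hypothetical new observations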

To access the fitted Lasso parameters, pull the LassoCV step back out of the pipeline with the named_steps attribute (the key is the lowercased class name, 'lassocv').

lasso_pip.named_steps['lassocv'].alpha_      # penalty chosen by cross-validation
lasso_pip.named_steps['lassocv'].coef_       # fitted coefficients (on the standardized scale)
lasso_pip.named_steps['lassocv'].mse_path_   # MSE for each alpha on each fold
lasso_pip.named_steps['lassocv'].alphas_     # grid of alphas that was searched
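
A common follow-up is to check which features the Lasso penalty kept. Assuming X_in is a DataFrame, a small sketch matching nonzero coefficients back to column names:

import numpy as np

coefs = lasso_pip.named_steps['lassocv'].coef_
kept_features = X_in.columns[np.abs(coefs) > 0]   # columns with nonzero coefficients
print(kept_features)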

Plot the MSE on each fold

import matplotlib.pyplot as plt

lasso = lasso_pip.named_steps['lassocv']

# y-limits chosen by eye for this dataset
ymin, ymax = 2300, 3800

# one dotted line per fold, plus the average MSE across folds
plt.semilogx(lasso.alphas_, lasso.mse_path_, linestyle=":")
plt.plot(
    lasso.alphas_,
    lasso.mse_path_.mean(axis=-1),
    color="black",
    label="Average across the folds",
    linewidth=2,
)
plt.axvline(lasso.alpha_, linestyle="--", color="black", label="alpha: CV estimate")

plt.ylim(ymin, ymax)
plt.xlabel(r"$\alpha$")
plt.ylabel("Mean square error")
plt.legend()
_ = plt.title("Mean square error on each fold: coordinate descent")