statsmodels
import statsmodels.api as sm
from patsy import dmatrices
# Build the response (y) and design matrix (X) from a formula, using only the
# rows of one year; inside query(), `@i` refers to the Python variable `i`.
y, X = dmatrices(
    'life_expectancy ~ log_gdp_pcap',
    data=df_cleaned.query("year == @i"),
    return_type='dataframe',
)
res = sm.OLS(y, X).fit()
# Access the coefficient and p-value of log_gdp_pcap together as one tuple.
# (A trailing comma split across two lines would evaluate them as two
# separate, discarded expressions.)
res.params.log_gdp_pcap, res.pvalues.log_gdp_pcap
Stepwise Regression
# Forward stepwise selection by AIC: starting from no features, repeatedly add
# the single remaining column of X whose inclusion gives the lowest AIC, as
# long as that AIC is strictly lower than the best one seen so far.
features_chosen = []
remaining_features = list(X.columns.values)
best_AIC = np.inf
while remaining_features:
    add_feature = False
    # Fit one model per unused feature, each added to the features chosen so far.
    for candidate in remaining_features:
        X_with_ones = sm.add_constant(X[[candidate] + features_chosen])
        model = sm.OLS(endog=forratedat['emp'], exog=X_with_ones).fit()
        this_AIC = model.aic
        if this_AIC < best_AIC:
            add_feature = True
            best_AIC = this_AIC
            best_feature = candidate
    if not add_feature:
        # No candidate lowered the AIC. Another pass would refit exactly the
        # same models with the same outcome, so stop here.
        break
    features_chosen.append(best_feature)
    remaining_features.remove(best_feature)
Lasso
Pipeline
Lasso regression requires standardizing the features before fitting the model.
# Chain feature standardization with a cross-validated Lasso (cv=5), then fit
# the whole pipeline on the training data.
lasso_pip = make_pipeline(StandardScaler(), LassoCV(cv=5))
lasso_pip.fit(X_in, y_in)
To access the Lasso parameters, pull the LassoCV step out of the pipeline,
separately from the StandardScaler() step. Use the pipeline's named_steps[]
attribute.
# lasso_pip.named_steps['lassocv'].alpha_
# lasso_pip.named_steps['lassocv'].coef_
# lasso_pip.named_steps['lassocv'].mse_path_
# lasso_pip.named_steps['lassocv'].alphas_
Plot of the mean square error on each fold
# Plot the cross-validation error of the fitted LassoCV step against alpha:
# one dotted curve per column of mse_path_, their mean, and the selected alpha.
lasso = lasso_pip.named_steps['lassocv']
# Hard-coded y-axis window; adjust to the MSE range of the data being plotted.
ymin, ymax = 2300, 3800

# semilogx puts alpha on a logarithmic x-axis; the later calls reuse that axis.
plt.semilogx(lasso.alphas_, lasso.mse_path_, linestyle=":")
plt.plot(
    lasso.alphas_,
    lasso.mse_path_.mean(axis=-1),  # average over the last axis of mse_path_
    color="black",
    label="Average across the folds",
    linewidth=2,
)
# Vertical marker at the alpha chosen by cross-validation.
plt.axvline(lasso.alpha_, linestyle="--", color="black", label="alpha: CV estimate")

plt.ylim(ymin, ymax)
plt.xlabel(r"$\alpha$")
plt.ylabel("Mean square error")
plt.legend()
# Plain string literal: the title has no placeholders, so no f-prefix is needed.
# Assigning to `_` discards the returned Text object (e.g. to keep notebook
# output clean).
_ = plt.title("Mean square error on each fold: coordinate descent")