Model Evaluation | Yiming Zhang

Classification

ROC-AUC Plot

def roc_plot(y_test, y_pred_score, model_label="", ax=None):
    """
    Plot the ROC curve given true values and predicted scores.

    Parameters
    ----------
    y_test : array-like
        True labels, forwarded to ``roc_curve``.
    y_pred_score : array-like
        Predicted scores, forwarded to ``roc_curve``.
    model_label : str, optional
        Prefix of the legend entry, which reads
        ``"<model_label> - AUC: <auc to 2 decimals>"``.
    ax : matplotlib axes, optional
        Existing axes to overlay the curve on. When omitted, a new 8x6
        figure is created and decorated with the diagonal reference line,
        axis labels and title.

    Returns
    -------
    The axes the curve was drawn on (the newly created one, or ``ax``
    itself), so calls can be chained to overlay several curves.
    """
    fpr, tpr, _ = roc_curve(y_test, y_pred_score)
    auc = metrics.auc(fpr, tpr)

    # Identity check: `ax == None` would invoke the object's __eq__, which is
    # ambiguous for array-like inputs such as an array of axes.
    is_new_axes = ax is None
    if is_new_axes:
        _, ax = plt.subplots(figsize=(8, 6))

    ax.plot(fpr, tpr, label=model_label + " - AUC: {:.2f}".format(auc))

    if is_new_axes:
        # Decorations are only added once, when the axes is first created.
        ax.plot([0, 1], [0, 1], "k--")
        ax.set(
            xlabel="False Positive Rate (FPR)",
            ylabel="True Positive Rate (TPR)",
            title="ROC curve",
        )

    # Rebuild the legend so it includes the curve that was just added.
    ax.legend(loc="lower right")
    return ax

# Overlay the train, validation and test ROC curves on a single axes: the
# first call creates the axes, the following calls draw onto it.
# NOTE(review): column 1 of predict_proba is presumably the positive class.
ax = roc_plot(y_train, lgb_base.predict_proba(X_train)[:, 1], model_label="Train")
roc_plot(y_val, lgb_base.predict_proba(X_val)[:, 1], model_label="Validation", ax=ax)
roc_plot(y_test, lgb_base.predict_proba(X_test)[:, 1], model_label="Test", ax=ax)

Confusion Matrix

def confusion_matrix_plot(y_test, y_pred, plot=True, prob=False, p=0.5):
    """
    Print the accuracy and, optionally, the confusion matrix of predictions.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, or predicted scores when ``prob`` is true.
    plot : bool, optional
        When true, the confusion matrix is printed after the accuracy.
    prob : bool, optional
        When true, ``y_pred`` is binarised first: values strictly greater
        than ``p`` become 1, everything else 0.
    p : float, optional
        Decision threshold used when ``prob`` is true.

    Returns
    -------
    None. The results are written to standard output.
    """
    import numpy as np
    from sklearn.metrics import accuracy_score, confusion_matrix

    if prob:
        # Honour the caller-supplied threshold; asarray lets plain lists work.
        y_pred = np.where(np.asarray(y_pred) > p, 1, 0)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))

    if plot:
        cm = confusion_matrix(y_test, y_pred)
        print(cm)

Regression Model

def resid_plot(y, y_pred, title=""):
    """
    Scatter the residuals ``y - y_pred`` against the predictions on a new figure.

    ``title`` is placed in front of the fixed " - Residual Plots" heading.
    """
    _, axes = plt.subplots()
    residuals = y - y_pred
    axes.scatter(y_pred, residuals)
    axes.set_xlabel("y hat")
    axes.set_ylabel("Residuals")
    heading = "{0} - Residual Plots".format(title)
    axes.set_title(heading)

def eval_reg_model(model, x_list, y_list,
                   dataLabel=("Train", "Val", "Test"),
                   residPlot=False):
    """
    Evaluate a fitted model with the MSE (and optional residual plots) on
    several pairs of X and y data.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(x)``.
    x_list : sequence
        Feature sets, one per entry of ``dataLabel``.
    y_list : sequence
        Targets matching ``x_list`` position by position.
    dataLabel : sequence of str, optional
        Name of each data set. Its length decides how many pairs are
        evaluated; extra entries in ``x_list`` / ``y_list`` are ignored.
    residPlot : bool, optional
        When true, a residual plot is drawn for every data set.

    Returns
    -------
    pandas.DataFrame
        One row per data set with the columns "Data Set" and "MSE".

    Raises
    ------
    IndexError
        If ``x_list`` or ``y_list`` holds fewer entries than ``dataLabel``.
    """
    labels = list(dataLabel)
    # Fail before any prediction or plotting instead of part-way through.
    if len(x_list) < len(labels) or len(y_list) < len(labels):
        raise IndexError(
            "x_list and y_list need at least {0} entries to match "
            "dataLabel".format(len(labels))
        )

    records = []
    # zip stops after the last label, so surplus data sets are skipped.
    for x, y, label in zip(x_list, y_list, labels):
        y_pred = model.predict(x)
        records.append((label, mean_squared_error(y, y_pred)))
        if residPlot:
            resid_plot(y, y_pred, label)
    return pd.DataFrame.from_records(records, columns=["Data Set", "MSE"])

Feature Importance

def feature_importance_plot(importantces, names):
    """
    Draw a horizontal bar chart of feature importances in ascending order.

    Parameters
    ----------
    importantces : array-like
        Importance value of each feature, e.g. ``rf2.feature_importances_``.
    names : array-like
        Feature names aligned with ``importantces``, e.g.
        ``rf2.feature_names_in_``.

    Returns
    -------
    None. The chart is drawn on a newly created figure.
    """
    scores = pd.Series(importantces, index=names)
    order = np.argsort(importantces)

    fig, ax = plt.subplots()
    # argsort yields positions, so select with .iloc: plain `series[ints]`
    # is label-based for integer names and deprecated for other indexes.
    scores.iloc[order].plot.barh(ax=ax)
    ax.set_title("Feature importances")
    fig.tight_layout()

# Chart the importances reported by the fitted model against its feature names.
feature_importance_plot(
    importantces=lgb_bo_insample.feature_importances_,
    names=lgb_bo_insample.feature_name_,
)