Notes Lists
Basic Operations
Sort a dictionary by its values:

```python
sorted(d.items(), key=lambda x: x[1], reverse=True)
```

The line above returns a list of tuples (since we are sorting `d.items()`) and does not change the order of the dictionary itself.
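For example, with a small made-up dictionary:

```python
d = {'a': 3, 'b': 1, 'c': 2}
sorted(d.items(), key=lambda x: x[1], reverse=True)
# [('a', 3), ('c', 2), ('b', 1)]
```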
import
```python
# basics
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# linear models
import statsmodels.api as sm
from patsy import dmatrices

# machine learning
from sklearn import metrics
from sklearn import datasets
import lightgbm as lgb
from sklearn.model_selection import train_test_split, cross_val_score
```
Preprocessing
Split the data:

```python
X_in, X_test, y_in, y_test = train_test_split(X, y, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_in, y_in, test_size=0.2)
```

Check the NA values:

```python
def df_NA_summary(df):
    """
    Return the number and percentage of missing values
    in each column of the dataframe.
    """
    na_count = df.isna().sum()
    na_percent = 100 * na_count / len(df)
    return pd.DataFrame({'NA_count': na_count, 'NA_percent': na_percent})
```
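For example, with a tiny made-up dataframe (the toy data and the exact output layout are illustrative):

```python
df_toy = pd.DataFrame({'a': [1, None, 3], 'b': [None, None, 6]})
df_NA_summary(df_toy)
#    NA_count  NA_percent
# a         1   33.333333
# b         2   66.666667
```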
Matplotlib
Subplots with super labels:

```python
fig, axs = plt.subplots(2, 3, sharey=True, sharex=True, figsize=(12, 6))
# ravel() so that we have a one-dimensional array to loop over
axs = axs.ravel()
for i, j in enumerate(years):
    x = np.
```
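The loop body above is cut off; a minimal self-contained sketch of the same pattern with made-up data and figure-level "super" labels (`supxlabel`/`supylabel` require matplotlib >= 3.4):

```python
import numpy as np
import matplotlib.pyplot as plt

years = [2015, 2016, 2017, 2018, 2019, 2020]   # placeholder years
fig, axs = plt.subplots(2, 3, sharey=True, sharex=True, figsize=(12, 6))
axs = axs.ravel()   # flatten the 2x3 array of Axes so one index works in the loop

for i, year in enumerate(years):
    x = np.random.default_rng(year).normal(size=100)   # fake data for each panel
    axs[i].hist(x, bins=20)
    axs[i].set_title(str(year))

# one shared label for the whole figure instead of per-subplot labels
fig.supxlabel("value")
fig.supylabel("count")
plt.tight_layout()
plt.show()
```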
LightGBM
Training:

```python
param = {
    'learning_rate': 0.1,
    'max_depth': 5,
    'min_child_samples': 10,
    'n_estimators': 20,
    'num_leaves': 5,
    'reg_alpha': 0.1,
}
lgb_base = lgb.LGBMRegressor(**param).fit(X_train, y_train)
```

Bayes Optimization:

```python
def target(
    learning_rate,
    max_depth,
    min_child_samples,
    n_estimators,
    num_leaves,
    reg_alpha,
):
    param = {
        'learning_rate': learning_rate,
        'max_depth': int(max_depth),
        'min_child_samples': int(min_child_samples),
        'n_estimators': int(n_estimators),
        'num_leaves': int(num_leaves),
        'reg_alpha': reg_alpha,
    }
    model = lgb.
```
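The objective above is cut off at `model = lgb.`; one plausible completion, assuming the bayes_opt package and cross-validated negative MSE as the score to maximize (both are assumptions, as are the search ranges and the toy data standing in for the earlier split):

```python
import lightgbm as lgb
from bayes_opt import BayesianOptimization          # assumed optimizer package
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score

# toy stand-in for the X_train / y_train produced by the split earlier
X_train, y_train = make_regression(n_samples=500, n_features=10, noise=0.5, random_state=0)

def target(learning_rate, max_depth, min_child_samples,
           n_estimators, num_leaves, reg_alpha):
    param = {
        'learning_rate': learning_rate,
        'max_depth': int(max_depth),               # the optimizer proposes floats, so cast
        'min_child_samples': int(min_child_samples),
        'n_estimators': int(n_estimators),
        'num_leaves': int(num_leaves),
        'reg_alpha': reg_alpha,
    }
    model = lgb.LGBMRegressor(**param)
    # negative MSE: larger is better, which is what the optimizer expects
    return cross_val_score(model, X_train, y_train, cv=5,
                           scoring='neg_mean_squared_error').mean()

pbounds = {                                        # hypothetical search ranges
    'learning_rate': (0.01, 0.3),
    'max_depth': (3, 12),
    'min_child_samples': (5, 50),
    'n_estimators': (10, 100),
    'num_leaves': (5, 40),
    'reg_alpha': (0.0, 1.0),
}
optimizer = BayesianOptimization(f=target, pbounds=pbounds, random_state=42)
optimizer.maximize(init_points=5, n_iter=20)
print(optimizer.max)   # best score and the parameters that produced it
```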
Parameters Search
Grid Search:

```python
lgb_grid_search = lgb.LGBMRegressor()
param_grid1 = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 6, 9, 12],
    'min_child_samples': [5, 10, 20, 50],
    'n_estimators': [10, 20, 40, 50],
    'num_leaves': [5, 10, 20, 40],
    'reg_alpha': [0.
```
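The grid is cut off at `reg_alpha`; a sketch of how such a grid is typically run through scikit-learn's GridSearchCV (the reg_alpha values, cv, and scoring below are assumptions, and X_train / y_train come from the earlier split):

```python
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

param_grid1 = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 6, 9, 12],
    'min_child_samples': [5, 10, 20, 50],
    'n_estimators': [10, 20, 40, 50],
    'num_leaves': [5, 10, 20, 40],
    'reg_alpha': [0.0, 0.1, 0.5, 1.0],   # assumed; the original list is truncated
}

# note: the full grid is 4^6 = 4096 combinations per fold, so this is slow to run as-is
grid = GridSearchCV(
    estimator=lgb.LGBMRegressor(),
    param_grid=param_grid1,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)
```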
Cross Validation
Cross Validation:

```python
def cross_val_models(X, y, model_dict, cv=5, scoring='neg_mean_squared_error'):
    """
    For the scoring metrics, use the following reference:
    https://scikit-learn.org/stable/modules/model_evaluation.html
    """
    res = list()
    for name in model_dict.keys():
        scores = cross_val_score(model_dict[name], X, y, cv=cv, scoring=scoring)
        res.
```
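The loop is cut off at `res.`; one plausible way to finish the helper (summarising each model by the mean and standard deviation of its scores is my assumption) and to call it on the earlier training split:

```python
import pandas as pd
import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

def cross_val_models(X, y, model_dict, cv=5, scoring='neg_mean_squared_error'):
    """
    Cross-validate every model in model_dict and summarise the scores.
    For the scoring metrics, see
    https://scikit-learn.org/stable/modules/model_evaluation.html
    """
    res = list()
    for name in model_dict.keys():
        scores = cross_val_score(model_dict[name], X, y, cv=cv, scoring=scoring)
        res.append({'model': name, 'mean_score': scores.mean(), 'std_score': scores.std()})
    return pd.DataFrame(res)

# example usage with two candidate models (names are illustrative)
model_dict = {'lgbm': lgb.LGBMRegressor(), 'ols': LinearRegression()}
cross_val_models(X_train, y_train, model_dict)
```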
Linear Models
statsmodels:

```python
import statsmodels.api as sm
from patsy import dmatrices

y, X = dmatrices('life_expectancy ~ log_gdp_pcap',
                 data=df_cleaned.query("year == @i"),
                 return_type='dataframe')
res = sm.OLS(y, X).fit()
# access the coefficients and pvalues
res.params, res.pvalues
```
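A self-contained version of the same pattern, with made-up data standing in for `df_cleaned` and for the loop variable `i`:

```python
import numpy as np
import pandas as pd
import statsmodels.api as sm
from patsy import dmatrices

rng = np.random.default_rng(0)
df_cleaned = pd.DataFrame({
    'year': np.repeat([2000, 2005], 50),
    'log_gdp_pcap': rng.normal(9, 1, 100),
})
df_cleaned['life_expectancy'] = 40 + 3 * df_cleaned['log_gdp_pcap'] + rng.normal(0, 2, 100)

i = 2000   # the year currently being fit
y, X = dmatrices('life_expectancy ~ log_gdp_pcap',
                 data=df_cleaned.query("year == @i"),
                 return_type='dataframe')
res = sm.OLS(y, X).fit()

print(res.params['log_gdp_pcap'])    # slope estimate
print(res.pvalues['log_gdp_pcap'])   # its p-value
print(res.summary())                 # full regression table
```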
Model Evaluation
Classification ROC-AUC Plot:

```python
def roc_plot(y_test, y_pred_score, model_label="", ax=None):
    """
    Plot the ROC curve given true values and predicted scores.
    To draw on top of an existing figure, pass that figure's ax to this function.
    """
```
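The body is cut off after the docstring; a sketch of how such a helper is commonly written with sklearn.metrics (the exact styling choices are assumptions):

```python
import matplotlib.pyplot as plt
from sklearn import metrics

def roc_plot(y_test, y_pred_score, model_label="", ax=None):
    """
    Plot the ROC curve given true values and predicted scores.
    To draw on top of an existing figure, pass that figure's ax to this function.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(6, 6))
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_score)
    auc = metrics.roc_auc_score(y_test, y_pred_score)
    ax.plot(fpr, tpr, label=f"{model_label} (AUC = {auc:.3f})")
    ax.plot([0, 1], [0, 1], linestyle="--", color="grey")   # chance line
    ax.set_xlabel("False positive rate")
    ax.set_ylabel("True positive rate")
    ax.legend(loc="lower right")
    return ax
```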