Preprocessing | Yiming Zhang

Split the data into training, validation, and test sets

# Hold out 20% of the samples as the test set, then split 20% of the remaining
# 80% (i.e. 16% of all samples) off as the validation set, leaving 64% for
# training. A fixed random_state makes both shuffles reproducible, so model
# comparisons are not confounded by a different partition on every run.
# NOTE(review): assumes train_test_split is scikit-learn's — confirm.
X_in, X_test, y_in, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_in, y_in, test_size=0.2, random_state=42)

Check for missing (NA) values

def df_NA_summary(df):
    """
    Summarise the missing values of a dataframe, column by column.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df``, in the original column order, with the
        fields 'Column Name', 'Missing' (count of NA entries) and 'Missing%'
        (that count as a percentage of the number of rows).
    """
    total_rows = len(df)
    # Count NA entries one column at a time, keeping the original column order.
    na_counts = ((name, df[name].isna().sum()) for name in df.columns)
    records = [
        (name, count, 100 * count / total_rows)
        for name, count in na_counts
    ]
    return pd.DataFrame.from_records(
        records, columns=['Column Name', 'Missing', 'Missing%']
    )

Outliers

# 96th-percentile values of columns "x14" and "x15".
# NOTE(review): presumably used as upper cut-offs when handling outliers;
# the code that consumes them is not visible here — confirm.
x14_bar, x15_bar = (df[column].quantile(q=0.96) for column in ("x14", "x15"))