Split the data into training, validation, and test sets
# Hold out about 20% of the samples as the test set; the remaining ~80% (X_in, y_in)
# is split again below.
# NOTE(review): train_test_split is presumably sklearn.model_selection.train_test_split
# (its import is not visible here) — confirm.
X_in, X_test, y_in, y_test = train_test_split(X, y, test_size=0.2)
# Split the remainder: about 20% of it (~16% of all samples) becomes the validation set,
# leaving ~64% of all samples for training.
# NOTE(review): no random_state is passed to either call, so the partitions presumably
# differ between runs unless the RNG is seeded elsewhere — confirm whether a fixed seed
# is wanted for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(X_in, y_in, test_size=0.2)
Check for missing (NA) values
def df_NA_summary(df: "pd.DataFrame") -> "pd.DataFrame":
    """
    Return the number and percentage of missing values in each column of the dataframe.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to inspect. It is not modified.

    Returns
    -------
    pd.DataFrame
        One row per column of ``df`` (in column order) with the columns
        ``'Column Name'``, ``'Missing'`` (count of NA cells) and
        ``'Missing%'`` (``100 * Missing / len(df)``). For a dataframe with
        no rows the percentage is NaN, since 0/0 is undefined.
    """
    n_rows = len(df)

    # Count NA cells for every column in one vectorized pass. Unlike indexing
    # with df[col] inside a loop, this also works when column labels repeat
    # (df[col] would then return a DataFrame instead of a Series).
    missing_counts = df.isna().sum()

    # Series division yields NaN for 0/0 instead of raising or warning, so a
    # zero-row dataframe is handled quietly.
    missing_percentage = 100 * missing_counts / n_rows

    summary = pd.DataFrame(
        {
            'Column Name': list(df.columns),
            'Missing': missing_counts.to_numpy(),
            'Missing%': missing_percentage.to_numpy(),
        }
    )
    return summary
Outliers
# 96th-percentile (0.96 quantile) values of columns "x14" and "x15" of the module-level df.
# NOTE(review): presumably these serve as upper cutoffs for outlier handling; the code
# that applies them is not visible here — confirm.
x14_bar = df["x14"].quantile(0.96)
x15_bar = df["x15"].quantile(0.96)