# Stratified K-fold cross-validation of a LightGBM binary classifier.
# For every fold: fit with early stopping on the validation split, record
# train/validation accuracy, and collect the per-feature importances.
# Afterwards the importances are averaged over folds and sorted.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.1,
    'num_leaves': 16,
    # Deliberately huge: the fit below is cut short by early stopping.
    'n_estimators': 100000,
    'random_state': 123,
    'importance_type': 'gain',
}

metrics = []      # one [nfold, train accuracy, validation accuracy] row per fold
imp_folds = []    # one importance frame per fold, concatenated once after the loop
n_splits = 5
cv = list(
    StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123)
    .split(x_train, y_train)
)

for nfold, (idx_tr, idx_va) in enumerate(cv):
    # split() yields *positional* row indices, so select with .iloc.
    # Label-based .loc only happens to work when the index is 0..n-1.
    x_tr, y_tr = x_train.iloc[idx_tr], y_train.iloc[idx_tr]
    x_va, y_va = x_train.iloc[idx_va], y_train.iloc[idx_va]

    model = lgb.LGBMClassifier(**params)
    # NOTE(review): LightGBM 4.0 dropped the `early_stopping_rounds` and
    # `verbose` arguments of fit(); when upgrading, pass
    # callbacks=[lgb.early_stopping(100), lgb.log_evaluation(100)] instead.
    model.fit(
        x_tr,
        y_tr,
        eval_set=[(x_tr, y_tr), (x_va, y_va)],
        early_stopping_rounds=100,
        verbose=100,
    )

    y_tr_pred = model.predict(x_tr)
    y_va_pred = model.predict(x_va)
    metric_tr = accuracy_score(y_tr, y_tr_pred)
    metric_va = accuracy_score(y_va, y_va_pred)
    metrics.append([nfold, metric_tr, metric_va])

    _imp = pd.DataFrame({
        "col": x_train.columns,
        "imp": model.feature_importances_,
        "nfold": nfold,
    })
    imp_folds.append(_imp)

metrics = np.array(metrics)
print(metrics)

# Mean and standard deviation of each feature's importance across folds.
imp = pd.concat(imp_folds, axis=0, ignore_index=True)
imp = imp.groupby("col")['imp'].agg(['mean', 'std'])
imp.columns = ['imp', 'imp_std']
imp = imp.reset_index(drop=False)
# sort_values returns a new frame; assign it so `imp` really is sorted.
imp = imp.sort_values('imp', ascending=False, ignore_index=True)
# Bare expression kept so a notebook cell still displays the sorted table.
imp