xgboost # First XGBoost model for Pima Indians dataset from numpy import loadtxt from xgboost import XGBClassifier train_test_split(X, Y, test_size=test_size, random_state=seed) # fit model no training data model = XGBClassifier Accuracy: %.2f%%" % (accuracy * 100.0)) 或者每次插入一颗树,看看效果 from numpy import loadtxt from xgboost import XGBClassifier train_test_split(X, Y, test_size=test_size, random_state=seed) # fit model no training data model = XGBClassifier 'eta': 0.007, 如同学习率 'seed':1000, 'nthread':7, cpu 线程数 xgb1 = XGBClassifier( learning_rate =0.1, n_estimators
#imported libs import numpy as np import pandas as pd from xgboost import XGBClassifier import matplotlib.pyplot classification #Random Search xgb_pipeline = Pipeline([('scaler', StandardScaler()), ('classifier',XGBClassifier X_train,y_train) #OR #Grid Search xgb_pipeline = Pipeline([('scaler', StandardScaler()), ('classifier',XGBClassifier 让我们分析一下随机搜索的区块: #Random Search xgb_pipeline = Pipeline([('scaler', StandardScaler()), ('classifier',XGBClassifier 网格搜索优化 #Grid Search xgb_pipeline = Pipeline([('scaler', StandardScaler()), ('classifier',XGBClassifier
XGBoost模型可以使用包装类直接在scikit-learn框架中使用,XGBClassifier用于分类,XGBRegressor用于回归问题。 我们可以通过构造它并调用**model.fit()**函数来训练XGBoost 模型进行分类: model = XGBClassifier() model.fit(X_train, y_train) 然后可以通过在新数据上调用 我们可以将这些结合起来如下: # First XGBoost model for Pima Indians dataset from numpy import loadtxt from xgboost import XGBClassifier train_test_split(X, Y, test_size=test_size, random_state=seed) # fit model on training data model = XGBClassifier
svg scikit-learn 接口格式 from xgboost import XGBClassifier from sklearn.datasets import load_svmlight_file (**params) #bst = XGBClassifier() bst =XGBClassifier(max_depth=2, learning_rate=1, n_estimators=num_round , silent=True, objective='binary:logistic') bst.fit(X_train, y_train) XGBClassifier scikit-learn 中 cv 使用 做cross_validation主要用到下面 StratifiedKFold 函数 # 设置boosting迭代计算次数 num_round = 2 bst =XGBClassifier CV Accuracy: 93.68% (9.00%) GridSearchCV 搜索最优解 from sklearn.model_selection import GridSearchCV bst =XGBClassifier
基础应用 引入xgboost等包 from numpy import loadtxt from xgboost import XGBClassifier from sklearn.model_selection y_test = train_test_split(X, Y, test_size=test_size, random_state=seed) xgboost 有封装好的分类器和回归器,可以直接用 XGBClassifier model = XGBClassifier() model.fit(X_train, y_train) xgboost 的结果是每个样本属于第一类的概率,需要用 round 将其转换为 0 1 值 监控模型表现 xgboost可以在模型训练时,评价模型在测试集上的表现,也可以输出每一步的分数,只需要将 model = XGBClassifier() model.fit(X_train, y_train ) 变为: model = XGBClassifier() eval_set = [(X_test, y_test)] model.fit(X_train, y_train, early_stopping_rounds
data_test.iloc[:, [i for i in range(data_test.shape[1]-1)]]) test_y = np.array(data_test['Species']) •训练模型 clf=XGBClassifier n_estimators=50) clf.fit(train_x, train_y) •测试 print(clf.score(test_x, test_y)) •完整代码 from xgboost import XGBClassifier test_x, test_y def XGBoost(): train_x, train_y, test_x, test_y = load_data() #训练 clf=XGBClassifier
基础应用 引入 xgboost 等包 from numpy import loadtxt from xgboost import XGBClassifier from sklearn.model_selection y_test = train_test_split(X, Y, test_size=test_size, random_state=seed) xgboost 有封装好的分类器和回归器,可以直接用 XGBClassifier model = XGBClassifier() model.fit(X_train, y_train) xgboost 的结果是每个样本属于第一类的概率,需要用 round 将其转换为 0 1 值 y_pred 监控模型表现 xgboost 可以在模型训练时,评价模型在测试集上的表现,也可以输出每一步的分数 只需要将 model = XGBClassifier() model.fit(X_train, y_train ) 变为: model = XGBClassifier() eval_set = [(X_test, y_test)] model.fit(X_train, y_train, early_stopping_rounds
m2cgen目前支持的模型还蛮多的,常用常见的都包括了: 使用方法 m2cgen的安装非常方便,直接pip: pip install m2cgen 使用,先用XGBClassifier训练一个模型 numpy as np import os re from random import sample from sklearn import datasets from xgboost import XGBClassifier train_test_split(X, Y, test_size=test_size, random_state=seed) # fit model on training data model = XGBClassifier
from xgboost import XGBClassifier xgbc = XGBClassifier () xgbc.fit(x_train, y_train) XGBClassifier(base_score
pip install lime import pandas as pd from xgboost import XGBClassifier import shap import numpy as np X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) # 模型训练 model = XGBClassifier model.fit(X_train, y_train) score = model.score(X_test, y_test) score The use of label encoder in XGBClassifier
# plot feature importance using built-in function 2from numpy import loadtxt 3from xgboost import XGBClassifier data into X and y 9X = dataset[:,0:8] 10y = dataset[:,8] 11# fit model on training data 12model = XGBClassifier
m2cgen目前支持的模型还蛮多的,常用常见的都包括了: 使用方法 m2cgen的安装非常方便,直接pip: pip install m2cgen 使用,先用XGBClassifier训练一个模型 numpy as np import os re from random import sample from sklearn import datasets from xgboost import XGBClassifier train_test_split(X, Y, test_size=test_size, random_state=seed) # fit model on training data model = XGBClassifier
先不管 XGBClassifier 每个参数是什么,先用它的默认值跑跑看看结果如何。 训练模型 定义函数 fit 训练 XGBClassifier()。 def fit( X, y ): model = XGBClassifier() model.fit(X, y) return model 在训练集上调用函数 fit 并打印模型 接下来做三个实验: 只并行化 k-Fold 验证 (n_jobs = -1, nthread = 1) 只并行化 XGBClassifier (n_jobs = 1, nthread = -1) 两个都并行化 (n_jobs = -1, nthread = -1) # Single Thread XGBoost, Parallel Thread CV model = XGBClassifier(nthread
为了让整个链路留在 GPU 上,我们需要对 XGBClassifier 做一点小的封装,并结合 cuML 的指标计算。 sklearn.metrics import make_scorer from cuml.metrics import roc_auc_score from xgboost import XGBClassifier = StratifiedKFold(5, shuffle=True, random_state=0) # 封装 XGB 以适配 CuPy 预测 class cuXGBClassifier(XGBClassifier
import OneHotEncoder from sklearn.externals import joblib import numpy as np from xgboost.sklearn import XGBClassifier 'colsample_bytree': 0.8, 'reg_alpha': 0, 'reg_lambda': 1, 'learning_rate': 0.1} xgb = XGBClassifier
pre = 0.3pre1 + 0.3pre2 + 0.4pre3 5.4.2 投票 简单投票 from xgboost import XGBClassifier from sklearn.linear_model VotingClassifier clf1 = LogisticRegression(random_state=1) clf2 = RandomForestClassifier(random_state=1) clf3 = XGBClassifier 加权投票 在VotingClassifier中加入参数 voting='soft', weights=[2, 1, 1],weights用于调节基模型的权重 from xgboost import XGBClassifier VotingClassifier clf1 = LogisticRegression(random_state=1) clf2 = RandomForestClassifier(random_state=1) clf3 = XGBClassifier
# example of early stopping from numpy import loadtxt from xgboost import XGBClassifier from sklearn.model_selection train_test_split(X, Y, test_size=test_size, random_state=seed) # fit model on training data model = XGBClassifier
2.2 使用sklearn风格接口,使用原生参数 对于sklearn风格的接口,主要有2个类可以使用,一个是分类用的XGBClassifier,另一个是回归用的XGBRegressor。 其实就是使用XGBClassifier/XGBRegressor的**kwargs参数,把上面原生参数的params集合放进去,代码如下: sklearn_model_raw = xgb.XGBClassifier 具体的参数意义我们后面讲,我们看看分类的算法初始化,训练与调用的简单过程: sklearn_model_new = xgb.XGBClassifier(max_depth=5,learning_rate = 0.5, verbosity=1, objective='binary:logistic',random_state=1) 可以看到,参数定义直接放在了XGBClassifier的类参数里, {'max_depth': 4, 'n_estimators': 10} 接着尝试在上面搜索的基础上调learning_rate : sklearn_model_new2 = xgb.XGBClassifier
模型训练 使用 XGBClassifier 进行模型训练: # 初始化模型 model = xgb.XGBClassifier(use_label_encoder=False) # 训练模型 model.fit 解决方法: 尝试调低 max_depth 参数,或者增加并行线程数: model = xgb.XGBClassifier(use_label_encoder=False, max_depth=3, n_jobs
import accuracy_score import pandas as pd import numpy as np import warnings from xgboost.sklearn import XGBClassifier 3、 采用交叉验证方法对数据进行训练和验证: # xgboost from xgboost import XGBClassifier xgbc_model=XGBClassifier() # 随机森林 4、 对模型进行性能评估 # 性能评估以XGboost为例 xgb = xgb.XGBClassifier() # 对训练集训练模型 xgb.fit(X_train,y_train) # 对测试集进行预测 grid_search, datasets from sklearn import grid_search gsearch = grid_search.GridSearchCV( estimator = XGBClassifier 'colsample_bytree':[i/10.0 for i in range(6,10)] } gsearch = grid_search.GridSearchCV( estimator = XGBClassifier