Scikit-learn中的特征选择,XGBoost进行回归预测,模型优化
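当然不满意啦,一直想着怎么能提高准确率呢?后来就想到了可以利用一下scikit-learn这个库啊!在scikit-learn中包含了一个特征选择的模块sklearn.feature_selection,而在这个模块下面有以下几个方法:
Removing features with low variance(剔除低方差的特征)、Univariate feature selection(单变量特征选择)、Recursive feature elimination(递归功能消除)、Feature selection using SelectFromModel(使用SelectFromModel进行特征选择)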
我首先想到的是利用单变量特征选择的方法选出几个跟预测结果最相关的特征。根据官方文档,有以下几种得分函数来检验变量之间的依赖程度:对于回归问题有f_regression、mutual_info_regression;对于分类问题有chi2、f_classif、mutual_info_classif。
由于这个比赛是一个回归预测问题,所以我选择了f_regression这个得分函数(刚开始我没有注意,错误使用了分类问题中的得分函数chi2,导致程序一直报错!心很累~)
f_regression的参数:
sklearn.feature_selection.f_regression(X, y, center=True)
X:一个多维数组,大小为(n_samples, n_features),即行数为训练样本的大小,列数为特征的个数
y:一个一维数组,长度为训练样本的大小
返回值:各个特征的F值以及对应的p值
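不过在进行这个操作之前,我们还有一个重大的任务要完成,那就是对于空值的处理!幸运的是scikit-learn中也有专门的模块可以处理这个问题:
sklearn.preprocessing.Imputer的参数:
sklearn.preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=True)
其中strategy代表对于空值的填充策略(默认为mean,即取所在列的平均数进行填充),可选值有mean(平均数)、median(中位数)、most_frequent(众数)。
axis默认值为0:axis=0表示按列进行填充,axis=1表示按行进行填充。
其他具体参数可以参考scikit-learn官方文档中sklearn.preprocessing.Imputer的说明。(注意:Imputer在scikit-learn 0.22中已被移除,新版本请使用sklearn.impute.SimpleImputer。)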
根据以上,我对数据进行了一些处理:
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import Imputer
# Fill missing values in the candidate columns ('rw' through 'lb') with the
# mean of each column, so that f_regression receives a matrix without NaNs.
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
x_new = imputer.fit_transform(data.loc[:, 'rw':'lb'])

# Score the first ten imputed columns against the target column 'y'.
# Slicing the array replaces the former element-by-element copy loop.
F = f_regression(x_new[:, :10], data['y'].values)

# F holds two arrays, as the printed length shows; the article describes them
# as the per-feature F-values and p-values.
print(len(F))
print(F)
测试结果:
2
(array([2531.07587725, 1166.63303449, 2891.97789543, 2531.07587725,
2786.75491791, 2891.62686404, 3682.42649607, 1394.46743196,
531.08672792, 1166.63303449]), array([0.00000000e+000, 1.74675421e-242, 0.00000000e+000, 0.00000000e+000,
0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 1.37584507e-286,
1.15614152e-114, 1.74675421e-242]))
根据以上得到的结果,我选取了rw,st,lw,cf,cam,cm(选取F值相对大的)几个特征加入模型之中。以下是我改进后的代码:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : soccer_value.py
# @Author: Huangqinjian
# @Date : 2018/3/22
# @Desc :
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import numpy as np
from xgboost import plot_importance
from sklearn.preprocessing import Imputer
def loadDataset(filePath):
    """Read the CSV at *filePath* and return it as a pandas DataFrame."""
    return pd.read_csv(filePath)
def featureSet(data):
    """Build the training feature matrix and target vector from *data*.

    The feature row for every sample is, in order: club, league, potential,
    international_reputation, pac, sho, pas, dri, def, phy, skill_moves,
    followed by the position ratings rw, st, lw, cf, cam, cm.

    Missing position ratings are replaced by the mean of their column. This
    is the same mean imputation the article performs with
    ``Imputer(strategy='mean', axis=0)``; it is done with pandas here because
    ``sklearn.preprocessing.Imputer`` no longer exists in scikit-learn >= 0.22.
    The other columns are passed through unchanged (NaNs included).

    Args:
        data: DataFrame containing the columns listed above plus ``y``.

    Returns:
        A tuple ``(XList, yList)``: ``XList`` is a list of per-sample feature
        lists and ``yList`` is the ``y`` column as a NumPy array.
    """
    base_columns = ['club', 'league', 'potential', 'international_reputation',
                    'pac', 'sho', 'pas', 'dri', 'def', 'phy', 'skill_moves']
    position_columns = ['rw', 'st', 'lw', 'cf', 'cam', 'cm']

    # fillna with a Series fills each column named in the Series with that
    # column's value, so only the position ratings are imputed. The result is
    # a new frame; *data* itself is not modified.
    features = data[base_columns + position_columns].fillna(
        data[position_columns].mean())

    # Column selection replaces the former per-row ``data.iloc[row][col]``
    # loop, which built a full row Series for every single cell access.
    XList = features.values.tolist()
    yList = data['y'].values
    return XList, yList
def loadTestData(filePath):
    """Read the test CSV at *filePath* and return its feature matrix.

    The feature row for every sample is, in order: club, league, potential,
    international_reputation, pac, sho, pas, dri, def, phy, skill_moves,
    followed by the position ratings rw, st, lw, cf, cam, cm.

    Missing position ratings are replaced by the mean of their column. This
    is the same mean imputation the article performs with
    ``Imputer(strategy='mean', axis=0)``; it is done with pandas here because
    ``sklearn.preprocessing.Imputer`` no longer exists in scikit-learn >= 0.22.

    Args:
        filePath: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        A list of per-sample feature lists.
    """
    base_columns = ['club', 'league', 'potential', 'international_reputation',
                    'pac', 'sho', 'pas', 'dri', 'def', 'phy', 'skill_moves']
    position_columns = ['rw', 'st', 'lw', 'cf', 'cam', 'cm']

    data = pd.read_csv(filepath_or_buffer=filePath)

    # NOTE(review): the fill values are the means of the test file itself,
    # not of the training data - kept as in the original; confirm intended.
    features = data[base_columns + position_columns].fillna(
        data[position_columns].mean())

    # Column selection replaces the former per-row ``data.iloc[row][col]``
    # loop, which built a full row Series for every single cell access.
    return features.values.tolist()
def trainandTest(X_train, y_train, X_test):
    """Fit an XGBoost regressor and write its test predictions to submit.csv.

    Args:
        X_train: Training feature rows.
        y_train: Training targets, one per row of ``X_train``.
        X_test: Test feature rows to predict.

    Side effects:
        Writes ``submit.csv`` (columns ``id`` and ``y``) to the current
        working directory. Ids are consecutive integers starting at 10441,
        one per prediction.
    """
    # XGBoost training
    model = xgb.XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=160, silent=False, objective='reg:gamma')
    model.fit(X_train, y_train)

    # Predict on the test set
    ans = model.predict(X_test)

    # Build the columns separately so ``id`` stays an integer column. Packing
    # [id, prediction] pairs into one NumPy array promoted the ids to float
    # and produced values such as "10441.0" in the CSV. Sizing the id range
    # from the prediction count also avoids an IndexError when there are more
    # than 7000 predictions.
    first_id = 10441
    pd_data = pd.DataFrame(
        {
            'id': np.arange(first_id, first_id + len(ans)),
            # float64, as before, so the written prediction text is unchanged
            'y': np.asarray(ans, dtype=float),
        },
        columns=['id', 'y'],
    )

    # Write the submission file
    pd_data.to_csv('submit.csv', index=False)

    # To display feature importance, call plot_importance(model) followed by
    # plt.show().
if __name__ == '__main__':
    # Script entry point: build the training matrix from the training CSV,
    # build the test matrix from the test CSV, then train and predict.
    train_frame = loadDataset('dataset/soccer/train.csv')
    train_features, train_targets = featureSet(train_frame)
    test_features = loadTestData('dataset/soccer/test.csv')
    trainandTest(train_features, train_targets, test_features)
再次提交,这次MAE为 42.1227,排名16/28。虽然提升了不少,不过距离第一名还是有差距,仍需努力。
接下来,我们来处理一下work_rate_att和work_rate_def这两个字段:
由于这两个字段是标签,需要进行处理以后(标签标准化)才用到模型中。我们要用到的函数是sklearn.preprocessing.LabelEncoder:
# Turn the textual work-rate levels into integer labels with one encoder that
# is fitted on the three known levels and applied to both columns.
# NOTE(review): LabelEncoder presumably orders its classes alphabetically
# (High, Low, Medium), so the codes do not follow Low < Medium < High - confirm.
le = preprocessing.LabelEncoder().fit(['Low', 'Medium', 'High'])
att_label, def_label = (
    le.transform(data[column].values)
    for column in ('work_rate_att', 'work_rate_def')
)
当然你也可以使用pandas直接来处理离散型特征变量,具体内容可以参考:pandas使用get_dummies进行one-hot编码。顺带提一句,scikit-learn中也有一个方法可以来处理,可参考:sklearn.preprocessing.OneHotEncoder。
调整后的代码:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : soccer_value.py
# @Author: Huangqinjian
# @Date : 2018/3/22
# @Desc :
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn import preprocessing
import numpy as np
from xgboost import plot_importance
from sklearn.preprocessing import Imputer
from sklearn.cross_validation import train_test_split
def featureSet(data):
    """Build the training feature matrix and target vector from *data*.

    The feature row for every sample is, in order: club, league, potential,
    international_reputation, pac, sho, pas, dri, def, phy, skill_moves,
    the position ratings rw, st, lw, cf, cam, cm, and finally the integer
    codes of work_rate_att and work_rate_def.

    Missing position ratings are replaced by the mean of their column. This
    is the same mean imputation the article performs with
    ``Imputer(strategy='mean', axis=0)``; it is done with pandas here because
    ``sklearn.preprocessing.Imputer`` no longer exists in scikit-learn >= 0.22.

    The work-rate labels are coded High=0, Low=1, Medium=2, which is the
    alphabetical class order a LabelEncoder fitted on these three labels
    yields. The scikit-learn documentation reserves LabelEncoder for target
    values, so a pandas Categorical is used for these feature columns.

    Args:
        data: DataFrame containing the columns listed above plus ``y``.

    Returns:
        A tuple ``(XList, yList)``: ``XList`` is a list of per-sample feature
        lists and ``yList`` is the ``y`` column as a NumPy array.

    Raises:
        ValueError: If a work-rate column holds anything other than
            'Low', 'Medium' or 'High' (including missing values).
    """
    base_columns = ['club', 'league', 'potential', 'international_reputation',
                    'pac', 'sho', 'pas', 'dri', 'def', 'phy', 'skill_moves']
    position_columns = ['rw', 'st', 'lw', 'cf', 'cam', 'cm']
    work_rate_columns = ['work_rate_att', 'work_rate_def']
    work_rate_classes = ['High', 'Low', 'Medium']

    # fillna with a Series fills each column named in the Series with that
    # column's value, so only the position ratings are imputed. The result is
    # a new frame; *data* itself is not modified.
    features = data[base_columns + position_columns].fillna(
        data[position_columns].mean())

    for column in work_rate_columns:
        codes = pd.Categorical(data[column], categories=work_rate_classes).codes
        # Categorical marks values outside `categories` with the code -1.
        if (codes < 0).any():
            raise ValueError(
                "column %r contains labels other than %r" % (column, work_rate_classes))
        features[column] = codes

    # Column selection replaces the former per-row ``data.iloc[row][col]``
    # loop, which built a full row Series for every single cell access.
    XList = features.values.tolist()
    yList = data['y'].values
    return XList, yList
def loadTestData(filePath):
    """Read the test CSV at *filePath* and return its feature matrix.

    The feature row for every sample is, in order: club, league, potential,
    international_reputation, pac, sho, pas, dri, def, phy, skill_moves,
    the position ratings rw, st, lw, cf, cam, cm, and finally the integer
    codes of work_rate_att and work_rate_def.

    Missing position ratings are replaced by the mean of their column. This
    is the same mean imputation the article performs with
    ``Imputer(strategy='mean', axis=0)``; it is done with pandas here because
    ``sklearn.preprocessing.Imputer`` no longer exists in scikit-learn >= 0.22.

    The work-rate labels are coded High=0, Low=1, Medium=2, which is the
    alphabetical class order a LabelEncoder fitted on these three labels
    yields. The scikit-learn documentation reserves LabelEncoder for target
    values, so a pandas Categorical is used for these feature columns.

    Args:
        filePath: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        A list of per-sample feature lists.

    Raises:
        ValueError: If a work-rate column holds anything other than
            'Low', 'Medium' or 'High' (including missing values).
    """
    base_columns = ['club', 'league', 'potential', 'international_reputation',
                    'pac', 'sho', 'pas', 'dri', 'def', 'phy', 'skill_moves']
    position_columns = ['rw', 'st', 'lw', 'cf', 'cam', 'cm']
    work_rate_columns = ['work_rate_att', 'work_rate_def']
    work_rate_classes = ['High', 'Low', 'Medium']

    data = pd.read_csv(filepath_or_buffer=filePath)

    # NOTE(review): the fill values are the means of the test file itself,
    # not of the training data - kept as in the original; confirm intended.
    features = data[base_columns + position_columns].fillna(
        data[position_columns].mean())

    for column in work_rate_columns:
        codes = pd.Categorical(data[column], categories=work_rate_classes).codes
        # Categorical marks values outside `categories` with the code -1.
        if (codes < 0).any():
            raise ValueError(
                "column %r contains labels other than %r" % (column, work_rate_classes))
        features[column] = codes

    # Column selection replaces the former per-row ``data.iloc[row][col]``
    # loop, which built a full row Series for every single cell access.
    return features.values.tolist()
def trainandTest(X_train, y_train, X_test):
    """Fit an XGBoost regressor and write its test predictions to submit.csv.

    Args:
        X_train: Training feature rows.
        y_train: Training targets, one per row of ``X_train``.
        X_test: Test feature rows to predict.

    Side effects:
        Writes ``submit.csv`` (columns ``id`` and ``y``) to the current
        working directory. Ids are consecutive integers starting at 10441,
        one per prediction.
    """
    # XGBoost training
    model = xgb.XGBRegressor(max_depth=6, learning_rate=0.05, n_estimators=500, silent=False, objective='reg:gamma')
    model.fit(X_train, y_train)

    # Predict on the test set
    ans = model.predict(X_test)

    # Build the columns separately so ``id`` stays an integer column. Packing
    # [id, prediction] pairs into one NumPy array promoted the ids to float
    # and produced values such as "10441.0" in the CSV. Sizing the id range
    # from the prediction count also avoids an IndexError when there are more
    # than 7000 predictions.
    first_id = 10441
    pd_data = pd.DataFrame(
        {
            'id': np.arange(first_id, first_id + len(ans)),
            # float64, as before, so the written prediction text is unchanged
            'y': np.asarray(ans, dtype=float),
        },
        columns=['id', 'y'],
    )

    # Write the submission file
    pd_data.to_csv('submit.csv', index=False)

    # To display feature importance, call plot_importance(model) followed by
    # plt.show().
if __name__ == '__main__':
    # Script entry point: build the training matrix from the training CSV,
    # build the test matrix from the test CSV, then train and predict.
    train_frame = pd.read_csv('dataset/soccer/train.csv')
    train_features, train_targets = featureSet(train_frame)
    test_features = loadTestData('dataset/soccer/test.csv')
    trainandTest(train_features, train_targets, test_features)
这次只提高到了40.8686。暂时想不到提高的方法了,还请大神多多赐教!