import numpy as npimport pandas as pdimport sklearnimport matplotlib as mlpimport matplotlib.pyplot as pltimport seaborn as snsimport timeimport re, pip, conda
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import cross_validate, KFold, GridSearchCV

# Load the encoded House Price training set; the first CSV column is the index.
data = pd.read_csv(
    r"D:\Pythonwork\2021ML\PART 2 Ensembles\datasets\House Price\train_encode.csv",
    index_col=0,
)

# Every column except the last is a feature; the last column is the target.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

X.shape  # (1460, 80)
X.head()
y.describe()  # evaluation metric used below: RMSE

# Parameter space
param_grid_simple = {
    "criterion": ["squared_error", "poisson"],
    "n_estimators": [*range(20, 100, 5)],
    "max_depth": [*range(10, 25, 2)],
    # 1.0 (use all features) stands in for the former "auto" option, which
    # scikit-learn deprecated in 1.1 and removed in 1.3; for a random forest
    # regressor both mean "consider every feature at each split".
    "max_features": ["log2", "sqrt", 16, 32, 64, 1.0],
    # NOTE(review): np.arange(0, 5, 10) yields only [0], so this parameter is
    # effectively fixed at 0. Kept as-is because the recorded space size
    # (1536) and results below depend on it -- confirm whether a finer step
    # was intended.
    "min_impurity_decrease": [*np.arange(0, 5, 10)],
}

# Size of the parameter space, computed by hand
(
    2
    * len([*range(20, 100, 5)])
    * len([*range(10, 25, 2)])
    * len(["log2", "sqrt", 16, 32, 64, 1.0])
    * len([*np.arange(0, 5, 10)])
)  # 1536

# The same size computed with a loop over the grid
no_option = 1
for candidates in param_grid_simple.values():
    no_option *= len(candidates)
no_option  # 1536

# Model, cross-validation splitter, grid search
reg = RFR(random_state=1412, verbose=True, n_jobs=-1)
cv = KFold(n_splits=5, shuffle=True, random_state=1412)
search = GridSearchCV(
    estimator=reg,
    param_grid=param_grid_simple,
    scoring="neg_mean_squared_error",
    verbose=True,
    cv=cv,
    n_jobs=-1,
)

# =====【TIME WARNING: 7mins】=====
start = time.time()
search.fit(X, y)
print(time.time() - start)
# Recorded output of the run above:
#   Fitting 5 folds for each of 1536 candidates, totalling 7680 fits
#   381.6039867401123

381.6039 / 60  # 6.3600650000000005 (minutes)

search.best_estimator_
# RandomForestRegressor(max_depth=23, max_features=16, min_impurity_decrease=0,
#                       n_estimators=85, n_jobs=-1, random_state=1412,
#                       verbose=True)

# The scorer is negative MSE, so take the absolute value before the root.
abs(search.best_score_) ** 0.5  # 29179.698261599166

# Rebuild the model with the best parameters and check its performance
ad_reg = RFR(n_estimators=85, max_depth=23, max_features=16, random_state=1412)
cv = KFold(n_splits=5, shuffle=True, random_state=1412)
result_post_adjusted = cross_validate(
    ad_reg,
    X,
    y,
    cv=cv,
    scoring="neg_mean_squared_error",
    return_train_score=True,
    verbose=True,
    n_jobs=-1,
)


def RMSE(cvresult, key):
    """Return the mean RMSE over folds for ``cvresult[key]``.

    ``cvresult[key]`` is expected to be an array of (negative) MSE scores;
    the sign is dropped with ``abs`` before taking the square root.
    """
    return (abs(cvresult[key]) ** 0.5).mean()


RMSE(result_post_adjusted, "train_score")  # 11000.81099038192
RMSE(result_post_adjusted, "test_score")  # 28572.070208366855
| HPO方法 | 默认参数 | 网格搜索 |
|---|---|---|
| 搜索空间/全域空间 | – | 1536/1536 |
| 运行时间(分钟) | – | 6.36 |
| 搜索最优(RMSE) | 30571.266 | 29179.698 |
| 重建最优(RMSE) | – | 28572.070 |
# Packaged as functions for later use.


# Evaluation metric: RMSE
def RMSE(cvresult, key):
    """Return the mean RMSE over folds for ``cvresult[key]``.

    Args:
        cvresult: Mapping of score names to arrays of per-fold scores.
        key: Name of the score array to evaluate, e.g. ``"test_score"``.

    Returns:
        The mean of the square roots of the absolute per-fold scores. The
        absolute value is taken because the scores are used with the
        ``neg_mean_squared_error`` scorer below, which reports negative MSE.
    """
    return (abs(cvresult[key]) ** 0.5).mean()


# Size of a parameter space
def count_space(param):
    """Print the number of candidate combinations in the grid ``param``.

    Args:
        param: Mapping of parameter name to a sized collection of candidate
            values. An empty mapping prints 1 (the empty product).

    The count is printed, not returned. The loop reads ``param`` itself so the
    result reflects the grid passed in rather than a module-level grid.
    """
    no_option = 1
    for candidates in param.values():
        no_option *= len(candidates)
    print(no_option)


# Re-model on the best parameters to validate the search result
def rebuild_on_best_param(ad_reg):
    """Cross-validate ``ad_reg`` and print its train and test RMSE.

    Args:
        ad_reg: Estimator to evaluate, passed straight to ``cross_validate``.

    Uses the module-level ``X`` and ``y`` as data and a 5-fold shuffled
    ``KFold`` with ``random_state=1412``. Nothing is returned; the two RMSE
    values are printed with three decimals.
    """
    cv = KFold(n_splits=5, shuffle=True, random_state=1412)
    result_post_adjusted = cross_validate(
        ad_reg,
        X,
        y,
        cv=cv,
        scoring="neg_mean_squared_error",
        return_train_score=True,
        verbose=True,
        n_jobs=-1,
    )
    print("训练RMSE:{:.3f}".format(RMSE(result_post_adjusted, "train_score")))
    print("测试RMSE:{:.3f}".format(RMSE(result_post_adjusted, "test_score")))
# Two side-by-side scatter plots over a small (n_estimators, max_depth) space:
# the left panel shows every grid point, the right panel shows the same grid
# with a handful of points highlighted in red.
fig, [ax1, ax2] = plt.subplots(1, 2, dpi=300)

n_e_list = [*range(50, 350, 50)]
m_d_list = [*range(2, 7)]

# One row per (n_estimators, max_depth) combination.
comb = pd.DataFrame(
    [(n_estimators, max_depth) for n_estimators in n_e_list for max_depth in m_d_list]
)

# Left panel: the full grid. No cmap is passed: without per-point data in
# `c`, matplotlib ignores cmap and only emits a warning.
ax1.scatter(comb.iloc[:, 0], comb.iloc[:, 1])
ax1.set_xticks(n_e_list)
ax1.set_yticks(m_d_list)
ax1.set_xlabel("n_estimators")
ax1.set_ylabel("max_depth")
ax1.set_title("GridSearch")

# Right panel: the same grid as background ...
ax2.scatter(comb.iloc[:, 0], comb.iloc[:, 1])
# ... with the highlighted points drawn on top. "red" is a colour, not a
# colormap, so it is passed as `color` (as `cmap` it was silently ignored).
ax2.scatter(
    [50, 250, 200, 200, 300, 100, 150, 150],
    [4, 2, 6, 3, 2, 3, 2, 5],
    color="red",
    s=20,
    linewidths=5,
)
ax2.set_xticks(n_e_list)
ax2.set_yticks(m_d_list)
ax2.set_xlabel("n_estimators")
ax2.set_ylabel("max_depth")
# Trailing semicolon suppresses the echoed return value in a notebook cell.
ax2.set_title("RandomSearch");