我已经创建了第一个程序来训练和保存算法。
程序1
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor # import for Decision Tree Algorithm
import pickle
from sklearn.preprocessing import StandardScaler
import pandas as pd  # `pd` is used below but was missing from this program's imports

# Training script: load the price history, make a stratified train/test split,
# scale the independent variables and fit a decision tree regressor on them.

# Load the data into a pandas DataFrame.
SourceData = pd.read_excel("ASML Stock Predict.xlsx")

# Bucket the Nasdaq 100 level into 8 categories; the column is only used to
# stratify the split and is dropped again afterwards.
SourceData["Nasdaq Category"] = pd.cut(
    SourceData["Adj Close Nasdaq 100"],
    bins=[0., 4500, 5500, 6500, 7500, 8500, 9500, 10500, np.inf],
    labels=[1, 2, 3, 4, 5, 6, 7, 8],
)

# Split the data source into train and test subsets (1% test), stratified on
# the category column. n_splits=1, so the loop body runs exactly once.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.01, random_state=42)
for train_index, test_index in split.split(SourceData, SourceData["Nasdaq Category"]):
    strat_train_set = SourceData.loc[train_index]  # train rows, all original columns
    strat_test_set = SourceData.loc[test_index]  # test rows, all original columns

# Drop the helper category column now that the subsets exist. New frames are
# returned instead of dropping in place on slices of SourceData.
strat_train_set = strat_train_set.drop("Nasdaq Category", axis=1)
strat_test_set = strat_test_set.drop("Nasdaq Category", axis=1)

# Independent variables: everything except the date and the target column.
DataSource_train_independent = strat_train_set.drop(["Date", "Adj Close ASML"], axis=1)
# Dependent variable: the ASML adjusted close.
DataSource_train_dependent = strat_train_set["Adj Close ASML"].copy()

# Compute the per-column medians of the independent variables.
# NOTE: the imputer is only fitted here; its transform() is never applied, so
# blank values are not actually filled before scaling.
imputer = SimpleImputer(strategy="median")
imputer.fit(DataSource_train_independent)

# Scale the independent variables of the training set. The dependent variable
# is left unscaled.
sc_X = StandardScaler()
X = sc_X.fit_transform(DataSource_train_independent.values)
# The former `X_test = sc_X.transform(testdata.values)` line was removed:
# `testdata` is never defined in this program, so it raised NameError before
# the model could be saved.
# NOTE: sc_X holds the fitted scaling statistics but exists only in this
# process; it is not written to disk by this block.
y = DataSource_train_dependent

# Decision Tree Regressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(X, y)
filename = 'DecisionTree_TrainedModel.sav'
pickle.dump(tree_reg, open(filename, 'wb'))
程序2
from sklearn.tree import DecisionTreeRegressor # import for Decision Tree Algorithm
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor # import for Decision Tree Algorithm
import pandas as pd
# Prediction script: read the test workbook, scale it, load the pickled
# decision tree and predict the dependent variable.
testdata=pd.read_excel("ASML Test Stock Predict.xlsx") # Load the test data
# A brand-new StandardScaler is created here; nothing in this program calls
# fit() or fit_transform() on it.
sc_X = StandardScaler()
# transform() is invoked on the scaler created above, which has not been fitted.
X_test=sc_X.transform(testdata.values) # scale the independent variables for test data
# NOTE(review): `pickle` is not among the imports visible for this program —
# confirm it is imported before this line runs.
# The file handle opened here is never explicitly closed.
loaded_model = pickle.load(open('DecisionTree_TrainedModel.sav', 'rb'))
decision_predictions = loaded_model.predict(X_test) # Predict the value of dependent variable
print("The prediction by Decision Treemodel is " , decision_predictions )
因为我在程序1中已经调用了 "fit_transform" 并保存了模型,所以在第二个程序中加载模型之后,我只对自变量调用了 transform。
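运行第二个程序时,我收到如下错误消息:"sklearn.exceptions.NotFittedError: This StandardScaler instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."(即:这个 StandardScaler 实例尚未拟合,使用该估计器之前需要先用适当的参数调用 'fit'。)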
请给些建议。按照我的理解,对测试数据的自变量只需要做转换(transform),而不需要拟合(fit)。
发布于 2020-07-25 21:48:52
你还必须把已经拟合好的 StandardScaler 也用 pickle 保存下来:
# train and pickle
# Fit the scaler on the training features and train the tree on the scaled data.
sc = StandardScaler()
X = sc.fit_transform(DataSource_train_independent.values)
tree_reg = DecisionTreeRegressor()
tree_reg.fit(X, y)
# Persist BOTH fitted objects; `with` closes each file even if dump() raises.
with open('StandardScaler.pk', 'wb') as scaler_file:
    pickle.dump(sc, scaler_file)
with open('DecisionTree.pk', 'wb') as model_file:
    pickle.dump(tree_reg, model_file)

# load and predict
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('StandardScaler.pk', 'rb') as scaler_file:
    sc = pickle.load(scaler_file)
with open('DecisionTree.pk', 'rb') as model_file:
    model = pickle.load(model_file)
# Reuse the statistics learned during training: transform only, no fit.
X_test = sc.transform(testdata.values)
predictions = model.predict(X_test)
更好的方法是将所有步骤封装在单个管道(Pipeline)中。
from sklearn.pipeline import Pipeline  # Pipeline was used without being imported

# Chain scaling and the regressor so a single object carries both steps.
pipeline = Pipeline(steps=[
    ('sc', StandardScaler()),
    ('tree_reg', DecisionTreeRegressor()),
])
# fit() fits the scaler and then the tree on the scaled output.
# NOTE(review): X is presumably the raw (unscaled) training features here,
# since the pipeline does its own scaling — confirm against the caller.
pipeline.fit(X, y)
pipeline.predict(testdata.values)
https://stackoverflow.com/questions/63093773
复制相似问题