
我有四种数据。每一项都经过了以下预处理:
x1,y1=Standardisation
x2,y2=Normalisation
x3,y3=Rescale
一个是完全未处理的(x,y)。
我对每一种情况都进行了逻辑回归:
#Building Logistic Regression model on the UNPROCESSED DATA
from sklearn.metrics import accuracy_score
lr_model = LogisticRegression()
lr_model.fit(x_train,y_train)
lr_predict = lr_model.predict(x_test)
print('Logistic Regression - ',accuracy_score(lr_predict,y_test))#Building Logistic Regression model on the NORMALISED DATA
from sklearn.linear_model import LogisticRegression
lr_norm = LogisticRegression()
lr_norm.fit(x1_train, y1_train)
y_pred = lr_norm.predict(x1_test)
print("Accuracy of logistic regression on test set with Rescaled features: {:.2f}".format(lr_norm.score(x1_test, y1_test)))等等..。
我想做一个图表,不确定哪一个,通过它的准确性得分来代表性能,或者其他什么。但在我想测试的其他型号中,以下是:
svm_model = SVC(kernel='linear')
svm_model.fit(x_train,y_train)
svc_predict = svm_model.predict(x_test)
print('SVM - ',accuracy_score(svc_predict,y_test))
print('\t\t\t\tTRAIN DATA\n')
print(classification_report(y_train, svm_model.predict(x_train), target_names=encoder.inverse_transform([0,1,2])))
print('\n')
print('\t\t\t\tTEST DATA\n')
print(classification_report(y_test, svm_model.predict(x_test), target_names=encoder.inverse_transform([0,1,2])))nb_model = GaussianNB()
nb_model.fit(x_train,y_train)
nb_predict = nb_model.predict(x_test)
print('Naive bayes - ',accuracy_score(nb_predict,y_test))dt_model = DecisionTreeClassifier(max_leaf_nodes=3)
dt_model.fit(x_train,y_train)
dt_predict = dt_model.predict(x_test)
print('Decision Tree - ',accuracy_score(dt_predict,y_test))rfc_model = RandomForestClassifier(max_depth=3)
rfc_model.fit(x_train,y_train)
rfc_predict = rfc_model.predict(x_test)
print('Random Forest - ',accuracy_score(rfc_predict,y_test))knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(x_train,y_train)
knn_predict = knn_model.predict(x_test)
print('knn - ',accuracy_score(knn_predict,y_test))希望这有意义..。
#preprare data
pre_processing=[('NOT PROCESSED', None)]
pre_processing.append(('RESCALED', MinMaxScaler(feature_range=(0, 1))))
pre_processing.append(('STANDARDIZED', StandardScaler()))
pre_processing.append(('NORMALIZED', Normalizer()))
# prepare models
models = []
models.append(( 'LR' , LogisticRegression(max_iter=10000)))
models.append(( 'LDA' , LinearDiscriminantAnalysis()))
models.append(( 'KNN' , KNeighborsClassifier()))
models.append(( 'CART' , DecisionTreeClassifier()))
models.append(( 'NB' , GaussianNB()))
models.append(( 'SVM' , SVC(probability=True)))
results = []
names = []
higher_acc=0
standard=0
best_model=''
for process in pre_processing:
globals()['df_'+process[0]] = pd.DataFrame(index=None, columns=None)
for algo in models:
estimators = [process,algo]
model = Pipeline(estimators)
ss = ShuffleSplit(n_splits=10, test_size=test_size, random_state=seed)
names.append(algo[0])
for scoring in performance_metrix:
cv_results = cross_val_score(model, X_train, Y_train, cv=ss, scoring=scoring)
globals()['df_'+process[0]].loc[algo[0],scoring]= '%s\u00B1%s'%(round(cv_results.mean()*100.0,2),round(cv_results.std()*100.0,2))
if performance_metrix.index(scoring)==0:
results.append(cv_results)
if cv_results.mean()*100.0 > higher_acc:
higher_acc=cv_results.mean()*100.0
standard=cv_results.std()*100.0
best_model=process[0], algo[0]
elif cv_results.mean()*100.0 == higher_acc:
if cv_results.std()*100.0 < standard:
higher_acc=cv_results.mean()*100.0
best_model=process[0], algo[0]
print('For %s data we produced:\n\n'%(process[0]),globals()['df_'+process[0]],'\n\n')
# boxplot algorithm comparison
fig = pyplot.figure()
fig.suptitle('Algorithms accuracy comparison for %s data'%(process[0]))
ax = fig.add_subplot(111)
pyplot.boxplot(results[:len(models)])
ax.set_xticklabels(names)
pyplot.show()
# Create a pipeline that standardizes the data then creates a model
print("The overall best performance was the one obtained with %s data, using %s algorithm. \nIt's Accuracy resulted to be %s with a standard deviation of %s" %(best_model[0],best_model[1],round(higher_acc,2),round(standard,2)))发布于 2022-04-14 17:27:11
datasets = {
"Unprocessed": (x_train, x_test, y_train, y_test),
"Standardisation": (x1_train, x1_test, y1_train, y1_test),
"Normalisation": (x2_train, x2_test, y2_train, y2_test),
"Rescale": (x3_train, x3_test, y3_train, y3_test),
}
models = {
"Logistic Regression": LogisticRegression(),
"Decision Tree": DecisionTreeClassifier(max_leaf_nodes=3),
"Random Forest": RandomForestClassifier(max_depth=3)
}
def evaluate_model(model, dataset):
x_train, x_test, y_train, y_test = data
model.fit(x_train, y_train)
pred = model.predict(x_test)
return accuracy_score(pred, y_test)
model_scores_for_datasets = {}
for dataset_name, dataset in datasets.items():
dataset_scores = {}
for model_name, model in models.items():
model_score = evaluate_model(model, dataset)
dataset_scores[model_name] = model_score
model_scores_for_datasets[dataset_name] = dataset_scores在这里,model_scores_for_datasets将包含每个模型的每个数据集的准确性结果,并且看起来如下所示:
{
"Unprocessed" :
{
"Logistic Regression" : 10,
"Decision Tree": 5,
"Random Forest": 20
},
"Standardisation" :
{
"Logistic Regression" : 10,
"Decision Tree": 5,
"Random Forest": 20
},
"Normalisation" :
{
"Logistic Regression" : 10,
"Decision Tree": 5,
"Random Forest": 20
},
"Rescale" :
{
"Logistic Regression" : 10,
"Decision Tree": 5,
"Random Forest": 20
},
}现在,您可以获得每个数据集的结果,并可以创建所需的绘图。沿着这条线的东西:
for dataset_name, scores in model_scores_for_datasets.items():
# For example:
# dataset_name will be "Unprocessed"
# scores will be a dict like so:
# {
# "Logistic Regression" : 10,
# "Decision Tree": 5,
# "Random Forest": 20
# }
generate_plot(dataset_name scores)当然,您需要计算出generate_plot函数。希望这对你有所帮助,并给你一些想法。
https://stackoverflow.com/questions/71874938
复制相似问题