我尝试使用GridSearchCV和KMeans集群来探索要使用的集群的最佳数目,以便在分类问题上获得最佳结果。
我有以下代码:
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
faces = fetch_olivetti_faces()
X_data, y_data = faces.data, faces.target
log_reg = LogisticRegression()
split = StratifiedShuffleSplit(n_splits = 1, test_size=.2, random_state=42)
for train_index, test_index in split.split(X_train, y_train):
X_train_set , y_train_set = X_data[train_index,], y_data[train_index,]
X_test_set, y_test_set = X_data[test_index,], y_data[test_index, ]
pipeline = Pipeline([
('kmeans', KMeans(n_clusters = 30)),
('log_reg', LogisticRegression())
])
cluster_grid = dict(n_clusters=range(2,100))
grid = GridSearchCV(pipeline, cluster_grid)
grid.fit(X_train_set, y_train_set, cv=5, verbose=2)以下是所有的回溯:
-------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-42-80e6a3932897> in <module>
----> 1 grid.fit(X_train_set, y_train_set, cv=5, verbose=2)
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
686 return results
687
--> 688 self._run_search(evaluate_candidates)
689
690 # For multi-metric evaluation, store the best_index_, best_params_ and
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
1147 def _run_search(self, evaluate_candidates):
1148 """Search all candidates in param_grid"""
-> 1149 evaluate_candidates(ParameterGrid(self.param_grid))
1150
1151
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params)
665 for parameters, (train, test)
666 in product(candidate_params,
--> 667 cv.split(X, y, groups)))
668
669 if len(out) < 1:
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
919 # remaining jobs.
920 self._iterating = False
--> 921 if self.dispatch_one_batch(iterator):
922 self._iterating = self._original_iterator is not None
923
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
501 train_scores = {}
502 if parameters is not None:
--> 503 estimator.set_params(**parameters)
504
505 start_time = time.time()
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in set_params(self, **kwargs)
162 self
163 """
--> 164 self._set_params('steps', **kwargs)
165 return self
166
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/metaestimators.py in _set_params(self, attr, **params)
48 self._replace_estimator(attr, name, params.pop(name))
49 # 3. Step parameters and other initialisation arguments
---> 50 super().set_params(**params)
51 return self
52
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/base.py in set_params(self, **params)
222 'Check the list of available parameters '
223 'with `estimator.get_params().keys()`.' %
--> 224 (key, self))
225
226 if delim:
ValueError: Invalid parameter n_clusters for estimator Pipeline(memory=None,
steps=[('kmeans',
KMeans(algorithm='auto', copy_x=True, init='k-means++',
max_iter=300, n_clusters=30, n_init=10, n_jobs=None,
precompute_distances='auto', random_state=None,
tol=0.0001, verbose=0)),
('log_reg',
LogisticRegression(C=1.0, class_weight=None, dual=False,
fit_intercept=True, intercept_scaling=1,
l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None,
penalty='l2', random_state=None,
solver='warn', tol=0.0001, verbose=0,
warm_start=False))],
verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.
我不知道到底是怎么回事.我不知道如何解释这个错误信息,但我的参数网格似乎并没有失控。请帮帮忙!
发布于 2020-02-26 04:01:13
当您使用pipeline时,您需要给出如下参数:
cluster_grid = {
'kmeans__n_clusters': range(2,100)
}
# adding n_jobs to run in parallel
grid = GridSearchCV(pipeline, cluster_grid, n_jobs=-1)kmeans是从('kmeans', KMeans())中提取的
因此,您的代码应该如下所示:
pipeline = Pipeline([
('kmeans', KMeans(),
('log_reg', LogisticRegression())
])
cluster_grid = {
'kmeans__n_clusters': range(2,100)
}
# adding n_jobs to run in parallel
grid = GridSearchCV(pipeline, cluster_grid, n_jobs=-1)发布于 2020-02-26 04:00:46
参数n_clusters只适用于KMeans而不适用于LogisticRegression。
在cluster_grid中指定网格参数仅用于KMeans
# Parameters of pipelines can be set using ‘__’ separated parameter names:
cluster_grid = dict(kmeans__n_clusters=range(2,100))参考资料:https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html
https://stackoverflow.com/questions/60405635
复制相似问题