我试图对UCI成人数据集进行k均值聚类分析.
X = np.array(df.drop(['class'], 1).astype(int))
y = np.array(df['class'])
km = KMeans(n_clusters=3)
y_km=km.fit_predict(X)
plt.scatter(X[y_km==0,0], X[y_km==0,1], s=50, c='lightgreen', marker='s', label='cluster 1')
plt.scatter(km.cluster_centers_[:,0], km.cluster_centers_[:,1], s=250, marker='*', c='red', label='centroids')图表:

我肯定我错过了一些显而易见的东西,但我在这方面花了很长时间。任何帮助都将不胜感激。
数据集:https://archive.ics.uci.edu/ml/datasets/adult
我必须用k-方法,因为这是课程。
MCVE:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
features=["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain","capital-loss", "hours-per-week", "native-country", "class"]
df = pd.read_csv("/kaggle/input/data-adult/adult.data", names=features)
df['class'] = df["class"].apply(lambda x:0 if x==' <=50K' else 1)
labelEncoder = LabelEncoder()
labelEncoder.fit(df['sex'])
df['sex'] = labelEncoder.transform(df['sex'])
labelEncoder.fit(df['occupation'])
df['occupation'] = labelEncoder.transform(df['occupation'])
labelEncoder.fit(df['workclass'])
df['workclass'] = labelEncoder.transform(df['workclass'])
labelEncoder.fit(df['education'])
df['education'] = labelEncoder.transform(df['education'])
labelEncoder.fit(df['marital-status'])
df['marital-status'] = labelEncoder.transform(df['marital-status'])
labelEncoder.fit(df['relationship'])
df['relationship'] = labelEncoder.transform(df['relationship'])
labelEncoder.fit(df['race'])
df['race'] = labelEncoder.transform(df['race'])
labelEncoder.fit(df['native-country'])
df['native-country'] = labelEncoder.transform(df['native-country'])
X = np.array(df.drop(['class'], 1).astype(int))
y = np.array(df['class'])
km = KMeans(n_clusters=3)
y_km=km.fit_predict(X)
plt.scatter(X[y_km==0,0], X[y_km==0,1], s=50, c='lightgreen', label='cluster 1')
plt.scatter(km.cluster_centers_[:,0], km.cluster_centers_[:,1], s=250, marker='*', c='red', label='centroids')发布于 2022-09-05 08:00:07
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
features=["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain","capital-loss", "hours-per-week", "native-country", "class"]
df = pd.read_csv("/kaggle/input/data-adult/adult.data", names=features)
df['class'] = df["class"].apply(lambda x:0 if x==' <=50K' else 1)
labelEncoder = LabelEncoder()
labelEncoder.fit(df['sex'])
df['sex'] = labelEncoder.transform(df['sex'])
labelEncoder.fit(df['occupation'])
df['occupation'] = labelEncoder.transform(df['occupation'])
labelEncoder.fit(df['workclass'])
df['workclass'] = labelEncoder.transform(df['workclass'])
labelEncoder.fit(df['education'])
df['education'] = labelEncoder.transform(df['education'])
labelEncoder.fit(df['marital-status'])
df['marital-status'] = labelEncoder.transform(df['marital-status'])
labelEncoder.fit(df['relationship'])
df['relationship'] = labelEncoder.transform(df['relationship'])
labelEncoder.fit(df['race'])
df['race'] = labelEncoder.transform(df['race'])
labelEncoder.fit(df['native-country'])
df['native-country'] = labelEncoder.transform(df['native-country'])
X = np.array(df.drop(['class'], 1).astype(int))
y = np.array(df['class'])
#Apply normalization to your feature vectors
# fit scaler on training data
norm = MinMaxScaler().fit(X)
# transform training data
X_norm = norm.transform(X)
kmeans = KMeans(n_clusters=2, random_state=0).fit(X_norm)
#This will generate number of predicted labels for each class
kmeans_labels = kmeans.labels_
unique_labels, unique_counts = np.unique(kmeans_labels, return_counts=True)
dict(zip(unique_labels, unique_counts))
#accuracy measure of your model can be done by
accuracy = accuracy_score(y, kmeans_labels)
print("k means prediction accuracy:", accuracy)https://stackoverflow.com/questions/61612709
复制相似问题