我需要建立一个MLP,对UNSW-NB15数据集中不同类型的网络攻击进行分类。当我查看相关矩阵时,很明显我哪里做错了。我在这个项目上完全卡住了,任何帮助都非常感谢。下面是我的代码,包括数据集的预处理和MLP:
import pandas as pd
import numpy as np
from tensorflow.keras.utils import get_file
pd.set_option('display.max_columns', 6)
pd.set_option('display.max_rows', 5)
#df1 = pd.read_csv(r'C:\Users\alexc\Downloads\UNSW-NB15_1.csv')
#df2 = pd.read_csv(r'C:\Users\alexc\Downloads\UNSW-NB15_2.csv')
#df3 = pd.read_csv(r'C:\Users\alexc\Downloads\UNSW-NB15_3.csv')
#df4 = pd.read_csv(r'C:\Users\alexc\Downloads\UNSW-NB15_4.csv')
#dfs = pd.concat([dffilled1, dffilled2, dffilled3, dffilled4],ignore_index = True, sort = False)
#dfs = dffilled1.append([dffilled2, dffilled3, dffilled4,], ignore_index=True)
# Load the four raw UNSW-NB15 CSV parts and stack them into one DataFrame.
# NOTE(review): the files are read with header=None, so columns are
# integer-labelled at this point; a dtype mapping keyed by the *names*
# 'attack_cat'/'ct_ftp_cmd' therefore cannot match any column here —
# confirm whether positional keys (or reading with a header) were intended.
PATH_TEMPLATE = './UNSW-NB15_{}.csv'  # there are 4 input csv files
dfs = [
    pd.read_csv(PATH_TEMPLATE.format(i),
                dtype={'attack_cat': str, 'ct_ftp_cmd': int},
                header=None)
    for i in range(1, 5)
]
all_data = pd.concat(dfs).reset_index(drop=True)  # concat all to a single df
# The features CSV lists the human-readable name of every column in the
# raw data files; use it to label our integer-headed DataFrame.
df_col = pd.read_csv('./NUSW-NB15_features.csv', encoding='ISO-8859-1')


def _canonical(name):
    # Trim surrounding whitespace, remove internal spaces, lower-case.
    return name.strip().replace(' ', '').lower()


df_col['Name'] = df_col['Name'].apply(_canonical)
all_data.columns = df_col['Name']

# Widen the pandas display so a preview shows every column.
pd.set_option('display.max_columns', 48)
pd.set_option('display.max_rows', 21)
all_data
# --- Clean-up ---------------------------------------------------------------
# Drop flows whose 'service' is the placeholder '-' (unknown service).
# Plain assignment instead of chained replace(..., inplace=True) avoids
# pandas' chained-assignment pitfalls; the bare display expressions from the
# notebook (no-ops in a script) are removed.
all_data['service'] = all_data['service'].replace('-', np.nan)
all_data = all_data[all_data['service'].notna()]

# Normalise the attack labels: strip stray whitespace and merge the
# inconsistent spelling 'Backdoors' into 'Backdoor'.
all_data['attack_cat'] = all_data['attack_cat'].str.strip()
all_data['attack_cat'] = all_data['attack_cat'].replace(['Backdoors'], 'Backdoor')

# Rows with no attack category are benign traffic.
all_data["attack_cat"] = all_data["attack_cat"].fillna('Normal')

# NOTE(review): is_ftp_login >= 2 rows are discarded — presumably because it
# should be a 0/1 flag and larger values are corrupt records; confirm.
all_data.drop(all_data[all_data['is_ftp_login'] >= 2.0].index, inplace=True)

# Host identifiers (IPs/ports) would let the model memorise endpoints, so
# they are excluded from the feature set.  NOTE(review): ct_ftp_cmd is
# dropped too — presumably due to its inconsistent encoding in the raw
# files; confirm.
all_data.drop(['srcip', 'sport', 'dstip', 'dsport', 'ct_ftp_cmd'],
              axis=1, inplace=True)
# Partition the column names into numeric and categorical sets.
num_col = all_data.select_dtypes(include='number').columns
cat_col = all_data.columns.difference(num_col)
# difference() returns the names sorted, so position 0 is presumably
# 'attack_cat'; it is the target and must be excluded from the feature
# one-hot encoding here (it is encoded separately later).
cat_col = cat_col[1:]
cat_col

# One-hot encode the remaining categorical features.
data_cat = pd.get_dummies(all_data[cat_col].copy(), columns=cat_col)
data_cat.head()

# Append the dummy columns, then drop the original categorical ones.
data = pd.concat([all_data, data_cat], axis=1)
data.drop(columns=cat_col, inplace=True)
data.shape
# Collect the numeric feature names; 'label' is the target and must not be
# rescaled together with the features.
num_col = list(data.select_dtypes(include='number').columns)
num_col.remove('label')
print(num_col)

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Scaler used to squash every numeric column into the [0, 1] range.
minmax_scale = MinMaxScaler(feature_range=(0, 1))
def normalization(df, col):
    """Min-max scale the listed columns of *df* into [0, 1], in place.

    Mutates *df* and returns it.  Equivalent to fitting sklearn's
    MinMaxScaler(feature_range=(0, 1)) independently per column (which is
    what the original did by re-fitting one shared scaler inside the loop),
    but self-contained and without the per-column reshape round-trip.

    df  : pandas.DataFrame to scale.
    col : iterable of column names in *df* (numeric).
    """
    for name in col:
        vals = df[name].astype(float)
        vmin = vals.min()
        span = vals.max() - vmin
        # A constant column has zero span; divide by 1 so it maps to 0.0
        # (same convention as sklearn's MinMaxScaler).
        df[name] = (vals - vmin) / (span if span != 0 else 1.0)
    return df
# Rescale every numeric feature into [0, 1].
data = normalization(data.copy(), num_col)

# Keep a raw copy of the attack label for later integer encoding, then
# one-hot encode attack_cat on the working frame.
multi_data = data.copy()
multi_label = pd.DataFrame(multi_data.attack_cat)
multi_data = pd.get_dummies(multi_data, columns=['attack_cat'])
from sklearn import metrics
from sklearn import preprocessing

# Integer-encode (0..n_classes-1) the attack category as the target.
le2 = preprocessing.LabelEncoder()
enc_label = multi_label.apply(le2.fit_transform)
multi_data['label'] = enc_label

# BUG FIX: the one-hot 'attack_cat_*' columns created above encode exactly
# the target, so leaving them in X lets the model read the answer directly
# (label leakage — this is why the correlation matrix / accuracy look
# "too good" or nonsensical).  Drop them together with the integer label.
leak_cols = [c for c in multi_data.columns if str(c).startswith('attack_cat_')]
X = multi_data.drop(columns=['label'] + leak_cols)
Y = multi_data['label']
import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from sklearn import metrics

# Hold out 25% of the data for validation/testing.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42)

n_features = X.shape[1]
n_classes = int(Y.nunique())  # number of attack categories (incl. 'Normal')

# MLP: input-sized layer, two wider hidden layers, softmax output.
model = Sequential()
model.add(Dense(n_features, input_dim=n_features, activation='relu'))
model.add(Dense(2 * n_features + 1, activation='relu'))
model.add(Dense(2 * n_features + 1, activation='relu'))
# BUG FIX: with sparse_categorical_crossentropy the output layer must be a
# softmax over the classes; 'sigmoid' produces independent per-unit scores
# that do not form a probability distribution.  The layer is also sized
# from the data instead of the hard-coded 10.
model.add(Dense(n_classes, activation='softmax', kernel_initializer='normal'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])

# Stop early once validation loss plateaus and restore the best weights.
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5,
                        verbose=1, mode='auto', restore_best_weights=True)
model.fit(x_train, y_train, validation_data=(x_test, y_test),
          callbacks=[monitor], batch_size=10000, verbose=2, epochs=100)
# Evaluate on the held-out split: argmax over the softmax scores gives the
# predicted class index per row.
pred = model.predict(x_test).argmax(axis=1)
# FIX: np.matrix is deprecated and the column-matrix detour is unnecessary —
# sklearn metrics accept the 1-D prediction array directly.
y_pred = pred
y_compare = y_test.tolist()
score = metrics.accuracy_score(y_compare, y_pred)
print("Accuracy score: {}".format(score))

# Per-class precision/recall/F1, labelled with the original category names.
from sklearn.metrics import classification_report
print(classification_report(y_compare, y_pred, target_names=le2.classes_))

提前谢谢。
发布于 2023-04-08 17:20:22
根据所提供的代码和您面临的问题,我建议对预处理步骤和模型结构进行以下改进。
数据预处理:在将特征输入模型之前,您可以应用特征选择技术来减少特征的数量。像递归特征消除(RFE)或Lasso回归这样的技术会有所帮助。
模型结构:您可以为MLP模型尝试不同的结构、激活函数和其他超参数,以提高其性能。一些建议是:
- 增加更多隐藏层,或改变每层神经元的数目。
- 在隐藏层尝试其他激活函数,如 tanh 或 swish。
- 在模型中加入 Dropout 层,以避免过拟合。
- 尝试不同的批次大小(batch size)和学习率。
模型训练:尝试使用K折交叉验证,而不是单一的训练/测试划分。它允许您在数据集的不同子集上训练模型,降低过拟合的可能性,并提高模型的整体性能。
https://datascience.stackexchange.com/questions/120804
复制相似问题