|
@@ -0,0 +1,151 @@
|
|
|
|
|
+import pandas as pd
|
|
|
|
|
+import numpy as np
|
|
|
|
|
+from sklearn.preprocessing import MinMaxScaler
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def _col_names():
    """
    Return the column names used when loading the raw data file.

    The list is passed as ``names=`` to ``pandas.read_csv`` in ``get_data``;
    the final entry, ``"label"``, is the class column.

    :return: a new list of 42 column-name strings, in file order
    """
    names = [
        "duration",
        "protocol_type",
        "service",
        "flag",
        "src_bytes",
        "dst_bytes",
        "land",
        "wrong_fragment",
        "urgent",
        "hot",
        "num_failed_logins",
        "logged_in",
        "num_compromised",
        "root_shell",
        "su_attempted",
        "num_root",
        "num_file_creations",
        "num_shells",
        "num_access_files",
        "num_outbound_cmds",
        "is_host_login",
        "is_guest_login",
        "count",
        "srv_count",
        "serror_rate",
        "srv_serror_rate",
        "rerror_rate",
        "srv_rerror_rate",
        "same_srv_rate",
        "diff_srv_rate",
        "srv_diff_host_rate",
        "dst_host_count",
        "dst_host_srv_count",
        "dst_host_same_srv_rate",
        "dst_host_diff_srv_rate",
        "dst_host_same_src_port_rate",
        "dst_host_srv_diff_host_rate",
        "dst_host_serror_rate",
        "dst_host_srv_serror_rate",
        "dst_host_rerror_rate",
        "dst_host_srv_rerror_rate",
        "label",
    ]
    return names
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def _encode_text_dummy(df, name):
    """
    One-hot encode the column ``name`` of ``df`` in place.

    For every distinct value ``v`` that ``pandas.get_dummies`` finds in the
    column, an indicator column named ``"<name>-<v>"`` is written to ``df``;
    the source column is removed afterwards.

    :param df: DataFrame to modify; it is mutated by this call
    :param name: label of the column to encode
    :return: None
    """
    indicators = pd.get_dummies(df.loc[:, name])
    # Write one indicator column per category, in get_dummies' column order.
    for value, indicator in indicators.items():
        df.loc[:, "{}-{}".format(name, value)] = indicator
    df.drop(columns=name, inplace=True)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def _to_xy(df):
    """
    Split an encoded DataFrame into feature columns and label columns.

    The five one-hot label columns (``label-dos``, ``label-normal``,
    ``label-probe``, ``label-r2l``, ``label-u2r``) are selected as ``y``;
    everything else is returned as ``x``. The input frame is not modified.

    :param df: DataFrame that contains all five ``label-*`` columns
    :return: tuple ``(x, y)`` of DataFrames: features and one-hot labels
    :raises KeyError: if any of the five label columns is missing
    """
    labels = ['label-dos', 'label-normal', 'label-probe', 'label-r2l', 'label-u2r']
    dummies = df.loc[:, labels]
    # Use the ``columns=`` keyword: passing the axis positionally
    # (``df.drop(label, 1)``) is rejected by pandas >= 2.0.
    features = df.drop(columns=labels)
    return features, dummies
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def _log_turns(df, col):
    """
    Return column ``col`` of ``df`` transformed with ``numpy.log1p``.

    ``df`` itself is left untouched; the caller decides where to store the
    result.

    :param df: source DataFrame
    :param col: label of the column to transform
    :return: the transformed column
    """
    column = df[col]
    transformed = column.apply(np.log1p)
    return transformed
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
# Feature normalisation (min-max scaling)
def _rescale_features(data):
    """
    Min-max scale ``data`` column by column with a freshly fitted scaler.

    A new ``MinMaxScaler`` (default settings) is fitted on ``data`` itself,
    so the minimum and maximum of each column come from the rows passed in.

    :param data: 2-D array-like of numeric features
    :return: the scaled array; a zero-row 2-D input is returned as an
        ndarray without scaling
    """
    # MinMaxScaler refuses to fit on zero samples (ValueError); an empty
    # subset has nothing to scale, so hand it back instead of crashing.
    if np.ndim(data) == 2 and len(data) == 0:
        return np.asarray(data)
    return MinMaxScaler().fit_transform(data)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def _get_attack_map(path='Data/AttackTypes.csv'):
    """
    Load the attack-name to attack-category mapping from a CSV file.

    Each non-blank line must hold exactly two comma-separated fields,
    ``<attack name>,<category>``. A ``'.'`` is appended to every attack name
    to form the dictionary key (presumably because the raw labels end with a
    dot, e.g. ``normal.`` -- confirm against the data file).

    :param path: location of the mapping file; defaults to the path the
        module has always used
    :return: dict mapping ``"<attack name>."`` to its category string
    :raises ValueError: if a non-blank line does not split into two fields
    """
    attack_map = {}
    # ``with`` guarantees the handle is closed even if a line is malformed.
    with open(path, 'r') as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                # Tolerate blank / trailing empty lines instead of failing
                # the two-field unpack below.
                continue
            name, category = line.split(',')
            attack_map[name + '.'] = category
    return attack_map
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def _divide_xy(x, y):
    """
    Split features/labels by class and rescale each feature set.

    For each of the classes normal, u2r, r2l, dos and probe, the rows whose
    ``label-<class>`` column equals 1 are selected from both ``x`` and ``y``.
    All arrays are converted to ``float32``; every feature array (including
    the full one) is then passed through ``_rescale_features``.

    NOTE(review): each subset is rescaled by its own call, independently of
    the full set and of the other subsets.

    :param x: feature DataFrame
    :param y: one-hot label DataFrame with the five ``label-*`` columns
    :return: dict with keys ``'all'``, ``'normal'``, ``'u2r'``, ``'r2l'``,
        ``'dos'``, ``'probe'``; each value is ``[features, labels]``
    """
    class_columns = (
        ('normal', 'label-normal'),
        ('u2r', 'label-u2r'),
        ('r2l', 'label-r2l'),
        ('dos', 'label-dos'),
        ('probe', 'label-probe'),
    )

    # Slice every class first, while x and y are still DataFrames.
    per_class = {}
    for key, column in class_columns:
        selected = y[column] == 1
        per_class[key] = [
            x[selected].values.astype(np.float32),
            y[selected].values.astype(np.float32),
        ]

    divided = {'all': [x.values.astype(np.float32), y.values.astype(np.float32)]}
    divided.update(per_class)

    # Rescale the feature half of every entry: 'all' first, then each class.
    for pair in divided.values():
        pair[0] = _rescale_features(pair[0])
    return divided
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+# x_train, y_train, x_train_normal, y_train_normal, x_train_u2r, y_train_u2r, x_train_r2l, y_train_r2l, x_train_dos, y_train_dos, x_train_probe, y_train_probe
|
|
|
|
|
+
|
|
|
|
|
def get_data(argument):
    """
    Load, preprocess and split the data set, returning one half of the split.

    Steps: read ``Data/kddcup.data_10_percent_corrected`` using the names
    from ``_col_names``; replace the raw labels using ``_get_attack_map``;
    one-hot encode the categorical columns; apply ``log1p`` to ``duration``,
    ``src_bytes`` and ``dst_bytes``; one-hot encode the label; draw a 50 %
    sample (``random_state=42``) as the training half and use the remaining
    rows as the test half.

    :param argument: ``'train'`` selects the training half; any other value
        selects the test half
    :return: the dict produced by ``_divide_xy`` for the selected half
    """
    df = pd.read_csv("Data/kddcup.data_10_percent_corrected", header=None, names=_col_names())
    df['label'] = df['label'].replace(_get_attack_map())

    # One-hot encode the categorical / flag columns in place.
    for name in ('protocol_type', 'service', 'flag', 'land', 'logged_in',
                 'is_host_login', 'is_guest_login'):
        _encode_text_dummy(df, name)

    # Log-transform these three numeric columns.
    for col in ('duration', 'src_bytes', 'dst_bytes'):
        df[col] = _log_turns(df, col)

    # Encode the label column last.
    _encode_text_dummy(df, 'label')

    # Split into a training half and a test half (the rows not sampled).
    train_frame = df.sample(frac=0.5, random_state=42)
    test_frame = df.drop(index=train_frame.index)

    # Separate features from labels for the requested half only.
    chosen = train_frame if argument == 'train' else test_frame
    features, labels = _to_xy(chosen)
    return _divide_xy(features, labels)
|
|
|
|
|
+
|
|
|
|
|
+ # dataset = {'x_train': x_train.astype(np.float32),
|
|
|
|
|
+ # 'y_train': y_train.astype(np.float32),
|
|
|
|
|
+ # 'x_train_normal': x_train_normal.astype(np.float32),
|
|
|
|
|
+ # 'y_train_normal': y_train_normal.astype(np.float32),
|
|
|
|
|
+ # 'x_train_abnormal': x_train_abnormal.astype(np.float32),
|
|
|
|
|
+ # 'y_train_abnormal': y_train_abnormal.astype(np.float32),
|
|
|
|
|
+ # 'x_test': x_test.astype(np.float32),
|
|
|
|
|
+ # 'y_test': y_test.astype(np.float32),
|
|
|
|
|
+ # 'x_test_normal': x_test_normal.astype(np.float32),
|
|
|
|
|
+ # 'y_test_normal': y_test_normal.astype(np.float32),
|
|
|
|
|
+ # 'x_test_abnormal': x_test_abnormal.astype(np.float32),
|
|
|
|
|
+ # 'y_test_abnormal': y_test_abnormal.astype(np.float32),
|
|
|
|
|
+ # }
|
|
|
|
|
+
|
|
|
|
|
+# def get_dataset(split='all'):
|
|
|
|
|
+# dataset = _get_data()
|
|
|
|
|
+# if split == 'all':
|
|
|
|
|
+# return dataset['x_train'], dataset['y_train'], dataset['x_train_normal'], dataset['y_train_normal'], dataset[
|
|
|
|
|
+# 'x_train_abnormal'], dataset['y_train_abnormal'], dataset['x_test'], dataset['y_test'], dataset[
|
|
|
|
|
+# 'x_test_normal'], dataset['y_test_normal'], dataset['x_test_abnormal'], dataset['y_test_abnormal'],
|
|
|
|
|
+# else:
|
|
|
|
|
+# key_img = 'x_' + split
|
|
|
|
|
+# key_lbl = 'y_' + split
|
|
|
|
|
+# return dataset[key_img], dataset[key_lbl]
|