|
@@ -0,0 +1,109 @@
|
|
|
|
+import pandas as pd
|
|
|
|
+import numpy as np
|
|
|
|
+from sklearn.preprocessing import MinMaxScaler
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def get_cols():
    """Return the column names used when loading the KDD CSV files.

    Reads ``Data/kddcup.names`` (relative to the current working directory),
    skips its first line, and takes the text before the first ``:`` of every
    remaining non-blank line as a column name. ``'class'`` and
    ``'difficulty'`` are appended at the end.

    Returns:
        list[str]: feature names followed by ``'class'`` and ``'difficulty'``.

    Raises:
        OSError: if ``Data/kddcup.names`` cannot be opened.
    """
    with open('Data/kddcup.names', 'r') as infile:
        # The first line is skipped (presumably the header that lists the
        # class labels rather than a feature -- confirm against the file).
        feature_lines = infile.readlines()[1:]

    # Ignore blank lines (e.g. a trailing newline at end of file) and strip
    # surrounding whitespace so no bogus names such as '\n' are produced.
    kdd_cols = [
        line.split(':')[0].strip()
        for line in feature_lines
        if line.strip()
    ]
    kdd_cols += ['class', 'difficulty']
    return kdd_cols
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def get_attack_map():
    """Load the mapping stored in ``Data/AttackTypes.csv``.

    Each non-blank line of the file must contain exactly two comma-separated
    fields; the first becomes the key and the second the value. If a key
    appears more than once, the last occurrence wins.

    Returns:
        dict[str, str]: first field -> second field for every line.

    Raises:
        OSError: if ``Data/AttackTypes.csv`` cannot be opened.
        ValueError: if a non-blank line does not have exactly two fields.
    """
    attack_map = {}
    # Use a context manager so the file handle is closed deterministically.
    with open('Data/AttackTypes.csv', 'r') as infile:
        for raw_line in infile:
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no mapping.
                continue
            key, value = line.split(',')
            attack_map[key] = value
    return attack_map
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def cat_encode(df, col):
    """Replace column ``col`` of ``df`` with one indicator column per value.

    The indicator columns are named after the raw values found in ``col``
    (no prefix is added) and are appended after the remaining columns of
    ``df``. ``df`` itself is not modified.

    Args:
        df: input ``pandas.DataFrame``.
        col: name of the categorical column to encode.

    Returns:
        pandas.DataFrame: ``df`` without ``col``, plus the indicator columns,
        carrying the same row index as ``df``.

    Raises:
        KeyError: if ``col`` is not a column of ``df``.
    """
    dummies = pd.get_dummies(df[col].values)
    # get_dummies on a bare array yields a fresh 0..n-1 index; reuse the
    # frame's own index so the column-wise concat lines rows up one-to-one
    # instead of producing NaN-padded rows for non-default indexes.
    dummies.index = df.index
    return pd.concat([df.drop(col, axis=1), dummies], axis=1)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def log_trns(df, col):
    """Return column ``col`` of ``df`` transformed with ``log(1 + x)``.

    Args:
        df: input ``pandas.DataFrame``; it is not modified.
        col: name of the column to transform.

    Returns:
        The result of applying ``numpy.log1p`` to ``df[col]``.
    """
    selected = df[col]
    return selected.apply(np.log1p)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
# Data normalization
def rescale_features(data):
    """Min-max scale ``data`` with a freshly fitted ``MinMaxScaler``.

    A new scaler with default settings is fitted on ``data`` itself on every
    call, so the scaling parameters are not shared between calls.

    Args:
        data: array-like accepted by ``MinMaxScaler.fit_transform``.

    Returns:
        The transformed data returned by ``fit_transform``.
    """
    return MinMaxScaler().fit_transform(data)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def get_normal_and_abnormal_data(kdd_t):
    """Split ``kdd_t`` into 'normal' and non-'normal' rows with one-hot targets.

    Rows whose ``'class'`` equals ``'normal'`` form the normal subset; all
    other rows form the abnormal subset. The ``'difficulty'`` and ``'class'``
    columns are removed from the features, and each subset's features are
    passed through ``rescale_features`` separately. ``kdd_t`` is not modified.

    The targets are one-hot encoded once over all rows of ``kdd_t`` and then
    sliced per subset, so both target arrays share the same columns, column
    order and dtype (classes absent from a subset are all-zero columns).

    Args:
        kdd_t: ``pandas.DataFrame`` with ``'class'`` and ``'difficulty'``
            columns; every other column is treated as a feature.

    Returns:
        tuple: ``(normal_test, normal_test_target, abnormal_test,
        abnormal_test_target)`` as arrays.

    Raises:
        KeyError: if ``'class'`` or ``'difficulty'`` is missing from ``kdd_t``.
    """
    is_normal = kdd_t['class'].isin(['normal'])

    # Encode the full label column once; slicing rows afterwards avoids
    # padding each subset with int 0 columns, which mixed dtypes in the
    # resulting target arrays.
    test_target = pd.get_dummies(kdd_t['class'])

    # drop() returns a new frame, so the caller's frame is left untouched.
    features = kdd_t.drop(['difficulty', 'class'], axis=1)

    # NOTE(review): each subset is scaled with its own min/max, so the two
    # feature arrays are not on a common scale -- confirm this is intended.
    normal_test = rescale_features(features[is_normal].values)
    normal_test_target = test_target[is_normal].values

    abnormal_test = rescale_features(features[~is_normal].values)
    abnormal_test_target = test_target[~is_normal].values

    return normal_test, normal_test_target, abnormal_test, abnormal_test_target
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def get_data():
    """Load, encode and scale the KDD train/test CSV files.

    Steps, in order:
      1. Read ``Data/KDDTrain+.csv`` and ``Data/KDDTest+.csv`` using the
         column names from ``get_cols()``.
      2. Map the ``'class'`` labels through ``get_attack_map()``.
      3. One-hot encode ``protocol_type``, ``service`` and ``flag``.
      4. Apply ``log1p`` to ``duration``, ``src_bytes`` and ``dst_bytes``.
      5. Give both frames the same column layout, derived from the training
         data; columns missing from the test frame are filled with 0.
      6. Build the normal/abnormal test subsets, then drop ``'difficulty'``,
         one-hot encode ``'class'`` and min-max scale the features.

    Returns:
        tuple: ``(train, target, test, test_target, normal_test,
        normal_test_target, abnormal_test, abnormal_test_target)`` as arrays.
    """
    kdd_cols = get_cols()
    kdd = pd.read_csv('Data/KDDTrain+.csv', names=kdd_cols)
    kdd_t = pd.read_csv('Data/KDDTest+.csv', names=kdd_cols)

    # Target column layout: the first raw column, then the sorted category
    # values of the three categorical columns as seen in the TRAINING data
    # (these become the indicator column names), then every raw column from
    # position 4 onwards.
    kdd_cols = (
        [kdd.columns[0]]
        + sorted(set(kdd.protocol_type.values))
        + sorted(set(kdd.service.values))
        + sorted(set(kdd.flag.values))
        + kdd.columns[4:].tolist()
    )

    attack_map = get_attack_map()
    kdd['class'] = kdd['class'].replace(attack_map)
    kdd_t['class'] = kdd_t['class'].replace(attack_map)

    for col in ['protocol_type', 'service', 'flag']:
        kdd = cat_encode(kdd, col)
        kdd_t = cat_encode(kdd_t, col)

    for col in ['duration', 'src_bytes', 'dst_bytes']:
        kdd[col] = log_trns(kdd, col)
        kdd_t[col] = log_trns(kdd_t, col)

    kdd = kdd[kdd_cols]
    # When one-hot encoding the categorical columns, the test set can contain
    # fewer categories than the training set; add the missing indicator
    # columns as zeros and put everything in the training column order.
    kdd_t = kdd_t.reindex(columns=kdd_cols, fill_value=0)

    # Must run while kdd_t still has its 'class' and 'difficulty' columns.
    normal_test, normal_test_target, abnormal_test, abnormal_test_target = (
        get_normal_and_abnormal_data(kdd_t)
    )

    kdd.pop('difficulty')
    target = pd.get_dummies(kdd.pop('class'))
    kdd_t.pop('difficulty')
    # NOTE(review): train and test labels are encoded independently, so the
    # two target arrays only share a column layout if both files contain the
    # same set of classes after mapping -- confirm for the data in use.
    test_target = pd.get_dummies(kdd_t.pop('class'))

    # NOTE(review): train and test are each scaled with their own min/max
    # (rescale_features fits a new scaler per call), so test features are not
    # on the training scale -- confirm this is intended.
    train = rescale_features(kdd.values)
    test = rescale_features(kdd_t.values)

    return (
        train,
        target.values,
        test,
        test_target.values,
        normal_test,
        normal_test_target,
        abnormal_test,
        abnormal_test_target,
    )
|