import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler


def get_cols():
    """Return the column names used to load the KDD CSV files.

    Reads ``Data/kddcup.names``, skips its first line, and takes the text
    before the first ``:`` of every remaining non-blank line as a column
    name.  The names ``class`` and ``difficulty`` are appended at the end.

    Returns:
        list[str]: column names in file order, followed by ``class`` and
        ``difficulty``.
    """
    with open('Data/kddcup.names', 'r') as infile:
        kdd_names = infile.readlines()
    # Skip blank lines so they do not turn into empty/whitespace column names.
    kdd_cols = [x.split(':')[0] for x in kdd_names[1:] if x.strip()]
    kdd_cols += ['class', 'difficulty']
    return kdd_cols


def get_attack_map():
    """Load the two-column mapping stored in ``Data/AttackTypes.csv``.

    Each non-blank line is split on ``,`` into a ``key,value`` pair.

    Returns:
        dict[str, str]: first field of each line mapped to its second field.

    Raises:
        ValueError: if a non-blank line does not contain exactly two fields.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open('Data/AttackTypes.csv', 'r') as infile:
        pairs = [line.strip().split(',') for line in infile if line.strip()]
    return {k: v for (k, v) in pairs}


def cat_encode(df, col):
    """Replace column ``col`` of ``df`` with its one-hot (dummy) columns.

    The dummy columns are named after the distinct values of ``col`` and are
    appended after the remaining columns of ``df``.

    Args:
        df: source DataFrame (not modified).
        col: name of the categorical column to encode.

    Returns:
        pandas.DataFrame: ``df`` without ``col`` plus one column per category.
    """
    # Encode the Series itself (not ``.values``) so the dummies keep df's
    # index; otherwise concat would misalign rows for a non-default index.
    dummies = pd.get_dummies(df[col])
    return pd.concat([df.drop(col, axis=1), dummies], axis=1)


def log_trns(df, col):
    """Return ``log(1 + x)`` of column ``col`` of ``df`` as a Series."""
    return np.log1p(df[col])


def rescale_features(data):
    """Normalize the data: min-max scale every column of ``data``.

    A new ``MinMaxScaler`` is fitted on ``data`` itself on every call.

    Args:
        data: 2-D array-like of numeric features.

    Returns:
        numpy.ndarray: the scaled copy of ``data``.
    """
    min_max_scaler = MinMaxScaler()
    return min_max_scaler.fit_transform(data)


def _subset_features_and_targets(subset, target_columns):
    """Convert one row subset into (scaled features, one-hot targets).

    The one-hot target is padded with all-zero columns for classes missing
    from ``subset`` and ordered like ``target_columns``.
    """
    one_hot = pd.get_dummies(subset['class'])
    for col in target_columns:
        if col not in one_hot.columns:
            one_hot[col] = 0
    one_hot = one_hot[target_columns]

    # drop() returns a new frame, so the caller's data is never mutated.
    features = subset.drop(columns=['difficulty', 'class']).values
    return rescale_features(features), one_hot.values


def get_normal_and_abnormal_data(kdd_t):
    """Split ``kdd_t`` into rows whose class is ``normal`` and all others.

    For both subsets the ``difficulty`` and ``class`` columns are removed
    from the features, the features are min-max scaled (each subset with its
    own scaler), and the class is one-hot encoded using the column layout of
    the one-hot encoding of the full ``kdd_t['class']``.

    Args:
        kdd_t: DataFrame containing ``class`` and ``difficulty`` columns plus
            numeric feature columns.

    Returns:
        tuple: ``(normal_test, normal_test_target, abnormal_test,
        abnormal_test_target)`` as numpy arrays.
    """
    is_normal = kdd_t['class'].isin(['normal'])
    target_columns = pd.get_dummies(kdd_t['class']).columns

    normal_test, normal_test_target = _subset_features_and_targets(
        kdd_t[is_normal], target_columns)
    abnormal_test, abnormal_test_target = _subset_features_and_targets(
        kdd_t[~is_normal], target_columns)

    return normal_test, normal_test_target, abnormal_test, abnormal_test_target


def get_data():
    """Load, encode and scale the KDD train and test sets.

    Reads ``Data/KDDTrain+.csv`` and ``Data/KDDTest+.csv``, maps the
    ``class`` labels through the attack map, one-hot encodes
    ``protocol_type``, ``service`` and ``flag``, applies ``log1p`` to
    ``duration``, ``src_bytes`` and ``dst_bytes``, aligns the test columns to
    the training columns and min-max scales the features.

    Returns:
        tuple: ``(train, target, test, test_target, normal_test,
        normal_test_target, abnormal_test, abnormal_test_target)`` as numpy
        arrays; the ``*_target`` entries are one-hot encoded classes.
    """
    kdd_cols = get_cols()
    kdd = pd.read_csv('Data/KDDTrain+.csv', names=kdd_cols)
    kdd_t = pd.read_csv('Data/KDDTest+.csv', names=kdd_cols)

    # Final column layout: first column, then the dummy columns of the three
    # categorical features (sorted by value), then every column from index 4.
    kdd_cols = (
        [kdd.columns[0]]
        + sorted(set(kdd.protocol_type.values))
        + sorted(set(kdd.service.values))
        + sorted(set(kdd.flag.values))
        + kdd.columns[4:].tolist()
    )

    attack_map = get_attack_map()
    kdd['class'] = kdd['class'].replace(attack_map)
    kdd_t['class'] = kdd_t['class'].replace(attack_map)

    cat_lst = ['protocol_type', 'service', 'flag']
    for col in cat_lst:
        kdd = cat_encode(kdd, col)
        kdd_t = cat_encode(kdd_t, col)

    log_lst = ['duration', 'src_bytes', 'dst_bytes']
    for col in log_lst:
        kdd[col] = log_trns(kdd, col)
        kdd_t[col] = log_trns(kdd_t, col)

    kdd = kdd[kdd_cols]
    # After one-hot encoding, kdd_t can have fewer category columns than kdd;
    # add the missing ones as all-zero columns.
    for col in kdd_cols:
        if col not in kdd_t.columns:
            kdd_t[col] = 0
    kdd_t = kdd_t[kdd_cols]

    normal_test, normal_test_target, abnormal_test, abnormal_test_target = \
        get_normal_and_abnormal_data(kdd_t)

    kdd.pop('difficulty')
    target = kdd.pop('class')
    kdd_t.pop('difficulty')
    test_target = kdd_t.pop('class')

    target = pd.get_dummies(target).values
    test_target = pd.get_dummies(test_target).values

    # NOTE(review): train, test and the normal/abnormal subsets are each
    # scaled with a separately fitted scaler, so their feature ranges are not
    # directly comparable -- confirm this is intended before changing it.
    train = rescale_features(kdd.values)
    test = rescale_features(kdd_t.values)

    return (train, target, test, test_target,
            normal_test, normal_test_target,
            abnormal_test, abnormal_test_target)