123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109 |
- import pandas as pd
- import numpy as np
- from sklearn.preprocessing import MinMaxScaler
def get_cols():
    """Build the list of column names for the KDD CSV files.

    Reads ``Data/kddcup.names``, skips its first line, and takes the text
    before the first ``:`` on every remaining non-blank line as a column
    name. ``'class'`` and ``'difficulty'`` are appended at the end.

    Returns:
        list[str]: the column names, in file order, followed by
        ``'class'`` and ``'difficulty'``.
    """
    with open('Data/kddcup.names', 'r') as infile:
        # The first line is not a column definition; drop it unconditionally.
        next(infile, None)
        # Blank lines (e.g. a trailing newline at end of file) would otherwise
        # yield bogus column names such as '\n', so they are skipped.
        kdd_cols = [
            line.split(':')[0].strip()
            for line in infile
            if line.strip()
        ]
    kdd_cols += ['class', 'difficulty']
    return kdd_cols
def get_attack_map():
    """Load the attack-name -> attack-category mapping.

    Reads ``Data/AttackTypes.csv``, where every non-blank line is expected to
    hold exactly two comma-separated fields: ``key,value``.

    Returns:
        dict[str, str]: mapping from the first field to the second field.

    Raises:
        ValueError: if a non-blank line does not contain exactly two fields.
    """
    attack_map = {}
    # Use a context manager so the file handle is closed deterministically.
    with open('Data/AttackTypes.csv', 'r') as infile:
        for line in infile:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            key, value = line.split(',')
            attack_map[key] = value
    return attack_map
def cat_encode(df, col):
    """Replace column ``col`` of ``df`` with its one-hot (dummy) encoding.

    Args:
        df: input ``pandas.DataFrame``; it is not modified.
        col: name of the categorical column to encode.

    Returns:
        pandas.DataFrame: ``df`` without ``col``, followed by one indicator
        column per distinct value of ``col`` (columns are named after the
        values themselves, without a prefix).
    """
    dummies = pd.get_dummies(df[col].values)
    # get_dummies on a bare array yields a fresh 0..n-1 index. Restore the
    # frame's own index so the column-wise concat aligns row for row even
    # when ``df`` does not carry a default RangeIndex.
    dummies.index = df.index
    return pd.concat([df.drop(col, axis=1), dummies], axis=1)
def log_trns(df, col):
    """Return column ``col`` of ``df`` transformed with ``numpy.log1p``.

    Args:
        df: ``pandas.DataFrame`` holding the column; it is not modified.
        col: name of the column to transform.

    Returns:
        pandas.Series: ``log(1 + x)`` of every value in ``df[col]``.
    """
    column = df[col]
    return column.apply(np.log1p)
def rescale_features(data):
    """Normalize ``data`` with a freshly fitted ``MinMaxScaler``.

    A new scaler with default settings is created on every call, fitted on
    ``data`` and immediately used to transform it, so the scaling depends
    only on the values passed in.

    Args:
        data: array-like of features accepted by ``MinMaxScaler``.

    Returns:
        The result of ``MinMaxScaler().fit_transform(data)``.
    """
    return MinMaxScaler().fit_transform(data)
def _align_one_hot(encoded, reference_columns):
    """Give ``encoded`` exactly ``reference_columns``, zero-filling missing ones."""
    for name in reference_columns:
        if name not in encoded.columns:
            encoded[name] = 0
    return encoded[reference_columns]


def get_normal_and_abnormal_data(kdd_t):
    """Split ``kdd_t`` into 'normal' and non-'normal' rows and prepare both.

    For each of the two subsets the ``'difficulty'`` column is dropped, the
    ``'class'`` column is one-hot encoded and aligned to the one-hot columns
    of the full ``kdd_t['class']``, and the remaining feature values are
    passed through ``rescale_features`` (each subset separately).

    Args:
        kdd_t: ``pandas.DataFrame`` with ``'class'`` and ``'difficulty'``
            columns. The pops below act on the filtered subsets; ``kdd_t``
            itself keeps both columns.

    Returns:
        tuple: ``(normal_test, normal_test_target, abnormal_test,
        abnormal_test_target)`` - rescaled features and one-hot label arrays
        for the normal rows, then the same pair for the abnormal rows.
    """
    is_normal = kdd_t['class'].isin(['normal'])
    # Column layout of the one-hot labels over *all* rows; each subset only
    # contains some of the classes and is padded to this layout.
    all_classes = pd.get_dummies(kdd_t['class']).columns

    prepared = []
    for subset in (kdd_t[is_normal], kdd_t[~is_normal]):
        subset.pop('difficulty')
        labels = pd.get_dummies(subset.pop('class'))
        labels = _align_one_hot(labels, all_classes)
        features = rescale_features(subset.values)
        prepared.extend([features, labels.values])
    return tuple(prepared)
def get_data():
    """Load the KDD train/test CSV files and return model-ready arrays.

    Steps, in order: read ``Data/KDDTrain+.csv`` and ``Data/KDDTest+.csv``
    using the names from ``get_cols``; map the ``'class'`` values through
    ``get_attack_map``; one-hot encode ``protocol_type``, ``service`` and
    ``flag``; apply ``log_trns`` to ``duration``, ``src_bytes`` and
    ``dst_bytes``; bring both frames to the same column layout; split the
    test set with ``get_normal_and_abnormal_data``; finally separate the
    one-hot labels from the features and rescale the features.

    Returns:
        tuple: ``(train, target, test, test_target, normal_test,
        normal_test_target, abnormal_test, abnormal_test_target)``.
    """
    names = get_cols()
    train_df = pd.read_csv('Data/KDDTrain+.csv', names=names)
    test_df = pd.read_csv('Data/KDDTest+.csv', names=names)

    categorical = ['protocol_type', 'service', 'flag']

    # Target column layout: the first column, the sorted distinct values of
    # each categorical column (taken from the training set), then every
    # original column from position 4 onwards.
    ordered_cols = [train_df.columns[0]]
    for name in categorical:
        ordered_cols += sorted(set(train_df[name].values))
    ordered_cols += train_df.columns[4:].tolist()

    attack_map = get_attack_map()
    train_df['class'] = train_df['class'].replace(attack_map)
    test_df['class'] = test_df['class'].replace(attack_map)

    for name in categorical:
        train_df = cat_encode(train_df, name)
        test_df = cat_encode(test_df, name)

    for name in ('duration', 'src_bytes', 'dst_bytes'):
        train_df[name] = log_trns(train_df, name)
        test_df[name] = log_trns(test_df, name)

    train_df = train_df[ordered_cols]

    # After one-hot encoding, the test set has fewer categories than the
    # training set; add the missing indicator columns as zeros.
    for name in ordered_cols:
        if name not in test_df.columns:
            test_df[name] = 0
    test_df = test_df[ordered_cols]

    # Must run before 'class'/'difficulty' are popped from test_df below.
    (normal_test, normal_test_target,
     abnormal_test, abnormal_test_target) = get_normal_and_abnormal_data(test_df)

    train_df.pop('difficulty')
    train_labels = pd.get_dummies(train_df.pop('class'))
    test_df.pop('difficulty')
    test_labels = pd.get_dummies(test_df.pop('class'))

    target = train_labels.values
    test_target = test_labels.values
    # Train and test features are rescaled by two separate calls.
    train = rescale_features(train_df.values)
    test = rescale_features(test_df.values)

    return (train, target, test, test_target,
            normal_test, normal_test_target, abnormal_test, abnormal_test_target)
|