import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler


def _col_names():
    """Return the 42 KDD-Cup'99 column names (41 features + 'label')."""
    return ["duration", "protocol_type", "service", "flag", "src_bytes",
            "dst_bytes", "land", "wrong_fragment", "urgent", "hot",
            "num_failed_logins", "logged_in", "num_compromised", "root_shell",
            "su_attempted", "num_root", "num_file_creations", "num_shells",
            "num_access_files", "num_outbound_cmds", "is_host_login",
            "is_guest_login", "count", "srv_count", "serror_rate",
            "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
            "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
            "dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate",
            "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
            "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
            "dst_host_srv_serror_rate", "dst_host_rerror_rate",
            "dst_host_srv_rerror_rate", "label"]


def _encode_text_dummy(df, name):
    """One-hot encode column *name* in place.

    Each distinct value v becomes a new column '<name>-<v>'; the original
    column is dropped.

    :param df: DataFrame to modify in place.
    :param name: column to encode.
    """
    dummies = pd.get_dummies(df.loc[:, name])
    for value in dummies.columns:
        dummy_name = "{}-{}".format(name, value)
        df.loc[:, dummy_name] = dummies[value]
    df.drop(name, axis=1, inplace=True)


def _to_xy(df):
    """Split *df* into features and the five one-hot label columns.

    :param df: DataFrame that already contains the 'label-*' dummy columns.
    :return: (features DataFrame, label-dummies DataFrame).
    """
    labels = ['label-dos', 'label-normal', 'label-probe', 'label-r2l',
              'label-u2r']
    dummies = df.loc[:, labels]
    # BUG FIX: the original used df.drop(label, 1) — the positional `axis`
    # argument was removed in pandas 2.0. Drop all label columns at once.
    features = df.drop(columns=labels)
    return features, dummies


def _log_turns(df, col):
    """Return log1p of column *col* (compresses heavy-tailed byte/duration counts)."""
    return df[col].apply(np.log1p)


def _rescale_features(data):
    """Min-max scale *data* to [0, 1] column-wise."""
    min_max_scaler = MinMaxScaler()
    return min_max_scaler.fit_transform(data)


def _get_attack_map():
    """Load the attack-name -> category map from Data/AttackTypes.csv.

    BUG FIX: the file handle was previously never closed; a context manager
    now guarantees release.

    :return: dict mapping raw label (with its trailing '.', as it appears in
        the KDD data, e.g. 'smurf.') to one of the 5 category names.
    """
    with open('Data/AttackTypes.csv', 'r') as fh:
        rows = [line.strip().split(',') for line in fh]
    return {name + '.': category for (name, category) in rows}


def _divide_xy(x, y):
    """Split (x, y) into per-category float32 arrays, each min-max rescaled.

    The original spelled the five categories out by hand; this loop produces
    the identical keys and values.

    :param x: feature DataFrame.
    :param y: one-hot label DataFrame with 'label-*' columns.
    :return: dict with keys 'all', 'normal', 'u2r', 'r2l', 'dos', 'probe',
        each mapping to a [x_subset, y_subset] pair of float32 arrays
        (features rescaled to [0, 1] independently per subset).
    """
    splits = {'all': [_rescale_features(x.values.astype(np.float32)),
                      y.values.astype(np.float32)]}
    for category in ('normal', 'u2r', 'r2l', 'dos', 'probe'):
        mask = y['label-' + category] == 1
        x_cat = _rescale_features(x[mask].values.astype(np.float32))
        y_cat = y[mask].values.astype(np.float32)
        splits[category] = [x_cat, y_cat]
    return splits


def get_data(argument):
    """Load, encode and split the KDD-Cup'99 10% dataset.

    Reads 'Data/kddcup.data_10_percent_corrected', collapses raw attack
    names into 5 categories, one-hot encodes categorical columns,
    log1p-transforms heavy-tailed numeric columns, then splits 50/50 into
    train/test (reproducible via random_state=42).

    :param argument: 'train' returns the training half; any other value
        returns the test half.
    :return: dict from _divide_xy (per-category [x, y] float32 arrays).
    """
    col_names = _col_names()
    df = pd.read_csv("Data/kddcup.data_10_percent_corrected",
                     header=None, names=col_names)

    # Map the ~23 raw attack labels onto 5 categories.
    attack_map = _get_attack_map()
    df['label'] = df['label'].replace(attack_map)

    # One-hot encode categorical / binary-flag columns.
    encode_lst = ['protocol_type', 'service', 'flag', 'land', 'logged_in',
                  'is_host_login', 'is_guest_login']
    for name in encode_lst:
        _encode_text_dummy(df, name)

    # Log-transform heavy-tailed numeric columns.
    log_lst = ['duration', 'src_bytes', 'dst_bytes']
    for col in log_lst:
        df[col] = _log_turns(df, col)

    # One-hot encode the 5-category label into 'label-*' columns.
    _encode_text_dummy(df, 'label')

    # Deterministic 50/50 train/test split.
    df_train = df.sample(frac=0.5, random_state=42)
    df_test = df.loc[~df.index.isin(df_train.index)]

    x_train, y_train = _to_xy(df_train)
    x_test, y_test = _to_xy(df_test)

    if argument == 'train':
        return _divide_xy(x_train, y_train)
    return _divide_xy(x_test, y_test)