# Preprocessing utilities for the 10% KDD Cup 99 intrusion-detection data set.
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
- def _col_names():
- """
- 返回列名称
- :return: 列名称
- """
- return ["duration", "protocol_type", "service", "flag", "src_bytes",
- "dst_bytes", "land", "wrong_fragment", "urgent", "hot", "num_failed_logins",
- "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root",
- "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds",
- "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate",
- "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
- "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
- "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
- "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
- "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label"]
- def _encode_text_dummy(df, name):
- """
- 将字符串类型的参数编码为二进制
- :param df:
- :param name:
- :return:
- """
- dummies = pd.get_dummies(df.loc[:, name])
- for x in dummies.columns:
- dummy_name = "{}-{}".format(name, x)
- df.loc[:, dummy_name] = dummies[x]
- df.drop(name, axis=1, inplace=True)
- def _to_xy(df):
- """
- 编码整个df,分为x,y
- :param df:
- :param target:
- :return:
- """
- labels = ['label-dos', 'label-normal', 'label-probe', 'label-r2l', 'label-u2r']
- dummies = df.loc[:, labels]
- for label in labels:
- df = df.drop(label, 1)
- return df, dummies
- def _log_turns(df, col):
- return df[col].apply(np.log1p)
def _rescale_features(data):
    """Min-max normalise *data*: scale every feature column into [0, 1].

    :param data: 2-D numeric array of feature values.
    :return: rescaled array of the same shape.
    """
    return MinMaxScaler().fit_transform(data)
- def _get_attack_map():
- attack_map = [x.strip().split(',') for x in open('Data/AttackTypes.csv', 'r')]
- attack_map = {k + '.': v for (k, v) in attack_map}
- return attack_map
def _divide_xy(x, y):
    """Split (x, y) into per-category arrays and min-max scale each x subset.

    :param x: feature DataFrame.
    :param y: one-hot label DataFrame with columns ``label-<category>``.
    :return: dict with key ``'all'`` plus one key per category
        ('normal', 'u2r', 'r2l', 'dos', 'probe'), each mapping to
        ``[x_subset, y_subset]`` as float32 arrays, with every x rescaled.
    """
    # The original repeated the same extract/convert/rescale sequence five
    # times; one loop over the category names produces the identical dict.
    result = {'all': [_rescale_features(x.values.astype(np.float32)),
                      y.values.astype(np.float32)]}
    for category in ('normal', 'u2r', 'r2l', 'dos', 'probe'):
        mask = y['label-' + category] == 1
        x_subset = x[mask].values.astype(np.float32)
        y_subset = y[mask].values.astype(np.float32)
        # NOTE(review): as in the original, each subset is rescaled with its
        # own MinMaxScaler fit, so scales are not comparable across subsets.
        result[category] = [_rescale_features(x_subset), y_subset]
    return result
def get_data(argument):
    """Load and preprocess the 10% KDD Cup 99 data set.

    Pipeline: read the raw CSV, collapse raw attack labels (e.g. 'smurf.')
    into the five categories via the attack map, one-hot encode the text /
    binary feature columns and the label, log1p-transform the heavy-tailed
    count features, split 50/50 into train and test, then divide each half
    per attack category.

    :param argument: ``'train'`` to get the training half; any other value
        returns the test half.
    :return: dict of category name -> ``[x, y]`` float32 arrays
        (see ``_divide_xy``).
    :raises FileNotFoundError: if the data or attack-map file is missing.
    """
    df = pd.read_csv("Data/kddcup.data_10_percent_corrected",
                     header=None, names=_col_names())

    # Map raw attack names to their five categories.
    df['label'] = df['label'].replace(_get_attack_map())

    # One-hot encode categorical / binary text features.
    for name in ['protocol_type', 'service', 'flag', 'land',
                 'logged_in', 'is_host_login', 'is_guest_login']:
        _encode_text_dummy(df, name)

    # log1p tames the heavy-tailed count features.
    for col in ['duration', 'src_bytes', 'dst_bytes']:
        df[col] = _log_turns(df, col)

    # One-hot encode the label column itself ('label-dos', 'label-normal', ...).
    _encode_text_dummy(df, 'label')

    # Reproducible 50/50 train/test split.
    df_train = df.sample(frac=0.5, random_state=42)
    df_test = df.loc[~df.index.isin(df_train.index)]

    x_train, y_train = _to_xy(df_train)
    x_test, y_test = _to_xy(df_test)

    if argument == 'train':
        return _divide_xy(x_train, y_train)
    return _divide_xy(x_test, y_test)