# Preprocess.py — KDD Cup 99 preprocessing utilities.
  1. import pandas as pd
  2. import numpy as np
  3. from sklearn.preprocessing import MinMaxScaler
  4. def _col_names():
  5. """
  6. 返回列名称
  7. :return: 列名称
  8. """
  9. return ["duration", "protocol_type", "service", "flag", "src_bytes",
  10. "dst_bytes", "land", "wrong_fragment", "urgent", "hot", "num_failed_logins",
  11. "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root",
  12. "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds",
  13. "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate",
  14. "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
  15. "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
  16. "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
  17. "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
  18. "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label"]
  19. def _encode_text_dummy(df, name):
  20. """
  21. 将字符串类型的参数编码为二进制
  22. :param df:
  23. :param name:
  24. :return:
  25. """
  26. dummies = pd.get_dummies(df.loc[:, name])
  27. for x in dummies.columns:
  28. dummy_name = "{}-{}".format(name, x)
  29. df.loc[:, dummy_name] = dummies[x]
  30. df.drop(name, axis=1, inplace=True)
  31. def _to_xy(df):
  32. """
  33. 编码整个df,分为x,y
  34. :param df:
  35. :param target:
  36. :return:
  37. """
  38. labels = ['label-dos', 'label-normal', 'label-probe', 'label-r2l', 'label-u2r']
  39. dummies = df.loc[:, labels]
  40. for label in labels:
  41. df = df.drop(label, 1)
  42. return df, dummies
  43. def _log_turns(df, col):
  44. return df[col].apply(np.log1p)
  45. # 数据归一化
  46. def _rescale_features(data):
  47. min_max_scaler = MinMaxScaler()
  48. data = min_max_scaler.fit_transform(data)
  49. return data
  50. def _get_attack_map():
  51. attack_map = [x.strip().split(',') for x in open('Data/AttackTypes.csv', 'r')]
  52. attack_map = {k + '.': v for (k, v) in attack_map}
  53. return attack_map
  54. def _divide_xy(x, y):
  55. x_normal = x[y['label-normal'] == 1].values.astype(np.float32)
  56. y_normal = y[y['label-normal'] == 1].values.astype(np.float32)
  57. x_u2r = x[y['label-u2r'] == 1].values.astype(np.float32)
  58. y_u2r = y[y['label-u2r'] == 1].values.astype(np.float32)
  59. x_r2l = x[y['label-r2l'] == 1].values.astype(np.float32)
  60. y_r2l = y[y['label-r2l'] == 1].values.astype(np.float32)
  61. x_dos = x[y['label-dos'] == 1].values.astype(np.float32)
  62. y_dos = y[y['label-dos'] == 1].values.astype(np.float32)
  63. x_probe = x[y['label-probe'] == 1].values.astype(np.float32)
  64. y_probe = y[y['label-probe'] == 1].values.astype(np.float32)
  65. x = x.values.astype(np.float32)
  66. y = y.values.astype(np.float32)
  67. x = _rescale_features(x)
  68. x_normal = _rescale_features(x_normal)
  69. x_u2r = _rescale_features(x_u2r)
  70. x_r2l = _rescale_features(x_r2l)
  71. x_dos = _rescale_features(x_dos)
  72. x_probe = _rescale_features(x_probe)
  73. return {'all': [x, y],
  74. 'normal': [x_normal, y_normal],
  75. 'u2r': [x_u2r, y_u2r],
  76. 'r2l': [x_r2l, y_r2l],
  77. 'dos': [x_dos, y_dos],
  78. 'probe': [x_probe, y_probe]}
  79. # x_train, y_train, x_train_normal, y_train_normal, x_train_u2r, y_train_u2r, x_train_r2l, y_train_r2l, x_train_dos, y_train_dos, x_train_probe, y_train_probe
  80. def get_data(argument):
  81. col_names = _col_names()
  82. df = pd.read_csv("Data/kddcup.data_10_percent_corrected", header=None, names=col_names)
  83. attack_map = _get_attack_map()
  84. df['label'] = df['label'].replace(attack_map)
  85. encode_lst = ['protocol_type', 'service', 'flag', 'land', 'logged_in', 'is_host_login', 'is_guest_login']
  86. for name in encode_lst:
  87. _encode_text_dummy(df, name)
  88. log_lst = ['duration', 'src_bytes', 'dst_bytes']
  89. for col in log_lst:
  90. df[col] = _log_turns(df, col)
  91. _encode_text_dummy(df, 'label')
  92. # 编码label列
  93. # labels = df['label'].copy()
  94. # labels[labels != 'normal.'] = 0
  95. # labels[labels == 'normal.'] = 1
  96. # df['label'] = labels
  97. # 分出train集以及test集
  98. df_train = df.sample(frac=0.5, random_state=42)
  99. df_test = df.loc[~df.index.isin(df_train.index)]
  100. # 分为训练集和label
  101. x_train, y_train = _to_xy(df_train)
  102. # y_train = y_train.flatten().astype(int)
  103. x_test, y_test = _to_xy(df_test)
  104. # y_test = y_test.flatten().astype(int)
  105. if argument == 'train':
  106. return _divide_xy(x_train, y_train)
  107. else:
  108. return _divide_xy(x_test, y_test)
  109. # dataset = {'x_train': x_train.astype(np.float32),
  110. # 'y_train': y_train.astype(np.float32),
  111. # 'x_train_normal': x_train_normal.astype(np.float32),
  112. # 'y_train_normal': y_train_normal.astype(np.float32),
  113. # 'x_train_abnormal': x_train_abnormal.astype(np.float32),
  114. # 'y_train_abnormal': y_train_abnormal.astype(np.float32),
  115. # 'x_test': x_test.astype(np.float32),
  116. # 'y_test': y_test.astype(np.float32),
  117. # 'x_test_normal': x_test_normal.astype(np.float32),
  118. # 'y_test_normal': y_test_normal.astype(np.float32),
  119. # 'x_test_abnormal': x_test_abnormal.astype(np.float32),
  120. # 'y_test_abnormal': y_test_abnormal.astype(np.float32),
  121. # }
  122. # def get_dataset(split='all'):
  123. # dataset = _get_data()
  124. # if split == 'all':
  125. # return dataset['x_train'], dataset['y_train'], dataset['x_train_normal'], dataset['y_train_normal'], dataset[
  126. # 'x_train_abnormal'], dataset['y_train_abnormal'], dataset['x_test'], dataset['y_test'], dataset[
  127. # 'x_test_normal'], dataset['y_test_normal'], dataset['x_test_abnormal'], dataset['y_test_abnormal'],
  128. # else:
  129. # key_img = 'x_' + split
  130. # key_lbl = 'y_' + split
  131. # return dataset[key_img], dataset[key_lbl]