In [8]:
import pandas as pd
import json
from sklearn import tree
from sklearn.model_selection import train_test_split as tsplit 
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder

In [9]:
def transform_value(val):
    return val.split(':')[1]

In [10]:
def extract_data(s):
    data_temp = pd.read_json(s)
    columns = ['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']
    vectorString = data_temp['vectorString']
    temp = []
    for i in range(vectorString.size):
        part =  vectorString[i].split('/')
        list_items = part[1::]
        temp.append(list_items)
    data = pd.DataFrame(temp, columns=columns)
    data = data.applymap(transform_value)
    data['severity'] = data_temp['severity']
    return data

In [11]:
data_train = extract_data('SIR_train_set.json')
data_test = extract_data('SIR_test_set.json')
data_validation = extract_data('SIR_validation_set.json')
data_train
print(data_test)

    AV AC PR UI  S  C  I  A  severity
0    N  L  N  R  C  L  L  N    MEDIUM
1    N  L  N  N  U  N  N  H      HIGH
2    N  L  N  N  U  N  N  H      HIGH
3    N  L  N  R  C  L  L  N    MEDIUM
4    N  L  N  R  C  L  L  N    MEDIUM
..  .. .. .. .. .. .. .. ..       ...
705  N  L  N  N  U  H  H  H  CRITICAL
706  L  L  L  N  U  H  N  N    MEDIUM
707  N  L  N  N  U  H  H  H  CRITICAL
708  N  L  N  N  U  N  L  L    MEDIUM
709  N  L  N  N  U  H  N  N      HIGH

[710 rows x 9 columns]


In [12]:
lw = data_train[['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']]
lw

Unnamed: 0,AV,AC,PR,UI,S,C,I,A
0,N,L,N,N,U,H,N,N
1,N,L,N,N,U,H,H,H
2,N,L,N,N,U,H,N,N
3,N,H,N,N,U,H,H,H
4,N,L,N,R,U,H,H,H
...,...,...,...,...,...,...,...,...
5619,N,L,N,N,U,N,N,H
5620,N,L,N,R,C,L,L,N
5621,N,L,N,R,U,N,H,N
5622,N,L,N,R,U,N,H,N


In [13]:
def encode(data):
    # 初始化 OneHotEncoder
    encoder = OneHotEncoder(sparse=False)

    # 转换字符数据为数值
    encoded_features = encoder.fit_transform(data[['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']])
    encoded_data = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']))
    return encoded_data
print(encode(lw))


      AV_A  AV_L  AV_N  AV_P  AC_H  AC_L  PR_H  PR_L  PR_N  UI_N  ...  S_U  \
0      0.0   0.0   1.0   0.0   0.0   1.0   0.0   0.0   1.0   1.0  ...  1.0   
1      0.0   0.0   1.0   0.0   0.0   1.0   0.0   0.0   1.0   1.0  ...  1.0   
2      0.0   0.0   1.0   0.0   0.0   1.0   0.0   0.0   1.0   1.0  ...  1.0   
3      0.0   0.0   1.0   0.0   1.0   0.0   0.0   0.0   1.0   1.0  ...  1.0   
4      0.0   0.0   1.0   0.0   0.0   1.0   0.0   0.0   1.0   0.0  ...  1.0   
...    ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...  ...   
5619   0.0   0.0   1.0   0.0   0.0   1.0   0.0   0.0   1.0   1.0  ...  1.0   
5620   0.0   0.0   1.0   0.0   0.0   1.0   0.0   0.0   1.0   0.0  ...  0.0   
5621   0.0   0.0   1.0   0.0   0.0   1.0   0.0   0.0   1.0   0.0  ...  1.0   
5622   0.0   0.0   1.0   0.0   0.0   1.0   0.0   0.0   1.0   0.0  ...  1.0   
5623   0.0   0.0   1.0   0.0   0.0   1.0   0.0   1.0   0.0   0.0  ...  0.0   

      C_H  C_L  C_N  I_H  I_L  I_N  A_H  A_L  A_N  
0     1.0  



In [16]:
x_test = encode(data_test[['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']])
x_test.insert(0, 'AV_A', 0)
x_test.insert(3, 'AV_P', 0)
print(x_test)

     AV_A  AV_L  AV_N  AV_P  AC_H  AC_L  PR_H  PR_L  PR_N  UI_N  ...  S_U  \
0       0   0.0   1.0     0   0.0   1.0   0.0   0.0   1.0   0.0  ...  0.0   
1       0   0.0   1.0     0   0.0   1.0   0.0   0.0   1.0   1.0  ...  1.0   
2       0   0.0   1.0     0   0.0   1.0   0.0   0.0   1.0   1.0  ...  1.0   
3       0   0.0   1.0     0   0.0   1.0   0.0   0.0   1.0   0.0  ...  0.0   
4       0   0.0   1.0     0   0.0   1.0   0.0   0.0   1.0   0.0  ...  0.0   
..    ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...  ...   
705     0   0.0   1.0     0   0.0   1.0   0.0   0.0   1.0   1.0  ...  1.0   
706     0   1.0   0.0     0   0.0   1.0   0.0   1.0   0.0   1.0  ...  1.0   
707     0   0.0   1.0     0   0.0   1.0   0.0   0.0   1.0   1.0  ...  1.0   
708     0   0.0   1.0     0   0.0   1.0   0.0   0.0   1.0   1.0  ...  1.0   
709     0   0.0   1.0     0   0.0   1.0   0.0   0.0   1.0   1.0  ...  1.0   

     C_H  C_L  C_N  I_H  I_L  I_N  A_H  A_L  A_N  
0    0.0  1.0  0.0  0.0 



In [15]:
x_train = encode(data_train[['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']])
y_train = data_train['severity']
x_test = encode(data_test[['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']])
x_test.insert(0, 'AV_A', 0)
x_test.insert(3, 'AV_P', 0)
y_test = data_test['severity']
x_validation = encode(data_validation[['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']])
y_validation = data_validation['severity']
# 创建并训练决策树分类器
m = tree.DecisionTreeClassifier()
m.fit(x_train, y_train)

# 使用模型进行预测
y_test_pred = m.predict(x_test)

# 打印测试集分类报告
print('分类报告：\n', classification_report(y_test, y_test_pred))

# 打印准确率
test_accuracy = m.score(x_test, y_test)
print('测试集分类的准确率：%0.4f' % test_accuracy)

print()

# 使用模型进行预测
y_validation_pred = m.predict(x_validation)

# 打印测试集分类报告
print('分类报告：\n', classification_report(y_validation, y_validation_pred))

# 打印准确率
validation_accuracy = m.score(x_validation, y_validation)
print('验证集分类的准确率：%0.4f' % validation_accuracy)

分类报告：
               precision    recall  f1-score   support

    CRITICAL       0.99      0.97      0.98       155
        HIGH       0.98      1.00      0.99       241
         LOW       0.00      0.00      0.00         0
      MEDIUM       1.00      0.99      1.00       314

    accuracy                           0.99       710
   macro avg       0.74      0.74      0.74       710
weighted avg       0.99      0.99      0.99       710

测试集分类的准确率：0.9901

分类报告：
               precision    recall  f1-score   support

    CRITICAL       1.00      1.00      1.00       137
        HIGH       1.00      1.00      1.00       228
         LOW       0.67      0.80      0.73         5
      MEDIUM       0.99      0.99      0.99       333

    accuracy                           0.99       703
   macro avg       0.92      0.95      0.93       703
weighted avg       0.99      0.99      0.99       703

验证集分类的准确率：0.9943


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
