import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the credit card transactions and look at how imbalanced the classes are.
DATA = pd.read_csv("creditcard.csv")
count_classes = DATA["Class"].value_counts().sort_index()
count_classes.plot(kind="bar")
plt.show()
from sklearn.preprocessing import StandardScaler

# Standardize Amount (its scale is far larger than the PCA features), then drop the raw columns.
DATA["normAmount"] = StandardScaler().fit_transform(DATA["Amount"].values.reshape(-1, 1))
DATA = DATA.drop(["Time", "Amount"], axis=1)

DATA_matrix = DATA.values
X = DATA_matrix[:, DATA.columns != "Class"]
y = DATA_matrix[:, DATA.columns == "Class"]
# Undersampling: keep every fraud row plus an equal-sized random sample of normal rows.
number_records_fraud = len(DATA[DATA["Class"] == 1])
fraud_indices = np.array(DATA[DATA["Class"] == 1].index)
norm_indices = np.array(DATA[DATA["Class"] == 0].index)

random_norm_indices = np.random.choice(norm_indices, number_records_fraud, replace=False)
random_norm_indices = np.array(random_norm_indices)

under_sample_indices = np.concatenate([fraud_indices, random_norm_indices])
under_sample = DATA.iloc[under_sample_indices, :]
x_under_sample = under_sample.values[:, DATA.columns != "Class"]
y_under_sample = under_sample.values[:, DATA.columns == "Class"]
from sklearn.model_selection import train_test_split

# 70/30 split of the full data and of the undersampled data.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
x_train_undersample, x_test_undersample, y_train_undersample, y_test_undersample = train_test_split(
    x_under_sample, y_under_sample, test_size=0.3, random_state=0)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, recall_score


def printing_kfold_score(x_train_data, y_train_data):
    """Pick the best C for L1 logistic regression by mean recall over 5-fold CV."""
    x_train_data = pd.DataFrame(x_train_data)
    y_train_data = pd.DataFrame(y_train_data)

    fold = KFold(n_splits=5, shuffle=False)
    c_param_range = [0.01, 0.1, 1, 10, 100]
    results_table = pd.DataFrame(columns=['C_parameter', 'Mean recall score'])
    results_table['C_parameter'] = c_param_range

    j = 0
    for c_param in c_param_range:
        recall_accs = []
        for train_indices, test_indices in fold.split(x_train_data):
            lr = LogisticRegression(C=c_param, penalty="l1", solver="liblinear")
            lr.fit(x_train_data.iloc[train_indices, :],
                   y_train_data.iloc[train_indices, :].values.ravel())
            y_pred_undersample = lr.predict(x_train_data.iloc[test_indices, :].values)
            recall_acc = recall_score(y_train_data.iloc[test_indices, :].values, y_pred_undersample)
            recall_accs.append(recall_acc)
            print("Round %s" % (j + 1))
            print(recall_acc)
        results_table.loc[j, "Mean recall score"] = np.mean(recall_accs)
        j += 1
        print(np.mean(recall_accs))

    print(results_table.head())
    best_c = results_table.loc[results_table["Mean recall score"].astype("float").idxmax()]["C_parameter"]
    print(best_c)
    return best_c


best_c = printing_kfold_score(x_train_undersample, y_train_undersample)
import itertools


def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix with the count printed in each cell."""
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    thresh = cm.max() / 2.
    print(thresh)
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        print(i, j)
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# Train on the undersampled training set and evaluate on the undersampled test set.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(x_train_undersample, y_train_undersample.ravel())
y_pred_undersample = lr.predict(x_test_undersample)

cnf_matrix = confusion_matrix(y_test_undersample, y_pred_undersample)
np.set_printoptions(precision=2)
print("Recall metric in the testing dataset: ",
      cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))

class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
# The same undersample-trained model, now evaluated on the full (imbalanced) test set.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(x_train_undersample, y_train_undersample.ravel())
y_pred = lr.predict(x_test)

cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print("Recall metric in the testing dataset: ",
      cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))

class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
# Sweep the decision threshold on the predicted fraud probability.
lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
lr.fit(x_train_undersample, y_train_undersample.ravel())
y_pred_undersample_proba = lr.predict_proba(x_test_undersample)

thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
plt.figure(figsize=(10, 10))
j = 1
for i in thresholds:
    y_test_predictions_high_recall = y_pred_undersample_proba[:, 1] > i
    plt.subplot(3, 3, j)
    j += 1
    cnf_matrix = confusion_matrix(y_test_undersample, y_test_predictions_high_recall)
    np.set_printoptions(precision=2)
    print("Recall metric in the testing dataset: ",
          cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))
    class_names = [0, 1]
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Threshold >= %s' % i)
plt.show()
from imblearn.over_sampling import SMOTE

# Oversampling with SMOTE: reload the raw data and split before resampling,
# so synthetic samples are generated from the training portion only.
DATA = pd.read_csv("creditcard.csv")
columns = DATA.columns
features_columns = columns.delete(len(columns) - 1)
features = DATA[features_columns]
label = DATA["Class"]
feature_train, feature_test, label_train, label_test = train_test_split(
    features, label, test_size=0.2, random_state=0)

oversampler = SMOTE(random_state=0)
os_features, os_labels = oversampler.fit_resample(feature_train, label_train)
print(len(os_labels[os_labels == 1]))

os_features = pd.DataFrame(os_features)
os_labels = pd.DataFrame(os_labels)
best_c = printing_kfold_score(os_features, os_labels)

lr = LogisticRegression(C=best_c, penalty="l1", solver="liblinear")
lr.fit(os_features, os_labels.values.ravel())
y_pred = lr.predict(feature_test.values)

conf_matrix = confusion_matrix(label_test, y_pred)
np.set_printoptions(precision=2)
print("recall = %s" % (conf_matrix[1, 1] / (conf_matrix[1, 0] + conf_matrix[1, 1])))

class_names = [0, 1]
plt.figure()
plot_confusion_matrix(conf_matrix, classes=class_names, title="Confusion matrix")
plt.show()
Two ways to deal with the class imbalance: undersampling and oversampling. With undersampling the recall is high, but so is the false-positive rate (many normal transactions get flagged as fraud). With oversampling the recall may drop a little, but accuracy goes up and far fewer normal transactions are misclassified.
How SMOTE works: 1. For every sample $x$ in the minority class, compute the Euclidean distance from $x$ to every other minority-class sample to find its $k$ nearest neighbors. 2. Pick $n$ of those neighbors ($n$ is the oversampling multiple). 3. For each chosen neighbor $\widetilde{x}$, generate a new sample with

$$x_{\text{new}} = x + \operatorname{rand}(0, 1) \times (\widetilde{x} - x)$$
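As a toy sketch of step 3 only (this is not how imblearn's SMOTE is implemented internally; the two vectors and the random seed below are made up for illustration):

import numpy as np

rng = np.random.default_rng(0)
x = np.array([1.0, 2.0])                    # a minority-class sample (made-up values)
x_tilde = np.array([3.0, 1.0])              # one of its k nearest minority-class neighbors
x_new = x + rng.random() * (x_tilde - x)    # interpolate somewhere on the segment between them
print(x_new)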
Regularization penalty: in machine learning, and especially in deep learning, the more complex the model, the better it fits the current training set, but its ability to generalize may be poor. We therefore add a regularization penalty term to the loss function; how much the penalty matters is controlled by the regularization coefficient.
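For the L1 logistic regression used above, this works out (up to a constant scaling of the objective) to minimizing $\sum_i \ell(\theta; x_i, y_i) + \frac{1}{C}\lVert\theta\rVert_1$, so a smaller C means a stronger penalty and sparser, simpler coefficients; C is exactly the value the K-fold search in printing_kfold_score tunes.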
Steps of cross-validation: 1. Shuffle the data. 2. Hold out a fixed proportion of it as the test set. 3. Run cross-validation inside the training set (resampling to fix the class imbalance also happens only on the training part; the test set is never touched). A minimal sketch of this order of operations follows.
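The sketch below uses made-up toy data; X_demo and y_demo are illustrative names, not variables from the script above.

import numpy as np
from sklearn.model_selection import KFold, train_test_split

X_demo = np.arange(40).reshape(20, 2)       # toy feature matrix
y_demo = np.array([0] * 15 + [1] * 5)       # toy imbalanced labels

# Step 2: hold out the test set first; it is never resampled or refit on.
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.3, random_state=0)

# Steps 1 and 3: shuffling and cross-validation happen only inside the training portion.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
for train_idx, val_idx in kf.split(X_tr):
    print(len(train_idx), len(val_idx))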
Features whose scale is clearly different from the other features (such as Amount here) need to be standardized, because otherwise the learning algorithm implicitly treats larger-valued features as more important.
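For reference, the StandardScaler call near the top of the script applies exactly this column-wise z-score transform; a minimal sketch on made-up Amount values:

import numpy as np

amount = np.array([2.0, 150.0, 3800.0])                 # made-up Amount values
norm_amount = (amount - amount.mean()) / amount.std()   # z-score: (x - mean) / std
print(norm_amount)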