当前位置：网站首页>欺诈检测案例AND泰坦尼克号获救案例

欺诈检测案例AND泰坦尼克号获救案例

2022-07-21 16:49:00 【强仔fight】

欺诈检测案例（样本不平衡，标准化，交叉验证，模型评估）

#绘制类别比例图
count_classes = pd.value_counts(data['Class'], sort=True).sort_index()
count_classes.plot(kind="bar")
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")

#标准化操作
from sklearn.preprocessing import StandardScaler
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].reshape(-1, 1))  # -1 系统自动计算行数
data =data.drop("Time", "Amount", axis=1)

#下采样策略
X=data.ix[:, data.columns != 'Class']
y=data.ix[:, data.columns == 'Class']
#获取类别为1的个数
number_records_fraud = len(data[data.Class==1])
fraud_indices = np.array(data[data.Class==1].index)
normal_indices = data[data.Class == 0].index
#从类别为0的索引组当中 随机选择 和类别为1一样数量的 拿出来
random_normal_indices = np.random.choice(normal_indices, number_records_fraud,replace=False)
random_normal_indices = np.array(random_normal_indices)
#混合两种类别的索引
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])
under_sample_data = data.iloc[under_sample_indices,:]
X_undersample = under_sample_data.ix[:, under_sample_data !='Class']
y_undersample = under_sample_data.ix[:, under_sample_data =='Class']

#调用切分训练集和测试集的工具
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=0)

from sklearn.linear_model imprt LogisticRegression
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.metrics import confusion_matrix,recall_score,classification_report

混淆矩阵

# threshold值可以自己指定，值越大越严格
lr=LogisticRegression(C=0.01, penalty='l1')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
y_pred_undersample_proba = lr.predict_proba(X_test_undersample.values)  #获取模型给出的类别概率值
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
plt.figure(figsize=(10,10))
j=1
for i in thresholds:
    y_test_predictions_high_recall = y_pred_undersample_proba[:,1]>i     #概率值大于阈值才判定为某类别
    plt.subplot(3,3,j)
    j+=1
    cnf_matrix = confusion_matrix(y_test_undersample, y_test_predictions_high_recall)
    np.set_printoptions(precision=2)

    class_names =[0,1]
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Threshold >= %s'%i)
plt.show()

过采样策略：

#SMOTE算法 训练集生成
import imblearn.over_sampling import SMOTE
oversampler=SMOTE(random_state=0)
os_features, os_labels=oversampler.fit_sample(features_train, labels_train)

泰坦尼克号获救案例（缺失值填充，数字字符映射，提取特征，算法集成）
调用线性回归算法

from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import KFold
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch"]
alg = LinearRegression()
kf = KFold(titaniic.shape[0], n_folds=3, random_state=1)
predictors=[]
for train, test in kf:
    train_predictors = (titanic[predictors].iloc[train, :])
    train_target = titanic["Survived"].iloc[train]
    alg.fit(train_predictors, train_target)
    test_predictions = alg.predict(titanic[predictors].iloc[test,:])
    predictions.append(test_predictions)
    
import numpy as np
predictions = np.concatenate(predictions, axis=0)
predictions[predictions >.5]=1
predictions[predictions<=.5]=0
accuracy = sum(predictions[predictions == titanic["Survived"]])/len(predictions)

调用逻辑回归算法尝试

from sklearn import cross_validation
from sklearn linear_model import LogisticRegression
alg = LogisticRegression(random_state=1)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
scores.mean()

调用随机森林算法尝试

from sklearn.ensemble import RandomForestClassifier
alg=RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)
kf = cross_validation.KFold(titanic.shape[0], n_folds=3, random_state=1)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"],cv=kf)

调整参数

alg=RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=4, min_samples_leaf=2)

抽取特征

titanic["Familysize"]=titanic[""]+titanic[""]
titanic["NameLength"]=titanic["Name"].apply(lambda x:len(x))
import re
def get_title(name):
    title_search = re.search('([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""
titles = titanic["Name"].apply(get_title)
print(pandas.value_counts(titles))
title_mapping={"Mr":1, "Miss":2, "Mrs":3, }
for k,v in title_mapping.items():
    title[titles == k] = v
titanic["Title"]=titles

验证各特征的重要性

import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt
predictors=["", "", ""]
selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors],titanic["Survived"])
scores = -np.log10(selector.pvalues)
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
plt.show()

算法集成

import numpy as np
algorithms = [
[GradientBoostingClassifier(random_state=1, n_estimators=25,max_depth=3),["Pclass","Sex"]],
[LogisticRegression(random_state=1), ["Pclass","Sex"]]
]
kf = KFold(titanic.shape[0], n_folds=3, random_state=1)
predictions = []
for train,test in kf:
    train_target = titanic["Survived"].iloc[train]
    full_test_predictions = []
    for alg, predictors in alograms:
        alg.fit(titanic[predictors].iloc[train, :], train_target)
        test_predictions = alg.predict_proba(titanic[predictors].iloc[test,:].astype(float))[:,1]
        full_test_predictions.append(test_predictions)
    test_predictions = (full_test_predictions[0] + full_test_predictions[1])/2
    test_predictions[test_predictions <=.5]=0
    test_predictions[test_predictions > .5]=1

原网站

版权声明
本文为[强仔fight]所创，转载请带上原文链接，感谢
https://blog.csdn.net/qq_35076836/article/details/104283731

当前位置：网站首页>欺诈检测案例AND泰坦尼克号获救案例

欺诈检测案例AND泰坦尼克号获救案例

边栏推荐

猜你喜欢

随机推荐