当前位置:网站首页>欺诈检测案例AND泰坦尼克号获救案例
欺诈检测案例AND泰坦尼克号获救案例
2022-07-21 16:49:00 【强仔fight】
欺诈检测案例(样本不平衡,标准化,交叉验证,模型评估)
# Plot how many rows fall into each value of the Class column.
# Series.value_counts replaces the deprecated top-level pd.value_counts;
# sort_index() orders the bars by class label rather than by frequency.
count_classes = data['Class'].value_counts().sort_index()
count_classes.plot(kind="bar")
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")
# Standardization: rescale Amount and store the result as normAmount.
from sklearn.preprocessing import StandardScaler

# StandardScaler expects a 2-D array. A pandas Series has no reshape(), so go
# through the underlying NumPy array; -1 lets NumPy infer the number of rows.
amount_2d = data['Amount'].values.reshape(-1, 1)
data['normAmount'] = StandardScaler().fit_transform(amount_2d).ravel()

# drop() takes the labels as ONE list; with two positional strings the second
# one ("Amount") is bound to `axis` and the call raises TypeError.
data = data.drop(["Time", "Amount"], axis=1)
# Under-sampling strategy: keep every Class == 1 row and an equally sized
# random subset of the Class == 0 rows.

# Feature matrix / label column for the full data set.
# (.ix has been removed from pandas; .loc with a boolean column mask is the
# direct replacement.)
X = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']

# Number of Class == 1 rows and their index labels.
number_records_fraud = len(data[data.Class == 1])
fraud_indices = np.array(data[data.Class == 1].index)
normal_indices = data[data.Class == 0].index

# From the Class == 0 index labels, randomly pick (without replacement) as
# many as there are Class == 1 rows.
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)
random_normal_indices = np.array(random_normal_indices)

# Merge the index labels of both classes.
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])
# These are index LABELS, so select with .loc (iloc would only be equivalent
# for a default 0..n-1 index).
under_sample_data = data.loc[under_sample_indices, :]

# The mask must be built from the column names; comparing the whole DataFrame
# to 'Class' yields a 2-D mask that cannot select columns.
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']
# Split into training and test sets.
# sklearn.cross_validation was removed; the same helpers live in
# sklearn.model_selection.
from sklearn.model_selection import train_test_split

# 70/30 split of the full data set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Same split for the under-sampled data; the model fitting further down reads
# X_train_undersample / X_test_undersample / y_*_undersample.
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(
    X_undersample, y_undersample, test_size=0.3, random_state=0
)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, classification_report
混淆矩阵
# The decision threshold can be chosen freely; the larger the value, the
# stricter the model is about predicting class 1.
# The L1 penalty needs a solver that supports it; liblinear does.
lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
# Class probabilities from the model; column 1 is the probability of class 1.
# Predict on the DataFrame (not .values) so the input matches what fit() saw.
y_pred_undersample_proba = lr.predict_proba(X_test_undersample)

thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
class_names = [0, 1]
np.set_printoptions(precision=2)

plt.figure(figsize=(10, 10))
# One subplot per threshold, laid out on a 3x3 grid (subplot slots are 1-based).
for slot, threshold in enumerate(thresholds, start=1):
    # A sample is assigned to class 1 only if its probability exceeds the threshold.
    y_test_predictions_high_recall = y_pred_undersample_proba[:, 1] > threshold
    plt.subplot(3, 3, slot)
    cnf_matrix = confusion_matrix(y_test_undersample, y_test_predictions_high_recall)
    # NOTE(review): plot_confusion_matrix is not defined in this snippet;
    # presumably a plotting helper defined elsewhere - confirm.
    # The title uses ">" to match the strict comparison above.
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Threshold > %s' % threshold)
plt.show()
过采样策略:
# SMOTE: generate an over-sampled training set.
from imblearn.over_sampling import SMOTE

oversampler = SMOTE(random_state=0)
# fit_sample was renamed to fit_resample in imbalanced-learn.
os_features, os_labels = oversampler.fit_resample(features_train, labels_train)
泰坦尼克号获救案例(缺失值填充,数字字符映射,提取特征,算法集成)
调用线性回归算法
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

# Columns used as model inputs.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch"]
alg = LinearRegression()

# Three consecutive (unshuffled) folds. Because the folds are contiguous and
# in row order, concatenating the per-fold predictions below lines up with the
# rows of `titanic`. random_state is only meaningful together with
# shuffle=True, so it is not passed.
kf = KFold(n_splits=3)

# Per-fold predictions are collected here (the feature list must NOT be reset).
predictions = []
for train, test in kf.split(titanic):
    # Fit on the training rows of this fold only.
    train_predictors = titanic[predictors].iloc[train, :]
    train_target = titanic["Survived"].iloc[train]
    alg.fit(train_predictors, train_target)
    # Predict the held-out rows.
    test_predictions = alg.predict(titanic[predictors].iloc[test, :])
    predictions.append(test_predictions)

predictions = np.concatenate(predictions, axis=0)
# Turn the regression output into 0/1 labels.
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0

# Fraction of rows whose predicted label equals the true label. (Summing the
# matching prediction VALUES would only count correctly predicted 1s.)
accuracy = (predictions == titanic["Survived"].values).mean()
调用逻辑回归算法 尝试
# sklearn.cross_validation was removed; cross_val_score now lives in
# sklearn.model_selection.
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

alg = LogisticRegression(random_state=1)
# 3-fold cross-validated score of the classifier on the selected columns.
scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
# Print the mean so the result is visible outside an interactive session.
print(scores.mean())
调用随机森林算法 尝试
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score

alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)
# KFold now takes the number of splits instead of the sample count; the rows
# are supplied by cross_val_score. Folds are unshuffled, so random_state is
# not passed (it is only valid together with shuffle=True).
kf = KFold(n_splits=3)
scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=kf)
调整参数
# Same classifier with adjusted hyper-parameters: 50 trees, at least 4 samples
# to split a node and at least 2 samples per leaf.
alg = RandomForestClassifier(
    random_state=1,
    n_estimators=50,
    min_samples_split=4,
    min_samples_leaf=2,
)
抽取特征
# Family size as the sum of the two relative-count columns.
# NOTE(review): the original left both column names empty; SibSp and Parch are
# the relative-count columns used as predictors elsewhere in this article -
# confirm this is the intended pair.
titanic["Familysize"] = titanic["SibSp"] + titanic["Parch"]
# Length of each passenger's name string.
titanic["NameLength"] = titanic["Name"].apply(len)
import re
def get_title(name):
    """Return the first run of ASCII letters that is directly followed by a period.

    For names such as "Braund, Mr. Owen Harris" this is the honorific ("Mr").

    Args:
        name: The full name string to search.

    Returns:
        The matched letters without the trailing period, or "" when the name
        contains no such run.
    """
    # Raw string so that `\.` reaches the regex engine as an escaped dot
    # instead of being an invalid string escape.
    title_search = re.search(r'([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""
# Extract the title from every name and show how often each one occurs.
titles = titanic["Name"].apply(get_title)
print(titles.value_counts())

# Replace the listed titles with integer codes. Titles that are not keys of
# this mapping are left as their original strings.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3}
for k, v in title_mapping.items():
    titles[titles == k] = v

titanic["Title"] = titles
验证各特征的重要性
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt

# Candidate features to score.
# NOTE(review): the original list held three empty placeholders (fewer than
# k=5, and not valid column names). The names below are the predictor and
# derived columns that appear elsewhere in this article - adjust as needed.
# This assumes every value in "Title" has already been mapped to a number.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Familysize", "NameLength", "Title"]

# Univariate selection using the ANOVA F-test, keeping the 5 best features.
selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors], titanic["Survived"])

# Fitted attributes carry a trailing underscore: pvalues_, not pvalues.
# -log10 turns small p-values into large scores.
scores = -np.log10(selector.pvalues_)

# One bar per candidate feature, labelled with the column name.
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
plt.show()
算法集成
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

# Each entry pairs a classifier with the columns it is trained on.
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), ["Pclass", "Sex"]],
    [LogisticRegression(random_state=1), ["Pclass", "Sex"]],
]

# Three consecutive (unshuffled) folds, so concatenating the per-fold results
# keeps them in row order. random_state is only valid with shuffle=True.
kf = KFold(n_splits=3)

predictions = []
for train, test in kf.split(titanic):
    train_target = titanic["Survived"].iloc[train]
    full_test_predictions = []
    # Fit every classifier on this fold and collect its class-1 probabilities
    # for the held-out rows.
    for alg, predictors in algorithms:
        alg.fit(titanic[predictors].iloc[train, :], train_target)
        test_predictions = alg.predict_proba(titanic[predictors].iloc[test, :].astype(float))[:, 1]
        full_test_predictions.append(test_predictions)
    # Average the two models' probabilities, then threshold at 0.5.
    test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
    test_predictions[test_predictions <= .5] = 0
    test_predictions[test_predictions > .5] = 1
    # Keep this fold's labels; without this the results were discarded.
    predictions.append(test_predictions)

# Single array of ensemble predictions for all rows.
predictions = np.concatenate(predictions, axis=0)
边栏推荐
猜你喜欢
随机推荐
R language tests the significance of correlation coefficient: use cor.test function to calculate the value and confidence interval of correlation coefficient and its statistical significance (if the v
测试面试过程中遇到的问题整理
架构师进阶,微服务设计与治理的 16 条常用原则
mysql的binlog
js中的三目运算符详解
Chiitoitsu
性能测试总体测试框架
Anaconda安装jupyter lab + jupyterlsp(代码提示,代码纠错)详细搭建过程
PMP candidates note that the examinations in these regions will be postponed
Move! What if you can't take the PMP Exam?
Cmake uses the boost static library, and the error prompt is to find the could not find boost (missing: system thread filesystem
小程序选项卡
How to make random factors in the game win the trust of players again
wallys/new product/DR7915/MT7915+MT7975/WiFi6 MiniPCIe Module 2T2R
微信小程序 wx.request的简单封装
Information sharing | hc-05 Bluetooth module information
告诉我一下股票开户找人办比较方便么?请问,手机开户股票开户安全吗?
CDH 6.1 环境搭建图文教程
Wechat payment native (I) preparation and related knowledge
智能运维场景解析:如何通过异常检测发现业务系统状态异常