#数据预处理:将表头的空格、引号以及问号去除,以及将大写字母转换为小写字母 #print(churn.columns) churn.columns=churn.columns.str.replace churn.columns=churn.columns.str.lower() #将churn列中元素末尾的'.'去除 churn.churn=churn.churn.str.strip('.') #将churn转换为01编码并创建新列churn01 churn['churn01']=np.where(churn.churn=='True',1,0) print(churn.head()) # 按制定类别变量state分组计算其他变量的均值、最大值、最小值 print(churn.groupby('churn')[['day_calls','eve_calls','night_calls']] ['total_charge']=churn['day_charge']+churn['eve_charge']+churn['night_charge']+churn['intl_charge'] #
churn.bagging <- bagging(churn~., data = trainset, coob=TRUE) churn.bagging Bagging classification trees =trainset$churn) [1] 0.06115418 # 预测 churn.predction <- predict(churn.bagging, newdata = testset, type = "class") prediction.table <- table(churn.predction, testset$churn) prediction.table churn.predction # ROC churn.roc <- roc(testset$churn, churn.predict) plot(churn.roc) # coords得到最佳临界值 coords(churn.roc <- predict(churn.rf, testset) table(churn.prediction, testset$churn) churn.prediction yes no
7.2 k折交叉验证模型性能 这个方法可以解决过度适应的问题, library(modeldata) library(e1071) data(mlc_churn) churnTrain <- mlc_churn # caret选择特征 library(modeldata) library(caret) data(mlc_churn) churnTrain <- mlc_churn ind <- sample(2 ='<em>churn</em>'][,-c(5,6,7)], trainset[,'churn'],sizes = c(1:18), rfeControl = ldaControl) ="churn"]), testset[,c("churn")]) Accuracy Kappa 0.8520710 0.2523709 扩展 7.9 评测回归模型性能 均方根误差法 ="churn"]) table(svm.pred, testset[,"churn",drop=TRUE]) svm.pred yes no yes 12 1 no
) data(mlc_churn) churn <- mlc_churn # 7:3分训练和测试集 set.seed(2) ind <- sample(2,nrow(churnTrain),replace library(rpart) churn.rp <- rpart(churn~., data=trainset) plotcp(churn.rp) summary(churn.rp) 5.4 递归分割树可视化 churn.cp <- churn.rp$cptable[5,"CP"] churn.cp [1] 0.01014199 # 修剪 prune.tree <- prune(churn.rp, cp= names(testset) %in% c("churn", "area_code", "state" )], trainset$churn, k=3) summary(churn.knn) plot (churn.knn) library(caret) confusionMatrix(table(testset$churn,churn.knn)) # ########################
: 1.数据来源于https://github.com/cbrownley/foundations-for-analytics-with-python/tree/master/statistics/churn.csv =pd.read_csv(inputCsv) #数据预处理 #将列标题的空格替换为下划线,将引号和问号去除,标题字母变为小写 churn.columns=churn.columns.str.replace str.lower() #将churn字段值中'.'删除, churn.churn=churn.churn.str.strip('.') #print(churn.head(5)) #新增一个字段,将churn字段转换为01编码字段 churn['churn01']=np.where(churn.churn=='True',1,0) #对字段 (churn[churn.columns.difference(['intl_plan','vmail_plan','churn01','churn','state','phone','account_length
it's the best so far accuracy = clf.score(test_df.drop('Churn', axis=1), test_df['Churn']) column train_df_no_churn = train_df.drop('Churn', axis=1) # calculate the mean cosine similarity # create a DataFrame containing a random sample of 10 points where Churn is 0 sample_churn_0 = train_df Class 1'].iloc[0]) 预测的代码如下: # Add a new column 'predicted_churn' cross_df['predicted_churn'] = '' (['predicted_churn', 'Churn_x']).size().reset_index(name='count') grouped_df__2['percentage'] = grouped_df
inplace=True) # 转换成类别变量 df['SeniorCitizen'] = df['SeniorCitizen'].astype("str") # 将是否流失转换为01分布 df['Churn '] = df['Churn'].map({'No':0,'Yes':1}) 生存分析探索 查看整体生存曲线 fig, ax = plt.subplots(nrows=2, ncols=1, figsize ') cph.print_summary(decimals=1) model lifelines.CoxPHFitter duration col 'tenure' event col 'Churn' = df_model.query("Churn == 0") # 预测中位数生存时间 churn0_median_survive = cph.predict_median(churn0).rename ('median_survive') # 计算剩余价值=月消费*(预测中位数生存时间)-已存续时间 values = pd.concat([churn0_median_survive, churn0[[
) data = pd.read_csv('churn_data.csv') X = data.drop('churn', axis=1) y = data['churn'] # 划分训练集和测试集 X_train ,可以通过以下语句进行预测: SELECT predict_churn(value1, value2, value3, ...) AS churn_probability; 四、批量预测性能优化(避免逐行调用) 逐行调用 UDF 函数在处理大量数据时效率较低,为了提高性能,我们可以对函数进行优化,实现批量数据输入和预测。 (ARRAY_AGG(features)) AS churn_probabilities FROM batch_features; (三)性能对比 通过测试发现,批量预测的效率远高于逐行预测。 列来存储流失标签,调用存储过程如下: CALL churn_prediction_procedure('users', 'churn_label'); (三)验证结果 执行以下语句查看预测结果: SELECT
)、总流失率(Gross Churn)、品牌流失率(Logo Churn)、续约率(Renewal Rate)、净推荐值(NPS)等等。 Within the world of customer success, there are so many metrics that are touted: gross churn, net churn Your net churn is a combination of gross churn (how stable is the current revenue stream from existing than logo churn. churn of 5 percent.
数据探索 数据读入 churn = pd.read_csv('telecom_churn.csv', skipinitialspace=True) churn.head() # 列比较多,显示不完 churn.info() # 发现数据都比较整洁 数据属性说明 数据探索 简洁版本,只是为了一元逻辑回归做的探索,毕竟实际情况中数据分析师们80%的时间可能都是用来清洗数据和结合具体业务来探索数据 churn 流失与否 是否与 posTrend 流量使用上升趋势有关 猜想:posTrend 为 1,即流量使用有上升趋势时,更不容易流失(用得越多越不容易流失) 交叉表分析 cross_table = pd.crosstab(index=churn.posTrend, columns=churn.churn, margins=True) (frac=0.7, random_state=1234).copy() test = churn[~ churn.index.isin(train.index)].copy() # ~ 表示取反,isin
列中的值 Yes和 No分别用 1和 0替换,方便后续处理 telcom['Churn'].replace(to_replace = 'Yes', value = 1,inplace = True) telcom['Churn'].replace(to_replace = 'No', value = 0,inplace = True) telcom['Churn'].head() # In[14 ]: telcom['Churn'].replace(to_replace='Yes', value=1, inplace=True) telcom['Churn'].replace(to_replace "].value_counts() labels=telcom["Churn"].value_counts().index rcParams["figure.figsize"]=6,6 plt.pie plt.title("Churn by Partner") plt.subplot(2,2,4) dependents=sns.countplot(x="Dependents",hue="Churn"
应用这种方法,我们可以将churn的数据特征转化为int64类型: df['Churn'] = df['Churn'].astype('int64') describe()方法用来描述每个数字特征(int64 下面我们查看下客户流失率Churn的分布情况: df[‘Churn’].value_counts() 0 2850 1 483 Name: Churn, dtype: int64 可以看到 ,3333位用户中有2850位有忠诚用户,其对应的Churn为0。 df[‘Churn’].mean() 0.14491449144914492 我们可以看出,客户流失率Churn达到14.5%,这对一家公司来说确实是非常糟糕的结果,因为高流失率会使公司破产。 (x='Customerservice calls', hue='Churn', data=df) 在表中我们无法清楚地看到二者之间的关系,但从图中我们可以发现,客服呼叫次数达到4次时会导致客户流失率churn
目标变量Churn分布 经过初步清洗之后的数据集大小为7032条记录,其中流失客户为1869条,占比26.6%,未流失客户占比73.4%。 df['Churn'].value_counts() No 5163 Yes 1869 Name: Churn, dtype: int64 trace0 = go.Pie(labels =df['Churn'].value_counts().index, values=df['Churn'].value_counts().values, pd.crosstab(df['PaymentMethod'], df['Churn']) plot_bar(input_col='PaymentMethod', target_col='Churn # 重新划分 X = df_model.drop(['customerID', 'Churn'], axis=1) y = df_model['Churn'] # 分层抽样 X_train, X_test
= pd.read_excel(r'C:\Users\Administrator\Desktop\Customer_Churn.xlsx') churn.head() ? font.sans-serif']=['Microsoft YaHei'] # 为确保绘制的饼图为圆形,需执行如下代码 plt.axes(aspect = 'equal') # 统计交易是否为欺诈的频数 counts = churn.churn.value_counts 将二元变量international_plan和voice_mail_plan转换为0-1哑变量 churn.international_plan = churn.international_plan.map ({'no':0,'yes':1}) churn.voice_mail_plan = churn.voice_mail_plan.map({'no':0,'yes':1}) churn.head() ? ], churn.churn, random_state=12) # 构建决策树 dt = tree.DecisionTreeClassifier(n_estimators = 300) dt.fit
') churn_df.types churn_df.describe() churn_train,churn_test,churn_valid = churn_df.split_frame(ratios =[.7, .15]) churn_train y = "Churn" x = churn_df.columns x.remove(y) x.remove("customerID") aml = H2OAutoML nvidia-smi aml.train(x = x, y = y, training_frame = churn_train, validation_frame=churn_valid) lb = aml.leaderboard lb.head() churn_pred=aml.leader.predict(churn_test) churn_pred.head() aml.leader.model_performance (churn_test) model_ids = list(aml.leaderboard['model_id'].as_data_frame().iloc[:,0]) #se = h2o.get_model
library(e1071) # install.packages("C50") # library(C50) # data('churn', package = 'C50') # install.packages ("modeldata") # https://stackoverflow.com/questions/60506936/data-set-churn-not-found-in-package-c50 library(modeldata) data(mlc_churn) churn <- mlc_churn churnTrain <- churn[,! names(testset) %in% c("churn")]) svm.table <- table(svm.pred, testset$churn) svm.table # 调用classAgreement names(testset) %in% c("churn")]) svm.tuned.table <- table(svm.tuned.pred, testset$churn) svm.tuned.table
删除无效贡献的特征 mlc_churn <- mlc_churn[, ! set.seed(1126) ind <- sample(2, nrow(mlc_churn), replace = T, prob = c(0.7, 0.3)) trainset <- mlc_churn 建立模型 churn.rf <- randomForest::randomForest(churn ~ ., data = trainset, importance = T) ? 用训练好的模型进行预测 churn.prediction <- predict(churn.rf, testset) table(churn.prediction, testset$churn) ? 绘制森林对象的均方差 plot(churn.rf) ? 评价各个属性的重要度 importance(churn.rf) varImpPlot(churn.rf) ?
', F.sum('churn').over(user_window)).limit(5).toPandas()df = df.withColumn('preChurn', F.sum('churn') 定义用户流失标签# 定义用户流失def define_churn(df): ''' Define churn @param df - spark dataframe returns 进一步数据探索① 流失率predictor = pd_melt['churn'].value_counts()print(predictor)plt.title('Churn distribution' 数值型特征相关度# 定义数值型特征numerical_churn = numerical + ['churn']# 计算相关性corr_data = pd_melt[numerical_churn].corr ']).count().reset_index()[[colname, 'churn','count']] # churn index 0, 1 doesn't relate to No, Yes
数据可以从BigML的S3 bucket,churn-80和churn-20中获取。churn-80和churn-20两套是来自同一批次,但已被分成80/20的比例。 以下是使用Scala DataFrame API的一些示例查询: train.groupBy("churn").sum("numcs").show +-----+----------+ |churn|sum dtrain.groupBy("churn").count.show 输出: +-----+-----+ |churn|count| +-----+-----+ |False|2278| | True| 在这里,我们保留Churn = True类的所有实例,但是将Churn = False类下采样为388/2278分之一。 ) strain.groupBy("churn").count.show 输出: -----+-----+ |churn|count| +-----+-----+ |False|379| | True|
= pd.read_excel(r'C:\Users\Administrator\Desktop\Customer_Churn.xlsx') churn.head() # 中文乱码的处理 plt.rcParams font.sans-serif']=['Microsoft YaHei'] # 为确保绘制的饼图为圆形,需执行如下代码 plt.axes(aspect = 'equal') # 统计交易是否为欺诈的频数 counts = churn.churn.value_counts 将二元变量international_plan和voice_mail_plan转换为0-1哑变量 churn.international_plan = churn.international_plan.map ({'no':0,'yes':1}) churn.voice_mail_plan = churn.voice_mail_plan.map({'no':0,'yes':1}) churn.head() 如上表所示 ], churn.churn, random_state=12) # 构建决策树 dt = tree.DecisionTreeClassifier(n_estimators = 300) dt.fit