【web】Fastapi自动生成接口文档(Swagger、ReDoc )
后台-插件-广告管理-内容页头部广告(手机) |
-
【python操作】将本地文件上传到远程服务器
qq_21738627: 大佬我这边文件传输失败,也没有给出任何报错原因
-
kaggle:泰坦尼克生存预测( R语言机器学习分类算法)
# Comment by m0_64192979:
# kaggle -- Titanic: Machine Learning from Disaster
# Part 0: helper-function definitions

# 1. Data-quality summary table.
# For each column of data frame `x`, reports: storage mode, number of
# distinct values, NA count and ratio, non-NA count and ratio.
# Returns a data frame with one column per input column.
data_quality <- function(x) {
  n_col <- ncol(x)
  n_row <- nrow(x)
  # preallocate instead of growing with c() inside the loop
  mode_data <- character(n_col)
  diff_data <- numeric(n_col)
  na_data <- numeric(n_col)
  na_datar <- character(n_col)
  fna_data <- numeric(n_col)
  fna_datar <- character(n_col)
  for (i in seq_len(n_col)) {
    mode_data[i] <- mode(x[, i])
    diff_data[i] <- length(unique(x[[i]]))
    na_data[i] <- sum(is.na(x[, i]))
    na_datar[i] <- paste0(round(na_data[i] / n_row, 4) * 100, "%")
    fna_data[i] <- sum(!is.na(x[, i]))
    fna_datar[i] <- paste0(round(fna_data[i] / n_row, 4) * 100, "%")
  }
  # rbind coerces everything to character; that matches the original output
  result <- rbind(mode_data, diff_data, na_data, na_datar, fna_data, fna_datar)
  colnames(result) <- colnames(x)
  rownames(result) <- c("数据类型", "不同值个数", "空值个数", "空值比例", "有值个数", "有值比例")
  as.data.frame(result)
}

# 2. Categorical conversion: any column with fewer than 5 distinct values
# is treated as categorical and converted to a factor.
data_transform <- function(x) {
  for (i in seq_len(ncol(x))) {
    if (length(unique(x[[i]])) < 5) {
      x[[i]] <- as.factor(x[[i]])
    }
  }
  x
}

# 3a. Quality table for numeric columns: min, max, mean, standard deviation,
# and mean +/- 3 sd bounds (a crude outlier band).
quality_numeric <- function(x) {
  n_col <- ncol(x)
  m1 <- numeric(n_col)
  m2 <- numeric(n_col)
  m3 <- numeric(n_col)
  stdev <- numeric(n_col)
  m3_r <- numeric(n_col)
  m3_l <- numeric(n_col)
  options(digits = 2)  # global side effect kept from the original version
  for (i in seq_len(n_col)) {
    m1[i] <- min(x[[i]], na.rm = TRUE)
    m2[i] <- max(x[[i]], na.rm = TRUE)
    m3[i] <- mean(x[[i]], na.rm = TRUE)
    # BUG FIX: original used sqrt(sd(...)). sd() already returns the
    # standard deviation (the square root of the variance), so taking
    # sqrt again made StDev and the M-3/M+3 bounds wrong.
    stdev[i] <- sd(x[[i]], na.rm = TRUE)
    m3_r[i] <- m3[i] - 3 * stdev[i]
    m3_l[i] <- m3[i] + 3 * stdev[i]
  }
  result <- cbind(m1, m2, m3, stdev, m3_r, m3_l)
  rownames(result) <- names(x)
  colnames(result) <- c("Min", "Max", "Mean", "StDev", "M-3", "M+3")
  as.data.frame(result)
}

# 3b. Quality table for factor columns: one row per column, with the
# levels and their counts concatenated as ":lvl1:lvl2:..." strings.
quality_factor <- function(x) {
  Level <- c()
  Count <- c()
  for (i in seq_len(ncol(x))) {
    r <- table(x[[i]])
    le <- c()
    co <- c()
    for (k in seq_along(r)) {
      # first paste() treats the zero-length `le` as "", so the result
      # deliberately starts with ":" — kept for output compatibility
      le <- paste(le, names(r)[k], sep = ":")
      co <- paste(co, r[k], sep = ":")
    }
    Level <- rbind(Level, le)
    Count <- rbind(Count, co)
  }
  result <- cbind(Level, Count)
  rownames(result) <- names(x)
  colnames(result) <- c("Level", "Count")
  as.data.frame(result)
}

# 4. Model evaluation from a 2x2 confusion matrix laid out as
#   rows = Actual (0, 1), cols = Predicted (0, 1).
# Returns sensitivity, specificity, PPV, NPV, accuracy and F1.
# `n` is accepted for backward compatibility but unused.
performance <- function(table, n = 2) {
  if (!all(dim(table) == c(2, 2))) stop("Must be a 2*2 table")
  tn <- table[1, 1]
  fn <- table[2, 1]
  tp <- table[2, 2]
  fp <- table[1, 2]
  sensitivity <- tp / (tp + fn)
  specificity <- tn / (tn + fp)
  ppp <- tp / (tp + fp)
  npp <- tn / (tn + fn)
  hitrate <- (tp + tn) / (tp + tn + fp + fn)
  F1 <- 2 * sensitivity * ppp / (ppp + sensitivity)
  result <- rbind(sensitivity, specificity, ppp, npp, hitrate, F1)
  # typo fixes in display labels: "positivive"/"negtive" -> correct spelling
  rownames(result) <- c("sensitivity", "specificity",
                        "positive predictive value",
                        "negative predictive value",
                        "accuracy", "F1")
  colnames(result) <- c("model")
  result
}

# 5. Packages
library(stringr)      # string handling
library(Amelia)       # missing-value map (missmap)
library(VIM)          # missing-value visualisation (aggr)
library(ggplot2)      # plotting
# plot grids:
# install.packages("devtools")
# library(devtools)
# install_github("easyGgplot2", "kassambara")
library(easyGgplot2)

# -------- Part 1: read data -------- ####
setwd("D:\\桃子的数据\\Titani Machine Learning from Disaster")
train <- read.csv("train.csv", header = TRUE, sep = ",",
                  stringsAsFactors = FALSE, na.strings = c("NA", ""))
test <- read.csv("test.csv", header = TRUE, sep = ",",
                 stringsAsFactors = FALSE, na.strings = c("NA", ""))

# -------- Part 2: data understanding -------- ####
# ---- 2.1 raw data quality ####
# overall quality table
train_data_quality <- data_quality(train)
train_data_quality
# Training set has 891 records; Age has 19.87% missing (177 rows), which can
# be imputed from the Name (title) field. Survived, Pclass, Sex, SibSp,
# Parch, Embarked are categorical; the rest are complete.

# quality table (numeric columns)
numeric_train <- train[, c("Age", "Fare", "SibSp", "Parch")]
quality_numeric_train <- quality_numeric(numeric_train)
quality_numeric_train
length(train$Fare[which(train$Fare == 0)])
# Fare has 15 zero values — likely anomalies. Fare depends on Pclass, so
# impute from the class-wise values later.
# library(rcompanion)
# plotNormalHistogram(numeric_train[,1])
# plotNormalHistogram(numeric_train[,2])

# quality table (categorical columns)
factor_train <- train[, c("Survived", "Pclass", "Sex", "Embarked")]
quality_factor_train <- quality_factor(factor_train)
quality_factor_train
table(train$Embarked, useNA = "always")
# Embarked has 2 empty strings (read in as NA); fill with the mode later.

# missing-data visualisation
# library("Amelia")
# missmap(train, main = "Missing Map")

# ---- 2.2 type conversion ####
# columns with fewer than 5 levels become factors
train <- data_transform(train)
str(train)

# ---- 2.3 exploratory analysis ####
# 2.3.1 overall survival
options(digits = 2)
# Overall survival bar chart (title/axis labels are Chinese in the original).
ggplot(train,aes(x=Survived,fill=Survived))+geom_bar() +labs(title="总体幸存情况",x="是否幸存",y="人数") +scale_fill_manual(values=c("#999999", "#E69F00")) +theme(plot.title = element_text(hjust = 0.5),legend.position = "none")
prop.table(table(train$Survived))
# 38% of passengers died, 62% survived.

# 2.3.2 overall age / sex distribution
plot1<-ggplot(train,aes(x=Age,fill=Pclass))+geom_density(alpha=.3)+labs(title="Age distribution")+theme(plot.title = element_text(hjust = 0.5))
plot2<-ggplot(train,aes(x=Sex,fill=Sex))+geom_bar()+labs(title="乘客性别分布")+scale_fill_manual(values=c("#56B4E9", "#E69F00"))+theme(plot.title = element_text(hjust = 0.5),legend.position = "none")
ggplot2.multiplot(plot1,plot2,cols=2)
# drop rows with missing Age before computing class-wise means
train_age<- train[!is.na(train$Age),]
tapply(train_age$Age,train_age$Pclass,mean)
prop.table(table(train$Sex))
# After removing missing values: mean age by class is 1st ~38, 2nd ~30,
# 3rd ~25; males are ~65% of passengers, so male deaths are partly a
# sample-share effect.

# 2.3.3 survival by passenger class
ggplot(train,aes(x=Pclass,fill=Survived))+geom_bar()+labs(title="Survival of different Pclass")+scale_fill_manual(values=c("#999999", "#E69F00"))+theme(plot.title = element_text(hjust = 0.5))
prop.table(table(train$Survived,train$Pclass),margin = 2)
# Survival rates: 1st class 63%, 2nd 47%, 3rd 24%. Lifeboats were located
# near 1st/2nd class; evacuation followed class priority rather than the
# popular "women and children first" account.

# 2.3.4 survival by age
ggplot(train,aes(x=Age))+geom_density()+labs(title="Age distribution")+theme(plot.title = element_text(hjust = 0.5))
# Medicine commonly defines children as 0-14, so analyse that subgroup.
train_age_14<- train_age[which(train_age$Age <= 14),]
# bucket children into "upper cabins" (1st/2nd) vs "3rd class"
train_age_14$pclass14<- ""
train_age_14$pclass14[train_age_14$Pclass==1 | train_age_14$Pclass==2]<- "高等舱"
train_age_14$pclass14[train_age_14$Pclass==3]<- "三等舱"
# cross-tabulations
table(train_age_14$Survived)
table(train_age_14$pclass14,train_age_14$Survived)
prop.table(table(train_age_14$Survived))
prop.table(table(train_age_14$pclass14,train_age_14$Survived),margin = 1)
# plots follow on the next section
# Children (0-14) survival plots: overall and by cabin group.
plot3<-ggplot(train_age_14,aes(x=Survived,fill=Survived))+geom_bar()+labs(title="儿童幸存情况(0-14岁)",x="是否幸存",y="人数")+scale_fill_manual(values=c("#999999", "#E69F00"))+theme(plot.title = element_text(hjust = 0.5),legend.position = "none")
plot4<-ggplot(train_age_14,aes(x=pclass14,fill=Survived))+geom_bar()+labs(title="不同船舱儿童幸存情况(0-14岁)",x="船舱等级",y="人数")+scale_fill_manual(values=c("#999999", "#E69F00"))+theme(plot.title = element_text(hjust = 0.5))
ggplot2.multiplot(plot3,plot4,cols=2)
# Children's survival rate is 58%; 1st/2nd-class children 96% (24 saved,
# 1 lost) vs 3rd class 42% (22 saved, 31 lost) — cabin class dominates.

# 2.3.5 survival by sex
train_female<- train[which(train$Sex=="female"),]
# bucket women into "upper cabins" (1st/2nd) vs "3rd class"
train_female$pclass_female<- ""
train_female$pclass_female[train_female$Pclass==1 | train_female$Pclass==2]<- "高等舱"
train_female$pclass_female[train_female$Pclass==3]<- "三等舱"
# cross-tabulations
table(train_female$Survived)
table(train_female$pclass_female,train_female$Survived)
prop.table(table(train$Sex))
prop.table(table(train_female$Survived,train_female$pclass_female),margin = 2)
# plots
plot5<-ggplot(train,aes(x=Sex,fill=Survived))+geom_bar()+labs(title="不同性别幸存情况",x="性别",y="人数")+scale_fill_manual(values=c("#56B4E9", "#E69F00"))+theme(plot.title = element_text(hjust = 0.5),legend.position = "none")
plot6<-ggplot(train_female,aes(x=pclass_female,fill=Survived))+geom_bar()+labs(title="不同船舱女性幸存情况",x="船舱等级",y="人数")+scale_fill_manual(values=c("#56B4E9", "#E69F00"))+theme(plot.title = element_text(hjust = 0.5))
ggplot2.multiplot(plot5,plot6,cols=2)
# Female survival rate is 65%; 1st/2nd-class women 95% (161 saved, 9 lost)
# vs 3rd class 50% (72 saved, 72 lost).
# (decision-tree variable importance examined later)

#--------Part 3: data preparation--------####
#----3.1 training-set cleaning----####
#----3.1.1 empty strings in Embarked: fill with the mode 'S'
table(train$Embarked,useNA = "always")
train$Embarked[which(is.na(train$Embarked))] <- 'S'
table(train$Embarked,useNA = "always")
#----3.1.2 Fare outliers: zero fares replaced by the class-wise median
a1<-tapply(train$Fare,train$Pclass,median)
train[which(train$Fare==0&train$Pclass==1),"Fare"]<- a1[[1]]
train[which(train$Fare==0&train$Pclass==2),"Fare"]<- a1[[2]]
train[which(train$Fare==0&train$Pclass==3),"Fare"]<- a1[[3]]
#----3.1.3 missing Age: impute the median age per title (Mr./Mrs./...)
# library(stringr)
table_words <- table(unlist(strsplit(train$Name,"\\s+"))) # word counts
sort(table_words [grep('\\.',names(table_words))],decreasing = TRUE) # words containing "." are the titles
tb <- cbind(train$Age,str_match(train$Name,"[a-zA-Z]+\\.")) # "+" = one or more letters before the dot
table(tb[is.na(tb[,1]),2]) # which titles have missing ages
median.mr <- median(train$Age[grepl("Mr\\.",train$Name) & !is.na(train$Age)]) # approach 1: filter NAs by hand (grepl returns logicals, grep returns indices)
median.mrs <- median(train$Age[grepl("Mrs\\.",train$Name)],na.rm = T) # approach 2: na.rm
median.dr <- median(train$Age[grepl("Dr\\.",train$Name) & !is.na(train$Age)])
median.miss <- median(train$Age[grepl("Miss\\.",train$Name) & !is.na(train$Age)])
median.master <- median(train$Age[grepl("Master\\.",train$Name) & !is.na(train$Age)])
cbind(median.mr,median.mrs,median.dr,median.miss,median.master)
# fill with the per-title medians
train$Age[grepl("Mr\\.",train$Name) & is.na(train$Age)] <- median.mr
train$Age[grepl("Mrs\\.",train$Name) & is.na(train$Age)] <- median.mrs
train$Age[grepl("Dr\\.",train$Name) & is.na(train$Age)] <- median.dr
train$Age[grepl("Miss\\.",train$Name) & is.na(train$Age)] <- median.miss
train$Age[grepl("Master\\.",train$Name) & is.na(train$Age)] <- median.master
# visualise missingness after imputation
missmap(train,main = "Missing Map")
aggr(train,numbers = TRUE)
# Training set is now complete. Caveat: the 3rd-class age density becomes
# bimodal after imputation — many missing "Mr." ages were filled with 32.
ggplot(train,aes(x=Age,fill=Pclass))+geom_density(alpha=.3)
#----3.1.4 post-cleaning quality of the training set
# overall quality table
train_data_quality<- data_quality(train)
train_data_quality
# quality table (numeric)
numeric_train<- train[,c("Age","Fare","SibSp","Parch")]
quality_numeric_train<-quality_numeric(numeric_train)
quality_numeric_train
length(train$Fare[which(train$Fare==0)])
# quality table (categorical)
factor_train<- train[,c("Survived","Pclass","Sex","Embarked")]
quality_factor_train<- quality_factor(factor_train)
quality_factor_train
table(train$Embarked,useNA = "always")

#----3.2 test-set cleaning----####
#----3.2.1 raw quality
# overall quality table
test_data_quality<- data_quality(test)
test_data_quality
# Test set has 418 records; Age has 20.57% missing (86 rows), imputable from
# the Name field; Pclass, Sex, SibSp, Parch, Embarked are categorical.
# quality table (numeric)
numeric_test<- test[,c("Age","Fare","SibSp","Parch")]
quality_numeric_test<-quality_numeric(numeric_test)
quality_numeric_test
length(test$Fare[which(test$Fare==0)])
# Fare has 2 zero values and 1 NA; impute from class information.
# library(rcompanion)
# plotNormalHistogram(numeric_train[,1])
# plotNormalHistogram(numeric_train[,2])
# quality table (categorical)
factor_test<- test[,c("Pclass","Sex","Embarked")]
quality_factor_test<- quality_factor(factor_test)
quality_factor_test
# categorical variables are complete
#----3.2.2 type conversion (characters with <5 levels become factors)
test<- data_transform(test)
ggplot(test,aes(x=Age,fill=Pclass))+geom_density(alpha=.3)
#----3.2.3 Fare outliers (only 1st class has zero fares here)
a2<-tapply(test$Fare,test$Pclass,median)
test[which(test$Fare==0&test$Pclass==1),"Fare"]<- a2[[1]]
# The single missing Fare is a 3rd-class passenger; impute with the
# TRAINING-set 3rd-class median (a1) computed earlier.
test[is.na(test$Fare),] # inspect the missing row
test$Fare[is.na(test$Fare)]<- a1[[3]]
#----3.2.4 missing Age: per-title median imputation, as for train
# library(stringr)
table_words <- table(unlist(strsplit(test$Name,"\\s+"))) # word counts
sort(table_words [grep('\\.',names(table_words))],decreasing = TRUE) # titles
tb <- cbind(test$Age,str_match(test$Name,"[a-zA-Z]+\\."))
table(tb[is.na(tb[,1]),2])
median.mr <- median(test$Age[grepl("Mr\\.",test$Name)],na.rm = T)
median.mrs <- median(test$Age[grepl("Mrs\\.",test$Name)],na.rm = T)
median.dr <- median(test$Age[grepl("Dr\\.",test$Name)],na.rm = T)
median.miss <- median(test$Age[grepl("Miss\\.",test$Name)],na.rm = T)
median.master <- median(test$Age[grepl("Master\\.",test$Name)],na.rm = T)
cbind(median.mr,median.mrs,median.dr,median.miss,median.master)
# fill with the per-title medians
test$Age[grepl("Mr\\.",test$Name) & is.na(test$Age)] <- median.mr
test$Age[grepl("Mrs\\.",test$Name) & is.na(test$Age)] <- median.mrs
test$Age[grepl("Dr\\.",test$Name) & is.na(test$Age)] <- median.dr
test$Age[grepl("Miss\\.",test$Name) & is.na(test$Age)] <- median.miss
test$Age[grepl("Master\\.",test$Name) & is.na(test$Age)] <- median.master
# visualise missingness after imputation
missmap(test,main = "Missing Map")
aggr(test,numbers = TRUE)
# One Age still missing: the name shows title "Ms", female — treated as
# Mrs and filled with the Mrs median.
test[is.na(test$Age),]
test$Age[is.na(test$Age)]<-median.mrs
ggplot(test,aes(x=Age,fill=Pclass))+geom_density(alpha=.3)
#----3.2.5 post-cleaning quality of the test set
test_data_quality<- data_quality(test)
test_data_quality
numeric_test<- test[,c("Age","Fare","SibSp","Parch")]
quality_numeric_test<-quality_numeric(numeric_test)
quality_numeric_test
length(test$Fare[which(test$Fare==0)])
# library(rcompanion)
# plotNormalHistogram(numeric_train[,1])
# plotNormalHistogram(numeric_train[,2])
factor_test<- test[,c("Pclass","Sex","Embarked")]
quality_factor_test<- quality_factor(factor_test)
quality_factor_test
#----3.2.6 write cleaned files
# NOTE(review): write.csv without row.names = FALSE adds an index column —
# confirm downstream readers expect it.
setwd("D:\\桃子的数据\\Titani Machine Learning from Disaster\\cleand_data")
write.csv(train,file = "train_clean.csv")
write.csv(test,file = "test_clean.csv")

#----3.3 select modelling attributes----####
# Drop PassengerId, Name, Ticket, Cabin — no modelling value.
names(train)
train.all<- train[,c(-1,-4,-9,-11)]
str(train)
names(test)
test.all<- test[,c(-1,-3,-8,-10)]
str(test)
# quality tables of the final modelling data
train_data_quality<- data_quality(train.all)
train_data_quality
numeric_train<- train.all[,c("Age","Fare","SibSp","Parch")]
quality_numeric_train<-quality_numeric(numeric_train)
quality_numeric_train
factor_train<- train.all[,c("Survived","Pclass","Sex","Embarked")]
quality_factor_train<- quality_factor(factor_train)
quality_factor_train

#----3.4 sampling----####
# 70/30 split; NOTE: `train` and `test` are REBOUND here to the split of
# train.all — the Kaggle test set lives on only as `test.all`.
set.seed(102)
select<- sample(1:nrow(train.all),nrow(train.all)*0.7)
train<- train.all[select,]
test<- train.all[-select,-1]   # hold-out features (Survived removed)
test.y<-train.all[-select,1]   # hold-out labels

#--------Part 4: model building--------####
#----4.1 logistic regression----####
# glm dummy-codes factors automatically; type="response" gives P(Survived=1).
fit.logit<- glm(Survived~.,data = train,family = binomial())
summary(fit.logit)
prob<- predict(fit.logit,test,type="response")
pred.logit<- factor(prob>0.5,levels = c(FALSE,TRUE),labels = c("0","1"))
pref.logit<-table(test.y,pred.logit,dnn=c("Actual","Predicted"))
pref.logit
# Some coefficients are not significant — try stepwise selection.
logit.fit.reduced<-step(fit.logit)
summary(logit.fit.reduced)
# Reduced model: Survived ~ Pclass + Sex + Age + SibSp + Embarked
# NOTE(review): fit.logit is overwritten AFTER pred.logit/pref.logit were
# computed, so the evaluation above still reflects the full model.
fit.logit<- glm(Survived ~ Pclass + Sex + Age + SibSp + Embarked, data = train,family = binomial())
# The stepwise model was judged no better, so the original spec is kept.

#----4.2 decision tree----####
# Build with all variables, then prune by complexity parameter cp chosen
# from 10-fold cross-validation (fit.tree$cptable); type="class" at predict
# time returns labels instead of probabilities.
library(rpart)
library(rpart.plot)
fit.tree<- rpart(Survived~.,data = train,method = "class", parms = list(split="information"),control = rpart.control(xval = 10))
plotcp(fit.tree)
fit.tree$cptable # cp, rel error, xerror (CV error), xstd (CV std dev)
prune.tree<- prune(fit.tree,cp=0.015) # prune
prp(prune.tree,type = 2,extra = 104,fallen.leaves = T,main="Decision Tree") # final tree
# green if survived
cols <- ifelse(prune.tree$frame$yval == 1, "darkred", "green4")
prp(prune.tree, main="Decision Tree", extra=106, # display prob of survival and percent of obs
 nn=TRUE, # display the node numbers
 fallen.leaves=TRUE, # put the leaves on the bottom of the page
 shadow.col="gray", # shadows under the leaves
 branch.lty=3, # draw branches using dotted lines
 branch=.5, # change angle of branch lines
 faclen=0, # faclen=0 to print full factor names
 trace=1, # print the automatically calculated cex
 split.cex=1.2, # make the split text larger than the node text
 split.prefix="is ", # put "is " before split text
 split.suffix="?", # put "?" after split text
 col=cols, border.col=cols, # green if survived
 split.box.col="lightgray", # lightgray split boxes (default is white)
 split.border.col="darkgray", # darkgray border on split boxes
 split.round=.5) # round the split box corners a tad
rpart.plot(prune.tree,branch=1, extra=106, under=TRUE, faclen=0, cex=0.8, main="决策树")
pred.tree<- predict(prune.tree,test,type="class") # validate
pref.tree<-table(test.y,pred.tree,dnn=c("Actual","Predicted"))
pref.tree

#----4.3 random forest----####
# Defaults: 500 trees, sqrt(M) variables per split; importance via
# importance(fit, type=2); na.roughfix imputes numeric NAs with the column
# median and categorical NAs with the mode. (party::cforest builds forests
# from conditional-inference trees instead.)
library(randomForest)
fit.ranf<- randomForest(Survived~.,data = train,na.action = na.roughfix,importance=T)
fit.ranf
importance(fit.ranf,type=2)
pred.ranf<- predict(fit.ranf,test) # validate
pref.ranf<-table(test.y,pred.ranf,dnn=c("Actual","Predicted"))
pref.ranf

#----4.4 SVM----####
# svm() standardises predictors by default (high-variance predictors would
# otherwise dominate); unlike random forest, SVM prediction does not accept
# missing values, hence na.omit.
library(e1071)
fit.svm<- svm(Survived~.,data = train)
fit.svm
pred.svm<- predict(fit.svm,na.omit(test)) # validate
pref.svm<-table(na.omit(test.y),pred.svm,dnn=c("Actual","Predicted"))
pref.svm
# Parameter tuning: svm uses an RBF kernel by default; larger gamma means
# more support vectors, larger cost risks overfitting. tune.svm searches a
# grid of candidates:
# tuned<- tune.svm(Survived~.,data = train,gamma = 10^(-6:1),cost = 10^(-10:10))
# tuned
# # plug gamma=0.01, cost=1 back into the model
# fit.svm<- svm(class~.,data = train,gamma=0.01,cost=1)

# ---4.5 boosting ensemble-----#####
library(adabag)
# single-training-set boosting model
ada<- boosting(Survived~.,data = train)
pre<-predict(ada,test)
# pre$class = predicted labels; pre$confusion = confusion matrix
pref.ada<-table(test.y,pre$class,dnn=c("Actual","Predicted"))
pref.ada

#--------Part 5: model evaluation and selection--------####
# Accuracy of the 5 models from their confusion matrices:
per.logit<- performance(pref.logit)
per.tree<- performance(pref.tree)
per.ranf<- performance(pref.ranf)
per.svm<-performance(pref.svm)
per.ada<- performance(pref.ada)
evaluating<- as.data.frame(cbind(per.logit,per.tree,per.ranf,per.svm,per.ada))
names(evaluating)<- c("Logit","tree","RandomForst","SVM","Adaboost")
evaluating
# SVM reaches 84% accuracy and is used for the final prediction.

#--------Part 6: survival prediction--------####
# Conclusion: the decision tree scores higher on the hold-out:
# sensitivity=0.96, specificity=0.95, PPV=0.91, NPV=0.98,
# accuracy=0.952, F1=0.94
# SVM (performance(pref.svm)):
# sensitivity=0.69, specificity=0.93, PPV=0.85, NPV=0.83,
# accuracy=0.84, F1=0.76
head(test.all)
prediction.svm<- predict(fit.svm,na.omit(test.all)) # predict on the Kaggle test set
# write out the results
prediction<- as.data.frame(prediction.svm)
names(prediction)<- c("Survival")
write.csv(prediction,file = "prediction2.csv")
# prediction_n<-read.csv("prediction.csv",header = T,sep = ",")
# d<-cbind(prediction_n,prediction)
# d[!d$Survived==d$Survival,]
-
聚类分析(银行客户画像)
m0_60892665: 小白一枚,等你的数据来学习,谢谢。
-
python爬取歌词并生成词云图
larrino: from scipy.misc import imread出现错误,原因是scipy库中的imread要被弃用也就是不能用了,解决办法改成`from imageio import imread`
-
Python爬虫歌词及词频统计--(谢春花)
预备役码农: 为什么我爬出来的都是none呢
1.本站遵循行业规范,任何转载的稿件都会明确标注作者和来源;2.本站的原创文章,请转载时务必注明文章作者和来源,不尊重原创的行为我们将追究责任;3.作者投稿可能会经我们编辑修改或补充。
在线投稿:投稿 站长QQ:1888636
后台-插件-广告管理-内容页尾部广告(手机) |