## Feature reduction with information gain (see Menzies et al. 2007),
## followed by topology selection for a multilayer perceptron and a
## comparison of the fitted classifiers.
##
## Inputs : parametrized samples for c = 1..4, read from the directories
##          data/c1/, data/c2/, data/c3/ and data/c4/ (one CSV per sample,
##          each with a two-valued column named FailureNumber).
## Outputs: file.csv  - one row of 18 values per sample (appended),
##          file1.csv - the selected (L, M, H) topology per sample (appended),
##          data/c<i>/InformationGain.csv - the attributes kept per sample.
##
## The function definitions come first so that they exist when the driver
## loop at the bottom of the file runs in a fresh R session.

library(FSelector)
library(RWeka)

######################################
## Feature selection

## Select features in the original sample.
##
## Ranks every attribute by information gain with respect to FailureNumber,
## drops the attributes whose gain is zero, logs the selection to
## data/c<classIndex>/InformationGain.csv and returns the reduced data frame
## with FailureNumber appended as the last column.
##
## OriginalSet: data frame containing a FailureNumber column.
## j          : index of the sample; used to look up its name in fileNames.
## fileNames  : sample file names (defaults to the global HInner set by the
##              driver loop, which is what the original code read implicitly).
## classIndex : index of the data/c<i> directory (defaults to the global i).
##
## Errors raised here (e.g. by cutoff.k when no attribute has positive gain)
## are expected to be caught by the caller.
InformationGain <- function(OriginalSet, j, fileNames = HInner,
                            classIndex = i) {
  weights <- information.gain(FailureNumber ~ ., OriginalSet)
  print(weights)

  ## Position of the smallest importance in the descending ranking; when
  ## that smallest value is 0, everything before it has positive gain.
  ranked <- sort(weights$attr_importance, decreasing = TRUE)
  firstMin <- which.min(ranked)
  if (ranked[firstMin] == 0) {
    numberOfAttributes <- firstMin - 1
  } else {
    ## weights has one row per attribute and a single column, so the number
    ## of attributes is nrow(weights); length(weights) would always be 1.
    numberOfAttributes <- nrow(weights)
  }

  selected <- cutoff.k(weights, numberOfAttributes)
  ReducedSet <- OriginalSet[, selected, drop = FALSE]

  ## Log: sample name, total attributes, attributes kept, kept names.
  columns <- sort(colnames(ReducedSet), decreasing = TRUE)
  logRow <- c(fileNames[j], nrow(weights), numberOfAttributes, columns)
  print(logRow)
  filePath <- paste0("data/c", classIndex, "/InformationGain.csv")
  write.table(t(logRow), file = filePath, row.names = FALSE,
              col.names = FALSE, append = TRUE, sep = ",")

  cbind(ReducedSet, FailureNumber = OriginalSet$FailureNumber)
}

######################################
## Topology Selection

## Grid-search the multilayer perceptron parameters on Set.
##
## Tries every combination of learning rate l and momentum m in
## 0.1, 0.2, ..., 1 and hidden-layer setting h in 1..10, and keeps the
## combination with the smallest misclassification rate. The chosen triple
## (or three NAs when FailureNumber does not take exactly two values) is
## appended to file1.csv and returned as a vector of length 3.
TopologySelectionMP <- function(Set) {
  filePath <- "file1.csv"
  TopFin <- matrix(nrow = 1, ncol = 3)
  if (length(unique(Set$FailureNumber)) == 2) {
    MLP <- function(p) {
      MultilayerPerceptronTopologySelection(Set, p[1], p[2], p[3])[1]
    }
    grid <- expand.grid(l = seq(0.1, 1, 0.1), m = seq(0.1, 1, 0.1),
                        h = seq(1, 10, 1))
    Model <- apply(grid, 1, MLP)
    MRMF <- which.min(Model)
    TopFin[1, ] <- as.matrix(grid[MRMF, ])
  } else {
    TopFin[1, ] <- as.matrix(c(NA, NA, NA))
  }
  write.table(t(TopFin[1, ]), file = filePath, row.names = FALSE,
              col.names = FALSE, append = TRUE, sep = ",")
  return(TopFin[1, ])
}

## Fit a Weka MultilayerPerceptron on Set and report its quality of fit.
##
## l, m, h are passed to Weka as -L (learning rate), -M (momentum) and -H
## (hidden layers). Per the Weka documentation -H lists hidden-layer sizes,
## so a single number presumably means one hidden layer with h nodes -
## TODO confirm. Fixed settings: S = 1, R = FALSE, N = 500, V = 30
## (percentage used as validation set), D = TRUE. Data is normalized by
## default.
##
## Returns c(MR, MAS): MR is the percentage of incorrectly classified
## instances, MAS the mean absolute error, both taken from summary() of the
## fitted model.
MultilayerPerceptronTopologySelection <- function(Set, l, m, h) {
  NN <- make_Weka_classifier("weka/classifiers/functions/MultilayerPerceptron")
  ## WOW(NN)
  ## http://weka.sourceforge.net/doc/weka/classifiers/functions/MultilayerPerceptron
  ResultNN <- NN(FailureNumber ~ ., data = Set,
                 control = Weka_control(S = 1, R = FALSE, L = l, M = m,
                                        N = 500, V = 30, H = h, D = TRUE))
  Summary <- summary(ResultNN)
  MR <- Summary$details[2]
  MAS <- Summary$details[5]
  c(MR, MAS)
}

######################################
## Shared helper

## Extract c(MR, tpr, fpr) from the summary() of a fitted Weka classifier.
## MR is details[2] (percentage of incorrectly classified instances); tpr is
## the share of confusion-matrix row 1 that falls in column 1; fpr is the
## share of row 2 that falls in column 1.
## NOTE(review): the values are returned in the order MR, tpr, fpr, while the
## header written to file.csv labels the columns MR, fpr, tpr - confirm how
## comparison_1 orders them.
.fitRates <- function(Summary) {
  cm <- Summary$confusionMatrix
  MR <- Summary$details[2]
  tpr <- cm[1, 1] / (cm[1, 1] + cm[1, 2])
  fpr <- cm[2, 1] / (cm[2, 1] + cm[2, 2])
  c(MR, tpr, fpr)
}

######################################
## RBF

## Fit a Weka RBFNetwork (B = 2) on FitSet and return c(MR, tpr, fpr) of the
## fit. Loads the RBFNetwork Weka package first.
RadialBasisFunctionNetwork <- function(FitSet) {
  WPM("load-package", "RBFNetwork")
  RBFN <- make_Weka_classifier("weka/classifiers/functions/RBFNetwork")
  ## WOW(RBFN)
  ## http://weka.sourceforge.net/doc/weka/classifiers/functions/RBFNetwork.html
  ResultRBFN <- RBFN(FailureNumber ~ ., data = FitSet,
                     control = Weka_control(B = 2))
  .fitRates(summary(ResultRBFN))
}

######################################
## L

## Classify with linear regression. Because the class is not numeric,
## ClassificationViaRegression is used with LinearRegression as its base
## learner. Returns c(MR, tpr, fpr) of the fit.
LinearRegression <- function(FitSet) {
  LR <- make_Weka_classifier("weka/classifiers/meta/ClassificationViaRegression")
  ## WOW(LR)
  ## http://weka.sourceforge.net/doc/weka/classifiers/functions/LinearRegression
  ResultLR <- LR(FailureNumber ~ ., data = FitSet,
                 control = Weka_control(
                   D = TRUE,
                   W = "weka.classifiers.functions.LinearRegression",
                   S = 0))
  .fitRates(summary(ResultLR))
}

######################################
## MP

## Fit a Weka MultilayerPerceptron on FitSet with learning rate l, momentum m
## and hidden-layer setting h (same fixed settings as
## MultilayerPerceptronTopologySelection) and return c(MR, tpr, fpr) of the
## fit. Data is normalized by default (I = FALSE).
## Pay attention: sometimes the number of rows of the test set is too small
## to replicate with 10 folds.
MultilayerPerceptronNN <- function(FitSet, l, m, h) {
  NN <- make_Weka_classifier("weka/classifiers/functions/MultilayerPerceptron")
  ## WOW(NN)
  ## http://weka.sourceforge.net/doc/weka/classifiers/functions/MultilayerPerceptron
  ResultNN <- NN(FailureNumber ~ ., data = FitSet,
                 control = Weka_control(S = 1, R = FALSE, L = l, M = m,
                                        N = 500, V = 30, H = h, D = TRUE))
  .fitRates(summary(ResultNN))
}

######################################
## Driver

filePathFin_1 <- "file.csv"

## Append x to the results file as comma-separated values without headers.
appendToResults <- function(x) {
  write.table(x, file = filePathFin_1, row.names = FALSE, col.names = FALSE,
              append = TRUE, sep = ",")
}

## Two header rows: the classifier family (MP, RBF, L) above its six columns.
headerFamilies <- c("MP", "", "", "", "", "", "RBF", "", "", "", "", "",
                    "L", "", "", "", "", "")
appendToResults(t(headerFamilies))
headerMeasures <- c("MR", "fpr", "tpr", "MR_Validation", "fpr_Validating",
                    "tpr_Validating", "MR", "fpr", "tpr", "MR_Validating",
                    "fpr_Validating", "tpr_Validating", "MR", "fpr", "tpr",
                    "MR_Validating", "fpr_Validating", "tpr_Validating")
appendToResults(t(headerMeasures))

for (i in 1:4) {
  appendToResults(paste0("c", i))
  directoryInner <- paste0("data/c", i, "/")
  DInner <- dir(directoryInner, full.names = TRUE)
  HInner <- dir(directoryInner, full.names = FALSE)
  ## At most 25 samples per directory; never index past the files present.
  nSamples <- min(25, length(DInner))

  for (k in 2:5) {
    appendToResults(paste0("k", k))
    Top <- matrix(nrow = 25, ncol = 18)

    for (j in seq_len(nSamples)) {
      OriginalSet <- read.csv(DInner[j], header = TRUE, sep = ",")
      ## Stays all-NA unless the sample is usable and selection succeeds.
      resultRow <- rep(NA, 18)

      if (length(unique(OriginalSet$FailureNumber)) == 2) {
        ReducedSet <- try(InformationGain(OriginalSet, j, HInner, i))
        if (!inherits(ReducedSet, "try-error")) {
          ## Split ReducedSet into FitSet and ValidationSet:
          ## a random n/k of the rows go to the validation set.
          n <- nrow(ReducedSet)
          rowIDx <- sample(n, size = n / k)
          ValidationSet <- ReducedSet[rowIDx, ]
          ## The remaining rows, in shuffled order, form the fit set.
          rowTotal <- sample(n, size = n)
          tmpIndex <- rowTotal[!rowTotal %in% rowIDx]
          FitSet <- ReducedSet[tmpIndex, ]

          ## Recall best weights.
          BestValues <- TopologySelectionMP(FitSet)
          ## Produce a row of 18 values for MR, fpr, and tpr respectively.
          ## NOTE(review): comparison_1 is defined outside this file; its
          ## last argument is kept as j - 5 exactly as before - confirm
          ## what that index means to it.
          resultRow <- comparison_1(FitSet, ValidationSet, BestValues, j - 5)
        }
      }

      ## Store under the sample's own row: indexing with j - 5 produced
      ## negative or zero row indices for the first five samples.
      Top[j, ] <- resultRow
      appendToResults(t(Top[j, ]))
    }
  }
}