## Define Training set and Test set before using this script.
## Load the package RWeka before using this script.
## Due to the lack of weka software on windows 7, I had to change computer
## and the files' path has changed.
## NOTE(review): this loop calls MultilayerPerceptronTopologySelection(),
## which is defined further down in this file -- source the whole file (or
## move the function definition above this point) before running.
library(RWeka)

DefectiveSamples <- read.csv("C:\\Documents and Settings\\brusso\\My Documents\\Pedrycz\\Result\\DescriptiveAnalysisDefectiveST.csv", sep = ";")
ApplicationNames <- DefectiveSamples$Application
a <- length(ApplicationNames)
MR <- c()

for (i in 1:4) {
  directory <- paste("C:\\Documents and Settings\\brusso\\My Documents\\Pedrycz\\data\\c", i, "\\", sep = "")
  H <- dir(directory, full.names = FALSE)
  D <- dir(directory, full.names = TRUE)
  s <- length(D)

  ## Vary the topology of the Multilayer Perceptron.
  min <- c()
  Lfin <- c()
  Mfin <- c()
  h <- c()

  for (j in 1:(s - 4)) {
    L <- c()
    M <- c()
    minVector <- c()
    ## The fit set depends only on j, so read it once per j instead of
    ## once per (l, m) combination as the original did.
    FitSet <- read.csv(D[j + 4], header = TRUE, sep = ",")
    ## BUG FIX: the original used `h` both as this loop's variable and as
    ## the per-j results vector above, so each pass through the inner loop
    ## clobbered all previously stored h[j] values.  The loop variable is
    ## now `hh`.
    for (hh in 1:10) {
      MRMatrix <- matrix(ncol = 10, nrow = 10)
      for (l in 1:10) {
        lrat <- 1 / (l + 1)
        for (m in 1:10) {
          mrat <- 1 / (m + 1)
          ## MRMatrix is the matrix of incorrectly classified instances
          ## percentages.
          MRMatrix[l, m] <- MultilayerPerceptronTopologySelection(FitSet, lrat, mrat, hh)[1]
        }
      }
      ## Values of l and m at the minimum of MR (= incorrectly classified
      ## instances percentage); compute the index matrix once and reuse it
      ## instead of calling which() twice.
      minIdx <- which(MRMatrix == min(MRMatrix), arr.ind = TRUE)
      L[hh] <- minIdx[1, 1]
      M[hh] <- minIdx[1, 2]
      minVector[hh] <- min(MRMatrix)
    }
    ## temp_h is the value of h giving the minimum incorrectly classified
    ## instance percentage in minVector.
    min[j] <- min(minVector)
    temp_h <- which.min(minVector)
    h[j] <- temp_h
    Lfin[j] <- L[temp_h]
    Mfin[j] <- M[temp_h]
  }

  lfin <- 1 / (Lfin + 1)
  mfin <- 1 / (Mfin + 1)
  ## BUG FIX: the second script loop reads these columns back as
  ## BestValues$lrat / BestValues$mrat / BestValues$h, but the original
  ## cbind() named them "lfin" and "mfin" (so the reads returned NULL).
  ## Name the columns to match the reader.
  NNTop <- cbind(min, h, lrat = lfin, mrat = mfin)
  filePath <- paste("C:\\Documents and Settings\\brusso\\My Documents\\Pedrycz\\Result\\NNTopologies.csv")
  write.table(NNTop, file = filePath, row.names = FALSE, col.names = colnames(NNTop), append = TRUE, sep = ",")
  ## Separator row ("c1", "c2", ...) marking which c-directory the
  ## following rows belong to.
  separator <- paste("c", i, sep = "")
  write.table(separator, file = filePath, row.names = FALSE, col.names = FALSE, append = TRUE, sep = ",")
}
## D is boolean (TRUE/FALSE)H is
## the number of hidden layers, L learning rate, V percentage of
## validation set; M momentum.
## MR is the percentage of Incorrectly Classified Instances.
## MAS is the mean absolute value.
## summary gives the confusion matrix.

## Fit one MultilayerPerceptron on FitSet and return c(MR, MAS) for the
## given learning rate (lrat), momentum (mrat) and hidden-layer spec (h).
MultilayerPerceptronTopologySelection <- function(FitSet, lrat, mrat, h) {
  NN <- make_Weka_classifier("weka/classifiers/functions/MultilayerPerceptron")
  ## WOW(NN) http://weka.sourceforge.net/doc/weka/classifiers/functions/MultilayerPerceptron
  ## Data is normalized by default.
  ResultNN <- NN(FailureNumber ~ ., data = FitSet, control = Weka_control(S = 1, L = lrat, M = mrat, N = 500, V = 40, H = h, D = TRUE))
  ## summary describes the quality of fit of the NN.
  Summary <- summary(ResultNN)
  MR <- Summary$details[2]
  MAS <- Summary$details[5]
  c(MR, MAS)
}

## 1/k is the percentage of the Fit set - Test set split.
for (i in 1:4) {
  for (k in 2:5) {
    directoryInner <- paste("C:\\Documents and Settings\\brusso\\My Documents\\Pedrycz\\data\\c", i, "\\", k, "\\", sep = "")
    DInner <- dir(directoryInner, full.names = TRUE)
    HInner <- dir(directoryInner, full.names = FALSE)
    filePath <- paste("C:\\Documents and Settings\\brusso\\My Documents\\Pedrycz\\Result\\NNTopologies.csv")
    a <- length(DInner)
    for (j in 1:25) {
      FitSet <- read.csv(DInner[j + 25], header = TRUE, sep = ",")
      TestSet <- read.csv(DInner[j], header = TRUE, sep = ",")
      BestValues <- read.csv(filePath, header = TRUE, sep = ",")
      ## To scroll down the file for different values of c.
      ## NOTE(review): the offset j + a*(i-1) + i-1 is meant to skip the
      ## "cX" separator rows appended between the per-directory result
      ## tables -- verify it against the actual layout of NNTopologies.csv.
      lrat <- BestValues$lrat[j + a * (i - 1) + i - 1]
      mrat <- BestValues$mrat[j + a * (i - 1) + i - 1]
      h <- BestValues$h[j + a * (i - 1) + i - 1]
      MultilayerPerceptronNN(FitSet, TestSet, lrat, mrat, h)
      RadialBasisFunctionNetwork(FitSet, TestSet)
      LinearRegression(FitSet, TestSet)
    }
  }
}

## Fit an RBF network on FitSet and report the misclassification rate on
## both the training (fit) data and TestSet.
RadialBasisFunctionNetwork <- function(FitSet, TestSet) {
  RBFN <- make_Weka_classifier("weka/classifiers/functions/RBFNetwork")
  ## WOW(RBFN) http://weka.sourceforge.net/doc/weka/classifiers/functions/RBFNetwork.html
  ## BUG FIX: the original passed data = TrainingSet, but this function's
  ## parameter is FitSet; TrainingSet would have been looked up in the
  ## global environment instead of using the argument.
  ResultRBFN <- RBFN(FailureNumber ~ ., data = FitSet, control = Weka_control(B = 2))
  ## summary describes the quality of fit of the NN.
  Summary <- summary(ResultRBFN)
MR<-Summary$details[2] ##I introduced the min between nrow(TestSet) and 10 to run the evaluation nf<-min(10,nrow(TestSet)) ##The cost matrix tells whether is more dangerous to have FP or FN. ##In particular, we decided to have balanceed miscalssification rate and so the cost matrix is antidiagonal. ##when testing we determine the prediction. ResultList will tell what will happen in percentsges to a new future dataset ## numFolds is the option of Weka for which a number of folds n is specified and hte dataset is randomly reordered and then slpit into ## n folds of equal size. In each iteration one fold is used for testing and the other n-1 is used for trianing the classifier. The test results are avaraged ## over all folds to give the accuracy. TestRBFN<-evaluate_Weka_classifier(ResultRBFN, newdata= TestSet, normalize= TRUE, cost=matrix(c(0,1,1,0), ncol=2),complexity=TRUE,class=TRUE, numFolds=nf) DetailsVector<-TestRBFN$details ##Misclassification rate as in Khoshgoftaar TestMR<-DetailsVector[2] Details<-c(MR,TestMR) names(Details)<-c("MR training", "MR testing") return(Details) } ##to classify with linear regression non numeric instances we use ClassificationViaRegression with linearRegression LinearRegression<-function(TrainingSet,TestSet,v,h){ LR<-make_Weka_classifier("weka/classifiers/meta/ClassificationViaRegression") ##WOW(LR), http://weka.sourceforge.net/doc/weka/classifiers/functions/LinearRegression ResultLR<-LR(FailureNumber ~ .,data=TrainingSet, control=Weka_control(D=TRUE, W="weka.classifiers.functions.LinearRegression", S=0 )) ##summary describe the quality of fit of the LR summary(ResultLR) ## the cost matrix tells whether is more dangerous to have FP or FN. ##In particular, it has to have 0 on the diagonal. 
##Balanced misclassification rates means equal values on the anti-diagonal ##the matrix is created with a vector of values: the first two describe the first column ##I introduced the min between nrow(TestSet) and 10 to run the evaluation nf<-min(10,nrow(TestSet)) ##when testing we determine the prediction. ResultList will tell what will happen in percentsges to a new future dataset ## numFolds is the option of Weka for which a number of folds n is specified and hte dataset is randomly reordered and then slpit into ## n folds of equal size. In each iteration one fold is used for testing and the other n-1 is used for trianing the classifier. The test results are avaraged ## over all folds to give the accuracy. TestLR<-evaluate_Weka_classifier(ResultLR, newdata= TestSet, normalize= TRUE, cost=matrix(c(0,1,1,0), ncol=2),complexity=TRUE,class=TRUE, numFolds=nf) DetailsVector<-TestRBFN$details ##Misclassification rate as in Khoshgoftaar TestMR<-DetailsVector[2] Details<-c(MR,TestMR) names(Details)<-c("MR training", "MR testing") return(Details) } ## d is boolean (TRUE/FALSE)H is the number of hidden layers, L learning rate, V percentage of validation set; M momentum ## pay attention sometimes the nrow of TestSet is too small to replicate with 10 folds. MultilayerPerceptronNN<-function(TrainingSet,TestSet,l,m,h){ NN<-make_Weka_classifier("weka/classifiers/functions/MultilayerPerceptron") ##WOW(NN)http://weka.sourceforge.net/doc/weka/classifiers/functions/MultilayerPerceptron ## data is normalized by default = I=FALSE ResultNN<-NN(FailureNumber ~ .,data=TrainingSet, control=Weka_control(S= 1, E= 20,L=l,M=m,N=500,V=30,H=h, D=TRUE )) summary(ResultNN) ##summary describe the quality of fit of the NN ## the cost matrix tells whether is more dangerous to have FP or FN. ##In particular, it has to have 0 on the diagonal. 
##Balanced misclassification rates means equal values on the anti-diagonal ##the matrix is created with a vector of values: the first two describe the first column ##I introduced the min between nrow(TestSet) and 10 to run the evaluation nf<-min(10,nrow(TestSet)) ##when testing we determine the prediction. ResultList will tell what will happen in percentsges to a new future dataset ## numFolds is the option of Weka for which a number of folds n is specified and hte dataset is randomly reordered and then slpit into ## n folds of equal size. In each iteration one fold is used for testing and the other n-1 is used for trianing the classifier. The test results are avaraged ## over all folds to give the accuracy. TestNN<-evaluate_Weka_classifier(ResultNN, newdata= TestSet, normalize= TRUE, cost=matrix(c(0,1,1,0), ncol=2),complexity=TRUE,class=TRUE, numFolds=nf) DetailsVector<-TestRBFN$details ##Misclassification rate as in Khoshgoftaar TestMR<-DetailsVector[2] Details<-c(MR,TestMR) names(Details)<-c("MR training", "MR testing") return(Details) }