corrections and function for obtaining distances

paobranco · Jul 13, 2017 · 2822270 · 2822270
1 parent 2ad133b
commit 2822270
Show file tree

Hide file tree

Showing 15 changed files with 397 additions and 58 deletions.
diff --git a/CHANGES b/CHANGES
@@ -1,8 +1,15 @@
-VERSION 0.0.6 (2017-01-03)
+VERSION 0.0.6 (2017-06-20)
+	- added adasyn algorithm for classification
 	- added method for interpolating utility/benefit/cost surfaces
 	- added utility-based metrics for classification and regression tasks (utility, cost and benefit)
-	- added a function for utility-based learning for classification (from Elkan 2001)
-	- added a function for utility-based learning for regression 
+	- added a function for utility-based learning optimization for classification (from Elkan 2001)
+	- added a function for utility-based learning optimization for regression
+	- added a function that returns the distances between all pairs of 
+          example computed according to a selected distance metric
+	- bugs correction in SmoteRegress function (caused by constant features)
+	- bugs correction on RandOverClassif (caused by classes with only one example)
+	- bugs correction on neighbours function (Fortran code)
+	- routines registration added
 
 VERSION 0.0.5 (2016-07-13)
 	- added two synthetic data sets, for classification and regression

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -5,7 +5,7 @@ Description: Provides a set of functions that can be used to obtain better predi
 Version: 0.0.6
 Depends: R(>= 3.0), methods, grDevices, graphics, stats, MBA, gstat, automap, sp, randomForest
 Suggests: MASS, rpart, testthat, DMwR, ggplot2, e1071
-Date: 2017-01-03
+Date: 2017-07-13
 Authors@R: c(person("Paula", "Branco", email = "[email protected]", role = c("aut", "cre")), person("Rita", "Ribeiro", email = "[email protected]", role = c("aut", "ctb")), person("Luis", "Torgo", email = "[email protected]", role = c("aut", "ctb")))
 URL: https://github.com/paobranco/UBL
 BugReports: https://github.com/paobranco/UBL/issues

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,7 +1,7 @@
 
 ## ------------------------------------------------------------------------------------------------
 
-useDynLib(UBL)
+useDynLib(UBL, .registration=TRUE)
 
 ## ------------------------------------------------------------------------------------------------
 
@@ -46,9 +46,9 @@ export(
 #	phi.extremes,
 #	phi.range,
 	phi,
-#	tPhi,
-#	BL,
-#	UtilTRPhiBL,
+	#tPhi,
+	#BL,
+	#UtilNewRegress,
 	##surface interpolation methods
 	UtilInterpol,
 	## utility-based evaluation metrics for classification and regression
@@ -57,7 +57,11 @@ export(
 	## utility-based optimal predictions
 	UtilOptimClassif,
 	UtilOptimRegress,
+	## utility-based learning
+	#MetacostClassif,
+	#MetacostRegress,
 	## neighbours function
-	neighbours
+	neighbours,
+	distances
 )
 
diff --git a/R/Distances.R b/R/Distances.R
@@ -0,0 +1,115 @@
+# ===================================================
+# This function prepares the data and calls fortran subroutine
+# that computes the distances between examples
+# P. Branco, June 2017
+# ---------------------------------------------------
+
+distances <- function(tgt, dat, dist, p=2)
+  #  INPUTS:
+  #   tgt is the column where the target variable is
+  #   dat is the original data set
+  #   dist is the distance measure to be used
+  #   p is a parameter used when a p-norm is computed
+  #   OUTPUTS:
+  #   a matrix with the distances between each example
+{
+  # check if p has an admissible value(>=1) and change to distance integer code
+  # p-norm : p provided
+  # Manhattan : p=1
+  # Euclidean : p=2
+  # Chebyshev : p=0
+  # Canberra : p=-1
+  # HEOM : p=-2
+  # HVDM : p=-3
+  # DVDM : p=-4
+  # IVDM : p=-5
+  # WVDM : p=-6
+  # MVDM : p=-7
+
+  # just set a dummy value for the number of neighbours to evaluate, which is not
+  # used in this function but is necessary for the fortran function.  
+
+  k <- 1
+
+  if(p<1) stop("The parameter p must be >=1!")
+  # p >=-1 only numeric attributes handled
+  # p =-2 only nominal attributes handled
+  # p<= -3 nominal and numeric attributes handled
+  p<- switch(dist,
+             "Chebyshev"=0,
+             "Manhattan"=1,
+             "Euclidean"=2,
+             "Canberra"=-1,
+             "Overlap"=-2,
+             "HEOM"=-3,
+             "HVDM"=-4,
+             # to be implemented
+             #             "DVDM"=-5,
+             #             "IVDM"=-6,
+             #             "WVDM"=-7,
+             #             "MVDM"=-8,
+             "p-norm"=p,
+             stop("Distance measure not available!"))
+
+
+  if (class(dat[,tgt]) == "numeric" & p <= -4) stop("distance measure selected only available for classification tasks")
+
+  nomatr <- c() 
+  for (col in seq.int(dim(dat)[2])) {
+    if (class(dat[,col]) %in% c('factor','character')) {
+      nomatr <- c(nomatr, col)
+    }
+  }
+
+  nomatr <- setdiff(nomatr, tgt)
+  numatr <- setdiff(seq.int(dim(dat)[2]), c(nomatr,tgt))
+
+  nomData <- t(sapply(subset(dat, select = nomatr), as.integer))
+  numData <- t(subset(dat, select = numatr))
+
+  # check if the measure can be applied to the data set features
+
+  if (length(numatr) & p == -2) {
+    stop("Can not compute Overlap metric with numeric attributes!")
+  }
+  if (length(nomatr) & p >= -1) {
+    stop("Can not compute ", dist ," distance with nominal attributes!")
+  }
+
+  tgtData <- dat[, tgt]
+  n <- length(tgtData)
+  res <- matrix(0.0, nrow = k, ncol = n)
+  if (class(tgtData) != "numeric") {tgtData <- as.integer(tgtData)}
+
+  Cl <- length(unique(tgtData))
+  nnom <- length(nomatr)
+  nnum <- length(numatr)
+
+  distm <- matrix(0.0, nrow = n, ncol = n)
+  numD <- matrix(0.0, nrow = nnum, ncol = n)
+
+
+  storage.mode(numData) <- "double"
+  storage.mode(nomData) <- "integer"
+  storage.mode(res) <- "integer"
+  storage.mode(tgtData) <- "double"
+  storage.mode(distm) <- "double"
+  storage.mode(numD) <- "double"
+
+  neig <- .Fortran("F_neighbours", 
+                   tgtData = tgtData,  # tgt data
+                   numData = numData, #numeric data
+                   nomData = nomData, #nominal data
+                   p = as.integer(p), # code for distance metric
+                   k = as.integer(k), # nr of neighbours
+                   n = as.integer(n), # nr of examples in the data
+                   nnum = as.integer(nnum), # nr of numeric attributes
+                   nnom = as.integer(nnom), # nr of nominal attributes
+                   Cl = as.integer(Cl), # number of different classes in the target variable
+                   distm = distm,
+                   numD = numD,
+                   res = res) # output
+  neig <- neig$distm
+
+  neig
+}
diff --git a/R/Neighbours.R b/R/Neighbours.R
@@ -93,7 +93,7 @@ nomatr <- c()
   storage.mode(distm) <- "double"
   storage.mode(numD) <- "double"
 
-  neig <- .Fortran("neighbours", 
+  neig <- .Fortran("F_neighbours", 
                    tgtData = tgtData,  # tgt data
                    numData = numData, #numeric data
                    nomData = nomData, #nominal data

diff --git a/R/UtilOptimRegress.R b/R/UtilOptimRegress.R
@@ -67,15 +67,17 @@ UtilOptimRegress <- function(form, train, test,
   #                 has the y value, the second column the \hat{y} value and the
   #                 third column has the corresponding utility value. The domain
   #                 boundaries of (y, \hat{y}) must be provided.
-  #   minds         the lower bound of the target variable considered for interpolation
-  #   maxds         the upper bound of the target variable considered for interpolation
-  #   eps           a value for the precision considered during the interpolation. 
+  #   minds         the lower bound of the target variable considered
+  #   maxds         the upper bound of the target variable considered
+  #   eps           a value for the precision considered during the pdf. 
   #
   #   output:
   #   the predictions for the test data optimized using the surface provided
 
   type <- match.arg(type, c("utility", "cost", "benefit"))
   strat <- match.arg(strat, c("interpol", "automatic")) # only interpol implemented for now
+  if (strat != "interpol") stop("UBL::Only interpolation is available for now as strat parameter",
+                                call. = FALSE)
   tgt <- which(names(train) == as.character(form[[2]]))
 
   if (is.null(minds)){
@@ -94,10 +96,17 @@ UtilOptimRegress <- function(form, train, test,
            No further arguments are necessary.", call. = FALSE)
     }
     method <- match.arg(strat.parms[[1]], c("bilinear", "splines", "idw", "krige"))
+    # UtilRes is a lxl matrix with the true utility values on the rows and the
+    # predictions on the columns, i.e., resUtil[a,b] provides the utility of
+    # predicting b for a true value a.
     UtilRes <- UtilInterpol(NULL, NULL, type, control.parms, 
                           minds, maxds, m.pts, 
                           method = method, visual = FALSE, eps = eps,
                           full.output = TRUE)
+  } else if(strat == "auto"){
+    # baseseq <- seq(minds-0.01, maxds+0.01, by=eps)
+    # if(baseseq[length(baseseq)]!=maxds) baseseq <- c(baseseq, maxds)
+
   }
 
   resPDF <- getPDFinRange(y.true, test, train, form)
@@ -117,7 +126,7 @@ UtilOptimRegress <- function(form, train, test,
     } else {
       optim[ex] <- y.true[which.min(areas)]
     }
-    print(ex)
+#    print(ex)
   }
 
   #obtain the utility values for the points (test, optim) determined 

diff --git a/R/randOverClassif.R b/R/randOverClassif.R
@@ -56,9 +56,13 @@ RandOverClassif <- function(form, dat, C.perc = "balance", repl = TRUE)
 
     for (i in 1:length(names.ove)) { # over-sampling each class provided
       Exs <- which(dat[, tgt] == names.ove[i])
+      if(length(Exs)==1){
+        sel <- rep(Exs, as.integer((C.perc[[names.ove[i]]] - 1) * length(Exs)))
+      } else {
       sel <- sample(Exs,
                     as.integer((C.perc[[names.ove[i]]] - 1) * length(Exs)),
                     replace = repl)
+      }
       newdata <- rbind(newdata, dat[sel, ])
     }
   } else if (C.perc == "balance") { # over-sampling percent. will be calculated
@@ -75,9 +79,13 @@ RandOverClassif <- function(form, dat, C.perc = "balance", repl = TRUE)
       Exs <- which(dat[, tgt] == names.ove[i])
       num1 <- li[[2]][as.numeric(match(majCl, names))[1]]
       num2<-  li[[2]][as.numeric(names.ove[i])]
-      sel <- sample(Exs,
-                    as.integer(num1 - num2),
-                    replace = repl)
+      if(length(Exs) == 1){
+        sel <- rep(Exs, as.integer(num1 - num2))
+      } else {
+        sel <- sample(Exs,
+                      as.integer(num1 - num2),
+                      replace = repl)
+      }
       newdata <- rbind(newdata, dat[sel, ])
     }
   } else if (C.perc == "extreme") { 
@@ -97,9 +105,13 @@ RandOverClassif <- function(form, dat, C.perc = "balance", repl = TRUE)
       mmcl <- as.numeric(match(majCl, names))[1]
       n1 <- (li[[2]][mmcl])^2/li[[2]][as.numeric(match(names.ove[i], names))]
       n2 <- li[[2]][as.numeric(match(names.ove[i], names))]
-      sel <- sample(Exs,
-                    round(n1 - n2, 0),
-                    replace = repl)
+      if(length(Exs) == 1){
+        sel <- rep(Exs, round(n1 - n2, 0))
+      } else {
+        sel <- sample(Exs,
+                      round(n1 - n2, 0),
+                      replace = repl)
+      }
       newdata <- rbind(newdata, dat[sel, ])
     }
   } else {

diff --git a/R/randUnderClassif.R b/R/randUnderClassif.R
@@ -124,7 +124,7 @@ class.freq <- function(dat, tgt){
   names <- sort(unique(dat[, tgt]))
   li <- list(names, 
              sapply(names, 
-                    function(x) length(which(dat[, tgt] == names[x]))))
+                    function(x) length(which(dat[, tgt] == x))))
   li
 }
 

diff --git a/R/smoteClassif.R b/R/smoteClassif.R
@@ -99,16 +99,37 @@ SmoteClassif <- function(form, dat, C.perc = "balance",
     }
     if (length(names.ove)) { # perform over-sampling
       for (i in 1:length(names.ove)) {
-        newExs <- Smote.exsClassif(dat[which(dat[, ncol(dat)] == names.ove[i]), ],
-                                   ncol(dat),
-                                   C.perc[[names.ove[i]]],
-                                   k,
-                                   dist,
-                                   p)
-        # add original rare examples and synthetic generated examples
-        newdata <- rbind(newdata,
-                         newExs,
-                         dat[which(dat[, ncol(dat)] == names.ove[i]), ])
+        if(length(which(dat[, ncol(dat)] == names.ove[i])) == 1){
+          warning(paste("SmoteClassif :: Unable to use SmoteClassif in a bump with 1 example.
+                        Introducing replicas of the example."), call.=FALSE)
+          newdata <- rbind(newdata, dat[rep(which(dat[, ncol(dat)] == names.ove[i]),C.perc[names.ove[i]]),])
+        } else if (length(which(dat[, ncol(dat)] == names.ove[i])) <= k){
+          warning(paste("SmoteClassif :: Nr of examples is less or equal to k.\n Using k =",
+                        length(which(dat[, ncol(dat)] == names.ove[i]))-1, 
+                        "in the nearest neighbours computation in this bump."), call.=FALSE)
+          Origk <- k
+          k <- length(which(dat[, ncol(dat)] == names.ove[i]))-1
+          newExs <- Smote.exsClassif(dat[which(dat[, ncol(dat)] == names.ove[i]), ],
+                                     ncol(dat),
+                                     li[[3]][ove[i]]/li[[2]][ove[i]] + 1,
+                                     k,
+                                     dist,
+                                     p)
+          # add original rare examples and synthetic generated examples
+          newdata <- rbind(newdata, newExs, 
+                           dat[which(dat[,ncol(dat)] == names.ove[i]),])
+          k <- Origk
+        } else {
+          newExs <- Smote.exsClassif(dat[which(dat[, ncol(dat)] == names.ove[i]), ],
+                                     ncol(dat),
+                                     C.perc[[names.ove[i]]],
+                                     k,
+                                     dist,
+                                     p)
+          # add original rare examples and synthetic generated examples
+          newdata <- rbind(newdata, newExs,
+                           dat[which(dat[, ncol(dat)] == names.ove[i]), ])
+        }
       }
     }
   } else {
@@ -140,15 +161,37 @@ SmoteClassif <- function(form, dat, C.perc = "balance",
 
     if (length(ove)) { #perform over-sampling
       for (i in 1:length(ove)) {
-        newExs <- Smote.exsClassif(dat[which(dat[, ncol(dat)] == li[[1]][ove[i]]), ],
-                                   ncol(dat),
-                                   li[[3]][ove[i]]/li[[2]][ove[i]] + 1,
-                                   k,
-                                   dist,
-                                   p)
-        # add original rare examples and synthetic generated examples
-        dc <- ncol(dat)
-        newdata <- rbind(newdata, newExs, dat[which(dat[,dc] == li[[1]][ove[i]]),])
+        if(length(which(dat[, ncol(dat)] == li[[1]][ove[i]])) == 1){
+          warning(paste("SmoteClassif :: Unable to use SmoteClassif in a bump with 1 example.
+                        Introducing replicas of the example."), call.=FALSE)
+          newdata <- rbind(newdata, dat[rep(which(dat[, ncol(dat)] == li[[1]][ove[i]]), li[[3]][ove[i]]),])
+        } else if(length(which(dat[, ncol(dat)] == li[[1]][ove[i]]))<= k){
+          warning(paste("SmoteClassif :: Nr of examples is less or equal to k.\n Using k =",
+                        length(which(dat[, ncol(dat)] == li[[1]][ove[i]]))-1, 
+                        "in the nearest neighbours computation in this bump."), call.=FALSE)
+          Origk <- k
+          k <- length(which(dat[, ncol(dat)] == li[[1]][ove[i]]))-1
+          newExs <- Smote.exsClassif(dat[which(dat[, ncol(dat)] == li[[1]][ove[i]]), ],
+                                     ncol(dat),
+                                     li[[3]][ove[i]]/li[[2]][ove[i]] + 1,
+                                     k,
+                                     dist,
+                                     p)
+          # add original rare examples and synthetic generated examples
+          newdata <- rbind(newdata, newExs, 
+                           dat[which(dat[,ncol(dat)] == li[[1]][ove[i]]),])
+          k <- Origk
+        } else {
+          newExs <- Smote.exsClassif(dat[which(dat[, ncol(dat)] == li[[1]][ove[i]]), ],
+                                     ncol(dat),
+                                     li[[3]][ove[i]]/li[[2]][ove[i]] + 1,
+                                     k,
+                                     dist,
+                                     p)
+          # add original rare examples and synthetic generated examples
+          newdata <- rbind(newdata, newExs, 
+                           dat[which(dat[,ncol(dat)] == li[[1]][ove[i]]),])
+        }
       } 
     }