Skip to content

Commit

Permalink
corrections and function for obtaining distances
Browse files Browse the repository at this point in the history
  • Loading branch information
paobranco committed Jul 13, 2017
1 parent 2ad133b commit 2822270
Show file tree
Hide file tree
Showing 15 changed files with 397 additions and 58 deletions.
13 changes: 10 additions & 3 deletions CHANGES
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
VERSION 0.0.6 (2017-01-03)
VERSION 0.0.6 (2017-06-20)
- added adasyn algorithm for classification
- added method for interpolating utility/benefit/cost surfaces
- added utility-based metrics for classification and regression tasks (utility, cost and benefit)
- added a function for utility-based learning for classification (from Elkan 2001)
- added a function for utility-based learning for regression
- added a function for utility-based learning optimization for classification (from Elkan 2001)
- added a function for utility-based learning optimization for regression
- added a function that returns the distances between all pairs of
example computed according to a selected distance metric
- bugs correction in SmoteRegress function (caused by constant features)
- bugs correction on RandOverClassif (caused by classes with only one example)
- bugs correction on neighbours function (Fortran code)
- routines registration added

VERSION 0.0.5 (2016-07-13)
- added two synthetic data sets, for classification and regression
Expand Down
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ Description: Provides a set of functions that can be used to obtain better predi
Version: 0.0.6
Depends: R(>= 3.0), methods, grDevices, graphics, stats, MBA, gstat, automap, sp, randomForest
Suggests: MASS, rpart, testthat, DMwR, ggplot2, e1071
Date: 2017-01-03
Date: 2017-07-13
Authors@R: c(person("Paula", "Branco", email = "[email protected]", role = c("aut", "cre")), person("Rita", "Ribeiro", email = "[email protected]", role = c("aut", "ctb")), person("Luis", "Torgo", email = "[email protected]", role = c("aut", "ctb")))
URL: https://github.com/paobranco/UBL
BugReports: https://github.com/paobranco/UBL/issues
Expand Down
14 changes: 9 additions & 5 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@

## ------------------------------------------------------------------------------------------------

useDynLib(UBL)
useDynLib(UBL, .registration=TRUE)

## ------------------------------------------------------------------------------------------------

Expand Down Expand Up @@ -46,9 +46,9 @@ export(
# phi.extremes,
# phi.range,
phi,
# tPhi,
# BL,
# UtilTRPhiBL,
#tPhi,
#BL,
#UtilNewRegress,
##surface interpolation methods
UtilInterpol,
## utility-based evaluation metrics for classification and regression
Expand All @@ -57,7 +57,11 @@ export(
## utility-based optimal predictions
UtilOptimClassif,
UtilOptimRegress,
## utility-based learning
#MetacostClassif,
#MetacostRegress,
## neighbours function
neighbours
neighbours,
distances
)

115 changes: 115 additions & 0 deletions R/Distances.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# ===================================================
# This function prepares the data and calls fortran subroutine
# that computes the distances between examples
# P. Branco, June 2017
# ---------------------------------------------------

distances <- function(tgt, dat, dist, p=2)
# INPUTS:
# tgt is the column where the target variable is
# dat is the original data set
# dist is the distance measure to be used
# p is a parameter used when a p-norm is computed
# OUTPUTS:
# a matrix with the distances between each example
{
# check if p has an admissible value(>=1) and change to distance integer code
# p-norm : p provided
# Manhattan : p=1
# Euclidean : p=2
# Chebyshev : p=0
# Canberra : p=-1
# HEOM : p=-2
# HVDM : p=-3
# DVDM : p=-4
# IVDM : p=-5
# WVDM : p=-6
# MVDM : p=-7

# just set a dummy value for the number of neighbours to evaluate, which is not
# used in this function but is necessary for the fortran function.

k <- 1

if(p<1) stop("The parameter p must be >=1!")
# p >=-1 only numeric attributes handled
# p =-2 only nominal attributes handled
# p<= -3 nominal and numeric attributes handled
p<- switch(dist,
"Chebyshev"=0,
"Manhattan"=1,
"Euclidean"=2,
"Canberra"=-1,
"Overlap"=-2,
"HEOM"=-3,
"HVDM"=-4,
# to be implemented
# "DVDM"=-5,
# "IVDM"=-6,
# "WVDM"=-7,
# "MVDM"=-8,
"p-norm"=p,
stop("Distance measure not available!"))


if (class(dat[,tgt]) == "numeric" & p <= -4) stop("distance measure selected only available for classification tasks")

nomatr <- c()
for (col in seq.int(dim(dat)[2])) {
if (class(dat[,col]) %in% c('factor','character')) {
nomatr <- c(nomatr, col)
}
}

nomatr <- setdiff(nomatr, tgt)
numatr <- setdiff(seq.int(dim(dat)[2]), c(nomatr,tgt))

nomData <- t(sapply(subset(dat, select = nomatr), as.integer))
numData <- t(subset(dat, select = numatr))

# check if the measure can be applied to the data set features

if (length(numatr) & p == -2) {
stop("Can not compute Overlap metric with numeric attributes!")
}
if (length(nomatr) & p >= -1) {
stop("Can not compute ", dist ," distance with nominal attributes!")
}

tgtData <- dat[, tgt]
n <- length(tgtData)
res <- matrix(0.0, nrow = k, ncol = n)
if (class(tgtData) != "numeric") {tgtData <- as.integer(tgtData)}

Cl <- length(unique(tgtData))
nnom <- length(nomatr)
nnum <- length(numatr)

distm <- matrix(0.0, nrow = n, ncol = n)
numD <- matrix(0.0, nrow = nnum, ncol = n)


storage.mode(numData) <- "double"
storage.mode(nomData) <- "integer"
storage.mode(res) <- "integer"
storage.mode(tgtData) <- "double"
storage.mode(distm) <- "double"
storage.mode(numD) <- "double"

neig <- .Fortran("F_neighbours",
tgtData = tgtData, # tgt data
numData = numData, #numeric data
nomData = nomData, #nominal data
p = as.integer(p), # code for distance metric
k = as.integer(k), # nr of neighbours
n = as.integer(n), # nr of examples in the data
nnum = as.integer(nnum), # nr of numeric attributes
nnom = as.integer(nnom), # nr of nominal attributes
Cl = as.integer(Cl), # number of different classes in the target variable
distm = distm,
numD = numD,
res = res) # output
neig <- neig$distm

neig
}
2 changes: 1 addition & 1 deletion R/Neighbours.R
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ nomatr <- c()
storage.mode(distm) <- "double"
storage.mode(numD) <- "double"

neig <- .Fortran("neighbours",
neig <- .Fortran("F_neighbours",
tgtData = tgtData, # tgt data
numData = numData, #numeric data
nomData = nomData, #nominal data
Expand Down
17 changes: 13 additions & 4 deletions R/UtilOptimRegress.R
Original file line number Diff line number Diff line change
Expand Up @@ -67,15 +67,17 @@ UtilOptimRegress <- function(form, train, test,
# has the y value, the second column the \hat{y} value and the
# third column has the corresponding utility value. The domain
# boundaries of (y, \hat{y}) must be provided.
# minds the lower bound of the target variable considered for interpolation
# maxds the upper bound of the target variable considered for interpolation
# eps a value for the precision considered during the interpolation.
# minds the lower bound of the target variable considered
# maxds the upper bound of the target variable considered
# eps a value for the precision considered during the pdf.
#
# output:
# the predictions for the test data optimized using the surface provided

type <- match.arg(type, c("utility", "cost", "benefit"))
strat <- match.arg(strat, c("interpol", "automatic")) # only interpol implemented for now
if (strat != "interpol") stop("UBL::Only interpolation is available for now as strat parameter",
call. = FALSE)
tgt <- which(names(train) == as.character(form[[2]]))

if (is.null(minds)){
Expand All @@ -94,10 +96,17 @@ UtilOptimRegress <- function(form, train, test,
No further arguments are necessary.", call. = FALSE)
}
method <- match.arg(strat.parms[[1]], c("bilinear", "splines", "idw", "krige"))
# UtilRes is a lxl matrix with the true utility values on the rows and the
# predictions on the columns, i.e., resUtil[a,b] provides the utility of
# predicting b for a true value a.
UtilRes <- UtilInterpol(NULL, NULL, type, control.parms,
minds, maxds, m.pts,
method = method, visual = FALSE, eps = eps,
full.output = TRUE)
} else if(strat == "auto"){
# baseseq <- seq(minds-0.01, maxds+0.01, by=eps)
# if(baseseq[length(baseseq)]!=maxds) baseseq <- c(baseseq, maxds)

}

resPDF <- getPDFinRange(y.true, test, train, form)
Expand All @@ -117,7 +126,7 @@ UtilOptimRegress <- function(form, train, test,
} else {
optim[ex] <- y.true[which.min(areas)]
}
print(ex)
# print(ex)
}

#obtain the utility values for the points (test, optim) determined
Expand Down
24 changes: 18 additions & 6 deletions R/randOverClassif.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,13 @@ RandOverClassif <- function(form, dat, C.perc = "balance", repl = TRUE)

for (i in 1:length(names.ove)) { # over-sampling each class provided
Exs <- which(dat[, tgt] == names.ove[i])
if(length(Exs)==1){
sel <- rep(Exs, as.integer((C.perc[[names.ove[i]]] - 1) * length(Exs)))
} else {
sel <- sample(Exs,
as.integer((C.perc[[names.ove[i]]] - 1) * length(Exs)),
replace = repl)
}
newdata <- rbind(newdata, dat[sel, ])
}
} else if (C.perc == "balance") { # over-sampling percent. will be calculated
Expand All @@ -75,9 +79,13 @@ RandOverClassif <- function(form, dat, C.perc = "balance", repl = TRUE)
Exs <- which(dat[, tgt] == names.ove[i])
num1 <- li[[2]][as.numeric(match(majCl, names))[1]]
num2<- li[[2]][as.numeric(names.ove[i])]
sel <- sample(Exs,
as.integer(num1 - num2),
replace = repl)
if(length(Exs) == 1){
sel <- rep(Exs, as.integer(num1 - num2))
} else {
sel <- sample(Exs,
as.integer(num1 - num2),
replace = repl)
}
newdata <- rbind(newdata, dat[sel, ])
}
} else if (C.perc == "extreme") {
Expand All @@ -97,9 +105,13 @@ RandOverClassif <- function(form, dat, C.perc = "balance", repl = TRUE)
mmcl <- as.numeric(match(majCl, names))[1]
n1 <- (li[[2]][mmcl])^2/li[[2]][as.numeric(match(names.ove[i], names))]
n2 <- li[[2]][as.numeric(match(names.ove[i], names))]
sel <- sample(Exs,
round(n1 - n2, 0),
replace = repl)
if(length(Exs) == 1){
sel <- rep(Exs, round(n1 - n2, 0))
} else {
sel <- sample(Exs,
round(n1 - n2, 0),
replace = repl)
}
newdata <- rbind(newdata, dat[sel, ])
}
} else {
Expand Down
2 changes: 1 addition & 1 deletion R/randUnderClassif.R
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ class.freq <- function(dat, tgt){
names <- sort(unique(dat[, tgt]))
li <- list(names,
sapply(names,
function(x) length(which(dat[, tgt] == names[x]))))
function(x) length(which(dat[, tgt] == x))))
li
}

Expand Down
81 changes: 62 additions & 19 deletions R/smoteClassif.R
Original file line number Diff line number Diff line change
Expand Up @@ -99,16 +99,37 @@ SmoteClassif <- function(form, dat, C.perc = "balance",
}
if (length(names.ove)) { # perform over-sampling
for (i in 1:length(names.ove)) {
newExs <- Smote.exsClassif(dat[which(dat[, ncol(dat)] == names.ove[i]), ],
ncol(dat),
C.perc[[names.ove[i]]],
k,
dist,
p)
# add original rare examples and synthetic generated examples
newdata <- rbind(newdata,
newExs,
dat[which(dat[, ncol(dat)] == names.ove[i]), ])
if(length(which(dat[, ncol(dat)] == names.ove[i])) == 1){
warning(paste("SmoteClassif :: Unable to use SmoteClassif in a bump with 1 example.
Introducing replicas of the example."), call.=FALSE)
newdata <- rbind(newdata, dat[rep(which(dat[, ncol(dat)] == names.ove[i]),C.perc[names.ove[i]]),])
} else if (length(which(dat[, ncol(dat)] == names.ove[i])) <= k){
warning(paste("SmoteClassif :: Nr of examples is less or equal to k.\n Using k =",
length(which(dat[, ncol(dat)] == names.ove[i]))-1,
"in the nearest neighbours computation in this bump."), call.=FALSE)
Origk <- k
k <- length(which(dat[, ncol(dat)] == names.ove[i]))-1
newExs <- Smote.exsClassif(dat[which(dat[, ncol(dat)] == names.ove[i]), ],
ncol(dat),
li[[3]][ove[i]]/li[[2]][ove[i]] + 1,
k,
dist,
p)
# add original rare examples and synthetic generated examples
newdata <- rbind(newdata, newExs,
dat[which(dat[,ncol(dat)] == names.ove[i]),])
k <- Origk
} else {
newExs <- Smote.exsClassif(dat[which(dat[, ncol(dat)] == names.ove[i]), ],
ncol(dat),
C.perc[[names.ove[i]]],
k,
dist,
p)
# add original rare examples and synthetic generated examples
newdata <- rbind(newdata, newExs,
dat[which(dat[, ncol(dat)] == names.ove[i]), ])
}
}
}
} else {
Expand Down Expand Up @@ -140,15 +161,37 @@ SmoteClassif <- function(form, dat, C.perc = "balance",

if (length(ove)) { #perform over-sampling
for (i in 1:length(ove)) {
newExs <- Smote.exsClassif(dat[which(dat[, ncol(dat)] == li[[1]][ove[i]]), ],
ncol(dat),
li[[3]][ove[i]]/li[[2]][ove[i]] + 1,
k,
dist,
p)
# add original rare examples and synthetic generated examples
dc <- ncol(dat)
newdata <- rbind(newdata, newExs, dat[which(dat[,dc] == li[[1]][ove[i]]),])
if(length(which(dat[, ncol(dat)] == li[[1]][ove[i]])) == 1){
warning(paste("SmoteClassif :: Unable to use SmoteClassif in a bump with 1 example.
Introducing replicas of the example."), call.=FALSE)
newdata <- rbind(newdata, dat[rep(which(dat[, ncol(dat)] == li[[1]][ove[i]]), li[[3]][ove[i]]),])
} else if(length(which(dat[, ncol(dat)] == li[[1]][ove[i]]))<= k){
warning(paste("SmoteClassif :: Nr of examples is less or equal to k.\n Using k =",
length(which(dat[, ncol(dat)] == li[[1]][ove[i]]))-1,
"in the nearest neighbours computation in this bump."), call.=FALSE)
Origk <- k
k <- length(which(dat[, ncol(dat)] == li[[1]][ove[i]]))-1
newExs <- Smote.exsClassif(dat[which(dat[, ncol(dat)] == li[[1]][ove[i]]), ],
ncol(dat),
li[[3]][ove[i]]/li[[2]][ove[i]] + 1,
k,
dist,
p)
# add original rare examples and synthetic generated examples
newdata <- rbind(newdata, newExs,
dat[which(dat[,ncol(dat)] == li[[1]][ove[i]]),])
k <- Origk
} else {
newExs <- Smote.exsClassif(dat[which(dat[, ncol(dat)] == li[[1]][ove[i]]), ],
ncol(dat),
li[[3]][ove[i]]/li[[2]][ove[i]] + 1,
k,
dist,
p)
# add original rare examples and synthetic generated examples
newdata <- rbind(newdata, newExs,
dat[which(dat[,ncol(dat)] == li[[1]][ove[i]]),])
}
}
}

Expand Down
Loading

0 comments on commit 2822270

Please sign in to comment.