From 2822270d9cb9eecf39e4caf43114ded1f734d8ab Mon Sep 17 00:00:00 2001 From: paobranco Date: Thu, 13 Jul 2017 12:56:40 +0100 Subject: [PATCH] corrections and function for obtaining distances --- CHANGES | 13 +++-- DESCRIPTION | 2 +- NAMESPACE | 14 +++-- R/Distances.R | 115 ++++++++++++++++++++++++++++++++++++++++ R/Neighbours.R | 2 +- R/UtilOptimRegress.R | 17 ++++-- R/randOverClassif.R | 24 ++++++--- R/randUnderClassif.R | 2 +- R/smoteClassif.R | 81 +++++++++++++++++++++------- R/smoteRegress.R | 55 +++++++++++++++---- man/Distances.Rd | 69 ++++++++++++++++++++++++ man/UtilOptimClassif.Rd | 2 +- man/neighbours.Rd | 7 +++ src/UBL_init.c | 23 ++++++++ src/neighbours.f90 | 29 +++++++--- 15 files changed, 397 insertions(+), 58 deletions(-) create mode 100644 R/Distances.R create mode 100644 man/Distances.Rd create mode 100644 src/UBL_init.c diff --git a/CHANGES b/CHANGES index 1bc7a12..e9f9700 100644 --- a/CHANGES +++ b/CHANGES @@ -1,8 +1,15 @@ -VERSION 0.0.6 (2017-01-03) +VERSION 0.0.6 (2017-06-20) + - added adasyn algorithm for classification - added method for interpolating utility/benefit/cost surfaces - added utility-based metrics for classification and regression tasks (utility, cost and benefit) - - added a function for utility-based learning for classification (from Elkan 2001) - - added a function for utility-based learning for regression + - added a function for utility-based learning optimization for classification (from Elkan 2001) + - added a function for utility-based learning optimization for regression + - added a function that returns the distances between all pairs of + example computed according to a selected distance metric + - bugs correction in SmoteRegress function (caused by constant features) + - bugs correction on RandOverClassif (caused by classes with only one example) + - bugs correction on neighbours function (Fortran code) + - routines registration added VERSION 0.0.5 (2016-07-13) - added two synthetic data sets, for classification and regression diff --git a/DESCRIPTION b/DESCRIPTION index ca41209..c9465bf 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -5,7 +5,7 @@ Description: Provides a set of functions that can be used to obtain better predi Version: 0.0.6 Depends: R(>= 3.0), methods, grDevices, graphics, stats, MBA, gstat, automap, sp, randomForest Suggests: MASS, rpart, testthat, DMwR, ggplot2, e1071 -Date: 2017-01-03 +Date: 2017-07-13 Authors@R: c(person("Paula", "Branco", email = "paobranco@gmail.com", role = c("aut", "cre")), person("Rita", "Ribeiro", email = "rpribeiro@dcc.fc.up.pt", role = c("aut", "ctb")), person("Luis", "Torgo", email = "ltorgo@dcc.fc.up.pt", role = c("aut", "ctb"))) URL: https://github.com/paobranco/UBL BugReports: https://github.com/paobranco/UBL/issues diff --git a/NAMESPACE b/NAMESPACE index 993ac66..6763c3f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,7 +1,7 @@ ## ------------------------------------------------------------------------------------------------ -useDynLib(UBL) +useDynLib(UBL, .registration=TRUE) ## ------------------------------------------------------------------------------------------------ @@ -46,9 +46,9 @@ export( # phi.extremes, # phi.range, phi, -# tPhi, -# BL, -# UtilTRPhiBL, + #tPhi, + #BL, + #UtilNewRegress, ##surface interpolation methods UtilInterpol, ## utility-based evaluation metrics for classification and regression @@ -57,7 +57,11 @@ export( ## utility-based optimal predictions UtilOptimClassif, UtilOptimRegress, + ## utility-based learning + #MetacostClassif, + #MetacostRegress, ## neighbours function - neighbours + neighbours, + distances ) diff --git a/R/Distances.R b/R/Distances.R new file mode 100644 index 0000000..df7e112 --- /dev/null +++ b/R/Distances.R @@ -0,0 +1,115 @@ +# =================================================== +# This function prepares the data and calls fortran subroutine +# that computes the distances between examples +# P. Branco, June 2017 +# --------------------------------------------------- + +distances <- function(tgt, dat, dist, p=2) + # INPUTS: + # tgt is the column where the target variable is + # dat is the original data set + # dist is the distance measure to be used + # p is a parameter used when a p-norm is computed + # OUTPUTS: + # a matrix with the distances between each example +{ + # check if p has an admissible value(>=1) and change to distance integer code + # p-norm : p provided + # Manhattan : p=1 + # Euclidean : p=2 + # Chebyshev : p=0 + # Canberra : p=-1 + # HEOM : p=-2 + # HVDM : p=-3 + # DVDM : p=-4 + # IVDM : p=-5 + # WVDM : p=-6 + # MVDM : p=-7 + + # just set a dummy value for the number of neighbours to evaluate, which is not + # used in this function but is necessary for the fortran function. + + k <- 1 + + if(p<1) stop("The parameter p must be >=1!") + # p >=-1 only numeric attributes handled + # p =-2 only nominal attributes handled + # p<= -3 nominal and numeric attributes handled + p<- switch(dist, + "Chebyshev"=0, + "Manhattan"=1, + "Euclidean"=2, + "Canberra"=-1, + "Overlap"=-2, + "HEOM"=-3, + "HVDM"=-4, + # to be implemented + # "DVDM"=-5, + # "IVDM"=-6, + # "WVDM"=-7, + # "MVDM"=-8, + "p-norm"=p, + stop("Distance measure not available!")) + + + if (class(dat[,tgt]) == "numeric" & p <= -4) stop("distance measure selected only available for classification tasks") + + nomatr <- c() + for (col in seq.int(dim(dat)[2])) { + if (class(dat[,col]) %in% c('factor','character')) { + nomatr <- c(nomatr, col) + } + } + + nomatr <- setdiff(nomatr, tgt) + numatr <- setdiff(seq.int(dim(dat)[2]), c(nomatr,tgt)) + + nomData <- t(sapply(subset(dat, select = nomatr), as.integer)) + numData <- t(subset(dat, select = numatr)) + + # check if the measure can be applied to the data set features + + if (length(numatr) & p == -2) { + stop("Can not compute Overlap metric with numeric attributes!") + } + if (length(nomatr) & p >= -1) { + stop("Can not compute ", dist ," distance with nominal attributes!") + } + + tgtData <- dat[, tgt] + n <- length(tgtData) + res <- matrix(0.0, nrow = k, ncol = n) + if (class(tgtData) != "numeric") {tgtData <- as.integer(tgtData)} + + Cl <- length(unique(tgtData)) + nnom <- length(nomatr) + nnum <- length(numatr) + + distm <- matrix(0.0, nrow = n, ncol = n) + numD <- matrix(0.0, nrow = nnum, ncol = n) + + + storage.mode(numData) <- "double" + storage.mode(nomData) <- "integer" + storage.mode(res) <- "integer" + storage.mode(tgtData) <- "double" + storage.mode(distm) <- "double" + storage.mode(numD) <- "double" + + neig <- .Fortran("F_neighbours", + tgtData = tgtData, # tgt data + numData = numData, #numeric data + nomData = nomData, #nominal data + p = as.integer(p), # code for distance metric + k = as.integer(k), # nr of neighbours + n = as.integer(n), # nr of examples in the data + nnum = as.integer(nnum), # nr of numeric attributes + nnom = as.integer(nnom), # nr of nominal attributes + Cl = as.integer(Cl), # number of different classes in the target variable + distm = distm, + numD = numD, + res = res) # output + neig <- neig$distm + + neig +} diff --git a/R/Neighbours.R b/R/Neighbours.R index 0ec5e3d..36ca07a 100644 --- a/R/Neighbours.R +++ b/R/Neighbours.R @@ -93,7 +93,7 @@ nomatr <- c() storage.mode(distm) <- "double" storage.mode(numD) <- "double" - neig <- .Fortran("neighbours", + neig <- .Fortran("F_neighbours", tgtData = tgtData, # tgt data numData = numData, #numeric data nomData = nomData, #nominal data diff --git a/R/UtilOptimRegress.R b/R/UtilOptimRegress.R index 4e52aea..5887b11 100644 --- a/R/UtilOptimRegress.R +++ b/R/UtilOptimRegress.R @@ -67,15 +67,17 @@ UtilOptimRegress <- function(form, train, test, # has the y value, the second column the \hat{y} value and the # third column has the corresponding utility value. The domain # boundaries of (y, \hat{y}) must be provided. - # minds the lower bound of the target variable considered for interpolation - # maxds the upper bound of the target variable considered for interpolation - # eps a value for the precision considered during the interpolation. + # minds the lower bound of the target variable considered + # maxds the upper bound of the target variable considered + # eps a value for the precision considered during the pdf. # # output: # the predictions for the test data optimized using the surface provided type <- match.arg(type, c("utility", "cost", "benefit")) strat <- match.arg(strat, c("interpol", "automatic")) # only interpol implemented for now + if (strat != "interpol") stop("UBL::Only interpolation is available for now as strat parameter", + call. = FALSE) tgt <- which(names(train) == as.character(form[[2]])) if (is.null(minds)){ @@ -94,10 +96,17 @@ UtilOptimRegress <- function(form, train, test, No further arguments are necessary.", call. = FALSE) } method <- match.arg(strat.parms[[1]], c("bilinear", "splines", "idw", "krige")) + # UtilRes is a lxl matrix with the true utility values on the rows and the + # predictions on the columns, i.e., resUtil[a,b] provides the utility of + # predicting b for a true value a. UtilRes <- UtilInterpol(NULL, NULL, type, control.parms, minds, maxds, m.pts, method = method, visual = FALSE, eps = eps, full.output = TRUE) + } else if(strat == "auto"){ + # baseseq <- seq(minds-0.01, maxds+0.01, by=eps) + # if(baseseq[length(baseseq)]!=maxds) baseseq <- c(baseseq, maxds) + } resPDF <- getPDFinRange(y.true, test, train, form) @@ -117,7 +126,7 @@ UtilOptimRegress <- function(form, train, test, } else { optim[ex] <- y.true[which.min(areas)] } - print(ex) +# print(ex) } #obtain the utility values for the points (test, optim) determined diff --git a/R/randOverClassif.R b/R/randOverClassif.R index c836c1b..7d3f1d3 100644 --- a/R/randOverClassif.R +++ b/R/randOverClassif.R @@ -56,9 +56,13 @@ RandOverClassif <- function(form, dat, C.perc = "balance", repl = TRUE) for (i in 1:length(names.ove)) { # over-sampling each class provided Exs <- which(dat[, tgt] == names.ove[i]) + if(length(Exs)==1){ + sel <- rep(Exs, as.integer((C.perc[[names.ove[i]]] - 1) * length(Exs))) + } else { sel <- sample(Exs, as.integer((C.perc[[names.ove[i]]] - 1) * length(Exs)), replace = repl) + } newdata <- rbind(newdata, dat[sel, ]) } } else if (C.perc == "balance") { # over-sampling percent. will be calculated @@ -75,9 +79,13 @@ RandOverClassif <- function(form, dat, C.perc = "balance", repl = TRUE) Exs <- which(dat[, tgt] == names.ove[i]) num1 <- li[[2]][as.numeric(match(majCl, names))[1]] num2<- li[[2]][as.numeric(names.ove[i])] - sel <- sample(Exs, - as.integer(num1 - num2), - replace = repl) + if(length(Exs) == 1){ + sel <- rep(Exs, as.integer(num1 - num2)) + } else { + sel <- sample(Exs, + as.integer(num1 - num2), + replace = repl) + } newdata <- rbind(newdata, dat[sel, ]) } } else if (C.perc == "extreme") { @@ -97,9 +105,13 @@ RandOverClassif <- function(form, dat, C.perc = "balance", repl = TRUE) mmcl <- as.numeric(match(majCl, names))[1] n1 <- (li[[2]][mmcl])^2/li[[2]][as.numeric(match(names.ove[i], names))] n2 <- li[[2]][as.numeric(match(names.ove[i], names))] - sel <- sample(Exs, - round(n1 - n2, 0), - replace = repl) + if(length(Exs) == 1){ + sel <- rep(Exs, round(n1 - n2, 0)) + } else { + sel <- sample(Exs, + round(n1 - n2, 0), + replace = repl) + } newdata <- rbind(newdata, dat[sel, ]) } } else { diff --git a/R/randUnderClassif.R b/R/randUnderClassif.R index ccb5f96..41ab34b 100644 --- a/R/randUnderClassif.R +++ b/R/randUnderClassif.R @@ -124,7 +124,7 @@ class.freq <- function(dat, tgt){ names <- sort(unique(dat[, tgt])) li <- list(names, sapply(names, - function(x) length(which(dat[, tgt] == names[x])))) + function(x) length(which(dat[, tgt] == x)))) li } diff --git a/R/smoteClassif.R b/R/smoteClassif.R index e673187..103c1b1 100644 --- a/R/smoteClassif.R +++ b/R/smoteClassif.R @@ -99,16 +99,37 @@ SmoteClassif <- function(form, dat, C.perc = "balance", } if (length(names.ove)) { # perform over-sampling for (i in 1:length(names.ove)) { - newExs <- Smote.exsClassif(dat[which(dat[, ncol(dat)] == names.ove[i]), ], - ncol(dat), - C.perc[[names.ove[i]]], - k, - dist, - p) - # add original rare examples and synthetic generated examples - newdata <- rbind(newdata, - newExs, - dat[which(dat[, ncol(dat)] == names.ove[i]), ]) + if(length(which(dat[, ncol(dat)] == names.ove[i])) == 1){ + warning(paste("SmoteClassif :: Unable to use SmoteClassif in a bump with 1 example. + Introducing replicas of the example."), call.=FALSE) + newdata <- rbind(newdata, dat[rep(which(dat[, ncol(dat)] == names.ove[i]),C.perc[names.ove[i]]),]) + } else if (length(which(dat[, ncol(dat)] == names.ove[i])) <= k){ + warning(paste("SmoteClassif :: Nr of examples is less or equal to k.\n Using k =", + length(which(dat[, ncol(dat)] == names.ove[i]))-1, + "in the nearest neighbours computation in this bump."), call.=FALSE) + Origk <- k + k <- length(which(dat[, ncol(dat)] == names.ove[i]))-1 + newExs <- Smote.exsClassif(dat[which(dat[, ncol(dat)] == names.ove[i]), ], + ncol(dat), + li[[3]][ove[i]]/li[[2]][ove[i]] + 1, + k, + dist, + p) + # add original rare examples and synthetic generated examples + newdata <- rbind(newdata, newExs, + dat[which(dat[,ncol(dat)] == names.ove[i]),]) + k <- Origk + } else { + newExs <- Smote.exsClassif(dat[which(dat[, ncol(dat)] == names.ove[i]), ], + ncol(dat), + C.perc[[names.ove[i]]], + k, + dist, + p) + # add original rare examples and synthetic generated examples + newdata <- rbind(newdata, newExs, + dat[which(dat[, ncol(dat)] == names.ove[i]), ]) + } } } } else { @@ -140,15 +161,37 @@ SmoteClassif <- function(form, dat, C.perc = "balance", if (length(ove)) { #perform over-sampling for (i in 1:length(ove)) { - newExs <- Smote.exsClassif(dat[which(dat[, ncol(dat)] == li[[1]][ove[i]]), ], - ncol(dat), - li[[3]][ove[i]]/li[[2]][ove[i]] + 1, - k, - dist, - p) - # add original rare examples and synthetic generated examples - dc <- ncol(dat) - newdata <- rbind(newdata, newExs, dat[which(dat[,dc] == li[[1]][ove[i]]),]) + if(length(which(dat[, ncol(dat)] == li[[1]][ove[i]])) == 1){ + warning(paste("SmoteClassif :: Unable to use SmoteClassif in a bump with 1 example. + Introducing replicas of the example."), call.=FALSE) + newdata <- rbind(newdata, dat[rep(which(dat[, ncol(dat)] == li[[1]][ove[i]]), li[[3]][ove[i]]),]) + } else if(length(which(dat[, ncol(dat)] == li[[1]][ove[i]]))<= k){ + warning(paste("SmoteClassif :: Nr of examples is less or equal to k.\n Using k =", + length(which(dat[, ncol(dat)] == li[[1]][ove[i]]))-1, + "in the nearest neighbours computation in this bump."), call.=FALSE) + Origk <- k + k <- length(which(dat[, ncol(dat)] == li[[1]][ove[i]]))-1 + newExs <- Smote.exsClassif(dat[which(dat[, ncol(dat)] == li[[1]][ove[i]]), ], + ncol(dat), + li[[3]][ove[i]]/li[[2]][ove[i]] + 1, + k, + dist, + p) + # add original rare examples and synthetic generated examples + newdata <- rbind(newdata, newExs, + dat[which(dat[,ncol(dat)] == li[[1]][ove[i]]),]) + k <- Origk + } else { + newExs <- Smote.exsClassif(dat[which(dat[, ncol(dat)] == li[[1]][ove[i]]), ], + ncol(dat), + li[[3]][ove[i]]/li[[2]][ove[i]] + 1, + k, + dist, + p) + # add original rare examples and synthetic generated examples + newdata <- rbind(newdata, newExs, + dat[which(dat[,ncol(dat)] == li[[1]][ove[i]]),]) + } } } diff --git a/R/smoteRegress.R b/R/smoteRegress.R index a51d27d..d0b70d0 100644 --- a/R/smoteRegress.R +++ b/R/smoteRegress.R @@ -143,12 +143,26 @@ SmoteRegress <- function(form, dat, rel = "auto", thr.rel = 0.5, if (C.perc[[i]] == 1) { newdata <- rbind(newdata, dat[names(obs.ind[[i]]), ]) } else if (C.perc[[i]] > 1) { - newExs <- Smote.exsRegress(dat[names(obs.ind[[i]]), ], - ncol(dat), - C.perc[[i]], - k, - dist, - p) + if (length(obs.ind[[i]])<=k && length(obs.ind[[i]])>1) { + warning("Unable to use the number of neighbors specified + because the bump has fewer examples. Using ", + length(obs.ind[[i]])-1, " as the value of k.", + call.=FALSE) + newExs <- Smote.exsRegress(dat[names(obs.ind[[i]]), ], + ncol(dat), C.perc[[i]], + length(obs.ind[[i]])-1, dist, p) + + } else if (length(obs.ind[[i]]) == 1) { + warning("Unable to use the number of neighbors specified + because the bump has only one example. Introducing + replicas in this bump!", + call.=FALSE) + newExs <- dat[rep(names(obs.ind[[i]]), C.perc[[i]]*nrow(dat)),] + } else { + newExs <- Smote.exsRegress(dat[names(obs.ind[[i]]), ], + ncol(dat), C.perc[[i]], + k, dist, p) + } # add original rare examples and synthetic generated examples newdata <- rbind(newdata, newExs, dat[names(obs.ind[[i]]), ]) } else if (C.perc[[i]] < 1) { @@ -187,6 +201,17 @@ Smote.exsRegress <- function(dat, tgt, N, k, dist, p) # The result of the function is a (N-1)*nrow(dat) set of generate # examples with rare values on the target { + # check for constant features and remove them, if any + # add the constant value of those features in the returned synthetic examples + + ConstFeat <- which(apply(dat, 2, function(col){length(unique(col)) == 1})) + + if(length(ConstFeat)){ + badds <- dat + ConstRes <- dat[1,ConstFeat] + dat <- dat[,apply(dat, 2, function(col) { length(unique(col)) > 1 })] + tgt <- ncol(dat) + } nomatr <- c() T <- matrix(nrow = dim(dat)[1], ncol = dim(dat)[2]) @@ -277,9 +302,9 @@ Smote.exsRegress <- function(dat, tgt, N, k, dist, p) d2 <- abs(T[kNNs[i, neig], x] - newM[nexs * nT + count, x])/ranges[x] } if (length(nomatr)) { - d1 <- d1 + sum(T[i,nomatr] != newM[(i - 1) * nexs + n, nomatr]) + d1 <- d1 + sum(T[i,nomatr] != newM[nexs *nT + count, nomatr]) d2 <- d2 + - sum(T[kNNs[i, neig], nomatr] != newM[(i - 1) * nexs + n, nomatr]) + sum(T[kNNs[i, neig], nomatr] != newM[nexs * nT + count, nomatr]) } # (d2+d1-d1 = d2 and d2+d1-d2 = d1) the more distant the less weight if (d1 == d2) { @@ -294,14 +319,24 @@ Smote.exsRegress <- function(dat, tgt, N, k, dist, p) } newCases <- data.frame(newM) - for (a in nomatr) { newCases[, a] <- factor(newCases[, a], levels = 1:nlevels(dat[, a]), labels = levels(dat[, a])) } - colnames(newCases) <- colnames(dat) + if(length(ConstFeat)){ # add constant features that were removed in the beginning + + newCases <- cbind(newCases, + as.data.frame(lapply(ConstRes, + function(x){rep(x, nrow(newCases))}))) + colnames(newCases) <- c(colnames(dat), names(ConstFeat)) + newCases <- newCases[colnames(badds)] + + } else { + colnames(newCases) <- colnames(dat) + } + newCases } diff --git a/man/Distances.Rd b/man/Distances.Rd new file mode 100644 index 0000000..5d37d59 --- /dev/null +++ b/man/Distances.Rd @@ -0,0 +1,69 @@ +\name{distances} +\alias{distances} + +\title{ +Distance matrix between all data set examples according to a selected distance metric. +} +\description{ +This function computes the distances between all examples in a data set using a selected distance metric. The metrics available are suitable for data sets with numeric and/or nominal features and include, among others: Euclidean, Manhattan, HEOM and HVDM. +} +\usage{ +distances(tgt, dat, dist, p=2) +} + +\arguments{ + \item{tgt}{ + The column of the problem target variable. + } + \item{dat}{A data frame containing the problem data. + } + \item{dist}{A character string specifying the distance function to use in the nearest neighbours evaluation. + } + \item{p}{An optional parameter that is only required if the distance function selected in parameter \code{dist} is "p-norm". + } +} + +\value{ +The function returns a matrix with the distances computed between each pair of examples in the data set. +} + + +\details{Several distance function are implemented in UBL package. The goal of having such a diversity of distance functions is to provide the users more flexibility regarding the distance used and also to provide distance fucntions that are able to deal with nominal and numeric features. The options available for the distance functions are as follows: + \describe{ + \item{data with only numeric features:}{ "Manhattan", "Euclidean", "Canberra", "Chebyshev", "p-norm";} + \item{data with only nominal features:}{ "Overlap";} + \item{data with both nominal and numeric features:}{ "HEOM", "HVDM".} + } + + When the "p-norm" is selected for the \code{dist} parameter, it is also necessary to define the value of parameter \code{p}. The value of parameter \code{p} sets which "p-norm" will be used. For instance, if \code{p} is set to 1, the "1-norm" (or Manhattan distance) is used, and if \code{p} is set to 2, the "2-norm" (or Euclidean distance) is applied. + For more details regarding the distance functions implemented in UBL package please see the package vignettes. +} + +\references{Wilson, D.R. and Martinez, T.R. (1997). \emph{Improved heterogeneous distance functions.} Journal of artificial intelligence research, pp.1-34. +} +\seealso{ +\code{\link{neighbours}} +} + +\author{ Paula Branco \email{paobranco@gmail.com}, Rita Ribeiro + \email{rpribeiro@dcc.fc.up.pt} and Luis Torgo \email{ltorgo@dcc.fc.up.pt} +} + +\examples{ +\dontrun{ +data(ImbC) +# determine the distances between each example in ImbC data set +# using the "HVDM" distance function. +dist1 <- distances(3, ImbC, "HVDM") + +# now using the "HEOM" distance function +dist2 <- distances(3, ImbC, "HEOM") + +# check the differences +head(dist1) +head(dist2) +} +} + +\keyword{distances evaluation} + diff --git a/man/UtilOptimClassif.Rd b/man/UtilOptimClassif.Rd index baaaab7..9bd39c6 100644 --- a/man/UtilOptimClassif.Rd +++ b/man/UtilOptimClassif.Rd @@ -120,5 +120,5 @@ table(test$Class,resUtil) } -\keyword{utility optimization} +\keyword{utility optimization, utility-based classification} diff --git a/man/neighbours.Rd b/man/neighbours.Rd index 720f855..d4cafcd 100644 --- a/man/neighbours.Rd +++ b/man/neighbours.Rd @@ -41,11 +41,17 @@ The function returns a matrix with the indexes of the k nearest neighbours for e \references{Wilson, D.R. and Martinez, T.R. (1997). \emph{Improved heterogeneous distance functions.} Journal of artificial intelligence research, pp.1-34. } + +\seealso{ +\code{\link{distances}} +} + \author{ Paula Branco \email{paobranco@gmail.com}, Rita Ribeiro \email{rpribeiro@dcc.fc.up.pt} and Luis Torgo \email{ltorgo@dcc.fc.up.pt} } \examples{ +\dontrun{ data(ImbC) # determine the 2 nearest neighbours of each example in ImbC data set # using the "HVDM" distance function. @@ -58,5 +64,6 @@ neig2 <- neighbours(3, ImbC, "HEOM", k=2) head(neig1) head(neig2) } +} \keyword{neighbours evaluation} diff --git a/src/UBL_init.c b/src/UBL_init.c new file mode 100644 index 0000000..fcb5b47 --- /dev/null +++ b/src/UBL_init.c @@ -0,0 +1,23 @@ +#include +#include // for NULL +#include + +/* FIXME: + Check these declarations against the C/Fortran source code. +*/ + +/* .Fortran calls */ +extern void F77_NAME(f_neighbours)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *); +extern void F77_NAME(rtophi)(void *, void *, void *, void *, void *, void *, void *, void *, void *); + +static const R_FortranMethodDef FortranEntries[] = { + {"f_neighbours", (DL_FUNC) &F77_NAME(f_neighbours), 12}, + {"rtophi", (DL_FUNC) &F77_NAME(rtophi), 9}, + {NULL, NULL, 0} +}; + +void R_init_UBL(DllInfo *dll) +{ + R_registerRoutines(dll, NULL, NULL, FortranEntries, NULL); + R_useDynamicSymbols(dll, FALSE); +} \ No newline at end of file diff --git a/src/neighbours.f90 b/src/neighbours.f90 index 427375f..77152dc 100644 --- a/src/neighbours.f90 +++ b/src/neighbours.f90 @@ -4,7 +4,7 @@ ! !============================================================ -subroutine neighbours(tgtData, numData, nomData, p, k, n, nnum,& +subroutine F_neighbours(tgtData, numData, nomData, p, k, n, nnum,& nnom, Cl, distm, numD, res) !---------------------------------------------------------------- ! Subroutine neighbours is used for obtaining the nearest @@ -271,9 +271,17 @@ double precision function HEOMnum(a,b,d,ranges) implicit none integer(kind=4), intent(in) :: d real (kind=8), intent(in) :: a(d), b(d), ranges(d) - - - HEOMnum = sum((abs(a-b)/ranges)**2) + integer(kind=4) :: i + ! Epsilon value + real (kind=8), PARAMETER :: eps = 1d-30 + + HEOMnum =0.0d0 + do i=1, d + ! HEOMnum = sum((abs(a-b)/ranges)**2) + if (ranges(i) > eps) then + HEOMnum = HEOMnum +(abs(a(i)-b(i))/ranges(i))**2 + end if + end do end function HEOMnum @@ -347,8 +355,15 @@ double precision function HVDMnum(numa, numb, dimnum, sd) integer (kind=4) :: i - - HVDMnum = sum((abs(numa-numb)/(4*sd))**2) + ! Epsilon value + real (kind=8), PARAMETER :: eps = 1d-30 + + HVDMnum = 0.0d0 + do i=1, dimnum + if (sd(i) > eps) then + HVDMnum = HVDMnum + ((abs(numa(i)-numb(i))/(4*sd(i)))**2) + end if + end do end function HVDMnum @@ -396,4 +411,4 @@ double precision function HVDMnom(nomdata, dimnom, tgtData, n, i, j, Cl) end function HVDMnom -end subroutine neighbours +end subroutine F_neighbours