From 2c6cb7f5f5b7ed94a6315a96270f672ecd26ef7d Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Fri, 1 Nov 2019 14:14:04 +1300 Subject: [PATCH 01/12] add Scitype method to boost performance for arrays add docstring and update manual update manual update manual --- docs/src/index.md | 10 +++++ src/ScientificTypes.jl | 76 ++++++++++++++++++++++++++++++++++---- src/conventions/mlj/mlj.jl | 17 +++------ test/runtests.jl | 1 - 4 files changed, 84 insertions(+), 20 deletions(-) diff --git a/docs/src/index.md b/docs/src/index.md index 5b3c496..d8a6fdd 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -182,6 +182,16 @@ Similarly, the scitype of an `AbstractArray` is `AbstractArray{U}` where `U` is scitype([1.3, 4.5, missing]) ``` +*Performance note:* Computing type unions over large arrays is +expensive and, depending on the convention's implementation and the +array eltype, computing the scitype can be slow. (In the *mlj* +convention this is mitigated with the help of the +`ScientificTypes.Scitype` method, of which other conventions could +make use. Do `?ScientificTypes.Scitype` for details.) An eltype `Any` +will always be slow and you may want to consider replacing an array +`A` with `broadcast(identity, A)` to collapse the eltype and speed up +the computation. + Provided the [Tables.jl](https://github.com/JuliaData/Tables.jl) package is loaded, any table implementing the Tables interface has a scitype encoding the scitypes of its columns: ```@example 5 diff --git a/src/ScientificTypes.jl b/src/ScientificTypes.jl index cf0522a..0349dcf 100644 --- a/src/ScientificTypes.jl +++ b/src/ScientificTypes.jl @@ -133,16 +133,78 @@ See also `scitype`. scitype_union(A) = reduce((a,b)->Union{a,b}, (scitype(el) for el in A)) -# ## SCITYPES OF TUPLES AND ARRAYS +# ## SCITYPES OF TUPLES scitype(t::Tuple, ::Val) = Tuple{scitype.(t)...} -# The following fallback can be quite slow. 
Individual conventions -# will usually be able to find more perfomant overloadings of this -# method: -scitype(A::B, ::Val) where {T,N,B<:AbstractArray{T,N}} = + +# ## SCITYPES OF ARRAYS + +""" + ScientificTypes.Scitype(::Type, C::Val) + +Method for implementers of a convention to enable speed-up of scitype +evaluations for large arrays. + +In general, one cannot infer the scitype of an object of type +`AbstractArray{T, N}` from the machine type alone. For example, this +never holds in the *mlj* convention for a categorical array, or in the +following examples: `X=Any[1, 2, 3]` and `X=Union{Missing,Int64}[1, 2, +3]`. + +Nevertheless, for some *restricted* machine types `U`, the statement +`typeof(X) <: AbstractArray{T, N}` for some `T<:U` already allows one to +deduce that `scitype(X) = AbstractArray{S,N}`, where `S` is determined +by `U` alone. This is the case in the *mlj* convention, for example, +if `U = Integer`, in which case `S = Count`. If one explicitly declares + + ScientificTypes.Scitype(::Type{<:U}, ::Val{:convention}) = S + +in such cases, then ScientificTypes ensures a considerable speed-up in +the computation of `scitype(X)`. There is also a partial speed-up for +the case that `T <: Union{U, Missing}`. + +For example, in *mlj* one has `Scitype(::Type{<:Integer}) = Count`. 
+ +""" +Scitype(::Type, C::Val) = nothing +Scitype(::Type{Any}, C::Val) = nothing # b/s `Any` isa `Union{<:Any, Missing}` + +# For all such `T` we can also get almost the same speed-up in the case that +# `T` is replaced by `Union{T, Missing}`, which we detect by wrapping +# the answer: + +Scitype(MT::Type{Union{T, Missing}}, C::Val) where T = Val(Scitype(T, C)) + +# For example, in *mlj* convention, Scitype(::Integer) = Count + +const Arr{T,N} = AbstractArray{T,N} + +# the dispatcher: +scitype(A::Arr{T}, C) where T = scitype(A, C, Scitype(T, C)) + +# the slow fallback: +scitype(A::Arr{<:Any,N}, ::Val, ::Nothing) where N = AbstractArray{scitype_union(A),N} +# the speed-up: +scitype(::Arr{<:Any,N}, ::Val, S) where N = Arr{S,N} + +# partial speed-up for missing types, because broadcast is faster than +# computing scitype_union: +function scitype(A::Arr{<:Any,N}, C::Val, ::Val{S}) where {N,S} + if S == nothing + return scitype(A, C, S) + else + Atight = broadcast(identity, A) + if typeof(A) == typeof(Atight) + return Arr{Union{S,Missing},N} + else + return Arr{S,N} + end + end +end + # ## STUB FOR COERCE METHOD @@ -200,7 +262,7 @@ schema(X, ::Val{:other}) = ## ACTIVATE DEFAULT CONVENTION -# and include code not requring optional dependencies: +# and include code not requiring optional dependencies: mlj() include("conventions/mlj/mlj.jl") @@ -218,7 +280,7 @@ function __init__() @require(Tables="bd369af6-aec1-5ad0-b16a-f7cc5008161c", (include("tables.jl"); include("autotype.jl"))) - # :mlj conventions requiring external packages + # external packages for the :mlj convention: @require(CategoricalArrays="324d7699-5711-5eae-9e2f-1d82baa6b597", include("conventions/mlj/finite.jl")) @require(ColorTypes="3da002f7-5984-5a60-b8a6-cbb66c0b333f", diff --git a/src/conventions/mlj/mlj.jl b/src/conventions/mlj/mlj.jl index 8b48117..97e44ac 100644 --- a/src/conventions/mlj/mlj.jl +++ b/src/conventions/mlj/mlj.jl @@ -6,18 +6,11 @@ _coerce_missing_warn(T) = "Coerced to 
Union{Missing,$T} instead. " -## PERFORMANT SCITYPES FOR ARRAYS - -const A{T,N} = AbstractArray{T,N} - -scitype(::B, ::Val{:mlj}) where {N,B<:A{<:AbstractFloat,N}} = - A{Continuous,N} -scitype(::B, ::Val{:mlj}) where {N,B<:A{Union{<:AbstractFloat,Missing},N}} = - A{Union{Continuous,Missing},N} -scitype(::B, ::Val{:mlj}) where {N,B<:A{<:Integer,N}} = - A{Count,N} -scitype(::B, ::Val{:mlj}) where {N,B<:A{Union{<:Integer,Missing},N}} = - A{Union{Count,Missing},N} +# ## IMPLEMENT PERFORMANCE BOOSTING FOR ARRAYS + +Scitype(::Type{<:Integer}, ::Val{:mlj}) = Count +Scitype(::Type{<:AbstractFloat}, ::Val{:mlj}) = Continuous + ## COERCE VECTOR TO CONTINUOUS diff --git a/test/runtests.jl b/test/runtests.jl index 1f26920..be5fdd9 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -18,7 +18,6 @@ include("basic_tests.jl") @test scitype((4, 4.5, c, u, "X")) == Tuple{Count,Continuous,Multiclass{2}, OrderedFactor{2},Unknown} - end A = Any[2 4.5; From 9f8333e8d76fb1647df9d30e6bbceb7f25a4e9ad Mon Sep 17 00:00:00 2001 From: "Anthony D. 
Blaom" Date: Fri, 1 Nov 2019 16:09:33 +1300 Subject: [PATCH 02/12] resolve #33 and add test --- src/conventions/mlj/finite.jl | 13 +++++++++---- test/runtests.jl | 4 ++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/conventions/mlj/finite.jl b/src/conventions/mlj/finite.jl index 002452f..78413a3 100644 --- a/src/conventions/mlj/finite.jl +++ b/src/conventions/mlj/finite.jl @@ -21,15 +21,20 @@ end ## PERFORMANT SCITYPES FOR ARRAYS -function scitype(A::B, ::Val{:mlj}) where {T,N,B<:CategoricalArray{T,N}} +const CatArr{T,N,V} = CategoricalArray{T,N,<:Any,V} + +function scitype(A::CatArr{T,N,V}, ::Val{:mlj}) where {T,N,V} nlevels = length(levels(A)) if isordered(A) S = OrderedFactor{nlevels} else S = Multiclass{nlevels} end - if T isa Union && Missing <: T - S = Union{S,Missing} + if T != V # missing values + Atight = broadcast(identity, A) + if !(Atight isa CatArr{V,N,V}) # missings remain + S = Union{S,Missing} + end end - return AbstractArray{S, N} + return AbstractArray{S,N} end diff --git a/test/runtests.jl b/test/runtests.jl index be5fdd9..ce94d68 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -63,6 +63,8 @@ end AbstractVector{Multiclass{4}} @test scitype(categorical([1, missing, 3])) == AbstractVector{Union{Multiclass{2},Missing}} + @test scitype(categorical([1, missing])[1:1]) == + AbstractVector{Multiclass{1}} @test scitype(categorical(1:4, ordered=true)) == AbstractVector{OrderedFactor{4}} @@ -70,6 +72,8 @@ end AbstractVector{OrderedFactor{4}} @test scitype(categorical([1, missing, 3], ordered=true)) == AbstractVector{Union{OrderedFactor{2},Missing}} + @test scitype(categorical([1, missing], ordered=true)[1:1]) == + AbstractVector{OrderedFactor{1}} end From b894e8a85e5a4c4d622f71b6d410d0d5cf713120 Mon Sep 17 00:00:00 2001 From: "Anthony D. 
Blaom" Date: Fri, 1 Nov 2019 17:01:24 +1300 Subject: [PATCH 03/12] update readme --- README.md | 43 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index de24fc0..9e68c10 100644 --- a/README.md +++ b/README.md @@ -4,25 +4,49 @@ | :-----------: | :------: | :-----------: | | [![Build Status](https://travis-ci.org/alan-turing-institute/ScientificTypes.jl.svg?branch=master)](https://travis-ci.org/alan-turing-institute/ScientificTypes.jl) | [![codecov.io](http://codecov.io/github/alan-turing-institute/ScientificTypes.jl/coverage.svg?branch=master)](http://codecov.io/github/alan-turing-institute/ScientificTypes.jl?branch=master) | [![](https://img.shields.io/badge/docs-stable-blue.svg)](https://alan-turing-institute.github.io/ScientificTypes.jl/dev) | -A light-weight Julia interface for implementing conventions about the scientific interpretation of data, and for performing type coercions enforcing those conventions. +A light-weight Julia interface for implementing conventions about the +scientific interpretation of data, and for performing type coercions +enforcing those conventions. The package makes the distinction between between **machine type** and **scientific type**: * the _machine type_ is a Julia type the data is currently encoded as (for instance: `Float64`) -* the _scientific type_ is a type defined by this package which encapsulates how the data should be _interpreted_ in the rest of the code (for instance: `Continuous` or `Multiclass`) +* the _scientific type_ is a type defined by this package which + encapsulates how the data should be _interpreted_ (for instance: + `Continuous` or `Multiclass`) -As a motivating example, the data might contain a column corresponding to a _number of transactions_, the machine type in that case could be an `Int` whereas the scientific type would be a `Count`. 
+The distinction is useful because the same machine type is often used +to represent data with *differing* scientific interpretations - `Int` +is used for product numbers (a factor) but also for a person's weight +(a continuous variable) - while the same scientific +type is frequently represented by *different* machine types - both +`Int` and `Float64` are used to represent weights, for example. -The usefulness of this machinery becomes evident when the machine type does not directly connect with a scientific type; taking the previous example, the data could have been encoded as a `Float64` whereas the meaning should still be a `Count`. ## Very quick start -(For more information and examples please refer to [the doc](https://alan-turing-institute.github.io/ScientificTypes.jl/dev)) +For more information and examples please refer to [the +manual](https://alan-turing-institute.github.io/ScientificTypes.jl/dev). -This is a very quick start presenting two key functions exported by ScientificTypes: +ScientificTypes.jl has three components: -* `schema(X)` which gives an extended schema of the table `X` with the column scientific types implied by the current scitype convention, -* `coerce(X, ...)` which allows to overwrite scientific types for specific columns to indicate their appropriate scientific interpretation. +- An *interface*, for articulating a convention about the scientific + interpretation of data. This consists of a definition of a scientific + type hierarchy, and a single function `scitype` with scientific + types as values. Someone implementing a convention must add methods + to this function, while the general user just applies it to data, as + in `scitype(4.5)` (returning `Continuous` in the *mlj* convention). + +- A built-in convention, called *mlj*, active by default. 
+ +- Convenience methods for working with scientific types, the most commonly used being: + + - `schema(X)`, which gives an extended schema of any table `X`, + including the column scientific types implied by the active + convention. + + - `coerce(X, ...)`, which coerces the machine types of `X` + to reflect a desired scientific type. ```julia using ScientificTypes, DataFrames @@ -49,7 +73,8 @@ will print :e -- Union{Missing, Unknown} ``` -this uses the default "MLJ convention" to attribute a scitype (cf. [docs](https://alan-turing-institute.github.io/ScientificTypes.jl/dev/#The-MLJ-convention-1)). +this uses the default *mlj* convention to attribute a scitype +(cf. [docs](https://alan-turing-institute.github.io/ScientificTypes.jl/dev/#The-MLJ-convention-1)). Now you could want to specify that `b` is actually a `Count`, and that `d` and `e` are `Multiclass`; this is done with the `coerce` function: From 91a65735d2f63f64bdd668b9d82455b41e02902a Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Fri, 1 Nov 2019 17:08:52 +1300 Subject: [PATCH 04/12] make Tables, CategoricalArrays and ColorTypes into [deps] --- Project.toml | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/Project.toml b/Project.toml index 95cf97e..81119d9 100644 --- a/Project.toml +++ b/Project.toml @@ -4,20 +4,23 @@ authors = ["Anthony D. 
Blaom "] version = "0.2.3" [deps] +CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" +ColorTypes = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" Requires = "ae029012-a4dd-5104-9daa-d747884805df" +Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] +CategoricalArrays = "<0.5.3, 0.7" +ColorTypes = "0.8" Requires = "0.5.2" +Tables = "<0.1.19, 0.2" julia = "1" [extras] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" -CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" -ColorTypes = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["AbstractTrees", "CategoricalArrays", "ColorTypes", "Random", "Tables", "Test"] +test = ["AbstractTrees", "Random", "Test"] From 7292c62b4558fd71f6a43e537b93258a557ed0f9 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Fri, 1 Nov 2019 17:11:41 +1300 Subject: [PATCH 05/12] update manual --- docs/src/index.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/src/index.md b/docs/src/index.md index d8a6fdd..989cd61 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -24,15 +24,17 @@ ScientificTypes.tree() - A single method `scitype` for articulating a convention about what scientific type each Julia object can represent. For example, one might declare `scitype(::AbstractFloat) = Continuous`. -- A default convention called *mlj*, based on optional dependencies `CategoricalArrays`, `ColorTypes`, and `Tables`, which includes a convenience method `coerce` for performing scientific type coercion on `AbstractVectors` and columns of tabular data (any table implementing the [Tables.jl](https://github.com/JuliaData/Tables.jl) interface). 
+- A default convention called *mlj*, based on dependencies + `CategoricalArrays`, `ColorTypes`, and `Tables`, which includes a + convenience method `coerce` for performing scientific type coercion + on `AbstractVectors` and columns of tabular data (any table + implementing the [Tables.jl](https://github.com/JuliaData/Tables.jl) + interface). - A `schema` method for tabular data, based on the optional Tables dependency, for inspecting the machine and scientific types of tabular data, in addition to column names and number of rows. -### Dependencies -The only dependencies are [`Requires.jl`](https://github.com/MikeInnes/Requires.jl) and `InteractiveUtils` (from stdlib). - -## Quick start +## Getting started The package is registered and can be installed via the package manager with `add ScientificTypes`. From b95962c122b4f80fe92035349d892c69cc0d99ec Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Fri, 1 Nov 2019 17:16:38 +1300 Subject: [PATCH 06/12] switch to coverall for code coverage --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 5c773cd..6dd937f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -18,7 +18,7 @@ matrix: - julia: nightly after_success: - - julia -e 'using Pkg; pkg"add Coverage"; using Coverage; Codecov.submit(Codecov.process_folder())' + - julia -e 'import Pkg; Pkg.add("Coverage"); using Coverage; Coveralls.submit(process_folder())' jobs: include: From f54d573b48ebd9735d9db7b89c0874fe9546f7a1 Mon Sep 17 00:00:00 2001 From: Thibaut Lienart Date: Sat, 2 Nov 2019 14:10:44 +0000 Subject: [PATCH 07/12] perf branch init --- Project.toml | 12 ++++-------- docs/src/index.md | 21 ++++++++++++++++----- src/ScientificTypes.jl | 34 +++++++++------------------------- src/conventions/mlj/finite.jl | 2 -- src/conventions/mlj/images.jl | 2 -- src/tables.jl | 2 -- src/tree.jl | 7 ------- 7 files changed, 29 insertions(+), 51 deletions(-) delete mode 100644 src/tree.jl diff --git a/Project.toml 
b/Project.toml index 81119d9..8c014ba 100644 --- a/Project.toml +++ b/Project.toml @@ -6,21 +6,17 @@ version = "0.2.3" [deps] CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" ColorTypes = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" -InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" -Requires = "ae029012-a4dd-5104-9daa-d747884805df" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] -CategoricalArrays = "<0.5.3, 0.7" -ColorTypes = "0.8" -Requires = "0.5.2" -Tables = "<0.1.19, 0.2" +CategoricalArrays = "^0.7" +ColorTypes = "^0.8" +Tables = "^0.2" julia = "1" [extras] -AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["AbstractTrees", "Random", "Test"] +test = ["Random", "Test"] diff --git a/docs/src/index.md b/docs/src/index.md index 989cd61..084de4a 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -17,9 +17,20 @@ The package `ScientificTypes` provides: - A hierarchy of new Julia types representing scientific data types for use in method dispatch (eg, for trait values). Instances of the types play no role: -```@example 0 -using ScientificTypes, AbstractTrees -ScientificTypes.tree() +``` +Found +├─ Known +│ ├─ Finite +│ │ ├─ Multiclass +│ │ └─ OrderedFactor +│ ├─ Infinite +│ │ ├─ Continuous +│ │ └─ Count +│ ├─ Image +│ │ ├─ ColorImage +│ │ └─ GrayImage +│ └─ Table +└─ Unknown ``` - A single method `scitype` for articulating a convention about what scientific type each Julia object can represent. For example, one might declare `scitype(::AbstractFloat) = Continuous`. @@ -300,7 +311,7 @@ X = (a = rand("abc", n), # 3 values, not number --> Multiclass autotype(X, only_changes=true) ``` -For example, we could first apply the `:discrete_to_continuous` rule, +For example, we could first apply the `:discrete_to_continuous` rule, followed by `:few_to_finite` rule. 
The first rule will apply to `b` and `e` but the subsequent application of the second rule will mean we will get the same result apart for `e` (which will be `Continuous`) @@ -310,4 +321,4 @@ autotype(X, only_changes=true, rules=(:discrete_to_continuous, :few_to_finite)) ``` One should check and possibly modify the returned dictionary -before passing to `coerce`. +before passing to `coerce`. diff --git a/src/ScientificTypes.jl b/src/ScientificTypes.jl index 0349dcf..0535609 100644 --- a/src/ScientificTypes.jl +++ b/src/ScientificTypes.jl @@ -2,11 +2,14 @@ module ScientificTypes export Scientific, Found, Unknown, Finite, Infinite export OrderedFactor, Multiclass, Count, Continuous -export Binary, Table, ColorImage, GrayImage +export Binary, Table +export ColorImage, GrayImage export scitype, scitype_union, scitypes, coerce, schema export mlj -using Requires, InteractiveUtils +using Tables, CategoricalArrays, ColorTypes +# using Requires +# using InteractiveUtils # ## FOR DEFINING SCITYPES ON OBJECTS DETECTED USING TRAITS @@ -118,7 +121,6 @@ scitype(X, C, ::Val{:other}) = Unknown scitype(::Missing) = Missing - # ## CONVENIENCE METHOD FOR UNIONS OVER ELEMENTS """ @@ -205,7 +207,6 @@ function scitype(A::Arr{<:Any,N}, C::Val, ::Val{S}) where {N,S} end end - # ## STUB FOR COERCE METHOD function coerce end @@ -259,6 +260,8 @@ schema(X, ::Val{:other}) = "an object with trait `:other`\n"* "Perhaps you meant to import Tables first?")) +include("tables.jl") +include("autotype.jl") ## ACTIVATE DEFAULT CONVENTION @@ -266,26 +269,7 @@ schema(X, ::Val{:other}) = mlj() include("conventions/mlj/mlj.jl") - - -## FOR LOADING OPTIONAL DEPENDENCIES - -function __init__() - - # for printing out the type tree: - @require(AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c", - include("tree.jl")) - - # the scitype and schema of tabular data: - @require(Tables="bd369af6-aec1-5ad0-b16a-f7cc5008161c", - (include("tables.jl"); include("autotype.jl"))) - - # external packages for the :mlj 
convention: - @require(CategoricalArrays="324d7699-5711-5eae-9e2f-1d82baa6b597", - include("conventions/mlj/finite.jl")) - @require(ColorTypes="3da002f7-5984-5a60-b8a6-cbb66c0b333f", - include("conventions/mlj/images.jl")) - -end +include("conventions/mlj/finite.jl") +include("conventions/mlj/images.jl") end # module diff --git a/src/conventions/mlj/finite.jl b/src/conventions/mlj/finite.jl index 78413a3..b46521b 100644 --- a/src/conventions/mlj/finite.jl +++ b/src/conventions/mlj/finite.jl @@ -1,5 +1,3 @@ -using .CategoricalArrays - nlevels(c::CategoricalValue) = length(levels(c.pool)) nlevels(c::CategoricalString) = length(levels(c.pool)) diff --git a/src/conventions/mlj/images.jl b/src/conventions/mlj/images.jl index b509d28..8279751 100644 --- a/src/conventions/mlj/images.jl +++ b/src/conventions/mlj/images.jl @@ -1,5 +1,3 @@ -using .ColorTypes - scitype(image::AbstractArray{<:Gray,2}, ::Val{:mlj}) = GrayImage{size(image)...} scitype(image::AbstractArray{<:AbstractRGB,2}, ::Val{:mlj}) = diff --git a/src/tables.jl b/src/tables.jl index 15085aa..ea9a3a7 100644 --- a/src/tables.jl +++ b/src/tables.jl @@ -1,5 +1,3 @@ -using .Tables - TRAIT_FUNCTION_GIVEN_NAME[:table] = Tables.istable function scitype(X, ::Val, ::Val{:table}) diff --git a/src/tree.jl b/src/tree.jl deleted file mode 100644 index d7e171e..0000000 --- a/src/tree.jl +++ /dev/null @@ -1,7 +0,0 @@ -using .AbstractTrees - -AbstractTrees.children(x::Type) = subtypes(x) - -function tree() - print_tree(Found) -end From c9e0dbf577438669e800550956e5d56888aa60e9 Mon Sep 17 00:00:00 2001 From: "Anthony D. 
Blaom" Date: Mon, 4 Nov 2019 10:01:02 +1300 Subject: [PATCH 08/12] Forgotten move of includes out of __init__ --- src/ScientificTypes.jl | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/ScientificTypes.jl b/src/ScientificTypes.jl index 0349dcf..f446aab 100644 --- a/src/ScientificTypes.jl +++ b/src/ScientificTypes.jl @@ -6,7 +6,7 @@ export Binary, Table, ColorImage, GrayImage export scitype, scitype_union, scitypes, coerce, schema export mlj -using Requires, InteractiveUtils +using Requires, InteractiveUtils, CategoricalArrays, ColorTypes, Tables # ## FOR DEFINING SCITYPES ON OBJECTS DETECTED USING TRAITS @@ -267,24 +267,29 @@ schema(X, ::Val{:other}) = mlj() include("conventions/mlj/mlj.jl") +include("tables.jl") +include("autotype.jl") +include("conventions/mlj/finite.jl") +include("conventions/mlj/images.jl") ## FOR LOADING OPTIONAL DEPENDENCIES + function __init__() # for printing out the type tree: @require(AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c", include("tree.jl")) - # the scitype and schema of tabular data: - @require(Tables="bd369af6-aec1-5ad0-b16a-f7cc5008161c", - (include("tables.jl"); include("autotype.jl"))) + # # the scitype and schema of tabular data: + # @require(Tables="bd369af6-aec1-5ad0-b16a-f7cc5008161c", + # (include("tables.jl"); include("autotype.jl"))) - # external packages for the :mlj convention: - @require(CategoricalArrays="324d7699-5711-5eae-9e2f-1d82baa6b597", - include("conventions/mlj/finite.jl")) - @require(ColorTypes="3da002f7-5984-5a60-b8a6-cbb66c0b333f", - include("conventions/mlj/images.jl")) + # # external packages for the :mlj convention: + # @require(CategoricalArrays="324d7699-5711-5eae-9e2f-1d82baa6b597", + # include("conventions/mlj/finite.jl")) + # @require(ColorTypes="3da002f7-5984-5a60-b8a6-cbb66c0b333f", + # include("conventions/mlj/images.jl")) end From 4911ec7cc7329023465b8596d5731265a9a4e82c Mon Sep 17 00:00:00 2001 From: Thibaut Lienart Date: Fri, 8 Nov 
2019 20:49:31 +0100 Subject: [PATCH 09/12] patch fix for coercion of vector with Any to Multiclass --- src/conventions/mlj/finite.jl | 16 +++++++++++++++- test/runtests.jl | 21 +++++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/src/conventions/mlj/finite.jl b/src/conventions/mlj/finite.jl index b46521b..e8e4d33 100644 --- a/src/conventions/mlj/finite.jl +++ b/src/conventions/mlj/finite.jl @@ -6,7 +6,21 @@ scitype(c::CategoricalValue, ::Val{:mlj}) = scitype(c::CategoricalString, ::Val{:mlj}) = c.pool.ordered ? OrderedFactor{nlevels(c)} : Multiclass{nlevels(c)} -function coerce(v, ::Type{T2}; verbosity=1) where T2 <: Union{Missing,Finite} +function coerce(v::AbstractVector, ::Type{T2}; + verbosity=1) where T2 <: Union{Missing,Finite} + # check if it's a Vector of Any or a CategoricalArray of Any + # in which case re-interpret as String to avoid errors with MLJBase.classes + if eltype(v) === Any || first(skipmissing(v)) isa CategoricalValue{Any,T} where T + if any(ismissing, v) + v_ = Vector{Union{Missing,String}}(undef, length(v)) + v_ .= string.(v) + v_[ismissing.(v)] .= missing + else + v_ = string.(v) + end + return categorical(v_, true, ordered=false) + end + su = scitype_union(v) if su >: Missing && !(T2 >: Missing) verbosity > 0 && _coerce_missing_warn(T2) diff --git a/test/runtests.jl b/test/runtests.jl index ce94d68..aca6a39 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -179,4 +179,25 @@ end @test all(unique(cw) .== [0.1, 0.2, 0.3]) end +@testset "Any->Multiclass (mlj)" begin + v1 = categorical(Any[1,2,1,2,1,missing,2]) + v2 = Any[collect("aksldjfalsdjkfslkjdfalksjdf")...] 
+ v1c = coerce(v1, Multiclass) + v2c = coerce(v2, Multiclass) + @test scitype_union(v1c) == Union{Missing,Multiclass{2}} + @test scitype_union(v2c) == Multiclass{7} + @test eltype(v1c) == Union{Missing, CategoricalString{UInt8}} + @test eltype(v2c) == CategoricalString{UInt8} + + # normal behaviour is unchanged + v1 = categorical([1,2,1,2,1,2,missing]) + v2 = collect("aksldjfalsdjkfslkjdfalksjdf") + v1c = coerce(v1, Multiclass) + v2c = coerce(v2, Multiclass) + @test scitype_union(v1c) == Union{Missing,Multiclass{2}} + @test scitype_union(v2c) == Multiclass{7} + @test eltype(v1c) == Union{Missing,CategoricalValue{Int64,UInt8}} + @test eltype(v2c) == CategoricalValue{Char,UInt8} +end + include("autotype.jl") From c5b9ccf93e3a3aee72bbc0c7b2dc8b55ed82db10 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Mon, 11 Nov 2019 11:26:52 +1300 Subject: [PATCH 10/12] tweaks for coerce(::CategoricalArray{Any},...) --- src/conventions/mlj/finite.jl | 53 ++++++++++++++++++++++++----------- test/runtests.jl | 35 +++++++++++++---------- 2 files changed, 57 insertions(+), 31 deletions(-) diff --git a/src/conventions/mlj/finite.jl b/src/conventions/mlj/finite.jl index e8e4d33..86dd5cb 100644 --- a/src/conventions/mlj/finite.jl +++ b/src/conventions/mlj/finite.jl @@ -6,21 +6,12 @@ scitype(c::CategoricalValue, ::Val{:mlj}) = scitype(c::CategoricalString, ::Val{:mlj}) = c.pool.ordered ? 
OrderedFactor{nlevels(c)} : Multiclass{nlevels(c)} -function coerce(v::AbstractVector, ::Type{T2}; - verbosity=1) where T2 <: Union{Missing,Finite} - # check if it's a Vector of Any or a CategoricalArray of Any - # in which case re-interpret as String to avoid errors with MLJBase.classes - if eltype(v) === Any || first(skipmissing(v)) isa CategoricalValue{Any,T} where T - if any(ismissing, v) - v_ = Vector{Union{Missing,String}}(undef, length(v)) - v_ .= string.(v) - v_[ismissing.(v)] .= missing - else - v_ = string.(v) - end - return categorical(v_, true, ordered=false) - end - +# for temporary hack below: +get_(x) = get(x) +get_(::Missing) = missing + +# v is already categorical here, but may need `ordering` changed +function _finalize_finite_coerce(v, verbosity, T2) su = scitype_union(v) if su >: Missing && !(T2 >: Missing) verbosity > 0 && _coerce_missing_warn(T2) @@ -28,9 +19,39 @@ function coerce(v::AbstractVector, ::Type{T2}; if su <: T2 return v end - return categorical(v, true, ordered=T2 <: Union{Missing,OrderedFactor}) + return categorical(v, true, ordered=T2<:Union{Missing,OrderedFactor}) end +# if v is not a CategoricalArray: +function coerce(v::AbstractArray, + ::Type{T2}; verbosity=1) where T2<:Union{Missing,Finite} + vtight = broadcast(identity, v) + vcat = categorical(vtight, true, ordered=T2<:Union{Missing,OrderedFactor}) + return _finalize_finite_coerce(vcat, verbosity, T2) +end + +# if v is a CategoricalArray except CategoricalArray{Any}: +coerce(v::CategoricalArray, + ::Type{T2}; verbosity=1) where T2<:Union{Missing,Finite} = + _finalize_finite_coerce(v, verbosity, T2) + +# if v is a CategoricalArray{Any} +function coerce(v::CategoricalArray{Any}, + ::Type{T2}; verbosity=1) where T2<:Union{Missing,Finite} + + # AFTER CategoricalArrays 0.7.2 IS RELEASED: + # return _finalize_finite_coerce(broadcast(identity, v), verbosity, T2) + + # TEMPORARY HACK: + levels_ = levels(v) + isordered_ = isordered(v) + vraw = broadcast(get_, v) + v_ = 
categorical(vraw, true, ordered=isordered_) + levels!(v_, levels_) + return _finalize_finite_coerce(v_, verbosity, T2) +end + + ## PERFORMANT SCITYPES FOR ARRAYS const CatArr{T,N,V} = CategoricalArray{T,N,<:Any,V} diff --git a/test/runtests.jl b/test/runtests.jl index aca6a39..ea2587c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -155,25 +155,28 @@ end coerce(Any[4, 7.0, missing], Count)) @test ismissing(y_coerced == [4, 7, missing]) @test scitype_union(y_coerced) === Union{Missing,Count} -# @test scitype_union(@test_logs((:warn, r"Missing values encountered"), -# coerce([:x, :y, missing], Multiclass))) === - @test scitype_union(coerce([:x, :y, missing], Multiclass)) === - Union{Missing, Multiclass{2}} - # @test scitype_union(@test_logs((:warn, r"Missing values encountered"), - # coerce([:x, :y, missing], OrderedFactor))) === - # Union{Missing, OrderedFactor{2}} - scitype_union(coerce([:x, :y, missing], OrderedFactor)) === + @test scitype_union(@test_logs((:warn, r"Missing values encountered"), + coerce([:x, :y, missing], Multiclass))) === + Union{Missing, Multiclass{2}} + @test scitype_union(@test_logs((:warn, r"Missing values encountered"), + coerce([:x, :y, missing], OrderedFactor))) === Union{Missing, OrderedFactor{2}} # non-missing Any vectors @test coerce(Any[4, 7], Continuous) == [4.0, 7.0] @test coerce(Any[4.0, 7.0], Continuous) == [4, 7] + # Finite conversions: + @test scitype_union(coerce([:x, :y], Finite)) === Multiclass{2} + @test scitype_union(@test_logs((:warn, r"Missing values encountered"), + coerce([:x, :y, missing], Finite))) === + Union{Missing, Multiclass{2}} end @testset "coerce R->OF (mlj)" begin v = [0.1, 0.2, 0.2, 0.3, missing, 0.1] w = [0.1, 0.2, 0.2, 0.3, 0.1] - cv = coerce(v, OrderedFactor) + @test_logs((:warn, r"Missing values encountered"), + global cv = coerce(v, OrderedFactor)) cw = coerce(w, OrderedFactor) @test all(skipmissing(unique(cv)) .== [0.1, 0.2, 0.3]) @test all(unique(cw) .== [0.1, 0.2, 0.3]) @@ -182,22 +185,24 @@ 
end @testset "Any->Multiclass (mlj)" begin v1 = categorical(Any[1,2,1,2,1,missing,2]) v2 = Any[collect("aksldjfalsdjkfslkjdfalksjdf")...] - v1c = coerce(v1, Multiclass) + @test_logs((:warn, r"Missing values"), + global v1c = coerce(v1, Multiclass)) v2c = coerce(v2, Multiclass) @test scitype_union(v1c) == Union{Missing,Multiclass{2}} @test scitype_union(v2c) == Multiclass{7} - @test eltype(v1c) == Union{Missing, CategoricalString{UInt8}} - @test eltype(v2c) == CategoricalString{UInt8} + @test eltype(v1c) <: Union{Missing, CategoricalValue{Int64}} + @test eltype(v2c) <: CategoricalValue{Char} # normal behaviour is unchanged v1 = categorical([1,2,1,2,1,2,missing]) v2 = collect("aksldjfalsdjkfslkjdfalksjdf") - v1c = coerce(v1, Multiclass) + @test_logs((:warn, r"Missing values"), + global v1c = coerce(v1, Multiclass)) v2c = coerce(v2, Multiclass) @test scitype_union(v1c) == Union{Missing,Multiclass{2}} @test scitype_union(v2c) == Multiclass{7} - @test eltype(v1c) == Union{Missing,CategoricalValue{Int64,UInt8}} - @test eltype(v2c) == CategoricalValue{Char,UInt8} + @test eltype(v1c) <: Union{Missing,CategoricalValue{Int64}} + @test eltype(v2c) <: CategoricalValue{Char} end include("autotype.jl") From 85c82bfb41db0ed4dc547e64b8f04a75241f6b65 Mon Sep 17 00:00:00 2001 From: "Anthony D. 
Blaom" Date: Mon, 11 Nov 2019 12:27:53 +1300 Subject: [PATCH 11/12] enable coercion of arrays to resolve #35 --- src/ScientificTypes.jl | 24 ++++++++++++++++ src/conventions/mlj/mlj.jl | 59 ++++++++++++++------------------------ test/runtests.jl | 10 +++++++ 3 files changed, 55 insertions(+), 38 deletions(-) diff --git a/src/ScientificTypes.jl b/src/ScientificTypes.jl index de6c17a..acf4726 100644 --- a/src/ScientificTypes.jl +++ b/src/ScientificTypes.jl @@ -204,8 +204,32 @@ function scitype(A::Arr{<:Any,N}, C::Val, ::Val{S}) where {N,S} end end + # ## STUB FOR COERCE METHOD +""" + coerce(A::AbstractArray, T; verbosity=1) + +Coerce the julia types of elements of `A` to ensure the returned array +has `T` or `Union{Missing,T}` as the union of its element scitypes, +according to the active convention. + +A warning is issued if missing values are encountered, unless +`verbosity` is `0` or less. + + julia> mlj() + julia> v = coerce([1, missing, 5], Continuous) + 3-element Array{Union{Missing, Float64},1}: + 1.0 + missing + 5.0 + + julia> scitype(v) + AbstractArray{Union{Missing,Continuous}, 1} + +See also [`scitype`](@ref), [`scitype_union`](@ref). + +""" function coerce end diff --git a/src/conventions/mlj/mlj.jl b/src/conventions/mlj/mlj.jl index db31fce..18e83f6 100644 --- a/src/conventions/mlj/mlj.jl +++ b/src/conventions/mlj/mlj.jl @@ -12,48 +12,28 @@ Scitype(::Type{<:Integer}, ::Val{:mlj}) = Count Scitype(::Type{<:AbstractFloat}, ::Val{:mlj}) = Continuous -## COERCE VECTOR TO CONTINUOUS +## COERCE ARRAY TO CONTINUOUS -""" - coerce(v::AbstractVector, T; verbosity=1) - -Coerce the julia types of elements of `v` to ensure the returned -vector has `T` or `Union{Missing,T}` as the union of its element -scitypes. - -A warning is issued if missing values are encountered, unless -`verbosity` is `0` or less. 
- - julia> v = coerce([1, missing, 5], Continuous) - 3-element Array{Union{Missing, Float64},1}: - 1.0 - missing - 5.0 - - julia> scitype(v) - AbstractArray{Union{Missing,Continuous}, 1} - -See also [`scitype`](@ref), [`scitype_union`](@ref). - -""" -function coerce(y::AbstractVector{<:Union{Missing,AbstractFloat}}, T::Type{<:Union{Missing,Continuous}}; +function coerce(y::AbstractArray{<:Union{Missing,AbstractFloat}}, + T::Type{<:Union{Missing,Continuous}}; verbosity=1) eltype(y) >: Missing && verbosity > 0 && _coerce_missing_warn(T) return y end -function coerce(y::AbstractVector{<:Union{Missing,Real}}, T::Type{<:Union{Missing,Continuous}}; verbosity=1) +function coerce(y::AbstractArray{<:Union{Missing,Real}}, + T::Type{<:Union{Missing,Continuous}}; verbosity=1) eltype(y) >: Missing && verbosity > 0 && _coerce_missing_warn(T) return float(y) end -# NOTE: case where the data may have been badly encoded and resulted in an Any[] vector -# a user should proceed with caution here in particular: -# - if at one point it encounters a type for which there is no AbstractFloat such -# as a String, it will error. -# - if at one point it encounters a Char it will **not** error but return a float -# corresponding to the Char (e.g. 65.0 for 'A') whence the warning -function coerce(y::AbstractVector, T::Type{<:Union{Missing,Continuous}}; verbosity=1) +# NOTE: case where the data may have been badly encoded and resulted +# in an Any[] array a user should proceed with caution here in +# particular: - if at one point it encounters a type for which there +# is no AbstractFloat such as a String, it will error. - if at one +# point it encounters a Char it will **not** error but return a float +# corresponding to the Char (e.g. 
65.0 for 'A') whence the warning +function coerce(y::AbstractArray, T::Type{<:Union{Missing,Continuous}}; verbosity=1) has_missings = findfirst(ismissing, y) !== nothing has_missings && verbosity > 0 && _coerce_missing_warn(T) has_chars = findfirst(e->isa(e,Char), y) !== nothing @@ -63,27 +43,30 @@ function coerce(y::AbstractVector, T::Type{<:Union{Missing,Continuous}}; verbosi end -## COERCE VECTOR TO COUNT +## COERCE ARRAY TO COUNT _int(::Missing) = missing _int(x::Integer) = x _int(x) = Int(x) # may throw InexactError # no-op case -function coerce(y::AbstractVector{<:Union{Missing,Integer}}, T::Type{<:Union{Missing,Count}}; verbosity=1) +function coerce(y::AbstractArray{<:Union{Missing,Integer}}, + T::Type{<:Union{Missing,Count}}; verbosity=1) eltype(y) >: Missing && verbosity > 0 && _coerce_missing_warn(T) return y end # NOTE: this will error if it encounters things like 1.5 or 1//2 (InexactError) -function coerce(y::AbstractVector{<:Union{Missing,Real}}, T::Type{<:Union{Missing,Count}}; verbosity=1) +function coerce(y::AbstractArray{<:Union{Missing,Real}}, + T::Type{<:Union{Missing,Count}}; verbosity=1) eltype(y) >: Missing && verbosity > 0 && _coerce_missing_warn(T) return _int.(y) end -# NOTE: case where the data may have been badly encoded and resulted in an Any[] vector -# a user should proceed with caution here (see comment earlier) -function coerce(y::AbstractVector, T::Type{<:Union{Missing,Count}}; verbosity=1) +# NOTE: case where the data may have been badly encoded and resulted +# in an Any[] array a user should proceed with caution here (see +# comment earlier) +function coerce(y::AbstractArray, T::Type{<:Union{Missing,Count}}; verbosity=1) has_missings = findfirst(ismissing, y) !== nothing has_missings && verbosity > 0 && _coerce_missing_warn(T) has_chars = findfirst(e->isa(e,Char), y) !== nothing diff --git a/test/runtests.jl b/test/runtests.jl index ea2587c..6967af6 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -172,6 +172,16 @@ end 
Union{Missing, Multiclass{2}} end +@testset "coercion works for arrays too" begin + A = rand(Int, 2, 3) + z = rand(Char, 2, 3) + y = Any[1.0 2; 3 4] + @test scitype_union(coerce(A, Continuous)) == Continuous + @test scitype_union(coerce(A, OrderedFactor)) <: OrderedFactor + @test scitype_union(coerce(z, Multiclass)) <: Multiclass + @test scitype_union(coerce(y, Count)) === Count +end + @testset "coerce R->OF (mlj)" begin v = [0.1, 0.2, 0.2, 0.3, missing, 0.1] w = [0.1, 0.2, 0.2, 0.3, 0.1] From ad2beb1da94a3560d33164602e16a273b2bc33dd Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Mon, 11 Nov 2019 12:36:28 +1300 Subject: [PATCH 12/12] bump to 0.2.4 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 8c014ba..84eedcc 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "ScientificTypes" uuid = "321657f4-b219-11e9-178b-2701a2544e81" authors = ["Anthony D. Blaom "] -version = "0.2.3" +version = "0.2.4" [deps] CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"