diff --git a/Project.toml b/Project.toml index 50d616d..03cafa8 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "ScientificTypes" uuid = "321657f4-b219-11e9-178b-2701a2544e81" authors = ["Anthony D. Blaom "] -version = "0.3.1" +version = "0.3.2" [deps] CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" diff --git a/README.md b/README.md index 627c05e..e72c503 100644 --- a/README.md +++ b/README.md @@ -35,18 +35,20 @@ ScientificTypes.jl has three components: type hierarchy, and a single function `scitype` with scientific types as values. Someone implementing a convention must add methods to this function, while the general user just applies it to data, as - in `scitype(4.5)` (returning `Continuous` in the *mlj* convention). + in `scitype(4.5)` (returning `Continuous` in the *MLJ* convention). -- A built-in convention, called *mlj*, active by default. +- A built-in convention, called *MLJ*, active by default. -- Convenience methods for working with scientific types, the most commonly used being: +- Convenience methods for working with scientific types, the most commonly used being - - `schema(X)`, which gives an extended schema of any table `X`, - including the column scientific types implied by the active - convention. -. - - `coerce(X, ...)`, which coerces the machine types of `X` - to reflect a desired scientific type. + - `schema(X)`, which gives an extended schema of any Tables.jl + compatible table `X`, including the column scientific types + implied by the active convention. + + - `coerce(X, ...)`, which coerces the machine types of `X` to + reflect a desired scientific type. 
+ +For example, ```julia using ScientificTypes, DataFrames @@ -58,40 +60,51 @@ X = DataFrame( e = ['M', 'F', missing, 'M', 'F'], ) sch = schema(X) # schema is overloaded in Scientifictypes -for (name, scitype) in zip(sch.names, sch.scitypes) - println(":$name -- $scitype") -end ``` will print ``` -:a -- Continuous -:b -- Union{Missing, Continuous} -:c -- Count -:d -- Count -:e -- Union{Missing, Unknown} +_.table = +┌─────────┬─────────────────────────┬────────────────────────────┐ +│ _.names │ _.types │ _.scitypes │ +├─────────┼─────────────────────────┼────────────────────────────┤ +│ a │ Float64 │ Continuous │ +│ b │ Union{Missing, Float64} │ Union{Missing, Continuous} │ +│ c │ Int64 │ Count │ +│ d │ Int64 │ Count │ +│ e │ Union{Missing, Char} │ Union{Missing, Unknown} │ +└─────────┴─────────────────────────┴────────────────────────────┘ +_.nrows = 5 ``` -this uses the default *mlj* convention to attribute a scitype -(cf. [docs](https://alan-turing-institute.github.io/ScientificTypes.jl/dev/#The-MLJ-convention-1)). +Here the default *MLJ* convention is being applied (cf. [docs](https://alan-turing-institute.github.io/ScientificTypes.jl/dev/#The-MLJ-convention-1)). 
Detail is obtained in the obvious way; for example: + +```julia +julia> sch.names +(:a, :b, :c, :d, :e) +``` Now you could want to specify that `b` is actually a `Count`, and that `d` and `e` are `Multiclass`; this is done with the `coerce` function: ```julia Xc = coerce(X, :b=>Count, :d=>Multiclass, :e=>Multiclass) -sch = schema(Xc) -for (name, scitype) in zip(sch.names, sch.scitypes) - println(":$name -- $scitype") -end +schema(Xc) ``` -will print +which prints ``` -:a -- Continuous -:b -- Union{Missing, Count} -:c -- Count -:d -- Multiclass{2} -:e -- Union{Missing, Multiclass{2}} +_.table = +┌─────────┬──────────────────────────────────────────────┬───────────────────────────────┐ +│ _.names │ _.types │ _.scitypes │ +├─────────┼──────────────────────────────────────────────┼───────────────────────────────┤ +│ a │ Float64 │ Continuous │ +│ b │ Union{Missing, Int64} │ Union{Missing, Count} │ +│ c │ Int64 │ Count │ +│ d │ CategoricalValue{Int64,UInt8} │ Multiclass{2} │ +│ e │ Union{Missing, CategoricalValue{Char,UInt8}} │ Union{Missing, Multiclass{2}} │ +└─────────┴──────────────────────────────────────────────┴───────────────────────────────┘ +_.nrows = 5 + ``` diff --git a/docs/src/index.md b/docs/src/index.md index cd41084..eb70492 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -35,7 +35,7 @@ Found - A single method `scitype` for articulating a convention about what scientific type each Julia object can represent. For example, one might declare `scitype(::AbstractFloat) = Continuous`. 
-- A default convention called *mlj*, based on dependencies +- A default convention called *MLJ*, based on dependencies `CategoricalArrays`, `ColorTypes`, and `Tables`, which includes a convenience method `coerce` for performing scientific type coercion on `AbstractVectors` and columns of tabular data (any table @@ -122,12 +122,24 @@ Finally there is a `coerce!` method that does in-place coercion provided the dat - Developers can define their own conventions using the code in `src/conventions/mlj/` as a template. The active convention is controlled by the value of `ScientificTypes.CONVENTION[1]`. +## Special note on binary data + +ScientificTypes does not define a separate "binary" scientific +type. Rather, when binary data has an intrinsic "true" class (for example +pass/fail in a product test), then it should be assigned an +`OrderedFactor{2}` scitype, while data with no such class (e.g., gender) +should be assigned a `Multiclass{2}` scitype. In the former case +we recommend that the "true" class come after "false" in the ordering +(corresponding to the usual assignment "false=0" and "true=1"). Of +course, `Finite{2}` covers both cases of binary data. + + ## Detailed usage examples ```@example 3 using ScientificTypes # activate a convention -mlj() # redundant as it's the default +ScientificTypes.set_convention(MLJ) # redundant as it's the default scitype((2.718, 42)) ``` @@ -203,12 +215,12 @@ scitype([1.3, 4.5, missing]) *Performance note:* Computing type unions over large arrays is expensive and, depending on the convention's implementation and the -array eltype, computing the scitype can be slow. (In the *mlj* +array eltype, computing the scitype can be slow. (In the *MLJ* convention this is mitigated with the help of the `ScientificTypes.Scitype` method, of which other conventions could make use. Do `?ScientificTypes.Scitype` for details.) 
An eltype `Any` will always be slow and you may want to consider replacing an array -`A` with `broadcast(idenity, A)` to collapse the eltype and speed up +`A` with `broadcast(identity, A)` to collapse the eltype and speed up the computation. Provided the [Tables.jl](https://github.com/JuliaData/Tables.jl) package is loaded, any table implementing the Tables interface has a scitype encoding the scitypes of its columns: @@ -246,7 +258,7 @@ Note that `Table(Continuous,Finite)` is a *type* union and not a `Table` *instan ## The MLJ convention -The table below summarizes the *mlj* convention for representing +The table below summarizes the *MLJ* convention for representing scientific types: Type `T` | `scitype(x)` for `x::T` | package required diff --git a/src/ScientificTypes.jl b/src/ScientificTypes.jl index ea583b4..6be9d12 100644 --- a/src/ScientificTypes.jl +++ b/src/ScientificTypes.jl @@ -6,7 +6,6 @@ export Binary, Table export ColorImage, GrayImage export scitype, scitype_union, elscitype, coerce, coerce!, schema export info -export mlj export autotype # re-export from CategoricalArrays: @@ -59,11 +58,14 @@ info(object) = info(object, Val(ScientificTypes.trait(object))) # ## CONVENTIONS -const CONVENTION=[:unspecified] +abstract type Convention end +struct MLJ <: Convention end + +const CONVENTION=[MLJ(),] convention() = CONVENTION[1] -function mlj() - CONVENTION[1] = :mlj +function set_convention(C) + CONVENTION[1] = C() return nothing end @@ -163,7 +165,6 @@ include("autotype.jl") # and include code not requiring optional dependencies: -mlj() include("conventions/mlj/mlj.jl") include("conventions/mlj/finite.jl") include("conventions/mlj/images.jl") diff --git a/src/autotype.jl b/src/autotype.jl index 778a76c..284b068 100644 --- a/src/autotype.jl +++ b/src/autotype.jl @@ -17,7 +17,8 @@ which applying autotype differs from just using the ambient convention. When coercing with autotype, `only_changes` should be true. 
* `rules=(:few_to_finite,)`: the set of rules to apply. """ -function autotype(X; only_changes::Bool=true, +autotype(X; kwargs...) = _autotype(X, Val(trait(X)); kwargs...) +function _autotype(X, ::Val{:table}; only_changes::Bool=true, rules::NTuple{N,Symbol} where N=(:few_to_finite,)) # check that X is a table @assert Tables.istable(X) "The function `autotype` requires tabular data." @@ -55,11 +56,11 @@ function autotype(X; only_changes::Bool=true, return suggested_types end -function autotype(X::AbstractArray{T,M}; +function _autotype(X::AbstractArray{T,M}, ::Val{:other}; rules::NTuple{N,Symbol} where N=(:few_to_finite,)) where {T,M} # check that the rules are recognised _check_rules(rules) - sugg_type = scitype_union(X) + sugg_type = elscitype(X) np = prod(size(X)) for rule in rules if rule == :few_to_finite diff --git a/src/conventions/mlj/finite.jl b/src/conventions/mlj/finite.jl index cfabf29..d69098a 100644 --- a/src/conventions/mlj/finite.jl +++ b/src/conventions/mlj/finite.jl @@ -1,9 +1,9 @@ nlevels(c::CategoricalValue) = length(levels(c.pool)) nlevels(c::CategoricalString) = length(levels(c.pool)) -scitype(c::CategoricalValue, ::Val{:mlj}) = +scitype(c::CategoricalValue, ::MLJ) = c.pool.ordered ? OrderedFactor{nlevels(c)} : Multiclass{nlevels(c)} -scitype(c::CategoricalString, ::Val{:mlj}) = +scitype(c::CategoricalString, ::MLJ) = c.pool.ordered ? 
OrderedFactor{nlevels(c)} : Multiclass{nlevels(c)} # for temporary hack below: @@ -64,7 +64,7 @@ end const CatArr{T,N,V} = CategoricalArray{T,N,<:Any,V} -function scitype(A::CatArr{T,N,V}, ::Val{:mlj}) where {T,N,V} +function scitype(A::CatArr{T,N,V}, ::MLJ) where {T,N,V} nlevels = length(levels(A)) if isordered(A) S = OrderedFactor{nlevels} diff --git a/src/conventions/mlj/images.jl b/src/conventions/mlj/images.jl index 8279751..0193f9c 100644 --- a/src/conventions/mlj/images.jl +++ b/src/conventions/mlj/images.jl @@ -1,4 +1,4 @@ -scitype(image::AbstractArray{<:Gray,2}, ::Val{:mlj}) = +scitype(image::AbstractArray{<:Gray,2}, ::MLJ) = GrayImage{size(image)...} -scitype(image::AbstractArray{<:AbstractRGB,2}, ::Val{:mlj}) = +scitype(image::AbstractArray{<:AbstractRGB,2}, ::MLJ) = ColorImage{size(image)...} diff --git a/src/conventions/mlj/mlj.jl b/src/conventions/mlj/mlj.jl index 03cbb37..d35993a 100644 --- a/src/conventions/mlj/mlj.jl +++ b/src/conventions/mlj/mlj.jl @@ -1,5 +1,5 @@ -scitype(::AbstractFloat, ::Val{:mlj}) = Continuous -scitype(::Integer, ::Val{:mlj}) = Count +scitype(::AbstractFloat, ::MLJ) = Continuous +scitype(::Integer, ::MLJ) = Count function _coerce_missing_warn(::Type{T}) where T T >: Missing || @warn "Missing values encountered coercing scitype to $T.\n"* @@ -8,8 +8,9 @@ end # ## IMPLEMENT PERFORMANCE BOOSTING FOR ARRAYS -Scitype(::Type{<:Integer}, ::Val{:mlj}) = Count -Scitype(::Type{<:AbstractFloat}, ::Val{:mlj}) = Continuous +Scitype(::Type{<:Integer}, ::MLJ) = Count +Scitype(::Type{<:AbstractFloat}, ::MLJ) = Continuous +Scitype(::Type{<:AbstractString}, ::MLJ) = Unknown ## COERCE ARRAY TO CONTINUOUS diff --git a/src/schema.jl b/src/schema.jl index db92e12..8b71584 100644 --- a/src/schema.jl +++ b/src/schema.jl @@ -77,7 +77,7 @@ schema(X, ::Val{:other}) = TRAIT_FUNCTION_GIVEN_NAME[:table] = Tables.istable -function scitype(X, ::Val, ::Val{:table}) +function scitype(X, ::Convention, ::Val{:table}) Xcol = Tables.columns(X) col_names = 
propertynames(Xcol) types = map(col_names) do name @@ -101,7 +101,7 @@ function schema(X, ::Val{:table}) Xcol = Tables.columntable(X) names = s.names types = Tuple{s.types...} - scitypes = Tuple{(scitype_union(getproperty(Xcol, name)) + scitypes = Tuple{(elscitype(getproperty(Xcol, name)) for name in names)...} return Schema(names, types, scitypes, _nrows(X)) end diff --git a/src/scitype.jl b/src/scitype.jl index 2e11524..59613e5 100644 --- a/src/scitype.jl +++ b/src/scitype.jl @@ -3,7 +3,7 @@ scitype(X) The scientific type that `x` may represent. """ -scitype(X) = scitype(X, Val(convention())) +scitype(X) = scitype(X, convention()) scitype(X, C) = scitype(X, C, Val(trait(X))) scitype(X, C, ::Val{:other}) = Unknown @@ -24,66 +24,69 @@ scitype_union(A) = reduce((a,b)->Union{a,b}, (scitype(el) for el in A)) # ## SCITYPES OF TUPLES -scitype(t::Tuple, ::Val) = Tuple{scitype.(t)...} +scitype(t::Tuple, ::Convention) = Tuple{scitype.(t)...} # ## SCITYPES OF ARRAYS """ -ScientificTypes.Scitype(::Type, C::Val) +ScientificTypes.Scitype(::Type, ::C) -Method for implementers of a conventions to enable speed-up of scitype -evaluations for large arrays. +Method for implementers of a convention `C` to enable speed-up of +scitype evaluations for large arrays. In general, one cannot infer the scitype of an object of type `AbstractArray{T, N}` from the machine type alone. For, example, this -never holds in the *mlj* convention for a categorical array, or in the +never holds in the *MLJ* convention for a categorical array, or in the following examples: `X=Any[1, 2, 3]` and `X=Union{Missing,Int64}[1, 2, 3]`. Nevertheless, for some *restricted* machine types `U`, the statement `type(X) == AbstractArray{T, N}` for some `T<:U` already allows one deduce that `scitype(X) = AbstractArray{S,N}`, where `S` is determined -by `U` alone. This is the case in the *mlj* convention, for example, +by `U` alone. 
This is the case in the *MLJ* convention, for example, if `U = Integer`, in which case `S = Count`. If one explicitly declares - ScientificTypes.Scitype(::Type{<:U}, ::Val{:convention}) = S + ScientificTypes.Scitype(::Type{<:U}, ::C) = S in such cases, then ScientificTypes ensures a considerable speed-up in the computation of `scitype(X)`. There is also a partial speed-up for the case that `T <: Union{U, Missing}`. -For example, in *mlj* one has `Scitype(::Type{<:Integer}) = Count`. +For example, in the *MLJ* convention, one has +`Scitype(::Type{<:Integer}, ::MLJ) = Count`. """ -Scitype(::Type, C::Val) = nothing -Scitype(::Type{Any}, C::Val) = nothing # b/s `Any` isa `Union{<:Any, Missing}` +Scitype(::Type, c::Convention) = nothing +Scitype(::Type{Any}, c::Convention) = + nothing # b/s `Any` isa `Union{<:Any, Missing}` # For all such `T` we can also get almost the same speed-up in the case that # `T` is replaced by `Union{T, Missing}`, which we detect by wrapping -# the answer: +# the answer as a Val: -Scitype(MT::Type{Union{T, Missing}}, C::Val) where T = Val(Scitype(T, C)) +Scitype(MT::Type{Union{T, Missing}}, c::Convention) where T = + Val(Scitype(T, c)) -# For example, in *mlj* convention, Scitype(::Integer) = Count +# For example, Scitype(::Type{<:Integer}, ::MLJ) = Count const Arr{T,N} = AbstractArray{T,N} # the dispatcher: -scitype(A::Arr{T}, C) where T = scitype(A, C, Scitype(T, C)) +scitype(A::Arr{T}, c, ::Val{:other}) where T = arr_scitype(A, c, Scitype(T, c)) # the slow fallback: -scitype(A::Arr{<:Any,N}, ::Val, ::Nothing) where N = +arr_scitype(A::Arr{<:Any,N}, ::Convention, ::Nothing) where N = AbstractArray{scitype_union(A),N} # the speed-up: -scitype(::Arr{<:Any,N}, ::Val, S) where N = Arr{S,N} +arr_scitype(::Arr{<:Any,N}, ::Convention, S) where N = Arr{S,N} # partial speed-up for missing types, because broadcast is faster than # computing scitype_union: -function scitype(A::Arr{<:Any,N}, C::Val, ::Val{S}) where {N,S} +function arr_scitype(A::Arr{<:Any,N}, 
c::Convention, ::Val{S}) where {N,S} if S == nothing - return scitype(A, C, S) + return arr_scitype(A, c, S) else Atight = broadcast(identity, A) if typeof(A) == typeof(Atight) diff --git a/test/autotype.jl b/test/autotype.jl index e1684d3..0958d38 100644 --- a/test/autotype.jl +++ b/test/autotype.jl @@ -84,6 +84,13 @@ end @test sugg_types[:a] == Union{Missing,Multiclass} end +@testset "autotype of a table that is also an array" begin + X = (x=rand(4),) + CSV.write("test.csv", X) + file = CSV.file("test.csv") + @test autotype(file) == autotype(X) +end + ####################### #### Detailed tests ####################### diff --git a/test/basic_tests.jl b/test/basic_tests.jl index 4320801..c97045c 100644 --- a/test/basic_tests.jl +++ b/test/basic_tests.jl @@ -200,7 +200,7 @@ end @test scitype_union(coerce(y, Count)) === Count end -@testset "coerce R->OF (mlj)" begin +@testset "coerce R->OF (MLJ)" begin v = [0.1, 0.2, 0.2, 0.3, missing, 0.1] w = [0.1, 0.2, 0.2, 0.3, 0.1] @test_logs((:warn, r"Missing values encountered"), @@ -210,7 +210,7 @@ end @test all(unique(cw) .== [0.1, 0.2, 0.3]) end -@testset "Any->Multiclass (mlj)" begin +@testset "Any->Multiclass (MLJ)" begin v1 = categorical(Any[1,2,1,2,1,missing,2]) v2 = Any[collect("aksldjfalsdjkfslkjdfalksjdf")...] 
@test_logs((:warn, r"Missing values"), @@ -233,7 +233,7 @@ end @test eltype(v2c) <: CategoricalValue{Char} end -@testset "Cat->Count,Continuous (mlj)" begin +@testset "Cat->Count,Continuous (MLJ)" begin a = categorical(["a","b","a","b",missing]) a1 = coerce(a, Union{Count,Missing}) @test scitype_union(a1) == Union{Missing,Count} diff --git a/test/type_tests.jl b/test/type_tests.jl index bad4ca7..eb79302 100644 --- a/test/type_tests.jl +++ b/test/type_tests.jl @@ -18,7 +18,7 @@ end y = rand(Int, 5), z = categorical(collect("asdfa")), w = rand(5) - ) + ) s = schema(X) @test info(X) == schema(X) @test s.scitypes == (Continuous, Count, Multiclass{4}, Continuous) @@ -36,3 +36,10 @@ end @test S._nrows(()) == 0 @test S._nrows((i for i in 1:7)) == 7 end + +@testset "scitype of a table that is also an array" begin + X = (x=rand(4),) + CSV.write("test.csv", X) + file = CSV.file("test.csv") + @test scitype(file) == scitype(X) +end