Review note (text before the first "diff --git" line is ignored by patch tools):
the tuple method of `type2scitype` now dispatches on the tuple *type*, calls
`type2scitype(T)` instead of the invalid `type2scitype{T}`, and keeps the length
`N`. Hunk headers for src/convention/scitype.jl account for the added comment
lines; the `index` blob hash for that file predates these adjustments.

diff --git a/Project.toml b/Project.toml
index 1ac7b86..a039c19 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "ScientificTypes"
 uuid = "321657f4-b219-11e9-178b-2701a2544e81"
 authors = ["Anthony D. Blaom "]
-version = "2.2.1"
+version = "2.2.2"
 
 [deps]
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
diff --git a/src/convention/scitype.jl b/src/convention/scitype.jl
index 83407fb..8fc218a 100644
--- a/src/convention/scitype.jl
+++ b/src/convention/scitype.jl
@@ -68,9 +68,16 @@ ST.scitype(::Distributions.Distribution{F,S}) where {F,S} =
 
 # Text analysis - EXPERIMENTAL
 
+# This would be less of a hack if some of #155 were adopted.
+
 type2scitype(T::Type) = ST.Scitype(T, DefaultConvention())
 type2scitype(::Type{<:AbstractVector{T}}) where T =
     AbstractVector{type2scitype(T)}
+# Homogeneous tuple types map element-wise and keep their length `N`:
+type2scitype(::Type{NTuple{N,T}}) where {N,T} = NTuple{N,type2scitype(T)}
+# Key types of n-gram bags: `N` plain strings, or `N` tagged words.
+const PlainNGram{N} = NTuple{N,<:AbstractString}
+const TaggedNGram{N} = NTuple{N,<:CorpusLoaders.TaggedWord}
 ST.scitype(::TaggedWord, ::DefaultConvention) = Annotated{Textual}
 ST.scitype(::Document{<:AbstractVector{T}}, ::DefaultConvention) where T =
     Annotated{AbstractVector{type2scitype(T)}}
@@ -80,7 +87,16 @@ ST.scitype(::AbstractDict{<:TaggedWord,<:Integer}, ::DefaultConvention) =
     Multiset{Annotated{Textual}}
 ST.scitype(::AbstractDict{<:Union{TaggedWord,AbstractString},<:Integer},
            ::DefaultConvention) =
-    Multiset{Annotated{Textual}}
+    Multiset{Union{Textual,Annotated{Textual}}}
+# Bags of n-grams; keys of mixed length get the `NTuple{<:Any,...}` forms:
+ST.scitype(::AbstractDict{<:PlainNGram{N}}) where N =
+    Multiset{NTuple{N,Textual}}
+ST.scitype(::AbstractDict{<:TaggedNGram{N}}) where N =
+    Multiset{NTuple{N,Annotated{Textual}}}
+ST.scitype(::AbstractDict{<:PlainNGram}) =
+    Multiset{NTuple{<:Any,Textual}}
+ST.scitype(::AbstractDict{<:TaggedNGram}) =
+    Multiset{NTuple{<:Any,Annotated{Textual}}}
 
 # Scitype for fast array broadcasting
 
diff --git a/test/scitypes.jl b/test/scitypes.jl
index 0e748f1..c1ca4ce 100644
--- a/test/scitypes.jl
+++ b/test/scitypes.jl
@@ -248,12 +248,43 @@ end
     @test scitype(bag_of_words) == Multiset{Textual}
     bag_of_tagged_words = Dict(tagged_word => 5)
     @test scitype(bag_of_tagged_words) == Multiset{Annotated{Textual}}
-    @test scitype(Document("kadsfkj", "My Document")) == Unknown
-    @test scitype(Document([tagged_word, tagged_word2], "My Other Doc")) ==
+    @test scitype(Document("My Document", "kadsfkj")) == Unknown
+    @test scitype(Document([tagged_word, tagged_word2])) ==
+        Annotated{AbstractVector{Annotated{Textual}}}
+    @test scitype(Document("My Other Doc", [tagged_word, tagged_word2])) ==
         Annotated{AbstractVector{Annotated{Textual}}}
     nested_tokens = [["dog", "cat"], ["bird", "cat"]]
-    @test scitype(Document(nested_tokens), "Essay Number 1") ==
-          Annotated{AbstractVector{AbstractVector{Textual}}}
+    @test scitype(Document("Essay Number 1", nested_tokens)) ==
+        Annotated{AbstractVector{AbstractVector{Textual}}}
+
+    @test scitype(Dict(("cat", "in") => 3)) == Multiset{Tuple{Textual,Textual}}
+    bag_of_words = Dict("cat in" => 1,
+                        "the hat" => 1,
+                        "the" => 2,
+                        "cat" => 1,
+                        "hat" => 1,
+                        "in the" => 1,
+                        "in" => 1,
+                        "the cat" => 1)
+    bag_of_ngrams =
+        Dict(Tuple(String.(split(k))) => v for (k, v) in bag_of_words)
+    # Dict{Tuple{String, Vararg{String, N} where N}, Int64} with 8 entries:
+    #   ("cat",)       => 1
+    #   ("cat", "in")  => 1
+    #   ("in",)        => 1
+    #   ("the", "hat") => 1
+    #   ("the",)       => 2
+    #   ("hat",)       => 1
+    #   ("in", "the")  => 1
+    #   ("the", "cat") => 1
+    @test scitype(bag_of_ngrams) == Multiset{NTuple{<:Any,Textual}}
+
+    @test scitype(Dict((tagged_word, tagged_word2) => 3)) ==
+        Multiset{Tuple{Annotated{Textual},Annotated{Textual}}}
+    bag_of_ngrams = Dict((tagged_word, tagged_word2) => 3,
+                         (tagged_word,) => 7)
+    @test scitype(bag_of_ngrams) == Multiset{NTuple{<:Any,Annotated{Textual}}}
+
 end
 
 @testset "Autotype+tight" begin