Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add risis mining #1330

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -71,4 +71,50 @@ select docid, conceptId, conceptLabel, stripchars(middle,'.)(,[]') as middle, pr
from (
setschema 'docid,prev,middle,next' select c1, textwindow2s(keywords(filterstopwords(c2)),7,1,3, '\bDARIAH') from pubs where c2 is not null
), grants where conceptLabel="DARIAH EU" and (not regexprmatches("edariah",lower(middle)) and not regexprmatches("riyadh",lower(context)) )
) group by docid;
) group by docid


union all

-- RISIS
-- Emits one jdict per matching publication id from pubs (c1 is used as id, c2 as text).
-- conceptId is RISIS_DATASET for dataset-name hits and RISIS for every other
-- match label (CORTEXT, GATE, ORGREG, FIRMREG, RISIS). confidenceLevel is fixed
-- at 0.8 and textsnippet is the matched window rendered as prev <<< middle >>> next.
select jdict(
    'documentId', id,
    'conceptId', case when ma="RISIS_DATASET" then 'RISIS_DATASET' else 'RISIS' end,
    'confidenceLevel', 0.8,
    'textsnippet', prev||" <<< "||middle||" >>> "||next
) from
(
select * from
(
-- cortext: keep a hit when its window mentions a related term, or when the
-- full text names RISIS, INRAE, CorTexT or LISIS.
-- The inrae alternative used to carry a stray backslash before the i, which is
-- not a valid regex escape (rejected by Python 3 re, literal i on Python 2).
select id, "CORTEXT" as ma, prev, middle, next
from (setschema 'id,text,prev,middle,next' select id, text, textwindow2s(lower(text), 10,1,10, "(?:\b|\W)cortext(?:\b|\d)") from (setschema 'id,text' select c1,c2 from pubs))
where regexprmatches("cortext\.net|cortext\.org|www\.cortext\.|risis|ifris|text analysis|text mining|software|platform|plateforme|cortext manager|analysis|mining|nltk|github\.com\/cortext\/|corpus|\blisis\b|\binrae\b", prev||" "||middle||" "||next)
   or regexprmatches("\bRISIS\b|\bINRAE\b|CorTexT|\bLISIS\b",text)

union all

-- gate: case-sensitive GATE token (or gatecloud / gate.ac.uk) whose lowercased
-- window contains at least one text-mining related term.
select id, "GATE" as ma, prev, middle, next
from (setschema 'id,prev,middle,next' select id, textwindow2s(text, 10,1,10, "\bGATE(?:\b|\d)|gatecloud|gate\.ac\.uk") from (setschema 'id,text' select c1,c2 from pubs))
where regexprmatches("text mining|gatecloud|gate\.ac\.uk|\buima\b|classifier|semantic|\bnlp\b|text engineering|natural language|language engineering|information extraction|text analytics|cunningham|text process|architecture text|maynard|tablan|bontcheva|gate framework|tokenizer|tokeniser|sheffield|text annotation|language processing|\bnltk\b|treetagger|\byatea\b", lower(prev||" "||middle||" "||next))

union all

-- dataset acronyms: case-sensitive match on the raw text, no context filter.
select id, "RISIS_DATASET" as ma, prev, middle, next
from (setschema 'id,prev,middle,next' select id, textwindow2s(text, 10,1,10, "\bCIB\b|\bCIB1\b|\bCIB2\b|\bCINNOB\b|\bCWTS\b|\bEUPRO\b|\bEU\-PRO\b|\bETER\b|\bSIPER\b|\bIFRIS\b|\bNATPRO\b|\bJOREP\b|\bMORE\b") from (setschema 'id,text' select c1,c2 from pubs))

union all

-- dataset names on lowercased text: cheetah needs a firm mention in its window,
-- profile needs careers, mobility or dataset, and the two long names are
-- accepted without any context check.
-- NOTE(review): in the careers|mobility|dataset pattern the word boundaries bind
-- only to the first and last alternative, and firms* also accepts repeated s.
-- Confirm whether a grouped alternation and an optional s were intended.
select id, "RISIS_DATASET" as ma, prev, middle, next
from (setschema 'id,prev,middle,next' select id, textwindow2s(lower(text), 10,5,10, "\bcheetah\b|\bprofile\b|\beuropean tertiary education register\b|\bscience and innovation policy evaluations repository\b") from (setschema 'id,text' select c1,c2 from pubs))
where (regexprmatches("\bcheetah\b", middle) and regexprmatches("\bfirms*\b", j2s(prev,middle,next)))
   or (regexprmatches("\bprofile\b", middle) and regexprmatches("\bcareers|mobility|dataset\b", j2s(prev,middle,next)))
   or (not regexprmatches("\bcheetah\b", middle) and not regexprmatches("\bprofile\b", middle))

union all

-- orgreg / firmreg: the label is the uppercased matched name, kept only when
-- risis appears in the window or RISIS appears anywhere in the full text.
select id, upper(regexpr("(orgreg|firmreg)",middle)) as ma, prev, middle, next
from (setschema 'id,text,prev,middle,next' select id, text, textwindow2s(lower(text), 10,1,10, "\borgreg\b|\bfirmreg\b") from (setschema 'id,text' select c1,c2 from pubs))
where regexprmatches("\brisis\b", prev||" "||middle||" "||next)
   or regexprmatches("\bRISIS\b", text)

union all

-- risis: a risis.eu hit is accepted as is, any other RISIS token needs a
-- funding or research related word in its window.
select id, "RISIS" as ma, prev, middle, next
from (setschema 'id,prev,middle,next' select id, textwindow2s(text, 10,1,10, "\bRISIS\b|\bRISIS1\b|\bRISIS2\b|\brisis\.eu\b") from (setschema 'id,text' select c1,c2 from pubs))
where (regexprmatches("recherche|patent|grant|support|acknowledge|innovation|research", prev||" "||middle||" "||next) and not regexprmatches("risis\.eu",lower(middle)) )
   or regexprmatches("risis\.eu",lower(middle))
-- One output row per id. NOTE(review): with select * and no aggregate, the row
-- (and so the ma label and snippet) kept for an id with several hits is
-- presumably arbitrary, confirm this is acceptable.
) group by id) ;