From c5d81bcc58eea02046103865f1266ce5d3d3a268 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 16 Jan 2024 16:35:18 +0100 Subject: [PATCH 1/2] add v2 config --- .../configs/{nel.cfg => nel_v1.cfg} | 0 tutorials/nel_emerson/configs/nel_v2.cfg | 147 ++++++++++++++++++ tutorials/nel_emerson/project.yml | 2 +- 3 files changed, 148 insertions(+), 1 deletion(-) rename tutorials/nel_emerson/configs/{nel.cfg => nel_v1.cfg} (100%) create mode 100644 tutorials/nel_emerson/configs/nel_v2.cfg diff --git a/tutorials/nel_emerson/configs/nel.cfg b/tutorials/nel_emerson/configs/nel_v1.cfg similarity index 100% rename from tutorials/nel_emerson/configs/nel.cfg rename to tutorials/nel_emerson/configs/nel_v1.cfg diff --git a/tutorials/nel_emerson/configs/nel_v2.cfg b/tutorials/nel_emerson/configs/nel_v2.cfg new file mode 100644 index 000000000..96b2ed47f --- /dev/null +++ b/tutorials/nel_emerson/configs/nel_v2.cfg @@ -0,0 +1,147 @@ +[paths] +train = "" +dev = "" +raw = null +init_tok2vec = null +kb = "" +base_nlp = "temp/my_nlp" +vectors = "${paths.base_nlp}" + +[system] +seed = 342 +gpu_allocator = null + +[nlp] +lang = "en" +pipeline = ["sentencizer","entity_ruler","ner","entity_linker"] +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation = null +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} + +[components] + +[components.sentencizer] +factory = "sentencizer" +punct_chars = null + +[components.entity_ruler] +source = "${paths.base_nlp}" +component = "entity_ruler" + +[components.ner] +source = "${paths.base_nlp}" +component = "ner" + +[components.entity_linker] +factory = "entity_linker" +entity_vector_length = 64 +generate_empty_kb = {"@misc":"spacy.EmptyKB.v2"} +get_candidates = {"@misc":"spacy.CandidateGenerator.v1"} +get_candidates_batch = {"@misc":"spacy.CandidateBatchGenerator.v1"} +incl_context = true +incl_prior = true +labels_discard = [] + +[components.entity_linker.model] +@architectures = "spacy.EntityLinker.v2" +nO = null + +[components.entity_linker.model.tok2vec] +@architectures = "spacy.HashEmbedCNN.v2" +pretrained_vectors = null +width = 96 +depth = 4 +embed_size = 2000 +window_size = 1 +maxout_pieces = 3 +subword_features = true + +[initialize] +vectors = ${paths.vectors} +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null + +[initialize.components] + +[initialize.components.entity_linker] + +[initialize.components.entity_linker.kb_loader] +@misc = "spacy.KBFromFile.v1" +kb_path = ${paths.kb} + +[initialize.tokenizer] + + +[corpora] + +[corpora.train] +@readers = "MyCorpus.v1" +file = ${paths.train} + +[corpora.dev] +@readers = "MyCorpus.v1" +file = ${paths.dev} + +[training] +train_corpus = "corpora.train" +dev_corpus = "corpora.dev" +seed = ${system.seed} +gpu_allocator = ${system.gpu_allocator} +dropout = 0.2 +patience = 10000 +eval_frequency = 200 +accumulate_gradient = 2 +max_epochs = 0 +max_steps = 600 +frozen_components = ["sentencizer","ner"] +before_to_disk = null + +[training.logger] +@loggers = "spacy.ConsoleLogger.v1" +progress_bar = false + + +[training.batcher] +@batchers = "spacy.batch_by_words.v1" +discard_oversize = false +tolerance = 0.2 +get_length = null + +[training.batcher.size] +@schedules = "compounding.v1" +start = 100 +stop = 1000 +compound = 1.001 +t = 0.0 + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = false +eps = 0.00000001 +learn_rate = 0.001 + +[training.score_weights] +nel_micro_p = 0.0 +nel_micro_r = 0.0 +nel_micro_f = 1.0 + +[pretraining] + +[optimizer] +@optimizers = "Adam.v1" +learn_rate = 0.001 +beta1 = 0.9 +beta2 = 0.999 +L2 = 0.0 +eps = 0.00000001 +grad_clip = 1.0 +L2_is_weight_decay = true +use_averages = true \ No newline at end of file diff --git a/tutorials/nel_emerson/project.yml b/tutorials/nel_emerson/project.yml index ff097c5e5..4c099d2c3 100644 --- a/tutorials/nel_emerson/project.yml +++ b/tutorials/nel_emerson/project.yml @@ -3,7 +3,7 @@ description: "**This project was created as part of a [step-by-step video tutori # Variables can be referenced across the project.yml using ${vars.var_name} vars: name: "nel_emerson" - config: "nel.cfg" + config: "nel_v2.cfg" vectors_model: "en_core_web_md" annotations: "emerson_annotated_text.jsonl" entities: "entities.csv" From b1696e10e3907b179aeb6b1d2573a614f9cdf9e3 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 16 Jan 2024 16:38:07 +0100 Subject: [PATCH 2/2] small fixes --- tutorials/nel_emerson/configs/nel_v2.cfg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tutorials/nel_emerson/configs/nel_v2.cfg b/tutorials/nel_emerson/configs/nel_v2.cfg index 96b2ed47f..c96c9432f 100644 --- a/tutorials/nel_emerson/configs/nel_v2.cfg +++ b/tutorials/nel_emerson/configs/nel_v2.cfg @@ -4,7 +4,7 @@ dev = "" raw = null init_tok2vec = null kb = "" -base_nlp = "temp/my_nlp" +base_nlp = "" vectors = "${paths.base_nlp}" [system] @@ -52,7 +52,7 @@ nO = null @architectures = "spacy.HashEmbedCNN.v2" pretrained_vectors = null width = 96 -depth = 4 +depth = 2 embed_size = 2000 window_size = 1 maxout_pieces = 3