Commit 3e48684
new: Update parser and splitter model
* Bumps parser model to 2020.3.8
* Bumps splitter model to 2020.3.6. Note that this model does not perform better than the one it replaces, but the previous model is not compatible with breaking API changes introduced in version 2020.3.2 of deep_reference_parser. It should be relatively easy to experiment with the splitter model to reach a higher score; in any case, this standalone splitter model is largely superseded by the multitask model and is provided here mainly for comparison.
1 parent 182a9cb commit 3e48684

6 files changed: +80 / -81 lines

deep_reference_parser/__version__.py (2 additions, 2 deletions)

@@ -5,5 +5,5 @@
 __author__ = "Wellcome Trust DataLabs Team"
 __author_email__ = "[email protected]"
 __license__ = "MIT"
-__splitter_model_version__ = "2019.12.0_splitting"
-__parser_model_version__ = "2020.3.2_parsing"
+__splitter_model_version__ = "2020.3.6_splitting"
+__parser_model_version__ = "2020.3.8_parsing"
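The version strings above also act as path stems: the configs in this commit place each model under models/&lt;task&gt;/&lt;version&gt;/ (see the output_path keys below). A minimal sketch of that mapping, assuming this layout; the model_path helper is hypothetical, not part of the package:

```python
import posixpath

def model_path(version_string):
    """Map a version string such as "2020.3.6_splitting" to its model directory.

    Assumes the models/<task>/<version>/ layout seen in the configs'
    output_path keys; the task name is the suffix after the last underscore.
    """
    _, task = version_string.rsplit("_", 1)
    return posixpath.join("models", task, version_string) + "/"
```

For example, model_path("2020.3.8_parsing") yields "models/parsing/2020.3.8_parsing/", matching the output_path in the new parsing config.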

deep_reference_parser/common.py (1 addition, 5 deletions)

@@ -47,13 +47,9 @@ def download_model_artefacts(model_dir, s3_slug, artefacts=None):
     if not artefacts:

         artefacts = [
-            "char2ind.pickle",
-            "ind2label.pickle",
-            "ind2word.pickle",
-            "label2ind.pickle",
+            "indices.pickle",
             "maxes.pickle",
             "weights.h5",
-            "word2ind.pickle",
         ]

     for artefact in artefacts:
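The hunk above collapses four separate index pickles into a single indices.pickle. A minimal sketch of how this helper plausibly behaves, with the download call abstracted behind a fetch parameter (the fetch hook and the skip-if-present check are assumptions; the diff only shows the default artefact list):

```python
import os

def download_model_artefacts(model_dir, s3_slug, artefacts=None, fetch=None):
    """Sketch of the artefact-download helper from the diff above.

    `fetch(url, target)` stands in for the real S3/HTTP download call,
    which is an assumption here, not the package's actual implementation.
    """
    if not artefacts:
        # Default artefact list as of this commit.
        artefacts = [
            "indices.pickle",
            "maxes.pickle",
            "weights.h5",
        ]
    for artefact in artefacts:
        target = os.path.join(model_dir, artefact)
        # Only download artefacts that are not already on disk.
        if not os.path.exists(target) and fetch is not None:
            fetch(s3_slug + artefact, target)
    return artefacts
```

Callers that rely on the old per-mapping pickles (char2ind, word2ind, etc.) would need to pass them explicitly via the artefacts argument after this change.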

deep_reference_parser/configs/2019.12.0_splitting.ini (deleted, 35 lines)

deep_reference_parser/configs/2020.3.2_parsing.ini (deleted, 39 lines)

deep_reference_parser/configs/2020.3.6_splitting.ini (new file, 39 additions)

@@ -0,0 +1,39 @@
+[DEFAULT]
+version = 2020.3.6_splitting
+description = Splitting model trained on a combination of Reach and Rodrigues
+    data. The Rodrigues data have been concatenated into a single continuous
+    document and then cut into sequences of length=line_length, so that the
+    Rodrigues data and Reach data have the same lengths without need for much
+    padding or truncating.
+deep_reference_parser_version = e489f7efa31072b95175be8f728f1fcf03a4cabb
+
+[data]
+test_proportion = 0.25
+valid_proportion = 0.25
+data_path = data/
+respect_line_endings = 0
+respect_doc_endings = 1
+line_limit = 250
+policy_train = data/splitting/2020.3.6_splitting_train.tsv
+policy_test = data/splitting/2020.3.6_splitting_test.tsv
+policy_valid = data/splitting/2020.3.6_splitting_valid.tsv
+s3_slug = https://datalabs-public.s3.eu-west-2.amazonaws.com/deep_reference_parser/
+
+[build]
+output_path = models/splitting/2020.3.6_splitting/
+output = crf
+word_embeddings = embeddings/2020.1.1-wellcome-embeddings-300.txt
+pretrained_embedding = 0
+dropout = 0.5
+lstm_hidden = 400
+word_embedding_size = 300
+char_embedding_size = 100
+char_embedding_type = BILSTM
+optimizer = rmsprop
+
+[train]
+epochs = 30
+batch_size = 100
+early_stopping_patience = 5
+metric = val_f1
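These INI files can be read with Python's standard-library configparser, which also handles the indented multi-line description and propagates [DEFAULT] keys into every section. A minimal sketch, inlining a few keys from the 2020.3.6_splitting config above rather than reading the real file from disk:

```python
import configparser

# Inlined fragment of the splitting config for illustration.
cfg_text = """
[DEFAULT]
version = 2020.3.6_splitting

[build]
lstm_hidden = 400
word_embedding_size = 300
dropout = 0.5

[train]
epochs = 30
batch_size = 100
"""

cfg = configparser.ConfigParser()
cfg.read_string(cfg_text)

# Values come back as strings; use the typed getters to cast.
lstm_hidden = cfg.getint("build", "lstm_hidden")
dropout = cfg.getfloat("build", "dropout")
epochs = cfg.getint("train", "epochs")

# [DEFAULT] keys are visible from every section.
version = cfg.get("train", "version")
```

In the package itself, cfg.read(path) would be used on the installed config file instead of read_string.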
deep_reference_parser/configs/2020.3.8_parsing.ini (new file, 38 additions)

@@ -0,0 +1,38 @@
+[DEFAULT]
+version = 2020.3.8_parsing
+description = Parsing model trained on a combination of Reach and Rodrigues
+    data. The Rodrigues data have been concatenated into a single continuous
+    document and then cut into sequences of length=line_length, so that the
+    Rodrigues data and Reach data have the same lengths without need for much
+    padding or truncating.
+deep_reference_parser_version = e489f7efa31072b95175be8f728f1fcf03a4cabb
+
+[data]
+test_proportion = 0.25
+valid_proportion = 0.25
+data_path = data/
+respect_line_endings = 0
+respect_doc_endings = 1
+line_limit = 100
+policy_train = data/parsing/2020.3.8_parsing_train.tsv
+policy_test = data/parsing/2020.3.8_parsing_test.tsv
+policy_valid = data/parsing/2020.3.8_parsing_valid.tsv
+s3_slug = https://datalabs-public.s3.eu-west-2.amazonaws.com/deep_reference_parser/
+
+[build]
+output_path = models/parsing/2020.3.8_parsing/
+output = crf
+word_embeddings = embeddings/2020.1.1-wellcome-embeddings-300.txt
+pretrained_embedding = 0
+dropout = 0.5
+lstm_hidden = 400
+word_embedding_size = 300
+char_embedding_size = 100
+char_embedding_type = BILSTM
+optimizer = rmsprop
+
+[train]
+epochs = 30
+batch_size = 100
+early_stopping_patience = 5
+metric = val_f1
