[TF] Move model building & unify train scripts (mindee#1744)

felixdittrich92 · Oct 9, 2024 · 2f9f50e · 2f9f50e
1 parent 59f1c30
commit 2f9f50e
Show file tree

Hide file tree

Showing 30 changed files with 93 additions and 78 deletions.
diff --git a/.github/workflows/references.yml b/.github/workflows/references.yml
@@ -114,16 +114,16 @@ jobs:
           unzip toy_recogition_set-036a4d80.zip -d reco_set
       - if: matrix.framework == 'tensorflow'
         name: Train for a short epoch (TF) (document orientation)
-        run: python references/classification/train_tensorflow_orientation.py ./det_set ./det_set resnet18 page -b 2 --epochs 1
+        run: python references/classification/train_tensorflow_orientation.py resnet18 --type page --train_path ./det_set --val_path ./det_set  -b 2 --epochs 1
       - if: matrix.framework == 'pytorch'
         name: Train for a short epoch (PT) (document orientation)
-        run: python references/classification/train_pytorch_orientation.py ./det_set ./det_set resnet18 page -b 2 --epochs 1
+        run: python references/classification/train_pytorch_orientation.py resnet18 --type page --train_path ./det_set --val_path ./det_set  -b 2 --epochs 1
       - if: matrix.framework == 'tensorflow'
         name: Train for a short epoch (TF) (crop orientation)
-        run: python references/classification/train_tensorflow_orientation.py ./reco_set ./reco_set resnet18 crop -b 4 --epochs 1
+        run: python references/classification/train_tensorflow_orientation.py resnet18 --type crop --train_path ./reco_set --val_path ./reco_set  -b 4 --epochs 1
       - if: matrix.framework == 'pytorch'
         name: Train for a short epoch (PT) (crop orientation)
-        run: python references/classification/train_pytorch_orientation.py ./reco_set ./reco_set resnet18 crop -b 4 --epochs 1
+        run: python references/classification/train_pytorch_orientation.py resnet18 --type crop --train_path ./reco_set --val_path ./reco_set  -b 4 --epochs 1
 
   train-text-recognition:
     runs-on: ${{ matrix.os }}
@@ -318,10 +318,10 @@ jobs:
           unzip toy_detection_set-bbbb4243.zip -d det_set
       - if: matrix.framework == 'tensorflow'
         name: Train for a short epoch (TF)
-        run: python references/detection/train_tensorflow.py --train_path ./det_set --val_path ./det_set linknet_resnet18 -b 2 --epochs 1
+        run: python references/detection/train_tensorflow.py linknet_resnet18 --train_path ./det_set --val_path ./det_set -b 2 --epochs 1
       - if: matrix.framework == 'pytorch'
         name: Train for a short epoch (PT)
-        run: python references/detection/train_pytorch.py ./det_set ./det_set db_mobilenet_v3_large -b 2 --epochs 1
+        run: python references/detection/train_pytorch.py db_mobilenet_v3_large --train_path ./det_set --val_path ./det_set -b 2 --epochs 1
 
   evaluate-text-detection:
     runs-on: ${{ matrix.os }}

diff --git a/doctr/models/classification/magc_resnet/tensorflow.py b/doctr/models/classification/magc_resnet/tensorflow.py
@@ -14,7 +14,7 @@
 
 from doctr.datasets import VOCABS
 
-from ...utils import load_pretrained_params
+from ...utils import _build_model, load_pretrained_params
 from ..resnet.tensorflow import ResNet
 
 __all__ = ["magc_resnet31"]
@@ -115,7 +115,7 @@ def call(self, inputs: tf.Tensor, **kwargs) -> tf.Tensor:
         # Context modeling: B, H, W, C  ->  B, 1, 1, C
         context = self.context_modeling(inputs)
         # Transform: B, 1, 1, C  ->  B, 1, 1, C
-        transformed = self.transform(context)
+        transformed = self.transform(context, **kwargs)
         return inputs + transformed
 
 
@@ -152,6 +152,8 @@ def _magc_resnet(
         cfg=_cfg,
         **kwargs,
     )
+    _build_model(model)
+
     # Load pretrained parameters
     if pretrained:
         # The number of classes is not the same as the number of classes in the pretrained model =>

diff --git a/doctr/models/classification/mobilenet/tensorflow.py b/doctr/models/classification/mobilenet/tensorflow.py
@@ -13,7 +13,7 @@
 from tensorflow.keras.models import Sequential
 
 from ....datasets import VOCABS
-from ...utils import conv_sequence, load_pretrained_params
+from ...utils import _build_model, conv_sequence, load_pretrained_params
 
 __all__ = [
     "MobileNetV3",
@@ -295,6 +295,8 @@ def _mobilenet_v3(arch: str, pretrained: bool, rect_strides: bool = False, **kwa
         cfg=_cfg,
         **kwargs,
     )
+    _build_model(model)
+
     # Load pretrained parameters
     if pretrained:
         # The number of classes is not the same as the number of classes in the pretrained model =>

diff --git a/doctr/models/classification/resnet/tensorflow.py b/doctr/models/classification/resnet/tensorflow.py
@@ -13,7 +13,7 @@
 
 from doctr.datasets import VOCABS
 
-from ...utils import conv_sequence, load_pretrained_params
+from ...utils import _build_model, conv_sequence, load_pretrained_params
 
 __all__ = ["ResNet", "resnet18", "resnet31", "resnet34", "resnet50", "resnet34_wide"]
 
@@ -210,6 +210,8 @@ def _resnet(
     model = ResNet(
         num_blocks, output_channels, stage_downsample, stage_conv, stage_pooling, origin_stem, cfg=_cfg, **kwargs
     )
+    _build_model(model)
+
     # Load pretrained parameters
     if pretrained:
         # The number of classes is not the same as the number of classes in the pretrained model =>
@@ -358,6 +360,7 @@ def resnet50(pretrained: bool = False, **kwargs: Any) -> ResNet:
     )
 
     model.cfg = _cfg
+    _build_model(model)
 
     # Load pretrained parameters
     if pretrained:

diff --git a/doctr/models/classification/textnet/tensorflow.py b/doctr/models/classification/textnet/tensorflow.py
@@ -12,7 +12,7 @@
 from doctr.datasets import VOCABS
 
 from ...modules.layers.tensorflow import FASTConvLayer
-from ...utils import conv_sequence, load_pretrained_params
+from ...utils import _build_model, conv_sequence, load_pretrained_params
 
 __all__ = ["textnet_tiny", "textnet_small", "textnet_base"]
 
@@ -111,6 +111,8 @@ def _textnet(
 
     # Build the model
     model = TextNet(cfg=_cfg, **kwargs)
+    _build_model(model)
+
     # Load pretrained parameters
     if pretrained:
         # The number of classes is not the same as the number of classes in the pretrained model =>

diff --git a/doctr/models/classification/vgg/tensorflow.py b/doctr/models/classification/vgg/tensorflow.py
@@ -11,7 +11,7 @@
 
 from doctr.datasets import VOCABS
 
-from ...utils import conv_sequence, load_pretrained_params
+from ...utils import _build_model, conv_sequence, load_pretrained_params
 
 __all__ = ["VGG", "vgg16_bn_r"]
 
@@ -81,6 +81,8 @@ def _vgg(
 
     # Build the model
     model = VGG(num_blocks, planes, rect_pools, cfg=_cfg, **kwargs)
+    _build_model(model)
+
     # Load pretrained parameters
     if pretrained:
         # The number of classes is not the same as the number of classes in the pretrained model =>

diff --git a/doctr/models/classification/vit/tensorflow.py b/doctr/models/classification/vit/tensorflow.py
@@ -14,7 +14,7 @@
 from doctr.models.modules.vision_transformer.tensorflow import PatchEmbedding
 from doctr.utils.repr import NestedObject
 
-from ...utils import load_pretrained_params
+from ...utils import _build_model, load_pretrained_params
 
 __all__ = ["vit_s", "vit_b"]
 
@@ -121,6 +121,8 @@ def _vit(
 
     # Build the model
     model = VisionTransformer(cfg=_cfg, **kwargs)
+    _build_model(model)
+
     # Load pretrained parameters
     if pretrained:
         # The number of classes is not the same as the number of classes in the pretrained model =>

diff --git a/doctr/models/detection/differentiable_binarization/tensorflow.py b/doctr/models/detection/differentiable_binarization/tensorflow.py
@@ -14,7 +14,13 @@
 from tensorflow.keras.applications import ResNet50
 
 from doctr.file_utils import CLASS_NAME
-from doctr.models.utils import IntermediateLayerGetter, _bf16_to_float32, conv_sequence, load_pretrained_params
+from doctr.models.utils import (
+    IntermediateLayerGetter,
+    _bf16_to_float32,
+    _build_model,
+    conv_sequence,
+    load_pretrained_params,
+)
 from doctr.utils.repr import NestedObject
 
 from ...classification import mobilenet_v3_large
@@ -304,6 +310,8 @@ def _db_resnet(
 
     # Build the model
     model = DBNet(feat_extractor, cfg=_cfg, **kwargs)
+    _build_model(model)
+
     # Load pretrained parameters
     if pretrained:
         # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning
@@ -347,6 +355,7 @@ def _db_mobilenet(
 
     # Build the model
     model = DBNet(feat_extractor, cfg=_cfg, **kwargs)
+    _build_model(model)
     # Load pretrained parameters
     if pretrained:
         # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning

diff --git a/doctr/models/detection/fast/tensorflow.py b/doctr/models/detection/fast/tensorflow.py
@@ -13,7 +13,7 @@
 from tensorflow.keras import Model, Sequential, layers
 
 from doctr.file_utils import CLASS_NAME
-from doctr.models.utils import IntermediateLayerGetter, _bf16_to_float32, load_pretrained_params
+from doctr.models.utils import IntermediateLayerGetter, _bf16_to_float32, _build_model, load_pretrained_params
 from doctr.utils.repr import NestedObject
 
 from ...classification import textnet_base, textnet_small, textnet_tiny
@@ -333,6 +333,8 @@ def _fast(
 
     # Build the model
     model = FAST(feat_extractor, cfg=_cfg, **kwargs)
+    _build_model(model)
+
     # Load pretrained parameters
     if pretrained:
         # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning
@@ -342,9 +344,6 @@ def _fast(
             skip_mismatch=kwargs["class_names"] != default_cfgs[arch].get("class_names", [CLASS_NAME]),
         )
 
-    # Build the model for reparameterization to access the layers
-    _ = model(tf.random.uniform(shape=[1, *_cfg["input_shape"]], maxval=1, dtype=tf.float32), training=False)
-
     return model
 
 

diff --git a/doctr/models/detection/linknet/tensorflow.py b/doctr/models/detection/linknet/tensorflow.py
@@ -14,7 +14,13 @@
 
 from doctr.file_utils import CLASS_NAME
 from doctr.models.classification import resnet18, resnet34, resnet50
-from doctr.models.utils import IntermediateLayerGetter, _bf16_to_float32, conv_sequence, load_pretrained_params
+from doctr.models.utils import (
+    IntermediateLayerGetter,
+    _bf16_to_float32,
+    _build_model,
+    conv_sequence,
+    load_pretrained_params,
+)
 from doctr.utils.repr import NestedObject
 
 from .base import LinkNetPostProcessor, _LinkNet
@@ -79,10 +85,10 @@ def __init__(
             for in_chan, out_chan, s, in_shape in zip(i_chans, o_chans, strides, in_shapes[::-1])
         ]
 
-    def call(self, x: List[tf.Tensor]) -> tf.Tensor:
+    def call(self, x: List[tf.Tensor], **kwargs: Any) -> tf.Tensor:
         out = 0
         for decoder, fmap in zip(self.decoders, x[::-1]):
-            out = decoder(out + fmap)
+            out = decoder(out + fmap, **kwargs)
         return out
 
     def extra_repr(self) -> str:
@@ -274,6 +280,8 @@ def _linknet(
 
     # Build the model
     model = LinkNet(feat_extractor, cfg=_cfg, **kwargs)
+    _build_model(model)
+
     # Load pretrained parameters
     if pretrained:
         # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning

diff --git a/doctr/models/factory/hub.py b/doctr/models/factory/hub.py
@@ -27,8 +27,6 @@
 
 if is_torch_available():
     import torch
-elif is_tf_available():
-    import tensorflow as tf
 
 __all__ = ["login_to_hub", "push_to_hf_hub", "from_hub", "_save_model_and_config_for_hf_hub"]
 
@@ -76,8 +74,6 @@ def _save_model_and_config_for_hf_hub(model: Any, save_dir: str, arch: str, task
         torch.save(model.state_dict(), weights_path)
     elif is_tf_available():
         weights_path = save_directory / "tf_model.weights.h5"
-        # NOTE: `model.build` is not an option because it doesn't runs in eager mode
-        _ = model(tf.ones((1, *model.cfg["input_shape"])), training=False)
         model.save_weights(str(weights_path))
 
     config_path = save_directory / "config.json"
@@ -229,8 +225,6 @@ def from_hub(repo_id: str, **kwargs: Any):
         model.load_state_dict(state_dict)
     else:  # tf
         weights = hf_hub_download(repo_id, filename="tf_model.weights.h5", **kwargs)
-        # NOTE: `model.build` is not an option because it doesn't runs in eager mode
-        _ = model(tf.ones((1, *model.cfg["input_shape"])), training=False)
         model.load_weights(weights)
 
     return model
diff --git a/doctr/models/preprocessor/tensorflow.py b/doctr/models/preprocessor/tensorflow.py
@@ -41,7 +41,7 @@ def __init__(
         self.resize = Resize(output_size, **kwargs)
         # Perform the division by 255 at the same time
         self.normalize = Normalize(mean, std)
-        self._runs_on_cuda = tf.test.is_gpu_available()
+        self._runs_on_cuda = tf.config.list_physical_devices("GPU") != []
 
     def batch_inputs(self, samples: List[tf.Tensor]) -> List[tf.Tensor]:
         """Gather samples into batches for inference purposes

diff --git a/doctr/models/recognition/crnn/tensorflow.py b/doctr/models/recognition/crnn/tensorflow.py
@@ -13,7 +13,7 @@
 from doctr.datasets import VOCABS
 
 from ...classification import mobilenet_v3_large_r, mobilenet_v3_small_r, vgg16_bn_r
-from ...utils.tensorflow import _bf16_to_float32, load_pretrained_params
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
 from ..core import RecognitionModel, RecognitionPostProcessor
 
 __all__ = ["CRNN", "crnn_vgg16_bn", "crnn_mobilenet_v3_small", "crnn_mobilenet_v3_large"]
@@ -245,6 +245,7 @@ def _crnn(
 
     # Build the model
     model = CRNN(feat_extractor, cfg=_cfg, **kwargs)
+    _build_model(model)
     # Load pretrained parameters
     if pretrained:
         # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning

diff --git a/doctr/models/recognition/master/tensorflow.py b/doctr/models/recognition/master/tensorflow.py
@@ -13,7 +13,7 @@
 from doctr.models.classification import magc_resnet31
 from doctr.models.modules.transformer import Decoder, PositionalEncoding
 
-from ...utils.tensorflow import _bf16_to_float32, load_pretrained_params
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
 from .base import _MASTER, _MASTERPostProcessor
 
 __all__ = ["MASTER", "master"]
@@ -290,6 +290,8 @@ def _master(arch: str, pretrained: bool, backbone_fn, pretrained_backbone: bool
         cfg=_cfg,
         **kwargs,
     )
+    _build_model(model)
+
     # Load pretrained parameters
     if pretrained:
         # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning

diff --git a/doctr/models/recognition/parseq/tensorflow.py b/doctr/models/recognition/parseq/tensorflow.py
@@ -16,7 +16,7 @@
 from doctr.models.modules.transformer import MultiHeadAttention, PositionwiseFeedForward
 
 from ...classification import vit_s
-from ...utils.tensorflow import _bf16_to_float32, load_pretrained_params
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
 from .base import _PARSeq, _PARSeqPostProcessor
 
 __all__ = ["PARSeq", "parseq"]
@@ -473,6 +473,8 @@ def _parseq(
 
     # Build the model
     model = PARSeq(feat_extractor, cfg=_cfg, **kwargs)
+    _build_model(model)
+
     # Load pretrained parameters
     if pretrained:
         # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning

diff --git a/doctr/models/recognition/sar/tensorflow.py b/doctr/models/recognition/sar/tensorflow.py
@@ -13,7 +13,7 @@
 from doctr.utils.repr import NestedObject
 
 from ...classification import resnet31
-from ...utils.tensorflow import _bf16_to_float32, load_pretrained_params
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
 from ..core import RecognitionModel, RecognitionPostProcessor
 
 __all__ = ["SAR", "sar_resnet31"]
@@ -392,6 +392,7 @@ def _sar(
 
     # Build the model
     model = SAR(feat_extractor, cfg=_cfg, **kwargs)
+    _build_model(model)
     # Load pretrained parameters
     if pretrained:
         # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning

diff --git a/doctr/models/recognition/vitstr/tensorflow.py b/doctr/models/recognition/vitstr/tensorflow.py
@@ -12,7 +12,7 @@
 from doctr.datasets import VOCABS
 
 from ...classification import vit_b, vit_s
-from ...utils.tensorflow import _bf16_to_float32, load_pretrained_params
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
 from .base import _ViTSTR, _ViTSTRPostProcessor
 
 __all__ = ["ViTSTR", "vitstr_small", "vitstr_base"]
@@ -216,6 +216,8 @@ def _vitstr(
 
     # Build the model
     model = ViTSTR(feat_extractor, cfg=_cfg, **kwargs)
+    _build_model(model)
+
     # Load pretrained parameters
     if pretrained:
         # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning