diff --git a/.gitignore b/.gitignore index a29689f24da218ecc97b8abf3feba2cea7f8a7a7..1ef65aa1c2b6f80ebd3091567ffc986f6553e315 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ # Compiled python source # *.pyc +.idea diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 798ece96a05534e95653f82f2f80477fdf1109a1..d10434913058690bcf9f7dbef74700126acf8a4d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -6,7 +6,11 @@ variables: OTB_TEST_DIR: $OTB_BUILD/Testing/Temporary # OTB testing directory ARTIFACT_TEST_DIR: $CI_PROJECT_DIR/testing CRC_BOOK_TMP: /tmp/crc_book_tests_tmp - + DOCKER_BUILDKIT: 1 + DOCKER_DRIVER: overlay2 + CACHE_IMAGE_BASE: $CI_REGISTRY_IMAGE:otbtf-base + CACHE_IMAGE_BUILDER: $CI_REGISTRY_IMAGE:builder + workflow: rules: - if: $CI_MERGE_REQUEST_ID # Execute jobs in merge request context @@ -18,41 +22,58 @@ stages: - Applications Test - Ship -test docker image: +docker image: stage: Build allow_failure: false tags: [godzilla] image: docker/compose:latest except: - - master + - develop services: - name: docker:dind before_script: - echo -n $CI_REGISTRY_PASSWORD | docker login -u $CI_REGISTRY_USER --password-stdin $CI_REGISTRY timeout: 10 hours script: - - ls -ll / - - ls -ll /bzl_cache/ - - touch /bzl_cache/toto - - docker pull $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME || - - docker pull $CI_REGISTRY_IMAGE:cpu-basic-test || - > docker build - --pull + --target otbtf-base + --network="host" + --cache-from $CACHE_IMAGE_BASE + --tag $CACHE_IMAGE_BASE + --build-arg BASE_IMG="ubuntu:20.04" + --build-arg BUILDKIT_INLINE_CACHE=1 + "." + - > + docker build + --target builder + --network="host" + --cache-from $CACHE_IMAGE_BASE + --cache-from $CACHE_IMAGE_BUILDER + --tag $CACHE_IMAGE_BUILDER + --build-arg OTBTESTS="true" + --build-arg KEEP_SRC_OTB="true" + --build-arg BZL_CONFIGS="" + --build-arg BASE_IMG="ubuntu:20.04" + --build-arg BUILDKIT_INLINE_CACHE=1 + "." + - > + docker build --network="host" - --cache-from $CI_REGISTRY_IMAGE:cpu-basic-test + --cache-from $CACHE_IMAGE_BASE + --cache-from $CACHE_IMAGE_BUILDER --cache-from $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME - --label "org.opencontainers.image.title=$CI_PROJECT_TITLE" - --label "org.opencontainers.image.url=$CI_PROJECT_URL" - --label "org.opencontainers.image.created=$CI_JOB_STARTED_AT" - --label "org.opencontainers.image.revision=$CI_COMMIT_SHA" - --label "org.opencontainers.image.version=$CI_COMMIT_REF_NAME" + --cache-from $CI_REGISTRY_IMAGE:cpu-basic-dev-testing --tag $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME --build-arg OTBTESTS="true" --build-arg KEEP_SRC_OTB="true" --build-arg BZL_CONFIGS="" --build-arg BASE_IMG="ubuntu:20.04" - . + --build-arg BUILDKIT_INLINE_CACHE=1 + "." 
+ after_script: + - docker push $CACHE_IMAGE_BASE + - docker push $CACHE_IMAGE_BUILDER - docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME .static_analysis_base: @@ -63,13 +84,13 @@ flake8: extends: .static_analysis_base script: - sudo apt update && sudo apt install flake8 -y - - python -m flake8 --max-line-length=120 $OTBTF_SRC/python + - python -m flake8 --max-line-length=120 --per-file-ignores="__init__.py:F401" $OTBTF_SRC/otbtf pylint: extends: .static_analysis_base script: - sudo apt update && sudo apt install pylint -y - - pylint --disable=too-many-nested-blocks,too-many-locals,too-many-statements,too-few-public-methods,too-many-instance-attributes,too-many-arguments --ignored-modules=tensorflow --max-line-length=120 --logging-format-style=new $OTBTF_SRC/python + - pylint --logging-format-style=old --disable=too-many-nested-blocks,too-many-locals,too-many-statements,too-few-public-methods,too-many-instance-attributes,too-many-arguments --ignored-modules=tensorflow --max-line-length=120 $OTBTF_SRC/otbtf codespell: extends: .static_analysis_base @@ -100,10 +121,6 @@ ctest: .applications_test_base: extends: .tests_base stage: Applications Test - rules: - # Only for MR targeting 'develop' and 'master' branches because applications tests are slow - - if: $CI_MERGE_REQUEST_ID && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == 'develop' - - if: $CI_MERGE_REQUEST_ID && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == 'master' before_script: - pip3 install pytest pytest-cov pytest-order - mkdir -p $ARTIFACT_TEST_DIR @@ -133,8 +150,22 @@ sr4rs: deploy: stage: Ship only: - - master + - develop script: + - echo "Shipping!" - docker pull $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME - - docker tag $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME $CI_REGISTRY_IMAGE:cpu-basic-test - - docker push $CI_REGISTRY_IMAGE:cpu-basic-test + - docker tag $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_NAME $CI_REGISTRY_IMAGE:cpu-basic-dev-testing + - docker push $CI_REGISTRY_IMAGE:cpu-basic-dev-testing + - docker build --network='host' --tag $CI_REGISTRY_IMAGE:cpu-basic --build-arg BASE_IMG=ubuntu:20.04 --build-arg BZL_CONFIGS="" . # cpu-basic + - docker push $CI_REGISTRY_IMAGE:cpu-basic + - docker build --network='host' --tag $CI_REGISTRY_IMAGE:cpu-basic-dev --build-arg BASE_IMG=ubuntu:20.04 --build-arg BZL_CONFIGS="" --build-arg KEEP_SRC_OTB=true . # cpu-basic-dev + - docker push $CI_REGISTRY_IMAGE:cpu-basic-dev + - docker build --network='host' --tag $CI_REGISTRY_IMAGE:gpu --build-arg BASE_IMG=nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04 . # gpu + - docker push $CI_REGISTRY_IMAGE:gpu + - docker build --network='host' --tag $CI_REGISTRY_IMAGE:gpu-dev --build-arg BASE_IMG=nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04 --build-arg KEEP_SRC_OTB=true . # gpu-dev + - docker push $CI_REGISTRY_IMAGE:gpu-dev + - docker build --network='host' --tag $CI_REGISTRY_IMAGE:gpu-basic --build-arg BASE_IMG=nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04 --build-arg BZL_CONFIGS="" . # gpu-basic + - docker push $CI_REGISTRY_IMAGE:gpu-basic + - docker build --network='host' --tag $CI_REGISTRY_IMAGE:gpu-basic-dev --build-arg BZL_CONFIGS="" --build-arg BASE_IMG=nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04 --build-arg KEEP_SRC_OTB=true .
# gpu-basic-dev + - docker push $CI_REGISTRY_IMAGE:gpu-basic-dev + diff --git a/Dockerfile b/Dockerfile index 4a4d3ab7cea36c05d740c0e15d75ad16bc76ba85..d5a644f7bcc1c16251ba27b887eca984608339fb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,7 +25,7 @@ RUN if $GUI; then \ RUN ln -s /usr/bin/python3 /usr/local/bin/python && ln -s /usr/bin/pip3 /usr/local/bin/pip # NumPy version is conflicting with system's gdal dep and may require venv ARG NUMPY_SPEC="==1.22.*" -RUN pip install --no-cache-dir -U pip wheel mock six future deprecated "numpy$NUMPY_SPEC" \ +RUN pip install --no-cache-dir -U pip wheel mock six future tqdm deprecated "numpy$NUMPY_SPEC" \ && pip install --no-cache-dir --no-deps keras_applications keras_preprocessing # ---------------------------------------------------------------------------- @@ -148,7 +148,7 @@ COPY --from=builder /src /src # System-wide ENV ENV PATH="/opt/otbtf/bin:$PATH" ENV LD_LIBRARY_PATH="/opt/otbtf/lib:$LD_LIBRARY_PATH" -ENV PYTHONPATH="/opt/otbtf/lib/python3/site-packages:/opt/otbtf/lib/otb/python:/src/otbtf/python" +ENV PYTHONPATH="/opt/otbtf/lib/python3/site-packages:/opt/otbtf/lib/otb/python:/src/otbtf" ENV OTB_APPLICATION_PATH="/opt/otbtf/lib/otb/applications" # Default user, directory and command (bash is the entrypoint when using 'docker create') diff --git a/doc/DOCKERUSE.md b/doc/DOCKERUSE.md index 69382c9613dd988f0a75d4c1b8eff0cdb487529c..9c100896166250eab6a2f6c2f6abaf4ad2bd9e97 100644 --- a/doc/DOCKERUSE.md +++ b/doc/DOCKERUSE.md @@ -26,8 +26,9 @@ Here is the list of OTBTF docker images hosted on [dockerhub](https://hub.docker | **mdl4eo/otbtf3.0:gpu-dev** | Ubuntu Focal | r2.5 | 7.4.0 | GPU (dev) | yes | 5.2,6.1,7.0,7.5,8.6| | **mdl4eo/otbtf3.1:cpu-basic** | Ubuntu Focal | r2.8 | 7.4.0 | CPU, no optimization | no | 5.2,6.1,7.0,7.5,8.6| | **mdl4eo/otbtf3.1:cpu-basic-dev** | Ubuntu Focal | r2.8 | 7.4.0 | CPU, no optimization (dev) | yes | 5.2,6.1,7.0,7.5,8.6| -| **mdl4eo/otbtf3.1:gpu-basic** | Ubuntu Focal | r2.8 | 7.4.0 | GPU | yes | 5.2,6.1,7.0,7.5,8.6| -| **mdl4eo/otbtf3.1:gpu** | Ubuntu Focal | r2.8 | 7.4.0 | GPU, no optimization | yes | 5.2,6.1,7.0,7.5,8.6| +| **mdl4eo/otbtf3.1:gpu-basic** | Ubuntu Focal | r2.8 | 7.4.0 | GPU, no optimization | no | 5.2,6.1,7.0,7.5,8.6| +| **mdl4eo/otbtf3.1:gpu-basic-dev** | Ubuntu Focal | r2.8 | 7.4.0 | GPU, no optimization (dev) | yes | 5.2,6.1,7.0,7.5,8.6| +| **mdl4eo/otbtf3.1:gpu** | Ubuntu Focal | r2.8 | 7.4.0 | GPU | no | 5.2,6.1,7.0,7.5,8.6| | **mdl4eo/otbtf3.1:gpu-dev** | Ubuntu Focal | r2.8 | 7.4.0 | GPU (dev) | yes | 5.2,6.1,7.0,7.5,8.6| - `cpu` tagged docker images are compiled for CPU usage only. diff --git a/otbtf/__init__.py b/otbtf/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ac36018a1e7f401c3aaaab9d0809f9541a0c376f --- /dev/null +++ b/otbtf/__init__.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 -*- +# ========================================================================== +# +# Copyright 2018-2019 IRSTEA +# Copyright 2020-2022 INRAE +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
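With the Dockerfile now exporting `PYTHONPATH=...:/src/otbtf` and the new `otbtf/__init__.py` re-exporting the main classes (listed just below), user code can import everything from the package root. A minimal sanity check, assuming a container built from this branch:

```python
# Hypothetical quick check that the new package layout resolves on the
# PYTHONPATH set in the Dockerfile (/src/otbtf).
from otbtf import Dataset, DatasetFromPatchesImages, PatchesImagesReader, TFRecords
from otbtf.utils import gdal_open, read_as_np_arr  # also re-exported at the package root
```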
+# See the License for the specific language governing permissions and +# limitations under the License. +# +# ==========================================================================*/ +""" +OTBTF python module +""" +from otbtf.utils import read_as_np_arr, gdal_open +from otbtf.dataset import Buffer, PatchesReaderBase, PatchesImagesReader, IteratorBase, RandomIterator, Dataset, \ + DatasetFromPatchesImages +from otbtf.tfrecords import TFRecords diff --git a/python/otbtf.py b/otbtf/dataset.py similarity index 84% rename from python/otbtf.py rename to otbtf/dataset.py index 860e86a7b30dc182698ec521a341daeb44d00c99..002754811458597db20a34233a83132ece5e7090 100644 --- a/python/otbtf.py +++ b/otbtf/dataset.py @@ -18,8 +18,7 @@ # # ==========================================================================*/ """ -Contains stuff to help working with TensorFlow and geospatial data in the -OTBTF framework. +Contains stuff to help working with TensorFlow and geospatial data in the OTBTF framework. """ import threading import multiprocessing @@ -28,48 +27,12 @@ import logging from abc import ABC, abstractmethod import numpy as np import tensorflow as tf -from osgeo import gdal - - -# ----------------------------------------------------- Helpers -------------------------------------------------------- - - -def gdal_open(filename): - """ - Open a GDAL raster - :param filename: raster file - :return: a GDAL dataset instance - """ - gdal_ds = gdal.Open(filename) - if gdal_ds is None: - raise Exception("Unable to open file {}".format(filename)) - return gdal_ds - - -def read_as_np_arr(gdal_ds, as_patches=True): - """ - Read a GDAL raster as numpy array - :param gdal_ds: a GDAL dataset instance - :param as_patches: if True, the returned numpy array has the following shape (n, psz_x, psz_x, nb_channels). If - False, the shape is (1, psz_y, psz_x, nb_channels) - :return: Numpy array of dim 4 - """ - buffer = gdal_ds.ReadAsArray() - size_x = gdal_ds.RasterXSize - if len(buffer.shape) == 3: - buffer = np.transpose(buffer, axes=(1, 2, 0)) - if not as_patches: - n_elems = 1 - size_y = gdal_ds.RasterYSize - else: - n_elems = int(gdal_ds.RasterYSize / size_x) - size_y = size_x - return np.float32(buffer.reshape((n_elems, size_y, size_x, gdal_ds.RasterCount))) +from otbtf.utils import read_as_np_arr, gdal_open +from otbtf.tfrecords import TFRecords # -------------------------------------------------- Buffer class ------------------------------------------------------ - class Buffer: """ Used to store and access list of objects @@ -102,7 +65,6 @@ class Buffer: # ---------------------------------------------- PatchesReaderBase class ----------------------------------------------- - class PatchesReaderBase(ABC): """ Base class for patches delivery @@ -147,7 +109,6 @@ class PatchesReaderBase(ABC): # --------------------------------------------- PatchesImagesReader class ---------------------------------------------- - class PatchesImagesReader(PatchesReaderBase): """ This class provides a read access to a set of patches images. @@ -167,7 +128,7 @@ class PatchesImagesReader(PatchesReaderBase): :see PatchesReaderBase """ - def __init__(self, filenames_dict: dict, use_streaming=False): + def __init__(self, filenames_dict, use_streaming=False, scalar_dict=None): """ :param filenames_dict: A dict() structured as follow: {src_name1: [src1_patches_image_1.tif, ..., src1_patches_image_N.tif], @@ -175,6 +136,11 @@ class PatchesImagesReader(PatchesReaderBase): ... 
src_nameM: [srcM_patches_image_1.tif, ..., srcM_patches_image_N.tif]} :param use_streaming: if True, the patches are read on the fly from the disc, nothing is kept in memory. + :param scalar_dict: (optional) a dict containing list of scalars (int, float, str) as follow: + {scalar_name1: ["value_1", ..., "value_N"], + scalar_name2: [value_1, ..., value_N], + ... + scalar_nameM: [value1, ..., valueN]} """ assert len(filenames_dict.values()) > 0 @@ -182,13 +148,18 @@ class PatchesImagesReader(PatchesReaderBase): # gdal_ds dict self.gdal_ds = {key: [gdal_open(src_fn) for src_fn in src_fns] for key, src_fns in filenames_dict.items()} - # check number of patches in each sources - if len({len(ds_list) for ds_list in self.gdal_ds.values()}) != 1: - raise Exception("Each source must have the same number of patches images") - # streaming on/off self.use_streaming = use_streaming + # Scalar dict (e.g. for metadata) + # If the scalars are not numpy.ndarray, convert them + self.scalar_dict = {key: [i if isinstance(i, np.ndarray) else np.asarray(i) for i in scalars] + for key, scalars in scalar_dict.items()} if scalar_dict else {} + + # check number of patches in each sources + if len({len(ds_list) for ds_list in list(self.gdal_ds.values()) + list(self.scalar_dict.values())}) != 1: + raise Exception("Each source must have the same number of patches images") + # gdal_ds check nb_of_patches = {key: 0 for key in self.gdal_ds} self.nb_of_channels = dict() @@ -200,9 +171,9 @@ class PatchesImagesReader(PatchesReaderBase): else: if self.nb_of_channels[src_key] != gdal_ds.RasterCount: raise Exception("All patches images from one source must have the same number of channels!" - "Error happened for source: {}".format(src_key)) + f"Error happened for source: {src_key}") if len(set(nb_of_patches.values())) != 1: - raise Exception("Sources must have the same number of patches! Number of patches: {}".format(nb_of_patches)) + raise Exception(f"Sources must have the same number of patches! 
Number of patches: {nb_of_patches}") # gdal_ds sizes src_key_0 = list(self.gdal_ds)[0] # first key @@ -211,8 +182,8 @@ class PatchesImagesReader(PatchesReaderBase): # if use_streaming is False, we store in memory all patches images if not self.use_streaming: - patches_list = {src_key: [read_as_np_arr(ds) for ds in self.gdal_ds[src_key]] for src_key in self.gdal_ds} - self.patches_buffer = {src_key: np.concatenate(patches_list[src_key], axis=0) for src_key in self.gdal_ds} + self.patches_buffer = {src_key: np.concatenate([read_as_np_arr(ds) for ds in src_ds], axis=0) for + src_key, src_ds in self.gdal_ds.items()} def _get_ds_and_offset_from_index(self, index): offset = index @@ -236,8 +207,9 @@ class PatchesImagesReader(PatchesReaderBase): assert yoff + psz <= gdal_ds.RasterYSize buffer = gdal_ds.ReadAsArray(0, yoff, psz, psz) if len(buffer.shape) == 3: - buffer = np.transpose(buffer, axes=(1, 2, 0)) - return np.float32(buffer) + # multi-band raster + return np.transpose(buffer, axes=(1, 2, 0)) + return np.expand_dims(buffer, axis=2) def get_sample(self, index): """ @@ -252,18 +224,19 @@ class PatchesImagesReader(PatchesReaderBase): assert index >= 0 assert index < self.size + i, offset = self._get_ds_and_offset_from_index(index) + res = {src_key: scalar[i] for src_key, scalar in self.scalar_dict.items()} if not self.use_streaming: - res = {src_key: self.patches_buffer[src_key][index, :, :, :] for src_key in self.gdal_ds} + res.update({src_key: arr[index, :, :, :] for src_key, arr in self.patches_buffer.items()}) else: - i, offset = self._get_ds_and_offset_from_index(index) - res = {src_key: self._read_extract_as_np_arr(self.gdal_ds[src_key][i], offset) for src_key in self.gdal_ds} - + res.update({src_key: self._read_extract_as_np_arr(self.gdal_ds[src_key][i], offset) + for src_key in self.gdal_ds}) return res def get_stats(self): """ Compute some statistics for each source. - Depending if streaming is used, the statistics are computed directly in memory, or chunk-by-chunk. + When streaming is used, chunk-by-chunk. Else, the statistics are computed directly in memory. :return statistics dict """ @@ -309,11 +282,11 @@ class PatchesImagesReader(PatchesReaderBase): # ----------------------------------------------- IteratorBase class --------------------------------------------------- - class IteratorBase(ABC): """ Base class for iterators """ + @abstractmethod def __init__(self, patches_reader: PatchesReaderBase): pass @@ -321,7 +294,6 @@ class IteratorBase(ABC): # ---------------------------------------------- RandomIterator class -------------------------------------------------- - class RandomIterator(IteratorBase): """ Pick a random number in the [0, handler.size) range. @@ -351,7 +323,6 @@ class RandomIterator(IteratorBase): # ------------------------------------------------- Dataset class ------------------------------------------------------ - class Dataset: """ Handles the "mining" of patches. @@ -361,17 +332,24 @@ class Dataset: :see Buffer """ - def __init__(self, patches_reader: PatchesReaderBase, buffer_length: int = 128, - Iterator: IteratorBase = RandomIterator): + def __init__(self, patches_reader: PatchesReaderBase = None, buffer_length: int = 128, + Iterator=RandomIterator, max_nb_of_samples=None): """ :param patches_reader: The patches reader instance :param buffer_length: The number of samples that are stored in the buffer :param Iterator: The iterator class used to generate the sequence of patches indices. 
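A hedged sketch of how the reworked reader and dataset could be combined (file names and scalar values are hypothetical; the constructor checks that every source, raster or scalar, provides one entry per patches image):

```python
from otbtf import Dataset, PatchesImagesReader

# One scalar value per patches image, e.g. an acquisition year used as metadata.
reader = PatchesImagesReader(
    filenames_dict={"xs": ["xs_patches_2018.tif", "xs_patches_2019.tif"],
                    "labels": ["labels_patches_2018.tif", "labels_patches_2019.tif"]},
    use_streaming=True,
    scalar_dict={"year": [2018, 2019]},
)
sample = reader.get_sample(0)  # dict holding "xs", "labels" and the matching "year"

# Cap the number of mined samples without touching the reader.
dataset = Dataset(patches_reader=reader, max_nb_of_samples=1000)
```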
+ :param max_nb_of_samples: Optional, max number of samples to consider """ - # patches reader self.patches_reader = patches_reader - self.size = self.patches_reader.get_size() + + # If necessary, limit the nb of samples + logging.info('Number of samples: %s', self.patches_reader.get_size()) + if max_nb_of_samples and self.patches_reader.get_size() > max_nb_of_samples: + logging.info('Reducing number of samples to %s', max_nb_of_samples) + self.size = max_nb_of_samples + else: + self.size = self.patches_reader.get_size() # iterator self.iterator = Iterator(patches_reader=self.patches_reader) @@ -404,8 +382,21 @@ class Dataset: output_types=self.output_types, output_shapes=self.output_shapes).repeat(1) + def to_tfrecords(self, output_dir, n_samples_per_shard=100, drop_remainder=True): + """ + Save the dataset into TFRecord files + + :param output_dir: output directory + :param n_samples_per_shard: number of samples per TFRecord file + :param drop_remainder: drop remainder samples + """ + tfrecord = TFRecords(output_dir) + tfrecord.ds2tfrecord(self, n_samples_per_shard=n_samples_per_shard, drop_remainder=drop_remainder) + def get_stats(self) -> dict: """ + Compute dataset statistics + :return: the dataset statistics, computed by the patches reader """ with self.mining_lock: @@ -493,7 +484,6 @@ class Dataset: # ----------------------------------------- DatasetFromPatchesImages class --------------------------------------------- - class DatasetFromPatchesImages(Dataset): """ Handles the "mining" of a set of patches images. @@ -502,8 +492,8 @@ class DatasetFromPatchesImages(Dataset): :see Dataset """ - def __init__(self, filenames_dict: dict, use_streaming: bool = False, buffer_length: int = 128, - Iterator: IteratorBase = RandomIterator): + def __init__(self, filenames_dict, use_streaming=False, buffer_length: int = 128, + Iterator=RandomIterator): """ :param filenames_dict: A dict() structured as follow: {src_name1: [src1_patches_image1, ..., src1_patches_imageN1], diff --git a/python/examples/tensorflow_v1x/create_savedmodel_ienco-m3_patchbased.py b/otbtf/examples/tensorflow_v1x/create_savedmodel_ienco-m3_patchbased.py similarity index 100% rename from python/examples/tensorflow_v1x/create_savedmodel_ienco-m3_patchbased.py rename to otbtf/examples/tensorflow_v1x/create_savedmodel_ienco-m3_patchbased.py diff --git a/python/examples/tensorflow_v1x/create_savedmodel_maggiori17_fullyconv.py b/otbtf/examples/tensorflow_v1x/create_savedmodel_maggiori17_fullyconv.py similarity index 100% rename from python/examples/tensorflow_v1x/create_savedmodel_maggiori17_fullyconv.py rename to otbtf/examples/tensorflow_v1x/create_savedmodel_maggiori17_fullyconv.py diff --git a/python/examples/tensorflow_v1x/create_savedmodel_pxs_fcn.py b/otbtf/examples/tensorflow_v1x/create_savedmodel_pxs_fcn.py similarity index 100% rename from python/examples/tensorflow_v1x/create_savedmodel_pxs_fcn.py rename to otbtf/examples/tensorflow_v1x/create_savedmodel_pxs_fcn.py diff --git a/python/examples/tensorflow_v1x/create_savedmodel_simple_cnn.py b/otbtf/examples/tensorflow_v1x/create_savedmodel_simple_cnn.py similarity index 100% rename from python/examples/tensorflow_v1x/create_savedmodel_simple_cnn.py rename to otbtf/examples/tensorflow_v1x/create_savedmodel_simple_cnn.py diff --git a/python/examples/tensorflow_v1x/create_savedmodel_simple_fcn.py b/otbtf/examples/tensorflow_v1x/create_savedmodel_simple_fcn.py similarity index 100% rename from python/examples/tensorflow_v1x/create_savedmodel_simple_fcn.py rename to 
otbtf/examples/tensorflow_v1x/create_savedmodel_simple_fcn.py diff --git a/python/examples/tensorflow_v2x/l2_norm.py b/otbtf/examples/tensorflow_v2x/l2_norm.py similarity index 100% rename from python/examples/tensorflow_v2x/l2_norm.py rename to otbtf/examples/tensorflow_v2x/l2_norm.py diff --git a/python/examples/tensorflow_v2x/scalar_product.py b/otbtf/examples/tensorflow_v2x/scalar_product.py similarity index 100% rename from python/examples/tensorflow_v2x/scalar_product.py rename to otbtf/examples/tensorflow_v2x/scalar_product.py diff --git a/otbtf/tfrecords.py b/otbtf/tfrecords.py new file mode 100644 index 0000000000000000000000000000000000000000..b2aae0b2fdc7f1a0341840b447c121694b8c8367 --- /dev/null +++ b/otbtf/tfrecords.py @@ -0,0 +1,188 @@ +# -*- coding: utf-8 -*- +# ========================================================================== +# +# Copyright 2018-2019 IRSTEA +# Copyright 2020-2022 INRAE +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ==========================================================================*/ +""" +The tfrecords module provides an implementation for the TFRecords files read/write +""" +import glob +import json +import os +import logging +from functools import partial +import tensorflow as tf +from tqdm import tqdm + + +class TFRecords: + """ + This class allows to convert Dataset objects to TFRecords and to load them in dataset tensorflows format. + """ + + def __init__(self, path): + """ + :param path: Can be a directory where TFRecords must be saved/loaded + """ + self.dirpath = path + os.makedirs(self.dirpath, exist_ok=True) + self.output_types_file = os.path.join(self.dirpath, "output_types.json") + self.output_shape_file = os.path.join(self.dirpath, "output_shape.json") + self.output_shape = self.load(self.output_shape_file) if os.path.exists(self.output_shape_file) else None + self.output_types = self.load(self.output_types_file) if os.path.exists(self.output_types_file) else None + + @staticmethod + def _bytes_feature(value): + """ + Convert a value to a type compatible with tf.train.Example. + :param value: value + :return a bytes_list from a string / byte. + """ + if isinstance(value, type(tf.constant(0))): + value = value.numpy() # BytesList won't unpack a string from an EagerTensor. + return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) + + def ds2tfrecord(self, dataset, n_samples_per_shard=100, drop_remainder=True): + """ + Convert and save samples from dataset object to tfrecord files. + :param dataset: Dataset object to convert into a set of tfrecords + :param n_samples_per_shard: Number of samples per shard + :param drop_remainder: Whether additional samples should be dropped. Advisable if using multiworkers training. 
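As a worked example of the shard arithmetic used just below (the numbers are illustrative, not from the source): with a dataset of 1050 samples and `n_samples_per_shard=100`, `drop_remainder=True` writes 10 full shards and drops 50 samples, while `drop_remainder=False` writes an 11th, partial shard:

```python
dataset_size, n_samples_per_shard, drop_remainder = 1050, 100, False

nb_shards = dataset_size // n_samples_per_shard            # 10 full shards
if not drop_remainder and dataset_size % n_samples_per_shard > 0:
    nb_shards += 1                                          # one extra partial shard -> 11
```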
+ If True, all TFRecords will have `n_samples_per_shard` samples + """ + logging.info("%s samples", dataset.size) + + nb_shards = (dataset.size // n_samples_per_shard) + if not drop_remainder and dataset.size % n_samples_per_shard > 0: + nb_shards += 1 + + output_shapes = {key: (None,) + output_shape for key, output_shape in dataset.output_shapes.items()} + self.save(output_shapes, self.output_shape_file) + + output_types = {key: output_type.name for key, output_type in dataset.output_types.items()} + self.save(output_types, self.output_types_file) + + for i in tqdm(range(nb_shards)): + + if (i + 1) * n_samples_per_shard <= dataset.size: + nb_sample = n_samples_per_shard + else: + nb_sample = dataset.size - i * n_samples_per_shard + + filepath = os.path.join(self.dirpath, f"{i}.records") + with tf.io.TFRecordWriter(filepath) as writer: + for _ in range(nb_sample): + sample = dataset.read_one_sample() + serialized_sample = {name: tf.io.serialize_tensor(fea) for name, fea in sample.items()} + features = {name: self._bytes_feature(serialized_tensor) for name, serialized_tensor in + serialized_sample.items()} + tf_features = tf.train.Features(feature=features) + example = tf.train.Example(features=tf_features) + writer.write(example.SerializeToString()) + + @staticmethod + def save(data, filepath): + """ + Save data to JSON format. + :param data: Data to save json format + :param filepath: Output file name + """ + + with open(filepath, 'w') as file: + json.dump(data, file, indent=4) + + @staticmethod + def load(filepath): + """ + Return data from JSON format. + :param filepath: Input file name + """ + with open(filepath, 'r') as file: + return json.load(file) + + @staticmethod + def parse_tfrecord(example, features_types, target_keys, preprocessing_fn=None, **kwargs): + """ + Parse example object to sample dict. + :param example: Example object to parse + :param features_types: List of types for each feature + :param target_keys: list of keys of the targets + :param preprocessing_fn: Optional. A preprocessing function that takes input, target as args and returns + a tuple (input_preprocessed, target_preprocessed) + :param kwargs: some keywords arguments for preprocessing_fn + """ + read_features = {key: tf.io.FixedLenFeature([], dtype=tf.string) for key in features_types} + example_parsed = tf.io.parse_single_example(example, read_features) + + for key in read_features.keys(): + example_parsed[key] = tf.io.parse_tensor(example_parsed[key], out_type=features_types[key]) + + # Differentiating inputs and outputs + input_parsed = {key: value for (key, value) in example_parsed.items() if key not in target_keys} + target_parsed = {key: value for (key, value) in example_parsed.items() if key in target_keys} + + if preprocessing_fn: + input_parsed, target_parsed = preprocessing_fn(input_parsed, target_parsed, **kwargs) + + return input_parsed, target_parsed + + def read(self, batch_size, target_keys, n_workers=1, drop_remainder=True, shuffle_buffer_size=None, + preprocessing_fn=None, **kwargs): + """ + Read all tfrecord files matching with pattern and convert data to tensorflow dataset. + :param batch_size: Size of tensorflow batch + :param target_keys: Keys of the target, e.g. ['s2_out'] + :param n_workers: number of workers, e.g. 4 if using 4 GPUs + e.g. 12 if using 3 nodes of 4 GPUs + :param drop_remainder: whether the last batch should be dropped in the case it has fewer than + `batch_size` elements. True is advisable when training on multiworkers. 
+ False is advisable when evaluating metrics so that all samples are used + :param shuffle_buffer_size: if None, shuffle is not used. Else, blocks of shuffle_buffer_size + elements are shuffled using uniform random. + :param preprocessing_fn: Optional. A preprocessing function that takes input, target as args and returns + a tuple (input_preprocessed, target_preprocessed) + :param kwargs: some keywords arguments for preprocessing_fn + """ + options = tf.data.Options() + if shuffle_buffer_size: + options.experimental_deterministic = False # disable order, increase speed + options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.AUTO # for multiworker + parse = partial(self.parse_tfrecord, features_types=self.output_types, target_keys=target_keys, + preprocessing_fn=preprocessing_fn, **kwargs) + + # TODO: to be investigated : + # 1/ num_parallel_reads useful ? I/O bottleneck of not ? + # 2/ num_parallel_calls=tf.data.experimental.AUTOTUNE useful ? + tfrecords_pattern_path = os.path.join(self.dirpath, "*.records") + matching_files = glob.glob(tfrecords_pattern_path) + logging.info('Searching TFRecords in %s...', tfrecords_pattern_path) + logging.info('Number of matching TFRecords: %s', len(matching_files)) + matching_files = matching_files[:n_workers * (len(matching_files) // n_workers)] # files multiple of workers + nb_matching_files = len(matching_files) + if nb_matching_files == 0: + raise Exception(f"At least one worker has no TFRecord file in {tfrecords_pattern_path}. Please ensure that " + "the number of TFRecord files is greater or equal than the number of workers!") + logging.info('Reducing number of records to : %s', nb_matching_files) + dataset = tf.data.TFRecordDataset(matching_files) # , num_parallel_reads=2) # interleaves reads from xxx files + dataset = dataset.with_options(options) # uses data as soon as it streams in, rather than in its original order + dataset = dataset.map(parse, num_parallel_calls=tf.data.experimental.AUTOTUNE) + if shuffle_buffer_size: + dataset = dataset.shuffle(buffer_size=shuffle_buffer_size) + dataset = dataset.batch(batch_size, drop_remainder=drop_remainder) + dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) + + return dataset diff --git a/otbtf/utils.py b/otbtf/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..069638a551e0226c9ff3ad6cc1537359922b29c1 --- /dev/null +++ b/otbtf/utils.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- +# ========================================================================== +# +# Copyright 2018-2019 IRSTEA +# Copyright 2020-2022 INRAE +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
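Taken together with `Dataset.to_tfrecords()` above, the new module allows a simple write-then-read workflow. A hedged end-to-end sketch (the directory, source names and preprocessing hook are hypothetical):

```python
import tensorflow as tf
from otbtf import DatasetFromPatchesImages, TFRecords

ds = DatasetFromPatchesImages(filenames_dict={"xs": ["xs_patches.tif"],
                                              "labels": ["labels_patches.tif"]})
ds.to_tfrecords("/tmp/tfrecords", n_samples_per_shard=100, drop_remainder=False)

def scale_xs(inputs, targets):
    # Matches the (inputs, targets) -> (inputs, targets) contract of preprocessing_fn.
    inputs["xs"] = tf.cast(inputs["xs"], tf.float32) / 255.0
    return inputs, targets

tf_dataset = TFRecords("/tmp/tfrecords").read(batch_size=8, target_keys=["labels"],
                                              shuffle_buffer_size=512,
                                              preprocessing_fn=scale_xs)
```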
+# +# ==========================================================================*/ +""" +The utils module provides some helpers to read patches using gdal +""" +from osgeo import gdal +import numpy as np + + +# ----------------------------------------------------- Helpers -------------------------------------------------------- + +def gdal_open(filename): + """ + Open a GDAL raster + :param filename: raster file + :return: a GDAL dataset instance + """ + gdal_ds = gdal.Open(filename) + if not gdal_ds: + raise Exception(f"Unable to open file {filename}") + return gdal_ds + + +def read_as_np_arr(gdal_ds, as_patches=True, dtype=None): + """ + Read a GDAL raster as numpy array + :param gdal_ds: a GDAL dataset instance + :param as_patches: if True, the returned numpy array has the following shape (n, psz_x, psz_x, nb_channels). If + False, the shape is (1, psz_y, psz_x, nb_channels) + :param dtype: if not None array dtype will be cast to given numpy data type (np.float32, np.uint16...) + :return: Numpy array of dim 4 + """ + buffer = gdal_ds.ReadAsArray() + size_x = gdal_ds.RasterXSize + if len(buffer.shape) == 3: + buffer = np.transpose(buffer, axes=(1, 2, 0)) + if as_patches: + n_elems = int(gdal_ds.RasterYSize / size_x) + size_y = size_x + else: + n_elems = 1 + size_y = gdal_ds.RasterYSize + + buffer = buffer.reshape((n_elems, size_y, size_x, gdal_ds.RasterCount)) + if dtype is not None: + buffer = buffer.astype(dtype) + + return buffer diff --git a/python/__init__.py b/python/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..3a95ac4a7a80f85a25389af04ad8b44b27295aa9 --- /dev/null +++ b/setup.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +import setuptools + +with open("README.md", "r", encoding="utf-8") as fh: + long_description = fh.read() + +setuptools.setup( + name="otbtf", + version="0.1", + author="Remi Cresson", + author_email="remi.cresson@inrae.fr", + description="OTBTF: Orfeo ToolBox meets TensorFlow", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://gitlab.irstea.fr/remi.cresson/otbtf", + classifiers=[ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Topic :: Scientific/Engineering :: GIS", + "Topic :: Scientific/Engineering :: Image Processing", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + ], + packages=setuptools.find_packages(), + python_requires=">=3.6", + keywords="remote sensing, otb, orfeotoolbox, orfeo toolbox, tensorflow, tf, deep learning, machine learning", +) \ No newline at end of file diff --git a/python/tricks.py b/tricks/__init__.py similarity index 97% rename from python/tricks.py rename to tricks/__init__.py index b31b14c39ddd89498983ea457c16b15f2c5409df..d22e7e96543aa57b3925a2738377110ab28943f4 100644 --- a/python/tricks.py +++ b/tricks/__init__.py @@ -25,7 +25,7 @@ for TF 1.X versions. 
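The reworked `otbtf.utils.read_as_np_arr` (which `tricks` re-exports as `read_as_np_arr_from_gdal_ds` for the legacy TF1 scripts) now makes the cast opt-in through the new `dtype` argument instead of always returning `np.float32`. A small sketch with a hypothetical patches raster:

```python
import numpy as np
from otbtf.utils import gdal_open, read_as_np_arr

gdal_ds = gdal_open("xs_patches.tif")   # hypothetical grid of 16x16 patches
patches = read_as_np_arr(gdal_ds, as_patches=True, dtype=np.float32)
print(patches.shape)                    # (n_patches, 16, 16, n_bands)
```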
""" import tensorflow.compat.v1 as tf from deprecated import deprecated -from otbtf import gdal_open, read_as_np_arr as read_as_np_arr_from_gdal_ds +from otbtf.utils import gdal_open, read_as_np_arr as read_as_np_arr_from_gdal_ds tf.disable_v2_behavior() diff --git a/python/ckpt2savedmodel.py b/tricks/ckpt2savedmodel.py similarity index 97% rename from python/ckpt2savedmodel.py rename to tricks/ckpt2savedmodel.py index 117203bafd89bcbfaa272952323434dac4046a8b..ff22965f985623b318292790717ed32fa4e04536 100755 --- a/python/ckpt2savedmodel.py +++ b/tricks/ckpt2savedmodel.py @@ -26,7 +26,7 @@ can be more conveniently exported as SavedModel (see how to build a model with keras in Tensorflow 2). """ import argparse -from tricks import ckpt_to_savedmodel +from tricks.tricks import ckpt_to_savedmodel def main():