Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GriB file with constant field (all values = 0) throws an error #355

Open
larsbarring opened this issue Oct 5, 2023 · 11 comments
Open

GriB file with constant field (all values = 0) throws an error #355

larsbarring opened this issue Oct 5, 2023 · 11 comments

Comments

@larsbarring
Copy link

larsbarring commented Oct 5, 2023

We have a file where all data is 0. This is encoded as a bitmap in section 6, while section 7 is empty. When reading this into iris we get the error as below. Eccodes is able to read this without problems so it appears that there is no problem with the file. It seems like this error is similar to what was reported in #131. The file is available at the end, although with the ending ".txt" appended to allow it to be uploaded.

import iris
import iris_grib

iris.__version__
'3.8.0.dev21'

iris_grib.__version__
'0.18.0'

cube=iris.load_cube("sd_an_1961092406.grb")
cube.data
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[10], line 1
----> 1 cube.data

File ~/CODE/scitools/iris/lib/iris/cube.py:2466, in Cube.data(self)
   2433 @property
   2434 def data(self):
   2435     """
   2436     The :class:`numpy.ndarray` representing the multi-dimensional data of
   2437     the cube.
   (...)
   2464 
   2465     """
-> 2466     return self._data_manager.data

File ~/CODE/scitools/iris/lib/iris/_data_manager.py:206, in DataManager.data(self)
    203 if self.has_lazy_data():
    204     try:
    205         # Realise the lazy data.
--> 206         result = as_concrete_data(self._lazy_array)
    207         # Assign the realised result.
    208         self._real_array = result

File ~/CODE/scitools/iris/lib/iris/_lazy_data.py:288, in as_concrete_data(data)
    271 """
    272 Return the actual content of a lazy array, as a numpy array.
    273 If the input data is a NumPy `ndarray` or masked array, return it
   (...)
    285 
    286 """
    287 if is_lazy_data(data):
--> 288     (data,) = _co_realise_lazy_arrays([data])
    290 return data

File ~/CODE/scitools/iris/lib/iris/_lazy_data.py:251, in _co_realise_lazy_arrays(arrays)
    236 def _co_realise_lazy_arrays(arrays):
    237     """
    238     Compute multiple lazy arrays and return a list of real values.
    239 
   (...)
    249 
    250     """
--> 251     computed_arrays = da.compute(*arrays)
    252     results = []
    253     for lazy_in, real_out in zip(arrays, computed_arrays):
    254         # Ensure we always have arrays.
    255         # Note : in some cases dask (and numpy) will return a scalar
    256         # numpy.int/numpy.float object rather than an ndarray.
    257         # Recorded in https://github.com/dask/dask/issues/2111.

File ~/mambaforge/envs/scitools/lib/python3.11/site-packages/dask/base.py:599, in compute(traverse, optimize_graph, scheduler, get, *args, **kwargs)
    596     keys.append(x.__dask_keys__())
    597     postcomputes.append(x.__dask_postcompute__())
--> 599 results = schedule(dsk, keys, **kwargs)
    600 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])

File ~/mambaforge/envs/scitools/lib/python3.11/site-packages/dask/threaded.py:89, in get(dsk, keys, cache, num_workers, pool, **kwargs)
     86     elif isinstance(pool, multiprocessing.pool.Pool):
     87         pool = MultiprocessingPoolExecutor(pool)
---> 89 results = get_async(
     90     pool.submit,
     91     pool._max_workers,
     92     dsk,
     93     keys,
     94     cache=cache,
     95     get_id=_thread_get_id,
     96     pack_exception=pack_exception,
     97     **kwargs,
     98 )
    100 # Cleanup pools associated to dead threads
    101 with pools_lock:

File ~/mambaforge/envs/scitools/lib/python3.11/site-packages/dask/local.py:511, in get_async(submit, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, chunksize, **kwargs)
    509         _execute_task(task, data)  # Re-execute locally
    510     else:
--> 511         raise_exception(exc, tb)
    512 res, worker_id = loads(res_info)
    513 state["cache"][key] = res

File ~/mambaforge/envs/scitools/lib/python3.11/site-packages/dask/local.py:319, in reraise(exc, tb)
    317 if exc.__traceback__ is not tb:
    318     raise exc.with_traceback(tb)
--> 319 raise exc

File ~/mambaforge/envs/scitools/lib/python3.11/site-packages/dask/local.py:224, in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
    222 try:
    223     task, data = loads(task_info)
--> 224     result = _execute_task(task, data)
    225     id = get_id()
    226     result = dumps((result, id))

File ~/mambaforge/envs/scitools/lib/python3.11/site-packages/dask/core.py:119, in _execute_task(arg, cache, dsk)
    115     func, args = arg[0], arg[1:]
    116     # Note: Don't assign the subtask results to a variable. numpy detects
    117     # temporaries by their reference count and can execute certain
    118     # operations in-place.
--> 119     return func(*(_execute_task(a, cache) for a in args))
    120 elif not ishashable(arg):
    121     return arg

File ~/mambaforge/envs/scitools/lib/python3.11/site-packages/dask/optimization.py:990, in SubgraphCallable.__call__(self, *args)
    988 if not len(args) == len(self.inkeys):
    989     raise ValueError("Expected %d args, got %d" % (len(self.inkeys), len(args)))
--> 990 return core.get(self.dsk, self.outkey, dict(zip(self.inkeys, args)))

File ~/mambaforge/envs/scitools/lib/python3.11/site-packages/dask/core.py:149, in get(dsk, out, cache)
    147 for key in toposort(dsk):
    148     task = dsk[key]
--> 149     result = _execute_task(task, cache)
    150     cache[key] = result
    151 result = _execute_task(out, cache)

File ~/mambaforge/envs/scitools/lib/python3.11/site-packages/dask/core.py:119, in _execute_task(arg, cache, dsk)
    115     func, args = arg[0], arg[1:]
    116     # Note: Don't assign the subtask results to a variable. numpy detects
    117     # temporaries by their reference count and can execute certain
    118     # operations in-place.
--> 119     return func(*(_execute_task(a, cache) for a in args))
    120 elif not ishashable(arg):
    121     return arg

File ~/mambaforge/envs/scitools/lib/python3.11/site-packages/dask/utils.py:73, in apply(func, args, kwargs)
     42 """Apply a function given its positional and keyword arguments.
     43 
     44 Equivalent to ``func(*args, **kwargs)``
   (...)
     70 >>> dsk = {'task-name': task}  # adds the task to a low level Dask task graph
     71 """
     72 if kwargs:
---> 73     return func(*args, **kwargs)
     74 else:
     75     return func(*args)

File ~/mambaforge/envs/scitools/lib/python3.11/site-packages/dask/array/core.py:120, in getter(a, b, asarray, lock)
    118     lock.acquire()
    119 try:
--> 120     c = a[b]
    121     # Below we special-case `np.matrix` to force a conversion to
    122     # `np.ndarray` and preserve original Dask behavior for `getter`,
    123     # as for all purposes `np.matrix` is array-like and thus
    124     # `is_arraylike` evaluates to `True` in that case.
    125     if asarray and (not is_arraylike(c) or isinstance(c, np.matrix)):

File ~/mambaforge/envs/scitools/lib/python3.11/site-packages/iris_grib/message.py:239, in _DataProxy.__getitem__(self, keys)
    237 bitmap_section = sections[6]
    238 bitmap = self._bitmap(bitmap_section)
--> 239 data = sections[7]['codedValues']
    241 if bitmap is not None:
    242     # Note that bitmap and data are both 1D arrays at this point.
    243     if np.count_nonzero(bitmap) == data.shape[0]:
    244         # Only the non-masked values are included in codedValues.

File ~/mambaforge/envs/scitools/lib/python3.11/site-packages/iris_grib/message.py:405, in Section.__getitem__(self, key)
    403     value = self._number
    404 elif key not in self._keys:
--> 405     raise KeyError('{!r} not defined in section {}'.format(
    406         key, self._number))
    407 else:
    408     value = self._get_key_value(key)

KeyError: "'codedValues' not defined in section 7"

sd_an_1961092406.grb.txt

@larsbarring
Copy link
Author

larsbarring commented Oct 12, 2023

While I am not a GriB expert, I nevertheless did some further investigations by comparing with the file for the preceding day (attached below). This sheds some light on where the problem might be. grib_dump -O shows for the non-problematic file:

1-4 section5Length = 21
5 numberOfSection = 5
6-9 numberOfValues = 466641
10-11 dataRepresentationTemplateNumber = 0 [Grid point data - simple packing (grib2/tables/17/5.0.table) ]
12-15 referenceValue = 0
16-17 binaryScaleFactor = -31
18-19 decimalScaleFactor = 0
20 bitsPerValue = 24
21 typeOfOriginalFieldValues = 0 [Floating point (grib2/tables/17/5.1.table) ]

and for the problematic file:

1-4 section5Length = 21
5 numberOfSection = 5
6-9 numberOfValues = 466641
10-11 dataRepresentationTemplateNumber = 0 [Grid point data - simple packing (grib2/tables/17/5.0.table) ]
12-15 referenceValue = 0
16-17 binaryScaleFactor = -28
18-19 decimalScaleFactor = 0
20 bitsPerValue = 0
21 typeOfOriginalFieldValues = 0 [Floating point (grib2/tables/17/5.1.table) ]

As far as I understand, if zero bits are used to represent the data then section 7 will be empty, in which case the referenceValue represents the whole field (taking a non-zero decimalScaleFactor into account). This was commented by @greenlaw here.

sd_an_1961092306.grb.txt

@larsbarring larsbarring changed the title GriB file with bitmapped data raises an error GriB file with constant field (all values = 0) throws an error Oct 12, 2023
@greenlaw
Copy link

We ended up monkey-patching this (and a couple other special cases) by updating DataProxy.__getitem__ (originally defined in iris_grib/message.py) as follows. Use at your own risk.

    def __getitem__(self, keys):
        """
        GRIB message data accessor.

        This handles reconstruction of the data values from packged representations, including
        some special cases:

        1) reconstruction of a message's data when `codedValues` is missing entirely, and
        2) treating 9999 as missing data when `missingValueManagementUsed==1`.

        See:
            https://apps.ecmwf.int/codes/grib/format/grib2/regulations/
            https://apps.ecmwf.int/codes/grib/format/grib2/templates/5/
        """
        # NB. Currently assumes that the validity of this interpretation
        # is checked before this proxy is created.
        message = self.recreate_raw()
        sections = message.sections
        bitmap_section = sections[6]
        bitmap = self._bitmap(bitmap_section)

        if "codedValues" not in sections[7].keys():
            data_rep_template = sections[5]["dataRepresentationTemplateNumber"]
            if data_rep_template not in (0, 40, 41, 42, 50):
                raise TranslationError(
                    f"Reconstruction of missing codedValues for dataRepresentationTemplateNumber {data_rep_template} is unsupported"
                )

            reference_value = sections[5]["referenceValue"]
            decimal_scale_factor = sections[5]["decimalScaleFactor"]
            data = np.ones(self.shape) * reference_value / (10**decimal_scale_factor)
        else:
            data = sections[7]["codedValues"]

            if bitmap is not None:
                # Note that bitmap and data are both 1D arrays at this point.
                if np.count_nonzero(bitmap) == data.shape[0]:
                    # Only the non-masked values are included in codedValues.
                    _data = np.empty(shape=bitmap.shape)
                    _data[bitmap.astype(bool)] = data
                    # `ma.masked_array` masks where input = 1, the opposite of
                    # the behaviour specified by the GRIB spec.
                    data = ma.masked_array(_data, mask=np.logical_not(bitmap), fill_value=np.nan)
                else:
                    msg = "Shapes of data and bitmap do not match."
                    raise TranslationError(msg)
            elif "missingValueManagementUsed" in sections[5].keys() and sections[5]["missingValueManagementUsed"] == 1:
                # This appears to be required for reading certain complex-packing fields.
                # Whether it is caused by a data encoding problem or a bug in eccodes is unclear.
                # The relevant eccodes decoder logic can be found here (note that although the module
                # is misnamed as '2order_packing', it is definitely the complex packing logic):
                # https://github.com/ecmwf/eccodes/blob/ac303936267ae99a9b3ae103e7d2db74674098e9/src/grib_accessor_class_data_g22order_packing.cc#L601
                data = ma.masked_array(data, mask=(data == 9999), fill_value=np.nan)

            data = data.reshape(self.shape)

        return data.__getitem__(keys)

@bjlittle
Copy link
Member

bjlittle commented Nov 17, 2023

@larsbarring and @greenlaw This issue should now be resolved thanks to #362.

This has been included in the 0.19.0 release, which is now available on conda-forge and PyPI.

Care to try it out and confirm whether this fixes your problem?

If so, feel free to close this issue 👍

@larsbarring
Copy link
Author

@bjlittle thanks for taking care of this! Here is my test:

$ ipython 
Python 3.11.0 | packaged by conda-forge | (main, Jan 14 2023, 12:27:40) [GCC 11.3.0]
Type 'copyright', 'credits' or 'license' for more information
IPython 8.10.0 -- An enhanced Interactive Python. Type '?' for help.

In [1]: import iris

In [2]: print(iris.__version__)
3.8.0.dev52

In [3]: import iris_grib
/home/a001257/mambaforge/envs/scitools/lib/python3.11/site-packages/gribapi/__init__.py:23: UserWarning: ecCodes 2.31.0 or higher is recommended. You are running version 2.29.0
  warnings.warn(

In [4]: print(iris_grib.__version__)
0.19.dev0

In [5]: d1 = iris.load_cube("sd_an_1961092306.grb").data

In [6]: print(d1.min(), d1.max())
0.0 0.0040847319178283215

In [7]: d2 = iris.load_cube("sd_an_1961092406.grb").data

In [8]: print(d2.min(), d2.max())
0.0 0.0

That is, the all-zero file is read without problems, as expected. 👍 👍 👍

However, when looking at the PR (#362) I see that if the bitsPerValue field is 0 then data is filled with with zeros. This is of course right for my particular file (and probably most other). However, as @greenlaw notes the GRIB documentation allows the same type of "packing" if all values are the same, be it zeroes or something else. To be on the safe side you might want to change line 254 in message.py to implement the formula from the GRIB doumentation, i.e something similar to what @greenlaw did (above):

reference_value = sections[5]["referenceValue"]
decimal_scale_factor = sections[5]["decimalScaleFactor"]
data = np.ones(self.shape) * reference_value / (10**decimal_scale_factor)

@bjlittle
Copy link
Member

@greenlaw Do you have an example GRIB2 file with a single message that demonstrates this behaviour?

I'm happy to add this extension as a patch i.e., 0.19.1

@greenlaw
Copy link

greenlaw commented Dec 1, 2023

@bjlittle @larsbarring Sorry, it was a while ago that I wrote that code, and I can't seem to find a file that demonstrates the non-zero missing codedValues behavior. It's possible that eccodes handles this behind the scenes and the formula I included above is unnecessary. If I'm able to find one I will let you know.

@bjlittle
Copy link
Member

bjlittle commented Dec 1, 2023

No problem @greenlaw, thanks for getting back to let me know 🍻

@larsbarring
Copy link
Author

I guess that files with all values in a field be exactly the same non-zero value are not that easy to come by. Hence I have hacked the test file included in my first post:

import struct
import os

with open("sd_an_1961092406.grb", mode='rb') as file:
    fileContent = file.read()

# position in file of referenceValue -- derived using grib_dump -O
refPosition = 16 + 21 + 81 + 34 + 11

# change referenceValue to a "reasonable number", 
# I did not bother what it actually meant or how it was packed into grib
ref = struct.pack("f",1.9999)
new = fileContent[0:refPosition] + ref + fileContent[refPosition+4:]

with open("sd_an_1961092406__NEW.grb", mode='wb') as file:
    file.write(new)

print("\nCheck that the new value landed where is supposed to (actual value is garbled):") 
os.system("grib_dump -O  sd_an_1961092406__NEW.grb | grep referenceValue")
print("\n\n\nAnd print how eccodes sees the data:")
os.system("grib_dump sd_an_1961092406__NEW.grb | tail -n 36")

import iris
import iris_grib

print(iris.__version__)
print(iris_grib.__version__)

cube = iris.load_cube("sd_an_1961092406__NEW.grb")
print(f"\n\nCUBE min = {cube.data.min()},   and max = {cube.data.max()}")

results in this printout:

Check that the new value landed where is supposed to (actual value is garbled):
12-15     referenceValue = -0.000482554



And print how eccodes sees the data:
  # A bit map applies to this product and is specified in this Section (grib2/tables/17/6.0.table)  
  bitMapIndicator = 0;
  bitmapPresent = 1;
  values(466641) =  {
  -0.000482554, -0.000482554, -0.000482554, -0.000482554, -0.000482554, 
  -0.000482554, -0.000482554, -0.000482554, -0.000482554, -0.000482554, 
  -0.000482554, -0.000482554, -0.000482554, -0.000482554, -0.000482554, 
  -0.000482554, -0.000482554, -0.000482554, -0.000482554, -0.000482554, 
  -0.000482554, -0.000482554, -0.000482554, -0.000482554, -0.000482554, 
  -0.000482554, -0.000482554, -0.000482554, -0.000482554, -0.000482554, 
  -0.000482554, -0.000482554, -0.000482554, -0.000482554, -0.000482554, 
  -0.000482554, -0.000482554, -0.000482554, -0.000482554, -0.000482554, 
  -0.000482554, -0.000482554, -0.000482554, -0.000482554, -0.000482554, 
  -0.000482554, -0.000482554, -0.000482554, -0.000482554, -0.000482554, 
  -0.000482554, -0.000482554, -0.000482554, -0.000482554, -0.000482554, 
  -0.000482554, -0.000482554, -0.000482554, -0.000482554, -0.000482554, 
  -0.000482554, -0.000482554, -0.000482554, -0.000482554, -0.000482554, 
  -0.000482554, -0.000482554, -0.000482554, -0.000482554, -0.000482554, 
  -0.000482554, -0.000482554, -0.000482554, -0.000482554, -0.000482554, 
  -0.000482554, -0.000482554, -0.000482554, -0.000482554, -0.000482554, 
  -0.000482554, -0.000482554, -0.000482554, -0.000482554, -0.000482554, 
  -0.000482554, -0.000482554, -0.000482554, -0.000482554, -0.000482554, 
  -0.000482554, -0.000482554, -0.000482554, -0.000482554, -0.000482554, 
  -0.000482554, -0.000482554, -0.000482554, -0.000482554, -0.000482554
  ... 466541 more values
  } 
  #-READ ONLY- maximum = -0.000482554;
  #-READ ONLY- minimum = -0.000482554;
  #-READ ONLY- average = -0.000482554;
  #-READ ONLY- standardDeviation = 0;
  #-READ ONLY- skewness = 0;
  #-READ ONLY- kurtosis = 0;
  #-READ ONLY- isConstant = 1;
  #-READ ONLY- numberOfMissing = 0;
  #-READ ONLY- getNumberOfValues = 466641;
}
/home/a001257/mambaforge/envs/scitools/lib/python3.11/site-packages/gribapi/__init__.py:23: UserWarning: ecCodes 2.31.0 or higher is recommended. You are running version 2.29.0
  warnings.warn(
3.8.0.dev52
0.19.dev0


CUBE min = 0.0,   and max = 0.0

@trexfeathers
Copy link
Contributor

Thanks very much @larsbarring! Resource permitting, we now have all we need to work on this

@larsbarring
Copy link
Author

@trexfeathers -- thanks

and if you change my code as follows the numbers becomes as one expects:

# put a "reasonable number"
ref = struct.pack(">f", 1.999)
bin = struct.pack("h", 0)
new = fileContent[0:refPosition] + ref + bin +fileContent[refPosition+6:]

where the grib-dump output now shows 1.999

@larsbarring
Copy link
Author

.... and now I happened to bounce into #265 ....

@bjlittle bjlittle removed their assignment Apr 9, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
Status: No status
Development

No branches or pull requests

4 participants