Upgrades H264 Decoder

This commit updates the h264 decoder to work with newer versions of libavcodec that ships with Ubuntu Noble. There are still some colour artifacts.
clydemcqueen · Jun 29, 2024 · 2f53c49 · 2f53c49
1 parent d961961
commit 2f53c49
Show file tree

Hide file tree

Showing 5 changed files with 97 additions and 85 deletions.
diff --git a/tello_driver/CMakeLists.txt b/tello_driver/CMakeLists.txt
@@ -70,6 +70,7 @@ set(DRIVER_NODE_DEPS
   class_loader
   cv_bridge
   geometry_msgs
+  rclcpp_components
   OpenCV
   rclcpp
   ros2_shared
@@ -127,7 +128,8 @@ set(JOY_NODE_DEPS
   geometry_msgs
   rclcpp
   sensor_msgs
-  tello_msgs)
+  tello_msgs
+  rclcpp_components)
 
 add_library(tello_joy_node SHARED
   ${JOY_NODE_SOURCES})

diff --git a/tello_driver/h264decoder/h264decoder.cpp b/tello_driver/h264decoder/h264decoder.cpp
@@ -1,49 +1,45 @@
 extern "C" {
 #include <libavcodec/avcodec.h>
 #include <libavutil/avutil.h>
+#include <libavutil/imgutils.h>
 #include <libavutil/mem.h>
 #include <libswscale/swscale.h>
 }
 
-#ifndef PIX_FMT_RGB24
-#define PIX_FMT_RGB24 AV_PIX_FMT_RGB24
-#endif
-
-#ifndef CODEC_CAP_TRUNCATED
-#define CODEC_CAP_TRUNCATED AV_CODEC_CAP_TRUNCATED
-#endif
-
-#ifndef CODEC_FLAG_TRUNCATED
-#define CODEC_FLAG_TRUNCATED AV_CODEC_FLAG_TRUNCATED
-#endif
+#include <iostream>
 
 #include "h264decoder.hpp"
-#include <utility>
 
 typedef unsigned char ubyte;
 
 /* For backward compatibility with release 9 or so of libav */
-#if (LIBAVCODEC_VERSION_MAJOR <= 54)
+#if (LIBAVCODEC_VERSION_MAJOR <= 54) 
 #  define av_frame_alloc avcodec_alloc_frame
-#  define av_frame_free  avcodec_free_frame
+#  define av_frame_free  avcodec_free_frame  
 #endif
 
 
 H264Decoder::H264Decoder()
+  : pkt_{std::make_unique<AVPacket>()}
 {
+#if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(58, 9, 100)
   avcodec_register_all();
+#endif
 
   codec = avcodec_find_decoder(AV_CODEC_ID_H264);
   if (!codec)
     throw H264InitFailure("cannot find decoder");
-
+  
   context = avcodec_alloc_context3(codec);
   if (!context)
     throw H264InitFailure("cannot allocate context");
 
-  if(codec->capabilities & CODEC_CAP_TRUNCATED) {
-    context->flags |= CODEC_FLAG_TRUNCATED;
+#if LIBAVCODEC_VERSION_MAJOR < 60
+  // Note: CODEC_CAP_TRUNCATED was prefixed with AV_...
+  if(codec->capabilities & AV_CODEC_CAP_TRUNCATED) {
+    context->flags |= AV_CODEC_FLAG_TRUNCATED;
   }
+#endif  
 
   int err = avcodec_open2(context, codec, nullptr);
   if (err < 0)
@@ -52,17 +48,14 @@ H264Decoder::H264Decoder()
   parser = av_parser_init(AV_CODEC_ID_H264);
   if (!parser)
     throw H264InitFailure("cannot init parser");
-
+  
   frame = av_frame_alloc();
   if (!frame)
     throw H264InitFailure("cannot allocate frame");
 
-#if 1
-  pkt = new AVPacket;
-  if (!pkt)
+  if (!pkt_)
     throw H264InitFailure("cannot allocate packet");
-  av_init_packet(pkt);
-#endif
+  av_init_packet(pkt_.get());
 }
 
 
@@ -72,34 +65,42 @@ H264Decoder::~H264Decoder()
   avcodec_close(context);
   av_free(context);
   av_frame_free(&frame);
-#if 1
-  delete pkt;
-#endif
 }
 
 
-ssize_t H264Decoder::parse(const ubyte* in_data, ssize_t in_size)
+ParseResult H264Decoder::parse(const ubyte* in_data, ptrdiff_t in_size)
 {
-  auto nread = av_parser_parse2(parser, context, &pkt->data, &pkt->size,
-    in_data, in_size,
-    0, 0, AV_NOPTS_VALUE);
-  return nread;
-}
-
-
-bool H264Decoder::is_frame_available() const
-{
-  return pkt->size > 0;
+  auto nread = av_parser_parse2(parser, context, &pkt_->data, &pkt_->size, 
+                                in_data, in_size, 
+                                0, 0, AV_NOPTS_VALUE);
+  // We might have a frame to decode. But what exactly is a packet? 
+  //Is it guaranteed to contain a complete frame?
+  if (pkt_->size)
+  {
+    const auto frame = decode_frame();
+    return ParseResult{nread, frame};
+  }
+  return ParseResult{nread, nullptr};
 }
 
 
-const AVFrame& H264Decoder::decode_frame()
+const AVFrame* H264Decoder::decode_frame()
 {
+#if (LIBAVCODEC_VERSION_MAJOR > 56)
+  int ret = avcodec_send_packet(context, pkt_.get());
+  if (!ret) {
+    ret = avcodec_receive_frame(context, frame);
+    if (!ret)
+      return frame;
+  }
+  return nullptr;
+#else
   int got_picture = 0;
   int nread = avcodec_decode_video2(context, frame, &got_picture, pkt);
   if (nread < 0 || got_picture == 0)
-    throw H264DecodeFailure("error decoding frame\n");
+    return nullptr;
   return *frame;
+#endif
 }
 
 
@@ -123,19 +124,19 @@ const AVFrame& ConverterRGB24::convert(const AVFrame &frame, ubyte* out_rgb)
   int w = frame.width;
   int h = frame.height;
   int pix_fmt = frame.format;
-
-  context = sws_getCachedContext(context,
-    w, h, (AVPixelFormat)pix_fmt,
-    w, h, AV_PIX_FMT_BGR24, SWS_BILINEAR,
-    nullptr, nullptr, nullptr);
+  
+  context = sws_getCachedContext(context, 
+                                 w, h, (AVPixelFormat)pix_fmt, 
+                                 w, h, AV_PIX_FMT_RGB24, SWS_BILINEAR,
+                                 nullptr, nullptr, nullptr);
   if (!context)
     throw H264DecodeFailure("cannot allocate context");
-
+  
   // Setup framergb with out_rgb as external buffer. Also say that we want RGB24 output.
-  avpicture_fill((AVPicture*)framergb, out_rgb, AV_PIX_FMT_BGR24, w, h);
+  av_image_fill_arrays(framergb->data, framergb->linesize, out_rgb, AV_PIX_FMT_RGB24, w, h, 1);
   // Do the conversion.
   sws_scale(context, frame.data, frame.linesize, 0, h,
-    framergb->data, framergb->linesize);
+            framergb->data, framergb->linesize);
   framergb->width = w;
   framergb->height = h;
   return *framergb;
@@ -144,15 +145,14 @@ const AVFrame& ConverterRGB24::convert(const AVFrame &frame, ubyte* out_rgb)
 /*
 Determine required size of framebuffer.
 
-avpicture_get_size is used in http://dranger.com/ffmpeg/tutorial01.html
-to do this. However, avpicture_get_size returns the size of a compact
-representation, without padding bytes. Since we use avpicture_fill to
+avpicture_get_size is used in http://dranger.com/ffmpeg/tutorial01.html 
+to do this. However, avpicture_get_size returns the size of a compact 
+representation, without padding bytes. Since we use av_image_fill_arrays to
 fill the buffer we should also use it to determine the required size.
 */
 int ConverterRGB24::predict_size(int w, int h)
 {
-  // TODO do we need this?
-  return avpicture_fill((AVPicture*)framergb, nullptr, AV_PIX_FMT_BGR24, w, h);
+  return av_image_fill_arrays(framergb->data, framergb->linesize, nullptr, AV_PIX_FMT_RGB24, w, h, 1);
 }
 
 

diff --git a/tello_driver/h264decoder/h264decoder.hpp b/tello_driver/h264decoder/h264decoder.hpp
@@ -1,24 +1,26 @@
 #pragma once
 /*
-This h264 decoder class  is just a thin wrapper around libav
-functions to decode h264 videos. It would have been easy to use
-libav directly in   the python module code but I like to keep these
+This h264 decoder class  is just a thin wrapper around libav 
+functions to decode h264 videos. It would have been easy to use 
+libav directly in   the python module code but I like to keep these 
 things separate.
-
+ 
 It is mostly based on roxlu's code. See
 http://roxlu.com/2014/039/decoding-h264-and-yuv420p-playback
 
 However, in contrast to roxlu's code the color space conversion is
 done by libav functions - so on the CPU, I suppose.
 
-Most functions/members throw exceptions. This way, error states are
-conveniently forwarded to python via the exception translation
-mechanisms of boost::python.
+Most functions/members throw exceptions. This way, error states are 
+conveniently forwarded to python via the exception translation 
+mechanisms of boost::python.  
 */
 
 // for ssize_t (signed int type as large as pointer type)
 #include <cstdlib>
 #include <stdexcept>
+#include <utility>
+#include <memory>
 
 struct AVCodecContext;
 struct AVFrame;
@@ -37,13 +39,20 @@ class H264Exception : public std::runtime_error
 class H264InitFailure : public H264Exception
 {
 public:
-  H264InitFailure(const char* s) : H264Exception(s) {}
+    H264InitFailure(const char* s) : H264Exception(s) {}
 };
 
 class H264DecodeFailure : public H264Exception
 {
 public:
-  H264DecodeFailure(const char* s) : H264Exception(s) {}
+    H264DecodeFailure(const char* s) : H264Exception(s) {}
+};
+
+
+struct ParseResult
+{
+  ptrdiff_t num_bytes_consumed = 0;
+  const AVFrame* frame = nullptr;
 };
 
 
@@ -52,43 +61,44 @@ class H264Decoder
   /* Persistent things here, using RAII for cleanup. */
   AVCodecContext        *context;
   AVFrame               *frame;
-  AVCodec               *codec;
+  const AVCodec               *codec;
   AVCodecParserContext  *parser;
-  /* In the documentation example on the github master branch, the
-packet is put on the heap. This is done here to store the pointers
-to the encoded data, which must be kept around  between calls to
-parse- and decode frame. In release 11 it is put on the stack, too.
+  /* In the documentation example on the github master branch, the 
+packet is put on the heap. This is done here to store the pointers 
+to the encoded data, which must be kept around  between calls to 
+parse- and decode frame. In release 11 it is put on the stack, too. 
   */
-  AVPacket              *pkt;
+  std::unique_ptr<AVPacket> pkt_;
+
+  const AVFrame* decode_frame();
+
 public:
   H264Decoder();
   ~H264Decoder();
-  /* First, parse a continuous data stream, dividing it into
-packets. When there is enough data to form a new frame, decode
-the data and return the frame. parse returns the number
-of consumed bytes of the input stream. It stops consuming
+  /* First, parse a continuous data stream, dividing it into 
+packets. When there is enough data to form a new frame, decode 
+the data and return the frame. parse returns the number 
+of consumed bytes of the input stream. It stops consuming 
 bytes at frame boundaries.
   */
-  ssize_t parse(const unsigned char* in_data, ssize_t in_size);
-  bool is_frame_available() const;
-  const AVFrame& decode_frame();
+  ParseResult parse(const unsigned char* in_data, ptrdiff_t in_size);
 };
 
 // TODO: Rename to OutputStage or so?!
 class ConverterRGB24
 {
   SwsContext *context;
   AVFrame *framergb;
-
+  
 public:
   ConverterRGB24();
   ~ConverterRGB24();
-
-  /*  Returns, given a width and height,
+   
+  /*  Returns, given a width and height, 
       how many bytes the frame buffer is going to need. */
   int predict_size(int w, int h);
-  /*  Given a decoded frame, convert it to RGB format and fill
-out_rgb with the result. Returns a AVFrame structure holding
+  /*  Given a decoded frame, convert it to RGB format and fill 
+out_rgb with the result. Returns a AVFrame structure holding 
 additional information about the RGB frame, such as the number of
 bytes in a row and so on. */
   const AVFrame& convert(const AVFrame &frame, unsigned char* out_rgb);

diff --git a/tello_driver/include/tello_driver_node.hpp b/tello_driver/include/tello_driver_node.hpp
@@ -1,7 +1,7 @@
 #include <asio.hpp>
 
 #include "rclcpp/rclcpp.hpp"
-#include "cv_bridge/cv_bridge.h"
+#include "cv_bridge/cv_bridge.hpp"
 #include "geometry_msgs/msg/twist.hpp"
 #include "sensor_msgs/msg/camera_info.hpp"
 #include "tello_msgs/msg/flight_data.hpp"

diff --git a/tello_driver/src/video_socket.cpp b/tello_driver/src/video_socket.cpp
@@ -75,12 +75,12 @@ namespace tello_driver
     try {
       while (next < seq_buffer_next_) {
         // Parse h264
-        ssize_t consumed = decoder_.parse(seq_buffer_.data() + next, seq_buffer_next_ - next);
+        auto result = decoder_.parse(seq_buffer_.data() + next, seq_buffer_next_ - next);
 
         // Is a frame available?
-        if (decoder_.is_frame_available()) {
+        if (result.frame) {
           // Decode the frame
-          const AVFrame &frame = decoder_.decode_frame();
+          const AVFrame &frame = *result.frame;
 
           // Convert pixels from YUV420P to BGR24
           int size = converter_.predict_size(frame.width, frame.height);
@@ -113,7 +113,7 @@ namespace tello_driver
           }
         }
 
-        next += consumed;
+        next += result.num_bytes_consumed;
       }
     }
     catch (std::runtime_error e) {