/* * GStreamer * Copyright (C) 2024 Collabora Ltd. * Authors: Daniel Morin * Vineet Suryan * Santosh Mahto * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with this library; if not, write to the * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, * Boston, MA 02110-1301, USA. */ /** * SECTION:element-yolosegv8tensordec * @short_description: Decode tensors from a FastSAM/YOLOv8 segmentation * models * * This element can parse per-buffer inference tensors meta data generated by an upstream * inference element * * ## Example launch command: * * Test image file, model file and labels file can be found here : * https://gitlab.collabora.com/gstreamer/onnx-models * * gst-launch-1.0 v4l2src device=/dev/video4 ! videorate max-rate=3 \ * ! videoconvertscale ! video/x-raw, pixel-aspect-ratio=1/1 \ * ! onnxinference \ * model-file=/home/dmorin/repos/onnx-models/models/yolov8s-seg.onnx \ * ! yolosegv8tensordec class-confidence-threshold=0.8 iou-threshold=0.3 \ * max-detections=100 \ * label-file=/home/dmorin/repos/onnx-models/labels/COCO_classes.txt \ * ! segmentationoverlay \ * ! glimagesink sink="gtkglsink processing-deadline=300000000 * * The original repository of the Yolo is located at * https://github.com/ultralytics/ultralytics. * For easy experimentation, a object segmentation model based on Yolo * architecture in Onnx format can be found at https://col.la/gstonnxmodelseg. * This model already has the required tensor-ids embedded in the model * It's also possible to embed tensor-ids into any model based on Yolo * architecture to allow this tensor-decoder to decode tensors. This process * is described in the Readme of this repository: https://col.la/gstonnxmodels" * * Since: 1.28 */ #ifdef HAVE_CONFI_H #include "config.h" #endif #include "gstyolosegtensordecoder.h" #include #include #include #define YOLO_SEGMENTATION_LOGITS "yolo-v8-segmentation-out-protos" GQuark YOLO_SEGMENTATION_LOGITS_TENSOR_ID; #define YOLO_SEGMENTATION_DETECTION_MASK "yolo-v8-segmentation-out-detections" GQuark YOLO_SEGMENTATION_DETECTION_MASK_ID; /* *INDENT-OFF* */ static GstStaticPadTemplate gst_yolo_seg_tensor_decoder_sink_template = GST_STATIC_PAD_TEMPLATE ("sink", GST_PAD_SINK, GST_PAD_ALWAYS, GST_STATIC_CAPS ("video/x-raw," "tensors=(structure)[" "tensorgroups," "yolo-v8-segmentation-out=(/uniquelist){" "(GstCaps)[" "tensor/strided," "tensor-id=yolo-v8-segmentation-out-detections," "dims=(int)<1, [1,max], [1,max]>," "dims-order=(string)col-major," "type=(string)float32" "]," "(GstCaps)[" "tensor/strided," "tensor-id=yolo-v8-segmentation-out-protos," "dims=(int)<1, [1,max], [1,max], [1,max]>," "dims-order=(string)col-major," "type=(string)float32" "]" "}" "]" )); /* *INDENT-ON* */ GST_DEBUG_CATEGORY_STATIC (yolo_seg_tensor_decoder_debug); #define GST_CAT_DEFAULT yolo_seg_tensor_decoder_debug GST_ELEMENT_REGISTER_DEFINE (yolo_seg_tensor_decoder, "yolosegv8tensordec", GST_RANK_SECONDARY, GST_TYPE_YOLO_SEG_TENSOR_DECODER); /* For debug purpose */ typedef struct _DebugCandidates { gpointer self; gsize fields; /* Fields count do debug */ gsize offset; /* Fields offset */ gsize start; /* First field index to debug */ } DebugCandidates; /* GstYoloSegTensorDecoder Prototypes */ static gboolean gst_yolo_seg_tensor_decoder_stop (GstBaseTransform * trans); static GstFlowReturn gst_yolo_seg_tensor_decoder_transform_ip (GstBaseTransform * trans, GstBuffer * buf); static void gst_yolo_seg_tensor_decoder_object_found (GstYoloTensorDecoder * od, GstAnalyticsRelationMeta * rmeta, BBox * bb, gfloat confidence, GQuark class_quark, const gfloat * candidate_masks, gsize offset, guint count); G_DEFINE_TYPE (GstYoloSegTensorDecoder, gst_yolo_seg_tensor_decoder, GST_TYPE_YOLO_TENSOR_DECODER); static gboolean gst_yolo_seg_tensor_decoder_stop (GstBaseTransform * trans) { GstYoloSegTensorDecoder *self = GST_YOLO_SEG_TENSOR_DECODER (trans); self->mask_w = 0; self->mask_h = 0; self->mask_length = 0; if (self->mask_pool) gst_buffer_pool_set_active (self->mask_pool, FALSE); g_clear_object (&self->mask_pool); return TRUE; } static void gst_yolo_seg_tensor_decoder_class_init (GstYoloSegTensorDecoderClass * klass) { GstElementClass *element_class = (GstElementClass *) klass; GstBaseTransformClass *basetransform_class = (GstBaseTransformClass *) klass; GstYoloTensorDecoderClass *od_class = (GstYoloTensorDecoderClass *) klass; /* Define GstYoloSegTensorDecoder debug category. */ GST_DEBUG_CATEGORY_INIT (yolo_seg_tensor_decoder_debug, "yolosegv8tensordec", 0, "Tensor decoder for Yolo segmentation models"); YOLO_SEGMENTATION_DETECTION_MASK_ID = g_quark_from_static_string (YOLO_SEGMENTATION_DETECTION_MASK); YOLO_SEGMENTATION_LOGITS_TENSOR_ID = g_quark_from_static_string (YOLO_SEGMENTATION_LOGITS); gst_element_class_set_static_metadata (element_class, "YOLO v8-11 segmentastion tensor decoder", "Tensordecoder/Video", "Decode tensors output from the inference of Yolo or FastSAM model (segmentation)" " on video frames. It works with YOLO version > 8 and FastSAM models.", "Daniel Morin , Santosh Mahto "); gst_element_class_add_pad_template (element_class, gst_static_pad_template_get (&gst_yolo_seg_tensor_decoder_sink_template)); basetransform_class->transform_ip = gst_yolo_seg_tensor_decoder_transform_ip; basetransform_class->stop = gst_yolo_seg_tensor_decoder_stop; od_class->object_found = gst_yolo_seg_tensor_decoder_object_found; /* Workaround hotdoc bug */ gst_type_mark_as_plugin_api (GST_TYPE_YOLO_TENSOR_DECODER, 0); } static void gst_yolo_seg_tensor_decoder_init (GstYoloSegTensorDecoder * self) { /* GstYoloSegTensorDecoder instance initialization */ self->mask_w = 0; self->mask_h = 0; self->mask_length = 0; self->mask_pool = NULL; memset (&self->mask_roi, 0, sizeof (BBox)); gst_base_transform_set_passthrough (GST_BASE_TRANSFORM (self), FALSE); } static gboolean gst_yolo_seg_tensor_decoder_get_tensors (GstYoloSegTensorDecoder * self, GstBuffer * buf, const GstTensor ** logits_tensor, const GstTensor ** detections_tensor) { GstMeta *meta = NULL; gpointer iter_state = NULL; if (!gst_buffer_get_meta (buf, GST_TENSOR_META_API_TYPE)) { GST_WARNING_OBJECT (self, "missing tensor meta from buffer %" GST_PTR_FORMAT, buf); return FALSE; } while ((meta = gst_buffer_iterate_meta_filtered (buf, &iter_state, GST_TENSOR_META_API_TYPE))) { GstTensorMeta *tmeta = (GstTensorMeta *) meta; const gsize YOLO_LOGITS_TENSOR_N_DIMS = 4; static const gsize logits_dims[4] = { 1, G_MAXSIZE, G_MAXSIZE, G_MAXSIZE }; const gsize YOLO_DETECTIONS_TENSOR_N_DIMS = 3; static const gsize detections_dims[3] = { 1, G_MAXSIZE, G_MAXSIZE }; *logits_tensor = gst_tensor_meta_get_typed_tensor (tmeta, YOLO_SEGMENTATION_LOGITS_TENSOR_ID, GST_TENSOR_DATA_TYPE_FLOAT32, GST_TENSOR_DIM_ORDER_COL_MAJOR, YOLO_LOGITS_TENSOR_N_DIMS, logits_dims); if (*logits_tensor == NULL) continue; *detections_tensor = gst_tensor_meta_get_typed_tensor (tmeta, YOLO_SEGMENTATION_DETECTION_MASK_ID, GST_TENSOR_DATA_TYPE_FLOAT32, GST_TENSOR_DIM_ORDER_COL_MAJOR, YOLO_DETECTIONS_TENSOR_N_DIMS, detections_dims); if (*detections_tensor == NULL) continue; guint num_masks = (*logits_tensor)->dims[1]; if ((*detections_tensor)->dims[1] < 4 + 1 + num_masks) { GST_WARNING_OBJECT (self, "Ignore tensor because dims[1] is %zu < %d", (*detections_tensor)->dims[1], 4 + 1 + num_masks); continue; } return TRUE; } return FALSE; } /* gst_yolo_seg_tensor_decoder_transform_ip: * @trans: Instance * @buf:inout: Buffer containing media and where tensors can be attached * @return: Flow errors * Decode Yolo tensors, post-process tensors and store decoded information * into an analytics-meta that is attached to the buffer before been pushed * downstream. */ static GstFlowReturn gst_yolo_seg_tensor_decoder_transform_ip (GstBaseTransform * trans, GstBuffer * buf) { GstYoloSegTensorDecoder *self = GST_YOLO_SEG_TENSOR_DECODER (trans); GstYoloTensorDecoder *od = GST_YOLO_TENSOR_DECODER (trans); GstAnalyticsRelationMeta *rmeta; gsize mask_w, mask_h; const GstTensor *detections_tensor; const GstTensor *logits_tensor; GstFlowReturn ret = GST_FLOW_OK; gboolean rv; if (!gst_yolo_seg_tensor_decoder_get_tensors (self, buf, &logits_tensor, &detections_tensor)) { GST_WARNING_OBJECT (self, "Couldn't find logit or detections tensor, skipping"); return GST_FLOW_OK; } rmeta = gst_buffer_add_analytics_relation_meta (buf); if (rmeta == NULL) { GST_ELEMENT_ERROR (trans, STREAM, FAILED, (NULL), ("Analytics Relation meta allocation failed")); return GST_FLOW_ERROR; } mask_w = logits_tensor->dims[2]; mask_h = logits_tensor->dims[3]; /* The detections need to be cropped to fit the SAR of the image. */ /* TODO: We're reconstructing the transformation that was done on the * original image based on the assumption that the complete image without * deformation would be analyzed. This assumption is not alway true and * we should try to find a way to convey this transformation information * and retrieve from here to know the transformation that need to be done * on the mask.*/ if (self->mask_w != mask_w || self->mask_h != mask_h) { self->mask_w = mask_w; self->mask_h = mask_h; self->mask_length = mask_w * mask_h; if (od->video_info.width > od->video_info.height) { self->bb2mask_gain = ((gfloat) self->mask_w) / od->video_info.width; self->mask_roi.x = 0; self->mask_roi.w = self->mask_w; self->mask_roi.h = ((gfloat) self->bb2mask_gain) * od->video_info.height; self->mask_roi.y = (self->mask_h - self->mask_roi.h) / 2; } else { self->bb2mask_gain = ((gfloat) self->mask_h) / od->video_info.height; self->mask_roi.y = 0; self->mask_roi.h = self->mask_h; self->mask_roi.w = self->bb2mask_gain * od->video_info.width; self->mask_roi.x = (self->mask_w - self->mask_roi.w) / 2; } if (self->mask_pool) { gst_buffer_pool_set_active (self->mask_pool, FALSE); g_clear_object (&self->mask_pool); } } if (self->mask_pool == NULL) { GstVideoInfo minfo; GstCaps *caps; gst_video_info_init (&minfo); gst_video_info_set_format (&minfo, GST_VIDEO_FORMAT_GRAY8, self->mask_w, self->mask_h); caps = gst_video_info_to_caps (&minfo);; self->mask_pool = gst_video_buffer_pool_new (); GstStructure *config = gst_buffer_pool_get_config (self->mask_pool); gst_buffer_pool_config_set_params (config, caps, self->mask_length, 0, 0); gst_buffer_pool_config_add_option (config, GST_BUFFER_POOL_OPTION_VIDEO_META); gst_buffer_pool_set_config (self->mask_pool, config); gst_buffer_pool_set_active (self->mask_pool, TRUE); gst_caps_unref (caps); } /* Retrieve memory at index 0 from logits_tensor in READ mode */ rv = gst_buffer_map (logits_tensor->data, &self->map_info_logits, GST_MAP_READ); if (!rv) { GST_ELEMENT_ERROR (self, STREAM, FAILED, (NULL), ("Couldn't map logits tensor buffer: %" GST_PTR_FORMAT, logits_tensor->data)); return GST_FLOW_ERROR; } self->logits_tensor = logits_tensor; if (!gst_yolo_tensor_decoder_decode_f32 (od, rmeta, detections_tensor, logits_tensor->dims[1])) ret = GST_FLOW_ERROR; gst_buffer_unmap (logits_tensor->data, &self->map_info_logits); return ret; } static float sigmoid (float x) { /* Check for positive overflow */ if (x > 0) { double exp_neg_x = exp (-x); return 1.0 / (1.0 + exp_neg_x); } /* Check for negative overflow and improve stability for negative x */ else { double exp_x = exp (x); return exp_x / (1.0 + exp_x); } } static void gst_yolo_seg_tensor_decoder_object_found (GstYoloTensorDecoder * od, GstAnalyticsRelationMeta * rmeta, BBox * bb, gfloat confidence, GQuark class_quark, const gfloat * candidate_masks, gsize offset, guint count) { GstYoloSegTensorDecoder *self = GST_YOLO_SEG_TENSOR_DECODER (od); GstAnalyticsODMtd od_mtd; GstBuffer *mask_buf = NULL; gfloat *data_logits = (gfloat *) self->map_info_logits.data; BBox bb_mask; GstFlowReturn flowret; GstMapInfo out_mask_info; guint region_ids[2] = { 0, count }; GstAnalyticsMtd seg_mtd; gst_analytics_relation_meta_add_od_mtd (rmeta, class_quark, bb->x, bb->y, bb->w, bb->h, confidence, &od_mtd); bb_mask.x = self->bb2mask_gain * bb->x + self->mask_roi.x; bb_mask.y = self->bb2mask_gain * bb->y + self->mask_roi.y; bb_mask.w = self->bb2mask_gain * bb->w; bb_mask.h = self->bb2mask_gain * bb->h; flowret = gst_buffer_pool_acquire_buffer (self->mask_pool, &mask_buf, NULL); g_assert (flowret == GST_FLOW_OK); gst_buffer_map (mask_buf, &out_mask_info, GST_MAP_READWRITE); GstVideoMeta *vmeta = gst_buffer_get_video_meta (mask_buf); g_assert (vmeta != NULL); vmeta->width = bb_mask.w; vmeta->height = bb_mask.h; #define MX_MAX (bb_mask.x + bb_mask.w) #define MY_MAX (bb_mask.y + bb_mask.h) for (gint my = bb_mask.y, i = 0; my < MY_MAX; my++) { for (gint mx = bb_mask.x; mx < MX_MAX; mx++, i++) { float sum = 0.0f; gint j = my * self->mask_w + mx; for (gsize k = 0; k < self->logits_tensor->dims[1]; ++k) { GST_TRACE_OBJECT (self, "protos data at ((mx=%d,my=%d)=%d, %zu) is %f", mx, my, j, k, data_logits[k * self->mask_length + j]); sum += candidate_masks[offset * k] * data_logits[k * self->mask_length + j]; } out_mask_info.data[i] = sigmoid (sum) > 0.5 ? count : 0; } } gst_analytics_relation_meta_add_segmentation_mtd (rmeta, mask_buf, GST_SEGMENTATION_TYPE_INSTANCE, 1, region_ids, bb->x, bb->y, bb->w, bb->h, &seg_mtd); gst_analytics_relation_meta_set_relation (rmeta, GST_ANALYTICS_REL_TYPE_RELATE_TO, od_mtd.id, seg_mtd.id); gst_analytics_relation_meta_set_relation (rmeta, GST_ANALYTICS_REL_TYPE_RELATE_TO, seg_mtd.id, od_mtd.id); gst_buffer_unmap (mask_buf, &out_mask_info); }