From 2342c05e43573d3045e1648c586c613b95766080 Mon Sep 17 00:00:00 2001 From: Paul B Mahol Date: Sat, 3 Jun 2023 10:49:08 +0200 Subject: [PATCH] avcodec/magicyuvenc: put some slice work under parallel execution Speeds up slice threaded encoding. --- libavcodec/magicyuvenc.c | 96 +++++++++++++++++++++++++++++++--------- 1 file changed, 74 insertions(+), 22 deletions(-) diff --git a/libavcodec/magicyuvenc.c b/libavcodec/magicyuvenc.c index a12ef5a33d..082e2846c7 100644 --- a/libavcodec/magicyuvenc.c +++ b/libavcodec/magicyuvenc.c @@ -65,8 +65,12 @@ typedef struct MagicYUVContext { int hshift[4]; int vshift[4]; uint8_t **slices; + uint8_t **bitslices; + unsigned bitslice_size; unsigned *slice_pos; + unsigned *slice_size; unsigned tables_size; + PTable *counts; uint8_t *decorrelate_buf[2]; HuffEntry he[4][256]; LLVidEncDSPContext llvidencdsp; @@ -206,15 +210,20 @@ static av_cold int magy_encode_init(AVCodecContext *avctx) s->nb_slices = FFMAX(1, s->nb_slices); s->slice_height = FFALIGN((avctx->height + s->nb_slices - 1) / s->nb_slices, 1 << s->vshift[1]); s->slice_pos = av_calloc(s->nb_slices * s->planes, sizeof(*s->slice_pos)); + s->slice_size = av_calloc(s->nb_slices * s->planes, sizeof(*s->slice_size)); s->slices = av_calloc(s->nb_slices * s->planes, sizeof(*s->slices)); - if (!s->slices || !s->slice_pos) + s->bitslices = av_calloc(s->nb_slices * s->planes, sizeof(*s->bitslices)); + s->counts = av_calloc(s->nb_slices * s->planes * 256, sizeof(*s->counts)); + if (!s->slices || !s->slice_pos || !s->counts || !s->slice_size) return AVERROR(ENOMEM); + s->bitslice_size = avctx->width * (s->slice_height + 2) + AV_INPUT_BUFFER_PADDING_SIZE; for (int n = 0; n < s->nb_slices; n++) { for (int i = 0; i < s->planes; i++) { + s->bitslices[n * s->planes + i] = av_malloc(s->bitslice_size); s->slices[n * s->planes + i] = av_malloc(avctx->width * (s->slice_height + 2) + - AV_INPUT_BUFFER_PADDING_SIZE); - if (!s->slices[n * s->planes + i]) { + AV_INPUT_BUFFER_PADDING_SIZE); + if (!s->slices[n * s->planes + i] || !s->bitslices[n * s->planes + i]) { av_log(avctx, AV_LOG_ERROR, "Cannot allocate temporary buffer.\n"); return AVERROR(ENOMEM); } @@ -358,6 +367,20 @@ static void magy_huffman_compute_bits(PTable *prob_table, HuffEntry *distincts, } } +static int count_plane_slice(AVCodecContext *avctx, int n, int plane) +{ + MagicYUVContext *s = avctx->priv_data; + const uint8_t *dst = s->slices[n * s->planes + plane]; + PTable *counts = s->counts + 256 * (n * s->planes + plane); + + memset(counts, 0, sizeof(*counts) * 256); + + count_usage(dst, AV_CEIL_RSHIFT(avctx->width, s->hshift[plane]), + AV_CEIL_RSHIFT(s->slice_height, s->vshift[plane]), counts); + + return 0; +} + static int encode_table(AVCodecContext *avctx, PutBitContext *pb, HuffEntry *he, int plane) { @@ -366,15 +389,15 @@ static int encode_table(AVCodecContext *avctx, uint16_t codes_counts[33] = { 0 }; for (int n = 0; n < s->nb_slices; n++) { - const uint8_t *dst = s->slices[n * s->planes + plane]; + PTable *slice_counts = s->counts + 256 * (n * s->planes + plane); - count_usage(dst, AV_CEIL_RSHIFT(avctx->width, s->hshift[plane]), - AV_CEIL_RSHIFT(s->slice_height, s->vshift[plane]), counts); + for (int i = 0; i < 256; i++) + counts[i].prob = slice_counts[i].prob; + } - for (int i = 0; i < 256; i++) { - counts[i].prob++; - counts[i].value = i; - } + for (int i = 0; i < 256; i++) { + counts[i].prob++; + counts[i].value = i; } magy_huffman_compute_bits(counts, he, codes_counts, 256, 12); @@ -389,8 +412,8 @@ static int encode_table(AVCodecContext *avctx, return 0; } -static int encode_slice(uint8_t *src, uint8_t *dst, int dst_size, - int width, int height, HuffEntry *he, int prediction) +static int encode_plane_slice(uint8_t *src, uint8_t *dst, int dst_size, + int width, int height, HuffEntry *he, int prediction) { PutBitContext pb; int i, j; @@ -420,6 +443,31 @@ static int encode_slice(uint8_t *src, uint8_t *dst, int dst_size, return put_bytes_output(&pb); } +static int encode_slice(AVCodecContext *avctx, void *tdata, + int n, int threadnr) +{ + MagicYUVContext *s = avctx->priv_data; + const int slice_height = s->slice_height; + const int last_height = FFMIN(slice_height, avctx->height - n * slice_height); + const int height = (n < (s->nb_slices - 1)) ? slice_height : last_height; + PutByteContext pb; + + for (int i = 0; i < s->planes; i++) { + bytestream2_init_writer(&pb, s->bitslices[n + s->planes + i], + s->bitslice_size); + + s->slice_size[n * s->planes + i] = + encode_plane_slice(s->slices[n * s->planes + i], + s->bitslices[n * s->planes + i], + bytestream2_get_bytes_left_p(&pb), + AV_CEIL_RSHIFT(avctx->width, s->hshift[i]), + AV_CEIL_RSHIFT(height, s->vshift[i]), + s->he[i], s->frame_pred); + } + + return 0; +} + static int predict_slice(AVCodecContext *avctx, void *tdata, int n, int threadnr) { @@ -469,6 +517,9 @@ static int predict_slice(AVCodecContext *avctx, void *tdata, } } + for (int p = 0; p < s->planes; p++) + count_plane_slice(avctx, n, p); + return 0; } @@ -528,18 +579,14 @@ static int magy_encode_frame(AVCodecContext *avctx, AVPacket *pkt, s->tables_size = put_bytes_count(&s->pb, 1); bytestream2_skip_p(&pb, s->tables_size); + avctx->execute2(avctx, encode_slice, NULL, NULL, s->nb_slices); + for (int n = 0; n < s->nb_slices; n++) { for (int i = 0; i < s->planes; i++) { - unsigned slice_size; - s->slice_pos[n * s->planes + i] = bytestream2_tell_p(&pb); - slice_size = encode_slice(s->slices[n * s->planes + i], - pkt->data + bytestream2_tell_p(&pb), - bytestream2_get_bytes_left_p(&pb), - AV_CEIL_RSHIFT(frame->width, s->hshift[i]), - AV_CEIL_RSHIFT(slice_height, s->vshift[i]), - s->he[i], s->frame_pred); - bytestream2_skip_p(&pb, slice_size); + + bytestream2_put_buffer(&pb, s->bitslices[n * s->planes + i], + s->slice_size[n * s->planes + i]); } } @@ -564,9 +611,14 @@ static av_cold int magy_encode_close(AVCodecContext *avctx) MagicYUVContext *s = avctx->priv_data; av_freep(&s->slice_pos); - for (int i = 0; i < s->planes && s->slices; i++) + av_freep(&s->slice_size); + for (int i = 0; i < s->planes * s->nb_slices && s->slices; i++) av_freep(&s->slices[i]); + for (int i = 0; i < s->planes * s->nb_slices && s->bitslices; i++) + av_freep(&s->bitslices[i]); + av_freep(&s->counts); av_freep(&s->slices); + av_freep(&s->bitslices); av_freep(&s->decorrelate_buf); return 0;