avcodec/vvc: simplify priority logic to improve performance for 4K/8K
For 4K/8K video processing, it's possible to have over 1,000 tasks pending on the executor. In such cases, O(n) and O(log(n)) insertion times are too costly. Reducing this to O(1) will significantly decrease the time spent in critical sections.

clip                                                        | before | after  | delta
------------------------------------------------------------|--------|--------|-------
VVC_HDR_UHDTV2_OpenGOP_7680x4320_50fps_HLG10.bit            | 24     | 27     | 12.5%
VVC_HDR_UHDTV2_OpenGOP_7680x4320_50fps_HLG10_HighBitrate.bit| 12     | 17     | 41.7%
tears_of_steel_4k_8M_8bit_2000.vvc                          | 34     | 102    | 200.0%
VVC_UHDTV1_OpenGOP_3840x2160_60fps_HLG10.bit                | 126    | 128    | 1.6%
RitualDance_1920x1080_60_10_420_37_RA.266                   | 350    | 378    | 8.0%
NovosobornayaSquare_1920x1080.bin                           | 341    | 369    | 8.2%
Tango2_3840x2160_60_10_420_27_LD.266                        | 69     | 70     | 1.4%
RitualDance_1920x1080_60_10_420_32_LD.266                   | 243    | 259    | 6.6%
Chimera_8bit_1080P_1000_frames.vvc                          | 420    | 392    | -6.7%
BQTerrace_1920x1080_60_10_420_22_RA.vvc                     | 148    | 144    | -2.7%
This commit is contained in:
parent
40a14ef970
commit
846fbc395b
3 changed files with 55 additions and 50 deletions
|
@ -48,6 +48,11 @@ typedef struct ThreadInfo {
|
||||||
ExecutorThread thread;
|
ExecutorThread thread;
|
||||||
} ThreadInfo;
|
} ThreadInfo;
|
||||||
|
|
||||||
|
typedef struct Queue {
|
||||||
|
FFTask *head;
|
||||||
|
FFTask *tail;
|
||||||
|
} Queue;
|
||||||
|
|
||||||
struct FFExecutor {
|
struct FFExecutor {
|
||||||
FFTaskCallbacks cb;
|
FFTaskCallbacks cb;
|
||||||
int thread_count;
|
int thread_count;
|
||||||
|
@ -60,29 +65,39 @@ struct FFExecutor {
|
||||||
AVCond cond;
|
AVCond cond;
|
||||||
int die;
|
int die;
|
||||||
|
|
||||||
FFTask *tasks;
|
Queue *q;
|
||||||
};
|
};
|
||||||
|
|
||||||
static FFTask* remove_task(FFTask **prev, FFTask *t)
|
static FFTask* remove_task(Queue *q)
|
||||||
{
|
{
|
||||||
*prev = t->next;
|
FFTask *t = q->head;
|
||||||
t->next = NULL;
|
if (t) {
|
||||||
|
q->head = t->next;
|
||||||
|
t->next = NULL;
|
||||||
|
if (!q->head)
|
||||||
|
q->tail = NULL;
|
||||||
|
}
|
||||||
return t;
|
return t;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void add_task(FFTask **prev, FFTask *t)
|
static void add_task(Queue *q, FFTask *t)
|
||||||
{
|
{
|
||||||
t->next = *prev;
|
t->next = NULL;
|
||||||
*prev = t;
|
if (!q->head)
|
||||||
|
q->tail = q->head = t;
|
||||||
|
else
|
||||||
|
q->tail = q->tail->next = t;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int run_one_task(FFExecutor *e, void *lc)
|
static int run_one_task(FFExecutor *e, void *lc)
|
||||||
{
|
{
|
||||||
FFTaskCallbacks *cb = &e->cb;
|
FFTaskCallbacks *cb = &e->cb;
|
||||||
FFTask **prev = &e->tasks;
|
FFTask *t = NULL;
|
||||||
|
|
||||||
if (*prev) {
|
for (int i = 0; i < e->cb.priorities && !t; i++)
|
||||||
FFTask *t = remove_task(prev, *prev);
|
t = remove_task(e->q + i);
|
||||||
|
|
||||||
|
if (t) {
|
||||||
if (e->thread_count > 0)
|
if (e->thread_count > 0)
|
||||||
ff_mutex_unlock(&e->lock);
|
ff_mutex_unlock(&e->lock);
|
||||||
cb->run(t, lc, cb->user_data);
|
cb->run(t, lc, cb->user_data);
|
||||||
|
@ -132,6 +147,7 @@ static void executor_free(FFExecutor *e, const int has_lock, const int has_cond)
|
||||||
ff_mutex_destroy(&e->lock);
|
ff_mutex_destroy(&e->lock);
|
||||||
|
|
||||||
av_free(e->threads);
|
av_free(e->threads);
|
||||||
|
av_free(e->q);
|
||||||
av_free(e->local_contexts);
|
av_free(e->local_contexts);
|
||||||
|
|
||||||
av_free(e);
|
av_free(e);
|
||||||
|
@ -141,7 +157,7 @@ FFExecutor* ff_executor_alloc(const FFTaskCallbacks *cb, int thread_count)
|
||||||
{
|
{
|
||||||
FFExecutor *e;
|
FFExecutor *e;
|
||||||
int has_lock = 0, has_cond = 0;
|
int has_lock = 0, has_cond = 0;
|
||||||
if (!cb || !cb->user_data || !cb->run || !cb->priority_higher)
|
if (!cb || !cb->user_data || !cb->run || !cb->priorities)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
e = av_mallocz(sizeof(*e));
|
e = av_mallocz(sizeof(*e));
|
||||||
|
@ -153,6 +169,10 @@ FFExecutor* ff_executor_alloc(const FFTaskCallbacks *cb, int thread_count)
|
||||||
if (!e->local_contexts)
|
if (!e->local_contexts)
|
||||||
goto free_executor;
|
goto free_executor;
|
||||||
|
|
||||||
|
e->q = av_calloc(e->cb.priorities, sizeof(Queue));
|
||||||
|
if (!e->q)
|
||||||
|
goto free_executor;
|
||||||
|
|
||||||
e->threads = av_calloc(FFMAX(thread_count, 1), sizeof(*e->threads));
|
e->threads = av_calloc(FFMAX(thread_count, 1), sizeof(*e->threads));
|
||||||
if (!e->threads)
|
if (!e->threads)
|
||||||
goto free_executor;
|
goto free_executor;
|
||||||
|
@ -192,16 +212,10 @@ void ff_executor_free(FFExecutor **executor)
|
||||||
|
|
||||||
void ff_executor_execute(FFExecutor *e, FFTask *t)
|
void ff_executor_execute(FFExecutor *e, FFTask *t)
|
||||||
{
|
{
|
||||||
FFTaskCallbacks *cb = &e->cb;
|
|
||||||
FFTask **prev;
|
|
||||||
|
|
||||||
if (e->thread_count)
|
if (e->thread_count)
|
||||||
ff_mutex_lock(&e->lock);
|
ff_mutex_lock(&e->lock);
|
||||||
if (t) {
|
if (t)
|
||||||
for (prev = &e->tasks; *prev && cb->priority_higher(*prev, t); prev = &(*prev)->next)
|
add_task(e->q + t->priority % e->cb.priorities, t);
|
||||||
/* nothing */;
|
|
||||||
add_task(prev, t);
|
|
||||||
}
|
|
||||||
if (e->thread_count) {
|
if (e->thread_count) {
|
||||||
ff_cond_signal(&e->cond);
|
ff_cond_signal(&e->cond);
|
||||||
ff_mutex_unlock(&e->lock);
|
ff_mutex_unlock(&e->lock);
|
||||||
|
|
|
@ -32,6 +32,7 @@ typedef struct FFTask FFTask;
|
||||||
|
|
||||||
struct FFTask {
|
struct FFTask {
|
||||||
FFTask *next;
|
FFTask *next;
|
||||||
|
int priority; // task priority should be >= 0 and < FFTaskCallbacks.priorities
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef struct FFTaskCallbacks {
|
typedef struct FFTaskCallbacks {
|
||||||
|
@ -39,8 +40,8 @@ typedef struct FFTaskCallbacks {
|
||||||
|
|
||||||
int local_context_size;
|
int local_context_size;
|
||||||
|
|
||||||
// return 1 if a's priority > b's priority
|
// how many priorities do we have?
|
||||||
int (*priority_higher)(const FFTask *a, const FFTask *b);
|
int priorities;
|
||||||
|
|
||||||
// run the task
|
// run the task
|
||||||
int (*run)(FFTask *t, void *local_context, void *user_data);
|
int (*run)(FFTask *t, void *local_context, void *user_data);
|
||||||
|
|
|
@ -103,13 +103,28 @@ typedef struct VVCFrameThread {
|
||||||
AVCond cond;
|
AVCond cond;
|
||||||
} VVCFrameThread;
|
} VVCFrameThread;
|
||||||
|
|
||||||
|
#define PRIORITY_LOWEST 2
|
||||||
static void add_task(VVCContext *s, VVCTask *t)
|
static void add_task(VVCContext *s, VVCTask *t)
|
||||||
{
|
{
|
||||||
VVCFrameThread *ft = t->fc->ft;
|
VVCFrameThread *ft = t->fc->ft;
|
||||||
|
FFTask *task = &t->u.task;
|
||||||
|
const int priorities[] = {
|
||||||
|
0, // VVC_TASK_STAGE_INIT,
|
||||||
|
0, // VVC_TASK_STAGE_PARSE,
|
||||||
|
// For an 8K clip, a CTU line completed in the reference frame may trigger 64 or more inter tasks.
|
||||||
|
// We assign these tasks the lowest priority to avoid being overwhelmed with inter tasks.
|
||||||
|
PRIORITY_LOWEST, // VVC_TASK_STAGE_INTER
|
||||||
|
1, // VVC_TASK_STAGE_RECON,
|
||||||
|
1, // VVC_TASK_STAGE_LMCS,
|
||||||
|
1, // VVC_TASK_STAGE_DEBLOCK_V,
|
||||||
|
1, // VVC_TASK_STAGE_DEBLOCK_H,
|
||||||
|
1, // VVC_TASK_STAGE_SAO,
|
||||||
|
1, // VVC_TASK_STAGE_ALF,
|
||||||
|
};
|
||||||
|
|
||||||
atomic_fetch_add(&ft->nb_scheduled_tasks, 1);
|
atomic_fetch_add(&ft->nb_scheduled_tasks, 1);
|
||||||
|
task->priority = priorities[t->stage];
|
||||||
ff_executor_execute(s->executor, &t->u.task);
|
ff_executor_execute(s->executor, task);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void task_init(VVCTask *t, VVCTaskStage stage, VVCFrameContext *fc, const int rx, const int ry)
|
static void task_init(VVCTask *t, VVCTaskStage stage, VVCFrameContext *fc, const int rx, const int ry)
|
||||||
|
@ -372,31 +387,6 @@ static int task_is_stage_ready(VVCTask *t, int add)
|
||||||
return task_has_target_score(t, stage, score);
|
return task_has_target_score(t, stage, score);
|
||||||
}
|
}
|
||||||
|
|
||||||
#define CHECK(a, b) \
|
|
||||||
do { \
|
|
||||||
if ((a) != (b)) \
|
|
||||||
return (a) < (b); \
|
|
||||||
} while (0)
|
|
||||||
|
|
||||||
static int task_priority_higher(const FFTask *_a, const FFTask *_b)
|
|
||||||
{
|
|
||||||
const VVCTask *a = (const VVCTask*)_a;
|
|
||||||
const VVCTask *b = (const VVCTask*)_b;
|
|
||||||
|
|
||||||
|
|
||||||
if (a->stage <= VVC_TASK_STAGE_PARSE || b->stage <= VVC_TASK_STAGE_PARSE) {
|
|
||||||
CHECK(a->stage, b->stage);
|
|
||||||
CHECK(a->fc->decode_order, b->fc->decode_order); //decode order
|
|
||||||
CHECK(a->ry, b->ry);
|
|
||||||
return a->rx < b->rx;
|
|
||||||
}
|
|
||||||
|
|
||||||
CHECK(a->fc->decode_order, b->fc->decode_order); //decode order
|
|
||||||
CHECK(a->rx + a->ry + a->stage, b->rx + b->ry + b->stage); //zigzag with type
|
|
||||||
CHECK(a->rx + a->ry, b->rx + b->ry); //zigzag
|
|
||||||
return a->ry < b->ry;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void check_colocation(VVCContext *s, VVCTask *t)
|
static void check_colocation(VVCContext *s, VVCTask *t)
|
||||||
{
|
{
|
||||||
const VVCFrameContext *fc = t->fc;
|
const VVCFrameContext *fc = t->fc;
|
||||||
|
@ -681,7 +671,7 @@ FFExecutor* ff_vvc_executor_alloc(VVCContext *s, const int thread_count)
|
||||||
FFTaskCallbacks callbacks = {
|
FFTaskCallbacks callbacks = {
|
||||||
s,
|
s,
|
||||||
sizeof(VVCLocalContext),
|
sizeof(VVCLocalContext),
|
||||||
task_priority_higher,
|
PRIORITY_LOWEST + 1,
|
||||||
task_run,
|
task_run,
|
||||||
};
|
};
|
||||||
return ff_executor_alloc(&callbacks, thread_count);
|
return ff_executor_alloc(&callbacks, thread_count);
|
||||||
|
|
Loading…
Add table
Reference in a new issue