forked from FFmpeg/FFmpeg
aarch64: Lowercase UXTW/SXTW and similar flags
Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
parent
184103b310
commit
93cda5a9c2
6 changed files with 105 additions and 105 deletions
|
@ -38,7 +38,7 @@ function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
|
||||||
lsl w9, w9, #3
|
lsl w9, w9, #3
|
||||||
lsl w10, w10, #1
|
lsl w10, w10, #1
|
||||||
add w9, w9, w10
|
add w9, w9, w10
|
||||||
add x6, x6, w9, UXTW
|
add x6, x6, w9, uxtw
|
||||||
ld1r {v22.8h}, [x6]
|
ld1r {v22.8h}, [x6]
|
||||||
.endif
|
.endif
|
||||||
.ifc \codec,vc1
|
.ifc \codec,vc1
|
||||||
|
@ -208,7 +208,7 @@ function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
|
||||||
lsl w9, w9, #3
|
lsl w9, w9, #3
|
||||||
lsl w10, w10, #1
|
lsl w10, w10, #1
|
||||||
add w9, w9, w10
|
add w9, w9, w10
|
||||||
add x6, x6, w9, UXTW
|
add x6, x6, w9, uxtw
|
||||||
ld1r {v22.8h}, [x6]
|
ld1r {v22.8h}, [x6]
|
||||||
.endif
|
.endif
|
||||||
.ifc \codec,vc1
|
.ifc \codec,vc1
|
||||||
|
|
|
@ -385,7 +385,7 @@ function ff_h264_idct8_add4_neon, export=1
|
||||||
movrel x14, .L_ff_h264_idct8_add_neon
|
movrel x14, .L_ff_h264_idct8_add_neon
|
||||||
1: ldrb w9, [x7], #4
|
1: ldrb w9, [x7], #4
|
||||||
ldrsw x0, [x5], #16
|
ldrsw x0, [x5], #16
|
||||||
ldrb w9, [x4, w9, UXTW]
|
ldrb w9, [x4, w9, uxtw]
|
||||||
subs w9, w9, #1
|
subs w9, w9, #1
|
||||||
b.lt 2f
|
b.lt 2f
|
||||||
ldrsh w11, [x1]
|
ldrsh w11, [x1]
|
||||||
|
|
|
@ -186,13 +186,13 @@ function ff_bwdif_filter_line3_neon, export=1
|
||||||
mov w10, w6 // w10 = loop count
|
mov w10, w6 // w10 = loop count
|
||||||
neg w9, w5 // w9 = mref
|
neg w9, w5 // w9 = mref
|
||||||
lsl w8, w9, #1 // w8 = mref2
|
lsl w8, w9, #1 // w8 = mref2
|
||||||
add w7, w9, w9, LSL #1 // w7 = mref3
|
add w7, w9, w9, lsl #1 // w7 = mref3
|
||||||
lsl w6, w9, #2 // w6 = mref4
|
lsl w6, w9, #2 // w6 = mref4
|
||||||
mov w11, w5 // w11 = pref
|
mov w11, w5 // w11 = pref
|
||||||
lsl w12, w5, #1 // w12 = pref2
|
lsl w12, w5, #1 // w12 = pref2
|
||||||
add w13, w5, w5, LSL #1 // w13 = pref3
|
add w13, w5, w5, lsl #1 // w13 = pref3
|
||||||
lsl w14, w5, #2 // w14 = pref4
|
lsl w14, w5, #2 // w14 = pref4
|
||||||
add w15, w5, w5, LSL #2 // w15 = pref5
|
add w15, w5, w5, lsl #2 // w15 = pref5
|
||||||
add w16, w14, w12 // w16 = pref6
|
add w16, w14, w12 // w16 = pref6
|
||||||
|
|
||||||
lsl w5, w1, #1 // w5 = d_stride * 2
|
lsl w5, w1, #1 // w5 = d_stride * 2
|
||||||
|
|
|
@ -35,10 +35,10 @@
|
||||||
|
|
||||||
function ff_compute_safe_ssd_integral_image_neon, export=1
|
function ff_compute_safe_ssd_integral_image_neon, export=1
|
||||||
movi v26.4s, #0 // used as zero for the "rotations" in acc_sum_store
|
movi v26.4s, #0 // used as zero for the "rotations" in acc_sum_store
|
||||||
sub x3, x3, w6, UXTW // s1 padding (s1_linesize - w)
|
sub x3, x3, w6, uxtw // s1 padding (s1_linesize - w)
|
||||||
sub x5, x5, w6, UXTW // s2 padding (s2_linesize - w)
|
sub x5, x5, w6, uxtw // s2 padding (s2_linesize - w)
|
||||||
sub x9, x0, w1, UXTW #2 // dst_top
|
sub x9, x0, w1, uxtw #2 // dst_top
|
||||||
sub x1, x1, w6, UXTW // dst padding (dst_linesize_32 - w)
|
sub x1, x1, w6, uxtw // dst padding (dst_linesize_32 - w)
|
||||||
lsl x1, x1, #2 // dst padding expressed in bytes
|
lsl x1, x1, #2 // dst padding expressed in bytes
|
||||||
1: mov w10, w6 // width copy for each line
|
1: mov w10, w6 // width copy for each line
|
||||||
sub x0, x0, #16 // beginning of the dst line minus 4 sums
|
sub x0, x0, #16 // beginning of the dst line minus 4 sums
|
||||||
|
|
|
@ -54,10 +54,10 @@ function ff_hscale8to15_X8_neon, export=1
|
||||||
movi v1.2d, #0 // val sum part 2 (for dst[1])
|
movi v1.2d, #0 // val sum part 2 (for dst[1])
|
||||||
movi v2.2d, #0 // val sum part 3 (for dst[2])
|
movi v2.2d, #0 // val sum part 3 (for dst[2])
|
||||||
movi v3.2d, #0 // val sum part 4 (for dst[3])
|
movi v3.2d, #0 // val sum part 4 (for dst[3])
|
||||||
add x17, x3, w8, UXTW // srcp + filterPos[0]
|
add x17, x3, w8, uxtw // srcp + filterPos[0]
|
||||||
add x8, x3, w0, UXTW // srcp + filterPos[1]
|
add x8, x3, w0, uxtw // srcp + filterPos[1]
|
||||||
add x0, x3, w11, UXTW // srcp + filterPos[2]
|
add x0, x3, w11, uxtw // srcp + filterPos[2]
|
||||||
add x11, x3, w9, UXTW // srcp + filterPos[3]
|
add x11, x3, w9, uxtw // srcp + filterPos[3]
|
||||||
mov w15, w6 // filterSize counter
|
mov w15, w6 // filterSize counter
|
||||||
2: ld1 {v4.8b}, [x17], #8 // srcp[filterPos[0] + {0..7}]
|
2: ld1 {v4.8b}, [x17], #8 // srcp[filterPos[0] + {0..7}]
|
||||||
ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
|
ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
|
||||||
|
@ -231,14 +231,14 @@ function ff_hscale8to15_4_neon, export=1
|
||||||
add x5, x5, #32 // advance filterPos
|
add x5, x5, #32 // advance filterPos
|
||||||
|
|
||||||
// gather random access data from src into contiguous memory
|
// gather random access data from src into contiguous memory
|
||||||
ldr w8, [x3, w8, UXTW] // src[filterPos[idx + 0]][0..3]
|
ldr w8, [x3, w8, uxtw] // src[filterPos[idx + 0]][0..3]
|
||||||
ldr w9, [x3, w9, UXTW] // src[filterPos[idx + 1]][0..3]
|
ldr w9, [x3, w9, uxtw] // src[filterPos[idx + 1]][0..3]
|
||||||
ldr w10, [x3, w10, UXTW] // src[filterPos[idx + 2]][0..3]
|
ldr w10, [x3, w10, uxtw] // src[filterPos[idx + 2]][0..3]
|
||||||
ldr w11, [x3, w11, UXTW] // src[filterPos[idx + 3]][0..3]
|
ldr w11, [x3, w11, uxtw] // src[filterPos[idx + 3]][0..3]
|
||||||
ldr w12, [x3, w12, UXTW] // src[filterPos[idx + 4]][0..3]
|
ldr w12, [x3, w12, uxtw] // src[filterPos[idx + 4]][0..3]
|
||||||
ldr w13, [x3, w13, UXTW] // src[filterPos[idx + 5]][0..3]
|
ldr w13, [x3, w13, uxtw] // src[filterPos[idx + 5]][0..3]
|
||||||
ldr w14, [x3, w14, UXTW] // src[filterPos[idx + 6]][0..3]
|
ldr w14, [x3, w14, uxtw] // src[filterPos[idx + 6]][0..3]
|
||||||
ldr w15, [x3, w15, UXTW] // src[filterPos[idx + 7]][0..3]
|
ldr w15, [x3, w15, uxtw] // src[filterPos[idx + 7]][0..3]
|
||||||
stp w8, w9, [sp] // *scratch_mem = { src[filterPos[idx + 0]][0..3], src[filterPos[idx + 1]][0..3] }
|
stp w8, w9, [sp] // *scratch_mem = { src[filterPos[idx + 0]][0..3], src[filterPos[idx + 1]][0..3] }
|
||||||
stp w10, w11, [sp, #8] // *scratch_mem = { src[filterPos[idx + 2]][0..3], src[filterPos[idx + 3]][0..3] }
|
stp w10, w11, [sp, #8] // *scratch_mem = { src[filterPos[idx + 2]][0..3], src[filterPos[idx + 3]][0..3] }
|
||||||
stp w12, w13, [sp, #16] // *scratch_mem = { src[filterPos[idx + 4]][0..3], src[filterPos[idx + 5]][0..3] }
|
stp w12, w13, [sp, #16] // *scratch_mem = { src[filterPos[idx + 4]][0..3], src[filterPos[idx + 5]][0..3] }
|
||||||
|
@ -263,21 +263,21 @@ function ff_hscale8to15_4_neon, export=1
|
||||||
// interleaved SIMD and prefetching intended to keep ld/st and vector pipelines busy
|
// interleaved SIMD and prefetching intended to keep ld/st and vector pipelines busy
|
||||||
uxtl v16.8h, v16.8b // unsigned extend long, convert src data to 16-bit
|
uxtl v16.8h, v16.8b // unsigned extend long, convert src data to 16-bit
|
||||||
uxtl v17.8h, v17.8b // unsigned extend long, convert src data to 16-bit
|
uxtl v17.8h, v17.8b // unsigned extend long, convert src data to 16-bit
|
||||||
ldr w8, [x3, w8, UXTW] // src[filterPos[idx + 0]], next iteration
|
ldr w8, [x3, w8, uxtw] // src[filterPos[idx + 0]], next iteration
|
||||||
ldr w9, [x3, w9, UXTW] // src[filterPos[idx + 1]], next iteration
|
ldr w9, [x3, w9, uxtw] // src[filterPos[idx + 1]], next iteration
|
||||||
uxtl v18.8h, v18.8b // unsigned extend long, convert src data to 16-bit
|
uxtl v18.8h, v18.8b // unsigned extend long, convert src data to 16-bit
|
||||||
uxtl v19.8h, v19.8b // unsigned extend long, convert src data to 16-bit
|
uxtl v19.8h, v19.8b // unsigned extend long, convert src data to 16-bit
|
||||||
ldr w10, [x3, w10, UXTW] // src[filterPos[idx + 2]], next iteration
|
ldr w10, [x3, w10, uxtw] // src[filterPos[idx + 2]], next iteration
|
||||||
ldr w11, [x3, w11, UXTW] // src[filterPos[idx + 3]], next iteration
|
ldr w11, [x3, w11, uxtw] // src[filterPos[idx + 3]], next iteration
|
||||||
|
|
||||||
smlal v0.4s, v1.4h, v16.4h // multiply accumulate inner loop j = 0, idx = 0..3
|
smlal v0.4s, v1.4h, v16.4h // multiply accumulate inner loop j = 0, idx = 0..3
|
||||||
smlal v0.4s, v2.4h, v17.4h // multiply accumulate inner loop j = 1, idx = 0..3
|
smlal v0.4s, v2.4h, v17.4h // multiply accumulate inner loop j = 1, idx = 0..3
|
||||||
ldr w12, [x3, w12, UXTW] // src[filterPos[idx + 4]], next iteration
|
ldr w12, [x3, w12, uxtw] // src[filterPos[idx + 4]], next iteration
|
||||||
ldr w13, [x3, w13, UXTW] // src[filterPos[idx + 5]], next iteration
|
ldr w13, [x3, w13, uxtw] // src[filterPos[idx + 5]], next iteration
|
||||||
smlal v0.4s, v3.4h, v18.4h // multiply accumulate inner loop j = 2, idx = 0..3
|
smlal v0.4s, v3.4h, v18.4h // multiply accumulate inner loop j = 2, idx = 0..3
|
||||||
smlal v0.4s, v4.4h, v19.4h // multiply accumulate inner loop j = 3, idx = 0..3
|
smlal v0.4s, v4.4h, v19.4h // multiply accumulate inner loop j = 3, idx = 0..3
|
||||||
ldr w14, [x3, w14, UXTW] // src[filterPos[idx + 6]], next iteration
|
ldr w14, [x3, w14, uxtw] // src[filterPos[idx + 6]], next iteration
|
||||||
ldr w15, [x3, w15, UXTW] // src[filterPos[idx + 7]], next iteration
|
ldr w15, [x3, w15, uxtw] // src[filterPos[idx + 7]], next iteration
|
||||||
|
|
||||||
smlal2 v5.4s, v1.8h, v16.8h // multiply accumulate inner loop j = 0, idx = 4..7
|
smlal2 v5.4s, v1.8h, v16.8h // multiply accumulate inner loop j = 0, idx = 4..7
|
||||||
smlal2 v5.4s, v2.8h, v17.8h // multiply accumulate inner loop j = 1, idx = 4..7
|
smlal2 v5.4s, v2.8h, v17.8h // multiply accumulate inner loop j = 1, idx = 4..7
|
||||||
|
@ -331,7 +331,7 @@ function ff_hscale8to15_4_neon, export=1
|
||||||
2:
|
2:
|
||||||
// load src
|
// load src
|
||||||
ldr w8, [x5], #4 // filterPos[i]
|
ldr w8, [x5], #4 // filterPos[i]
|
||||||
add x9, x3, w8, UXTW // calculate the address for src load
|
add x9, x3, w8, uxtw // calculate the address for src load
|
||||||
ld1 {v5.s}[0], [x9] // src[filterPos[i] + 0..3]
|
ld1 {v5.s}[0], [x9] // src[filterPos[i] + 0..3]
|
||||||
// load filter
|
// load filter
|
||||||
ld1 {v6.4h}, [x4], #8 // filter[filterSize * i + 0..3]
|
ld1 {v6.4h}, [x4], #8 // filter[filterSize * i + 0..3]
|
||||||
|
@ -372,14 +372,14 @@ function ff_hscale8to19_4_neon, export=1
|
||||||
add x5, x5, #32
|
add x5, x5, #32
|
||||||
|
|
||||||
// load data from src at each filterPos offset
|
// load data from src at each filterPos offset
|
||||||
ldr w8, [x3, w8, UXTW]
|
ldr w8, [x3, w8, uxtw]
|
||||||
ldr w9, [x3, w9, UXTW]
|
ldr w9, [x3, w9, uxtw]
|
||||||
ldr w10, [x3, w10, UXTW]
|
ldr w10, [x3, w10, uxtw]
|
||||||
ldr w11, [x3, w11, UXTW]
|
ldr w11, [x3, w11, uxtw]
|
||||||
ldr w12, [x3, w12, UXTW]
|
ldr w12, [x3, w12, uxtw]
|
||||||
ldr w13, [x3, w13, UXTW]
|
ldr w13, [x3, w13, uxtw]
|
||||||
ldr w14, [x3, w14, UXTW]
|
ldr w14, [x3, w14, uxtw]
|
||||||
ldr w15, [x3, w15, UXTW]
|
ldr w15, [x3, w15, uxtw]
|
||||||
|
|
||||||
sub sp, sp, #32
|
sub sp, sp, #32
|
||||||
|
|
||||||
|
@ -399,30 +399,30 @@ function ff_hscale8to19_4_neon, export=1
|
||||||
ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7]
|
ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7]
|
||||||
add x5, x5, #32
|
add x5, x5, #32
|
||||||
uxtl v0.8h, v0.8b
|
uxtl v0.8h, v0.8b
|
||||||
ldr w8, [x3, w8, UXTW]
|
ldr w8, [x3, w8, uxtw]
|
||||||
smull v5.4s, v0.4h, v28.4h // multiply first column of src
|
smull v5.4s, v0.4h, v28.4h // multiply first column of src
|
||||||
ldr w9, [x3, w9, UXTW]
|
ldr w9, [x3, w9, uxtw]
|
||||||
smull2 v6.4s, v0.8h, v28.8h
|
smull2 v6.4s, v0.8h, v28.8h
|
||||||
stp w8, w9, [sp]
|
stp w8, w9, [sp]
|
||||||
|
|
||||||
uxtl v1.8h, v1.8b
|
uxtl v1.8h, v1.8b
|
||||||
ldr w10, [x3, w10, UXTW]
|
ldr w10, [x3, w10, uxtw]
|
||||||
smlal v5.4s, v1.4h, v29.4h // multiply second column of src
|
smlal v5.4s, v1.4h, v29.4h // multiply second column of src
|
||||||
ldr w11, [x3, w11, UXTW]
|
ldr w11, [x3, w11, uxtw]
|
||||||
smlal2 v6.4s, v1.8h, v29.8h
|
smlal2 v6.4s, v1.8h, v29.8h
|
||||||
stp w10, w11, [sp, #8]
|
stp w10, w11, [sp, #8]
|
||||||
|
|
||||||
uxtl v2.8h, v2.8b
|
uxtl v2.8h, v2.8b
|
||||||
ldr w12, [x3, w12, UXTW]
|
ldr w12, [x3, w12, uxtw]
|
||||||
smlal v5.4s, v2.4h, v30.4h // multiply third column of src
|
smlal v5.4s, v2.4h, v30.4h // multiply third column of src
|
||||||
ldr w13, [x3, w13, UXTW]
|
ldr w13, [x3, w13, uxtw]
|
||||||
smlal2 v6.4s, v2.8h, v30.8h
|
smlal2 v6.4s, v2.8h, v30.8h
|
||||||
stp w12, w13, [sp, #16]
|
stp w12, w13, [sp, #16]
|
||||||
|
|
||||||
uxtl v3.8h, v3.8b
|
uxtl v3.8h, v3.8b
|
||||||
ldr w14, [x3, w14, UXTW]
|
ldr w14, [x3, w14, uxtw]
|
||||||
smlal v5.4s, v3.4h, v31.4h // multiply fourth column of src
|
smlal v5.4s, v3.4h, v31.4h // multiply fourth column of src
|
||||||
ldr w15, [x3, w15, UXTW]
|
ldr w15, [x3, w15, uxtw]
|
||||||
smlal2 v6.4s, v3.8h, v31.8h
|
smlal2 v6.4s, v3.8h, v31.8h
|
||||||
stp w14, w15, [sp, #24]
|
stp w14, w15, [sp, #24]
|
||||||
|
|
||||||
|
@ -468,7 +468,7 @@ function ff_hscale8to19_4_neon, export=1
|
||||||
|
|
||||||
2:
|
2:
|
||||||
ldr w8, [x5], #4 // load filterPos
|
ldr w8, [x5], #4 // load filterPos
|
||||||
add x9, x3, w8, UXTW // src + filterPos
|
add x9, x3, w8, uxtw // src + filterPos
|
||||||
ld1 {v0.s}[0], [x9] // load 4 * uint8_t into a single 32-bit lane
|
ld1 {v0.s}[0], [x9] // load 4 * uint8_t into a single 32-bit lane
|
||||||
ld1 {v31.4h}, [x4], #8
|
ld1 {v31.4h}, [x4], #8
|
||||||
uxtl v0.8h, v0.8b
|
uxtl v0.8h, v0.8b
|
||||||
|
@ -503,10 +503,10 @@ function ff_hscale8to19_X8_neon, export=1
|
||||||
movi v1.2d, #0 // val sum part 2 (for dst[1])
|
movi v1.2d, #0 // val sum part 2 (for dst[1])
|
||||||
movi v2.2d, #0 // val sum part 3 (for dst[2])
|
movi v2.2d, #0 // val sum part 3 (for dst[2])
|
||||||
movi v3.2d, #0 // val sum part 4 (for dst[3])
|
movi v3.2d, #0 // val sum part 4 (for dst[3])
|
||||||
add x17, x3, w8, UXTW // srcp + filterPos[0]
|
add x17, x3, w8, uxtw // srcp + filterPos[0]
|
||||||
add x8, x3, w0, UXTW // srcp + filterPos[1]
|
add x8, x3, w0, uxtw // srcp + filterPos[1]
|
||||||
add x0, x3, w11, UXTW // srcp + filterPos[2]
|
add x0, x3, w11, uxtw // srcp + filterPos[2]
|
||||||
add x11, x3, w9, UXTW // srcp + filterPos[3]
|
add x11, x3, w9, uxtw // srcp + filterPos[3]
|
||||||
mov w15, w6 // filterSize counter
|
mov w15, w6 // filterSize counter
|
||||||
2: ld1 {v4.8b}, [x17], #8 // srcp[filterPos[0] + {0..7}]
|
2: ld1 {v4.8b}, [x17], #8 // srcp[filterPos[0] + {0..7}]
|
||||||
ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
|
ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
|
||||||
|
@ -567,13 +567,13 @@ function ff_hscale8to19_X4_neon, export=1
|
||||||
|
|
||||||
mov x12, x4 // filter + 0
|
mov x12, x4 // filter + 0
|
||||||
add x13, x4, x7 // filter + 1
|
add x13, x4, x7 // filter + 1
|
||||||
add x8, x3, w8, UXTW // srcp + filterPos 0
|
add x8, x3, w8, uxtw // srcp + filterPos 0
|
||||||
add x14, x13, x7 // filter + 2
|
add x14, x13, x7 // filter + 2
|
||||||
add x9, x3, w9, UXTW // srcp + filterPos 1
|
add x9, x3, w9, uxtw // srcp + filterPos 1
|
||||||
add x15, x14, x7 // filter + 3
|
add x15, x14, x7 // filter + 3
|
||||||
add x10, x3, w10, UXTW // srcp + filterPos 2
|
add x10, x3, w10, uxtw // srcp + filterPos 2
|
||||||
mov w0, w6 // save the filterSize to temporary variable
|
mov w0, w6 // save the filterSize to temporary variable
|
||||||
add x11, x3, w11, UXTW // srcp + filterPos 3
|
add x11, x3, w11, uxtw // srcp + filterPos 3
|
||||||
add x5, x5, #16 // advance filter position
|
add x5, x5, #16 // advance filter position
|
||||||
mov x16, xzr // clear the register x16 used for offsetting the filter values
|
mov x16, xzr // clear the register x16 used for offsetting the filter values
|
||||||
|
|
||||||
|
@ -674,14 +674,14 @@ function ff_hscale16to15_4_neon_asm, export=1
|
||||||
lsl x15, x15, #1
|
lsl x15, x15, #1
|
||||||
|
|
||||||
// load src with given offset
|
// load src with given offset
|
||||||
ldr x8, [x3, w8, UXTW]
|
ldr x8, [x3, w8, uxtw]
|
||||||
ldr x9, [x3, w9, UXTW]
|
ldr x9, [x3, w9, uxtw]
|
||||||
ldr x10, [x3, w10, UXTW]
|
ldr x10, [x3, w10, uxtw]
|
||||||
ldr x11, [x3, w11, UXTW]
|
ldr x11, [x3, w11, uxtw]
|
||||||
ldr x12, [x3, w12, UXTW]
|
ldr x12, [x3, w12, uxtw]
|
||||||
ldr x13, [x3, w13, UXTW]
|
ldr x13, [x3, w13, uxtw]
|
||||||
ldr x14, [x3, w14, UXTW]
|
ldr x14, [x3, w14, uxtw]
|
||||||
ldr x15, [x3, w15, UXTW]
|
ldr x15, [x3, w15, uxtw]
|
||||||
|
|
||||||
sub sp, sp, #64
|
sub sp, sp, #64
|
||||||
// push src on stack so it can be loaded into vectors later
|
// push src on stack so it can be loaded into vectors later
|
||||||
|
@ -754,14 +754,14 @@ function ff_hscale16to15_4_neon_asm, export=1
|
||||||
lsl x14, x14, #1
|
lsl x14, x14, #1
|
||||||
lsl x15, x15, #1
|
lsl x15, x15, #1
|
||||||
|
|
||||||
ldr x8, [x3, w8, UXTW]
|
ldr x8, [x3, w8, uxtw]
|
||||||
ldr x9, [x3, w9, UXTW]
|
ldr x9, [x3, w9, uxtw]
|
||||||
ldr x10, [x3, w10, UXTW]
|
ldr x10, [x3, w10, uxtw]
|
||||||
ldr x11, [x3, w11, UXTW]
|
ldr x11, [x3, w11, uxtw]
|
||||||
ldr x12, [x3, w12, UXTW]
|
ldr x12, [x3, w12, uxtw]
|
||||||
ldr x13, [x3, w13, UXTW]
|
ldr x13, [x3, w13, uxtw]
|
||||||
ldr x14, [x3, w14, UXTW]
|
ldr x14, [x3, w14, uxtw]
|
||||||
ldr x15, [x3, w15, UXTW]
|
ldr x15, [x3, w15, uxtw]
|
||||||
|
|
||||||
stp x8, x9, [sp]
|
stp x8, x9, [sp]
|
||||||
stp x10, x11, [sp, #16]
|
stp x10, x11, [sp, #16]
|
||||||
|
@ -819,7 +819,7 @@ function ff_hscale16to15_4_neon_asm, export=1
|
||||||
2:
|
2:
|
||||||
ldr w8, [x5], #4 // load filterPos
|
ldr w8, [x5], #4 // load filterPos
|
||||||
lsl w8, w8, #1
|
lsl w8, w8, #1
|
||||||
add x9, x3, w8, UXTW // src + filterPos
|
add x9, x3, w8, uxtw // src + filterPos
|
||||||
ld1 {v0.4h}, [x9] // load 4 * uint16_t
|
ld1 {v0.4h}, [x9] // load 4 * uint16_t
|
||||||
ld1 {v31.4h}, [x4], #8
|
ld1 {v31.4h}, [x4], #8
|
||||||
|
|
||||||
|
@ -869,10 +869,10 @@ function ff_hscale16to15_X8_neon_asm, export=1
|
||||||
movi v1.2d, #0 // val sum part 2 (for dst[1])
|
movi v1.2d, #0 // val sum part 2 (for dst[1])
|
||||||
movi v2.2d, #0 // val sum part 3 (for dst[2])
|
movi v2.2d, #0 // val sum part 3 (for dst[2])
|
||||||
movi v3.2d, #0 // val sum part 4 (for dst[3])
|
movi v3.2d, #0 // val sum part 4 (for dst[3])
|
||||||
add x17, x3, w8, UXTW // srcp + filterPos[0]
|
add x17, x3, w8, uxtw // srcp + filterPos[0]
|
||||||
add x8, x3, w10, UXTW // srcp + filterPos[1]
|
add x8, x3, w10, uxtw // srcp + filterPos[1]
|
||||||
add x10, x3, w11, UXTW // srcp + filterPos[2]
|
add x10, x3, w11, uxtw // srcp + filterPos[2]
|
||||||
add x11, x3, w9, UXTW // srcp + filterPos[3]
|
add x11, x3, w9, uxtw // srcp + filterPos[3]
|
||||||
mov w15, w6 // filterSize counter
|
mov w15, w6 // filterSize counter
|
||||||
2: ld1 {v4.8h}, [x17], #16 // srcp[filterPos[0] + {0..7}]
|
2: ld1 {v4.8h}, [x17], #16 // srcp[filterPos[0] + {0..7}]
|
||||||
ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
|
ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
|
||||||
|
@ -1082,14 +1082,14 @@ function ff_hscale16to19_4_neon_asm, export=1
|
||||||
lsl x15, x15, #1
|
lsl x15, x15, #1
|
||||||
|
|
||||||
// load src with given offset
|
// load src with given offset
|
||||||
ldr x8, [x3, w8, UXTW]
|
ldr x8, [x3, w8, uxtw]
|
||||||
ldr x9, [x3, w9, UXTW]
|
ldr x9, [x3, w9, uxtw]
|
||||||
ldr x10, [x3, w10, UXTW]
|
ldr x10, [x3, w10, uxtw]
|
||||||
ldr x11, [x3, w11, UXTW]
|
ldr x11, [x3, w11, uxtw]
|
||||||
ldr x12, [x3, w12, UXTW]
|
ldr x12, [x3, w12, uxtw]
|
||||||
ldr x13, [x3, w13, UXTW]
|
ldr x13, [x3, w13, uxtw]
|
||||||
ldr x14, [x3, w14, UXTW]
|
ldr x14, [x3, w14, uxtw]
|
||||||
ldr x15, [x3, w15, UXTW]
|
ldr x15, [x3, w15, uxtw]
|
||||||
|
|
||||||
sub sp, sp, #64
|
sub sp, sp, #64
|
||||||
// push src on stack so it can be loaded into vectors later
|
// push src on stack so it can be loaded into vectors later
|
||||||
|
@ -1160,14 +1160,14 @@ function ff_hscale16to19_4_neon_asm, export=1
|
||||||
lsl x14, x14, #1
|
lsl x14, x14, #1
|
||||||
lsl x15, x15, #1
|
lsl x15, x15, #1
|
||||||
|
|
||||||
ldr x8, [x3, w8, UXTW]
|
ldr x8, [x3, w8, uxtw]
|
||||||
ldr x9, [x3, w9, UXTW]
|
ldr x9, [x3, w9, uxtw]
|
||||||
ldr x10, [x3, w10, UXTW]
|
ldr x10, [x3, w10, uxtw]
|
||||||
ldr x11, [x3, w11, UXTW]
|
ldr x11, [x3, w11, uxtw]
|
||||||
ldr x12, [x3, w12, UXTW]
|
ldr x12, [x3, w12, uxtw]
|
||||||
ldr x13, [x3, w13, UXTW]
|
ldr x13, [x3, w13, uxtw]
|
||||||
ldr x14, [x3, w14, UXTW]
|
ldr x14, [x3, w14, uxtw]
|
||||||
ldr x15, [x3, w15, UXTW]
|
ldr x15, [x3, w15, uxtw]
|
||||||
|
|
||||||
stp x8, x9, [sp]
|
stp x8, x9, [sp]
|
||||||
stp x10, x11, [sp, #16]
|
stp x10, x11, [sp, #16]
|
||||||
|
@ -1224,7 +1224,7 @@ function ff_hscale16to19_4_neon_asm, export=1
|
||||||
2:
|
2:
|
||||||
ldr w8, [x5], #4 // load filterPos
|
ldr w8, [x5], #4 // load filterPos
|
||||||
lsl w8, w8, #1
|
lsl w8, w8, #1
|
||||||
add x9, x3, w8, UXTW // src + filterPos
|
add x9, x3, w8, uxtw // src + filterPos
|
||||||
ld1 {v0.4h}, [x9] // load 4 * uint16_t
|
ld1 {v0.4h}, [x9] // load 4 * uint16_t
|
||||||
ld1 {v31.4h}, [x4], #8
|
ld1 {v31.4h}, [x4], #8
|
||||||
|
|
||||||
|
@ -1274,10 +1274,10 @@ function ff_hscale16to19_X8_neon_asm, export=1
|
||||||
movi v1.2d, #0 // val sum part 2 (for dst[1])
|
movi v1.2d, #0 // val sum part 2 (for dst[1])
|
||||||
movi v2.2d, #0 // val sum part 3 (for dst[2])
|
movi v2.2d, #0 // val sum part 3 (for dst[2])
|
||||||
movi v3.2d, #0 // val sum part 4 (for dst[3])
|
movi v3.2d, #0 // val sum part 4 (for dst[3])
|
||||||
add x17, x3, w8, UXTW // srcp + filterPos[0]
|
add x17, x3, w8, uxtw // srcp + filterPos[0]
|
||||||
add x8, x3, w10, UXTW // srcp + filterPos[1]
|
add x8, x3, w10, uxtw // srcp + filterPos[1]
|
||||||
add x10, x3, w11, UXTW // srcp + filterPos[2]
|
add x10, x3, w11, uxtw // srcp + filterPos[2]
|
||||||
add x11, x3, w9, UXTW // srcp + filterPos[3]
|
add x11, x3, w9, uxtw // srcp + filterPos[3]
|
||||||
mov w15, w6 // filterSize counter
|
mov w15, w6 // filterSize counter
|
||||||
2: ld1 {v4.8h}, [x17], #16 // srcp[filterPos[0] + {0..7}]
|
2: ld1 {v4.8h}, [x17], #16 // srcp[filterPos[0] + {0..7}]
|
||||||
ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
|
ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
|
||||||
|
|
|
@ -102,7 +102,7 @@
|
||||||
.macro increment_nv12
|
.macro increment_nv12
|
||||||
ands w15, w1, #1
|
ands w15, w1, #1
|
||||||
csel w16, w7, w11, ne // incC = (h & 1) ? paddincC : -width
|
csel w16, w7, w11, ne // incC = (h & 1) ? paddincC : -width
|
||||||
add x6, x6, w16, SXTW // srcC += incC
|
add x6, x6, w16, sxtw // srcC += incC
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro increment_nv21
|
.macro increment_nv21
|
||||||
|
@ -113,13 +113,13 @@
|
||||||
ands w15, w1, #1
|
ands w15, w1, #1
|
||||||
csel w16, w7, w11, ne // incU = (h & 1) ? paddincU : -width/2
|
csel w16, w7, w11, ne // incU = (h & 1) ? paddincU : -width/2
|
||||||
csel w17, w14, w11, ne // incV = (h & 1) ? paddincV : -width/2
|
csel w17, w14, w11, ne // incV = (h & 1) ? paddincV : -width/2
|
||||||
add x6, x6, w16, SXTW // srcU += incU
|
add x6, x6, w16, sxtw // srcU += incU
|
||||||
add x13, x13, w17, SXTW // srcV += incV
|
add x13, x13, w17, sxtw // srcV += incV
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro increment_yuv422p
|
.macro increment_yuv422p
|
||||||
add x6, x6, w7, SXTW // srcU += incU
|
add x6, x6, w7, sxtw // srcU += incU
|
||||||
add x13, x13, w14, SXTW // srcV += incV
|
add x13, x13, w14, sxtw // srcV += incV
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
|
.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
|
||||||
|
@ -189,8 +189,8 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1
|
||||||
st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32
|
st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32
|
||||||
subs w8, w8, #16 // width -= 16
|
subs w8, w8, #16 // width -= 16
|
||||||
b.gt 2b
|
b.gt 2b
|
||||||
add x2, x2, w3, SXTW // dst += padding
|
add x2, x2, w3, sxtw // dst += padding
|
||||||
add x4, x4, w5, SXTW // srcY += paddingY
|
add x4, x4, w5, sxtw // srcY += paddingY
|
||||||
increment_\ifmt
|
increment_\ifmt
|
||||||
subs w1, w1, #1 // height -= 1
|
subs w1, w1, #1 // height -= 1
|
||||||
b.gt 1b
|
b.gt 1b
|
||||||
|
|
Loading…
Add table
Reference in a new issue