diff options
Diffstat (limited to 'target/linux/brcm2708/patches-4.9/950-0165-drm-vc4-Add-fragment-shader-threading-support.patch')
-rw-r--r-- | target/linux/brcm2708/patches-4.9/950-0165-drm-vc4-Add-fragment-shader-threading-support.patch | 228 |
1 files changed, 228 insertions, 0 deletions
diff --git a/target/linux/brcm2708/patches-4.9/950-0165-drm-vc4-Add-fragment-shader-threading-support.patch b/target/linux/brcm2708/patches-4.9/950-0165-drm-vc4-Add-fragment-shader-threading-support.patch new file mode 100644 index 0000000..ea0f4f1 --- /dev/null +++ b/target/linux/brcm2708/patches-4.9/950-0165-drm-vc4-Add-fragment-shader-threading-support.patch @@ -0,0 +1,228 @@ +From 8f5722ac3e42a33345bfd82b7ad6a153134a4239 Mon Sep 17 00:00:00 2001 +From: Jonas Pfeil <pfeiljonas@gmx.de> +Date: Tue, 8 Nov 2016 00:18:39 +0100 +Subject: [PATCH] drm/vc4: Add fragment shader threading support + +FS threading brings performance improvements of 0-20% in glmark2. + +The validation code checks for thread switch signals and ensures that +the registers of the other thread are not touched, and that our clamps +are not live across thread switches. It also checks that the +threading and branching instructions do not interfere. + +(Original patch by Jonas, changes by anholt for style cleanup, +removing validation the kernel doesn't need to do, and adding the flag +for userspace). + +v2: Minor style fixes from checkpatch. + +Signed-off-by: Jonas Pfeil <pfeiljonas@gmx.de> +Signed-off-by: Eric Anholt <eric@anholt.net> +(cherry picked from commit c778cc5df944291dcdb1ca7a6bb781fbc22550c5) +--- + drivers/gpu/drm/vc4/vc4_drv.c | 1 + + drivers/gpu/drm/vc4/vc4_drv.h | 2 + + drivers/gpu/drm/vc4/vc4_validate.c | 17 +++++--- + drivers/gpu/drm/vc4/vc4_validate_shaders.c | 63 ++++++++++++++++++++++++++++++ + include/uapi/drm/vc4_drm.h | 1 + + 5 files changed, 79 insertions(+), 5 deletions(-) + +--- a/drivers/gpu/drm/vc4/vc4_drv.c ++++ b/drivers/gpu/drm/vc4/vc4_drv.c +@@ -82,6 +82,7 @@ static int vc4_get_param_ioctl(struct dr + break; + case DRM_VC4_PARAM_SUPPORTS_BRANCHES: + case DRM_VC4_PARAM_SUPPORTS_ETC1: ++ case DRM_VC4_PARAM_SUPPORTS_THREADED_FS: + args->value = true; + break; + default: +--- a/drivers/gpu/drm/vc4/vc4_drv.h ++++ b/drivers/gpu/drm/vc4/vc4_drv.h +@@ -384,6 +384,8 @@ struct vc4_validated_shader_info { + + uint32_t num_uniform_addr_offsets; + uint32_t *uniform_addr_offsets; ++ ++ bool is_threaded; + }; + + /** +--- a/drivers/gpu/drm/vc4/vc4_validate.c ++++ b/drivers/gpu/drm/vc4/vc4_validate.c +@@ -789,11 +789,6 @@ validate_gl_shader_rec(struct drm_device + exec->shader_rec_v += roundup(packet_size, 16); + exec->shader_rec_size -= packet_size; + +- if (!(*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD)) { +- DRM_ERROR("Multi-threaded fragment shaders not supported.\n"); +- return -EINVAL; +- } +- + for (i = 0; i < shader_reloc_count; i++) { + if (src_handles[i] > exec->bo_count) { + DRM_ERROR("Shader handle %d too big\n", src_handles[i]); +@@ -810,6 +805,18 @@ validate_gl_shader_rec(struct drm_device + return -EINVAL; + } + ++ if (((*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD) == 0) != ++ to_vc4_bo(&bo[0]->base)->validated_shader->is_threaded) { ++ DRM_ERROR("Thread mode of CL and FS do not match\n"); ++ return -EINVAL; ++ } ++ ++ if (to_vc4_bo(&bo[1]->base)->validated_shader->is_threaded || ++ to_vc4_bo(&bo[2]->base)->validated_shader->is_threaded) { ++ DRM_ERROR("cs and vs cannot be threaded\n"); ++ return -EINVAL; ++ } ++ + for (i = 0; i < shader_reloc_count; i++) { + struct vc4_validated_shader_info *validated_shader; + uint32_t o = shader_reloc_offsets[i]; +--- a/drivers/gpu/drm/vc4/vc4_validate_shaders.c ++++ b/drivers/gpu/drm/vc4/vc4_validate_shaders.c +@@ -83,6 +83,13 @@ struct vc4_shader_validation_state { + * basic blocks. + */ + bool needs_uniform_address_for_loop; ++ ++ /* Set when we find an instruction writing the top half of the ++ * register files. If we allowed writing the unusable regs in ++ * a threaded shader, then the other shader running on our ++ * QPU's clamp validation would be invalid. ++ */ ++ bool all_registers_used; + }; + + static uint32_t +@@ -119,6 +126,13 @@ raddr_add_a_to_live_reg_index(uint64_t i + } + + static bool ++live_reg_is_upper_half(uint32_t lri) ++{ ++ return (lri >= 16 && lri < 32) || ++ (lri >= 32 + 16 && lri < 32 + 32); ++} ++ ++static bool + is_tmu_submit(uint32_t waddr) + { + return (waddr == QPU_W_TMU0_S || +@@ -390,6 +404,9 @@ check_reg_write(struct vc4_validated_sha + } else { + validation_state->live_immediates[lri] = ~0; + } ++ ++ if (live_reg_is_upper_half(lri)) ++ validation_state->all_registers_used = true; + } + + switch (waddr) { +@@ -598,6 +615,11 @@ check_instruction_reads(struct vc4_valid + } + } + ++ if ((raddr_a >= 16 && raddr_a < 32) || ++ (raddr_b >= 16 && raddr_b < 32 && sig != QPU_SIG_SMALL_IMM)) { ++ validation_state->all_registers_used = true; ++ } ++ + return true; + } + +@@ -753,6 +775,7 @@ vc4_validate_shader(struct drm_gem_cma_o + { + bool found_shader_end = false; + int shader_end_ip = 0; ++ uint32_t last_thread_switch_ip = -3; + uint32_t ip; + struct vc4_validated_shader_info *validated_shader = NULL; + struct vc4_shader_validation_state validation_state; +@@ -785,6 +808,17 @@ vc4_validate_shader(struct drm_gem_cma_o + if (!vc4_handle_branch_target(&validation_state)) + goto fail; + ++ if (ip == last_thread_switch_ip + 3) { ++ /* Reset r0-r3 live clamp data */ ++ int i; ++ ++ for (i = 64; i < LIVE_REG_COUNT; i++) { ++ validation_state.live_min_clamp_offsets[i] = ~0; ++ validation_state.live_max_clamp_regs[i] = false; ++ validation_state.live_immediates[i] = ~0; ++ } ++ } ++ + switch (sig) { + case QPU_SIG_NONE: + case QPU_SIG_WAIT_FOR_SCOREBOARD: +@@ -794,6 +828,8 @@ vc4_validate_shader(struct drm_gem_cma_o + case QPU_SIG_LOAD_TMU1: + case QPU_SIG_PROG_END: + case QPU_SIG_SMALL_IMM: ++ case QPU_SIG_THREAD_SWITCH: ++ case QPU_SIG_LAST_THREAD_SWITCH: + if (!check_instruction_writes(validated_shader, + &validation_state)) { + DRM_ERROR("Bad write at ip %d\n", ip); +@@ -809,6 +845,18 @@ vc4_validate_shader(struct drm_gem_cma_o + shader_end_ip = ip; + } + ++ if (sig == QPU_SIG_THREAD_SWITCH || ++ sig == QPU_SIG_LAST_THREAD_SWITCH) { ++ validated_shader->is_threaded = true; ++ ++ if (ip < last_thread_switch_ip + 3) { ++ DRM_ERROR("Thread switch too soon after " ++ "last switch at ip %d\n", ip); ++ goto fail; ++ } ++ last_thread_switch_ip = ip; ++ } ++ + break; + + case QPU_SIG_LOAD_IMM: +@@ -823,6 +871,13 @@ vc4_validate_shader(struct drm_gem_cma_o + if (!check_branch(inst, validated_shader, + &validation_state, ip)) + goto fail; ++ ++ if (ip < last_thread_switch_ip + 3) { ++ DRM_ERROR("Branch in thread switch at ip %d", ++ ip); ++ goto fail; ++ } ++ + break; + default: + DRM_ERROR("Unsupported QPU signal %d at " +@@ -844,6 +899,14 @@ vc4_validate_shader(struct drm_gem_cma_o + goto fail; + } + ++ /* Might corrupt other thread */ ++ if (validated_shader->is_threaded && ++ validation_state.all_registers_used) { ++ DRM_ERROR("Shader uses threading, but uses the upper " ++ "half of the registers, too\n"); ++ goto fail; ++ } ++ + /* If we did a backwards branch and we haven't emitted a uniforms + * reset since then, we still need the uniforms stream to have the + * uniforms address available so that the backwards branch can do its +--- a/include/uapi/drm/vc4_drm.h ++++ b/include/uapi/drm/vc4_drm.h +@@ -287,6 +287,7 @@ struct drm_vc4_get_hang_state { + #define DRM_VC4_PARAM_V3D_IDENT2 2 + #define DRM_VC4_PARAM_SUPPORTS_BRANCHES 3 + #define DRM_VC4_PARAM_SUPPORTS_ETC1 4 ++#define DRM_VC4_PARAM_SUPPORTS_THREADED_FS 5 + + struct drm_vc4_get_param { + __u32 param; |