From 569ddd287edcc7c8f272587f8fac186fb50848b4 Mon Sep 17 00:00:00 2001 From: gaaclarke <30870216+gaaclarke@users.noreply.github.com> Date: Wed, 2 Oct 2024 17:07:22 -0700 Subject: [PATCH] Vectorize rrect_blur (flutter/engine#55576) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit issue: https://github.com/flutter/flutter/issues/148496 28% speed improvement in the rrect_blur shader. ## before Screenshot 2024-10-01 at 3 45 46 PM ## after Screenshot 2024-10-01 at 3 42 30 PM [C++, Objective-C, Java style guides]: https://github.com/flutter/engine/blob/main/CONTRIBUTING.md#style --- .../shader_lib/impeller/gaussian.glsl | 12 ++++++ .../impeller/entity/shaders/rrect_blur.frag | 43 +++++++++++-------- engine/src/flutter/impeller/tools/malioc.json | 34 +++++++-------- 3 files changed, 54 insertions(+), 35 deletions(-) diff --git a/engine/src/flutter/impeller/compiler/shader_lib/impeller/gaussian.glsl b/engine/src/flutter/impeller/compiler/shader_lib/impeller/gaussian.glsl index f5ab4b9c7b..fef2a39aa5 100644 --- a/engine/src/flutter/impeller/compiler/shader_lib/impeller/gaussian.glsl +++ b/engine/src/flutter/impeller/compiler/shader_lib/impeller/gaussian.glsl @@ -14,6 +14,12 @@ float IPGaussian(float x, float sigma) { return exp(-0.5f * x * x / variance) / (kSqrtTwoPi * sigma); } +/// Equivalent to `IPGaussian(float x, float sigma)` but executed 4x. +vec4 IPGaussian(vec4 x, float sigma) { + float variance = sigma * sigma; + return exp(-0.5f * x * x / variance) / (kSqrtTwoPi * sigma); +} + /// Gaussian distribution function. float16_t IPHalfGaussian(float16_t x, float16_t sigma) { float16_t variance = sigma * sigma; @@ -56,6 +62,12 @@ vec2 IPVec2FastGaussianIntegral(vec2 x, float sigma) { return 1.0 / (1.0 + exp(-kSqrtThree / sigma * x)); } +/// Equivalent to `IPVec2FastGaussianIntegral(vec2 x, float sigma)` but operated +/// 4x instead of 2x. +vec4 IPVec4FastGaussianIntegral(vec4 x, float sigma) { + return 1.0 / (1.0 + exp(-kSqrtThree / sigma * x)); +} + /// Simpler (but less accurate) approximation of the Gaussian integral. f16vec2 IPHalfVec2FastGaussianIntegral(f16vec2 x, float16_t sigma) { return 1.0hf / (1.0hf + exp(float16_t(-kSqrtThree) / sigma * x)); diff --git a/engine/src/flutter/impeller/entity/shaders/rrect_blur.frag b/engine/src/flutter/impeller/entity/shaders/rrect_blur.frag index bc4e9c7d4d..a4574ad7b5 100644 --- a/engine/src/flutter/impeller/entity/shaders/rrect_blur.frag +++ b/engine/src/flutter/impeller/entity/shaders/rrect_blur.frag @@ -23,7 +23,9 @@ const int kSampleCount = 4; /// Closed form unidirectional rounded rect blur mask solution using the /// analytical Gaussian integral (with approximated erf). -float RRectBlurX(vec2 sample_position, vec2 half_size) { +vec4 RRectBlurX(float sample_position_x, + vec4 sample_position_y, + vec2 half_size) { // The vertical edge of the rrect consists of a flat portion and a curved // portion, the two of which vary in size depending on the size of the // corner radii, both adding up to half_size.y. @@ -33,8 +35,8 @@ float RRectBlurX(vec2 sample_position, vec2 half_size) { // negative (and then clamped to 0) for positions that are located // vertically in the flat part of the rrect, and will be the relative // distance from the center of curvature otherwise. - float space_y = - min(0.0, half_size.y - frag_info.corner_radii.y - abs(sample_position.y)); + vec4 space_y = min(vec4(0.0), half_size.y - frag_info.corner_radii.y - + abs(sample_position_y)); // space is now in the range [0.0, corner_radii.y]. If the y sample was // in the flat portion of the rrect, it will be 0.0 @@ -52,22 +54,31 @@ float RRectBlurX(vec2 sample_position, vec2 half_size) { // space_y was larger than corner_radii.y. // The calling function RRectBlur will never provide a Y sample outside // of that range, though, so the max(0.0) is mostly a precaution. - float unit_space_y = space_y / frag_info.corner_radii.y; - float unit_space_x = sqrt(max(0.0, 1.0 - unit_space_y * unit_space_y)); - float rrect_distance = + vec4 unit_space_y = space_y / frag_info.corner_radii.y; + vec4 unit_space_x = sqrt(max(vec4(0.0), 1.0 - unit_space_y * unit_space_y)); + vec4 rrect_distance = half_size.x - frag_info.corner_radii.x * (1.0 - unit_space_x); + vec4 result; // Now we integrate the Gaussian over the range of the relative positions // of the left and right sides of the rrect relative to the sampling // X coordinate. - vec2 integral = IPVec2FastGaussianIntegral( - float(sample_position.x) + vec2(-rrect_distance, rrect_distance), + vec4 integral = IPVec4FastGaussianIntegral( + float(sample_position_x) + vec4(-rrect_distance[0], rrect_distance[0], + -rrect_distance[1], rrect_distance[1]), float(frag_info.blur_sigma)); // integral.y contains the evaluation of the indefinite gaussian integral // function at (X + rrect_distance) and integral.x contains the evaluation // of it at (X - rrect_distance). Subtracting the two produces the // integral result over the range from one to the other. - return integral.y - integral.x; + result.xy = integral.yw - integral.xz; + integral = IPVec4FastGaussianIntegral( + float(sample_position_x) + vec4(-rrect_distance[2], rrect_distance[2], + -rrect_distance[3], rrect_distance[3]), + float(frag_info.blur_sigma)); + result.zw = integral.yw - integral.xz; + + return result; } float RRectBlur(vec2 sample_position, vec2 half_size) { @@ -84,15 +95,11 @@ float RRectBlur(vec2 sample_position, vec2 half_size) { float interval = (end_y - begin_y) / kSampleCount; // Sample the X blur kSampleCount times, weighted by the Gaussian function. - float result = 0.0; - for (int sample_i = 0; sample_i < kSampleCount; sample_i++) { - float y = begin_y + interval * (float(sample_i) + 0.5); - result += - RRectBlurX(vec2(sample_position.x, sample_position.y - y), half_size) * - IPGaussian(float(y), float(frag_info.blur_sigma)) * interval; - } - - return result; + vec4 ys = vec4(0.5, 1.5, 2.5, 3.5) * interval + begin_y; + vec4 sample_ys = sample_position.y - ys; + vec4 blurx = RRectBlurX(sample_position.x, sample_ys, half_size); + vec4 gaussian_y = IPGaussian(ys, float(frag_info.blur_sigma)); + return dot(blurx, gaussian_y * interval); } void main() { diff --git a/engine/src/flutter/impeller/tools/malioc.json b/engine/src/flutter/impeller/tools/malioc.json index 24b5da7324..85c0c6effd 100644 --- a/engine/src/flutter/impeller/tools/malioc.json +++ b/engine/src/flutter/impeller/tools/malioc.json @@ -3858,8 +3858,8 @@ "arith_fma" ], "longest_path_cycles": [ - 1.65625, - 1.65625, + 1.59375, + 1.59375, 0.453125, 1.5, 0.0, @@ -3880,8 +3880,8 @@ "arith_fma" ], "shortest_path_cycles": [ - 1.65625, - 1.65625, + 1.59375, + 1.59375, 0.421875, 1.5, 0.0, @@ -3893,8 +3893,8 @@ "arith_fma" ], "total_cycles": [ - 1.65625, - 1.65625, + 1.59375, + 1.59375, 0.453125, 1.5, 0.0, @@ -3922,7 +3922,7 @@ "arithmetic" ], "longest_path_cycles": [ - 25.739999771118164, + 12.869999885559082, 1.0, 0.0 ], @@ -3935,7 +3935,7 @@ "arithmetic" ], "shortest_path_cycles": [ - 25.739999771118164, + 12.869999885559082, 1.0, 0.0 ], @@ -3943,14 +3943,14 @@ "arithmetic" ], "total_cycles": [ - 8.333333015441895, + 13.333333015441895, 1.0, 0.0 ] }, - "thread_occupancy": 100, + "thread_occupancy": 50, "uniform_registers_used": 3, - "work_registers_used": 4 + "work_registers_used": 7 } } } @@ -6816,8 +6816,8 @@ "arith_fma" ], "longest_path_cycles": [ - 1.65625, - 1.65625, + 1.59375, + 1.59375, 0.421875, 1.5, 0.0, @@ -6838,8 +6838,8 @@ "arith_fma" ], "shortest_path_cycles": [ - 1.65625, - 1.65625, + 1.59375, + 1.59375, 0.421875, 1.5, 0.0, @@ -6851,8 +6851,8 @@ "arith_fma" ], "total_cycles": [ - 1.65625, - 1.65625, + 1.59375, + 1.59375, 0.421875, 1.5, 0.0,