From 569ddd287edcc7c8f272587f8fac186fb50848b4 Mon Sep 17 00:00:00 2001
From: gaaclarke <30870216+gaaclarke@users.noreply.github.com>
Date: Wed, 2 Oct 2024 17:07:22 -0700
Subject: [PATCH] Vectorize rrect_blur (flutter/engine#55576)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

issue: https://github.com/flutter/flutter/issues/148496

28% speed improvement in the rrect_blur shader.

## before
<img width="1728" alt="Screenshot 2024-10-01 at 3 45 46 PM" src="https://github.com/user-attachments/assets/643068a5-ab1e-4fa3-bc03-184b2ee4a6cf">

## after
<img width="1728" alt="Screenshot 2024-10-01 at 3 42 30 PM" src="https://github.com/user-attachments/assets/41445231-ffea-4279-8142-ce126df8187c">

[C++, Objective-C, Java style guides]: https://github.com/flutter/engine/blob/main/CONTRIBUTING.md#style
---
 .../shader_lib/impeller/gaussian.glsl         | 12 ++++++
 .../impeller/entity/shaders/rrect_blur.frag   | 43 +++++++++++--------
 engine/src/flutter/impeller/tools/malioc.json | 34 +++++++--------
 3 files changed, 54 insertions(+), 35 deletions(-)
diff --git a/engine/src/flutter/impeller/compiler/shader_lib/impeller/gaussian.glsl b/engine/src/flutter/impeller/compiler/shader_lib/impeller/gaussian.glsl
index f5ab4b9c7b..fef2a39aa5 100644
--- a/engine/src/flutter/impeller/compiler/shader_lib/impeller/gaussian.glsl
+++ b/engine/src/flutter/impeller/compiler/shader_lib/impeller/gaussian.glsl
@@ -14,6 +14,12 @@ float IPGaussian(float x, float sigma) {
   return exp(-0.5f * x * x / variance) / (kSqrtTwoPi * sigma);
 }
 
+/// Equivalent to `IPGaussian(float x, float sigma)`, evaluated per component.
+vec4 IPGaussian(vec4 x, float sigma) {
+  float variance = sigma * sigma;
+  return exp(-0.5f * x * x / variance) / (kSqrtTwoPi * sigma);
+}
+
 /// Gaussian distribution function.
 float16_t IPHalfGaussian(float16_t x, float16_t sigma) {
   float16_t variance = sigma * sigma;
@@ -56,6 +62,12 @@ vec2 IPVec2FastGaussianIntegral(vec2 x, float sigma) {
   return 1.0 / (1.0 + exp(-kSqrtThree / sigma * x));
 }
 
+/// Equivalent to `IPVec2FastGaussianIntegral(vec2 x, float sigma)` but operates
+/// on 4 components instead of 2.
+vec4 IPVec4FastGaussianIntegral(vec4 x, float sigma) {
+  return 1.0 / (1.0 + exp(-kSqrtThree / sigma * x));
+}
+
 /// Simpler (but less accurate) approximation of the Gaussian integral.
 f16vec2 IPHalfVec2FastGaussianIntegral(f16vec2 x, float16_t sigma) {
   return 1.0hf / (1.0hf + exp(float16_t(-kSqrtThree) / sigma * x));
diff --git a/engine/src/flutter/impeller/entity/shaders/rrect_blur.frag b/engine/src/flutter/impeller/entity/shaders/rrect_blur.frag
index bc4e9c7d4d..a4574ad7b5 100644
--- a/engine/src/flutter/impeller/entity/shaders/rrect_blur.frag
+++ b/engine/src/flutter/impeller/entity/shaders/rrect_blur.frag
@@ -23,7 +23,9 @@ const int kSampleCount = 4;
 
 /// Closed form unidirectional rounded rect blur mask solution using the
 /// analytical Gaussian integral (with approximated erf).
-float RRectBlurX(vec2 sample_position, vec2 half_size) {
+vec4 RRectBlurX(float sample_position_x,
+                vec4 sample_position_y,
+                vec2 half_size) {
   // The vertical edge of the rrect consists of a flat portion and a curved
   // portion, the two of which vary in size depending on the size of the
   // corner radii, both adding up to half_size.y.
@@ -33,8 +35,8 @@ float RRectBlurX(vec2 sample_position, vec2 half_size) {
   // negative (and then clamped to 0) for positions that are located
   // vertically in the flat part of the rrect, and will be the relative
   // distance from the center of curvature otherwise.
-  float space_y =
-      min(0.0, half_size.y - frag_info.corner_radii.y - abs(sample_position.y));
+  vec4 space_y = min(vec4(0.0), half_size.y - frag_info.corner_radii.y -
+                                    abs(sample_position_y));
   // space is now in the range [0.0, corner_radii.y]. If the y sample was
   // in the flat portion of the rrect, it will be 0.0
 
@@ -52,22 +54,31 @@ float RRectBlurX(vec2 sample_position, vec2 half_size) {
   // space_y was larger than corner_radii.y.
   // The calling function RRectBlur will never provide a Y sample outside
   // of that range, though, so the max(0.0) is mostly a precaution.
-  float unit_space_y = space_y / frag_info.corner_radii.y;
-  float unit_space_x = sqrt(max(0.0, 1.0 - unit_space_y * unit_space_y));
-  float rrect_distance =
+  vec4 unit_space_y = space_y / frag_info.corner_radii.y;
+  vec4 unit_space_x = sqrt(max(vec4(0.0), 1.0 - unit_space_y * unit_space_y));
+  vec4 rrect_distance =
       half_size.x - frag_info.corner_radii.x * (1.0 - unit_space_x);
 
+  vec4 result;
   // Now we integrate the Gaussian over the range of the relative positions
   // of the left and right sides of the rrect relative to the sampling
   // X coordinate.
-  vec2 integral = IPVec2FastGaussianIntegral(
-      float(sample_position.x) + vec2(-rrect_distance, rrect_distance),
+  vec4 integral = IPVec4FastGaussianIntegral(
+      float(sample_position_x) + vec4(-rrect_distance[0], rrect_distance[0],
+                                      -rrect_distance[1], rrect_distance[1]),
       float(frag_info.blur_sigma));
   // integral.y contains the evaluation of the indefinite gaussian integral
   // function at (X + rrect_distance) and integral.x contains the evaluation
   // of it at (X - rrect_distance). Subtracting the two produces the
   // integral result over the range from one to the other.
-  return integral.y - integral.x;
+  result.xy = integral.yw - integral.xz;
+  integral = IPVec4FastGaussianIntegral(
+      float(sample_position_x) + vec4(-rrect_distance[2], rrect_distance[2],
+                                      -rrect_distance[3], rrect_distance[3]),
+      float(frag_info.blur_sigma));
+  result.zw = integral.yw - integral.xz;
+
+  return result;
 }
 
 float RRectBlur(vec2 sample_position, vec2 half_size) {
@@ -84,15 +95,11 @@ float RRectBlur(vec2 sample_position, vec2 half_size) {
   float interval = (end_y - begin_y) / kSampleCount;
 
   // Sample the X blur kSampleCount times, weighted by the Gaussian function.
-  float result = 0.0;
-  for (int sample_i = 0; sample_i < kSampleCount; sample_i++) {
-    float y = begin_y + interval * (float(sample_i) + 0.5);
-    result +=
-        RRectBlurX(vec2(sample_position.x, sample_position.y - y), half_size) *
-        IPGaussian(float(y), float(frag_info.blur_sigma)) * interval;
-  }
-
-  return result;
+  vec4 ys = vec4(0.5, 1.5, 2.5, 3.5) * interval + begin_y;
+  vec4 sample_ys = sample_position.y - ys;
+  vec4 blurx = RRectBlurX(sample_position.x, sample_ys, half_size);
+  vec4 gaussian_y = IPGaussian(ys, float(frag_info.blur_sigma));
+  return dot(blurx, gaussian_y * interval);
 }
 
 void main() {
diff --git a/engine/src/flutter/impeller/tools/malioc.json b/engine/src/flutter/impeller/tools/malioc.json
index 24b5da7324..85c0c6effd 100644
--- a/engine/src/flutter/impeller/tools/malioc.json
+++ b/engine/src/flutter/impeller/tools/malioc.json
@@ -3858,8 +3858,8 @@
               "arith_fma"
             ],
             "longest_path_cycles": [
-              1.65625,
-              1.65625,
+              1.59375,
+              1.59375,
               0.453125,
               1.5,
               0.0,
@@ -3880,8 +3880,8 @@
               "arith_fma"
             ],
             "shortest_path_cycles": [
-              1.65625,
-              1.65625,
+              1.59375,
+              1.59375,
               0.421875,
               1.5,
               0.0,
@@ -3893,8 +3893,8 @@
               "arith_fma"
             ],
             "total_cycles": [
-              1.65625,
-              1.65625,
+              1.59375,
+              1.59375,
               0.453125,
               1.5,
               0.0,
@@ -3922,7 +3922,7 @@
               "arithmetic"
             ],
             "longest_path_cycles": [
-              25.739999771118164,
+              12.869999885559082,
               1.0,
               0.0
             ],
@@ -3935,7 +3935,7 @@
               "arithmetic"
             ],
             "shortest_path_cycles": [
-              25.739999771118164,
+              12.869999885559082,
               1.0,
               0.0
             ],
@@ -3943,14 +3943,14 @@
               "arithmetic"
             ],
             "total_cycles": [
-              8.333333015441895,
+              13.333333015441895,
               1.0,
               0.0
             ]
           },
-          "thread_occupancy": 100,
+          "thread_occupancy": 50,
           "uniform_registers_used": 3,
-          "work_registers_used": 4
+          "work_registers_used": 7
         }
       }
     }
@@ -6816,8 +6816,8 @@
               "arith_fma"
             ],
             "longest_path_cycles": [
-              1.65625,
-              1.65625,
+              1.59375,
+              1.59375,
               0.421875,
               1.5,
               0.0,
@@ -6838,8 +6838,8 @@
               "arith_fma"
             ],
             "shortest_path_cycles": [
-              1.65625,
-              1.65625,
+              1.59375,
+              1.59375,
               0.421875,
               1.5,
               0.0,
@@ -6851,8 +6851,8 @@
               "arith_fma"
             ],
             "total_cycles": [
-              1.65625,
-              1.65625,
+              1.59375,
+              1.59375,
               0.421875,
               1.5,
               0.0,