Revert "[Impeller] migrate Gaussian shaders to half precision. (#40752)" (flutter/engine#40784)
Revert "[Impeller] migrate Gaussian shaders to half precision."
This commit is contained in:
@@ -6,53 +6,51 @@
|
||||
#define GAUSSIAN_GLSL_
|
||||
|
||||
#include <impeller/constants.glsl>
|
||||
#include <impeller/types.glsl>
|
||||
|
||||
/// Gaussian distribution function.
|
||||
float16_t IPGaussian(float16_t x, float16_t sigma) {
|
||||
float16_t variance = sigma * sigma;
|
||||
return exp(-0.5hf * x * x / variance) / (float16_t(kSqrtTwoPi) * sigma);
|
||||
float IPGaussian(float x, float sigma) {
|
||||
float variance = sigma * sigma;
|
||||
return exp(-0.5 * x * x / variance) / (kSqrtTwoPi * sigma);
|
||||
}
|
||||
|
||||
/// Abramowitz and Stegun erf approximation.
|
||||
float16_t IPErf(float16_t x) {
|
||||
float16_t a = abs(x);
|
||||
float IPErf(float x) {
|
||||
float a = abs(x);
|
||||
// 0.278393*x + 0.230389*x^2 + 0.078108*x^4 + 1
|
||||
float16_t b =
|
||||
(0.278393hf + (0.230389hf + 0.078108hf * a * a) * a) * a + 1.0hf;
|
||||
return sign(x) * (1.0hf - 1.0hf / (b * b * b * b));
|
||||
float b = (0.278393 + (0.230389 + 0.078108 * a * a) * a) * a + 1.0;
|
||||
return sign(x) * (1 - 1 / (b * b * b * b));
|
||||
}
|
||||
|
||||
/// Vec2 variation for the Abramowitz and Stegun erf approximation.
|
||||
f16vec2 IPVec2Erf(f16vec2 x) {
|
||||
f16vec2 a = abs(x);
|
||||
vec2 IPVec2Erf(vec2 x) {
|
||||
vec2 a = abs(x);
|
||||
// 0.278393*x + 0.230389*x^2 + 0.078108*x^4 + 1
|
||||
f16vec2 b = (0.278393hf + (0.230389hf + 0.078108hf * a * a) * a) * a + 1.0hf;
|
||||
return sign(x) * (1.0hf - 1.0hf / (b * b * b * b));
|
||||
vec2 b = (0.278393 + (0.230389 + 0.078108 * a * a) * a) * a + 1.0;
|
||||
return sign(x) * (1 - 1 / (b * b * b * b));
|
||||
}
|
||||
|
||||
/// The indefinite integral of the Gaussian function.
|
||||
/// Uses a very close approximation of Erf.
|
||||
float16_t IPGaussianIntegral(float16_t x, float16_t sigma) {
|
||||
float IPGaussianIntegral(float x, float sigma) {
|
||||
// (1 + erf(x * (sqrt(2) / (2 * sigma)))) / 2
|
||||
return (1.0hf + IPErf(x * (float16_t(kHalfSqrtTwo) / sigma))) * 0.5hf;
|
||||
return (1 + IPErf(x * (kHalfSqrtTwo / sigma))) * 0.5;
|
||||
}
|
||||
|
||||
/// Vec2 variation for the indefinite integral of the Gaussian function.
|
||||
/// Uses a very close approximation of Erf.
|
||||
f16vec2 IPVec2GaussianIntegral(f16vec2 x, float16_t sigma) {
|
||||
vec2 IPVec2GaussianIntegral(vec2 x, float sigma) {
|
||||
// (1 + erf(x * (sqrt(2) / (2 * sigma)))) / 2
|
||||
return (1.0hf + IPVec2Erf(x * (float16_t(kHalfSqrtTwo) / sigma))) * 0.5hf;
|
||||
return (1 + IPVec2Erf(x * (kHalfSqrtTwo / sigma))) * 0.5;
|
||||
}
|
||||
|
||||
/// Simpler (but less accurate) approximation of the Gaussian integral.
|
||||
f16vec2 IPVec2FastGaussianIntegral(f16vec2 x, float16_t sigma) {
|
||||
return 1.0hf / (1.0hf + exp(float16_t(-kSqrtThree) / sigma * x));
|
||||
vec2 IPVec2FastGaussianIntegral(vec2 x, float sigma) {
|
||||
return 1 / (1 + exp(-kSqrtThree / sigma * x));
|
||||
}
|
||||
|
||||
/// Simple logistic sigmoid with a domain of [-1, 1] and range of [0, 1].
|
||||
float16_t IPSigmoid(float16_t x) {
|
||||
return 1.03731472073hf / (1.0hf + exp(-4.0hf * x)) - 0.0186573603638hf;
|
||||
float IPSigmoid(float x) {
|
||||
return 1.03731472073 / (1 + exp(-4 * x)) - 0.0186573603638;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -143,15 +143,6 @@ vec4 IPSampleDecal(sampler2D texture_sampler, vec2 coords) {
|
||||
return texture(texture_sampler, coords);
|
||||
}
|
||||
|
||||
/// Sample a texture with decal tile mode.
|
||||
f16vec4 IPHalfSampleDecal(f16sampler2D texture_sampler, f16vec2 coords) {
|
||||
if (any(lessThan(coords, f16vec2(0.0hf))) ||
|
||||
any(greaterThanEqual(coords, f16vec2(1.0)))) {
|
||||
return f16vec4(0.0);
|
||||
}
|
||||
return texture(texture_sampler, coords);
|
||||
}
|
||||
|
||||
/// Sample a texture, emulating a specific tile mode.
|
||||
///
|
||||
/// This is useful for Impeller graphics backends that don't have native support
|
||||
|
||||
@@ -15,42 +15,42 @@
|
||||
// integral (using an erf approximation) to the 4 edges of the UV rectangle and
|
||||
// multiplying them.
|
||||
|
||||
uniform f16sampler2D texture_sampler;
|
||||
uniform sampler2D texture_sampler;
|
||||
|
||||
uniform FragInfo {
|
||||
float16_t src_factor;
|
||||
float16_t inner_blur_factor;
|
||||
float16_t outer_blur_factor;
|
||||
float src_factor;
|
||||
float inner_blur_factor;
|
||||
float outer_blur_factor;
|
||||
|
||||
f16vec2 sigma_uv;
|
||||
vec2 sigma_uv;
|
||||
}
|
||||
frag_info;
|
||||
|
||||
in f16vec2 v_texture_coords;
|
||||
in vec2 v_texture_coords;
|
||||
|
||||
out f16vec4 frag_color;
|
||||
out vec4 frag_color;
|
||||
|
||||
float16_t BoxBlurMask(f16vec2 uv) {
|
||||
float BoxBlurMask(vec2 uv) {
|
||||
// LTRB
|
||||
return IPGaussianIntegral(uv.x, frag_info.sigma_uv.x) * //
|
||||
IPGaussianIntegral(uv.y, frag_info.sigma_uv.y) * //
|
||||
IPGaussianIntegral(1.0hf - uv.x, frag_info.sigma_uv.x) * //
|
||||
IPGaussianIntegral(1.0hf - uv.y, frag_info.sigma_uv.y);
|
||||
return IPGaussianIntegral(uv.x, frag_info.sigma_uv.x) * //
|
||||
IPGaussianIntegral(uv.y, frag_info.sigma_uv.y) * //
|
||||
IPGaussianIntegral(1 - uv.x, frag_info.sigma_uv.x) * //
|
||||
IPGaussianIntegral(1 - uv.y, frag_info.sigma_uv.y);
|
||||
}
|
||||
|
||||
void main() {
|
||||
f16vec4 image_color = texture(texture_sampler, v_texture_coords);
|
||||
float16_t blur_factor = BoxBlurMask(v_texture_coords);
|
||||
vec4 image_color = texture(texture_sampler, v_texture_coords);
|
||||
float blur_factor = BoxBlurMask(v_texture_coords);
|
||||
|
||||
float16_t within_bounds =
|
||||
float16_t(v_texture_coords.x >= 0.0hf && v_texture_coords.y >= 0.0hf &&
|
||||
v_texture_coords.x < 1.0hf && v_texture_coords.y < 1.0hf);
|
||||
float16_t inner_factor =
|
||||
float within_bounds =
|
||||
float(v_texture_coords.x >= 0 && v_texture_coords.y >= 0 &&
|
||||
v_texture_coords.x < 1 && v_texture_coords.y < 1);
|
||||
float inner_factor =
|
||||
(frag_info.inner_blur_factor * blur_factor + frag_info.src_factor) *
|
||||
within_bounds;
|
||||
float16_t outer_factor =
|
||||
frag_info.outer_blur_factor * blur_factor * (1.0hf - within_bounds);
|
||||
float outer_factor =
|
||||
frag_info.outer_blur_factor * blur_factor * (1 - within_bounds);
|
||||
|
||||
float16_t mask_factor = inner_factor + outer_factor;
|
||||
float mask_factor = inner_factor + outer_factor;
|
||||
frag_color = image_color * mask_factor;
|
||||
}
|
||||
|
||||
@@ -15,10 +15,10 @@ frame_info;
|
||||
in vec2 vertices;
|
||||
in vec2 texture_coords;
|
||||
|
||||
out f16vec2 v_texture_coords;
|
||||
out vec2 v_texture_coords;
|
||||
|
||||
void main() {
|
||||
gl_Position = frame_info.mvp * vec4(vertices, 0.0, 1.0);
|
||||
v_texture_coords = f16vec2(
|
||||
IPRemapCoords(texture_coords, frame_info.texture_sampler_y_coord_scale));
|
||||
v_texture_coords =
|
||||
IPRemapCoords(texture_coords, frame_info.texture_sampler_y_coord_scale);
|
||||
}
|
||||
|
||||
@@ -18,52 +18,52 @@
|
||||
#include <impeller/texture.glsl>
|
||||
#include <impeller/types.glsl>
|
||||
|
||||
uniform f16sampler2D texture_sampler;
|
||||
uniform sampler2D texture_sampler;
|
||||
|
||||
uniform BlurInfo {
|
||||
f16vec2 texture_size;
|
||||
f16vec2 blur_direction;
|
||||
vec2 texture_size;
|
||||
vec2 blur_direction;
|
||||
|
||||
// The blur sigma and radius have a linear relationship which is defined
|
||||
// host-side, but both are useful controls here. Sigma (pixels per standard
|
||||
// deviation) is used to define the gaussian function itself, whereas the
|
||||
// radius is used to limit how much of the function is integrated.
|
||||
float16_t blur_sigma;
|
||||
float16_t blur_radius;
|
||||
float blur_sigma;
|
||||
float blur_radius;
|
||||
}
|
||||
blur_info;
|
||||
|
||||
#if ENABLE_ALPHA_MASK
|
||||
uniform f16sampler2D alpha_mask_sampler;
|
||||
uniform sampler2D alpha_mask_sampler;
|
||||
|
||||
uniform MaskInfo {
|
||||
float16_t src_factor;
|
||||
float16_t inner_blur_factor;
|
||||
float16_t outer_blur_factor;
|
||||
float src_factor;
|
||||
float inner_blur_factor;
|
||||
float outer_blur_factor;
|
||||
}
|
||||
mask_info;
|
||||
#endif
|
||||
|
||||
f16vec4 Sample(f16sampler2D tex, f16vec2 coords) {
|
||||
vec4 Sample(sampler2D tex, vec2 coords) {
|
||||
#if ENABLE_DECAL_SPECIALIZATION
|
||||
return IPHalfSampleDecal(tex, coords);
|
||||
return IPSampleDecal(tex, coords);
|
||||
#else
|
||||
return texture(tex, coords);
|
||||
#endif
|
||||
}
|
||||
|
||||
in f16vec2 v_texture_coords;
|
||||
in f16vec2 v_src_texture_coords;
|
||||
in vec2 v_texture_coords;
|
||||
in vec2 v_src_texture_coords;
|
||||
|
||||
out f16vec4 frag_color;
|
||||
out vec4 frag_color;
|
||||
|
||||
void main() {
|
||||
f16vec4 total_color = f16vec4(0.0hf);
|
||||
float16_t gaussian_integral = 0.0hf;
|
||||
f16vec2 blur_uv_offset = blur_info.blur_direction / blur_info.texture_size;
|
||||
vec4 total_color = vec4(0);
|
||||
float gaussian_integral = 0;
|
||||
vec2 blur_uv_offset = blur_info.blur_direction / blur_info.texture_size;
|
||||
|
||||
for (float16_t i = -blur_info.blur_radius; i <= blur_info.blur_radius; i++) {
|
||||
float16_t gaussian = IPGaussian(i, blur_info.blur_sigma);
|
||||
for (float i = -blur_info.blur_radius; i <= blur_info.blur_radius; i++) {
|
||||
float gaussian = IPGaussian(i, blur_info.blur_sigma);
|
||||
gaussian_integral += gaussian;
|
||||
total_color +=
|
||||
gaussian *
|
||||
@@ -75,12 +75,11 @@ void main() {
|
||||
frag_color = total_color / gaussian_integral;
|
||||
|
||||
#if ENABLE_ALPHA_MASK
|
||||
f16vec4 src_color = Sample(alpha_mask_sampler, // sampler
|
||||
v_src_texture_coords // texture coordinates
|
||||
vec4 src_color = Sample(alpha_mask_sampler, // sampler
|
||||
v_src_texture_coords // texture coordinates
|
||||
);
|
||||
float16_t blur_factor =
|
||||
mask_info.inner_blur_factor * float16_t(src_color.a > 0.0hf) +
|
||||
mask_info.outer_blur_factor * float16_t(src_color.a == 0.0hf);
|
||||
float blur_factor = mask_info.inner_blur_factor * float(src_color.a > 0) +
|
||||
mask_info.outer_blur_factor * float(src_color.a == 0);
|
||||
|
||||
frag_color = frag_color * blur_factor + src_color * mask_info.src_factor;
|
||||
#endif
|
||||
|
||||
@@ -16,13 +16,13 @@ in vec2 vertices;
|
||||
in vec2 texture_coords;
|
||||
in vec2 src_texture_coords;
|
||||
|
||||
out f16vec2 v_texture_coords;
|
||||
out f16vec2 v_src_texture_coords;
|
||||
out vec2 v_texture_coords;
|
||||
out vec2 v_src_texture_coords;
|
||||
|
||||
void main() {
|
||||
gl_Position = frame_info.mvp * vec4(vertices, 0.0, 1.0);
|
||||
v_texture_coords = f16vec2(
|
||||
IPRemapCoords(texture_coords, frame_info.texture_sampler_y_coord_scale));
|
||||
v_src_texture_coords = f16vec2(IPRemapCoords(
|
||||
src_texture_coords, frame_info.alpha_mask_sampler_y_coord_scale));
|
||||
v_texture_coords =
|
||||
IPRemapCoords(texture_coords, frame_info.texture_sampler_y_coord_scale);
|
||||
v_src_texture_coords = IPRemapCoords(
|
||||
src_texture_coords, frame_info.alpha_mask_sampler_y_coord_scale);
|
||||
}
|
||||
|
||||
@@ -6,61 +6,58 @@
|
||||
#include <impeller/types.glsl>
|
||||
|
||||
uniform FragInfo {
|
||||
f16vec4 color;
|
||||
f16vec2 rect_size;
|
||||
float16_t blur_sigma;
|
||||
float16_t corner_radius;
|
||||
vec4 color;
|
||||
float blur_sigma;
|
||||
vec2 rect_size;
|
||||
float corner_radius;
|
||||
}
|
||||
frag_info;
|
||||
|
||||
in f16vec2 v_position;
|
||||
in vec2 v_position;
|
||||
|
||||
out f16vec4 frag_color;
|
||||
out vec4 frag_color;
|
||||
|
||||
const int kSampleCount = 4;
|
||||
|
||||
float16_t RRectDistance(f16vec2 sample_position, f16vec2 half_size) {
|
||||
f16vec2 space = abs(sample_position) - half_size + frag_info.corner_radius;
|
||||
return length(max(space, float16_t(0.0hf))) +
|
||||
min(max(space.x, space.y), float16_t(0.0hf)) - frag_info.corner_radius;
|
||||
float RRectDistance(vec2 sample_position, vec2 half_size) {
|
||||
vec2 space = abs(sample_position) - half_size + frag_info.corner_radius;
|
||||
return length(max(space, 0.0)) + min(max(space.x, space.y), 0.0) -
|
||||
frag_info.corner_radius;
|
||||
}
|
||||
|
||||
/// Closed form unidirectional rounded rect blur mask solution using the
|
||||
/// analytical Gaussian integral (with approximated erf).
|
||||
float16_t RRectShadowX(f16vec2 sample_position, f16vec2 half_size) {
|
||||
float RRectShadowX(vec2 sample_position, vec2 half_size) {
|
||||
// Compute the X direction distance field (not incorporating the Y distance)
|
||||
// for the rounded rect.
|
||||
float16_t space =
|
||||
min(float16_t(0.0hf),
|
||||
half_size.y - frag_info.corner_radius - abs(sample_position.y));
|
||||
float16_t rrect_distance =
|
||||
float space =
|
||||
min(0, half_size.y - frag_info.corner_radius - abs(sample_position.y));
|
||||
float rrect_distance =
|
||||
half_size.x - frag_info.corner_radius +
|
||||
sqrt(max(
|
||||
float16_t(0.0hf),
|
||||
frag_info.corner_radius * frag_info.corner_radius - space * space));
|
||||
sqrt(max(0, frag_info.corner_radius * frag_info.corner_radius -
|
||||
space * space));
|
||||
|
||||
// Map the linear distance field to the approximate Gaussian integral.
|
||||
f16vec2 integral = IPVec2FastGaussianIntegral(
|
||||
sample_position.x + f16vec2(-rrect_distance, rrect_distance),
|
||||
vec2 integral = IPVec2FastGaussianIntegral(
|
||||
sample_position.x + vec2(-rrect_distance, rrect_distance),
|
||||
frag_info.blur_sigma);
|
||||
return integral.y - integral.x;
|
||||
}
|
||||
|
||||
float16_t RRectShadow(f16vec2 sample_position, f16vec2 half_size) {
|
||||
float RRectShadow(vec2 sample_position, vec2 half_size) {
|
||||
// Limit the sampling range to 3 standard deviations in the Y direction from
|
||||
// the kernel center to incorporate 99.7% of the color contribution.
|
||||
float16_t half_sampling_range = frag_info.blur_sigma * 3.0hf;
|
||||
float half_sampling_range = frag_info.blur_sigma * 3;
|
||||
|
||||
float16_t begin_y =
|
||||
max(-half_sampling_range, sample_position.y - half_size.y);
|
||||
float16_t end_y = min(half_sampling_range, sample_position.y + half_size.y);
|
||||
float16_t interval = (end_y - begin_y) / float16_t(kSampleCount);
|
||||
float begin_y = max(-half_sampling_range, sample_position.y - half_size.y);
|
||||
float end_y = min(half_sampling_range, sample_position.y + half_size.y);
|
||||
float interval = (end_y - begin_y) / kSampleCount;
|
||||
|
||||
// Sample the X blur kSampleCount times, weighted by the Gaussian function.
|
||||
float16_t result = 0.0hf;
|
||||
float result = 0;
|
||||
for (int sample_i = 0; sample_i < kSampleCount; sample_i++) {
|
||||
float16_t y = begin_y + interval * (float16_t(sample_i) + 0.5hf);
|
||||
result += RRectShadowX(f16vec2(sample_position.x, sample_position.y - y),
|
||||
float y = begin_y + interval * (sample_i + 0.5);
|
||||
result += RRectShadowX(vec2(sample_position.x, sample_position.y - y),
|
||||
half_size) *
|
||||
IPGaussian(y, frag_info.blur_sigma) * interval;
|
||||
}
|
||||
@@ -71,10 +68,10 @@ float16_t RRectShadow(f16vec2 sample_position, f16vec2 half_size) {
|
||||
void main() {
|
||||
frag_color = frag_info.color;
|
||||
|
||||
f16vec2 half_size = frag_info.rect_size * 0.5hf;
|
||||
f16vec2 sample_position = v_position - half_size;
|
||||
vec2 half_size = frag_info.rect_size * 0.5;
|
||||
vec2 sample_position = v_position - half_size;
|
||||
|
||||
if (frag_info.blur_sigma > 0.0hf) {
|
||||
if (frag_info.blur_sigma > 0) {
|
||||
frag_color *= RRectShadow(sample_position, half_size);
|
||||
} else {
|
||||
frag_color *= -RRectDistance(sample_position, half_size);
|
||||
|
||||
@@ -11,10 +11,10 @@ frame_info;
|
||||
|
||||
in vec2 position;
|
||||
|
||||
out f16vec2 v_position;
|
||||
out vec2 v_position;
|
||||
|
||||
void main() {
|
||||
gl_Position = frame_info.mvp * vec4(position, 0.0, 1.0);
|
||||
// The fragment stage uses local coordinates to compute the blur.
|
||||
v_position = f16vec2(position);
|
||||
v_position = position;
|
||||
}
|
||||
|
||||
@@ -1440,7 +1440,7 @@
|
||||
"uses_late_zs_update": false,
|
||||
"variants": {
|
||||
"Main": {
|
||||
"fp16_arithmetic": 44,
|
||||
"fp16_arithmetic": 5,
|
||||
"has_stack_spilling": false,
|
||||
"performance": {
|
||||
"longest_path_bound_pipelines": [
|
||||
@@ -1448,8 +1448,8 @@
|
||||
"arith_fma"
|
||||
],
|
||||
"longest_path_cycles": [
|
||||
0.875,
|
||||
0.875,
|
||||
0.8125,
|
||||
0.8125,
|
||||
0.203125,
|
||||
0.25,
|
||||
0.0,
|
||||
@@ -1470,8 +1470,8 @@
|
||||
"arith_fma"
|
||||
],
|
||||
"shortest_path_cycles": [
|
||||
0.875,
|
||||
0.875,
|
||||
0.8125,
|
||||
0.8125,
|
||||
0.203125,
|
||||
0.25,
|
||||
0.0,
|
||||
@@ -1483,8 +1483,8 @@
|
||||
"arith_fma"
|
||||
],
|
||||
"total_cycles": [
|
||||
0.875,
|
||||
0.875,
|
||||
0.8125,
|
||||
0.8125,
|
||||
0.203125,
|
||||
0.25,
|
||||
0.0,
|
||||
@@ -1495,7 +1495,7 @@
|
||||
"stack_spill_bytes": 0,
|
||||
"thread_occupancy": 100,
|
||||
"uniform_registers_used": 12,
|
||||
"work_registers_used": 18
|
||||
"work_registers_used": 22
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -5806,7 +5806,7 @@
|
||||
"uses_late_zs_update": false,
|
||||
"variants": {
|
||||
"Main": {
|
||||
"fp16_arithmetic": 86,
|
||||
"fp16_arithmetic": 10,
|
||||
"has_stack_spilling": false,
|
||||
"performance": {
|
||||
"longest_path_bound_pipelines": [
|
||||
@@ -5814,9 +5814,9 @@
|
||||
"arith_fma"
|
||||
],
|
||||
"longest_path_cycles": [
|
||||
0.90625,
|
||||
0.90625,
|
||||
0.265625,
|
||||
0.8125,
|
||||
0.8125,
|
||||
0.234375,
|
||||
0.25,
|
||||
0.0,
|
||||
0.25,
|
||||
@@ -5836,9 +5836,9 @@
|
||||
"arith_fma"
|
||||
],
|
||||
"shortest_path_cycles": [
|
||||
0.90625,
|
||||
0.90625,
|
||||
0.234375,
|
||||
0.8125,
|
||||
0.8125,
|
||||
0.203125,
|
||||
0.25,
|
||||
0.0,
|
||||
0.25,
|
||||
@@ -5849,9 +5849,9 @@
|
||||
"arith_fma"
|
||||
],
|
||||
"total_cycles": [
|
||||
0.90625,
|
||||
0.90625,
|
||||
0.265625,
|
||||
0.8125,
|
||||
0.8125,
|
||||
0.234375,
|
||||
0.25,
|
||||
0.0,
|
||||
0.25,
|
||||
@@ -5860,8 +5860,8 @@
|
||||
},
|
||||
"stack_spill_bytes": 0,
|
||||
"thread_occupancy": 100,
|
||||
"uniform_registers_used": 12,
|
||||
"work_registers_used": 29
|
||||
"uniform_registers_used": 10,
|
||||
"work_registers_used": 32
|
||||
}
|
||||
}
|
||||
},
|
||||
@@ -5906,7 +5906,7 @@
|
||||
},
|
||||
"thread_occupancy": 100,
|
||||
"uniform_registers_used": 1,
|
||||
"work_registers_used": 2
|
||||
"work_registers_used": 3
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -6636,7 +6636,7 @@
|
||||
"uses_late_zs_update": false,
|
||||
"variants": {
|
||||
"Main": {
|
||||
"fp16_arithmetic": 68,
|
||||
"fp16_arithmetic": 63,
|
||||
"has_stack_spilling": false,
|
||||
"performance": {
|
||||
"longest_path_bound_pipelines": [
|
||||
@@ -6662,13 +6662,14 @@
|
||||
],
|
||||
"shortest_path_bound_pipelines": [
|
||||
"arith_total",
|
||||
"arith_cvt",
|
||||
"arith_sfu",
|
||||
"varying"
|
||||
],
|
||||
"shortest_path_cycles": [
|
||||
0.25,
|
||||
0.15625,
|
||||
0.1875,
|
||||
0.171875,
|
||||
0.25,
|
||||
0.25,
|
||||
0.0,
|
||||
0.25,
|
||||
@@ -6683,7 +6684,7 @@
|
||||
"total_cycles": [
|
||||
0.5,
|
||||
0.359375,
|
||||
0.421875,
|
||||
0.484375,
|
||||
0.5,
|
||||
0.0,
|
||||
0.5,
|
||||
@@ -6692,7 +6693,7 @@
|
||||
},
|
||||
"stack_spill_bytes": 0,
|
||||
"thread_occupancy": 100,
|
||||
"uniform_registers_used": 12,
|
||||
"uniform_registers_used": 10,
|
||||
"work_registers_used": 21
|
||||
}
|
||||
}
|
||||
@@ -6723,7 +6724,7 @@
|
||||
"arithmetic"
|
||||
],
|
||||
"shortest_path_cycles": [
|
||||
3.9600000381469727,
|
||||
4.619999885559082,
|
||||
2.0,
|
||||
0.0
|
||||
],
|
||||
@@ -6731,7 +6732,7 @@
|
||||
"arithmetic"
|
||||
],
|
||||
"total_cycles": [
|
||||
8.0,
|
||||
8.666666984558105,
|
||||
2.0,
|
||||
2.0
|
||||
]
|
||||
@@ -6756,7 +6757,7 @@
|
||||
"uses_late_zs_update": false,
|
||||
"variants": {
|
||||
"Main": {
|
||||
"fp16_arithmetic": 64,
|
||||
"fp16_arithmetic": 58,
|
||||
"has_stack_spilling": false,
|
||||
"performance": {
|
||||
"longest_path_bound_pipelines": [
|
||||
@@ -6785,9 +6786,9 @@
|
||||
"texture"
|
||||
],
|
||||
"shortest_path_cycles": [
|
||||
0.15625,
|
||||
0.15625,
|
||||
0.09375,
|
||||
0.171875,
|
||||
0.171875,
|
||||
0.109375,
|
||||
0.0625,
|
||||
0.0,
|
||||
0.25,
|
||||
@@ -6800,7 +6801,7 @@
|
||||
"total_cycles": [
|
||||
0.359375,
|
||||
0.359375,
|
||||
0.21875,
|
||||
0.234375,
|
||||
0.125,
|
||||
0.0,
|
||||
0.5,
|
||||
@@ -6810,7 +6811,7 @@
|
||||
"stack_spill_bytes": 0,
|
||||
"thread_occupancy": 100,
|
||||
"uniform_registers_used": 12,
|
||||
"work_registers_used": 19
|
||||
"work_registers_used": 20
|
||||
}
|
||||
}
|
||||
},
|
||||
@@ -6840,7 +6841,7 @@
|
||||
"arithmetic"
|
||||
],
|
||||
"shortest_path_cycles": [
|
||||
2.9700000286102295,
|
||||
3.299999952316284,
|
||||
2.0,
|
||||
1.0
|
||||
],
|
||||
@@ -6848,14 +6849,14 @@
|
||||
"arithmetic"
|
||||
],
|
||||
"total_cycles": [
|
||||
5.0,
|
||||
5.333333492279053,
|
||||
2.0,
|
||||
2.0
|
||||
]
|
||||
},
|
||||
"thread_occupancy": 100,
|
||||
"uniform_registers_used": 2,
|
||||
"work_registers_used": 3
|
||||
"work_registers_used": 4
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -6873,7 +6874,7 @@
|
||||
"uses_late_zs_update": false,
|
||||
"variants": {
|
||||
"Main": {
|
||||
"fp16_arithmetic": 70,
|
||||
"fp16_arithmetic": 61,
|
||||
"has_stack_spilling": false,
|
||||
"performance": {
|
||||
"longest_path_bound_pipelines": [
|
||||
@@ -6899,13 +6900,12 @@
|
||||
],
|
||||
"shortest_path_bound_pipelines": [
|
||||
"arith_total",
|
||||
"arith_cvt",
|
||||
"arith_sfu"
|
||||
"arith_cvt"
|
||||
],
|
||||
"shortest_path_cycles": [
|
||||
0.0625,
|
||||
0.03125,
|
||||
0.0625,
|
||||
0.078125,
|
||||
0.046875,
|
||||
0.078125,
|
||||
0.0625,
|
||||
0.0,
|
||||
0.0,
|
||||
@@ -6918,7 +6918,7 @@
|
||||
"total_cycles": [
|
||||
0.3125,
|
||||
0.234375,
|
||||
0.28125,
|
||||
0.296875,
|
||||
0.3125,
|
||||
0.0,
|
||||
0.25,
|
||||
@@ -6958,7 +6958,7 @@
|
||||
"arithmetic"
|
||||
],
|
||||
"shortest_path_cycles": [
|
||||
2.309999942779541,
|
||||
2.9700000286102295,
|
||||
1.0,
|
||||
0.0
|
||||
],
|
||||
@@ -6966,7 +6966,7 @@
|
||||
"arithmetic"
|
||||
],
|
||||
"total_cycles": [
|
||||
6.0,
|
||||
6.666666507720947,
|
||||
1.0,
|
||||
1.0
|
||||
]
|
||||
@@ -6991,7 +6991,7 @@
|
||||
"uses_late_zs_update": false,
|
||||
"variants": {
|
||||
"Main": {
|
||||
"fp16_arithmetic": 66,
|
||||
"fp16_arithmetic": 57,
|
||||
"has_stack_spilling": false,
|
||||
"performance": {
|
||||
"longest_path_bound_pipelines": [
|
||||
@@ -7017,13 +7017,12 @@
|
||||
],
|
||||
"shortest_path_bound_pipelines": [
|
||||
"arith_total",
|
||||
"arith_cvt",
|
||||
"arith_sfu"
|
||||
"arith_cvt"
|
||||
],
|
||||
"shortest_path_cycles": [
|
||||
0.0625,
|
||||
0.03125,
|
||||
0.0625,
|
||||
0.078125,
|
||||
0.046875,
|
||||
0.078125,
|
||||
0.0625,
|
||||
0.0,
|
||||
0.0,
|
||||
@@ -7036,7 +7035,7 @@
|
||||
"total_cycles": [
|
||||
0.234375,
|
||||
0.234375,
|
||||
0.1875,
|
||||
0.203125,
|
||||
0.125,
|
||||
0.0,
|
||||
0.25,
|
||||
@@ -7046,7 +7045,7 @@
|
||||
"stack_spill_bytes": 0,
|
||||
"thread_occupancy": 100,
|
||||
"uniform_registers_used": 8,
|
||||
"work_registers_used": 19
|
||||
"work_registers_used": 20
|
||||
}
|
||||
}
|
||||
},
|
||||
@@ -7076,7 +7075,7 @@
|
||||
"arithmetic"
|
||||
],
|
||||
"shortest_path_cycles": [
|
||||
1.9800000190734863,
|
||||
2.309999942779541,
|
||||
1.0,
|
||||
0.0
|
||||
],
|
||||
@@ -7084,14 +7083,14 @@
|
||||
"arithmetic"
|
||||
],
|
||||
"total_cycles": [
|
||||
4.0,
|
||||
4.333333492279053,
|
||||
1.0,
|
||||
1.0
|
||||
]
|
||||
},
|
||||
"thread_occupancy": 100,
|
||||
"uniform_registers_used": 1,
|
||||
"work_registers_used": 3
|
||||
"work_registers_used": 4
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -8921,17 +8920,17 @@
|
||||
"uses_late_zs_update": false,
|
||||
"variants": {
|
||||
"Main": {
|
||||
"fp16_arithmetic": 68,
|
||||
"fp16_arithmetic": 33,
|
||||
"has_stack_spilling": false,
|
||||
"performance": {
|
||||
"longest_path_bound_pipelines": [
|
||||
"arith_total",
|
||||
"arith_sfu"
|
||||
"arith_fma"
|
||||
],
|
||||
"longest_path_cycles": [
|
||||
1.5,
|
||||
1.3875000476837158,
|
||||
0.737500011920929,
|
||||
1.5125000476837158,
|
||||
1.5125000476837158,
|
||||
0.546875,
|
||||
1.5,
|
||||
0.0,
|
||||
0.125,
|
||||
@@ -8961,12 +8960,12 @@
|
||||
],
|
||||
"total_bound_pipelines": [
|
||||
"arith_total",
|
||||
"arith_sfu"
|
||||
"arith_fma"
|
||||
],
|
||||
"total_cycles": [
|
||||
1.5625,
|
||||
1.5125000476837158,
|
||||
0.762499988079071,
|
||||
1.6375000476837158,
|
||||
1.6375000476837158,
|
||||
0.578125,
|
||||
1.5625,
|
||||
0.0,
|
||||
0.125,
|
||||
@@ -8975,7 +8974,7 @@
|
||||
},
|
||||
"stack_spill_bytes": 0,
|
||||
"thread_occupancy": 100,
|
||||
"uniform_registers_used": 16,
|
||||
"uniform_registers_used": 20,
|
||||
"work_registers_used": 32
|
||||
}
|
||||
}
|
||||
@@ -8990,12 +8989,12 @@
|
||||
"has_stack_spilling": false,
|
||||
"performance": {
|
||||
"longest_path_bound_pipelines": [
|
||||
"arithmetic"
|
||||
null
|
||||
],
|
||||
"longest_path_cycles": [
|
||||
22.110000610351562,
|
||||
1.0,
|
||||
0.0
|
||||
null,
|
||||
null,
|
||||
null
|
||||
],
|
||||
"pipelines": [
|
||||
"arithmetic",
|
||||
@@ -9014,14 +9013,14 @@
|
||||
"arithmetic"
|
||||
],
|
||||
"total_cycles": [
|
||||
10.0,
|
||||
10.666666984558105,
|
||||
1.0,
|
||||
0.0
|
||||
]
|
||||
},
|
||||
"thread_occupancy": 100,
|
||||
"uniform_registers_used": 2,
|
||||
"work_registers_used": 3
|
||||
"uniform_registers_used": 1,
|
||||
"work_registers_used": 4
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -12274,17 +12273,17 @@
|
||||
"uses_late_zs_update": false,
|
||||
"variants": {
|
||||
"Main": {
|
||||
"fp16_arithmetic": 65,
|
||||
"fp16_arithmetic": 37,
|
||||
"has_stack_spilling": false,
|
||||
"performance": {
|
||||
"longest_path_bound_pipelines": [
|
||||
"arith_total",
|
||||
"arith_sfu"
|
||||
"arith_fma"
|
||||
],
|
||||
"longest_path_cycles": [
|
||||
1.5,
|
||||
1.4249999523162842,
|
||||
0.699999988079071,
|
||||
1.5499999523162842,
|
||||
1.5499999523162842,
|
||||
0.515625,
|
||||
1.5,
|
||||
0.0,
|
||||
0.125,
|
||||
@@ -12314,12 +12313,12 @@
|
||||
],
|
||||
"total_bound_pipelines": [
|
||||
"arith_total",
|
||||
"arith_sfu"
|
||||
"arith_fma"
|
||||
],
|
||||
"total_cycles": [
|
||||
1.5625,
|
||||
1.5499999523162842,
|
||||
0.75,
|
||||
1.6749999523162842,
|
||||
1.6749999523162842,
|
||||
0.5625,
|
||||
1.5625,
|
||||
0.0,
|
||||
0.125,
|
||||
@@ -12329,7 +12328,7 @@
|
||||
"stack_spill_bytes": 0,
|
||||
"thread_occupancy": 100,
|
||||
"uniform_registers_used": 18,
|
||||
"work_registers_used": 31
|
||||
"work_registers_used": 32
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user