diff --git a/engine/src/flutter/libs/minikin/GraphemeBreak.cpp b/engine/src/flutter/libs/minikin/GraphemeBreak.cpp index 56f3a52a5d..b1188e8d88 100644 --- a/engine/src/flutter/libs/minikin/GraphemeBreak.cpp +++ b/engine/src/flutter/libs/minikin/GraphemeBreak.cpp @@ -102,70 +102,102 @@ bool GraphemeBreak::isGraphemeBreak(const float* advances, const uint16_t* buf, if ((p1 == U_GCB_LVT || p1 == U_GCB_T) && p2 == U_GCB_T) { return false; } - // Rule GB8a that looks at even-off cases. - // - // sot (RI RI)* RI x RI - // [^RI] (RI RI)* RI x RI - // RI ÷ RI - if (p1 == U_GCB_REGIONAL_INDICATOR && p2 == U_GCB_REGIONAL_INDICATOR) { - // Look at up to 1000 code units. - start = std::max((ssize_t)start, (ssize_t)offset_back - 1000); - while (offset_back > start) { - U16_PREV(buf, start, offset_back, c1); - if (tailoredGraphemeClusterBreak(c1) != U_GCB_REGIONAL_INDICATOR) { - offset_back += U16_LENGTH(c1); - break; - } - } - - // The number 4 comes from the number of code units in a whole flag. - return (offset - offset_back) % 4 == 0; - } // Rule GB9, x (Extend | ZWJ); Rule GB9a, x SpacingMark; Rule GB9b, Prepend x if (p2 == U_GCB_EXTEND || p2 == U_GCB_ZWJ || p2 == U_GCB_SPACING_MARK || p1 == U_GCB_PREPEND) { return false; } - // Cluster Indic syllables together (tailoring of UAX #29). - // Immediately after each virama (that is not just a pure killer) followed by a letter, we - // check to see if the next character has a non-zero width assigned to it in the advances - // array. A zero width means a cluster is formed with the virama (so there is no grapheme - // break), while a non-zero width means a new cluster is started (so there may be a grapheme - // break). 
- if (u_getIntPropertyValue(c1, UCHAR_CANONICAL_COMBINING_CLASS) == 9 // virama - && !isPureKiller(c1) - && u_getIntPropertyValue(c2, UCHAR_GENERAL_CATEGORY) == U_OTHER_LETTER - && (advances == nullptr || advances[offset - start] == 0)) { - return false; + + // This is used to decide font-dependent grapheme clusters. If we don't have the advance + // information, we become conservative in grapheme breaking and assume that it has no advance. + const bool c2_has_advance = (advances != nullptr && advances[offset - start] != 0.0); + + // All the following rules are font-dependent, in the way that if we know c2 has an advance, + // we definitely know that it cannot form a grapheme with the character(s) before it. So we + // make the decision in favor of a grapheme break early. + if (c2_has_advance) { + return true; } - // Tailoring: make emoji sequences with ZWJ a single grapheme cluster + + // Note: For Rule GB10 and GB11 below, we do not use the Unicode line breaking properties for + // determining emoji-ness and carry our own data, because our data could be more fresh than what + // ICU provides. + // + // Tailored version of Rule GB10, (E_Base | EBG) Extend* × E_Modifier. + // The rule itself says do not break between emoji base and emoji modifiers, skipping all Extend + // characters. Variation selectors are considered Extend, so they are handled fine. + // + // We tailor this by requiring that an actual ligature is formed. If the font doesn't form a + // ligature, we allow a break before the modifier. 
+ if (isEmojiModifier(c2)) { + uint32_t c0 = c1; + size_t offset_backback = offset_back; + int32_t p0 = p1; + if (p0 == U_GCB_EXTEND && offset_backback > start) { + // skip over emoji variation selector + U16_PREV(buf, start, offset_backback, c0); + p0 = tailoredGraphemeClusterBreak(c0); + } + if (isEmojiBase(c0)) { + return false; + } + } + // Tailored version of Rule GB11, ZWJ × (Glue_After_Zwj | EBG) + // We try to make emoji sequences with ZWJ a single grapheme cluster, but only if they actually + // merge to one cluster. So we are more relaxed than the UAX #29 rules in accepting any emoji + // character after the ZWJ, but are tighter in that we only treat it as one cluster if a + // ligature is actually formed and we also require the character before the ZWJ to also be an + // emoji. if (p1 == U_GCB_ZWJ && isEmoji(c2) && offset_back > start) { // look at character before ZWJ to see that both can participate in an emoji zwj sequence uint32_t c0 = 0; - U16_PREV(buf, start, offset_back, c0); - if (c0 == 0xFE0F && offset_back > start) { + size_t offset_backback = offset_back; + U16_PREV(buf, start, offset_backback, c0); + if (c0 == 0xFE0F && offset_backback > start) { // skip over emoji variation selector - U16_PREV(buf, start, offset_back, c0); + U16_PREV(buf, start, offset_backback, c0); } if (isEmoji(c0)) { return false; } } - // Proposed Rule GB9c from http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf - // E_Base x E_Modifier - // TODO: Migrate to Rule GB10 and Rule GB11 with fixing following test cases in - // GraphemeBreak.tailoring and GraphemeBreak.emojiModifiers (Bug: 34211654) - // U+0628 U+200D U+2764 is expected to have grapheme boundary after U+200D. - // U+270C U+FE0E U+1F3FB is expected to have grapheme boundary after U+200D. 
- if (isEmojiModifier(c2)) { - if (c1 == 0xFE0F && offset_back > start) { - // skip over emoji variation selector - U16_PREV(buf, start, offset_back, c1); - } - if (isEmojiBase(c1)) { + // Tailored version of Rule GB12 and Rule GB13 that look at even-odd cases. + // sot (RI RI)* RI x RI + // [^RI] (RI RI)* RI x RI + // + // If we have font information, we have already broken the cluster if the second character + // had an advance; reaching this point means it had no advance, i.e. a ligature was formed. If + // we don't, we look back like UAX #29 recommends, but only up to 1000 code units. + if (p1 == U_GCB_REGIONAL_INDICATOR && p2 == U_GCB_REGIONAL_INDICATOR) { + if (advances != nullptr) { + // We have advances information. But if we are here, we already know c2 has no advance. + // So we should definitely disallow a break. return false; + } else { + // Look at up to 1000 code units. + const size_t lookback_barrier = std::max((ssize_t)start, (ssize_t)offset_back - 1000); + size_t offset_backback = offset_back; + while (offset_backback > lookback_barrier) { + uint32_t c0 = 0; + U16_PREV(buf, lookback_barrier, offset_backback, c0); + if (tailoredGraphemeClusterBreak(c0) != U_GCB_REGIONAL_INDICATOR) { + offset_backback += U16_LENGTH(c0); + break; + } + } + // The number 4 comes from the number of code units in a whole flag. + return (offset - offset_backback) % 4 == 0; } } - // Rule GB10, Any ÷ Any + // Cluster Indic syllables together (tailoring of UAX #29). + // Immediately after each virama (that is not just a pure killer) followed by a letter, we + // disallow grapheme breaks (if we are here, we don't know about advances, or we already know + // that c2 has no advance). 
+ if (u_getIntPropertyValue(c1, UCHAR_CANONICAL_COMBINING_CLASS) == 9 // virama + && !isPureKiller(c1) + && u_getIntPropertyValue(c2, UCHAR_GENERAL_CATEGORY) == U_OTHER_LETTER) { + return false; + } + // Rule GB999, Any ÷ Any return true; } diff --git a/engine/src/flutter/tests/unittest/GraphemeBreakTests.cpp b/engine/src/flutter/tests/unittest/GraphemeBreakTests.cpp index 96bd8a8e79..6720df6bef 100644 --- a/engine/src/flutter/tests/unittest/GraphemeBreakTests.cpp +++ b/engine/src/flutter/tests/unittest/GraphemeBreakTests.cpp @@ -91,7 +91,7 @@ TEST(GraphemeBreak, rules) { EXPECT_TRUE(IsBreak("U+11A8 | U+AC00")); // T x LV EXPECT_TRUE(IsBreak("U+11A8 | U+AC01")); // T x LVT - // Rule GB8a, Regional_Indicator x Regional_Indicator + // Rule GB12 and Rule GB13, Regional_Indicator x Regional_Indicator EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8")); EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | U+1F1FA U+1F1F8")); // Regional indicator pair (flag) EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8 U+1F1FA U+1F1F8")); // Regional indicator pair (flag) @@ -99,6 +99,17 @@ TEST(GraphemeBreak, rules) { EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | U+1F1FA")); // Regional indicator pair (flag) EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8 U+1F1FA")); // Regional indicator pair (flag) + // Same case as the two above, knowing that the first two characters ligate, which is what + // would typically happen. + const float firstPairLigated[] = {1.0, 0.0, 0.0, 0.0, 1.0, 0.0}; // Two entries per codepoint + EXPECT_TRUE(IsBreakWithAdvances(firstPairLigated, "U+1F1FA U+1F1F8 | U+1F1FA")); + EXPECT_FALSE(IsBreakWithAdvances(firstPairLigated, "U+1F1FA | U+1F1F8 U+1F1FA")); + // Repeat the tests, but now the font doesn't have a ligature for the first two characters, + // while it does have a ligature for the last two. This could happen for fonts that do not + // support some (potentially encoded later than they were developed) flags. 
+ const float secondPairLigated[] = {1.0, 0.0, 1.0, 0.0, 0.0, 0.0}; + EXPECT_FALSE(IsBreakWithAdvances(secondPairLigated, "U+1F1FA U+1F1F8 | U+1F1FA")); + EXPECT_TRUE(IsBreakWithAdvances(secondPairLigated, "U+1F1FA | U+1F1F8 U+1F1FA")); EXPECT_TRUE(IsBreak("'a' U+1F1FA U+1F1F8 | U+1F1FA")); // Regional indicator pair (flag) EXPECT_FALSE(IsBreak("'a' U+1F1FA | U+1F1F8 U+1F1FA")); // Regional indicator pair (flag) @@ -110,14 +121,15 @@ TEST(GraphemeBreak, rules) { EXPECT_FALSE( IsBreak("'a' U+1F1FA U+1F1F8 U+1F1FA | U+1F1F8")); // Regional indicator pair (flag) - // Rule GB9, x Extend + // Rule GB9, x (Extend | ZWJ) EXPECT_FALSE(IsBreak("'a' | U+0301")); // combining accent + EXPECT_FALSE(IsBreak("'a' | U+200D")); // ZWJ // Rule GB9a, x SpacingMark EXPECT_FALSE(IsBreak("U+0915 | U+093E")); // KA, AA (spacing mark) // Rule GB9b, Prepend x // see tailoring test for prepend, as current ICU doesn't have any characters in the class - // Rule GB10, Any ÷ Any + // Rule GB999, Any ÷ Any EXPECT_TRUE(IsBreak("'a' | 'b'")); EXPECT_TRUE(IsBreak("'f' | 'i'")); // probable ligature EXPECT_TRUE(IsBreak("U+0644 | U+0627")); // probable ligature, lam + alef @@ -198,8 +210,7 @@ TEST(GraphemeBreak, tailoring) { EXPECT_TRUE(IsBreakWithAdvances(separate, "U+0E01 U+0E3A | U+0E01")); // thai phinthu = pure killer - // suppress grapheme breaks in zwj emoji sequences, see - // http://www.unicode.org/emoji/charts/emoji-zwj-sequences.html + // suppress grapheme breaks in zwj emoji sequences EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+2764 U+FE0F U+200D U+1F48B U+200D U+1F468")); EXPECT_FALSE(IsBreak("U+1F469 U+200D U+2764 U+FE0F U+200D | U+1F48B U+200D U+1F468")); EXPECT_FALSE(IsBreak("U+1F469 U+200D U+2764 U+FE0F U+200D U+1F48B U+200D | U+1F468")); @@ -228,10 +239,42 @@ TEST(GraphemeBreak, emojiModifiers) { EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FF")); // boy + modifier EXPECT_FALSE(IsBreak("U+1F918 | U+1F3FF")); // sign of the horns + modifier EXPECT_FALSE(IsBreak("U+1F933 | U+1F3FF")); // selfie 
(Unicode 9) + modifier + // Repetition of the tests above, with the knowledge that they are ligated. + const float ligated1_2[] = {1.0, 0.0, 0.0}; + const float ligated2_2[] = {1.0, 0.0, 0.0, 0.0}; + EXPECT_FALSE(IsBreakWithAdvances(ligated1_2, "U+261D | U+1F3FB")); + EXPECT_FALSE(IsBreakWithAdvances(ligated1_2, "U+270C | U+1F3FB")); + EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FB")); + EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FC")); + EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FD")); + EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FE")); + EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FF")); + EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F918 | U+1F3FF")); + EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F933 | U+1F3FF")); + // Repetition of the tests above, with the knowledge that they are not ligated. + const float unligated1_2[] = {1.0, 1.0, 0.0}; + const float unligated2_2[] = {1.0, 0.0, 1.0, 0.0}; + EXPECT_TRUE(IsBreakWithAdvances(unligated1_2, "U+261D | U+1F3FB")); + EXPECT_TRUE(IsBreakWithAdvances(unligated1_2, "U+270C | U+1F3FB")); + EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FB")); + EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FC")); + EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FD")); + EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FE")); + EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FF")); + EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F918 | U+1F3FF")); + EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F933 | U+1F3FF")); - // adding emoji style variation selector doesn't affect grapheme cluster - EXPECT_TRUE(IsBreak("U+270C U+FE0E | U+1F3FB")); // victory hand + text style + modifier + // adding extend characters between emoji base and modifier doesn't affect grapheme cluster + EXPECT_FALSE(IsBreak("U+270C U+FE0E | U+1F3FB")); // victory hand + text style + modifier 
EXPECT_FALSE(IsBreak("U+270C U+FE0F | U+1F3FB")); // heart + emoji style + modifier + // Repetition of the two tests above, with the knowledge that they are ligated. + const float ligated1_1_2[] = {1.0, 0.0, 0.0, 0.0}; + EXPECT_FALSE(IsBreakWithAdvances(ligated1_1_2, "U+270C U+FE0E | U+1F3FB")); + EXPECT_FALSE(IsBreakWithAdvances(ligated1_1_2, "U+270C U+FE0F | U+1F3FB")); + // Repetition of the first two tests, with the knowledge that they are not ligated. + const float unligated1_1_2[] = {1.0, 0.0, 1.0, 0.0}; + EXPECT_TRUE(IsBreakWithAdvances(unligated1_1_2, "U+270C U+FE0E | U+1F3FB")); + EXPECT_TRUE(IsBreakWithAdvances(unligated1_1_2, "U+270C U+FE0F | U+1F3FB")); // heart is not an emoji base EXPECT_TRUE(IsBreak("U+2764 | U+1F3FB")); // heart + modifier @@ -241,17 +284,26 @@ TEST(GraphemeBreak, emojiModifiers) { // rat is not an emoji modifer EXPECT_TRUE(IsBreak("U+1F466 | U+1F400")); // boy + rat - } TEST(GraphemeBreak, genderBalancedEmoji) { // U+1F469 is WOMAN, U+200D is ZWJ, U+1F4BC is BRIEFCASE. EXPECT_FALSE(IsBreak("U+1F469 | U+200D U+1F4BC")); EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+1F4BC")); + // The above two cases, when the ligature is not supported in the font. We now expect a break + // between them. + const float unligated2_1_2[] = {1.0, 0.0, 0.0, 1.0, 0.0}; + EXPECT_FALSE(IsBreakWithAdvances(unligated2_1_2, "U+1F469 | U+200D U+1F4BC")); + EXPECT_TRUE(IsBreakWithAdvances(unligated2_1_2, "U+1F469 U+200D | U+1F4BC")); // U+2695 has now emoji property, so should be part of ZWJ sequence. EXPECT_FALSE(IsBreak("U+1F469 | U+200D U+2695")); EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+2695")); + // The above two cases, when the ligature is not supported in the font. We now expect a break + // between them. + const float unligated2_1_1[] = {1.0, 0.0, 0.0, 1.0}; + EXPECT_FALSE(IsBreakWithAdvances(unligated2_1_1, "U+1F469 | U+200D U+2695")); + EXPECT_TRUE(IsBreakWithAdvances(unligated2_1_1, "U+1F469 U+200D | U+2695")); } TEST(GraphemeBreak, offsets) {