Update emoji grapheme breaking rules

The rules are updated to the latest UAX #29, with tailorings based on the font in use: we can now use the clustering information calculated by Layout, so we will only disallow a grapheme break if an emoji ligature is actually formed. Test: Unit tests have been updated and pass. Bug: 30917298 Bug: 34211654 Change-Id: Idc0ef9f1f4f45dc45a50ed69e45c43ebfaea0306
2017-03-15 16:35:36 -07:00
parent fde7453c82
commit f2fd20ec54
2 changed files with 139 additions and 55 deletions
--- a/engine/src/flutter/libs/minikin/GraphemeBreak.cpp
+++ b/engine/src/flutter/libs/minikin/GraphemeBreak.cpp
@@ -102,70 +102,102 @@ bool GraphemeBreak::isGraphemeBreak(const float* advances, const uint16_t* buf,
    if ((p1 == U_GCB_LVT || p1 == U_GCB_T) && p2 == U_GCB_T) {
        return false;
    }
-    // Rule GB8a that looks at even-off cases.
-    //
-    // sot   (RI RI)*  RI x RI
-    // [^RI] (RI RI)*  RI x RI
-    //                 RI ÷ RI
-    if (p1 == U_GCB_REGIONAL_INDICATOR && p2 == U_GCB_REGIONAL_INDICATOR) {
-        // Look at up to 1000 code units.
-        start = std::max((ssize_t)start, (ssize_t)offset_back - 1000);
-        while (offset_back > start) {
-            U16_PREV(buf, start, offset_back, c1);
-            if (tailoredGraphemeClusterBreak(c1) != U_GCB_REGIONAL_INDICATOR) {
-                offset_back += U16_LENGTH(c1);
-                break;
-            }
-        }
-
-        // The number 4 comes from the number of code units in a whole flag.
-        return (offset - offset_back) % 4 == 0;
-    }
    // Rule GB9, x (Extend | ZWJ); Rule GB9a, x SpacingMark; Rule GB9b, Prepend x
    if (p2 == U_GCB_EXTEND || p2 == U_GCB_ZWJ || p2 == U_GCB_SPACING_MARK || p1 == U_GCB_PREPEND) {
        return false;
    }
-    // Cluster Indic syllables together (tailoring of UAX #29).
-    // Immediately after each virama (that is not just a pure killer) followed by a letter, we
-    // check to see if the next character has a non-zero width assigned to it in the advances
-    // array. A zero width means a cluster is formed with the virama (so there is no grapheme
-    // break), while a non-zero width means a new cluster is started (so there may be a grapheme
-    // break).
-    if (u_getIntPropertyValue(c1, UCHAR_CANONICAL_COMBINING_CLASS) == 9  // virama
-            && !isPureKiller(c1)
-            && u_getIntPropertyValue(c2, UCHAR_GENERAL_CATEGORY) == U_OTHER_LETTER
-            && (advances == nullptr || advances[offset - start] == 0)) {
-        return false;
+
+    // This is used to decide font-dependent grapheme clusters. If we don't have the advance
+    // information, we become conservative in grapheme breaking and assume that it has no advance.
+    const bool c2_has_advance = (advances != nullptr && advances[offset - start] != 0.0);
+
+    // All the following rules are font-dependent, in the way that if we know c2 has an advance,
+    // we definitely know that it cannot form a grapheme with the character(s) before it. So we
+    // make the decision in favor of a grapheme break early.
+    if (c2_has_advance) {
+        return true;
    }
-    // Tailoring: make emoji sequences with ZWJ a single grapheme cluster
+
+    // Note: For Rule GB10 and GB11 below, we do not use the Unicode line breaking properties for
+    // determining emoji-ness and carry our own data, because our data could be more fresh than what
+    // ICU provides.
+    //
+    // Tailored version of Rule GB10, (E_Base | EBG) Extend* × E_Modifier.
+    // The rule itself says do not break between emoji base and emoji modifiers, skipping all Extend
+    // characters. Variation selectors are considered Extend, so they are handled fine.
+    //
+    // We tailor this by requiring that an actual ligature is formed. If the font doesn't form a
+    // ligature, we allow a break before the modifier.
+    if (isEmojiModifier(c2)) {
+        uint32_t c0 = c1;
+        size_t offset_backback = offset_back;
+        int32_t p0 = p1;
+        if (p0 == U_GCB_EXTEND && offset_backback > start) {
+            // skip over one Extend character, such as an emoji variation selector
+            U16_PREV(buf, start, offset_backback, c0);
+            p0 = tailoredGraphemeClusterBreak(c0);
+        }
+        if (isEmojiBase(c0)) {
+            return false;
+        }
+    }
+    // Tailored version of Rule GB11, ZWJ × (Glue_After_Zwj | EBG)
+    // We try to make emoji sequences with ZWJ a single grapheme cluster, but only if they actually
+    // merge to one cluster. So we are more relaxed than the UAX #29 rules in accepting any emoji
+    // character after the ZWJ, but are tighter in that we only treat it as one cluster if a
+    // ligature is actually formed and we also require the character before the ZWJ to also be an
+    // emoji.
    if (p1 == U_GCB_ZWJ && isEmoji(c2) && offset_back > start) {
        // look at character before ZWJ to see that both can participate in an emoji zwj sequence
        uint32_t c0 = 0;
-        U16_PREV(buf, start, offset_back, c0);
-        if (c0 == 0xFE0F && offset_back > start) {
+        size_t offset_backback = offset_back;
+        U16_PREV(buf, start, offset_backback, c0);
+        if (c0 == 0xFE0F && offset_backback > start) {
            // skip over emoji variation selector
-            U16_PREV(buf, start, offset_back, c0);
+            U16_PREV(buf, start, offset_backback, c0);
        }
        if (isEmoji(c0)) {
            return false;
        }
    }
-    // Proposed Rule GB9c from http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
-    // E_Base x E_Modifier
-    // TODO: Migrate to Rule GB10 and Rule GB11 with fixing following test cases in
-    //       GraphemeBreak.tailoring and GraphemeBreak.emojiModifiers (Bug: 34211654)
-    // U+0628 U+200D U+2764 is expected to have grapheme boundary after U+200D.
-    // U+270C U+FE0E U+1F3FB is expected to have grapheme boundary after U+200D.
-    if (isEmojiModifier(c2)) {
-        if (c1 == 0xFE0F && offset_back > start) {
-            // skip over emoji variation selector
-            U16_PREV(buf, start, offset_back, c1);
-        }
-        if (isEmojiBase(c1)) {
+    // Tailored version of Rule GB12 and Rule GB13 that look at even-odd cases.
+    // sot   (RI RI)*  RI x RI
+    // [^RI] (RI RI)*  RI x RI
+    //
+    // If we have font information, we have already allowed a break if the second character had
+    // an advance; reaching this point means it has none, i.e. a ligature was formed. If we don't,
+    // we look back like UAX #29 recommends, but only up to 1000 code units.
+    if (p1 == U_GCB_REGIONAL_INDICATOR && p2 == U_GCB_REGIONAL_INDICATOR) {
+        if (advances != nullptr) {
+            // We have advances information. But if we are here, we already know c2 has no advance.
+            // So we should definitely disallow a break.
            return false;
+        } else {
+            // Look at up to 1000 code units.
+            const size_t lookback_barrier = std::max((ssize_t)start, (ssize_t)offset_back - 1000);
+            size_t offset_backback = offset_back;
+            while (offset_backback > lookback_barrier) {
+                uint32_t c0 = 0;
+                U16_PREV(buf, lookback_barrier, offset_backback, c0);
+                if (tailoredGraphemeClusterBreak(c0) != U_GCB_REGIONAL_INDICATOR) {
+                    offset_backback += U16_LENGTH(c0);
+                    break;
+                }
+            }
+            // The number 4 comes from the number of code units in a whole flag.
+            return (offset - offset_backback) % 4 == 0;
        }
    }
-    // Rule GB10, Any ÷ Any
+    // Cluster Indic syllables together (tailoring of UAX #29).
+    // Immediately after each virama (that is not just a pure killer) followed by a letter, we
+    // disallow grapheme breaks (if we are here, we don't know about advances, or we already know
+    // that c2 has no advance).
+    if (u_getIntPropertyValue(c1, UCHAR_CANONICAL_COMBINING_CLASS) == 9  // virama
+            && !isPureKiller(c1)
+            && u_getIntPropertyValue(c2, UCHAR_GENERAL_CATEGORY) == U_OTHER_LETTER) {
+        return false;
+    }
+    // Rule GB999, Any ÷ Any
    return true;
 }

--- a/engine/src/flutter/tests/unittest/GraphemeBreakTests.cpp
+++ b/engine/src/flutter/tests/unittest/GraphemeBreakTests.cpp
@@ -91,7 +91,7 @@ TEST(GraphemeBreak, rules) {
    EXPECT_TRUE(IsBreak("U+11A8 | U+AC00"));  // T x LV
    EXPECT_TRUE(IsBreak("U+11A8 | U+AC01"));  // T x LVT

-    // Rule GB8a, Regional_Indicator x Regional_Indicator
+    // Rule GB12 and Rule GB13, Regional_Indicator x Regional_Indicator
    EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8"));
    EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | U+1F1FA U+1F1F8")); // Regional indicator pair (flag)
    EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8 U+1F1FA U+1F1F8")); // Regional indicator pair (flag)
@@ -99,6 +99,17 @@ TEST(GraphemeBreak, rules) {

    EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | U+1F1FA"));  // Regional indicator pair (flag)
    EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8 U+1F1FA"));  // Regional indicator pair (flag)
+    // Same case as the two above, knowing that the first two characters ligate, which is what
+    // would typically happen.
+    const float firstPairLigated[] = {1.0, 0.0, 0.0, 0.0, 1.0, 0.0}; // Two entries per codepoint
+    EXPECT_TRUE(IsBreakWithAdvances(firstPairLigated, "U+1F1FA U+1F1F8 | U+1F1FA"));
+    EXPECT_FALSE(IsBreakWithAdvances(firstPairLigated, "U+1F1FA | U+1F1F8 U+1F1FA"));
+    // Repeat the tests, but now the font doesn't have a ligature for the first two characters,
+    // while it does have a ligature for the last two. This could happen for fonts that do not
+    // support some (potentially encoded later than they were developed) flags.
+    const float secondPairLigated[] = {1.0, 0.0, 1.0, 0.0, 0.0, 0.0};
+    EXPECT_FALSE(IsBreakWithAdvances(secondPairLigated, "U+1F1FA U+1F1F8 | U+1F1FA"));
+    EXPECT_TRUE(IsBreakWithAdvances(secondPairLigated, "U+1F1FA | U+1F1F8 U+1F1FA"));

    EXPECT_TRUE(IsBreak("'a' U+1F1FA U+1F1F8 | U+1F1FA"));  // Regional indicator pair (flag)
    EXPECT_FALSE(IsBreak("'a' U+1F1FA | U+1F1F8 U+1F1FA"));  // Regional indicator pair (flag)
@@ -110,14 +121,15 @@ TEST(GraphemeBreak, rules) {
    EXPECT_FALSE(
            IsBreak("'a' U+1F1FA U+1F1F8 U+1F1FA | U+1F1F8"));  // Regional indicator pair (flag)

-    // Rule GB9, x Extend
+    // Rule GB9, x (Extend | ZWJ)
    EXPECT_FALSE(IsBreak("'a' | U+0301"));  // combining accent
+    EXPECT_FALSE(IsBreak("'a' | U+200D"));  // ZWJ
    // Rule GB9a, x SpacingMark
    EXPECT_FALSE(IsBreak("U+0915 | U+093E"));  // KA, AA (spacing mark)
    // Rule GB9b, Prepend x
    // see tailoring test for prepend, as current ICU doesn't have any characters in the class

-    // Rule GB10, Any ÷ Any
+    // Rule GB999, Any ÷ Any
    EXPECT_TRUE(IsBreak("'a' | 'b'"));
    EXPECT_TRUE(IsBreak("'f' | 'i'"));  // probable ligature
    EXPECT_TRUE(IsBreak("U+0644 | U+0627"));  // probable ligature, lam + alef
@@ -198,8 +210,7 @@ TEST(GraphemeBreak, tailoring) {
    EXPECT_TRUE(IsBreakWithAdvances(separate,
            "U+0E01 U+0E3A | U+0E01"));  // thai phinthu = pure killer

-    // suppress grapheme breaks in zwj emoji sequences, see
-    // http://www.unicode.org/emoji/charts/emoji-zwj-sequences.html
+    // suppress grapheme breaks in zwj emoji sequences
    EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+2764 U+FE0F U+200D U+1F48B U+200D U+1F468"));
    EXPECT_FALSE(IsBreak("U+1F469 U+200D U+2764 U+FE0F U+200D | U+1F48B U+200D U+1F468"));
    EXPECT_FALSE(IsBreak("U+1F469 U+200D U+2764 U+FE0F U+200D U+1F48B U+200D | U+1F468"));
@@ -228,10 +239,42 @@ TEST(GraphemeBreak, emojiModifiers) {
    EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FF"));  // boy + modifier
    EXPECT_FALSE(IsBreak("U+1F918 | U+1F3FF"));  // sign of the horns + modifier
    EXPECT_FALSE(IsBreak("U+1F933 | U+1F3FF"));  // selfie (Unicode 9) + modifier
+    // Repetition of the tests above, with the knowledge that they are ligated.
+    const float ligated1_2[] = {1.0, 0.0, 0.0};
+    const float ligated2_2[] = {1.0, 0.0, 0.0, 0.0};
+    EXPECT_FALSE(IsBreakWithAdvances(ligated1_2, "U+261D | U+1F3FB"));
+    EXPECT_FALSE(IsBreakWithAdvances(ligated1_2, "U+270C | U+1F3FB"));
+    EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FB"));
+    EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FC"));
+    EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FD"));
+    EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FE"));
+    EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FF"));
+    EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F918 | U+1F3FF"));
+    EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F933 | U+1F3FF"));
+    // Repetition of the tests above, with the knowledge that they are not ligated.
+    const float unligated1_2[] = {1.0, 1.0, 0.0};
+    const float unligated2_2[] = {1.0, 0.0, 1.0, 0.0};
+    EXPECT_TRUE(IsBreakWithAdvances(unligated1_2, "U+261D | U+1F3FB"));
+    EXPECT_TRUE(IsBreakWithAdvances(unligated1_2, "U+270C | U+1F3FB"));
+    EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FB"));
+    EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FC"));
+    EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FD"));
+    EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FE"));
+    EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FF"));
+    EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F918 | U+1F3FF"));
+    EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F933 | U+1F3FF"));

-    // adding emoji style variation selector doesn't affect grapheme cluster
-    EXPECT_TRUE(IsBreak("U+270C U+FE0E | U+1F3FB"));  // victory hand + text style + modifier
+    // adding extend characters between emoji base and modifier doesn't affect grapheme cluster
+    EXPECT_FALSE(IsBreak("U+270C U+FE0E | U+1F3FB"));  // victory hand + text style + modifier
    EXPECT_FALSE(IsBreak("U+270C U+FE0F | U+1F3FB"));  // heart + emoji style + modifier
+    // Repetition of the two tests above, with the knowledge that they are ligated.
+    const float ligated1_1_2[] = {1.0, 0.0, 0.0, 0.0};
+    EXPECT_FALSE(IsBreakWithAdvances(ligated1_1_2, "U+270C U+FE0E | U+1F3FB"));
+    EXPECT_FALSE(IsBreakWithAdvances(ligated1_1_2, "U+270C U+FE0F | U+1F3FB"));
+    // Repetition of the first two tests, with the knowledge that they are not ligated.
+    const float unligated1_1_2[] = {1.0, 0.0, 1.0, 0.0};
+    EXPECT_TRUE(IsBreakWithAdvances(unligated1_1_2, "U+270C U+FE0E | U+1F3FB"));
+    EXPECT_TRUE(IsBreakWithAdvances(unligated1_1_2, "U+270C U+FE0F | U+1F3FB"));

    // heart is not an emoji base
    EXPECT_TRUE(IsBreak("U+2764 | U+1F3FB"));  // heart + modifier
@@ -241,17 +284,26 @@ TEST(GraphemeBreak, emojiModifiers) {

    // rat is not an emoji modifer
    EXPECT_TRUE(IsBreak("U+1F466 | U+1F400"));  // boy + rat
-
 }

 TEST(GraphemeBreak, genderBalancedEmoji) {
    // U+1F469 is WOMAN, U+200D is ZWJ, U+1F4BC is BRIEFCASE.
    EXPECT_FALSE(IsBreak("U+1F469 | U+200D U+1F4BC"));
    EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+1F4BC"));
+    // The above two cases, when the ligature is not supported in the font. We now expect a break
+    // after the ZWJ, but still no break between the first emoji and the ZWJ.
+    const float unligated2_1_2[] = {1.0, 0.0, 0.0, 1.0, 0.0};
+    EXPECT_FALSE(IsBreakWithAdvances(unligated2_1_2, "U+1F469 | U+200D U+1F4BC"));
+    EXPECT_TRUE(IsBreakWithAdvances(unligated2_1_2, "U+1F469 U+200D | U+1F4BC"));

    // U+2695 has now emoji property, so should be part of ZWJ sequence.
    EXPECT_FALSE(IsBreak("U+1F469 | U+200D U+2695"));
    EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+2695"));
+    // The above two cases, when the ligature is not supported in the font. We now expect a break
+    // after the ZWJ, but still no break between the first emoji and the ZWJ.
+    const float unligated2_1_1[] = {1.0, 0.0, 0.0, 1.0};
+    EXPECT_FALSE(IsBreakWithAdvances(unligated2_1_1, "U+1F469 | U+200D U+2695"));
+    EXPECT_TRUE(IsBreakWithAdvances(unligated2_1_1, "U+1F469 U+200D | U+2695"));
 }

 TEST(GraphemeBreak, offsets) {