forked from firka/flutter
Update emoji grapheme breaking rules
The rules are updated to the latest UAX #29, with tailorings based on the font in use: we can now use the clustering information calculated by Layout, so we will only disallow a grapheme break if an emoji ligature is actually formed. Test: Unit tests have been updated and pass. Bug: 30917298 Bug: 34211654 Change-Id: Idc0ef9f1f4f45dc45a50ed69e45c43ebfaea0306
This commit is contained in:
@@ -102,70 +102,102 @@ bool GraphemeBreak::isGraphemeBreak(const float* advances, const uint16_t* buf,
|
||||
if ((p1 == U_GCB_LVT || p1 == U_GCB_T) && p2 == U_GCB_T) {
|
||||
return false;
|
||||
}
|
||||
// Rule GB8a that looks at even-odd cases.
|
||||
//
|
||||
// sot (RI RI)* RI x RI
|
||||
// [^RI] (RI RI)* RI x RI
|
||||
// RI ÷ RI
|
||||
if (p1 == U_GCB_REGIONAL_INDICATOR && p2 == U_GCB_REGIONAL_INDICATOR) {
|
||||
// Look at up to 1000 code units.
|
||||
start = std::max((ssize_t)start, (ssize_t)offset_back - 1000);
|
||||
while (offset_back > start) {
|
||||
U16_PREV(buf, start, offset_back, c1);
|
||||
if (tailoredGraphemeClusterBreak(c1) != U_GCB_REGIONAL_INDICATOR) {
|
||||
offset_back += U16_LENGTH(c1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// The number 4 comes from the number of code units in a whole flag.
|
||||
return (offset - offset_back) % 4 == 0;
|
||||
}
|
||||
// Rule GB9, x (Extend | ZWJ); Rule GB9a, x SpacingMark; Rule GB9b, Prepend x
|
||||
if (p2 == U_GCB_EXTEND || p2 == U_GCB_ZWJ || p2 == U_GCB_SPACING_MARK || p1 == U_GCB_PREPEND) {
|
||||
return false;
|
||||
}
|
||||
// Cluster Indic syllables together (tailoring of UAX #29).
|
||||
// Immediately after each virama (that is not just a pure killer) followed by a letter, we
|
||||
// check to see if the next character has a non-zero width assigned to it in the advances
|
||||
// array. A zero width means a cluster is formed with the virama (so there is no grapheme
|
||||
// break), while a non-zero width means a new cluster is started (so there may be a grapheme
|
||||
// break).
|
||||
if (u_getIntPropertyValue(c1, UCHAR_CANONICAL_COMBINING_CLASS) == 9 // virama
|
||||
&& !isPureKiller(c1)
|
||||
&& u_getIntPropertyValue(c2, UCHAR_GENERAL_CATEGORY) == U_OTHER_LETTER
|
||||
&& (advances == nullptr || advances[offset - start] == 0)) {
|
||||
return false;
|
||||
|
||||
// This is used to decide font-dependent grapheme clusters. If we don't have the advance
|
||||
// information, we become conservative in grapheme breaking and assume that it has no advance.
|
||||
const bool c2_has_advance = (advances != nullptr && advances[offset - start] != 0.0);
|
||||
|
||||
// All the following rules are font-dependent, in the way that if we know c2 has an advance,
|
||||
// we definitely know that it cannot form a grapheme with the character(s) before it. So we
|
||||
// make the decision in favor of a grapheme break early.
|
||||
if (c2_has_advance) {
|
||||
return true;
|
||||
}
|
||||
// Tailoring: make emoji sequences with ZWJ a single grapheme cluster
|
||||
|
||||
// Note: For Rule GB10 and GB11 below, we do not use the Unicode line breaking properties for
|
||||
// determining emoji-ness and carry our own data, because our data could be more fresh than what
|
||||
// ICU provides.
|
||||
//
|
||||
// Tailored version of Rule GB10, (E_Base | EBG) Extend* × E_Modifier.
|
||||
// The rule itself says do not break between emoji base and emoji modifiers, skipping all Extend
|
||||
// characters. Variation selectors are considered Extend, so they are handled fine.
|
||||
//
|
||||
// We tailor this by requiring that an actual ligature is formed. If the font doesn't form a
|
||||
// ligature, we allow a break before the modifier.
|
||||
if (isEmojiModifier(c2)) {
|
||||
uint32_t c0 = c1;
|
||||
size_t offset_backback = offset_back;
|
||||
int32_t p0 = p1;
|
||||
if (p0 == U_GCB_EXTEND && offset_backback > start) {
|
||||
// skip over emoji variation selector
|
||||
U16_PREV(buf, start, offset_backback, c0);
|
||||
p0 = tailoredGraphemeClusterBreak(c0);
|
||||
}
|
||||
if (isEmojiBase(c0)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// Tailored version of Rule GB11, ZWJ × (Glue_After_Zwj | EBG)
|
||||
// We try to make emoji sequences with ZWJ a single grapheme cluster, but only if they actually
|
||||
// merge to one cluster. So we are more relaxed than the UAX #29 rules in accepting any emoji
|
||||
// character after the ZWJ, but are tighter in that we only treat it as one cluster if a
|
||||
// ligature is actually formed and we also require the character before the ZWJ to also be an
|
||||
// emoji.
|
||||
if (p1 == U_GCB_ZWJ && isEmoji(c2) && offset_back > start) {
|
||||
// look at character before ZWJ to see that both can participate in an emoji zwj sequence
|
||||
uint32_t c0 = 0;
|
||||
U16_PREV(buf, start, offset_back, c0);
|
||||
if (c0 == 0xFE0F && offset_back > start) {
|
||||
size_t offset_backback = offset_back;
|
||||
U16_PREV(buf, start, offset_backback, c0);
|
||||
if (c0 == 0xFE0F && offset_backback > start) {
|
||||
// skip over emoji variation selector
|
||||
U16_PREV(buf, start, offset_back, c0);
|
||||
U16_PREV(buf, start, offset_backback, c0);
|
||||
}
|
||||
if (isEmoji(c0)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// Proposed Rule GB9c from http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
|
||||
// E_Base x E_Modifier
|
||||
// TODO: Migrate to Rule GB10 and Rule GB11 with fixing following test cases in
|
||||
// GraphemeBreak.tailoring and GraphemeBreak.emojiModifiers (Bug: 34211654)
|
||||
// U+0628 U+200D U+2764 is expected to have grapheme boundary after U+200D.
|
||||
// U+270C U+FE0E U+1F3FB is expected to have grapheme boundary after U+FE0E.
|
||||
if (isEmojiModifier(c2)) {
|
||||
if (c1 == 0xFE0F && offset_back > start) {
|
||||
// skip over emoji variation selector
|
||||
U16_PREV(buf, start, offset_back, c1);
|
||||
}
|
||||
if (isEmojiBase(c1)) {
|
||||
// Tailored version of Rule GB12 and Rule GB13 that look at even-odd cases.
|
||||
// sot (RI RI)* RI x RI
|
||||
// [^RI] (RI RI)* RI x RI
|
||||
//
|
||||
// If we have font information, we have already broken the cluster if and only if the second
|
||||
// character had an advance, which means no ligature was formed. If we don't, we look back like
|
||||
// UAX #29 recommends, but only up to 1000 code units.
|
||||
if (p1 == U_GCB_REGIONAL_INDICATOR && p2 == U_GCB_REGIONAL_INDICATOR) {
|
||||
if (advances != nullptr) {
|
||||
// We have advances information. But if we are here, we already know c2 has no advance.
|
||||
// So we should definitely disallow a break.
|
||||
return false;
|
||||
} else {
|
||||
// Look at up to 1000 code units.
|
||||
const size_t lookback_barrier = std::max((ssize_t)start, (ssize_t)offset_back - 1000);
|
||||
size_t offset_backback = offset_back;
|
||||
while (offset_backback > lookback_barrier) {
|
||||
uint32_t c0 = 0;
|
||||
U16_PREV(buf, lookback_barrier, offset_backback, c0);
|
||||
if (tailoredGraphemeClusterBreak(c0) != U_GCB_REGIONAL_INDICATOR) {
|
||||
offset_backback += U16_LENGTH(c0);
|
||||
break;
|
||||
}
|
||||
}
|
||||
// The number 4 comes from the number of code units in a whole flag.
|
||||
return (offset - offset_backback) % 4 == 0;
|
||||
}
|
||||
}
|
||||
// Rule GB10, Any ÷ Any
|
||||
// Cluster Indic syllables together (tailoring of UAX #29).
|
||||
// Immediately after each virama (that is not just a pure killer) followed by a letter, we
|
||||
// disallow grapheme breaks (if we are here, we don't know about advances, or we already know
|
||||
// that c2 has no advance).
|
||||
if (u_getIntPropertyValue(c1, UCHAR_CANONICAL_COMBINING_CLASS) == 9 // virama
|
||||
&& !isPureKiller(c1)
|
||||
&& u_getIntPropertyValue(c2, UCHAR_GENERAL_CATEGORY) == U_OTHER_LETTER) {
|
||||
return false;
|
||||
}
|
||||
// Rule GB999, Any ÷ Any
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -91,7 +91,7 @@ TEST(GraphemeBreak, rules) {
|
||||
EXPECT_TRUE(IsBreak("U+11A8 | U+AC00")); // T x LV
|
||||
EXPECT_TRUE(IsBreak("U+11A8 | U+AC01")); // T x LVT
|
||||
|
||||
// Rule GB8a, Regional_Indicator x Regional_Indicator
|
||||
// Rule GB12 and Rule GB13, Regional_Indicator x Regional_Indicator
|
||||
EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8"));
|
||||
EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | U+1F1FA U+1F1F8")); // Regional indicator pair (flag)
|
||||
EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8 U+1F1FA U+1F1F8")); // Regional indicator pair (flag)
|
||||
@@ -99,6 +99,17 @@ TEST(GraphemeBreak, rules) {
|
||||
|
||||
EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | U+1F1FA")); // Regional indicator pair (flag)
|
||||
EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8 U+1F1FA")); // Regional indicator pair (flag)
|
||||
// Same case as the two above, knowing that the first two characters ligate, which is what
|
||||
// would typically happen.
|
||||
const float firstPairLigated[] = {1.0, 0.0, 0.0, 0.0, 1.0, 0.0}; // Two entries per codepoint
|
||||
EXPECT_TRUE(IsBreakWithAdvances(firstPairLigated, "U+1F1FA U+1F1F8 | U+1F1FA"));
|
||||
EXPECT_FALSE(IsBreakWithAdvances(firstPairLigated, "U+1F1FA | U+1F1F8 U+1F1FA"));
|
||||
// Repeat the tests, but now the font doesn't have a ligature for the first two characters,
|
||||
// while it does have a ligature for the last two. This could happen for fonts that do not
|
||||
// support some (potentially encoded later than they were developed) flags.
|
||||
const float secondPairLigated[] = {1.0, 0.0, 1.0, 0.0, 0.0, 0.0};
|
||||
EXPECT_FALSE(IsBreakWithAdvances(secondPairLigated, "U+1F1FA U+1F1F8 | U+1F1FA"));
|
||||
EXPECT_TRUE(IsBreakWithAdvances(secondPairLigated, "U+1F1FA | U+1F1F8 U+1F1FA"));
|
||||
|
||||
EXPECT_TRUE(IsBreak("'a' U+1F1FA U+1F1F8 | U+1F1FA")); // Regional indicator pair (flag)
|
||||
EXPECT_FALSE(IsBreak("'a' U+1F1FA | U+1F1F8 U+1F1FA")); // Regional indicator pair (flag)
|
||||
@@ -110,14 +121,15 @@ TEST(GraphemeBreak, rules) {
|
||||
EXPECT_FALSE(
|
||||
IsBreak("'a' U+1F1FA U+1F1F8 U+1F1FA | U+1F1F8")); // Regional indicator pair (flag)
|
||||
|
||||
// Rule GB9, x Extend
|
||||
// Rule GB9, x (Extend | ZWJ)
|
||||
EXPECT_FALSE(IsBreak("'a' | U+0301")); // combining accent
|
||||
EXPECT_FALSE(IsBreak("'a' | U+200D")); // ZWJ
|
||||
// Rule GB9a, x SpacingMark
|
||||
EXPECT_FALSE(IsBreak("U+0915 | U+093E")); // KA, AA (spacing mark)
|
||||
// Rule GB9b, Prepend x
|
||||
// see tailoring test for prepend, as current ICU doesn't have any characters in the class
|
||||
|
||||
// Rule GB10, Any ÷ Any
|
||||
// Rule GB999, Any ÷ Any
|
||||
EXPECT_TRUE(IsBreak("'a' | 'b'"));
|
||||
EXPECT_TRUE(IsBreak("'f' | 'i'")); // probable ligature
|
||||
EXPECT_TRUE(IsBreak("U+0644 | U+0627")); // probable ligature, lam + alef
|
||||
@@ -198,8 +210,7 @@ TEST(GraphemeBreak, tailoring) {
|
||||
EXPECT_TRUE(IsBreakWithAdvances(separate,
|
||||
"U+0E01 U+0E3A | U+0E01")); // thai phinthu = pure killer
|
||||
|
||||
// suppress grapheme breaks in zwj emoji sequences, see
|
||||
// http://www.unicode.org/emoji/charts/emoji-zwj-sequences.html
|
||||
// suppress grapheme breaks in zwj emoji sequences
|
||||
EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+2764 U+FE0F U+200D U+1F48B U+200D U+1F468"));
|
||||
EXPECT_FALSE(IsBreak("U+1F469 U+200D U+2764 U+FE0F U+200D | U+1F48B U+200D U+1F468"));
|
||||
EXPECT_FALSE(IsBreak("U+1F469 U+200D U+2764 U+FE0F U+200D U+1F48B U+200D | U+1F468"));
|
||||
@@ -228,10 +239,42 @@ TEST(GraphemeBreak, emojiModifiers) {
|
||||
EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FF")); // boy + modifier
|
||||
EXPECT_FALSE(IsBreak("U+1F918 | U+1F3FF")); // sign of the horns + modifier
|
||||
EXPECT_FALSE(IsBreak("U+1F933 | U+1F3FF")); // selfie (Unicode 9) + modifier
|
||||
// Repetition of the tests above, with the knowledge that they are ligated.
|
||||
const float ligated1_2[] = {1.0, 0.0, 0.0};
|
||||
const float ligated2_2[] = {1.0, 0.0, 0.0, 0.0};
|
||||
EXPECT_FALSE(IsBreakWithAdvances(ligated1_2, "U+261D | U+1F3FB"));
|
||||
EXPECT_FALSE(IsBreakWithAdvances(ligated1_2, "U+270C | U+1F3FB"));
|
||||
EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FB"));
|
||||
EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FC"));
|
||||
EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FD"));
|
||||
EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FE"));
|
||||
EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FF"));
|
||||
EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F918 | U+1F3FF"));
|
||||
EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F933 | U+1F3FF"));
|
||||
// Repetition of the tests above, with the knowledge that they are not ligated.
|
||||
const float unligated1_2[] = {1.0, 1.0, 0.0};
|
||||
const float unligated2_2[] = {1.0, 0.0, 1.0, 0.0};
|
||||
EXPECT_TRUE(IsBreakWithAdvances(unligated1_2, "U+261D | U+1F3FB"));
|
||||
EXPECT_TRUE(IsBreakWithAdvances(unligated1_2, "U+270C | U+1F3FB"));
|
||||
EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FB"));
|
||||
EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FC"));
|
||||
EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FD"));
|
||||
EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FE"));
|
||||
EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FF"));
|
||||
EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F918 | U+1F3FF"));
|
||||
EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F933 | U+1F3FF"));
|
||||
|
||||
// adding emoji style variation selector doesn't affect grapheme cluster
|
||||
EXPECT_TRUE(IsBreak("U+270C U+FE0E | U+1F3FB")); // victory hand + text style + modifier
|
||||
// adding extend characters between emoji base and modifier doesn't affect grapheme cluster
|
||||
EXPECT_FALSE(IsBreak("U+270C U+FE0E | U+1F3FB")); // victory hand + text style + modifier
|
||||
EXPECT_FALSE(IsBreak("U+270C U+FE0F | U+1F3FB")); // victory hand + emoji style + modifier
|
||||
// Repetition of the two tests above, with the knowledge that they are ligated.
|
||||
const float ligated1_1_2[] = {1.0, 0.0, 0.0, 0.0};
|
||||
EXPECT_FALSE(IsBreakWithAdvances(ligated1_1_2, "U+270C U+FE0E | U+1F3FB"));
|
||||
EXPECT_FALSE(IsBreakWithAdvances(ligated1_1_2, "U+270C U+FE0F | U+1F3FB"));
|
||||
// Repetition of the first two tests, with the knowledge that they are not ligated.
|
||||
const float unligated1_1_2[] = {1.0, 0.0, 1.0, 0.0};
|
||||
EXPECT_TRUE(IsBreakWithAdvances(unligated1_1_2, "U+270C U+FE0E | U+1F3FB"));
|
||||
EXPECT_TRUE(IsBreakWithAdvances(unligated1_1_2, "U+270C U+FE0F | U+1F3FB"));
|
||||
|
||||
// heart is not an emoji base
|
||||
EXPECT_TRUE(IsBreak("U+2764 | U+1F3FB")); // heart + modifier
|
||||
@@ -241,17 +284,26 @@ TEST(GraphemeBreak, emojiModifiers) {
|
||||
|
||||
// rat is not an emoji modifier
|
||||
EXPECT_TRUE(IsBreak("U+1F466 | U+1F400")); // boy + rat
|
||||
|
||||
}
|
||||
|
||||
// Checks grapheme breaking around WOMAN + ZWJ + <emoji> sequences, both without advance data
// (IsBreak) and with advances describing a font that does not ligate the sequence
// (IsBreakWithAdvances). NOTE(review): the '|' inside each test string appears to mark the offset
// being queried, and the advance arrays appear to hold one entry per UTF-16 code unit (two for
// supplementary characters) -- confirm against the helper definitions elsewhere in this file.
TEST(GraphemeBreak, genderBalancedEmoji) {
|
||||
// U+1F469 is WOMAN, U+200D is ZWJ, U+1F4BC is BRIEFCASE. Without advances, no break is expected.
|
||||
EXPECT_FALSE(IsBreak("U+1F469 | U+200D U+1F4BC"));
|
||||
EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+1F4BC"));
|
||||
// The same two positions when the font does not support the ligature (BRIEFCASE keeps a non-zero
|
||||
// advance): still no break before the ZWJ, but a break is now expected after it.
|
||||
const float unligated2_1_2[] = {1.0, 0.0, 0.0, 1.0, 0.0};
|
||||
EXPECT_FALSE(IsBreakWithAdvances(unligated2_1_2, "U+1F469 | U+200D U+1F4BC"));
|
||||
EXPECT_TRUE(IsBreakWithAdvances(unligated2_1_2, "U+1F469 U+200D | U+1F4BC"));
|
||||

||||
// U+2695 now has the Emoji property, so it should be treated as part of a ZWJ sequence.
|
||||
EXPECT_FALSE(IsBreak("U+1F469 | U+200D U+2695"));
|
||||
EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+2695"));
|
||||
// The same two positions when the font does not support the ligature (U+2695 keeps a non-zero
|
||||
// advance): still no break before the ZWJ, but a break is now expected after it.
|
||||
const float unligated2_1_1[] = {1.0, 0.0, 0.0, 1.0};
|
||||
EXPECT_FALSE(IsBreakWithAdvances(unligated2_1_1, "U+1F469 | U+200D U+2695"));
|
||||
EXPECT_TRUE(IsBreakWithAdvances(unligated2_1_1, "U+1F469 U+200D | U+2695"));
|
||||
}
|
||||
|
||||
TEST(GraphemeBreak, offsets) {
|
||||
|
||||
Reference in New Issue
Block a user