From a14712eaf83ce4e1108f8b9f12e9ab0ec16acbf8 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Thu, 18 Feb 2016 15:00:24 -0800 Subject: [PATCH] Suppress linebreaks in emoji ZWJ sequences Due to the way emoji ZWJ sequences are defined, the ICU line breaking algorithm determines that there are valid line breaks inside the sequence. This patch suppresses these line breaks. This is an adaptation of I225ebebc0f4186e4b8f48fee399c4a62b3f0218a into the nyc-dev branch. Bug: 25433289 Change-Id: I84b50b1e6ef13d436965eab389659d02a30d100f --- .../src/flutter/libs/minikin/WordBreaker.cpp | 29 ++++++++++++++++++- engine/src/flutter/tests/WordBreakerTests.cpp | 24 +++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/engine/src/flutter/libs/minikin/WordBreaker.cpp b/engine/src/flutter/libs/minikin/WordBreaker.cpp index ca69a50307..ec84c39f9a 100644 --- a/engine/src/flutter/libs/minikin/WordBreaker.cpp +++ b/engine/src/flutter/libs/minikin/WordBreaker.cpp @@ -25,6 +25,7 @@ namespace android { const uint32_t CHAR_SOFT_HYPHEN = 0x00AD; +const uint16_t CHAR_ZWJ = 0x200D; void WordBreaker::setLocale(const icu::Locale& locale) { UErrorCode status = U_ZERO_ERROR; @@ -62,6 +63,32 @@ enum ScanState { SAW_COLON_SLASH_SLASH, }; +/** + * Determine whether a line break at position i within the buffer buf is valid. This + * represents customization beyond the ICU behavior, because plain ICU provides some + * line break opportunities that we don't want. + **/ +static bool isBreakValid(uint16_t codeUnit, const uint16_t* buf, size_t bufEnd, size_t i) { + if (codeUnit == CHAR_SOFT_HYPHEN) { + return false; + } + if (codeUnit == CHAR_ZWJ) { + // Possible emoji ZWJ sequence + uint32_t next_codepoint; + U16_NEXT(buf, i, bufEnd, next_codepoint); + if (next_codepoint == 0x2764 || // HEAVY BLACK HEART + next_codepoint == 0x1F466 || // BOY + next_codepoint == 0x1F467 || // GIRL + next_codepoint == 0x1F468 || // MAN + next_codepoint == 0x1F469 || // WOMAN + next_codepoint == 0x1F48B || // KISS MARK + next_codepoint == 0x1F5E8) { // LEFT SPEECH BUBBLE + return false; + } + } + return true; +} + // Chicago Manual of Style recommends breaking after these characters in URLs and email addresses static bool breakAfter(uint16_t c) { return c == ':' || c == '=' || c == '&'; @@ -149,7 +176,7 @@ ssize_t WordBreaker::next() { result = mBreakIterator->next(); } } while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize - && mText[result - 1] == CHAR_SOFT_HYPHEN); + && !isBreakValid(mText[result - 1], mText, mTextSize, result)); mCurrent = (ssize_t)result; return mCurrent; } diff --git a/engine/src/flutter/tests/WordBreakerTests.cpp b/engine/src/flutter/tests/WordBreakerTests.cpp index 9662b2f7a4..6c5e4795c8 100644 --- a/engine/src/flutter/tests/WordBreakerTests.cpp +++ b/engine/src/flutter/tests/WordBreakerTests.cpp @@ -67,6 +67,30 @@ TEST_F(WordBreakerTest, softHyphen) { EXPECT_EQ(0, breaker.breakBadness()); } +TEST_F(WordBreakerTest, zwjEmojiSequences) { + uint16_t buf[] = { + // man + zwj + heart + zwj + man + 0xD83D, 0xDC68, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC68, + // woman + zwj + heart + zwj + woman + 0xD83D, 0xDC69, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC8B, 0x200D, 0xD83D, 0xDC69, + // eye + zwj + left speech bubble + 0xD83D, 0xDC41, 0x200D, 0xD83D, 0xDDE8, + }; + WordBreaker breaker; + breaker.setLocale(icu::Locale::getEnglish()); + breaker.setText(buf, NELEM(buf)); + EXPECT_EQ(0, breaker.current()); + EXPECT_EQ(7, breaker.next()); // after man + zwj + heart + zwj + man + EXPECT_EQ(0, breaker.wordStart()); + EXPECT_EQ(7, breaker.wordEnd()); + EXPECT_EQ(17, breaker.next()); // after woman + zwj + heart + zwj + woman + EXPECT_EQ(7, breaker.wordStart()); + EXPECT_EQ(17, breaker.wordEnd()); + EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end + EXPECT_EQ(17, breaker.wordStart()); + EXPECT_EQ(22, breaker.wordEnd()); +} + TEST_F(WordBreakerTest, punct) { uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd', '!', '!'};