Suppress linebreaks in emoji ZWJ sequences

Due to the way emoji ZWJ sequences are defined, the ICU line breaking
algorithm determines that there are valid line breaks inside the
sequence. This patch suppresses these line breaks.

This is an adaptation of I225ebebc0f4186e4b8f48fee399c4a62b3f0218a
into the nyc-dev branch.

Bug: 25433289
Change-Id: I84b50b1e6ef13d436965eab389659d02a30d100f
This commit is contained in:
Raph Levien
2016-02-18 15:00:24 -08:00
parent 72ab39455f
commit a14712eaf8
2 changed files with 52 additions and 1 deletions

View File

@@ -25,6 +25,7 @@
namespace android {
const uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
const uint16_t CHAR_ZWJ = 0x200D;
void WordBreaker::setLocale(const icu::Locale& locale) {
UErrorCode status = U_ZERO_ERROR;
@@ -62,6 +63,32 @@ enum ScanState {
SAW_COLON_SLASH_SLASH,
};
/**
 * Determine whether a line break at position i within the buffer buf is valid. This
 * represents customization beyond the ICU behavior, because plain ICU provides some
 * line break opportunities that we don't want.
 *
 * @param codeUnit the UTF-16 code unit just before the candidate break (the caller in
 *                 WordBreaker::next() passes mText[result - 1])
 * @param buf      the UTF-16 text being scanned
 * @param bufEnd   the number of code units in buf
 * @param i        the offset of the candidate break within buf
 * @return false when the break must be suppressed: either it directly follows a soft
 *         hyphen, or it follows a ZWJ whose next code point is one of the emoji listed
 *         below; true otherwise
 **/
static bool isBreakValid(uint16_t codeUnit, const uint16_t* buf, size_t bufEnd, size_t i) {
    if (codeUnit == CHAR_SOFT_HYPHEN) {
        return false;
    }
    // U16_NEXT reads buf[i] unconditionally, so only look ahead when there is at least
    // one code unit left; a ZWJ that ends the buffer cannot start an emoji sequence.
    if (codeUnit == CHAR_ZWJ && i < bufEnd) {
        // Possible emoji ZWJ sequence: decode the code point that follows the ZWJ.
        // A local offset is advanced so the parameter i keeps its meaning.
        size_t next_offset = i;
        uint32_t next_codepoint;
        U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
        switch (next_codepoint) {
            case 0x2764:   // HEAVY BLACK HEART
            case 0x1F466:  // BOY
            case 0x1F467:  // GIRL
            case 0x1F468:  // MAN
            case 0x1F469:  // WOMAN
            case 0x1F48B:  // KISS MARK
            case 0x1F5E8:  // LEFT SPEECH BUBBLE
                return false;
            default:
                break;
        }
    }
    return true;
}
// Chicago Manual of Style recommends breaking after these characters in URLs and email addresses
static bool breakAfter(uint16_t c) {
return c == ':' || c == '=' || c == '&';
@@ -149,7 +176,7 @@ ssize_t WordBreaker::next() {
result = mBreakIterator->next();
}
} while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize
&& mText[result - 1] == CHAR_SOFT_HYPHEN);
&& !isBreakValid(mText[result - 1], mText, mTextSize, result));
mCurrent = (ssize_t)result;
return mCurrent;
}

View File

@@ -67,6 +67,30 @@ TEST_F(WordBreakerTest, softHyphen) {
EXPECT_EQ(0, breaker.breakBadness());
}
// Checks that no break is reported inside an emoji ZWJ sequence: three sequences are laid
// out back to back and the only breaks expected are the boundaries between them.
TEST_F(WordBreakerTest, zwjEmojiSequences) {
    uint16_t text[] = {
        // offsets [0, 7): MAN, ZWJ, HEAVY BLACK HEART, ZWJ, MAN
        0xD83D, 0xDC68, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC68,
        // offsets [7, 17): WOMAN, ZWJ, HEAVY BLACK HEART, ZWJ, KISS MARK, ZWJ, WOMAN
        0xD83D, 0xDC69, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC8B, 0x200D, 0xD83D, 0xDC69,
        // offsets [17, 22): EYE, ZWJ, LEFT SPEECH BUBBLE
        0xD83D, 0xDC41, 0x200D, 0xD83D, 0xDDE8,
    };
    WordBreaker wordBreaker;
    wordBreaker.setLocale(icu::Locale::getEnglish());
    wordBreaker.setText(text, NELEM(text));

    // Before any call to next(), the breaker sits at the start of the text.
    EXPECT_EQ(0, wordBreaker.current());

    // First break: end of the man/heart/man sequence.
    EXPECT_EQ(7, wordBreaker.next());
    EXPECT_EQ(0, wordBreaker.wordStart());
    EXPECT_EQ(7, wordBreaker.wordEnd());

    // Second break: end of the woman/heart/kiss mark/woman sequence.
    EXPECT_EQ(17, wordBreaker.next());
    EXPECT_EQ(7, wordBreaker.wordStart());
    EXPECT_EQ(17, wordBreaker.wordEnd());

    // Final break: end of the text, after the eye/left speech bubble sequence.
    EXPECT_EQ((ssize_t)NELEM(text), wordBreaker.next());
    EXPECT_EQ(17, wordBreaker.wordStart());
    EXPECT_EQ(22, wordBreaker.wordEnd());
}
TEST_F(WordBreakerTest, punct) {
uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd',
'!', '!'};