Suppress linebreaks in emoji ZWJ sequences
Due to the way emoji ZWJ sequences are defined, the ICU line breaking algorithm determines that there are valid line breaks inside the sequence. This patch suppresses these line breaks. This is an adaptation of I225ebebc0f4186e4b8f48fee399c4a62b3f0218a into the nyc-dev branch. Bug: 25433289 Change-Id: I84b50b1e6ef13d436965eab389659d02a30d100f
This commit is contained in:
@@ -25,6 +25,7 @@
|
||||
namespace android {
|
||||
|
||||
const uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
|
||||
const uint16_t CHAR_ZWJ = 0x200D;
|
||||
|
||||
void WordBreaker::setLocale(const icu::Locale& locale) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
@@ -62,6 +63,32 @@ enum ScanState {
|
||||
SAW_COLON_SLASH_SLASH,
|
||||
};
|
||||
|
||||
/**
|
||||
* Determine whether a line break at position i within the buffer buf is valid. This
|
||||
* represents customization beyond the ICU behavior, because plain ICU provides some
|
||||
* line break opportunities that we don't want.
|
||||
**/
|
||||
static bool isBreakValid(uint16_t codeUnit, const uint16_t* buf, size_t bufEnd, size_t i) {
|
||||
if (codeUnit == CHAR_SOFT_HYPHEN) {
|
||||
return false;
|
||||
}
|
||||
if (codeUnit == CHAR_ZWJ) {
|
||||
// Possible emoji ZWJ sequence
|
||||
uint32_t next_codepoint;
|
||||
U16_NEXT(buf, i, bufEnd, next_codepoint);
|
||||
if (next_codepoint == 0x2764 || // HEAVY BLACK HEART
|
||||
next_codepoint == 0x1F466 || // BOY
|
||||
next_codepoint == 0x1F467 || // GIRL
|
||||
next_codepoint == 0x1F468 || // MAN
|
||||
next_codepoint == 0x1F469 || // WOMAN
|
||||
next_codepoint == 0x1F48B || // KISS MARK
|
||||
next_codepoint == 0x1F5E8) { // LEFT SPEECH BUBBLE
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Chicago Manual of Style recommends breaking after these characters in URLs and email addresses
|
||||
static bool breakAfter(uint16_t c) {
|
||||
return c == ':' || c == '=' || c == '&';
|
||||
@@ -149,7 +176,7 @@ ssize_t WordBreaker::next() {
|
||||
result = mBreakIterator->next();
|
||||
}
|
||||
} while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize
|
||||
&& mText[result - 1] == CHAR_SOFT_HYPHEN);
|
||||
&& !isBreakValid(mText[result - 1], mText, mTextSize, result));
|
||||
mCurrent = (ssize_t)result;
|
||||
return mCurrent;
|
||||
}
|
||||
|
||||
@@ -67,6 +67,30 @@ TEST_F(WordBreakerTest, softHyphen) {
|
||||
EXPECT_EQ(0, breaker.breakBadness());
|
||||
}
|
||||
|
||||
// Three emoji ZWJ sequences are laid out back to back. The breaker must report a break only
// at the boundary between sequences, never after a joiner inside one.
TEST_F(WordBreakerTest, zwjEmojiSequences) {
    uint16_t text[] = {
        // offsets [0, 7): MAN, ZWJ, HEAVY BLACK HEART, ZWJ, MAN
        0xD83D, 0xDC68, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC68,
        // offsets [7, 17): WOMAN, ZWJ, HEAVY BLACK HEART, ZWJ, KISS MARK, ZWJ, WOMAN
        0xD83D, 0xDC69, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC8B, 0x200D, 0xD83D, 0xDC69,
        // offsets [17, 22): EYE, ZWJ, LEFT SPEECH BUBBLE
        0xD83D, 0xDC41, 0x200D, 0xD83D, 0xDDE8,
    };
    const ssize_t textLength = static_cast<ssize_t>(NELEM(text));

    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getEnglish());
    breaker.setText(text, NELEM(text));

    // Nothing has been consumed yet.
    EXPECT_EQ(0, breaker.current());

    // First sequence ends at offset 7.
    EXPECT_EQ(7, breaker.next());
    EXPECT_EQ(0, breaker.wordStart());
    EXPECT_EQ(7, breaker.wordEnd());

    // Second sequence ends at offset 17.
    EXPECT_EQ(17, breaker.next());
    EXPECT_EQ(7, breaker.wordStart());
    EXPECT_EQ(17, breaker.wordEnd());

    // Third sequence runs to the end of the text.
    EXPECT_EQ(textLength, breaker.next());
    EXPECT_EQ(17, breaker.wordStart());
    EXPECT_EQ(22, breaker.wordEnd());
}
|
||||
|
||||
TEST_F(WordBreakerTest, punct) {
|
||||
uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd',
|
||||
'!', '!'};
|
||||
|
||||
Reference in New Issue
Block a user