Merge "Suppress linebreaks in emoji ZWJ sequences" into nyc-dev
am: e87aac42d0
* commit 'e87aac42d0f7c80a0836d4cde29ed36e4e848003':
Suppress linebreaks in emoji ZWJ sequences
This commit is contained in:
@@ -25,6 +25,7 @@
|
||||
namespace android {
|
||||
|
||||
const uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
|
||||
const uint16_t CHAR_ZWJ = 0x200D;
|
||||
|
||||
void WordBreaker::setLocale(const icu::Locale& locale) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
@@ -62,6 +63,32 @@ enum ScanState {
|
||||
SAW_COLON_SLASH_SLASH,
|
||||
};
|
||||
|
||||
/**
|
||||
* Determine whether a line break at position i within the buffer buf is valid. This
|
||||
* represents customization beyond the ICU behavior, because plain ICU provides some
|
||||
* line break opportunities that we don't want.
|
||||
**/
|
||||
static bool isBreakValid(uint16_t codeUnit, const uint16_t* buf, size_t bufEnd, size_t i) {
|
||||
if (codeUnit == CHAR_SOFT_HYPHEN) {
|
||||
return false;
|
||||
}
|
||||
if (codeUnit == CHAR_ZWJ) {
|
||||
// Possible emoji ZWJ sequence
|
||||
uint32_t next_codepoint;
|
||||
U16_NEXT(buf, i, bufEnd, next_codepoint);
|
||||
if (next_codepoint == 0x2764 || // HEAVY BLACK HEART
|
||||
next_codepoint == 0x1F466 || // BOY
|
||||
next_codepoint == 0x1F467 || // GIRL
|
||||
next_codepoint == 0x1F468 || // MAN
|
||||
next_codepoint == 0x1F469 || // WOMAN
|
||||
next_codepoint == 0x1F48B || // KISS MARK
|
||||
next_codepoint == 0x1F5E8) { // LEFT SPEECH BUBBLE
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Chicago Manual of Style recommends breaking after these characters in URLs and email addresses
|
||||
static bool breakAfter(uint16_t c) {
|
||||
return c == ':' || c == '=' || c == '&';
|
||||
@@ -149,7 +176,7 @@ ssize_t WordBreaker::next() {
|
||||
result = mBreakIterator->next();
|
||||
}
|
||||
} while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize
|
||||
&& mText[result - 1] == CHAR_SOFT_HYPHEN);
|
||||
&& !isBreakValid(mText[result - 1], mText, mTextSize, result));
|
||||
mCurrent = (ssize_t)result;
|
||||
return mCurrent;
|
||||
}
|
||||
|
||||
@@ -67,6 +67,30 @@ TEST_F(WordBreakerTest, softHyphen) {
|
||||
EXPECT_EQ(0, breaker.breakBadness());
|
||||
}
|
||||
|
||||
// Checks that no break is reported inside an emoji ZWJ sequence: the only breaks expected are
// the boundaries between the three sequences in the buffer and the end of the text.
TEST_F(WordBreakerTest, zwjEmojiSequences) {
    uint16_t buf[] = {
        // man + zwj + heart + zwj + man (7 code units)
        0xD83D, 0xDC68, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC68,
        // woman + zwj + heart + zwj + kiss mark + zwj + woman (10 code units)
        0xD83D, 0xDC69, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC8B, 0x200D, 0xD83D, 0xDC69,
        // eye + zwj + left speech bubble (5 code units)
        0xD83D, 0xDC41, 0x200D, 0xD83D, 0xDDE8,
    };
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getEnglish());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(7, breaker.next());  // after man + zwj + heart + zwj + man
    EXPECT_EQ(0, breaker.wordStart());
    EXPECT_EQ(7, breaker.wordEnd());
    EXPECT_EQ(17, breaker.next());  // after woman + zwj + heart + zwj + kiss mark + zwj + woman
    EXPECT_EQ(7, breaker.wordStart());
    EXPECT_EQ(17, breaker.wordEnd());
    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end, after eye + zwj + left speech bubble
    EXPECT_EQ(17, breaker.wordStart());
    EXPECT_EQ(22, breaker.wordEnd());
}
|
||||
|
||||
TEST_F(WordBreakerTest, punct) {
|
||||
uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd',
|
||||
'!', '!'};
|
||||
|
||||
Reference in New Issue
Block a user