Merge "Suppress linebreaks in emoji ZWJ sequences" into nyc-dev
am: e87aac42d0
* commit 'e87aac42d0f7c80a0836d4cde29ed36e4e848003':
Suppress linebreaks in emoji ZWJ sequences
This commit is contained in:
@@ -25,6 +25,7 @@
|
|||||||
namespace android {
|
namespace android {
|
||||||
|
|
||||||
const uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
|
const uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
|
||||||
|
const uint16_t CHAR_ZWJ = 0x200D;
|
||||||
|
|
||||||
void WordBreaker::setLocale(const icu::Locale& locale) {
|
void WordBreaker::setLocale(const icu::Locale& locale) {
|
||||||
UErrorCode status = U_ZERO_ERROR;
|
UErrorCode status = U_ZERO_ERROR;
|
||||||
@@ -62,6 +63,32 @@ enum ScanState {
|
|||||||
SAW_COLON_SLASH_SLASH,
|
SAW_COLON_SLASH_SLASH,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determine whether a line break at position i within the buffer buf is valid. This
|
||||||
|
* represents customization beyond the ICU behavior, because plain ICU provides some
|
||||||
|
* line break opportunities that we don't want.
|
||||||
|
**/
|
||||||
|
static bool isBreakValid(uint16_t codeUnit, const uint16_t* buf, size_t bufEnd, size_t i) {
|
||||||
|
if (codeUnit == CHAR_SOFT_HYPHEN) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (codeUnit == CHAR_ZWJ) {
|
||||||
|
// Possible emoji ZWJ sequence
|
||||||
|
uint32_t next_codepoint;
|
||||||
|
U16_NEXT(buf, i, bufEnd, next_codepoint);
|
||||||
|
if (next_codepoint == 0x2764 || // HEAVY BLACK HEART
|
||||||
|
next_codepoint == 0x1F466 || // BOY
|
||||||
|
next_codepoint == 0x1F467 || // GIRL
|
||||||
|
next_codepoint == 0x1F468 || // MAN
|
||||||
|
next_codepoint == 0x1F469 || // WOMAN
|
||||||
|
next_codepoint == 0x1F48B || // KISS MARK
|
||||||
|
next_codepoint == 0x1F5E8) { // LEFT SPEECH BUBBLE
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
// Chicago Manual of Style recommends breaking after these characters in URLs and email addresses
|
// Chicago Manual of Style recommends breaking after these characters in URLs and email addresses
|
||||||
static bool breakAfter(uint16_t c) {
|
static bool breakAfter(uint16_t c) {
|
||||||
return c == ':' || c == '=' || c == '&';
|
return c == ':' || c == '=' || c == '&';
|
||||||
@@ -149,7 +176,7 @@ ssize_t WordBreaker::next() {
|
|||||||
result = mBreakIterator->next();
|
result = mBreakIterator->next();
|
||||||
}
|
}
|
||||||
} while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize
|
} while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize
|
||||||
&& mText[result - 1] == CHAR_SOFT_HYPHEN);
|
&& !isBreakValid(mText[result - 1], mText, mTextSize, result));
|
||||||
mCurrent = (ssize_t)result;
|
mCurrent = (ssize_t)result;
|
||||||
return mCurrent;
|
return mCurrent;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -67,6 +67,30 @@ TEST_F(WordBreakerTest, softHyphen) {
|
|||||||
EXPECT_EQ(0, breaker.breakBadness());
|
EXPECT_EQ(0, breaker.breakBadness());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Feeds three back-to-back emoji ZWJ sequences to the breaker and expects next() to stop
// only at the boundaries between sequences (offsets 7, 17 and the end of the buffer).
TEST_F(WordBreakerTest, zwjEmojiSequences) {
|
||||||
|
uint16_t buf[] = {
|
||||||
|
// man + zwj + heart + zwj + man
|
||||||
|
0xD83D, 0xDC68, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC68,
|
||||||
|
// woman + zwj + heart + zwj + kiss mark + zwj + woman
|
||||||
|
0xD83D, 0xDC69, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC8B, 0x200D, 0xD83D, 0xDC69,
|
||||||
|
// eye + zwj + left speech bubble
|
||||||
|
0xD83D, 0xDC41, 0x200D, 0xD83D, 0xDDE8,
|
||||||
|
};
|
||||||
|
WordBreaker breaker;
|
||||||
|
breaker.setLocale(icu::Locale::getEnglish());
|
||||||
|
breaker.setText(buf, NELEM(buf));
|
||||||
|
EXPECT_EQ(0, breaker.current());
|
||||||
|
EXPECT_EQ(7, breaker.next()); // after man + zwj + heart + zwj + man
|
||||||
|
EXPECT_EQ(0, breaker.wordStart());
|
||||||
|
EXPECT_EQ(7, breaker.wordEnd());
|
||||||
|
EXPECT_EQ(17, breaker.next()); // after woman + zwj + heart + zwj + kiss mark + zwj + woman
|
||||||
|
EXPECT_EQ(7, breaker.wordStart());
|
||||||
|
EXPECT_EQ(17, breaker.wordEnd());
|
||||||
|
EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
|
||||||
|
EXPECT_EQ(17, breaker.wordStart());
|
||||||
|
EXPECT_EQ(22, breaker.wordEnd());
|
||||||
|
}
|
||||||
|
|
||||||
TEST_F(WordBreakerTest, punct) {
|
TEST_F(WordBreakerTest, punct) {
|
||||||
uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd',
|
uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd',
|
||||||
'!', '!'};
|
'!', '!'};
|
||||||
|
|||||||
Reference in New Issue
Block a user