Suppress line breaks in emoji + modifier

An emoji base followed by an emoji modifier renders as a single glyph, so
no line break should be allowed between them. Current (Unicode 8) logic does
indicate a line break, so we override the results of the ICU line
break iterator. The code references a proposal to improve Unicode
behavior; when that is adopted and we upgrade ICU accordingly, the
special-case code should be deleted, but the tests can remain.

Bug: 27343378
Change-Id: I5de9c53e9a34c503816f9131e3d894e6f7a57d13
This commit is contained in:
Raph Levien
2016-02-25 13:50:33 -08:00
parent b7d66e3db0
commit 7f9de429d4
2 changed files with 47 additions and 12 deletions

View File

@@ -17,7 +17,8 @@
#define LOG_TAG "Minikin"
#include <cutils/log.h>
#include "minikin/WordBreaker.h"
#include <minikin/WordBreaker.h>
#include "MinikinInternal.h"
#include <unicode/uchar.h>
#include <unicode/utf16.h>
@@ -25,7 +26,7 @@
namespace android {
const uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
const uint16_t CHAR_ZWJ = 0x200D;
const uint32_t CHAR_ZWJ = 0x200D;
void WordBreaker::setLocale(const icu::Locale& locale) {
UErrorCode status = U_ZERO_ERROR;
@@ -68,14 +69,18 @@ enum ScanState {
* represents customization beyond the ICU behavior, because plain ICU provides some
* line break opportunities that we don't want.
**/
static bool isBreakValid(uint16_t codeUnit, const uint16_t* buf, size_t bufEnd, size_t i) {
if (codeUnit == CHAR_SOFT_HYPHEN) {
static bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) {
uint32_t codePoint;
size_t prev_offset = i;
U16_PREV(buf, 0, prev_offset, codePoint);
if (codePoint == CHAR_SOFT_HYPHEN) {
return false;
}
if (codeUnit == CHAR_ZWJ) {
uint32_t next_codepoint;
size_t next_offset = i;
U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
if (codePoint == CHAR_ZWJ) {
// Possible emoji ZWJ sequence
uint32_t next_codepoint;
U16_NEXT(buf, i, bufEnd, next_codepoint);
if (next_codepoint == 0x2764 || // HEAVY BLACK HEART
next_codepoint == 0x1F466 || // BOY
next_codepoint == 0x1F467 || // GIRL
@@ -86,6 +91,17 @@ static bool isBreakValid(uint16_t codeUnit, const uint16_t* buf, size_t bufEnd,
return false;
}
}
// Proposed Rule LB30b from http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
// EB x EM
if (isEmojiModifier(next_codepoint)) {
if (codePoint == 0xFE0F && prev_offset > 0) {
// skip over emoji variation selector
U16_PREV(buf, 0, prev_offset, codePoint);
}
if (isEmojiBase(codePoint)) {
return false;
}
}
return true;
}
@@ -176,7 +192,7 @@ ssize_t WordBreaker::next() {
result = mBreakIterator->next();
}
} while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize
&& !isBreakValid(mText[result - 1], mText, mTextSize, result));
&& !isBreakValid(mText, mTextSize, result));
mCurrent = (ssize_t)result;
return mCurrent;
}

View File

@@ -29,6 +29,8 @@
#define NELEM(x) ((sizeof(x) / sizeof((x)[0])))
#endif
#define UTF16(codepoint) U16_LEAD(codepoint), U16_TRAIL(codepoint)
using namespace android;
typedef ICUTestBase WordBreakerTest;
@@ -70,11 +72,11 @@ TEST_F(WordBreakerTest, softHyphen) {
TEST_F(WordBreakerTest, zwjEmojiSequences) {
uint16_t buf[] = {
// man + zwj + heart + zwj + man
0xD83D, 0xDC68, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC68,
// woman + zwj + heart + zwj + woman
0xD83D, 0xDC69, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC8B, 0x200D, 0xD83D, 0xDC69,
UTF16(0x1F468), 0x200D, 0x2764, 0x200D, UTF16(0x1F468),
// woman + zwj + heart + zwj + kiss mark + zwj + woman
UTF16(0x1F469), 0x200D, 0x2764, 0x200D, UTF16(0x1F48B), 0x200D, UTF16(0x1F469),
// eye + zwj + left speech bubble
0xD83D, 0xDC41, 0x200D, 0xD83D, 0xDDE8,
UTF16(0x1F441), 0x200D, UTF16(0x1F5E8),
};
WordBreaker breaker;
breaker.setLocale(icu::Locale::getEnglish());
@@ -91,6 +93,23 @@ TEST_F(WordBreakerTest, zwjEmojiSequences) {
EXPECT_EQ(22, breaker.wordEnd());
}
// Checks that no break is reported between an emoji base and the emoji
// modifier that follows it, both when the modifier is directly adjacent and
// when an emoji variation selector (U+FE0F) sits between the two.
TEST_F(WordBreakerTest, emojiWithModifier) {
// Two sequences of 4 UTF-16 code units each (8 code units in total).
uint16_t buf[] = {
UTF16(0x1F466), UTF16(0x1F3FB), // boy + type 1-2 fitzpatrick modifier
0x270C, 0xFE0F, UTF16(0x1F3FF) // victory hand + emoji style + type 6 fitzpatrick modifier
};
WordBreaker breaker;
breaker.setLocale(icu::Locale::getEnglish());
breaker.setText(buf, NELEM(buf));
EXPECT_EQ(0, breaker.current());
// The first break must fall after the whole first sequence (offset 4), not
// between the base and its modifier (offset 2).
EXPECT_EQ(4, breaker.next()); // after boy + type 1-2 fitzpatrick modifier
EXPECT_EQ(0, breaker.wordStart());
EXPECT_EQ(4, breaker.wordEnd());
// No break is expected inside the second sequence, so the next break is the
// end of the buffer.
EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
EXPECT_EQ(4, breaker.wordStart());
EXPECT_EQ(8, breaker.wordEnd());
}
TEST_F(WordBreakerTest, punct) {
uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd',
'!', '!'};