Suppress line breaks in emoji + modifier
An emoji base followed by an emoji modifier renders as a single glyph, so no line break should be allowed between the two. The current (Unicode 8) rules do indicate a line break opportunity there, so we override the results of the ICU line break iterator. The code references a proposal to improve the Unicode behavior; when that proposal is adopted and we upgrade ICU accordingly, the special-case code should be deleted, but the tests can remain. Bug: 27343378 Change-Id: I5de9c53e9a34c503816f9131e3d894e6f7a57d13
This commit is contained in:
@@ -17,7 +17,8 @@
|
||||
#define LOG_TAG "Minikin"
|
||||
#include <cutils/log.h>
|
||||
|
||||
#include "minikin/WordBreaker.h"
|
||||
#include <minikin/WordBreaker.h>
|
||||
#include "MinikinInternal.h"
|
||||
|
||||
#include <unicode/uchar.h>
|
||||
#include <unicode/utf16.h>
|
||||
@@ -25,7 +26,7 @@
|
||||
namespace android {
|
||||
|
||||
const uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
|
||||
const uint16_t CHAR_ZWJ = 0x200D;
|
||||
const uint32_t CHAR_ZWJ = 0x200D;
|
||||
|
||||
void WordBreaker::setLocale(const icu::Locale& locale) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
@@ -68,14 +69,18 @@ enum ScanState {
|
||||
* represents customization beyond the ICU behavior, because plain ICU provides some
|
||||
* line break opportunities that we don't want.
|
||||
**/
|
||||
static bool isBreakValid(uint16_t codeUnit, const uint16_t* buf, size_t bufEnd, size_t i) {
|
||||
if (codeUnit == CHAR_SOFT_HYPHEN) {
|
||||
static bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) {
|
||||
uint32_t codePoint;
|
||||
size_t prev_offset = i;
|
||||
U16_PREV(buf, 0, prev_offset, codePoint);
|
||||
if (codePoint == CHAR_SOFT_HYPHEN) {
|
||||
return false;
|
||||
}
|
||||
if (codeUnit == CHAR_ZWJ) {
|
||||
uint32_t next_codepoint;
|
||||
size_t next_offset = i;
|
||||
U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
|
||||
if (codePoint == CHAR_ZWJ) {
|
||||
// Possible emoji ZWJ sequence
|
||||
uint32_t next_codepoint;
|
||||
U16_NEXT(buf, i, bufEnd, next_codepoint);
|
||||
if (next_codepoint == 0x2764 || // HEAVY BLACK HEART
|
||||
next_codepoint == 0x1F466 || // BOY
|
||||
next_codepoint == 0x1F467 || // GIRL
|
||||
@@ -86,6 +91,17 @@ static bool isBreakValid(uint16_t codeUnit, const uint16_t* buf, size_t bufEnd,
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// Proposed Rule LB30b from http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
|
||||
// EB x EM
|
||||
if (isEmojiModifier(next_codepoint)) {
|
||||
if (codePoint == 0xFE0F && prev_offset > 0) {
|
||||
// skip over emoji variation selector
|
||||
U16_PREV(buf, 0, prev_offset, codePoint);
|
||||
}
|
||||
if (isEmojiBase(codePoint)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -176,7 +192,7 @@ ssize_t WordBreaker::next() {
|
||||
result = mBreakIterator->next();
|
||||
}
|
||||
} while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize
|
||||
&& !isBreakValid(mText[result - 1], mText, mTextSize, result));
|
||||
&& !isBreakValid(mText, mTextSize, result));
|
||||
mCurrent = (ssize_t)result;
|
||||
return mCurrent;
|
||||
}
|
||||
|
||||
@@ -29,6 +29,8 @@
|
||||
#define NELEM(x) ((sizeof(x) / sizeof((x)[0])))
|
||||
#endif
|
||||
|
||||
#define UTF16(codepoint) U16_LEAD(codepoint), U16_TRAIL(codepoint)
|
||||
|
||||
using namespace android;
|
||||
|
||||
typedef ICUTestBase WordBreakerTest;
|
||||
@@ -70,11 +72,11 @@ TEST_F(WordBreakerTest, softHyphen) {
|
||||
TEST_F(WordBreakerTest, zwjEmojiSequences) {
|
||||
uint16_t buf[] = {
|
||||
// man + zwj + heart + zwj + man
|
||||
0xD83D, 0xDC68, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC68,
|
||||
// woman + zwj + heart + zwj + woman
|
||||
0xD83D, 0xDC69, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC8B, 0x200D, 0xD83D, 0xDC69,
|
||||
UTF16(0x1F468), 0x200D, 0x2764, 0x200D, UTF16(0x1F468),
|
||||
// woman + zwj + heart + zwj + kiss mark + zwj + woman
|
||||
UTF16(0x1F469), 0x200D, 0x2764, 0x200D, UTF16(0x1F48B), 0x200D, UTF16(0x1F469),
|
||||
// eye + zwj + left speech bubble
|
||||
0xD83D, 0xDC41, 0x200D, 0xD83D, 0xDDE8,
|
||||
UTF16(0x1F441), 0x200D, UTF16(0x1F5E8),
|
||||
};
|
||||
WordBreaker breaker;
|
||||
breaker.setLocale(icu::Locale::getEnglish());
|
||||
@@ -91,6 +93,23 @@ TEST_F(WordBreakerTest, zwjEmojiSequences) {
|
||||
EXPECT_EQ(22, breaker.wordEnd());
|
||||
}
|
||||
|
||||
// Checks that the breaker reports no break between an emoji base and a following
// emoji modifier, both when the modifier directly follows the base and when an
// emoji variation selector (U+FE0F) sits between them.
TEST_F(WordBreakerTest, emojiWithModifier) {
|
||||
uint16_t buf[] = {
|
||||
UTF16(0x1F466), UTF16(0x1F3FB), // boy + type 1-2 fitzpatrick modifier
|
||||
0x270C, 0xFE0F, UTF16(0x1F3FF) // victory hand + emoji style + type 6 fitzpatrick modifier
|
||||
};
|
||||
WordBreaker breaker;
|
||||
breaker.setLocale(icu::Locale::getEnglish());
|
||||
breaker.setText(buf, NELEM(buf));
|
||||
EXPECT_EQ(0, breaker.current());
|
||||
EXPECT_EQ(4, breaker.next()); // after boy + type 1-2 fitzpatrick modifier (two surrogate pairs)
|
||||
EXPECT_EQ(0, breaker.wordStart());
|
||||
EXPECT_EQ(4, breaker.wordEnd());
|
||||
EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
|
||||
EXPECT_EQ(4, breaker.wordStart());
|
||||
EXPECT_EQ(8, breaker.wordEnd());
|
||||
}
|
||||
|
||||
TEST_F(WordBreakerTest, punct) {
|
||||
uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd',
|
||||
'!', '!'};
|
||||
|
||||
Reference in New Issue
Block a user