Suppress line breaks in emoji + modifier

An emoji base followed by an emoji modifier renders as a single glyph, and thus there should be no line break between them. The current (Unicode 8) logic does indicate a line break there, so we override the results of the ICU line break iterator. The code references a proposal to improve Unicode behavior; when that is adopted and we upgrade ICU accordingly, the special-case code should be deleted, but the tests can remain. Bug: 27343378 Change-Id: I5de9c53e9a34c503816f9131e3d894e6f7a57d13
2016-02-25 13:50:33 -08:00
parent b7d66e3db0
commit 7f9de429d4
2 changed files with 47 additions and 12 deletions
--- a/engine/src/flutter/libs/minikin/WordBreaker.cpp
+++ b/engine/src/flutter/libs/minikin/WordBreaker.cpp
@@ -17,7 +17,8 @@
 #define LOG_TAG "Minikin"
 #include <cutils/log.h>

-#include "minikin/WordBreaker.h"
+#include <minikin/WordBreaker.h>
+#include "MinikinInternal.h"

 #include <unicode/uchar.h>
 #include <unicode/utf16.h>
@@ -25,7 +26,7 @@
 namespace android {

 const uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
-const uint16_t CHAR_ZWJ = 0x200D;
+const uint32_t CHAR_ZWJ = 0x200D;

 void WordBreaker::setLocale(const icu::Locale& locale) {
    UErrorCode status = U_ZERO_ERROR;
@@ -68,14 +69,18 @@ enum ScanState {
 * represents customization beyond the ICU behavior, because plain ICU provides some
 * line break opportunities that we don't want.
 **/
-static bool isBreakValid(uint16_t codeUnit, const uint16_t* buf, size_t bufEnd, size_t i) {
-    if (codeUnit == CHAR_SOFT_HYPHEN) {
+static bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) {
+    uint32_t codePoint;
+    size_t prev_offset = i;
+    U16_PREV(buf, 0, prev_offset, codePoint);
+    if (codePoint == CHAR_SOFT_HYPHEN) {
        return false;
    }
-    if (codeUnit == CHAR_ZWJ) {
+    uint32_t next_codepoint;
+    size_t next_offset = i;
+    U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
+    if (codePoint == CHAR_ZWJ) {
        // Possible emoji ZWJ sequence
-        uint32_t next_codepoint;
-        U16_NEXT(buf, i, bufEnd, next_codepoint);
        if (next_codepoint == 0x2764 ||       // HEAVY BLACK HEART
                next_codepoint == 0x1F466 ||  // BOY
                next_codepoint == 0x1F467 ||  // GIRL
@@ -86,6 +91,17 @@ static bool isBreakValid(uint16_t codeUnit, const uint16_t* buf, size_t bufEnd,
            return false;
        }
    }
+    // Proposed Rule LB30b from http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
+    // EB x EM
+    if (isEmojiModifier(next_codepoint)) {
+        if (codePoint == 0xFE0F && prev_offset > 0) {
+            // skip over emoji variation selector
+            U16_PREV(buf, 0, prev_offset, codePoint);
+        }
+        if (isEmojiBase(codePoint)) {
+            return false;
+        }
+    }
    return true;
 }

@@ -176,7 +192,7 @@ ssize_t WordBreaker::next() {
            result = mBreakIterator->next();
        }
    } while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize
-            && !isBreakValid(mText[result - 1], mText, mTextSize, result));
+            && !isBreakValid(mText, mTextSize, result));
    mCurrent = (ssize_t)result;
    return mCurrent;
 }
--- a/engine/src/flutter/tests/WordBreakerTests.cpp
+++ b/engine/src/flutter/tests/WordBreakerTests.cpp
@@ -29,6 +29,8 @@
 #define NELEM(x) ((sizeof(x) / sizeof((x)[0])))
 #endif

+#define UTF16(codepoint) U16_LEAD(codepoint), U16_TRAIL(codepoint)
+
 using namespace android;

 typedef ICUTestBase WordBreakerTest;
@@ -70,11 +72,11 @@ TEST_F(WordBreakerTest, softHyphen) {
 TEST_F(WordBreakerTest, zwjEmojiSequences) {
    uint16_t buf[] = {
        // man + zwj + heart + zwj + man
-        0xD83D, 0xDC68, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC68,
-        // woman + zwj + heart + zwj + woman
-        0xD83D, 0xDC69, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC8B, 0x200D, 0xD83D, 0xDC69,
+        UTF16(0x1F468), 0x200D, 0x2764, 0x200D, UTF16(0x1F468),
+        // woman + zwj + heart + zwj + kiss mark + zwj + woman
+        UTF16(0x1F469), 0x200D, 0x2764, 0x200D, UTF16(0x1F48B), 0x200D, UTF16(0x1F469),
        // eye + zwj + left speech bubble
-        0xD83D, 0xDC41, 0x200D, 0xD83D, 0xDDE8,
+        UTF16(0x1F441), 0x200D, UTF16(0x1F5E8),
    };
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getEnglish());
@@ -91,6 +93,23 @@ TEST_F(WordBreakerTest, zwjEmojiSequences) {
    EXPECT_EQ(22, breaker.wordEnd());
 }

+TEST_F(WordBreakerTest, emojiWithModifier) {
+    uint16_t buf[] = {
+        UTF16(0x1F466), UTF16(0x1F3FB),  // boy + type 1-2 fitzpatrick modifier
+        0x270C, 0xFE0F, UTF16(0x1F3FF)  // victory hand + emoji style + type 6 fitzpatrick modifier
+    };
+    WordBreaker breaker;
+    breaker.setLocale(icu::Locale::getEnglish());
+    breaker.setText(buf, NELEM(buf));
+    EXPECT_EQ(0, breaker.current());
+    EXPECT_EQ(4, breaker.next());  // after boy + type 1-2 fitzpatrick modifier
+    EXPECT_EQ(0, breaker.wordStart());
+    EXPECT_EQ(4, breaker.wordEnd());
+    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
+    EXPECT_EQ(4, breaker.wordStart());
+    EXPECT_EQ(8, breaker.wordEnd());
+}
+
 TEST_F(WordBreakerTest, punct) {
    uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd',
        '!', '!'};