diff --git a/engine/src/flutter/include/minikin/WordBreaker.h b/engine/src/flutter/include/minikin/WordBreaker.h index 8c0050236e..c4aa1514a0 100644 --- a/engine/src/flutter/include/minikin/WordBreaker.h +++ b/engine/src/flutter/include/minikin/WordBreaker.h @@ -63,7 +63,7 @@ private: // state for the email address / url detector ssize_t mScanOffset; - bool mSuppressHyphen; + bool mInEmailOrUrl; }; } // namespace diff --git a/engine/src/flutter/libs/minikin/WordBreaker.cpp b/engine/src/flutter/libs/minikin/WordBreaker.cpp index f438cd5546..edac993d44 100644 --- a/engine/src/flutter/libs/minikin/WordBreaker.cpp +++ b/engine/src/flutter/libs/minikin/WordBreaker.cpp @@ -43,7 +43,7 @@ void WordBreaker::setText(const uint16_t* data, size_t size) { mLast = 0; mCurrent = 0; mScanOffset = 0; - mSuppressHyphen = false; + mInEmailOrUrl = false; UErrorCode status = U_ZERO_ERROR; utext_openUChars(&mUText, data, size, &status); mBreakIterator->setText(&mUText, status); @@ -62,6 +62,17 @@ enum ScanState { SAW_COLON_SLASH_SLASH, }; +// Chicago Manual of Style recommends breaking after these characters in URLs and email addresses +static bool breakAfter(uint16_t c) { + return c == ':' || c == '=' || c == '&'; +} + +// Chicago Manual of Style recommends breaking before these characters in URLs and email addresses +static bool breakBefore(uint16_t c) { + return c == '~' || c == '.' || c == ',' || c == '-' || c == '_' || c == '?' 
|| c == '#' + || c == '%' || c == '=' || c == '&'; +} + ssize_t WordBreaker::next() { mLast = mCurrent; @@ -88,23 +99,45 @@ ssize_t WordBreaker::next() { } } if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) { - // no line breaks in entire email address or url - // TODO: refine this according to Chicago Manual of Style rules - while (i < mTextSize && mText[i] == ' ') { - i++; + if (!mBreakIterator->isBoundary(i)) { + i = mBreakIterator->following(i); } - mCurrent = i; - mSuppressHyphen = true; - // Setting mIteratorWasReset will cause next break to be computed following - // mCurrent, rather than following the current ICU iterator location. + mInEmailOrUrl = true; mIteratorWasReset = true; - if (mBreakIterator->isBoundary(mCurrent)) { - return mCurrent; - } } else { - mScanOffset = i; - mSuppressHyphen = false; + mInEmailOrUrl = false; } + mScanOffset = i; + } + + if (mInEmailOrUrl) { + // special rules for email addresses and URLs as per Chicago Manual of Style (16th ed.) + uint16_t lastChar = mText[mLast]; + ssize_t i; + for (i = mLast + 1; i < mScanOffset; i++) { + if (breakAfter(lastChar)) { + break; + } + // break after double slash + if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') { + break; + } + uint16_t thisChar = mText[i]; + // never break after hyphen + if (lastChar != '-') { + if (breakBefore(thisChar)) { + break; + } + // break before single slash + if (thisChar == '/' && lastChar != '/' && + !(i + 1 < mScanOffset && mText[i + 1] == '/')) { + break; + } + } + lastChar = thisChar; + } + mCurrent = i; + return mCurrent; } int32_t result; @@ -122,7 +155,7 @@ ssize_t WordBreaker::next() { } ssize_t WordBreaker::wordStart() const { - if (mSuppressHyphen) { + if (mInEmailOrUrl) { return mLast; } ssize_t result = mLast; @@ -142,7 +175,7 @@ ssize_t WordBreaker::wordStart() const { } ssize_t WordBreaker::wordEnd() const { - if (mSuppressHyphen) { + if (mInEmailOrUrl) { return mLast; } ssize_t result = mCurrent; diff --git 
a/engine/src/flutter/tests/WordBreakerTests.cpp b/engine/src/flutter/tests/WordBreakerTests.cpp index 4111a1b1f4..284b02cb28 100644 --- a/engine/src/flutter/tests/WordBreakerTests.cpp +++ b/engine/src/flutter/tests/WordBreakerTests.cpp @@ -85,7 +85,9 @@ TEST_F(WordBreakerTest, email) { breaker.setLocale(icu::Locale::getEnglish()); breaker.setText(buf, NELEM(buf)); EXPECT_EQ(0, breaker.current()); - EXPECT_EQ(16, breaker.next()); // after "foo@example.com " + EXPECT_EQ(11, breaker.next()); // after "foo@example" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ(16, breaker.next()); // after ".com " EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end EXPECT_EQ(16, breaker.wordStart()); // "x" @@ -99,13 +101,19 @@ TEST_F(WordBreakerTest, mailto) { breaker.setLocale(icu::Locale::getEnglish()); breaker.setText(buf, NELEM(buf)); EXPECT_EQ(0, breaker.current()); - EXPECT_EQ(23, breaker.next()); // after "mailto:foo@example.com " + EXPECT_EQ(7, breaker.next()); // after "mailto:" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ(18, breaker.next()); // after "foo@example" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ(23, breaker.next()); // after ".com " EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end EXPECT_EQ(23, breaker.wordStart()); // "x" EXPECT_EQ(24, breaker.wordEnd()); } +// The current logic always places a line break between a detected email address or URL +// and an immediately following non-ASCII character. 
TEST_F(WordBreakerTest, emailNonAscii) { uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', 0x4E00}; @@ -113,7 +121,9 @@ TEST_F(WordBreakerTest, emailNonAscii) { breaker.setLocale(icu::Locale::getEnglish()); breaker.setText(buf, NELEM(buf)); EXPECT_EQ(0, breaker.current()); - EXPECT_EQ(15, breaker.next()); // after "foo@example.com" + EXPECT_EQ(11, breaker.next()); // after "foo@example" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ(15, breaker.next()); // after ".com" EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end EXPECT_EQ(15, breaker.wordStart()); // "一" @@ -127,13 +137,31 @@ TEST_F(WordBreakerTest, emailCombining) { breaker.setLocale(icu::Locale::getEnglish()); breaker.setText(buf, NELEM(buf)); EXPECT_EQ(0, breaker.current()); - EXPECT_EQ(17, breaker.next()); // after "foo@example.com̃" + EXPECT_EQ(11, breaker.next()); // after "foo@example" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ(17, breaker.next()); // after ".com̃ " EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end EXPECT_EQ(17, breaker.wordStart()); // "x" EXPECT_EQ(18, breaker.wordEnd()); } +TEST_F(WordBreakerTest, lonelyAt) { + uint16_t buf[] = {'a', ' ', '@', ' ', 'b'}; + WordBreaker breaker; + breaker.setLocale(icu::Locale::getEnglish()); + breaker.setText(buf, NELEM(buf)); + EXPECT_EQ(0, breaker.current()); + EXPECT_EQ(2, breaker.next()); // after "a " + EXPECT_EQ(0, breaker.wordStart()); // "a" + EXPECT_EQ(1, breaker.wordEnd()); + EXPECT_EQ(4, breaker.next()); // after "@ " + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end + EXPECT_EQ(4, breaker.wordStart()); // "b" + EXPECT_EQ(5, breaker.wordEnd()); +} + TEST_F(WordBreakerTest, url) { uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a', 'm', 'p', 'l', 'e', 
'.', 'c', 'o', 'm', ' ', 'x'}; @@ -141,9 +169,101 @@ TEST_F(WordBreakerTest, url) { breaker.setLocale(icu::Locale::getEnglish()); breaker.setText(buf, NELEM(buf)); EXPECT_EQ(0, breaker.current()); - EXPECT_EQ(19, breaker.next()); // after "http://example.com " + EXPECT_EQ(5, breaker.next()); // after "http:" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ(7, breaker.next()); // after "//" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ(14, breaker.next()); // after "example" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ(19, breaker.next()); // after ".com " EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end EXPECT_EQ(19, breaker.wordStart()); // "x" EXPECT_EQ(20, breaker.wordEnd()); } + +// Breaks according to section 14.12 of Chicago Manual of Style, *URLs or DOIs and line breaks* +TEST_F(WordBreakerTest, urlBreakChars) { + uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '.', 'b', '/', '~', 'c', ',', 'd', + '-', 'e', '?', 'f', '=', 'g', '&', 'h', '#', 'i', '%', 'j', '_', 'k', '/', 'l'}; + WordBreaker breaker; + breaker.setLocale(icu::Locale::getEnglish()); + breaker.setText(buf, NELEM(buf)); + EXPECT_EQ(0, breaker.current()); + EXPECT_EQ(5, breaker.next()); // after "http:" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ(7, breaker.next()); // after "//" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ(8, breaker.next()); // after "a" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ(10, breaker.next()); // after ".b" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ(11, breaker.next()); // after "/" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ(13, breaker.next()); // after "~c" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ(15, breaker.next()); // after ",d" + EXPECT_TRUE(breaker.wordStart() >= 
breaker.wordEnd()); + EXPECT_EQ(17, breaker.next()); // after "-e" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ(19, breaker.next()); // after "?f" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ(20, breaker.next()); // after "=" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ(21, breaker.next()); // after "g" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ(22, breaker.next()); // after "&" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ(23, breaker.next()); // after "h" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ(25, breaker.next()); // after "#i" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ(27, breaker.next()); // after "%j" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ(29, breaker.next()); // after "_k" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); +} + +TEST_F(WordBreakerTest, urlNoHyphenBreak) { + uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '-', '/', 'b'}; + WordBreaker breaker; + breaker.setLocale(icu::Locale::getEnglish()); + breaker.setText(buf, NELEM(buf)); + EXPECT_EQ(0, breaker.current()); + EXPECT_EQ(5, breaker.next()); // after "http:" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ(7, breaker.next()); // after "//" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ(8, breaker.next()); // after "a" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); +} + +TEST_F(WordBreakerTest, urlEndsWithSlash) { + uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '/'}; + WordBreaker breaker; + breaker.setLocale(icu::Locale::getEnglish()); + breaker.setText(buf, NELEM(buf)); + EXPECT_EQ(0, 
breaker.current()); + EXPECT_EQ(5, breaker.next()); // after "http:" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ(7, breaker.next()); // after "//" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ(8, breaker.next()); // after "a" + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); + EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); +} + +TEST_F(WordBreakerTest, emailStartsWithSlash) { + uint16_t buf[] = {'/', 'a', '@', 'b'}; + WordBreaker breaker; + breaker.setLocale(icu::Locale::getEnglish()); + breaker.setText(buf, NELEM(buf)); + EXPECT_EQ(0, breaker.current()); + EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end + EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); +}