From 76022a08e3f01db804d97c10277ee2704ef68f45 Mon Sep 17 00:00:00 2001
From: Raph Levien
Date: Tue, 8 Sep 2015 17:12:10 -0700
Subject: [PATCH] Special-case URLs and email addresses for line breaking

Detect URLs and email addresses, and suppress both line breaking and
hyphenation within them.

Bug: 20126487
Bug: 20566159
Change-Id: I43629347a063dcf579e355e5b678d7195f453ad9
---
 .../src/flutter/include/minikin/WordBreaker.h |  4 ++
 .../src/flutter/libs/minikin/WordBreaker.cpp  | 61 +++++++++++++++-
 engine/src/flutter/tests/WordBreakerTests.cpp | 70 +++++++++++++++++++
 3 files changed, 134 insertions(+), 1 deletion(-)

diff --git a/engine/src/flutter/include/minikin/WordBreaker.h b/engine/src/flutter/include/minikin/WordBreaker.h
index 22275bde84..8c0050236e 100644
--- a/engine/src/flutter/include/minikin/WordBreaker.h
+++ b/engine/src/flutter/include/minikin/WordBreaker.h
@@ -60,6 +60,10 @@ private:
     ssize_t mLast;
     ssize_t mCurrent;
     bool mIteratorWasReset;
+
+    // state for the email address / url detector
+    ssize_t mScanOffset;
+    bool mSuppressHyphen;
 };
 
 }  // namespace
diff --git a/engine/src/flutter/libs/minikin/WordBreaker.cpp b/engine/src/flutter/libs/minikin/WordBreaker.cpp
index b422a62af8..f438cd5546 100644
--- a/engine/src/flutter/libs/minikin/WordBreaker.cpp
+++ b/engine/src/flutter/libs/minikin/WordBreaker.cpp
@@ -42,6 +42,8 @@ void WordBreaker::setText(const uint16_t* data, size_t size) {
     mIteratorWasReset = false;
     mLast = 0;
     mCurrent = 0;
+    mScanOffset = 0;
+    mSuppressHyphen = false;
     UErrorCode status = U_ZERO_ERROR;
     utext_openUChars(&mUText, data, size, &status);
     mBreakIterator->setText(&mUText, status);
@@ -52,9 +54,60 @@ ssize_t WordBreaker::current() const {
     return mCurrent;
 }
 
+enum ScanState {
+    START,
+    SAW_AT,
+    SAW_COLON,
+    SAW_COLON_SLASH,
+    SAW_COLON_SLASH_SLASH,
+};
+
 ssize_t WordBreaker::next() {
-    int32_t result;
     mLast = mCurrent;
+
+    // scan forward from current ICU position for email address or URL
+    if (mLast >= mScanOffset) {
+        ScanState state = START;
+        size_t i;
+        for (i = mLast; i < mTextSize; i++) {
+            uint16_t c = mText[i];
+            // scan only ASCII characters, stop at space
+            if (!(' ' < c && c <= 0x007E)) {
+                break;
+            }
+            if (state == START && c == '@') {
+                state = SAW_AT;
+            } else if (state == START && c == ':') {
+                state = SAW_COLON;
+            } else if (state == SAW_COLON || state == SAW_COLON_SLASH) {
+                if (c == '/') {
+                    state = static_cast<ScanState>((int)state + 1);  // next state adds a slash
+                } else {
+                    state = START;
+                }
+            }
+        }
+        if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
+            // no line breaks in entire email address or url
+            // TODO: refine this according to Chicago Manual of Style rules
+            while (i < mTextSize && mText[i] == ' ') {
+                i++;
+            }
+            mCurrent = i;
+            mSuppressHyphen = true;
+            // Setting mIteratorWasReset will cause next break to be computed following
+            // mCurrent, rather than following the current ICU iterator location.
+            mIteratorWasReset = true;
+            if (mBreakIterator->isBoundary(mCurrent)) {
+                return mCurrent;
+            }
+        } else {
+            mScanOffset = i;
+            mSuppressHyphen = false;
+        }
+    }
+
+    int32_t result;
     do {
         if (mIteratorWasReset) {
             result = mBreakIterator->following(mCurrent);
@@ -69,6 +122,9 @@ ssize_t WordBreaker::next() {
 }
 
 ssize_t WordBreaker::wordStart() const {
+    if (mSuppressHyphen) {
+        return mLast;
+    }
     ssize_t result = mLast;
     while (result < mCurrent) {
         UChar32 c;
@@ -86,6 +142,9 @@ ssize_t WordBreaker::wordStart() const {
 }
 
 ssize_t WordBreaker::wordEnd() const {
+    if (mSuppressHyphen) {
+        return mLast;
+    }
     ssize_t result = mCurrent;
     while (result > mLast) {
         UChar32 c;
diff --git a/engine/src/flutter/tests/WordBreakerTests.cpp b/engine/src/flutter/tests/WordBreakerTests.cpp
index d389d58cd1..4111a1b1f4 100644
--- a/engine/src/flutter/tests/WordBreakerTests.cpp
+++ b/engine/src/flutter/tests/WordBreakerTests.cpp
@@ -77,3 +77,73 @@ TEST_F(WordBreakerTest, punct) {
     EXPECT_EQ(9, breaker.wordStart());  // "world"
     EXPECT_EQ(14, breaker.wordEnd());
 }
+
+TEST_F(WordBreakerTest, email) {
+    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
+        ' ', 'x'};
+    WordBreaker breaker;
+    breaker.setLocale(icu::Locale::getEnglish());
+    breaker.setText(buf, NELEM(buf));
+    EXPECT_EQ(0, breaker.current());
+    EXPECT_EQ(16, breaker.next());  // after "foo@example.com "
+    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
+    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
+    EXPECT_EQ(16, breaker.wordStart());  // "x"
+    EXPECT_EQ(17, breaker.wordEnd());
+}
+
+TEST_F(WordBreakerTest, mailto) {
+    uint16_t buf[] = {'m', 'a', 'i', 'l', 't', 'o', ':', 'f', 'o', 'o', '@',
+        'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
+    WordBreaker breaker;
+    breaker.setLocale(icu::Locale::getEnglish());
+    breaker.setText(buf, NELEM(buf));
+    EXPECT_EQ(0, breaker.current());
+    EXPECT_EQ(23, breaker.next());  // after "mailto:foo@example.com "
+    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
+    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
+    EXPECT_EQ(23, breaker.wordStart());  // "x"
+    EXPECT_EQ(24, breaker.wordEnd());
+}
+
+TEST_F(WordBreakerTest, emailNonAscii) {
+    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
+        0x4E00};
+    WordBreaker breaker;
+    breaker.setLocale(icu::Locale::getEnglish());
+    breaker.setText(buf, NELEM(buf));
+    EXPECT_EQ(0, breaker.current());
+    EXPECT_EQ(15, breaker.next());  // after "foo@example.com"
+    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
+    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
+    EXPECT_EQ(15, breaker.wordStart());  // "一"
+    EXPECT_EQ(16, breaker.wordEnd());
+}
+
+TEST_F(WordBreakerTest, emailCombining) {
+    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
+        0x0303, ' ', 'x'};
+    WordBreaker breaker;
+    breaker.setLocale(icu::Locale::getEnglish());
+    breaker.setText(buf, NELEM(buf));
+    EXPECT_EQ(0, breaker.current());
+    EXPECT_EQ(17, breaker.next());  // after "foo@example.com̃"
+    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
+    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
+    EXPECT_EQ(17, breaker.wordStart());  // "x"
+    EXPECT_EQ(18, breaker.wordEnd());
+}
+
+TEST_F(WordBreakerTest, url) {
+    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a', 'm', 'p', 'l', 'e',
+        '.', 'c', 'o', 'm', ' ', 'x'};
+    WordBreaker breaker;
+    breaker.setLocale(icu::Locale::getEnglish());
+    breaker.setText(buf, NELEM(buf));
+    EXPECT_EQ(0, breaker.current());
+    EXPECT_EQ(19, breaker.next());  // after "http://example.com "
+    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
+    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
+    EXPECT_EQ(19, breaker.wordStart());  // "x"
+    EXPECT_EQ(20, breaker.wordEnd());
+}