From c3b16d88941b337c2b0b861daf610bf9ca80f908 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Fri, 4 Sep 2015 17:23:05 -0700 Subject: [PATCH] Refine hyphenation around punctuation Implement a WordBreaker that defines our concept of valid word boundaries, customizing the ICU behavior. Currently, we suppress line breaks at soft hyphens (these are handled specially). Also, the new WordBreaker class has methods that determine the start and end of the word (punctuation stripped) for the purpose of hyphenation. This patch, in its current form, doesn't handle email addresses and URLs specially, but the WordBreaker class is the correct place to do so. Also, special case handling of hyphens and dashes is still done in LineBreaker, but all of that should be moved to WordBreaker. Bug: 20126487 Bug: 20566159 Change-Id: I492cbad963f9b74a2915f010dad46bb91f97b2fe --- .../src/flutter/include/minikin/LineBreaker.h | 9 +- .../src/flutter/include/minikin/WordBreaker.h | 67 +++++++++++ engine/src/flutter/libs/minikin/Android.mk | 3 +- .../src/flutter/libs/minikin/LineBreaker.cpp | 107 ++++++++--------- .../src/flutter/libs/minikin/WordBreaker.cpp | 110 ++++++++++++++++++ engine/src/flutter/tests/Android.mk | 3 +- engine/src/flutter/tests/WordBreakerTests.cpp | 79 +++++++++++++ 7 files changed, 311 insertions(+), 67 deletions(-) create mode 100644 engine/src/flutter/include/minikin/WordBreaker.h create mode 100644 engine/src/flutter/libs/minikin/WordBreaker.cpp create mode 100644 engine/src/flutter/tests/WordBreakerTests.cpp diff --git a/engine/src/flutter/include/minikin/LineBreaker.h b/engine/src/flutter/include/minikin/LineBreaker.h index e031fb3182..e28f11da44 100644 --- a/engine/src/flutter/include/minikin/LineBreaker.h +++ b/engine/src/flutter/include/minikin/LineBreaker.h @@ -27,6 +27,7 @@ #include #include #include "minikin/Hyphenator.h" +#include "minikin/WordBreaker.h" namespace android { @@ -102,11 +103,6 @@ class LineBreaker { public: const static int kTab_Shift = 29; // keep synchronized with TAB_MASK in StaticLayout.java - ~LineBreaker() { - utext_close(&mUText); - delete mBreakIterator; - } - // Note: Locale persists across multiple invocations (it is not cleaned up by finish()), // explicitly to avoid the cost of creating ICU BreakIterator objects. It should always // be set on the first invocation, but callers are encouraged not to call again unless @@ -214,8 +210,7 @@ class LineBreaker { void finishBreaksOptimal(); - icu::BreakIterator* mBreakIterator = nullptr; - UText mUText = UTEXT_INITIALIZER; + WordBreaker mWordBreaker; std::vectormTextBuf; std::vectormCharWidths; diff --git a/engine/src/flutter/include/minikin/WordBreaker.h b/engine/src/flutter/include/minikin/WordBreaker.h new file mode 100644 index 0000000000..22275bde84 --- /dev/null +++ b/engine/src/flutter/include/minikin/WordBreaker.h @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A wrapper around ICU's line break iterator, that gives customized line + * break opportunities, as well as identifying words for the purpose of + * hyphenation. + */ + +#ifndef MINIKIN_WORD_BREAKER_H +#define MINIKIN_WORD_BREAKER_H + +#include "unicode/brkiter.h" +#include + +namespace android { + +class WordBreaker { +public: + ~WordBreaker() { + finish(); + } + + void setLocale(const icu::Locale& locale); + + void setText(const uint16_t* data, size_t size); + + // Advance iterator to next word break. Return offset, or -1 if EOT + ssize_t next(); + + // Current offset of iterator, equal to 0 at BOT or last return from next() + ssize_t current() const; + + // After calling next(), wordStart() and wordEnd() are offsets defining the previous + // word. If wordEnd <= wordStart, it's not a word for the purpose of hyphenation. + ssize_t wordStart() const; + + ssize_t wordEnd() const; + + void finish(); + +private: + std::unique_ptr mBreakIterator; + UText mUText = UTEXT_INITIALIZER; + const uint16_t* mText = nullptr; + size_t mTextSize; + ssize_t mLast; + ssize_t mCurrent; + bool mIteratorWasReset; +}; + +} // namespace + +#endif // MINIKIN_WORD_BREAKER_H diff --git a/engine/src/flutter/libs/minikin/Android.mk b/engine/src/flutter/libs/minikin/Android.mk index 1fd0e6e8a0..7cd6f6883c 100644 --- a/engine/src/flutter/libs/minikin/Android.mk +++ b/engine/src/flutter/libs/minikin/Android.mk @@ -33,7 +33,8 @@ minikin_src_files := \ MinikinInternal.cpp \ MinikinRefCounted.cpp \ MinikinFontFreeType.cpp \ - SparseBitSet.cpp + SparseBitSet.cpp \ + WordBreaker.cpp minikin_c_includes := \ external/harfbuzz_ng/src \ diff --git a/engine/src/flutter/libs/minikin/LineBreaker.cpp b/engine/src/flutter/libs/minikin/LineBreaker.cpp index a2507e0ba8..214f19599d 100644 --- a/engine/src/flutter/libs/minikin/LineBreaker.cpp +++ b/engine/src/flutter/libs/minikin/LineBreaker.cpp @@ -29,7 +29,6 @@ using std::vector; namespace android { const int CHAR_TAB = 0x0009; -const uint16_t CHAR_SOFT_HYPHEN = 0x00AD; // Large scores in a hierarchy; we prefer desperate breaks to an overfull line. All these // constants are larger than any reasonable actual width score. @@ -55,23 +54,16 @@ const size_t LONGEST_HYPHENATED_WORD = 45; const size_t MAX_TEXT_BUF_RETAIN = 32678; void LineBreaker::setLocale(const icu::Locale& locale, Hyphenator* hyphenator) { - delete mBreakIterator; - UErrorCode status = U_ZERO_ERROR; - mBreakIterator = icu::BreakIterator::createLineInstance(locale, status); - // TODO: check status + mWordBreaker.setLocale(locale); - // TODO: load actual resource dependent on locale; letting Minikin do it is a hack mHyphenator = hyphenator; } void LineBreaker::setText() { - UErrorCode status = U_ZERO_ERROR; - utext_openUChars(&mUText, mTextBuf.data(), mTextBuf.size(), &status); - mBreakIterator->setText(&mUText, status); - mBreakIterator->first(); + mWordBreaker.setText(mTextBuf.data(), mTextBuf.size()); // handle initial break here because addStyleRun may never be called - mBreakIterator->next(); + mWordBreaker.next(); mCandidates.clear(); Candidate cand = {0, 0, 0.0, 0.0, 0.0, 0.0, 0, 0}; mCandidates.push_back(cand); @@ -151,8 +143,8 @@ float LineBreaker::addStyleRun(MinikinPaint* paint, const FontCollection* typefa mLinePenalty = std::max(mLinePenalty, hyphenPenalty * LINE_PENALTY_MULTIPLIER); } - size_t current = (size_t)mBreakIterator->current(); - size_t wordEnd = start; + size_t current = (size_t)mWordBreaker.current(); + size_t afterWord = start; size_t lastBreak = start; ParaWidth lastBreakWidth = mWidth; ParaWidth postBreak = mWidth; @@ -170,58 +162,56 @@ float LineBreaker::addStyleRun(MinikinPaint* paint, const FontCollection* typefa mWidth += mCharWidths[i]; if (!isLineEndSpace(c)) { postBreak = mWidth; - wordEnd = i + 1; + afterWord = i + 1; } } if (i + 1 == current) { - // Override ICU's treatment of soft hyphen as a break opportunity, because we want it - // to be a hyphen break, with penalty and drawing behavior. - if (c != CHAR_SOFT_HYPHEN) { - // TODO: Add a new type of HyphenEdit for breaks whose hyphen already exists, so - // we can pass the whole word down to Hyphenator like the soft hyphen case. - bool wordEndsInHyphen = isLineBreakingHyphen(c); - if (paint != nullptr && mHyphenator != nullptr && - mHyphenationFrequency != kHyphenationFrequency_None && - !wordEndsInHyphen && !temporarilySkipHyphenation && - wordEnd > lastBreak && wordEnd - lastBreak <= LONGEST_HYPHENATED_WORD) { - mHyphenator->hyphenate(&mHyphBuf, &mTextBuf[lastBreak], wordEnd - lastBreak); - #if VERBOSE_DEBUG - std::string hyphenatedString; - for (size_t j = lastBreak; j < wordEnd; j++) { - if (mHyphBuf[j - lastBreak]) hyphenatedString.push_back('-'); - // Note: only works with ASCII, should do UTF-8 conversion here - hyphenatedString.push_back(buffer()[j]); - } - ALOGD("hyphenated string: %s", hyphenatedString.c_str()); - #endif + // TODO: Add a new type of HyphenEdit for breaks whose hyphen already exists, so + // we can pass the whole word down to Hyphenator like the soft hyphen case. + bool wordEndsInHyphen = isLineBreakingHyphen(c); + size_t wordStart = mWordBreaker.wordStart(); + size_t wordEnd = mWordBreaker.wordEnd(); + if (paint != nullptr && mHyphenator != nullptr && + mHyphenationFrequency != kHyphenationFrequency_None && + !wordEndsInHyphen && !temporarilySkipHyphenation && + wordEnd > wordStart && wordEnd - wordStart <= LONGEST_HYPHENATED_WORD) { + mHyphenator->hyphenate(&mHyphBuf, &mTextBuf[wordStart], wordEnd - wordStart); +#if VERBOSE_DEBUG + std::string hyphenatedString; + for (size_t j = wordStart; j < wordEnd; j++) { + if (mHyphBuf[j - wordStart]) hyphenatedString.push_back('-'); + // Note: only works with ASCII, should do UTF-8 conversion here + hyphenatedString.push_back(buffer()[j]); + } + ALOGD("hyphenated string: %s", hyphenatedString.c_str()); +#endif - // measure hyphenated substrings - for (size_t j = lastBreak; j < wordEnd; j++) { - uint8_t hyph = mHyphBuf[j - lastBreak]; - if (hyph) { - paint->hyphenEdit = hyph; - layout.doLayout(mTextBuf.data(), lastBreak, j - lastBreak, - mTextBuf.size(), bidiFlags, style, *paint); - ParaWidth hyphPostBreak = lastBreakWidth + layout.getAdvance(); - paint->hyphenEdit = 0; - layout.doLayout(mTextBuf.data(), j, wordEnd - j, - mTextBuf.size(), bidiFlags, style, *paint); - ParaWidth hyphPreBreak = postBreak - layout.getAdvance(); - addWordBreak(j, hyphPreBreak, hyphPostBreak, hyphenPenalty, hyph); - } + // measure hyphenated substrings + for (size_t j = wordStart; j < wordEnd; j++) { + uint8_t hyph = mHyphBuf[j - wordStart]; + if (hyph) { + paint->hyphenEdit = hyph; + layout.doLayout(mTextBuf.data(), lastBreak, j - lastBreak, + mTextBuf.size(), bidiFlags, style, *paint); + ParaWidth hyphPostBreak = lastBreakWidth + layout.getAdvance(); + paint->hyphenEdit = 0; + layout.doLayout(mTextBuf.data(), j, afterWord - j, + mTextBuf.size(), bidiFlags, style, *paint); + ParaWidth hyphPreBreak = postBreak - layout.getAdvance(); + addWordBreak(j, hyphPreBreak, hyphPostBreak, hyphenPenalty, hyph); } } - // Skip hyphenating the next word if and only if the present word ends in a hyphen - temporarilySkipHyphenation = wordEndsInHyphen; - - // Skip break for zero-width characters inside replacement span - if (paint != nullptr || current == end || mCharWidths[current] > 0) { - addWordBreak(current, mWidth, postBreak, 0.0, 0); - } - lastBreak = current; - lastBreakWidth = mWidth; } - current = (size_t)mBreakIterator->next(); + // Skip hyphenating the next word if and only if the present word ends in a hyphen + temporarilySkipHyphenation = wordEndsInHyphen; + + // Skip break for zero-width characters inside replacement span + if (paint != nullptr || current == end || mCharWidths[current] > 0) { + addWordBreak(current, mWidth, postBreak, 0.0, 0); + } + lastBreak = current; + lastBreakWidth = mWidth; + current = (size_t)mWordBreaker.next(); } } @@ -425,6 +415,7 @@ size_t LineBreaker::computeBreaks() { } void LineBreaker::finish() { + mWordBreaker.finish(); mWidth = 0; mCandidates.clear(); mBreaks.clear(); diff --git a/engine/src/flutter/libs/minikin/WordBreaker.cpp b/engine/src/flutter/libs/minikin/WordBreaker.cpp new file mode 100644 index 0000000000..b422a62af8 --- /dev/null +++ b/engine/src/flutter/libs/minikin/WordBreaker.cpp @@ -0,0 +1,110 @@ +/* + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define LOG_TAG "Minikin" +#include + +#include "minikin/WordBreaker.h" + +#include +#include + +namespace android { + +const uint32_t CHAR_SOFT_HYPHEN = 0x00AD; + +void WordBreaker::setLocale(const icu::Locale& locale) { + UErrorCode status = U_ZERO_ERROR; + mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status)); + // TODO: handle failure status + if (mText != nullptr) { + mBreakIterator->setText(&mUText, status); + } + mIteratorWasReset = true; +} + +void WordBreaker::setText(const uint16_t* data, size_t size) { + mText = data; + mTextSize = size; + mIteratorWasReset = false; + mLast = 0; + mCurrent = 0; + UErrorCode status = U_ZERO_ERROR; + utext_openUChars(&mUText, data, size, &status); + mBreakIterator->setText(&mUText, status); + mBreakIterator->first(); +} + +ssize_t WordBreaker::current() const { + return mCurrent; +} + +ssize_t WordBreaker::next() { + int32_t result; + mLast = mCurrent; + do { + if (mIteratorWasReset) { + result = mBreakIterator->following(mCurrent); + mIteratorWasReset = false; + } else { + result = mBreakIterator->next(); + } + } while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize + && mText[result - 1] == CHAR_SOFT_HYPHEN); + mCurrent = (ssize_t)result; + return mCurrent; +} + +ssize_t WordBreaker::wordStart() const { + ssize_t result = mLast; + while (result < mCurrent) { + UChar32 c; + ssize_t ix = result; + U16_NEXT(mText, ix, mCurrent, c); + int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK); + // strip leading punctuation, defined as OP and QU line breaking classes, + // see UAX #14 + if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) { + break; + } + result = ix; + } + return result; +} + +ssize_t WordBreaker::wordEnd() const { + ssize_t result = mCurrent; + while (result > mLast) { + UChar32 c; + ssize_t ix = result; + U16_PREV(mText, mLast, ix, c); + int32_t gc_mask = U_GET_GC_MASK(c); + // strip trailing space and punctuation + if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) { + break; + } + result = ix; + } + return result; +} + +void WordBreaker::finish() { + mText = nullptr; + // Note: calling utext_close multiply is safe + utext_close(&mUText); +} + +} // namespace android diff --git a/engine/src/flutter/tests/Android.mk b/engine/src/flutter/tests/Android.mk index d2aa5fd107..e6586d7178 100644 --- a/engine/src/flutter/tests/Android.mk +++ b/engine/src/flutter/tests/Android.mk @@ -79,7 +79,8 @@ LOCAL_SRC_FILES += \ MinikinFontForTest.cpp \ GraphemeBreakTests.cpp \ LayoutUtilsTest.cpp \ - UnicodeUtils.cpp + UnicodeUtils.cpp \ + WordBreakerTests.cpp LOCAL_C_INCLUDES := \ $(LOCAL_PATH)/../libs/minikin/ \ diff --git a/engine/src/flutter/tests/WordBreakerTests.cpp b/engine/src/flutter/tests/WordBreakerTests.cpp new file mode 100644 index 0000000000..d389d58cd1 --- /dev/null +++ b/engine/src/flutter/tests/WordBreakerTests.cpp @@ -0,0 +1,79 @@ +/* + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "ICUTestBase.h" +#include "UnicodeUtils.h" +#include +#include +#include +#include + +#define LOG_TAG "Minikin" +#include + +#ifndef NELEM +#define NELEM(x) ((sizeof(x) / sizeof((x)[0]))) +#endif + +using namespace android; + +typedef ICUTestBase WordBreakerTest; + +TEST_F(WordBreakerTest, basic) { + uint16_t buf[] = {'h', 'e', 'l', 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'}; + WordBreaker breaker; + breaker.setLocale(icu::Locale::getEnglish()); + breaker.setText(buf, NELEM(buf)); + EXPECT_EQ(0, breaker.current()); + EXPECT_EQ(6, breaker.next()); // after "hello " + EXPECT_EQ(0, breaker.wordStart()); // "hello" + EXPECT_EQ(5, breaker.wordEnd()); + EXPECT_EQ(6, breaker.current()); + EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end + EXPECT_EQ(6, breaker.wordStart()); // "world" + EXPECT_EQ(11, breaker.wordEnd()); + EXPECT_EQ(11, breaker.current()); +} + +TEST_F(WordBreakerTest, softHyphen) { + uint16_t buf[] = {'h', 'e', 'l', 0x00AD, 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'}; + WordBreaker breaker; + breaker.setLocale(icu::Locale::getEnglish()); + breaker.setText(buf, NELEM(buf)); + EXPECT_EQ(0, breaker.current()); + EXPECT_EQ(7, breaker.next()); // after "hel{SOFT HYPHEN}lo " + EXPECT_EQ(0, breaker.wordStart()); // "hel{SOFT HYPHEN}lo" + EXPECT_EQ(6, breaker.wordEnd()); + EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end + EXPECT_EQ(7, breaker.wordStart()); // "world" + EXPECT_EQ(12, breaker.wordEnd()); +} + +TEST_F(WordBreakerTest, punct) { + uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd', + '!', '!'}; + WordBreaker breaker; + breaker.setLocale(icu::Locale::getEnglish()); + breaker.setText(buf, NELEM(buf)); + EXPECT_EQ(0, breaker.current()); + EXPECT_EQ(9, breaker.next()); // after "¡¡hello, " + EXPECT_EQ(2, breaker.wordStart()); // "hello" + EXPECT_EQ(7, breaker.wordEnd()); + EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end + EXPECT_EQ(9, breaker.wordStart()); // "world" + EXPECT_EQ(14, breaker.wordEnd()); +}