Refine hyphenation around punctuation

am: c3b16d8894 * commit 'c3b16d88941b337c2b0b861daf610bf9ca80f908': Refine hyphenation around punctuation
2016-02-17 23:41:55 +00:00
parent 7e4f21a4e3 c3b16d8894
commit 806dd8f792
7 changed files with 311 additions and 67 deletions
--- a/engine/src/flutter/include/minikin/LineBreaker.h
+++ b/engine/src/flutter/include/minikin/LineBreaker.h
@@ -27,6 +27,7 @@
 #include <cmath>
 #include <vector>
 #include "minikin/Hyphenator.h"
+#include "minikin/WordBreaker.h"

 namespace android {

@@ -102,11 +103,6 @@ class LineBreaker {
    public:
        const static int kTab_Shift = 29;  // keep synchronized with TAB_MASK in StaticLayout.java

-        ~LineBreaker() {
-            utext_close(&mUText);
-            delete mBreakIterator;
-        }
-
        // Note: Locale persists across multiple invocations (it is not cleaned up by finish()),
        // explicitly to avoid the cost of creating ICU BreakIterator objects. It should always
        // be set on the first invocation, but callers are encouraged not to call again unless
@@ -214,8 +210,7 @@ class LineBreaker {

        void finishBreaksOptimal();

-        icu::BreakIterator* mBreakIterator = nullptr;
-        UText mUText = UTEXT_INITIALIZER;
+        WordBreaker mWordBreaker;
        std::vector<uint16_t>mTextBuf;
        std::vector<float>mCharWidths;

--- a/engine/src/flutter/include/minikin/WordBreaker.h
+++ b/engine/src/flutter/include/minikin/WordBreaker.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A wrapper around ICU's line break iterator, that gives customized line
+ * break opportunities, as well as identifying words for the purpose of
+ * hyphenation.
+ */
+
+#ifndef MINIKIN_WORD_BREAKER_H
+#define MINIKIN_WORD_BREAKER_H
+
+#include "unicode/brkiter.h"
+#include <memory>
+
+namespace android {
+
+class WordBreaker {
+public:
+    ~WordBreaker() {
+        finish();
+    }
+
+    void setLocale(const icu::Locale& locale);
+
+    void setText(const uint16_t* data, size_t size);
+
+    // Advance iterator to next word break. Return offset, or -1 if EOT
+    ssize_t next();
+
+    // Current offset of iterator, equal to 0 at BOT or last return from next()
+    ssize_t current() const;
+
+    // After calling next(), wordStart() and wordEnd() are offsets defining the previous
+    // word. If wordEnd <= wordStart, it's not a word for the purpose of hyphenation.
+    ssize_t wordStart() const;
+
+    ssize_t wordEnd() const;
+
+    void finish();
+
+private:
+    std::unique_ptr<icu::BreakIterator> mBreakIterator;
+    UText mUText = UTEXT_INITIALIZER;
+    const uint16_t* mText = nullptr;
+    size_t mTextSize;
+    ssize_t mLast;
+    ssize_t mCurrent;
+    bool mIteratorWasReset;
+};
+
+}  // namespace
+
+#endif  // MINIKIN_WORD_BREAKER_H
--- a/engine/src/flutter/libs/minikin/Android.mk
+++ b/engine/src/flutter/libs/minikin/Android.mk
@@ -33,7 +33,8 @@ minikin_src_files := \
    MinikinInternal.cpp \
    MinikinRefCounted.cpp \
    MinikinFontFreeType.cpp \
-    SparseBitSet.cpp
+    SparseBitSet.cpp \
+    WordBreaker.cpp

 minikin_c_includes := \
    external/harfbuzz_ng/src \
--- a/engine/src/flutter/libs/minikin/LineBreaker.cpp
+++ b/engine/src/flutter/libs/minikin/LineBreaker.cpp
@@ -29,7 +29,6 @@ using std::vector;
 namespace android {

 const int CHAR_TAB = 0x0009;
-const uint16_t CHAR_SOFT_HYPHEN = 0x00AD;

 // Large scores in a hierarchy; we prefer desperate breaks to an overfull line. All these
 // constants are larger than any reasonable actual width score.
@@ -55,23 +54,16 @@ const size_t LONGEST_HYPHENATED_WORD = 45;
 const size_t MAX_TEXT_BUF_RETAIN = 32678;

 void LineBreaker::setLocale(const icu::Locale& locale, Hyphenator* hyphenator) {
-    delete mBreakIterator;
-    UErrorCode status = U_ZERO_ERROR;
-    mBreakIterator = icu::BreakIterator::createLineInstance(locale, status);
-    // TODO: check status
+    mWordBreaker.setLocale(locale);

-    // TODO: load actual resource dependent on locale; letting Minikin do it is a hack
    mHyphenator = hyphenator;
 }

 void LineBreaker::setText() {
-    UErrorCode status = U_ZERO_ERROR;
-    utext_openUChars(&mUText, mTextBuf.data(), mTextBuf.size(), &status);
-    mBreakIterator->setText(&mUText, status);
-    mBreakIterator->first();
+    mWordBreaker.setText(mTextBuf.data(), mTextBuf.size());

    // handle initial break here because addStyleRun may never be called
-    mBreakIterator->next();
+    mWordBreaker.next();
    mCandidates.clear();
    Candidate cand = {0, 0, 0.0, 0.0, 0.0, 0.0, 0, 0};
    mCandidates.push_back(cand);
@@ -151,8 +143,8 @@ float LineBreaker::addStyleRun(MinikinPaint* paint, const FontCollection* typefa
        mLinePenalty = std::max(mLinePenalty, hyphenPenalty * LINE_PENALTY_MULTIPLIER);
    }

-    size_t current = (size_t)mBreakIterator->current();
-    size_t wordEnd = start;
+    size_t current = (size_t)mWordBreaker.current();
+    size_t afterWord = start;
    size_t lastBreak = start;
    ParaWidth lastBreakWidth = mWidth;
    ParaWidth postBreak = mWidth;
@@ -170,58 +162,56 @@ float LineBreaker::addStyleRun(MinikinPaint* paint, const FontCollection* typefa
            mWidth += mCharWidths[i];
            if (!isLineEndSpace(c)) {
                postBreak = mWidth;
-                wordEnd = i + 1;
+                afterWord = i + 1;
            }
        }
        if (i + 1 == current) {
-            // Override ICU's treatment of soft hyphen as a break opportunity, because we want it
-            // to be a hyphen break, with penalty and drawing behavior.
-            if (c != CHAR_SOFT_HYPHEN) {
-                // TODO: Add a new type of HyphenEdit for breaks whose hyphen already exists, so
-                // we can pass the whole word down to Hyphenator like the soft hyphen case.
-                bool wordEndsInHyphen = isLineBreakingHyphen(c);
-                if (paint != nullptr && mHyphenator != nullptr &&
-                        mHyphenationFrequency != kHyphenationFrequency_None &&
-                        !wordEndsInHyphen && !temporarilySkipHyphenation &&
-                        wordEnd > lastBreak && wordEnd - lastBreak <= LONGEST_HYPHENATED_WORD) {
-                    mHyphenator->hyphenate(&mHyphBuf, &mTextBuf[lastBreak], wordEnd - lastBreak);
-    #if VERBOSE_DEBUG
-                    std::string hyphenatedString;
-                    for (size_t j = lastBreak; j < wordEnd; j++) {
-                        if (mHyphBuf[j - lastBreak]) hyphenatedString.push_back('-');
-                        // Note: only works with ASCII, should do UTF-8 conversion here
-                        hyphenatedString.push_back(buffer()[j]);
-                    }
-                    ALOGD("hyphenated string: %s", hyphenatedString.c_str());
-    #endif
+            // TODO: Add a new type of HyphenEdit for breaks whose hyphen already exists, so
+            // we can pass the whole word down to Hyphenator like the soft hyphen case.
+            bool wordEndsInHyphen = isLineBreakingHyphen(c);
+            size_t wordStart = mWordBreaker.wordStart();
+            size_t wordEnd = mWordBreaker.wordEnd();
+            if (paint != nullptr && mHyphenator != nullptr &&
+                    mHyphenationFrequency != kHyphenationFrequency_None &&
+                    !wordEndsInHyphen && !temporarilySkipHyphenation &&
+                    wordEnd > wordStart && wordEnd - wordStart <= LONGEST_HYPHENATED_WORD) {
+                mHyphenator->hyphenate(&mHyphBuf, &mTextBuf[wordStart], wordEnd - wordStart);
+#if VERBOSE_DEBUG
+                std::string hyphenatedString;
+                for (size_t j = wordStart; j < wordEnd; j++) {
+                    if (mHyphBuf[j - wordStart]) hyphenatedString.push_back('-');
+                    // Note: only works with ASCII, should do UTF-8 conversion here
+                    hyphenatedString.push_back(buffer()[j]);
+                }
+                ALOGD("hyphenated string: %s", hyphenatedString.c_str());
+#endif

-                    // measure hyphenated substrings
-                    for (size_t j = lastBreak; j < wordEnd; j++) {
-                        uint8_t hyph = mHyphBuf[j - lastBreak];
-                        if (hyph) {
-                            paint->hyphenEdit = hyph;
-                            layout.doLayout(mTextBuf.data(), lastBreak, j - lastBreak,
-                                    mTextBuf.size(), bidiFlags, style, *paint);
-                            ParaWidth hyphPostBreak = lastBreakWidth + layout.getAdvance();
-                            paint->hyphenEdit = 0;
-                            layout.doLayout(mTextBuf.data(), j, wordEnd - j,
-                                    mTextBuf.size(), bidiFlags, style, *paint);
-                            ParaWidth hyphPreBreak = postBreak - layout.getAdvance();
-                            addWordBreak(j, hyphPreBreak, hyphPostBreak, hyphenPenalty, hyph);
-                        }
+                // measure hyphenated substrings
+                for (size_t j = wordStart; j < wordEnd; j++) {
+                    uint8_t hyph = mHyphBuf[j - wordStart];
+                    if (hyph) {
+                        paint->hyphenEdit = hyph;
+                        layout.doLayout(mTextBuf.data(), lastBreak, j - lastBreak,
+                                mTextBuf.size(), bidiFlags, style, *paint);
+                        ParaWidth hyphPostBreak = lastBreakWidth + layout.getAdvance();
+                        paint->hyphenEdit = 0;
+                        layout.doLayout(mTextBuf.data(), j, afterWord - j,
+                                mTextBuf.size(), bidiFlags, style, *paint);
+                        ParaWidth hyphPreBreak = postBreak - layout.getAdvance();
+                        addWordBreak(j, hyphPreBreak, hyphPostBreak, hyphenPenalty, hyph);
                    }
                }
-                // Skip hyphenating the next word if and only if the present word ends in a hyphen
-                temporarilySkipHyphenation = wordEndsInHyphen;
-
-                // Skip break for zero-width characters inside replacement span
-                if (paint != nullptr || current == end || mCharWidths[current] > 0) {
-                    addWordBreak(current, mWidth, postBreak, 0.0, 0);
-                }
-                lastBreak = current;
-                lastBreakWidth = mWidth;
            }
-            current = (size_t)mBreakIterator->next();
+            // Skip hyphenating the next word if and only if the present word ends in a hyphen
+            temporarilySkipHyphenation = wordEndsInHyphen;
+
+            // Skip break for zero-width characters inside replacement span
+            if (paint != nullptr || current == end || mCharWidths[current] > 0) {
+                addWordBreak(current, mWidth, postBreak, 0.0, 0);
+            }
+            lastBreak = current;
+            lastBreakWidth = mWidth;
+            current = (size_t)mWordBreaker.next();
        }
    }

@@ -425,6 +415,7 @@ size_t LineBreaker::computeBreaks() {
 }

 void LineBreaker::finish() {
+    mWordBreaker.finish();
    mWidth = 0;
    mCandidates.clear();
    mBreaks.clear();
--- a/engine/src/flutter/libs/minikin/WordBreaker.cpp
+++ b/engine/src/flutter/libs/minikin/WordBreaker.cpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define LOG_TAG "Minikin"
+#include <cutils/log.h>
+
+#include "minikin/WordBreaker.h"
+
+#include <unicode/uchar.h>
+#include <unicode/utf16.h>
+
+namespace android {
+
+const uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
+
+void WordBreaker::setLocale(const icu::Locale& locale) {
+    UErrorCode status = U_ZERO_ERROR;
+    mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status));
+    // TODO: handle failure status
+    if (mText != nullptr) {
+        mBreakIterator->setText(&mUText, status);
+    }
+    mIteratorWasReset = true;
+}
+
+void WordBreaker::setText(const uint16_t* data, size_t size) {
+    mText = data;
+    mTextSize = size;
+    mIteratorWasReset = false;
+    mLast = 0;
+    mCurrent = 0;
+    UErrorCode status = U_ZERO_ERROR;
+    utext_openUChars(&mUText, data, size, &status);
+    mBreakIterator->setText(&mUText, status);
+    mBreakIterator->first();
+}
+
+ssize_t WordBreaker::current() const {
+    return mCurrent;
+}
+
+ssize_t WordBreaker::next() {
+    int32_t result;
+    mLast = mCurrent;
+    do {
+        if (mIteratorWasReset) {
+            result = mBreakIterator->following(mCurrent);
+            mIteratorWasReset = false;
+        } else {
+            result = mBreakIterator->next();
+        }
+    } while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize
+             && mText[result - 1] == CHAR_SOFT_HYPHEN);
+    mCurrent = (ssize_t)result;
+    return mCurrent;
+}
+
+ssize_t WordBreaker::wordStart() const {
+    ssize_t result = mLast;
+    while (result < mCurrent) {
+        UChar32 c;
+        ssize_t ix = result;
+        U16_NEXT(mText, ix, mCurrent, c);
+        int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
+        // strip leading punctuation, defined as OP and QU line breaking classes,
+        // see UAX #14
+        if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
+            break;
+        }
+        result = ix;
+    }
+    return result;
+}
+
+ssize_t WordBreaker::wordEnd() const {
+    ssize_t result = mCurrent;
+    while (result > mLast) {
+        UChar32 c;
+        ssize_t ix = result;
+        U16_PREV(mText, mLast, ix, c);
+        int32_t gc_mask = U_GET_GC_MASK(c);
+        // strip trailing space and punctuation
+        if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) {
+            break;
+        }
+        result = ix;
+    }
+    return result;
+}
+
+void WordBreaker::finish() {
+    mText = nullptr;
+    // Note: calling utext_close multiply is safe
+    utext_close(&mUText);
+}
+
+}  // namespace android
--- a/engine/src/flutter/tests/Android.mk
+++ b/engine/src/flutter/tests/Android.mk
@@ -79,7 +79,8 @@ LOCAL_SRC_FILES += \
    MinikinFontForTest.cpp \
    GraphemeBreakTests.cpp \
    LayoutUtilsTest.cpp \
-    UnicodeUtils.cpp
+    UnicodeUtils.cpp \
+    WordBreakerTests.cpp

 LOCAL_C_INCLUDES := \
    $(LOCAL_PATH)/../libs/minikin/ \
--- a/engine/src/flutter/tests/WordBreakerTests.cpp
+++ b/engine/src/flutter/tests/WordBreakerTests.cpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include "ICUTestBase.h"
+#include "UnicodeUtils.h"
+#include <minikin/WordBreaker.h>
+#include <unicode/locid.h>
+#include <unicode/uclean.h>
+#include <unicode/udata.h>
+
+#define LOG_TAG "Minikin"
+#include <cutils/log.h>
+
+#ifndef NELEM
+#define NELEM(x) ((sizeof(x) / sizeof((x)[0])))
+#endif
+
+using namespace android;
+
+typedef ICUTestBase WordBreakerTest;
+
+TEST_F(WordBreakerTest, basic) {
+    uint16_t buf[] = {'h', 'e', 'l', 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'};
+    WordBreaker breaker;
+    breaker.setLocale(icu::Locale::getEnglish());
+    breaker.setText(buf, NELEM(buf));
+    EXPECT_EQ(0, breaker.current());
+    EXPECT_EQ(6, breaker.next());  // after "hello "
+    EXPECT_EQ(0, breaker.wordStart());  // "hello"
+    EXPECT_EQ(5, breaker.wordEnd());
+    EXPECT_EQ(6, breaker.current());
+    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
+    EXPECT_EQ(6, breaker.wordStart());  // "world"
+    EXPECT_EQ(11, breaker.wordEnd());
+    EXPECT_EQ(11, breaker.current());
+}
+
+TEST_F(WordBreakerTest, softHyphen) {
+    uint16_t buf[] = {'h', 'e', 'l', 0x00AD, 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'};
+    WordBreaker breaker;
+    breaker.setLocale(icu::Locale::getEnglish());
+    breaker.setText(buf, NELEM(buf));
+    EXPECT_EQ(0, breaker.current());
+    EXPECT_EQ(7, breaker.next());  // after "hel{SOFT HYPHEN}lo "
+    EXPECT_EQ(0, breaker.wordStart());  // "hel{SOFT HYPHEN}lo"
+    EXPECT_EQ(6, breaker.wordEnd());
+    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
+    EXPECT_EQ(7, breaker.wordStart());  // "world"
+    EXPECT_EQ(12, breaker.wordEnd());
+}
+
+TEST_F(WordBreakerTest, punct) {
+    uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd',
+        '!', '!'};
+    WordBreaker breaker;
+    breaker.setLocale(icu::Locale::getEnglish());
+    breaker.setText(buf, NELEM(buf));
+    EXPECT_EQ(0, breaker.current());
+    EXPECT_EQ(9, breaker.next());  // after "¡¡hello, "
+    EXPECT_EQ(2, breaker.wordStart());  // "hello"
+    EXPECT_EQ(7, breaker.wordEnd());
+    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
+    EXPECT_EQ(9, breaker.wordStart());  // "world"
+    EXPECT_EQ(14, breaker.wordEnd());
+}