Refine hyphenation around punctuation
am: c3b16d8894
* commit 'c3b16d88941b337c2b0b861daf610bf9ca80f908':
Refine hyphenation around punctuation
This commit is contained in:
@@ -27,6 +27,7 @@
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
#include "minikin/Hyphenator.h"
|
||||
#include "minikin/WordBreaker.h"
|
||||
|
||||
namespace android {
|
||||
|
||||
@@ -102,11 +103,6 @@ class LineBreaker {
|
||||
public:
|
||||
const static int kTab_Shift = 29; // keep synchronized with TAB_MASK in StaticLayout.java
|
||||
|
||||
~LineBreaker() {
|
||||
utext_close(&mUText);
|
||||
delete mBreakIterator;
|
||||
}
|
||||
|
||||
// Note: Locale persists across multiple invocations (it is not cleaned up by finish()),
|
||||
// explicitly to avoid the cost of creating ICU BreakIterator objects. It should always
|
||||
// be set on the first invocation, but callers are encouraged not to call again unless
|
||||
@@ -214,8 +210,7 @@ class LineBreaker {
|
||||
|
||||
void finishBreaksOptimal();
|
||||
|
||||
icu::BreakIterator* mBreakIterator = nullptr;
|
||||
UText mUText = UTEXT_INITIALIZER;
|
||||
WordBreaker mWordBreaker;
|
||||
std::vector<uint16_t>mTextBuf;
|
||||
std::vector<float>mCharWidths;
|
||||
|
||||
|
||||
67
engine/src/flutter/include/minikin/WordBreaker.h
Normal file
67
engine/src/flutter/include/minikin/WordBreaker.h
Normal file
@@ -0,0 +1,67 @@
|
||||
/*
|
||||
* Copyright (C) 2015 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* A wrapper around ICU's line break iterator, that gives customized line
|
||||
* break opportunities, as well as identifying words for the purpose of
|
||||
* hyphenation.
|
||||
*/
|
||||
|
||||
#ifndef MINIKIN_WORD_BREAKER_H
|
||||
#define MINIKIN_WORD_BREAKER_H
|
||||
|
||||
#include "unicode/brkiter.h"
|
||||
#include <memory>
|
||||
|
||||
namespace android {
|
||||
|
||||
class WordBreaker {
|
||||
public:
|
||||
~WordBreaker() {
|
||||
finish();
|
||||
}
|
||||
|
||||
void setLocale(const icu::Locale& locale);
|
||||
|
||||
void setText(const uint16_t* data, size_t size);
|
||||
|
||||
// Advance iterator to next word break. Return offset, or -1 if EOT
|
||||
ssize_t next();
|
||||
|
||||
// Current offset of iterator, equal to 0 at BOT or last return from next()
|
||||
ssize_t current() const;
|
||||
|
||||
// After calling next(), wordStart() and wordEnd() are offsets defining the previous
|
||||
// word. If wordEnd <= wordStart, it's not a word for the purpose of hyphenation.
|
||||
ssize_t wordStart() const;
|
||||
|
||||
ssize_t wordEnd() const;
|
||||
|
||||
void finish();
|
||||
|
||||
private:
|
||||
std::unique_ptr<icu::BreakIterator> mBreakIterator;
|
||||
UText mUText = UTEXT_INITIALIZER;
|
||||
const uint16_t* mText = nullptr;
|
||||
size_t mTextSize;
|
||||
ssize_t mLast;
|
||||
ssize_t mCurrent;
|
||||
bool mIteratorWasReset;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif // MINIKIN_WORD_BREAKER_H
|
||||
@@ -33,7 +33,8 @@ minikin_src_files := \
|
||||
MinikinInternal.cpp \
|
||||
MinikinRefCounted.cpp \
|
||||
MinikinFontFreeType.cpp \
|
||||
SparseBitSet.cpp
|
||||
SparseBitSet.cpp \
|
||||
WordBreaker.cpp
|
||||
|
||||
minikin_c_includes := \
|
||||
external/harfbuzz_ng/src \
|
||||
|
||||
@@ -29,7 +29,6 @@ using std::vector;
|
||||
namespace android {
|
||||
|
||||
const int CHAR_TAB = 0x0009;
|
||||
const uint16_t CHAR_SOFT_HYPHEN = 0x00AD;
|
||||
|
||||
// Large scores in a hierarchy; we prefer desperate breaks to an overfull line. All these
|
||||
// constants are larger than any reasonable actual width score.
|
||||
@@ -55,23 +54,16 @@ const size_t LONGEST_HYPHENATED_WORD = 45;
|
||||
const size_t MAX_TEXT_BUF_RETAIN = 32678;
|
||||
|
||||
void LineBreaker::setLocale(const icu::Locale& locale, Hyphenator* hyphenator) {
|
||||
delete mBreakIterator;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
mBreakIterator = icu::BreakIterator::createLineInstance(locale, status);
|
||||
// TODO: check status
|
||||
mWordBreaker.setLocale(locale);
|
||||
|
||||
// TODO: load actual resource dependent on locale; letting Minikin do it is a hack
|
||||
mHyphenator = hyphenator;
|
||||
}
|
||||
|
||||
void LineBreaker::setText() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
utext_openUChars(&mUText, mTextBuf.data(), mTextBuf.size(), &status);
|
||||
mBreakIterator->setText(&mUText, status);
|
||||
mBreakIterator->first();
|
||||
mWordBreaker.setText(mTextBuf.data(), mTextBuf.size());
|
||||
|
||||
// handle initial break here because addStyleRun may never be called
|
||||
mBreakIterator->next();
|
||||
mWordBreaker.next();
|
||||
mCandidates.clear();
|
||||
Candidate cand = {0, 0, 0.0, 0.0, 0.0, 0.0, 0, 0};
|
||||
mCandidates.push_back(cand);
|
||||
@@ -151,8 +143,8 @@ float LineBreaker::addStyleRun(MinikinPaint* paint, const FontCollection* typefa
|
||||
mLinePenalty = std::max(mLinePenalty, hyphenPenalty * LINE_PENALTY_MULTIPLIER);
|
||||
}
|
||||
|
||||
size_t current = (size_t)mBreakIterator->current();
|
||||
size_t wordEnd = start;
|
||||
size_t current = (size_t)mWordBreaker.current();
|
||||
size_t afterWord = start;
|
||||
size_t lastBreak = start;
|
||||
ParaWidth lastBreakWidth = mWidth;
|
||||
ParaWidth postBreak = mWidth;
|
||||
@@ -170,58 +162,56 @@ float LineBreaker::addStyleRun(MinikinPaint* paint, const FontCollection* typefa
|
||||
mWidth += mCharWidths[i];
|
||||
if (!isLineEndSpace(c)) {
|
||||
postBreak = mWidth;
|
||||
wordEnd = i + 1;
|
||||
afterWord = i + 1;
|
||||
}
|
||||
}
|
||||
if (i + 1 == current) {
|
||||
// Override ICU's treatment of soft hyphen as a break opportunity, because we want it
|
||||
// to be a hyphen break, with penalty and drawing behavior.
|
||||
if (c != CHAR_SOFT_HYPHEN) {
|
||||
// TODO: Add a new type of HyphenEdit for breaks whose hyphen already exists, so
|
||||
// we can pass the whole word down to Hyphenator like the soft hyphen case.
|
||||
bool wordEndsInHyphen = isLineBreakingHyphen(c);
|
||||
if (paint != nullptr && mHyphenator != nullptr &&
|
||||
mHyphenationFrequency != kHyphenationFrequency_None &&
|
||||
!wordEndsInHyphen && !temporarilySkipHyphenation &&
|
||||
wordEnd > lastBreak && wordEnd - lastBreak <= LONGEST_HYPHENATED_WORD) {
|
||||
mHyphenator->hyphenate(&mHyphBuf, &mTextBuf[lastBreak], wordEnd - lastBreak);
|
||||
#if VERBOSE_DEBUG
|
||||
std::string hyphenatedString;
|
||||
for (size_t j = lastBreak; j < wordEnd; j++) {
|
||||
if (mHyphBuf[j - lastBreak]) hyphenatedString.push_back('-');
|
||||
// Note: only works with ASCII, should do UTF-8 conversion here
|
||||
hyphenatedString.push_back(buffer()[j]);
|
||||
}
|
||||
ALOGD("hyphenated string: %s", hyphenatedString.c_str());
|
||||
#endif
|
||||
// TODO: Add a new type of HyphenEdit for breaks whose hyphen already exists, so
|
||||
// we can pass the whole word down to Hyphenator like the soft hyphen case.
|
||||
bool wordEndsInHyphen = isLineBreakingHyphen(c);
|
||||
size_t wordStart = mWordBreaker.wordStart();
|
||||
size_t wordEnd = mWordBreaker.wordEnd();
|
||||
if (paint != nullptr && mHyphenator != nullptr &&
|
||||
mHyphenationFrequency != kHyphenationFrequency_None &&
|
||||
!wordEndsInHyphen && !temporarilySkipHyphenation &&
|
||||
wordEnd > wordStart && wordEnd - wordStart <= LONGEST_HYPHENATED_WORD) {
|
||||
mHyphenator->hyphenate(&mHyphBuf, &mTextBuf[wordStart], wordEnd - wordStart);
|
||||
#if VERBOSE_DEBUG
|
||||
std::string hyphenatedString;
|
||||
for (size_t j = wordStart; j < wordEnd; j++) {
|
||||
if (mHyphBuf[j - wordStart]) hyphenatedString.push_back('-');
|
||||
// Note: only works with ASCII, should do UTF-8 conversion here
|
||||
hyphenatedString.push_back(buffer()[j]);
|
||||
}
|
||||
ALOGD("hyphenated string: %s", hyphenatedString.c_str());
|
||||
#endif
|
||||
|
||||
// measure hyphenated substrings
|
||||
for (size_t j = lastBreak; j < wordEnd; j++) {
|
||||
uint8_t hyph = mHyphBuf[j - lastBreak];
|
||||
if (hyph) {
|
||||
paint->hyphenEdit = hyph;
|
||||
layout.doLayout(mTextBuf.data(), lastBreak, j - lastBreak,
|
||||
mTextBuf.size(), bidiFlags, style, *paint);
|
||||
ParaWidth hyphPostBreak = lastBreakWidth + layout.getAdvance();
|
||||
paint->hyphenEdit = 0;
|
||||
layout.doLayout(mTextBuf.data(), j, wordEnd - j,
|
||||
mTextBuf.size(), bidiFlags, style, *paint);
|
||||
ParaWidth hyphPreBreak = postBreak - layout.getAdvance();
|
||||
addWordBreak(j, hyphPreBreak, hyphPostBreak, hyphenPenalty, hyph);
|
||||
}
|
||||
// measure hyphenated substrings
|
||||
for (size_t j = wordStart; j < wordEnd; j++) {
|
||||
uint8_t hyph = mHyphBuf[j - wordStart];
|
||||
if (hyph) {
|
||||
paint->hyphenEdit = hyph;
|
||||
layout.doLayout(mTextBuf.data(), lastBreak, j - lastBreak,
|
||||
mTextBuf.size(), bidiFlags, style, *paint);
|
||||
ParaWidth hyphPostBreak = lastBreakWidth + layout.getAdvance();
|
||||
paint->hyphenEdit = 0;
|
||||
layout.doLayout(mTextBuf.data(), j, afterWord - j,
|
||||
mTextBuf.size(), bidiFlags, style, *paint);
|
||||
ParaWidth hyphPreBreak = postBreak - layout.getAdvance();
|
||||
addWordBreak(j, hyphPreBreak, hyphPostBreak, hyphenPenalty, hyph);
|
||||
}
|
||||
}
|
||||
// Skip hyphenating the next word if and only if the present word ends in a hyphen
|
||||
temporarilySkipHyphenation = wordEndsInHyphen;
|
||||
|
||||
// Skip break for zero-width characters inside replacement span
|
||||
if (paint != nullptr || current == end || mCharWidths[current] > 0) {
|
||||
addWordBreak(current, mWidth, postBreak, 0.0, 0);
|
||||
}
|
||||
lastBreak = current;
|
||||
lastBreakWidth = mWidth;
|
||||
}
|
||||
current = (size_t)mBreakIterator->next();
|
||||
// Skip hyphenating the next word if and only if the present word ends in a hyphen
|
||||
temporarilySkipHyphenation = wordEndsInHyphen;
|
||||
|
||||
// Skip break for zero-width characters inside replacement span
|
||||
if (paint != nullptr || current == end || mCharWidths[current] > 0) {
|
||||
addWordBreak(current, mWidth, postBreak, 0.0, 0);
|
||||
}
|
||||
lastBreak = current;
|
||||
lastBreakWidth = mWidth;
|
||||
current = (size_t)mWordBreaker.next();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -425,6 +415,7 @@ size_t LineBreaker::computeBreaks() {
|
||||
}
|
||||
|
||||
void LineBreaker::finish() {
|
||||
mWordBreaker.finish();
|
||||
mWidth = 0;
|
||||
mCandidates.clear();
|
||||
mBreaks.clear();
|
||||
|
||||
110
engine/src/flutter/libs/minikin/WordBreaker.cpp
Normal file
110
engine/src/flutter/libs/minikin/WordBreaker.cpp
Normal file
@@ -0,0 +1,110 @@
|
||||
/*
|
||||
* Copyright (C) 2015 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#define LOG_TAG "Minikin"
|
||||
#include <cutils/log.h>
|
||||
|
||||
#include "minikin/WordBreaker.h"
|
||||
|
||||
#include <unicode/uchar.h>
|
||||
#include <unicode/utf16.h>
|
||||
|
||||
namespace android {
|
||||
|
||||
const uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
|
||||
|
||||
// Replaces the break iterator with a line break iterator for `locale`.
//
// If text is currently attached (setText() was called and finish() has not
// been), the new iterator is pointed at that text and flagged so that the next
// call to next() resumes from the current offset rather than from the start.
//
// On ICU failure the error is logged and the breaker is left without an
// iterator instead of dereferencing a null pointer.
void WordBreaker::setLocale(const icu::Locale& locale) {
    UErrorCode status = U_ZERO_ERROR;
    mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status));
    if (U_FAILURE(status) || mBreakIterator == nullptr) {
        ALOGE("WordBreaker: failed to create line break iterator: %s", u_errorName(status));
        // Never keep a partially constructed iterator around.
        mBreakIterator.reset();
        return;
    }
    if (mText != nullptr) {
        mBreakIterator->setText(&mUText, status);
    }
    // The fresh iterator is positioned at the start of the text; next() must
    // re-synchronize it to mCurrent via following().
    mIteratorWasReset = true;
}
|
||||
|
||||
// Attaches the UTF-16 buffer [data, data + size) and rewinds iteration to
// offset 0. The buffer is referenced, not copied.
//
// If no break iterator exists yet (setLocale() has not been called), the text
// is only recorded; setLocale() attaches mUText to the iterator it creates
// whenever mText is non-null.
void WordBreaker::setText(const uint16_t* data, size_t size) {
    mText = data;
    mTextSize = size;
    mIteratorWasReset = false;
    mLast = 0;
    mCurrent = 0;
    UErrorCode status = U_ZERO_ERROR;
    utext_openUChars(&mUText, data, size, &status);
    if (mBreakIterator == nullptr) {
        // Nothing to point at the text yet; avoid a null dereference.
        return;
    }
    mBreakIterator->setText(&mUText, status);
    mBreakIterator->first();
}
|
||||
|
||||
ssize_t WordBreaker::current() const {
|
||||
return mCurrent;
|
||||
}
|
||||
|
||||
ssize_t WordBreaker::next() {
|
||||
int32_t result;
|
||||
mLast = mCurrent;
|
||||
do {
|
||||
if (mIteratorWasReset) {
|
||||
result = mBreakIterator->following(mCurrent);
|
||||
mIteratorWasReset = false;
|
||||
} else {
|
||||
result = mBreakIterator->next();
|
||||
}
|
||||
} while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize
|
||||
&& mText[result - 1] == CHAR_SOFT_HYPHEN);
|
||||
mCurrent = (ssize_t)result;
|
||||
return mCurrent;
|
||||
}
|
||||
|
||||
ssize_t WordBreaker::wordStart() const {
|
||||
ssize_t result = mLast;
|
||||
while (result < mCurrent) {
|
||||
UChar32 c;
|
||||
ssize_t ix = result;
|
||||
U16_NEXT(mText, ix, mCurrent, c);
|
||||
int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
|
||||
// strip leading punctuation, defined as OP and QU line breaking classes,
|
||||
// see UAX #14
|
||||
if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
|
||||
break;
|
||||
}
|
||||
result = ix;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
ssize_t WordBreaker::wordEnd() const {
|
||||
ssize_t result = mCurrent;
|
||||
while (result > mLast) {
|
||||
UChar32 c;
|
||||
ssize_t ix = result;
|
||||
U16_PREV(mText, mLast, ix, c);
|
||||
int32_t gc_mask = U_GET_GC_MASK(c);
|
||||
// strip trailing space and punctuation
|
||||
if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) {
|
||||
break;
|
||||
}
|
||||
result = ix;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Drops the reference to the caller's text buffer and closes the UText.
void WordBreaker::finish() {
    // Note: calling utext_close more than once is safe
    utext_close(&mUText);
    mText = nullptr;
}
|
||||
|
||||
} // namespace android
|
||||
@@ -79,7 +79,8 @@ LOCAL_SRC_FILES += \
|
||||
MinikinFontForTest.cpp \
|
||||
GraphemeBreakTests.cpp \
|
||||
LayoutUtilsTest.cpp \
|
||||
UnicodeUtils.cpp
|
||||
UnicodeUtils.cpp \
|
||||
WordBreakerTests.cpp
|
||||
|
||||
LOCAL_C_INCLUDES := \
|
||||
$(LOCAL_PATH)/../libs/minikin/ \
|
||||
|
||||
79
engine/src/flutter/tests/WordBreakerTests.cpp
Normal file
79
engine/src/flutter/tests/WordBreakerTests.cpp
Normal file
@@ -0,0 +1,79 @@
|
||||
/*
|
||||
* Copyright (C) 2015 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "ICUTestBase.h"
|
||||
#include "UnicodeUtils.h"
|
||||
#include <minikin/WordBreaker.h>
|
||||
#include <unicode/locid.h>
|
||||
#include <unicode/uclean.h>
|
||||
#include <unicode/udata.h>
|
||||
|
||||
#define LOG_TAG "Minikin"
|
||||
#include <cutils/log.h>
|
||||
|
||||
#ifndef NELEM
|
||||
#define NELEM(x) ((sizeof(x) / sizeof((x)[0])))
|
||||
#endif
|
||||
|
||||
using namespace android;
|
||||
|
||||
typedef ICUTestBase WordBreakerTest;
|
||||
|
||||
// Plain ASCII text: one break after the space, then end of text.
TEST_F(WordBreakerTest, basic) {
    uint16_t text[] = {'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    const ssize_t textLen = (ssize_t)NELEM(text);

    WordBreaker wb;
    wb.setLocale(icu::Locale::getEnglish());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First segment is "hello ", containing the word "hello".
    EXPECT_EQ(6, wb.next());
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(5, wb.wordEnd());
    EXPECT_EQ(6, wb.current());

    // Second segment runs to the end of the text and contains "world".
    EXPECT_EQ(textLen, wb.next());
    EXPECT_EQ(6, wb.wordStart());
    EXPECT_EQ(11, wb.wordEnd());
    EXPECT_EQ(11, wb.current());
}
|
||||
|
||||
// A soft hyphen inside a word must not produce a break of its own.
TEST_F(WordBreakerTest, softHyphen) {
    uint16_t text[] = {'h', 'e', 'l', 0x00AD, 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    const ssize_t textLen = (ssize_t)NELEM(text);

    WordBreaker wb;
    wb.setLocale(icu::Locale::getEnglish());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First segment is "hel{SOFT HYPHEN}lo ", word is "hel{SOFT HYPHEN}lo".
    EXPECT_EQ(7, wb.next());
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(6, wb.wordEnd());

    // Second segment runs to the end of the text and contains "world".
    EXPECT_EQ(textLen, wb.next());
    EXPECT_EQ(7, wb.wordStart());
    EXPECT_EQ(12, wb.wordEnd());
}
|
||||
|
||||
// Leading and trailing punctuation is excluded from the reported word bounds.
TEST_F(WordBreakerTest, punct) {
    uint16_t text[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l', 'o', ',', ' ',
                       'w', 'o', 'r', 'l', 'd', '!', '!'};
    const ssize_t textLen = (ssize_t)NELEM(text);

    WordBreaker wb;
    wb.setLocale(icu::Locale::getEnglish());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First segment is "¡¡hello, ", word is "hello".
    EXPECT_EQ(9, wb.next());
    EXPECT_EQ(2, wb.wordStart());
    EXPECT_EQ(7, wb.wordEnd());

    // Second segment runs to the end of the text, word is "world".
    EXPECT_EQ(textLen, wb.next());
    EXPECT_EQ(9, wb.wordStart());
    EXPECT_EQ(14, wb.wordEnd());
}
|
||||
Reference in New Issue
Block a user