Refine hyphenation around punctuation

am: c3b16d8894

* commit 'c3b16d88941b337c2b0b861daf610bf9ca80f908':
  Refine hyphenation around punctuation
This commit is contained in:
Raph Levien
2016-02-17 23:41:55 +00:00
committed by android-build-merger
7 changed files with 311 additions and 67 deletions

View File

@@ -27,6 +27,7 @@
#include <cmath>
#include <vector>
#include "minikin/Hyphenator.h"
#include "minikin/WordBreaker.h"
namespace android {
@@ -102,11 +103,6 @@ class LineBreaker {
public:
const static int kTab_Shift = 29; // keep synchronized with TAB_MASK in StaticLayout.java
~LineBreaker() {
utext_close(&mUText);
delete mBreakIterator;
}
// Note: Locale persists across multiple invocations (it is not cleaned up by finish()),
// explicitly to avoid the cost of creating ICU BreakIterator objects. It should always
// be set on the first invocation, but callers are encouraged not to call again unless
@@ -214,8 +210,7 @@ class LineBreaker {
void finishBreaksOptimal();
icu::BreakIterator* mBreakIterator = nullptr;
UText mUText = UTEXT_INITIALIZER;
WordBreaker mWordBreaker;
std::vector<uint16_t>mTextBuf;
std::vector<float>mCharWidths;

View File

@@ -0,0 +1,67 @@
/*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* A wrapper around ICU's line break iterator, that gives customized line
* break opportunities, as well as identifying words for the purpose of
* hyphenation.
*/
#ifndef MINIKIN_WORD_BREAKER_H
#define MINIKIN_WORD_BREAKER_H
#include "unicode/brkiter.h"
#include <memory>
namespace android {
/**
 * Iterates over line-break opportunities in a UTF-16 buffer using an ICU line
 * break iterator, and reports the bounds of the word preceding each break so
 * that callers can hyphenate it.
 *
 * Typical use: setLocale(), then setText(), then next() repeatedly, then
 * finish(). The text buffer passed to setText() is not copied; it must stay
 * valid until finish() (or destruction).
 */
class WordBreaker {
public:
    ~WordBreaker() {
        // Releases the UText opened by setText(); harmless if nothing is open.
        finish();
    }

    // Create the break iterator for the given locale. May be called while text
    // is set; iteration then continues from the current offset.
    void setLocale(const icu::Locale& locale);

    // Start iterating over the given UTF-16 text. Resets current() to 0.
    void setText(const uint16_t* data, size_t size);

    // Advance iterator to next word break. Return offset, or -1 if EOT
    ssize_t next();

    // Current offset of iterator, equal to 0 at BOT or last return from next()
    ssize_t current() const;

    // After calling next(), wordStart() and wordEnd() are offsets defining the previous
    // word. If wordEnd <= wordStart, it's not a word for the purpose of hyphenation.
    ssize_t wordStart() const;
    ssize_t wordEnd() const;

    // Drop the reference to the text and close the underlying UText.
    void finish();

private:
    std::unique_ptr<icu::BreakIterator> mBreakIterator;
    UText mUText = UTEXT_INITIALIZER;
    const uint16_t* mText = nullptr;

    // All scalar state is initialized so that current(), next(), wordStart()
    // and wordEnd() never read indeterminate values before setText() is called.
    size_t mTextSize = 0;
    ssize_t mLast = 0;      // offset of the break before the most recent next()
    ssize_t mCurrent = 0;   // offset returned by the most recent next()
    bool mIteratorWasReset = false;  // set by setLocale(); makes next() resync via following()
};
} // namespace
#endif // MINIKIN_WORD_BREAKER_H

View File

@@ -33,7 +33,8 @@ minikin_src_files := \
MinikinInternal.cpp \
MinikinRefCounted.cpp \
MinikinFontFreeType.cpp \
SparseBitSet.cpp
SparseBitSet.cpp \
WordBreaker.cpp
minikin_c_includes := \
external/harfbuzz_ng/src \

View File

@@ -29,7 +29,6 @@ using std::vector;
namespace android {
const int CHAR_TAB = 0x0009;
const uint16_t CHAR_SOFT_HYPHEN = 0x00AD;
// Large scores in a hierarchy; we prefer desperate breaks to an overfull line. All these
// constants are larger than any reasonable actual width score.
@@ -55,23 +54,16 @@ const size_t LONGEST_HYPHENATED_WORD = 45;
const size_t MAX_TEXT_BUF_RETAIN = 32678;
// Sets the locale used for finding line breaks and records the hyphenator to use.
// NOTE(review): this span comes from a diff rendering whose +/- markers were stripped.
// The delete/createLineInstance lines look like the removed pre-image and the
// mWordBreaker.setLocale(locale) call like its replacement — confirm against the
// repository before treating this as the actual function body.
void LineBreaker::setLocale(const icu::Locale& locale, Hyphenator* hyphenator) {
delete mBreakIterator;
UErrorCode status = U_ZERO_ERROR;
mBreakIterator = icu::BreakIterator::createLineInstance(locale, status);
// TODO: check status
mWordBreaker.setLocale(locale);
// TODO: load actual resource dependent on locale; letting Minikin do it is a hack
mHyphenator = hyphenator;
}
void LineBreaker::setText() {
UErrorCode status = U_ZERO_ERROR;
utext_openUChars(&mUText, mTextBuf.data(), mTextBuf.size(), &status);
mBreakIterator->setText(&mUText, status);
mBreakIterator->first();
mWordBreaker.setText(mTextBuf.data(), mTextBuf.size());
// handle initial break here because addStyleRun may never be called
mBreakIterator->next();
mWordBreaker.next();
mCandidates.clear();
Candidate cand = {0, 0, 0.0, 0.0, 0.0, 0.0, 0, 0};
mCandidates.push_back(cand);
@@ -151,8 +143,8 @@ float LineBreaker::addStyleRun(MinikinPaint* paint, const FontCollection* typefa
mLinePenalty = std::max(mLinePenalty, hyphenPenalty * LINE_PENALTY_MULTIPLIER);
}
size_t current = (size_t)mBreakIterator->current();
size_t wordEnd = start;
size_t current = (size_t)mWordBreaker.current();
size_t afterWord = start;
size_t lastBreak = start;
ParaWidth lastBreakWidth = mWidth;
ParaWidth postBreak = mWidth;
@@ -170,58 +162,56 @@ float LineBreaker::addStyleRun(MinikinPaint* paint, const FontCollection* typefa
mWidth += mCharWidths[i];
if (!isLineEndSpace(c)) {
postBreak = mWidth;
wordEnd = i + 1;
afterWord = i + 1;
}
}
if (i + 1 == current) {
// Override ICU's treatment of soft hyphen as a break opportunity, because we want it
// to be a hyphen break, with penalty and drawing behavior.
if (c != CHAR_SOFT_HYPHEN) {
// TODO: Add a new type of HyphenEdit for breaks whose hyphen already exists, so
// we can pass the whole word down to Hyphenator like the soft hyphen case.
bool wordEndsInHyphen = isLineBreakingHyphen(c);
if (paint != nullptr && mHyphenator != nullptr &&
mHyphenationFrequency != kHyphenationFrequency_None &&
!wordEndsInHyphen && !temporarilySkipHyphenation &&
wordEnd > lastBreak && wordEnd - lastBreak <= LONGEST_HYPHENATED_WORD) {
mHyphenator->hyphenate(&mHyphBuf, &mTextBuf[lastBreak], wordEnd - lastBreak);
#if VERBOSE_DEBUG
std::string hyphenatedString;
for (size_t j = lastBreak; j < wordEnd; j++) {
if (mHyphBuf[j - lastBreak]) hyphenatedString.push_back('-');
// Note: only works with ASCII, should do UTF-8 conversion here
hyphenatedString.push_back(buffer()[j]);
}
ALOGD("hyphenated string: %s", hyphenatedString.c_str());
#endif
// TODO: Add a new type of HyphenEdit for breaks whose hyphen already exists, so
// we can pass the whole word down to Hyphenator like the soft hyphen case.
bool wordEndsInHyphen = isLineBreakingHyphen(c);
size_t wordStart = mWordBreaker.wordStart();
size_t wordEnd = mWordBreaker.wordEnd();
if (paint != nullptr && mHyphenator != nullptr &&
mHyphenationFrequency != kHyphenationFrequency_None &&
!wordEndsInHyphen && !temporarilySkipHyphenation &&
wordEnd > wordStart && wordEnd - wordStart <= LONGEST_HYPHENATED_WORD) {
mHyphenator->hyphenate(&mHyphBuf, &mTextBuf[wordStart], wordEnd - wordStart);
#if VERBOSE_DEBUG
std::string hyphenatedString;
for (size_t j = wordStart; j < wordEnd; j++) {
if (mHyphBuf[j - wordStart]) hyphenatedString.push_back('-');
// Note: only works with ASCII, should do UTF-8 conversion here
hyphenatedString.push_back(buffer()[j]);
}
ALOGD("hyphenated string: %s", hyphenatedString.c_str());
#endif
// measure hyphenated substrings
for (size_t j = lastBreak; j < wordEnd; j++) {
uint8_t hyph = mHyphBuf[j - lastBreak];
if (hyph) {
paint->hyphenEdit = hyph;
layout.doLayout(mTextBuf.data(), lastBreak, j - lastBreak,
mTextBuf.size(), bidiFlags, style, *paint);
ParaWidth hyphPostBreak = lastBreakWidth + layout.getAdvance();
paint->hyphenEdit = 0;
layout.doLayout(mTextBuf.data(), j, wordEnd - j,
mTextBuf.size(), bidiFlags, style, *paint);
ParaWidth hyphPreBreak = postBreak - layout.getAdvance();
addWordBreak(j, hyphPreBreak, hyphPostBreak, hyphenPenalty, hyph);
}
// measure hyphenated substrings
for (size_t j = wordStart; j < wordEnd; j++) {
uint8_t hyph = mHyphBuf[j - wordStart];
if (hyph) {
paint->hyphenEdit = hyph;
layout.doLayout(mTextBuf.data(), lastBreak, j - lastBreak,
mTextBuf.size(), bidiFlags, style, *paint);
ParaWidth hyphPostBreak = lastBreakWidth + layout.getAdvance();
paint->hyphenEdit = 0;
layout.doLayout(mTextBuf.data(), j, afterWord - j,
mTextBuf.size(), bidiFlags, style, *paint);
ParaWidth hyphPreBreak = postBreak - layout.getAdvance();
addWordBreak(j, hyphPreBreak, hyphPostBreak, hyphenPenalty, hyph);
}
}
// Skip hyphenating the next word if and only if the present word ends in a hyphen
temporarilySkipHyphenation = wordEndsInHyphen;
// Skip break for zero-width characters inside replacement span
if (paint != nullptr || current == end || mCharWidths[current] > 0) {
addWordBreak(current, mWidth, postBreak, 0.0, 0);
}
lastBreak = current;
lastBreakWidth = mWidth;
}
current = (size_t)mBreakIterator->next();
// Skip hyphenating the next word if and only if the present word ends in a hyphen
temporarilySkipHyphenation = wordEndsInHyphen;
// Skip break for zero-width characters inside replacement span
if (paint != nullptr || current == end || mCharWidths[current] > 0) {
addWordBreak(current, mWidth, postBreak, 0.0, 0);
}
lastBreak = current;
lastBreakWidth = mWidth;
current = (size_t)mWordBreaker.next();
}
}
@@ -425,6 +415,7 @@ size_t LineBreaker::computeBreaks() {
}
void LineBreaker::finish() {
mWordBreaker.finish();
mWidth = 0;
mCandidates.clear();
mBreaks.clear();

View File

@@ -0,0 +1,110 @@
/*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#define LOG_TAG "Minikin"
#include <cutils/log.h>
#include "minikin/WordBreaker.h"
#include <unicode/uchar.h>
#include <unicode/utf16.h>
namespace android {
// U+00AD SOFT HYPHEN. WordBreaker::next() skips break positions reported by ICU
// that fall immediately after this character.
const uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
// Replaces the break iterator with a line-break iterator for |locale|.
//
// If text is currently set (setText() without a later finish()), the new
// iterator is attached to that text, and mIteratorWasReset tells next() to
// resynchronize with following(mCurrent) instead of continuing from the
// iterator's own (freshly reset) position.
//
// On ICU failure the error is logged and no iterator is kept, rather than
// dereferencing an unusable pointer.
void WordBreaker::setLocale(const icu::Locale& locale) {
    UErrorCode status = U_ZERO_ERROR;
    mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status));
    if (U_FAILURE(status) || mBreakIterator == nullptr) {
        ALOGE("WordBreaker: unable to create line break iterator: %s", u_errorName(status));
        mBreakIterator.reset();
    } else if (mText != nullptr) {
        mBreakIterator->setText(&mUText, status);
        if (U_FAILURE(status)) {
            ALOGE("WordBreaker: unable to attach text to break iterator: %s",
                    u_errorName(status));
        }
    }
    mIteratorWasReset = true;
}
void WordBreaker::setText(const uint16_t* data, size_t size) {
mText = data;
mTextSize = size;
mIteratorWasReset = false;
mLast = 0;
mCurrent = 0;
UErrorCode status = U_ZERO_ERROR;
utext_openUChars(&mUText, data, size, &status);
mBreakIterator->setText(&mUText, status);
mBreakIterator->first();
}
// Offset most recently produced by next(); setText() resets it to 0.
ssize_t WordBreaker::current() const
{
    return mCurrent;
}
// Advances to the next break opportunity and returns its offset, or
// icu::BreakIterator::DONE (-1) at end of text.
//
// Break positions that ICU reports directly after a soft hyphen are skipped,
// unless the position is the end of the text. The previous offset is kept in
// mLast so wordStart()/wordEnd() can describe the word just passed.
//
// If there is no iterator or no text (setLocale()/setText() not called, or
// finish() already called), DONE is returned instead of dereferencing null.
ssize_t WordBreaker::next() {
    mLast = mCurrent;
    if (mBreakIterator == nullptr || mText == nullptr) {
        mCurrent = static_cast<ssize_t>(icu::BreakIterator::DONE);
        return mCurrent;
    }

    int32_t result;
    do {
        if (mIteratorWasReset) {
            // The iterator was replaced by setLocale(); resynchronize it with
            // our position rather than continuing from its own start.
            result = mBreakIterator->following(mCurrent);
            mIteratorWasReset = false;
        } else {
            result = mBreakIterator->next();
        }
    } while (result != icu::BreakIterator::DONE && static_cast<size_t>(result) != mTextSize
            && mText[result - 1] == CHAR_SOFT_HYPHEN);

    mCurrent = static_cast<ssize_t>(result);
    return mCurrent;
}
// Start offset of the word between the previous break (mLast) and the current
// one (mCurrent), after skipping leading punctuation. Leading punctuation is
// any code point whose UAX #14 line breaking class is OP (open punctuation) or
// QU (quotation).
ssize_t WordBreaker::wordStart() const {
    ssize_t start = mLast;
    for (;;) {
        if (start >= mCurrent) {
            return start;
        }
        // Decode one code point (possibly a surrogate pair) beginning at |start|.
        ssize_t afterCodePoint = start;
        UChar32 codePoint;
        U16_NEXT(mText, afterCodePoint, mCurrent, codePoint);

        const int32_t lineBreakClass = u_getIntPropertyValue(codePoint, UCHAR_LINE_BREAK);
        const bool isLeadingPunctuation =
                lineBreakClass == U_LB_OPEN_PUNCTUATION || lineBreakClass == U_LB_QUOTATION;
        if (!isLeadingPunctuation) {
            return start;
        }
        start = afterCodePoint;
    }
}
// End offset (exclusive) of the word between the previous break (mLast) and the
// current one (mCurrent), after dropping trailing code points whose general
// category is space separator (Zs) or any punctuation (P*).
ssize_t WordBreaker::wordEnd() const {
    ssize_t end = mCurrent;
    for (;;) {
        if (end <= mLast) {
            return end;
        }
        // Decode the code point (possibly a surrogate pair) that ends at |end|.
        ssize_t beforeCodePoint = end;
        UChar32 codePoint;
        U16_PREV(mText, mLast, beforeCodePoint, codePoint);

        const int32_t categoryMask = U_GET_GC_MASK(codePoint);
        const bool isTrailingSpaceOrPunct =
                (categoryMask & (U_GC_ZS_MASK | U_GC_P_MASK)) != 0;
        if (!isTrailingSpaceOrPunct) {
            return end;
        }
        end = beforeCodePoint;
    }
}
// Ends iteration over the current text: closes the UText and forgets the
// caller's buffer. Also invoked from the destructor.
void WordBreaker::finish() {
    // Note: calling utext_close multiply is safe
    utext_close(&mUText);
    mText = nullptr;
}
} // namespace android

View File

@@ -79,7 +79,8 @@ LOCAL_SRC_FILES += \
MinikinFontForTest.cpp \
GraphemeBreakTests.cpp \
LayoutUtilsTest.cpp \
UnicodeUtils.cpp
UnicodeUtils.cpp \
WordBreakerTests.cpp
LOCAL_C_INCLUDES := \
$(LOCAL_PATH)/../libs/minikin/ \

View File

@@ -0,0 +1,79 @@
/*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <gtest/gtest.h>
#include "ICUTestBase.h"
#include "UnicodeUtils.h"
#include <minikin/WordBreaker.h>
#include <unicode/locid.h>
#include <unicode/uclean.h>
#include <unicode/udata.h>
#define LOG_TAG "Minikin"
#include <cutils/log.h>
// Number of elements in a statically sized array; defined here only if no
// earlier header already provides it.
#ifndef NELEM
#define NELEM(x) ((sizeof(x) / sizeof((x)[0])))
#endif
using namespace android;
// The tests below use the ICUTestBase fixture under the name WordBreakerTest.
typedef ICUTestBase WordBreakerTest;
// Two plain words separated by one space: breaks fall after the space and at
// end of text, and the word bounds exclude the trailing space.
TEST_F(WordBreakerTest, basic) {
    uint16_t text[] = {'h', 'e', 'l', 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    WordBreaker wb;
    wb.setLocale(icu::Locale::getEnglish());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First break: after "hello "; the word is "hello".
    EXPECT_EQ(6, wb.next());
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(5, wb.wordEnd());
    EXPECT_EQ(6, wb.current());

    // Second break: end of text; the word is "world".
    EXPECT_EQ((ssize_t)NELEM(text), wb.next());
    EXPECT_EQ(6, wb.wordStart());
    EXPECT_EQ(11, wb.wordEnd());
    EXPECT_EQ(11, wb.current());
}
// A soft hyphen inside a word must not produce a break of its own; the whole
// "hel{SOFT HYPHEN}lo" is reported as one word.
TEST_F(WordBreakerTest, softHyphen) {
    uint16_t text[] = {'h', 'e', 'l', 0x00AD, 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    WordBreaker wb;
    wb.setLocale(icu::Locale::getEnglish());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First break: after "hel{SOFT HYPHEN}lo "; the word is "hel{SOFT HYPHEN}lo".
    EXPECT_EQ(7, wb.next());
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(6, wb.wordEnd());

    // Second break: end of text; the word is "world".
    EXPECT_EQ((ssize_t)NELEM(text), wb.next());
    EXPECT_EQ(7, wb.wordStart());
    EXPECT_EQ(12, wb.wordEnd());
}
// Leading inverted exclamation marks and trailing comma/exclamation marks are
// stripped from the reported word bounds.
TEST_F(WordBreakerTest, punct) {
    uint16_t text[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd',
            '!', '!'};
    WordBreaker wb;
    wb.setLocale(icu::Locale::getEnglish());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First break: after "¡¡hello, "; the word is "hello".
    EXPECT_EQ(9, wb.next());
    EXPECT_EQ(2, wb.wordStart());
    EXPECT_EQ(7, wb.wordEnd());

    // Second break: end of text; the word is "world".
    EXPECT_EQ((ssize_t)NELEM(text), wb.next());
    EXPECT_EQ(9, wb.wordStart());
    EXPECT_EQ(14, wb.wordEnd());
}