Special-case URLs and email addresses for line breaking
Detect URLs and email addresses, and suppress both line breaking and hyphenation within them.

Bug: 20126487
Bug: 20566159
Change-Id: I43629347a063dcf579e355e5b678d7195f453ad9
This commit is contained in:
@@ -60,6 +60,10 @@ private:
|
||||
ssize_t mLast;
|
||||
ssize_t mCurrent;
|
||||
bool mIteratorWasReset;
|
||||
|
||||
// state for the email address / url detector
|
||||
ssize_t mScanOffset;
|
||||
bool mSuppressHyphen;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
@@ -42,6 +42,8 @@ void WordBreaker::setText(const uint16_t* data, size_t size) {
|
||||
mIteratorWasReset = false;
|
||||
mLast = 0;
|
||||
mCurrent = 0;
|
||||
mScanOffset = 0;
|
||||
mSuppressHyphen = false;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
utext_openUChars(&mUText, data, size, &status);
|
||||
mBreakIterator->setText(&mUText, status);
|
||||
@@ -52,9 +54,60 @@ ssize_t WordBreaker::current() const {
|
||||
return mCurrent;
|
||||
}
|
||||
|
||||
// States of the email-address / URL scanner run by WordBreaker::next().
//
// The values are spelled out because the scanner moves from SAW_COLON to
// SAW_COLON_SLASH to SAW_COLON_SLASH_SLASH by adding one to the current
// state, so those three enumerators must stay consecutive and in this order.
enum ScanState {
    START = 0,                  // nothing recognised yet
    SAW_AT = 1,                 // an '@' was seen while in START
    SAW_COLON = 2,              // a ':' was seen while in START
    SAW_COLON_SLASH = 3,        // ':' immediately followed by '/'
    SAW_COLON_SLASH_SLASH = 4,  // ':' immediately followed by "//"
};
|
||||
|
||||
ssize_t WordBreaker::next() {
|
||||
int32_t result;
|
||||
mLast = mCurrent;
|
||||
|
||||
// scan forward from current ICU position for email address or URL
|
||||
if (mLast >= mScanOffset) {
|
||||
ScanState state = START;
|
||||
size_t i;
|
||||
for (i = mLast; i < mTextSize; i++) {
|
||||
uint16_t c = mText[i];
|
||||
// scan only ASCII characters, stop at space
|
||||
if (!(' ' < c && c <= 0x007E)) {
|
||||
break;
|
||||
}
|
||||
if (state == START && c == '@') {
|
||||
state = SAW_AT;
|
||||
} else if (state == START && c == ':') {
|
||||
state = SAW_COLON;
|
||||
} else if (state == SAW_COLON || state == SAW_COLON_SLASH) {
|
||||
if (c == '/') {
|
||||
state = static_cast<ScanState>((int)state + 1); // next state adds a slash
|
||||
} else {
|
||||
state = START;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
|
||||
// no line breaks in entire email address or url
|
||||
// TODO: refine this according to Chicago Manual of Style rules
|
||||
while (i < mTextSize && mText[i] == ' ') {
|
||||
i++;
|
||||
}
|
||||
mCurrent = i;
|
||||
mSuppressHyphen = true;
|
||||
// Setting mIteratorWasReset will cause next break to be computed following
|
||||
// mCurrent, rather than following the current ICU iterator location.
|
||||
mIteratorWasReset = true;
|
||||
if (mBreakIterator->isBoundary(mCurrent)) {
|
||||
return mCurrent;
|
||||
}
|
||||
} else {
|
||||
mScanOffset = i;
|
||||
mSuppressHyphen = false;
|
||||
}
|
||||
}
|
||||
|
||||
int32_t result;
|
||||
do {
|
||||
if (mIteratorWasReset) {
|
||||
result = mBreakIterator->following(mCurrent);
|
||||
@@ -69,6 +122,9 @@ ssize_t WordBreaker::next() {
|
||||
}
|
||||
|
||||
ssize_t WordBreaker::wordStart() const {
|
||||
if (mSuppressHyphen) {
|
||||
return mLast;
|
||||
}
|
||||
ssize_t result = mLast;
|
||||
while (result < mCurrent) {
|
||||
UChar32 c;
|
||||
@@ -86,6 +142,9 @@ ssize_t WordBreaker::wordStart() const {
|
||||
}
|
||||
|
||||
ssize_t WordBreaker::wordEnd() const {
|
||||
if (mSuppressHyphen) {
|
||||
return mLast;
|
||||
}
|
||||
ssize_t result = mCurrent;
|
||||
while (result > mLast) {
|
||||
UChar32 c;
|
||||
|
||||
@@ -77,3 +77,73 @@ TEST_F(WordBreakerTest, punct) {
|
||||
EXPECT_EQ(9, breaker.wordStart()); // "world"
|
||||
EXPECT_EQ(14, breaker.wordEnd());
|
||||
}
|
||||
|
||||
// A bare email address followed by a space and one more word: the address and
// its trailing space must be returned as a single break unit.
TEST_F(WordBreakerTest, email) {
    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
                      ' ', 'x'};
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getEnglish());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(16, breaker.next());  // after "foo@example.com "
    // An empty word range (start >= end) is how the breaker reports that
    // hyphenation is suppressed inside the address. EXPECT_GE prints both
    // values when the check fails.
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_EQ(16, breaker.wordStart());  // "x"
    EXPECT_EQ(17, breaker.wordEnd());
}
|
||||
|
||||
// A "mailto:" address: the ':' is not followed by "//", so the scanner must
// still recognise the later '@' and keep the whole token unbroken.
TEST_F(WordBreakerTest, mailto) {
    uint16_t buf[] = {'m', 'a', 'i', 'l', 't', 'o', ':', 'f', 'o', 'o', '@',
                      'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getEnglish());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(23, breaker.next());  // after "mailto:foo@example.com "
    // An empty word range (start >= end) is how the breaker reports that
    // hyphenation is suppressed inside the address. EXPECT_GE prints both
    // values when the check fails.
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_EQ(23, breaker.wordStart());  // "x"
    EXPECT_EQ(24, breaker.wordEnd());
}
|
||||
|
||||
// An email address immediately followed by a non-ASCII character (U+4E00):
// the address scan stops at the first non-ASCII code unit, so the break falls
// right after the address.
TEST_F(WordBreakerTest, emailNonAscii) {
    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
                      0x4E00};
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getEnglish());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(15, breaker.next());  // after "foo@example.com"
    // An empty word range (start >= end) is how the breaker reports that
    // hyphenation is suppressed inside the address. EXPECT_GE prints both
    // values when the check fails.
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_EQ(15, breaker.wordStart());  // "一"
    EXPECT_EQ(16, breaker.wordEnd());
}
|
||||
|
||||
// An email address whose last letter carries a combining mark (U+0303),
// followed by a space and one more word. The offset right after the ASCII
// part (15) is expected not to be reported as a break; the first break
// expected is 17, after the mark and the trailing space.
TEST_F(WordBreakerTest, emailCombining) {
    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
                      0x0303, ' ', 'x'};
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getEnglish());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(17, breaker.next());  // after "foo@example.com̃ " (including the space)
    // An empty word range (start >= end) is how the breaker reports that
    // hyphenation is suppressed inside the address. EXPECT_GE prints both
    // values when the check fails.
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_EQ(17, breaker.wordStart());  // "x"
    EXPECT_EQ(18, breaker.wordEnd());
}
|
||||
|
||||
// A URL ("scheme://...") followed by a space and one more word: the URL and
// its trailing space must be returned as a single break unit.
TEST_F(WordBreakerTest, url) {
    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a', 'm', 'p', 'l', 'e',
                      '.', 'c', 'o', 'm', ' ', 'x'};
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getEnglish());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(19, breaker.next());  // after "http://example.com "
    // An empty word range (start >= end) is how the breaker reports that
    // hyphenation is suppressed inside the URL. EXPECT_GE prints both values
    // when the check fails.
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_EQ(19, breaker.wordStart());  // "x"
    EXPECT_EQ(20, breaker.wordEnd());
}
|
||||
|
||||
Reference in New Issue
Block a user