Special-case URLs and email addresses for line breaking

Detect URLs and email addresses, and suppress both line breaking and
hyphenation within them.

Bug: 20126487
Bug: 20566159

Change-Id: I43629347a063dcf579e355e5b678d7195f453ad9
This commit is contained in:
Raph Levien
2015-09-08 17:12:10 -07:00
parent c3b16d8894
commit 76022a08e3
3 changed files with 134 additions and 1 deletions

View File

@@ -60,6 +60,10 @@ private:
ssize_t mLast;
ssize_t mCurrent;
bool mIteratorWasReset;
// state for the email address / url detector
ssize_t mScanOffset;
bool mSuppressHyphen;
};
} // namespace

View File

@@ -42,6 +42,8 @@ void WordBreaker::setText(const uint16_t* data, size_t size) {
mIteratorWasReset = false;
mLast = 0;
mCurrent = 0;
mScanOffset = 0;
mSuppressHyphen = false;
UErrorCode status = U_ZERO_ERROR;
utext_openUChars(&mUText, data, size, &status);
mBreakIterator->setText(&mUText, status);
@@ -52,9 +54,60 @@ ssize_t WordBreaker::current() const {
return mCurrent;
}
// States of the email-address / URL scanner used by WordBreaker::next().
//
// The numeric order of the colon states is significant: the scanner moves from
// SAW_COLON to SAW_COLON_SLASH to SAW_COLON_SLASH_SLASH by adding one to the
// current state each time another '/' is seen, so these three must stay
// consecutive and in this order.
enum ScanState {
    START = 0,                  // nothing of interest seen yet
    SAW_AT = 1,                 // saw '@'; the run is treated as an email address
    SAW_COLON = 2,              // saw ':'
    SAW_COLON_SLASH = 3,        // saw ":/"
    SAW_COLON_SLASH_SLASH = 4,  // saw "://"; the run is treated as a URL
};
ssize_t WordBreaker::next() {
int32_t result;
mLast = mCurrent;
// scan forward from current ICU position for email address or URL
if (mLast >= mScanOffset) {
ScanState state = START;
size_t i;
for (i = mLast; i < mTextSize; i++) {
uint16_t c = mText[i];
// scan only ASCII characters, stop at space
if (!(' ' < c && c <= 0x007E)) {
break;
}
if (state == START && c == '@') {
state = SAW_AT;
} else if (state == START && c == ':') {
state = SAW_COLON;
} else if (state == SAW_COLON || state == SAW_COLON_SLASH) {
if (c == '/') {
state = static_cast<ScanState>((int)state + 1); // next state adds a slash
} else {
state = START;
}
}
}
if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
// no line breaks in entire email address or url
// TODO: refine this according to Chicago Manual of Style rules
while (i < mTextSize && mText[i] == ' ') {
i++;
}
mCurrent = i;
mSuppressHyphen = true;
// Setting mIteratorWasReset will cause next break to be computed following
// mCurrent, rather than following the current ICU iterator location.
mIteratorWasReset = true;
if (mBreakIterator->isBoundary(mCurrent)) {
return mCurrent;
}
} else {
mScanOffset = i;
mSuppressHyphen = false;
}
}
int32_t result;
do {
if (mIteratorWasReset) {
result = mBreakIterator->following(mCurrent);
@@ -69,6 +122,9 @@ ssize_t WordBreaker::next() {
}
ssize_t WordBreaker::wordStart() const {
if (mSuppressHyphen) {
return mLast;
}
ssize_t result = mLast;
while (result < mCurrent) {
UChar32 c;
@@ -86,6 +142,9 @@ ssize_t WordBreaker::wordStart() const {
}
ssize_t WordBreaker::wordEnd() const {
if (mSuppressHyphen) {
return mLast;
}
ssize_t result = mCurrent;
while (result > mLast) {
UChar32 c;

View File

@@ -77,3 +77,73 @@ TEST_F(WordBreakerTest, punct) {
EXPECT_EQ(9, breaker.wordStart()); // "world"
EXPECT_EQ(14, breaker.wordEnd());
}
// A plain email address followed by a space and another word: the first break
// must land after the address and its trailing space, and the address itself
// must report an empty word range (wordStart() >= wordEnd()).
TEST_F(WordBreakerTest, email) {
    uint16_t text[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
                       ' ', 'x'};
    const ssize_t textLen = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale::getEnglish());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First segment: "foo@example.com " (address plus trailing space).
    EXPECT_EQ(16, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // Second segment: "x", running to the end of the text.
    EXPECT_EQ(textLen, wb.next());
    EXPECT_EQ(16, wb.wordStart());
    EXPECT_EQ(17, wb.wordEnd());
}
// A "mailto:" address: the ':' is seen before the '@', yet the whole run up to
// and including the trailing space must still come back as a single segment
// with an empty word range (wordStart() >= wordEnd()).
TEST_F(WordBreakerTest, mailto) {
    uint16_t text[] = {'m', 'a', 'i', 'l', 't', 'o', ':', 'f', 'o', 'o', '@',
                       'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    const ssize_t textLen = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale::getEnglish());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First segment: "mailto:foo@example.com " (address plus trailing space).
    EXPECT_EQ(23, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // Second segment: "x", running to the end of the text.
    EXPECT_EQ(textLen, wb.next());
    EXPECT_EQ(23, wb.wordStart());
    EXPECT_EQ(24, wb.wordEnd());
}
// An email address immediately followed by a non-ASCII character (U+4E00) with
// no space in between: the address segment must end right before that
// character, which then forms its own word.
TEST_F(WordBreakerTest, emailNonAscii) {
    uint16_t text[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
                       0x4E00};
    const ssize_t textLen = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale::getEnglish());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First segment: "foo@example.com", reported with an empty word range.
    EXPECT_EQ(15, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // Second segment: the single character U+4E00 at the end of the text.
    EXPECT_EQ(textLen, wb.next());
    EXPECT_EQ(15, wb.wordStart());
    EXPECT_EQ(16, wb.wordEnd());
}
// An email address whose last letter is followed by a combining mark (U+0303),
// then a space and another word: the first break is expected only after the
// combining mark and the space (offset 17), not between the 'm' and the mark.
TEST_F(WordBreakerTest, emailCombining) {
    uint16_t text[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
                       0x0303, ' ', 'x'};
    const ssize_t textLen = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale::getEnglish());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First segment: "foo@example.com" + U+0303 + ' ', reported with an empty word range.
    EXPECT_EQ(17, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // Second segment: "x", running to the end of the text.
    EXPECT_EQ(textLen, wb.next());
    EXPECT_EQ(17, wb.wordStart());
    EXPECT_EQ(18, wb.wordEnd());
}
// A URL containing "://" followed by a space and another word: the first break
// must land after the URL and its trailing space, and the URL itself must
// report an empty word range (wordStart() >= wordEnd()).
TEST_F(WordBreakerTest, url) {
    uint16_t text[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a', 'm', 'p', 'l', 'e',
                       '.', 'c', 'o', 'm', ' ', 'x'};
    const ssize_t textLen = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale::getEnglish());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First segment: "http://example.com " (URL plus trailing space).
    EXPECT_EQ(19, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // Second segment: "x", running to the end of the text.
    EXPECT_EQ(textLen, wb.next());
    EXPECT_EQ(19, wb.wordStart());
    EXPECT_EQ(20, wb.wordEnd());
}