Add line breaks to email addresses and URLs

This change adds acceptable line breaks according to sections 7.42
(Dividing URLs and e-mail addresses) and 14.12 (URLs or DOIs and line
breaks) of the Chicago Manual of Style (16th ed.). In general, these
rules place breaks before punctuation symbols and suppress them after
hyphens.

Bug: 20126487
Bug: 20566159
Change-Id: I2d07d516b920a506a2f718c38fb435c5eb1ee1f8
This commit is contained in:
Raph Levien
2015-09-08 18:19:53 -07:00
parent 76022a08e3
commit 5102c20dd5
3 changed files with 175 additions and 22 deletions

View File

@@ -63,7 +63,7 @@ private:
// state for the email address / url detector
ssize_t mScanOffset;
bool mSuppressHyphen;
bool mInEmailOrUrl;
};
} // namespace

View File

@@ -43,7 +43,7 @@ void WordBreaker::setText(const uint16_t* data, size_t size) {
mLast = 0;
mCurrent = 0;
mScanOffset = 0;
mSuppressHyphen = false;
mInEmailOrUrl = false;
UErrorCode status = U_ZERO_ERROR;
utext_openUChars(&mUText, data, size, &status);
mBreakIterator->setText(&mUText, status);
@@ -62,6 +62,17 @@ enum ScanState {
SAW_COLON_SLASH_SLASH,
};
// Reports whether a line break may be placed directly after the code unit c when it occurs
// inside a URL or email address. Per the Chicago Manual of Style recommendation this is the
// case for a colon, an equals sign and an ampersand.
static bool breakAfter(uint16_t c) {
    switch (c) {
        case ':':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
// Reports whether a line break may be placed directly before the code unit c when it occurs
// inside a URL or email address. Per the Chicago Manual of Style recommendation this covers
// the punctuation characters ~ . , - _ ? # % = and &.
static bool breakBefore(uint16_t c) {
    switch (c) {
        case '~':
        case '.':
        case ',':
        case '-':
        case '_':
        case '?':
        case '#':
        case '%':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
ssize_t WordBreaker::next() {
mLast = mCurrent;
@@ -88,23 +99,45 @@ ssize_t WordBreaker::next() {
}
}
if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
// no line breaks in entire email address or url
// TODO: refine this according to Chicago Manual of Style rules
while (i < mTextSize && mText[i] == ' ') {
i++;
if (!mBreakIterator->isBoundary(i)) {
i = mBreakIterator->following(i);
}
mCurrent = i;
mSuppressHyphen = true;
// Setting mIteratorWasReset will cause next break to be computed following
// mCurrent, rather than following the current ICU iterator location.
mInEmailOrUrl = true;
mIteratorWasReset = true;
if (mBreakIterator->isBoundary(mCurrent)) {
return mCurrent;
}
} else {
mScanOffset = i;
mSuppressHyphen = false;
mInEmailOrUrl = false;
}
mScanOffset = i;
}
if (mInEmailOrUrl) {
// special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.)
uint16_t lastChar = mText[mLast];
ssize_t i;
for (i = mLast + 1; i < mScanOffset; i++) {
if (breakAfter(lastChar)) {
break;
}
// break after double slash
if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') {
break;
}
uint16_t thisChar = mText[i];
// never break after hyphen
if (lastChar != '-') {
if (breakBefore(thisChar)) {
break;
}
// break before single slash
if (thisChar == '/' && lastChar != '/' &&
!(i + 1 < mScanOffset && mText[i + 1] == '/')) {
break;
}
}
lastChar = thisChar;
}
mCurrent = i;
return mCurrent;
}
int32_t result;
@@ -122,7 +155,7 @@ ssize_t WordBreaker::next() {
}
ssize_t WordBreaker::wordStart() const {
if (mSuppressHyphen) {
if (mInEmailOrUrl) {
return mLast;
}
ssize_t result = mLast;
@@ -142,7 +175,7 @@ ssize_t WordBreaker::wordStart() const {
}
ssize_t WordBreaker::wordEnd() const {
if (mSuppressHyphen) {
if (mInEmailOrUrl) {
return mLast;
}
ssize_t result = mCurrent;

View File

@@ -85,7 +85,9 @@ TEST_F(WordBreakerTest, email) {
breaker.setLocale(icu::Locale::getEnglish());
breaker.setText(buf, NELEM(buf));
EXPECT_EQ(0, breaker.current());
EXPECT_EQ(16, breaker.next()); // after "foo@example.com "
EXPECT_EQ(11, breaker.next()); // after "foo@example"
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
EXPECT_EQ(16, breaker.next()); // after ".com "
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
EXPECT_EQ(16, breaker.wordStart()); // "x"
@@ -99,13 +101,19 @@ TEST_F(WordBreakerTest, mailto) {
breaker.setLocale(icu::Locale::getEnglish());
breaker.setText(buf, NELEM(buf));
EXPECT_EQ(0, breaker.current());
EXPECT_EQ(23, breaker.next()); // after "mailto:foo@example.com "
EXPECT_EQ(7, breaker.next()); // after "mailto:"
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
EXPECT_EQ(18, breaker.next()); // after "foo@example"
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
EXPECT_EQ(23, breaker.next()); // after ".com "
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
EXPECT_EQ(23, breaker.wordStart()); // "x"
EXPECT_EQ(24, breaker.wordEnd());
}
// The current logic always places a line break between a detected email address or URL
// and an immediately following non-ASCII character.
TEST_F(WordBreakerTest, emailNonAscii) {
uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
0x4E00};
@@ -113,7 +121,9 @@ TEST_F(WordBreakerTest, emailNonAscii) {
breaker.setLocale(icu::Locale::getEnglish());
breaker.setText(buf, NELEM(buf));
EXPECT_EQ(0, breaker.current());
EXPECT_EQ(15, breaker.next()); // after "foo@example.com"
EXPECT_EQ(11, breaker.next()); // after "foo@example"
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
EXPECT_EQ(15, breaker.next()); // after ".com"
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
EXPECT_EQ(15, breaker.wordStart()); // "一"
@@ -127,13 +137,31 @@ TEST_F(WordBreakerTest, emailCombining) {
breaker.setLocale(icu::Locale::getEnglish());
breaker.setText(buf, NELEM(buf));
EXPECT_EQ(0, breaker.current());
EXPECT_EQ(17, breaker.next()); // after "foo@example.com̃"
EXPECT_EQ(11, breaker.next()); // after "foo@example"
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
EXPECT_EQ(17, breaker.next()); // after ".com̃ "
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
EXPECT_EQ(17, breaker.wordStart()); // "x"
EXPECT_EQ(18, breaker.wordEnd());
}
// An '@' surrounded by spaces: the expected breaks are after "a ", after "@ " and at the end
// of the text. The "@ " segment is expected to report an empty word range
// (wordStart() >= wordEnd()).
TEST_F(WordBreakerTest, lonelyAt) {
    uint16_t text[] = {'a', ' ', '@', ' ', 'b'};
    WordBreaker wb;
    wb.setLocale(icu::Locale::getEnglish());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First segment: "a " containing the word "a".
    EXPECT_EQ(2, wb.next());
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(1, wb.wordEnd());

    // Second segment: "@ " with no word in it.
    EXPECT_EQ(4, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // Final segment: "b" running to the end of the buffer.
    EXPECT_EQ(static_cast<ssize_t>(NELEM(text)), wb.next());
    EXPECT_EQ(4, wb.wordStart());
    EXPECT_EQ(5, wb.wordEnd());
}
TEST_F(WordBreakerTest, url) {
uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a', 'm', 'p', 'l', 'e',
'.', 'c', 'o', 'm', ' ', 'x'};
@@ -141,9 +169,101 @@ TEST_F(WordBreakerTest, url) {
breaker.setLocale(icu::Locale::getEnglish());
breaker.setText(buf, NELEM(buf));
EXPECT_EQ(0, breaker.current());
EXPECT_EQ(19, breaker.next()); // after "http://example.com "
EXPECT_EQ(5, breaker.next()); // after "http:"
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
EXPECT_EQ(7, breaker.next()); // after "//"
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
EXPECT_EQ(14, breaker.next()); // after "example"
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
EXPECT_EQ(19, breaker.next()); // after ".com "
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
EXPECT_EQ(19, breaker.wordStart()); // "x"
EXPECT_EQ(20, breaker.wordEnd());
}
// Breaks according to section 14.12 of Chicago Manual of Style, *URLs or DOIs and line breaks*.
// The expected break offsets are listed in order; after every step the reported word range
// must be empty (wordStart() >= wordEnd()).
TEST_F(WordBreakerTest, urlBreakChars) {
    uint16_t url[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '.', 'b', '/', '~', 'c', ',', 'd',
                      '-', 'e', '?', 'f', '=', 'g', '&', 'h', '#', 'i', '%', 'j', '_', 'k', '/',
                      'l'};
    WordBreaker wb;
    wb.setLocale(icu::Locale::getEnglish());
    wb.setText(url, NELEM(url));
    EXPECT_EQ(0, wb.current());

    const ssize_t expectedBreaks[] = {
        5,   // after "http:"
        7,   // after "//"
        8,   // after "a"
        10,  // after ".b"
        11,  // after "/"
        13,  // after "~c"
        15,  // after ",d"
        17,  // after "-e"
        19,  // after "?f"
        20,  // after "="
        21,  // after "g"
        22,  // after "&"
        23,  // after "h"
        25,  // after "#i"
        27,  // after "%j"
        29,  // after "_k"
        static_cast<ssize_t>(NELEM(url)),  // end
    };
    for (size_t step = 0; step < NELEM(expectedBreaks); step++) {
        EXPECT_EQ(expectedBreaks[step], wb.next());
        EXPECT_GE(wb.wordStart(), wb.wordEnd());
    }
}
// For "http://a-/b" the only expected breaks are after "http:", after "//", after "a" and at
// the end of the text, i.e. none is expected between the hyphen and the slash following it.
// Every step must report an empty word range.
TEST_F(WordBreakerTest, urlNoHyphenBreak) {
    uint16_t url[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '-', '/', 'b'};
    WordBreaker wb;
    wb.setLocale(icu::Locale::getEnglish());
    wb.setText(url, NELEM(url));
    EXPECT_EQ(0, wb.current());

    const ssize_t expectedBreaks[] = {
        5,  // after "http:"
        7,  // after "//"
        8,  // after "a"
        static_cast<ssize_t>(NELEM(url)),  // end
    };
    for (size_t step = 0; step < NELEM(expectedBreaks); step++) {
        EXPECT_EQ(expectedBreaks[step], wb.next());
        EXPECT_GE(wb.wordStart(), wb.wordEnd());
    }
}
// A URL whose last character is a single slash ("http://a/"): expected breaks are after
// "http:", after "//", after "a" (before the trailing slash) and at the end of the text.
// Every step must report an empty word range.
TEST_F(WordBreakerTest, urlEndsWithSlash) {
    uint16_t url[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '/'};
    WordBreaker wb;
    wb.setLocale(icu::Locale::getEnglish());
    wb.setText(url, NELEM(url));
    EXPECT_EQ(0, wb.current());

    const ssize_t expectedBreaks[] = {
        5,  // after "http:"
        7,  // after "//"
        8,  // after "a"
        static_cast<ssize_t>(NELEM(url)),  // end
    };
    for (size_t step = 0; step < NELEM(expectedBreaks); step++) {
        EXPECT_EQ(expectedBreaks[step], wb.next());
        EXPECT_GE(wb.wordStart(), wb.wordEnd());
    }
}
// An email-like string that begins with a slash ("/a@b"): the first call to next() is expected
// to jump straight to the end of the text, with an empty word range reported.
TEST_F(WordBreakerTest, emailStartsWithSlash) {
    uint16_t address[] = {'/', 'a', '@', 'b'};
    WordBreaker wb;
    wb.setLocale(icu::Locale::getEnglish());
    wb.setText(address, NELEM(address));
    EXPECT_EQ(0, wb.current());

    const ssize_t endOffset = static_cast<ssize_t>(NELEM(address));
    EXPECT_EQ(endOffset, wb.next());  // end
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
}