Do not allow line breaks before currency symbols
Implement the change to line-breaking rule LB24 proposed in UTC document L2/16-043R (http://www.unicode.org/L2/L2016/16043r-line-break-pr-po.txt), so that no line break is allowed between a letter (AL or HL) and a currency symbol (PR or PO) that immediately follows it. Bug: 24959657 Change-Id: Ia29d0e5625f84870bd910d0c6e19036d17206704
This commit is contained in:
@@ -79,6 +79,18 @@ static bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) {
|
|||||||
uint32_t next_codepoint;
|
uint32_t next_codepoint;
|
||||||
size_t next_offset = i;
|
size_t next_offset = i;
|
||||||
U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
|
U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
|
||||||
|
|
||||||
|
// Proposed change to LB24 from http://www.unicode.org/L2/L2016/16043r-line-break-pr-po.txt
|
||||||
|
// (AL | HL) × (PR | PO)
|
||||||
|
int32_t lineBreak = u_getIntPropertyValue(codePoint, UCHAR_LINE_BREAK);
|
||||||
|
if (lineBreak == U_LB_ALPHABETIC || lineBreak == U_LB_HEBREW_LETTER) {
|
||||||
|
lineBreak = u_getIntPropertyValue(next_codepoint, UCHAR_LINE_BREAK);
|
||||||
|
if (lineBreak == U_LB_PREFIX_NUMERIC || lineBreak == U_LB_POSTFIX_NUMERIC) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Known emoji ZWJ sequences
|
||||||
if (codePoint == CHAR_ZWJ) {
|
if (codePoint == CHAR_ZWJ) {
|
||||||
// Possible emoji ZWJ sequence
|
// Possible emoji ZWJ sequence
|
||||||
if (next_codepoint == 0x2764 || // HEAVY BLACK HEART
|
if (next_codepoint == 0x2764 || // HEAVY BLACK HEART
|
||||||
@@ -91,6 +103,7 @@ static bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Proposed Rule LB30b from http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
|
// Proposed Rule LB30b from http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
|
||||||
// EB x EM
|
// EB x EM
|
||||||
if (isEmojiModifier(next_codepoint)) {
|
if (isEmojiModifier(next_codepoint)) {
|
||||||
|
|||||||
@@ -69,6 +69,22 @@ TEST_F(WordBreakerTest, softHyphen) {
|
|||||||
EXPECT_EQ(0, breaker.breakBadness());
|
EXPECT_EQ(0, breaker.breakBadness());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Checks that WordBreaker reports no break between letters and a currency
// symbol that directly follows them: "US¢ JP¥" must yield exactly two words,
// split only at the space.
TEST_F(WordBreakerTest, postfixAndPrefix) {
|
||||||
|
uint16_t buf[] = {'U', 'S', 0x00A2, ' ', 'J', 'P', 0x00A5}; // US¢ JP¥ (U+00A2 CENT SIGN, U+00A5 YEN SIGN)
|
||||||
|
WordBreaker breaker;
|
||||||
|
breaker.setLocale(icu::Locale::getEnglish());
|
||||||
|
breaker.setText(buf, NELEM(buf));
|
||||||
|
EXPECT_EQ(0, breaker.current());
|
||||||
|
|
||||||
|
EXPECT_EQ(4, breaker.next()); // first break is after the space that follows CENT SIGN, not between 'S' and CENT SIGN
|
||||||
|
EXPECT_EQ(0, breaker.wordStart()); // "US¢"
|
||||||
|
EXPECT_EQ(3, breaker.wordEnd()); // word ends before the space at offset 3
|
||||||
|
|
||||||
|
EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end of string; no break between 'P' and YEN SIGN
|
||||||
|
EXPECT_EQ(4, breaker.wordStart()); // "JP¥"
|
||||||
|
EXPECT_EQ((ssize_t)NELEM(buf), breaker.wordEnd());
|
||||||
|
}
|
||||||
|
|
||||||
TEST_F(WordBreakerTest, zwjEmojiSequences) {
|
TEST_F(WordBreakerTest, zwjEmojiSequences) {
|
||||||
uint16_t buf[] = {
|
uint16_t buf[] = {
|
||||||
// man + zwj + heart + zwj + man
|
// man + zwj + heart + zwj + man
|
||||||
|
|||||||
Reference in New Issue
Block a user