Do not break after Myanmar viramas
am: acaf5cc08d
* commit 'acaf5cc08defe3dfaa1e0caa945be494532cbaa0':
Do not break after Myanmar viramas
Change-Id: Idb9303889ac87853a730cdb25fba7faaaf352b93
This commit is contained in:
@@ -76,12 +76,20 @@ static bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) {
|
|||||||
if (codePoint == CHAR_SOFT_HYPHEN) {
|
if (codePoint == CHAR_SOFT_HYPHEN) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
// For Myanmar kinzi sequences, created by <consonant, ASAT, VIRAMA, consonant>. This is to work
|
||||||
|
// around a bug in ICU line breaking: http://bugs.icu-project.org/trac/ticket/12561. To avoid
|
||||||
|
// too much looking around in the strings, we simply avoid breaking after any Myanmar virama,
|
||||||
|
// where no line break could be imagined, since the Myanmar virama is a pure stacker.
|
||||||
|
if (codePoint == 0x1039) { // MYANMAR SIGN VIRAMA
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
uint32_t next_codepoint;
|
uint32_t next_codepoint;
|
||||||
size_t next_offset = i;
|
size_t next_offset = i;
|
||||||
U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
|
U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
|
||||||
|
|
||||||
// Proposed change to LB24 from http://www.unicode.org/L2/L2016/16043r-line-break-pr-po.txt
|
// Proposed change to LB24 from http://www.unicode.org/L2/L2016/16043r-line-break-pr-po.txt
|
||||||
//(AL | HL) × (PR | PO)
|
// (AL | HL) × (PR | PO)
|
||||||
int32_t lineBreak = u_getIntPropertyValue(codePoint, UCHAR_LINE_BREAK);
|
int32_t lineBreak = u_getIntPropertyValue(codePoint, UCHAR_LINE_BREAK);
|
||||||
if (lineBreak == U_LB_ALPHABETIC || lineBreak == U_LB_HEBREW_LETTER) {
|
if (lineBreak == U_LB_ALPHABETIC || lineBreak == U_LB_HEBREW_LETTER) {
|
||||||
lineBreak = u_getIntPropertyValue(next_codepoint, UCHAR_LINE_BREAK);
|
lineBreak = u_getIntPropertyValue(next_codepoint, UCHAR_LINE_BREAK);
|
||||||
|
|||||||
@@ -85,6 +85,19 @@ TEST_F(WordBreakerTest, postfixAndPrefix) {
|
|||||||
EXPECT_EQ((ssize_t)NELEM(buf), breaker.wordEnd());
|
EXPECT_EQ((ssize_t)NELEM(buf), breaker.wordEnd());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_F(WordBreakerTest, MyanmarKinzi) {
|
||||||
|
uint16_t buf[] = {0x1004, 0x103A, 0x1039, 0x1000, 0x102C}; // NGA, ASAT, VIRAMA, KA, AA (U+102C is VOWEL SIGN AA)
|
||||||
|
WordBreaker breaker;
|
||||||
|
icu::Locale burmese("my");
|
||||||
|
breaker.setLocale(burmese);
|
||||||
|
breaker.setText(buf, NELEM(buf));
|
||||||
|
EXPECT_EQ(0, breaker.current());
|
||||||
|
|
||||||
|
EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // first break is the end of the string: none reported after the VIRAMA
|
||||||
|
EXPECT_EQ(0, breaker.wordStart());
|
||||||
|
EXPECT_EQ((ssize_t)NELEM(buf), breaker.wordEnd());
|
||||||
|
}
|
||||||
|
|
||||||
TEST_F(WordBreakerTest, zwjEmojiSequences) {
|
TEST_F(WordBreakerTest, zwjEmojiSequences) {
|
||||||
uint16_t buf[] = {
|
uint16_t buf[] = {
|
||||||
// man + zwj + heart + zwj + man
|
// man + zwj + heart + zwj + man
|
||||||
|
|||||||
Reference in New Issue
Block a user