diff options
Diffstat (limited to 'external/icu/icu4c-khmerbreakengine.patch.1')
-rw-r--r-- | external/icu/icu4c-khmerbreakengine.patch.1 | 73 |
1 files changed, 37 insertions, 36 deletions
diff --git a/external/icu/icu4c-khmerbreakengine.patch.1 b/external/icu/icu4c-khmerbreakengine.patch.1 index 78cce146c2bf..605914014e96 100644 --- a/external/icu/icu4c-khmerbreakengine.patch.1 +++ b/external/icu/icu4c-khmerbreakengine.patch.1 @@ -1,7 +1,7 @@ diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp ---- icu.org/source/common/dictbe.cpp 2021-10-28 18:04:57.000000000 +0200 -+++ icu/source/common/dictbe.cpp 2021-11-15 20:39:03.710870385 +0100 -@@ -32,7 +32,19 @@ +--- icu.org/source/common/dictbe.cpp 2023-06-14 06:23:55.000000000 +0900 ++++ icu/source/common/dictbe.cpp 2023-06-26 17:43:53.034173100 +0900 +@@ -35,7 +35,19 @@ ****************************************************************** */ @@ -13,16 +13,16 @@ diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp +DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) + : fTypes(breakTypes), clusterLimit(3) { + UErrorCode status = U_ZERO_ERROR; -+ fViramaSet.applyPattern(UNICODE_STRING_SIMPLE("[[:ccc=VR:]]"), status); ++ fViramaSet.applyPattern(UnicodeString(u"[[:ccc=VR:]]"), status); + + // note Skip Sets contain fIgnoreSet characters too. -+ fSkipStartSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=OP:][:lb=QU:]\\u200C\\u200D\\u2060]"), status); -+ fSkipEndSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CP:][:lb=QU:][:lb=EX:][:lb=CL:]\\u200C\\u200D\\u2060]"), status); -+ fNBeforeSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CR:][:lb=LF:][:lb=NL:][:lb=SP:][:lb=ZW:][:lb=IS:][:lb=BA:][:lb=NS:]]"), status); ++ fSkipStartSet.applyPattern(UnicodeString(u"[[:lb=OP:][:lb=QU:]\\u200C\\u200D\\u2060]"), status); ++ fSkipEndSet.applyPattern(UnicodeString(u"[[:lb=CP:][:lb=QU:][:lb=EX:][:lb=CL:]\\u200C\\u200D\\u2060]"), status); ++ fNBeforeSet.applyPattern(UnicodeString(u"[[:lb=CR:][:lb=LF:][:lb=NL:][:lb=SP:][:lb=ZW:][:lb=IS:][:lb=BA:][:lb=NS:]]"), status); } DictionaryBreakEngine::~DictionaryBreakEngine() { -@@ -81,6 +93,169 @@ +@@ -85,6 +97,169 @@ fSet.compact(); } @@ -192,7 +192,7 @@ diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp /* ****************************************************************** * PossibleWord -@@ -110,7 +285,7 @@ +@@ -114,7 +289,7 @@ ~PossibleWord() {} // Fill the list of candidates if needed, select the longest, and return the number found @@ -201,7 +201,7 @@ diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp // Select the currently marked candidate, point after it in the text, and invalidate self int32_t acceptMarked( UText *text ); -@@ -131,12 +306,12 @@ +@@ -135,12 +310,12 @@ }; @@ -211,12 +211,12 @@ diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp int32_t start = (int32_t)utext_getNativeIndex(text); if (start != offset) { offset = start; -- count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix); -+ count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix, ignoreSet, minLength); +- count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, nullptr, &prefix); ++ count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, nullptr, &prefix, ignoreSet, minLength); // Dictionary leaves text after longest prefix, not longest word. Back up. if (count <= 0) { utext_setNativeIndex(text, start); -@@ -808,53 +983,30 @@ +@@ -814,53 +989,30 @@ * KhmerBreakEngine */ @@ -243,17 +243,17 @@ diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp { UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr"); -- fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status); +- UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]]"), status); + + clusterLimit = 3; + -+ fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]\\u2060\\u200C\\u200D]"), status); ++ UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]\\u2060\\u200C\\u200D]"), status); if (U_SUCCESS(status)) { - setCharacters(fKhmerWordSet); + setCharacters(khmerWordSet); } - fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status); + fMarkSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status); - fMarkSet.add(0x0020); -- fEndWordSet = fKhmerWordSet; +- fEndWordSet = khmerWordSet; - fBeginWordSet.add(0x1780, 0x17B3); - //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels - //fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word @@ -268,8 +268,8 @@ diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp -// fSuffixSet.add(THAI_MAIYAMOK); + fIgnoreSet.add(0x2060); // WJ + fIgnoreSet.add(0x200C, 0x200D); // ZWJ, ZWNJ -+ fBaseSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:lb=SA:]&[:^M:]]"), status); -+ fPuncSet.applyPattern(UNICODE_STRING_SIMPLE("[\\u17D4\\u17D5\\u17D6\\u17D7\\u17D9:]"), status); ++ fBaseSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:lb=SA:]&[:^M:]]"), status); ++ fPuncSet.applyPattern(UnicodeString(u"[\\u17D4\\u17D5\\u17D6\\u17D7\\u17D9:]"), status); // Compact for caching. fMarkSet.compact(); @@ -282,8 +282,8 @@ diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp UTRACE_EXIT_STATUS(status); } -@@ -869,175 +1021,204 @@ - UVector32 &foundBreaks, +@@ -876,175 +1028,205 @@ + UBool /* isPhraseBreaking */, UErrorCode& status ) const { if (U_FAILURE(status)) return 0; - if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) { @@ -304,7 +304,7 @@ diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp + --scanStart; + startZwsp = scanBeforeStart(text, scanStart, breakStart); } -- + - uint32_t wordsFound = 0; - int32_t cpWordLength = 0; - int32_t cuWordLength = 0; @@ -633,9 +633,9 @@ diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp #if !UCONFIG_NO_NORMALIZATION diff -ur icu.org/source/common/dictbe.h icu/source/common/dictbe.h ---- icu.org/source/common/dictbe.h 2021-10-28 18:04:57.000000000 +0200 -+++ icu/source/common/dictbe.h 2021-11-15 20:41:53.052317579 +0100 -@@ -34,7 +34,8 @@ +--- icu.org/source/common/dictbe.h 2022-04-08 00:41:55.000000000 +0200 ++++ icu/source/common/dictbe.h 2022-05-16 13:49:33.820459894 +0200 +@@ -35,7 +35,8 @@ * threads without synchronization.</p> */ class DictionaryBreakEngine : public LanguageBreakEngine { @@ -645,7 +645,7 @@ diff -ur icu.org/source/common/dictbe.h icu/source/common/dictbe.h /** * The set of characters handled by this engine * @internal -@@ -42,14 +43,84 @@ +@@ -43,14 +44,84 @@ UnicodeSet fSet; @@ -731,10 +731,10 @@ diff -ur icu.org/source/common/dictbe.h icu/source/common/dictbe.h * <p>Virtual destructor.</p> */ virtual ~DictionaryBreakEngine(); -@@ -303,10 +374,12 @@ +@@ -305,10 +376,12 @@ + * @internal */ - UnicodeSet fKhmerWordSet; - UnicodeSet fEndWordSet; UnicodeSet fBeginWordSet; - UnicodeSet fMarkSet; @@ -748,8 +748,8 @@ diff -ur icu.org/source/common/dictbe.h icu/source/common/dictbe.h public: diff -ur icu.org/source/common/dictionarydata.cpp icu/source/common/dictionarydata.cpp ---- icu.org/source/common/dictionarydata.cpp 2021-10-28 18:04:57.000000000 +0200 -+++ icu/source/common/dictionarydata.cpp 2021-11-15 19:25:00.583694898 +0100 +--- icu.org/source/common/dictionarydata.cpp 2023-06-14 06:23:55.000000000 +0900 ++++ icu/source/common/dictionarydata.cpp 2023-06-26 02:18:05.709454400 +0900 @@ -44,7 +44,7 @@ int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, @@ -771,7 +771,7 @@ diff -ur icu.org/source/common/dictionarydata.cpp icu/source/common/dictionaryda + continue; + } if (wordCount < limit) { - if (values != NULL) { + if (values != nullptr) { values[wordCount] = uct.getValue(); @@ -112,7 +118,7 @@ @@ -794,11 +794,12 @@ diff -ur icu.org/source/common/dictionarydata.cpp icu/source/common/dictionaryda + continue; + } if (wordCount < limit) { - if (values != NULL) { + if (values != nullptr) { values[wordCount] = bt.getValue(); + diff -ur icu.org/source/common/dictionarydata.h icu/source/common/dictionarydata.h ---- icu.org/source/common/dictionarydata.h 2021-10-28 18:04:57.000000000 +0200 -+++ icu/source/common/dictionarydata.h 2021-11-15 20:44:34.484790590 +0100 +--- icu.org/source/common/dictionarydata.h 2023-06-14 06:23:55.000000000 +0900 ++++ icu/source/common/dictionarydata.h 2023-06-26 17:43:53.097724900 +0900 @@ -21,6 +21,7 @@ #include "unicode/utext.h" #include "unicode/udata.h" @@ -824,7 +825,7 @@ diff -ur icu.org/source/common/dictionarydata.h icu/source/common/dictionarydata + int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const override; virtual int32_t getType() const override; private: - const UChar *characters; + const char16_t *characters; @@ -125,7 +126,7 @@ virtual ~BytesDictionaryMatcher(); virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit, |