summary refs log tree commit diff stats
path: root/i18nutil/source/utility/unicode.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'i18nutil/source/utility/unicode.cxx')
-rw-r--r-- i18nutil/source/utility/unicode.cxx 772
1 files changed, 500 insertions, 272 deletions
diff --git a/i18nutil/source/utility/unicode.cxx b/i18nutil/source/utility/unicode.cxx
index be34ea58f44a..e98afeeff3b8 100644
--- a/i18nutil/source/utility/unicode.cxx
+++ b/i18nutil/source/utility/unicode.cxx
@@ -24,8 +24,10 @@
#include <i18nutil/unicode.hxx>
#include <sal/log.hxx>
#include <unicode/numfmt.h>
+#include <unicode/uchar.h>
#include "unicode_data.h"
#include <rtl/character.hxx>
+#include <o3tl/string_view.hxx>
#include <memory>
// Workaround for glibc braindamage:
@@ -65,18 +67,108 @@ unicode::getUnicodeScriptEnd( UnicodeScript type) {
}
+/** Return the css::i18n::UnicodeType constant for code point ch.
+ *
+ *  The general category is obtained from ICU's u_charType() and translated
+ *  case by case into the UNO constant of the same name.
+ *
+ *  No result is cached between calls. The earlier one-entry cache lived in
+ *  two unsynchronised function-local statics that started out as
+ *  (c = 0, r = 0): a first query for U+0000 was answered from the cache
+ *  without ever being looked up, a category missing from the switch
+ *  returned the previous character's result, and concurrent callers could
+ *  pair one thread's character with another thread's result.
+ *
+ *  @param ch  code point to classify
+ *  @return    the matching css::i18n::UnicodeType value; UNASSIGNED for any
+ *             category that is not listed below
+ */
sal_Int16
-unicode::getUnicodeType( const sal_Unicode ch ) {
- static sal_Unicode c = 0x00;
- static sal_Int16 r = 0x00;
- if (ch == c) return r;
- else c = ch;
- sal_Int16 address = UnicodeTypeIndex[ch >> 8];
- r = static_cast<sal_Int16>(
- (address < UnicodeTypeNumberBlock)
- ? UnicodeTypeBlockValue[address]
- : UnicodeTypeValue[((address - UnicodeTypeNumberBlock) << 8) + (ch & 0xff)]);
- return r;
+unicode::getUnicodeType(const sal_uInt32 ch)
+{
+    switch (u_charType(ch))
+    {
+        case U_UNASSIGNED:
+            return css::i18n::UnicodeType::UNASSIGNED;
+        case U_UPPERCASE_LETTER:
+            return css::i18n::UnicodeType::UPPERCASE_LETTER;
+        case U_LOWERCASE_LETTER:
+            return css::i18n::UnicodeType::LOWERCASE_LETTER;
+        case U_TITLECASE_LETTER:
+            return css::i18n::UnicodeType::TITLECASE_LETTER;
+        case U_MODIFIER_LETTER:
+            return css::i18n::UnicodeType::MODIFIER_LETTER;
+        case U_OTHER_LETTER:
+            return css::i18n::UnicodeType::OTHER_LETTER;
+        case U_NON_SPACING_MARK:
+            return css::i18n::UnicodeType::NON_SPACING_MARK;
+        case U_ENCLOSING_MARK:
+            return css::i18n::UnicodeType::ENCLOSING_MARK;
+        case U_COMBINING_SPACING_MARK:
+            return css::i18n::UnicodeType::COMBINING_SPACING_MARK;
+        case U_DECIMAL_DIGIT_NUMBER:
+            return css::i18n::UnicodeType::DECIMAL_DIGIT_NUMBER;
+        case U_LETTER_NUMBER:
+            return css::i18n::UnicodeType::LETTER_NUMBER;
+        case U_OTHER_NUMBER:
+            return css::i18n::UnicodeType::OTHER_NUMBER;
+        case U_SPACE_SEPARATOR:
+            return css::i18n::UnicodeType::SPACE_SEPARATOR;
+        case U_LINE_SEPARATOR:
+            return css::i18n::UnicodeType::LINE_SEPARATOR;
+        case U_PARAGRAPH_SEPARATOR:
+            return css::i18n::UnicodeType::PARAGRAPH_SEPARATOR;
+        case U_CONTROL_CHAR:
+            return css::i18n::UnicodeType::CONTROL;
+        case U_FORMAT_CHAR:
+            return css::i18n::UnicodeType::FORMAT;
+        case U_PRIVATE_USE_CHAR:
+            return css::i18n::UnicodeType::PRIVATE_USE;
+        case U_SURROGATE:
+            return css::i18n::UnicodeType::SURROGATE;
+        case U_DASH_PUNCTUATION:
+            return css::i18n::UnicodeType::DASH_PUNCTUATION;
+        case U_INITIAL_PUNCTUATION:
+            return css::i18n::UnicodeType::INITIAL_PUNCTUATION;
+        case U_FINAL_PUNCTUATION:
+            return css::i18n::UnicodeType::FINAL_PUNCTUATION;
+        case U_CONNECTOR_PUNCTUATION:
+            return css::i18n::UnicodeType::CONNECTOR_PUNCTUATION;
+        case U_OTHER_PUNCTUATION:
+            return css::i18n::UnicodeType::OTHER_PUNCTUATION;
+        case U_MATH_SYMBOL:
+            return css::i18n::UnicodeType::MATH_SYMBOL;
+        case U_CURRENCY_SYMBOL:
+            return css::i18n::UnicodeType::CURRENCY_SYMBOL;
+        case U_MODIFIER_SYMBOL:
+            return css::i18n::UnicodeType::MODIFIER_SYMBOL;
+        case U_OTHER_SYMBOL:
+            return css::i18n::UnicodeType::OTHER_SYMBOL;
+        case U_START_PUNCTUATION:
+            return css::i18n::UnicodeType::START_PUNCTUATION;
+        case U_END_PUNCTUATION:
+            return css::i18n::UnicodeType::END_PUNCTUATION;
+    }
+
+    // A category this switch does not know is reported as unassigned rather
+    // than as whatever the previous call happened to compute.
+    return css::i18n::UnicodeType::UNASSIGNED;
}
@@ -95,6 +187,11 @@ unicode::getUnicodeDirection( const sal_Unicode ch ) {
return r;
}
+sal_uInt32 unicode::GetMirroredChar(sal_uInt32 nChar) {
+ nChar = u_charMirror(nChar);
+ return nChar;
+}
+
#define bit(name) (1U << name)
#define UPPERMASK bit(UnicodeType::UPPERCASE_LETTER)
@@ -117,7 +214,7 @@ unicode::getUnicodeDirection( const sal_Unicode ch ) {
bit(UnicodeType::PARAGRAPH_SEPARATOR)
#define IsType(func, mask) \
-bool func( const sal_Unicode ch) {\
+bool func( const sal_uInt32 ch) {\
return (bit(getUnicodeType(ch)) & (mask)) != 0;\
}
@@ -128,65 +225,207 @@ IsType(unicode::isSpace, SPACEMASK)
#define CONTROLSPACE bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f)
-bool unicode::isWhiteSpace( const sal_Unicode ch) {
+bool unicode::isWhiteSpace(const sal_uInt32 ch)
+{
return (ch != 0xa0 && isSpace(ch)) || (ch <= 0x1F && (bit(ch) & (CONTROLSPACE)));
}
sal_Int16 unicode::getScriptClassFromUScriptCode(UScriptCode eScript)
{
//See unicode/uscript.h
- static const sal_Int16 scriptTypes[] =
+ sal_Int16 nRet;
+ switch (eScript)
{
- ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX,
- ScriptType::ASIAN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX,
- ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN,
- // 15
- ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN, ScriptType::COMPLEX,
- ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX,
- ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN,
- // 30
- ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX,
- ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
- ScriptType::LATIN, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
- // 45
- ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX,
- ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN,
- ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
- // 60
- ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
- ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX,
- ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN,
- // 75
- ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
- ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
- ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
- // 90
- ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
- ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
- ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX,
- // 105
- ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
- ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
- ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN,
- // 120
- ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
- ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK,
- ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
- // 135
- ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
- ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
- ScriptType::COMPLEX,
- ScriptType::WEAK
- };
+ case USCRIPT_INVALID_CODE:
+ case USCRIPT_COMMON:
+ case USCRIPT_INHERITED:
+ case USCRIPT_UNWRITTEN_LANGUAGES:
+ case USCRIPT_UNKNOWN:
+ case USCRIPT_MATHEMATICAL_NOTATION:
+ case USCRIPT_SYMBOLS:
+ case USCRIPT_CODE_LIMIT:
+ nRet = ScriptType::WEAK;
+ break;
+ case USCRIPT_ARMENIAN:
+ case USCRIPT_CHEROKEE:
+ case USCRIPT_COPTIC:
+ case USCRIPT_CYRILLIC:
+ case USCRIPT_GEORGIAN:
+ case USCRIPT_GOTHIC:
+ case USCRIPT_GREEK:
+ case USCRIPT_LATIN:
+ case USCRIPT_OGHAM:
+ case USCRIPT_OLD_ITALIC:
+ case USCRIPT_RUNIC:
+ case USCRIPT_CANADIAN_ABORIGINAL:
+ case USCRIPT_BRAILLE:
+ case USCRIPT_CYPRIOT:
+ case USCRIPT_OSMANYA:
+ case USCRIPT_SHAVIAN:
+ case USCRIPT_KATAKANA_OR_HIRAGANA:
+ case USCRIPT_GLAGOLITIC:
+ case USCRIPT_CIRTH:
+ case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC:
+ case USCRIPT_OLD_HUNGARIAN:
+ case USCRIPT_LATIN_FRAKTUR:
+ case USCRIPT_LATIN_GAELIC:
+ nRet = ScriptType::LATIN;
+ break;
+ case USCRIPT_BOPOMOFO:
+ case USCRIPT_HAN:
+ case USCRIPT_HANGUL:
+ case USCRIPT_HIRAGANA:
+ case USCRIPT_KATAKANA:
+ case USCRIPT_YI:
+ case USCRIPT_SIMPLIFIED_HAN:
+ case USCRIPT_TRADITIONAL_HAN:
+ case USCRIPT_JAPANESE:
+ case USCRIPT_KOREAN:
+ case USCRIPT_TANGUT:
+ case USCRIPT_KHITAN_SMALL_SCRIPT:
+ nRet = ScriptType::ASIAN;
+ break;
+ case USCRIPT_ARABIC:
+ case USCRIPT_BENGALI:
+ case USCRIPT_DESERET:
+ case USCRIPT_DEVANAGARI:
+ case USCRIPT_ETHIOPIC:
+ case USCRIPT_GUJARATI:
+ case USCRIPT_GURMUKHI:
+ case USCRIPT_HEBREW:
+ case USCRIPT_KANNADA:
+ case USCRIPT_KHMER:
+ case USCRIPT_LAO:
+ case USCRIPT_MALAYALAM:
+ case USCRIPT_MONGOLIAN:
+ case USCRIPT_MYANMAR:
+ case USCRIPT_ORIYA:
+ case USCRIPT_SINHALA:
+ case USCRIPT_SYRIAC:
+ case USCRIPT_TAMIL:
+ case USCRIPT_TELUGU:
+ case USCRIPT_THAANA:
+ case USCRIPT_THAI:
+ case USCRIPT_TIBETAN:
+ case USCRIPT_TAGALOG:
+ case USCRIPT_HANUNOO:
+ case USCRIPT_BUHID:
+ case USCRIPT_TAGBANWA:
+ case USCRIPT_LIMBU:
+ case USCRIPT_LINEAR_B:
+ case USCRIPT_TAI_LE:
+ case USCRIPT_UGARITIC:
+ case USCRIPT_BUGINESE:
+ case USCRIPT_KHAROSHTHI:
+ case USCRIPT_SYLOTI_NAGRI:
+ case USCRIPT_NEW_TAI_LUE:
+ case USCRIPT_TIFINAGH:
+ case USCRIPT_OLD_PERSIAN:
+ case USCRIPT_BALINESE:
+ case USCRIPT_BATAK:
+ case USCRIPT_BLISSYMBOLS:
+ case USCRIPT_BRAHMI:
+ case USCRIPT_CHAM:
+ case USCRIPT_DEMOTIC_EGYPTIAN:
+ case USCRIPT_HIERATIC_EGYPTIAN:
+ case USCRIPT_EGYPTIAN_HIEROGLYPHS:
+ case USCRIPT_KHUTSURI:
+ case USCRIPT_PAHAWH_HMONG:
+ case USCRIPT_HARAPPAN_INDUS:
+ case USCRIPT_JAVANESE:
+ case USCRIPT_KAYAH_LI:
+ case USCRIPT_LEPCHA:
+ case USCRIPT_LINEAR_A:
+ case USCRIPT_MANDAEAN:
+ case USCRIPT_MAYAN_HIEROGLYPHS:
+ case USCRIPT_MEROITIC:
+ case USCRIPT_NKO:
+ case USCRIPT_ORKHON:
+ case USCRIPT_OLD_PERMIC:
+ case USCRIPT_PHAGS_PA:
+ case USCRIPT_PHOENICIAN:
+ case USCRIPT_PHONETIC_POLLARD:
+ case USCRIPT_RONGORONGO:
+ case USCRIPT_SARATI:
+ case USCRIPT_ESTRANGELO_SYRIAC:
+ case USCRIPT_WESTERN_SYRIAC:
+ case USCRIPT_EASTERN_SYRIAC:
+ case USCRIPT_TENGWAR:
+ case USCRIPT_VAI:
+ case USCRIPT_VISIBLE_SPEECH:
+ case USCRIPT_CUNEIFORM:
+ case USCRIPT_CARIAN:
+ case USCRIPT_LANNA:
+ case USCRIPT_LYCIAN:
+ case USCRIPT_LYDIAN:
+ case USCRIPT_OL_CHIKI:
+ case USCRIPT_REJANG:
+ case USCRIPT_SAURASHTRA:
+ case USCRIPT_SIGN_WRITING:
+ case USCRIPT_SUNDANESE:
+ case USCRIPT_MOON:
+ case USCRIPT_MEITEI_MAYEK:
+ case USCRIPT_IMPERIAL_ARAMAIC:
+ case USCRIPT_AVESTAN:
+ case USCRIPT_CHAKMA:
+ case USCRIPT_KAITHI:
+ case USCRIPT_MANICHAEAN:
+ case USCRIPT_INSCRIPTIONAL_PAHLAVI:
+ case USCRIPT_PSALTER_PAHLAVI:
+ case USCRIPT_BOOK_PAHLAVI:
+ case USCRIPT_INSCRIPTIONAL_PARTHIAN:
+ case USCRIPT_SAMARITAN:
+ case USCRIPT_TAI_VIET:
+ case USCRIPT_BAMUM:
+ case USCRIPT_LISU:
+ case USCRIPT_NAKHI_GEBA:
+ case USCRIPT_OLD_SOUTH_ARABIAN:
+ case USCRIPT_BASSA_VAH:
+ case USCRIPT_DUPLOYAN_SHORTAND:
+ case USCRIPT_ELBASAN:
+ case USCRIPT_GRANTHA:
+ case USCRIPT_KPELLE:
+ case USCRIPT_LOMA:
+ case USCRIPT_MENDE:
+ case USCRIPT_MEROITIC_CURSIVE:
+ case USCRIPT_OLD_NORTH_ARABIAN:
+ case USCRIPT_NABATAEAN:
+ case USCRIPT_PALMYRENE:
+ case USCRIPT_SINDHI:
+ case USCRIPT_WARANG_CITI:
+ default: // anything new is going to be pretty wild
+ nRet = ScriptType::COMPLEX;
+ break;
+ }
+ return nRet;
+}
- sal_Int16 nRet;
- if (eScript < USCRIPT_COMMON)
- nRet = ScriptType::WEAK;
- else if (static_cast<size_t>(eScript) >= SAL_N_ELEMENTS(scriptTypes))
- nRet = ScriptType::COMPLEX; // anything new is going to be pretty wild
+sal_Int16 unicode::getScriptClassFromLanguageTag( const LanguageTag& rLanguageTag )
+{
+ constexpr int32_t nBuf = 42;
+ UScriptCode aBuf[nBuf];
+ if (rLanguageTag.hasScript())
+ {
+ aBuf[0] = static_cast<UScriptCode>(u_getPropertyValueEnum( UCHAR_SCRIPT,
+ OUStringToOString( rLanguageTag.getScript(), RTL_TEXTENCODING_ASCII_US).getStr()));
+ }
else
- nRet = scriptTypes[eScript];
- return nRet;
+ {
+ OUString aName;
+ if (rLanguageTag.getCountry().isEmpty())
+ aName = rLanguageTag.getLanguage();
+ else
+ aName = rLanguageTag.getLanguage() + "-" + rLanguageTag.getCountry();
+ UErrorCode status = U_ZERO_ERROR;
+ const int32_t nScripts = uscript_getCode(
+ OUStringToOString( aName, RTL_TEXTENCODING_ASCII_US).getStr(),
+ aBuf, nBuf, &status);
+ // U_BUFFER_OVERFLOW_ERROR would be set with too many scripts for buffer
+ // and required capacity returned, but really..
+ if (nScripts == 0 || !U_SUCCESS(status))
+ return css::i18n::ScriptType::LATIN;
+ }
+ return getScriptClassFromUScriptCode( aBuf[0]);
}
OString unicode::getExemplarLanguageForUScriptCode(UScriptCode eScript)
@@ -196,604 +435,594 @@ OString unicode::getExemplarLanguageForUScriptCode(UScriptCode eScript)
{
case USCRIPT_CODE_LIMIT:
case USCRIPT_INVALID_CODE:
- sRet = "zxx";
+ sRet = "zxx"_ostr;
break;
case USCRIPT_COMMON:
case USCRIPT_INHERITED:
- sRet = "und";
+ sRet = "und"_ostr;
break;
case USCRIPT_MATHEMATICAL_NOTATION:
case USCRIPT_SYMBOLS:
- sRet = "zxx";
+ sRet = "zxx"_ostr;
break;
case USCRIPT_UNWRITTEN_LANGUAGES:
case USCRIPT_UNKNOWN:
- sRet = "und";
+ sRet = "und"_ostr;
break;
case USCRIPT_ARABIC:
- sRet = "ar";
+ sRet = "ar"_ostr;
break;
case USCRIPT_ARMENIAN:
- sRet = "hy";
+ sRet = "hy"_ostr;
break;
case USCRIPT_BENGALI:
- sRet = "bn";
+ sRet = "bn"_ostr;
break;
case USCRIPT_BOPOMOFO:
- sRet = "zh";
+ sRet = "zh"_ostr;
break;
case USCRIPT_CHEROKEE:
- sRet = "chr";
+ sRet = "chr"_ostr;
break;
case USCRIPT_COPTIC:
- sRet = "cop";
+ sRet = "cop"_ostr;
break;
case USCRIPT_CYRILLIC:
- sRet = "ru";
+ sRet = "ru"_ostr;
break;
case USCRIPT_DESERET:
- sRet = "en";
+ sRet = "en"_ostr;
break;
case USCRIPT_DEVANAGARI:
- sRet = "hi";
+ sRet = "hi"_ostr;
break;
case USCRIPT_ETHIOPIC:
- sRet = "am";
+ sRet = "am"_ostr;
break;
case USCRIPT_GEORGIAN:
- sRet = "ka";
+ sRet = "ka"_ostr;
break;
case USCRIPT_GOTHIC:
- sRet = "got";
+ sRet = "got"_ostr;
break;
case USCRIPT_GREEK:
- sRet = "el";
+ sRet = "el"_ostr;
break;
case USCRIPT_GUJARATI:
- sRet = "gu";
+ sRet = "gu"_ostr;
break;
case USCRIPT_GURMUKHI:
- sRet = "pa";
+ sRet = "pa"_ostr;
break;
case USCRIPT_HAN:
- sRet = "zh";
+ sRet = "zh"_ostr;
break;
case USCRIPT_HANGUL:
- sRet = "ko";
+ sRet = "ko"_ostr;
break;
case USCRIPT_HEBREW:
- sRet = "hr";
+ // "hr" is the ISO 639-1 code for Croatian; the code for Hebrew is "he".
+ sRet = "he"_ostr;
break;
case USCRIPT_HIRAGANA:
- sRet = "ja";
+ sRet = "ja"_ostr;
break;
case USCRIPT_KANNADA:
- sRet = "kn";
+ sRet = "kn"_ostr;
break;
case USCRIPT_KATAKANA:
- sRet = "ja";
+ sRet = "ja"_ostr;
break;
case USCRIPT_KHMER:
- sRet = "km";
+ sRet = "km"_ostr;
break;
case USCRIPT_LAO:
- sRet = "lo";
+ sRet = "lo"_ostr;
break;
case USCRIPT_LATIN:
- sRet = "en";
+ sRet = "en"_ostr;
break;
case USCRIPT_MALAYALAM:
- sRet = "ml";
+ sRet = "ml"_ostr;
break;
case USCRIPT_MONGOLIAN:
- sRet = "mn";
+ sRet = "mn"_ostr;
break;
case USCRIPT_MYANMAR:
- sRet = "my";
+ sRet = "my"_ostr;
break;
case USCRIPT_OGHAM:
- sRet = "pgl";
+ sRet = "pgl"_ostr;
break;
case USCRIPT_OLD_ITALIC:
- sRet = "osc";
+ sRet = "osc"_ostr;
break;
case USCRIPT_ORIYA:
- sRet = "or";
+ sRet = "or"_ostr;
break;
case USCRIPT_RUNIC:
- sRet = "ang";
+ sRet = "ang"_ostr;
break;
case USCRIPT_SINHALA:
- sRet = "si";
+ sRet = "si"_ostr;
break;
case USCRIPT_SYRIAC:
- sRet = "syr";
+ sRet = "syr"_ostr;
break;
case USCRIPT_TAMIL:
- sRet = "ta";
+ sRet = "ta"_ostr;
break;
case USCRIPT_TELUGU:
- sRet = "te";
+ sRet = "te"_ostr;
break;
case USCRIPT_THAANA:
- sRet = "dv";
+ sRet = "dv"_ostr;
break;
case USCRIPT_THAI:
- sRet = "th";
+ sRet = "th"_ostr;
break;
case USCRIPT_TIBETAN:
- sRet = "bo";
+ sRet = "bo"_ostr;
break;
case USCRIPT_CANADIAN_ABORIGINAL:
- sRet = "iu";
+ sRet = "iu"_ostr;
break;
case USCRIPT_YI:
- sRet = "ii";
+ sRet = "ii"_ostr;
break;
case USCRIPT_TAGALOG:
- sRet = "tl";
+ sRet = "tl"_ostr;
break;
case USCRIPT_HANUNOO:
- sRet = "hnn";
+ sRet = "hnn"_ostr;
break;
case USCRIPT_BUHID:
- sRet = "bku";
+ sRet = "bku"_ostr;
break;
case USCRIPT_TAGBANWA:
- sRet = "tbw";
+ sRet = "tbw"_ostr;
break;
case USCRIPT_BRAILLE:
- sRet = "en";
+ sRet = "en"_ostr;
break;
case USCRIPT_CYPRIOT:
- sRet = "ecy";
+ sRet = "ecy"_ostr;
break;
case USCRIPT_LIMBU:
- sRet = "lif";
+ sRet = "lif"_ostr;
break;
case USCRIPT_LINEAR_B:
- sRet = "gmy";
+ sRet = "gmy"_ostr;
break;
case USCRIPT_OSMANYA:
- sRet = "so";
+ sRet = "so"_ostr;
break;
case USCRIPT_SHAVIAN:
- sRet = "en";
+ sRet = "en"_ostr;
break;
case USCRIPT_TAI_LE:
- sRet = "tdd";
+ sRet = "tdd"_ostr;
break;
case USCRIPT_UGARITIC:
- sRet = "uga";
+ sRet = "uga"_ostr;
break;
case USCRIPT_KATAKANA_OR_HIRAGANA:
- sRet = "ja";
+ sRet = "ja"_ostr;
break;
case USCRIPT_BUGINESE:
- sRet = "bug";
+ sRet = "bug"_ostr;
break;
case USCRIPT_GLAGOLITIC:
- sRet = "ch";
+ // "ch" is Chamorro. Glagolitic was used to write Church Slavic ("cu"),
+ // the exemplar also returned for USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC.
+ sRet = "cu"_ostr;
break;
case USCRIPT_KHAROSHTHI:
- sRet = "pra";
+ sRet = "pra"_ostr;
break;
case USCRIPT_SYLOTI_NAGRI:
- sRet = "syl";
+ sRet = "syl"_ostr;
break;
case USCRIPT_NEW_TAI_LUE:
- sRet = "khb";
+ sRet = "khb"_ostr;
break;
case USCRIPT_TIFINAGH:
- sRet = "tmh";
+ sRet = "tmh"_ostr;
break;
case USCRIPT_OLD_PERSIAN:
- sRet = "peo";
+ sRet = "peo"_ostr;
break;
case USCRIPT_BALINESE:
- sRet = "ban";
+ sRet = "ban"_ostr;
break;
case USCRIPT_BATAK:
- sRet = "btk";
+ sRet = "btk"_ostr;
break;
case USCRIPT_BLISSYMBOLS:
- sRet = "en";
+ sRet = "en"_ostr;
break;
case USCRIPT_BRAHMI:
- sRet = "pra";
+ sRet = "pra"_ostr;
break;
case USCRIPT_CHAM:
- sRet = "cja";
+ sRet = "cja"_ostr;
break;
case USCRIPT_CIRTH:
- sRet = "sjn";
+ sRet = "sjn"_ostr;
break;
case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC:
- sRet = "cu";
+ sRet = "cu"_ostr;
break;
case USCRIPT_DEMOTIC_EGYPTIAN:
case USCRIPT_HIERATIC_EGYPTIAN:
case USCRIPT_EGYPTIAN_HIEROGLYPHS:
- sRet = "egy";
+ sRet = "egy"_ostr;
break;
case USCRIPT_KHUTSURI:
- sRet = "ka";
+ sRet = "ka"_ostr;
break;
case USCRIPT_SIMPLIFIED_HAN:
- sRet = "zh";
+ sRet = "zh"_ostr;
break;
case USCRIPT_TRADITIONAL_HAN:
- sRet = "zh";
+ sRet = "zh"_ostr;
break;
case USCRIPT_PAHAWH_HMONG:
- sRet = "blu";
+ sRet = "blu"_ostr;
break;
case USCRIPT_OLD_HUNGARIAN:
- sRet = "ohu";
+ sRet = "ohu"_ostr;
break;
case USCRIPT_HARAPPAN_INDUS:
- sRet = "xiv";
+ sRet = "xiv"_ostr;
break;
case USCRIPT_JAVANESE:
- sRet = "kaw";
+ sRet = "kaw"_ostr;
break;
case USCRIPT_KAYAH_LI:
- sRet = "eky";
+ sRet = "eky"_ostr;
break;
case USCRIPT_LATIN_FRAKTUR:
- sRet = "de";
+ sRet = "de"_ostr;
break;
case USCRIPT_LATIN_GAELIC:
- sRet = "ga";
+ sRet = "ga"_ostr;
break;
case USCRIPT_LEPCHA:
- sRet = "lep";
+ sRet = "lep"_ostr;
break;
case USCRIPT_LINEAR_A:
- sRet = "ecr";
+ sRet = "ecr"_ostr;
break;
case USCRIPT_MAYAN_HIEROGLYPHS:
- sRet = "myn";
+ sRet = "myn"_ostr;
break;
case USCRIPT_MEROITIC:
- sRet = "xmr";
+ sRet = "xmr"_ostr;
break;
case USCRIPT_NKO:
- sRet = "nqo";
+ sRet = "nqo"_ostr;
break;
case USCRIPT_ORKHON:
- sRet = "otk";
+ sRet = "otk"_ostr;
break;
case USCRIPT_OLD_PERMIC:
- sRet = "kv";
+ sRet = "kv"_ostr;
break;
case USCRIPT_PHAGS_PA:
- sRet = "xng";
+ sRet = "xng"_ostr;
break;
case USCRIPT_PHOENICIAN:
- sRet = "phn";
+ sRet = "phn"_ostr;
break;
case USCRIPT_PHONETIC_POLLARD:
- sRet = "hmd";
+ sRet = "hmd"_ostr;
break;
case USCRIPT_RONGORONGO:
- sRet = "rap";
+ sRet = "rap"_ostr;
break;
case USCRIPT_SARATI:
- sRet = "qya";
+ sRet = "qya"_ostr;
break;
case USCRIPT_ESTRANGELO_SYRIAC:
- sRet = "syr";
+ sRet = "syr"_ostr;
break;
case USCRIPT_WESTERN_SYRIAC:
- sRet = "tru";
+ sRet = "tru"_ostr;
break;
case USCRIPT_EASTERN_SYRIAC:
- sRet = "aii";
+ sRet = "aii"_ostr;
break;
case USCRIPT_TENGWAR:
- sRet = "sjn";
+ sRet = "sjn"_ostr;
break;
case USCRIPT_VAI:
- sRet = "vai";
+ sRet = "vai"_ostr;
break;
case USCRIPT_VISIBLE_SPEECH:
- sRet = "en";
+ sRet = "en"_ostr;
break;
case USCRIPT_CUNEIFORM:
- sRet = "akk";
+ sRet = "akk"_ostr;
break;
case USCRIPT_CARIAN:
- sRet = "xcr";
+ sRet = "xcr"_ostr;
break;
case USCRIPT_JAPANESE:
- sRet = "ja";
+ sRet = "ja"_ostr;
break;
case USCRIPT_LANNA:
- sRet = "nod";
+ sRet = "nod"_ostr;
break;
case USCRIPT_LYCIAN:
- sRet = "xlc";
+ sRet = "xlc"_ostr;
break;
case USCRIPT_LYDIAN:
- sRet = "xld";
+ sRet = "xld"_ostr;
break;
case USCRIPT_OL_CHIKI:
- sRet = "sat";
+ sRet = "sat"_ostr;
break;
case USCRIPT_REJANG:
- sRet = "rej";
+ sRet = "rej"_ostr;
break;
case USCRIPT_SAURASHTRA:
- sRet = "saz";
+ sRet = "saz"_ostr;
break;
case USCRIPT_SIGN_WRITING:
- sRet = "en";
+ sRet = "en"_ostr;
break;
case USCRIPT_SUNDANESE:
- sRet = "su";
+ sRet = "su"_ostr;
break;
case USCRIPT_MOON:
- sRet = "en";
+ sRet = "en"_ostr;
break;
case USCRIPT_MEITEI_MAYEK:
- sRet = "mni";
+ sRet = "mni"_ostr;
break;
case USCRIPT_IMPERIAL_ARAMAIC:
- sRet = "arc";
+ sRet = "arc"_ostr;
break;
case USCRIPT_AVESTAN:
- sRet = "ae";
+ sRet = "ae"_ostr;
break;
case USCRIPT_CHAKMA:
- sRet = "ccp";
+ sRet = "ccp"_ostr;
break;
case USCRIPT_KOREAN:
- sRet = "ko";
+ sRet = "ko"_ostr;
break;
case USCRIPT_KAITHI:
- sRet = "awa";
+ sRet = "awa"_ostr;
break;
case USCRIPT_MANICHAEAN:
- sRet = "xmn";
+ sRet = "xmn"_ostr;
break;
case USCRIPT_INSCRIPTIONAL_PAHLAVI:
case USCRIPT_PSALTER_PAHLAVI:
case USCRIPT_BOOK_PAHLAVI:
case USCRIPT_INSCRIPTIONAL_PARTHIAN:
- sRet = "xpr";
+ sRet = "xpr"_ostr;
break;
case USCRIPT_SAMARITAN:
- sRet = "heb";
+ sRet = "heb"_ostr;
break;
case USCRIPT_TAI_VIET:
- sRet = "blt";
+ sRet = "blt"_ostr;
break;
case USCRIPT_MANDAEAN: /* Aliased to USCRIPT_MANDAIC in icu 4.6. */
- sRet = "mic";
+ sRet = "mic"_ostr;
break;
case USCRIPT_NABATAEAN:
- sRet = "mis-Nbat"; // Uncoded with script
+ sRet = "mis-Nbat"_ostr; // Uncoded with script
break;
case USCRIPT_PALMYRENE:
- sRet = "mis-Palm"; // Uncoded with script
+ sRet = "mis-Palm"_ostr; // Uncoded with script
break;
case USCRIPT_BAMUM:
- sRet = "bax";
+ sRet = "bax"_ostr;
break;
case USCRIPT_LISU:
- sRet = "lis";
+ sRet = "lis"_ostr;
break;
case USCRIPT_NAKHI_GEBA:
- sRet = "nxq";
+ sRet = "nxq"_ostr;
break;
case USCRIPT_OLD_SOUTH_ARABIAN:
- sRet = "xsa";
+ sRet = "xsa"_ostr;
break;
case USCRIPT_BASSA_VAH:
- sRet = "bsq";
+ sRet = "bsq"_ostr;
break;
case USCRIPT_DUPLOYAN_SHORTAND:
- sRet = "fr";
+ sRet = "fr"_ostr;
break;
case USCRIPT_ELBASAN:
- sRet = "sq";
+ sRet = "sq"_ostr;
break;
case USCRIPT_GRANTHA:
- sRet = "ta";
+ sRet = "ta"_ostr;
break;
case USCRIPT_KPELLE:
- sRet = "kpe";
+ sRet = "kpe"_ostr;
break;
case USCRIPT_LOMA:
- sRet = "lom";
+ sRet = "lom"_ostr;
break;
case USCRIPT_MENDE:
- sRet = "men";
+ sRet = "men"_ostr;
break;
case USCRIPT_MEROITIC_CURSIVE:
- sRet = "xmr";
+ sRet = "xmr"_ostr;
break;
case USCRIPT_OLD_NORTH_ARABIAN:
- sRet = "xna";
+ sRet = "xna"_ostr;
break;
case USCRIPT_SINDHI:
- sRet = "sd";
+ sRet = "sd"_ostr;
break;
case USCRIPT_WARANG_CITI:
- sRet = "hoc";
+ sRet = "hoc"_ostr;
break;
-#if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 8)
case USCRIPT_AFAKA:
- sRet = "djk";
+ sRet = "djk"_ostr;
break;
case USCRIPT_JURCHEN:
- sRet = "juc";
+ sRet = "juc"_ostr;
break;
case USCRIPT_MRO:
- sRet = "cmr";
+ sRet = "cmr"_ostr;
break;
case USCRIPT_NUSHU:
- sRet = "mis-Nshu"; // Uncoded with script
+ sRet = "mis-Nshu"_ostr; // Uncoded with script
break;
case USCRIPT_SHARADA:
- sRet = "sa";
+ sRet = "sa"_ostr;
break;
case USCRIPT_SORA_SOMPENG:
- sRet = "srb";
+ sRet = "srb"_ostr;
break;
case USCRIPT_TAKRI:
- sRet = "doi";
+ sRet = "doi"_ostr;
break;
case USCRIPT_TANGUT:
- sRet = "txg";
+ sRet = "txg"_ostr;
break;
case USCRIPT_WOLEAI:
- sRet = "woe";
+ sRet = "woe"_ostr;
break;
-#endif
-#if (U_ICU_VERSION_MAJOR_NUM >= 49)
case USCRIPT_ANATOLIAN_HIEROGLYPHS:
- sRet = "hlu";
+ sRet = "hlu"_ostr;
break;
case USCRIPT_KHOJKI:
- sRet = "gu";
+ sRet = "gu"_ostr;
break;
case USCRIPT_TIRHUTA:
- sRet = "mai";
+ sRet = "mai"_ostr;
break;
-#endif
-#if (U_ICU_VERSION_MAJOR_NUM >= 52)
case USCRIPT_CAUCASIAN_ALBANIAN:
- sRet = "xag";
+ sRet = "xag"_ostr;
break;
case USCRIPT_MAHAJANI:
- sRet = "mwr";
+ sRet = "mwr"_ostr;
break;
-#endif
-#if (U_ICU_VERSION_MAJOR_NUM >= 54)
case USCRIPT_AHOM:
- sRet = "aho";
+ sRet = "aho"_ostr;
break;
case USCRIPT_HATRAN:
- sRet = "qly-Hatr";
+ sRet = "qly-Hatr"_ostr;
break;
case USCRIPT_MODI:
- sRet = "mr-Modi";
+ sRet = "mr-Modi"_ostr;
break;
case USCRIPT_MULTANI:
- sRet = "skr-Mutl";
+ // The ISO 15924 code for Multani is "Mult"; "Mutl" was a transposition.
+ sRet = "skr-Mult"_ostr;
break;
case USCRIPT_PAU_CIN_HAU:
- sRet = "ctd-Pauc";
+ sRet = "ctd-Pauc"_ostr;
break;
case USCRIPT_SIDDHAM:
- sRet = "sa-Sidd";
+ sRet = "sa-Sidd"_ostr;
break;
-#endif
-#if (U_ICU_VERSION_MAJOR_NUM >= 58)
case USCRIPT_ADLAM:
- sRet = "mis-Adlm"; // Adlam for Fulani, no language code
+ sRet = "mis-Adlm"_ostr; // Adlam for Fulani, no language code
break;
case USCRIPT_BHAIKSUKI:
- sRet = "mis-Bhks"; // Bhaiksuki for some Buddhist texts, no language code
+ sRet = "mis-Bhks"_ostr; // Bhaiksuki for some Buddhist texts, no language code
break;
case USCRIPT_MARCHEN:
- sRet = "bo-Marc";
+ sRet = "bo-Marc"_ostr;
break;
case USCRIPT_NEWA:
- sRet = "new-Newa";
+ sRet = "new-Newa"_ostr;
break;
case USCRIPT_OSAGE:
- sRet = "osa-Osge";
+ sRet = "osa-Osge"_ostr;
break;
case USCRIPT_HAN_WITH_BOPOMOFO:
- sRet = "mis-Hanb"; // Han with Bopomofo, zh-Hanb ?
+ sRet = "mis-Hanb"_ostr; // Han with Bopomofo, zh-Hanb ?
break;
case USCRIPT_JAMO:
- sRet = "ko"; // Jamo - elements of Hangul Syllables
+ sRet = "ko"_ostr; // Jamo - elements of Hangul Syllables
break;
case USCRIPT_SYMBOLS_EMOJI:
- sRet = "mis-Zsye"; // Emoji variant
+ sRet = "mis-Zsye"_ostr; // Emoji variant
break;
-#endif
-#if (U_ICU_VERSION_MAJOR_NUM >= 60)
case USCRIPT_MASARAM_GONDI:
- sRet = "gon-Gonm"; // macro language code, could be wsg,esg,gno
+ sRet = "gon-Gonm"_ostr; // macro language code, could be wsg,esg,gno
break;
case USCRIPT_SOYOMBO:
- sRet = "mn-Soyo"; // abugida to write Mongolian, also Tibetan and Sanskrit
+ sRet = "mn-Soyo"_ostr; // abugida to write Mongolian, also Tibetan and Sanskrit
break;
case USCRIPT_ZANABAZAR_SQUARE:
- sRet = "mn-Zanb"; // abugida to write Mongolian
+ sRet = "mn-Zanb"_ostr; // abugida to write Mongolian
break;
-#endif
-#if (U_ICU_VERSION_MAJOR_NUM >= 62)
case USCRIPT_DOGRA:
- sRet = "dgo"; // Dogri proper
+ sRet = "dgo"_ostr; // Dogri proper
break;
case USCRIPT_GUNJALA_GONDI:
- sRet = "wsg"; // Adilabad Gondi
+ sRet = "wsg"_ostr; // Adilabad Gondi
break;
case USCRIPT_MAKASAR:
- sRet = "mak";
+ sRet = "mak"_ostr;
break;
case USCRIPT_MEDEFAIDRIN:
- sRet = "dmf-Medf";
+ sRet = "dmf-Medf"_ostr;
break;
case USCRIPT_HANIFI_ROHINGYA:
- sRet = "rhg";
+ sRet = "rhg"_ostr;
break;
case USCRIPT_SOGDIAN:
- sRet = "sog";
+ sRet = "sog"_ostr;
break;
case USCRIPT_OLD_SOGDIAN:
- sRet = "sog";
+ sRet = "sog"_ostr;
break;
-#endif
-#if (U_ICU_VERSION_MAJOR_NUM >= 64)
case USCRIPT_ELYMAIC:
- sRet = "arc-Elym";
+ sRet = "arc-Elym"_ostr;
break;
case USCRIPT_NYIAKENG_PUACHUE_HMONG:
- sRet = "hmn-Hmnp"; // macrolanguage code
+ sRet = "hmn-Hmnp"_ostr; // macrolanguage code
break;
case USCRIPT_NANDINAGARI:
- sRet = "sa-Nand";
+ sRet = "sa-Nand"_ostr;
break;
case USCRIPT_WANCHO:
- sRet = "nnp-Wcho";
+ sRet = "nnp-Wcho"_ostr;
break;
-#endif
-#if (U_ICU_VERSION_MAJOR_NUM >= 66)
case USCRIPT_CHORASMIAN:
- sRet = "xco-Chrs";
+ sRet = "xco-Chrs"_ostr;
break;
case USCRIPT_DIVES_AKURU:
- sRet = "dv-Diak";
+ sRet = "dv-Diak"_ostr;
break;
case USCRIPT_KHITAN_SMALL_SCRIPT:
- sRet = "zkt-Kits";
+ sRet = "zkt-Kits"_ostr;
break;
case USCRIPT_YEZIDI:
- sRet = "kmr-Yezi";
+ sRet = "kmr-Yezi"_ostr;
break;
-#endif
#if (U_ICU_VERSION_MAJOR_NUM >= 70)
case USCRIPT_CYPRO_MINOAN:
- sRet = "mis-Cpmn"; // Uncoded with script
+ sRet = "mis-Cpmn"_ostr; // Uncoded with script
break;
case USCRIPT_OLD_UYGHUR:
- sRet = "oui-Ougr";
+ sRet = "oui-Ougr"_ostr;
break;
case USCRIPT_TANGSA:
- sRet = "nst-Tnsa";
+ sRet = "nst-Tnsa"_ostr;
break;
case USCRIPT_TOTO:
- sRet = "txo-Toto";
+ sRet = "txo-Toto"_ostr;
break;
case USCRIPT_VITHKUQI:
- sRet = "sq-Vith"; // macrolanguage code
+ sRet = "sq-Vith"_ostr; // macrolanguage code
+ break;
+#endif
+#if (U_ICU_VERSION_MAJOR_NUM >= 72)
+ case USCRIPT_KAWI:
+ sRet = "mis-Kawi"_ostr; // Uncoded with script
+ break;
+ case USCRIPT_NAG_MUNDARI:
+ sRet = "unr-Nagm"_ostr;
break;
#endif
}
@@ -1019,7 +1248,7 @@ OUString ToggleUnicodeCodepoint::StringToReplace()
sIn = maInput.toString();
while( nUPlus != -1 )
{
- nUnicode = sIn.copy(0, nUPlus).toUInt32(16);
+ nUnicode = o3tl::toUInt32(sIn.subView(0, nUPlus), 16);
//prevent creating control characters or invalid Unicode values
if( !rtl::isUnicodeCodePoint(nUnicode) || nUnicode < 0x20 )
maInput = sIn.subView(nUPlus);
@@ -1062,7 +1291,7 @@ OUString ToggleUnicodeCodepoint::ReplacementString()
}
while( nUPlus > 0 )
{
- nUnicode = sIn.copy(0, nUPlus).toUInt32(16);
+ nUnicode = o3tl::toUInt32(sIn.subView(0, nUPlus), 16);
output.appendUtf32( nUnicode );
sIn = sIn.copy(nUPlus+2);
@@ -1081,8 +1310,7 @@ OUString ToggleUnicodeCodepoint::ReplacementString()
//pad with zeros - minimum length of 4.
for( sal_Int32 i = 4 - aTmp.getLength(); i > 0; --i )
aTmp.insert( 0,"0" );
- output.append( "U+" );
- output.append( aTmp );
+ output.append( "U+" + aTmp );
}
}
return output.makeStringAndClear();