diff options
Diffstat (limited to 'i18nlangtag/source/languagetag/languagetag.cxx')
-rw-r--r-- | i18nlangtag/source/languagetag/languagetag.cxx | 173 |
1 files changed, 135 insertions, 38 deletions
diff --git a/i18nlangtag/source/languagetag/languagetag.cxx b/i18nlangtag/source/languagetag/languagetag.cxx index 29c85cd710fc..26a3d5db2c79 100644 --- a/i18nlangtag/source/languagetag/languagetag.cxx +++ b/i18nlangtag/source/languagetag/languagetag.cxx @@ -18,8 +18,9 @@ #include <sal/log.hxx> #include <osl/file.hxx> #include <rtl/locale.h> -#include <tools/long.hxx> +#include <o3tl/string_view.hxx> #include <algorithm> +#include <atomic> #include <map> #include <mutex> #include <string_view> @@ -36,6 +37,10 @@ #include <osl/detail/android-bootstrap.h> #endif +#ifdef EMSCRIPTEN +#include <osl/detail/emscripten-bootstrap.h> +#endif + using namespace com::sun::star; namespace { @@ -85,9 +90,9 @@ static const KnownTagSet & getKnowns() namespace { struct compareIgnoreAsciiCaseLess { - bool operator()( const OUString& r1, std::u16string_view r2 ) const + bool operator()( std::u16string_view r1, std::u16string_view r2 ) const { - return r1.compareToIgnoreAsciiCase( r2) < 0; + return o3tl::compareToIgnoreAsciiCase(r1, r2) < 0; } }; typedef ::std::map< OUString, LanguageTag::ImplPtr, compareIgnoreAsciiCaseLess > MapBcp47; @@ -215,7 +220,7 @@ void LiblangtagDataRef::teardown() void LiblangtagDataRef::setupDataPath() { -#if defined(ANDROID) +#if defined(ANDROID) || defined(EMSCRIPTEN) maDataPath = OString(lo_get_app_data_dir()) + "/share/liblangtag"; #else // maDataPath is assumed to be empty here. @@ -234,7 +239,7 @@ void LiblangtagDataRef::setupDataPath() } #endif if (maDataPath.isEmpty()) - maDataPath = "|"; // assume system + maDataPath = "|"_ostr; // assume system else lt_db_set_datadir( maDataPath.getStr()); } @@ -356,6 +361,7 @@ private: EXTRACTED_NONE, EXTRACTED_LSC, EXTRACTED_LV, + EXTRACTED_LR, EXTRACTED_C_LOCALE, EXTRACTED_X, EXTRACTED_X_JOKER, @@ -370,6 +376,7 @@ private: @return EXTRACTED_LSC if simple tag was detected (i.e. one that would fulfill the isIsoODF() condition), EXTRACTED_LV if a tag with variant was detected, + EXTRACTED_LR if a tag with 3-digit UN M.49 region code was detected EXTRACTED_C_LOCALE if a 'C' locale was detected, EXTRACTED_X if x-... privateuse tag was detected, EXTRACTED_X_JOKER if "*" joker was detected, @@ -380,6 +387,7 @@ private: OUString& rLanguage, OUString& rScript, OUString& rCountry, + OUString& rRegion, OUString& rVariants ); /** Convert Locale to BCP 47 string without resolving system and creating @@ -752,7 +760,7 @@ LanguageTag::ImplPtr LanguageTag::registerImpl() const ImplPtr pImpl; #if OSL_DEBUG_LEVEL > 0 - static size_t nCalls = 0; + static std::atomic_int nCalls = 0; ++nCalls; SAL_INFO( "i18nlangtag", "LanguageTag::registerImpl: " << nCalls << " calls"); #endif @@ -852,7 +860,7 @@ LanguageTag::ImplPtr LanguageTag::registerImpl() const std::unique_lock aGuard( theMutex()); #if OSL_DEBUG_LEVEL > 0 - static tools::Long nRunning = 0; + static long nRunning = 0; // Entering twice here is ok, which is needed for fallback init in // getKnowns() in canonicalize() via pImpl->convertBcp47ToLocale() below, // everything else is suspicious. @@ -961,7 +969,7 @@ LanguageTag::ImplPtr LanguageTag::registerImpl() const // May have involved canonicalize(), so compare with // pImpl->maBcp47 instead of maBcp47! aBcp47 = LanguageTagImpl::convertToBcp47( - MsLangId::Conversion::convertLanguageToLocale( pImpl->mnLangID )); + MsLangId::Conversion::convertLanguageToLocale( pImpl->mnLangID, true)); bInsert = (aBcp47 == pImpl->maBcp47); } } @@ -1116,20 +1124,22 @@ bool LanguageTagImpl::canonicalize() // and want to determine if parsing it would be possible // without using liblangtag just to see if it is a simple known // locale or could fall back to one. - OUString aLanguage, aScript, aCountry, aVariants; - Extraction eExt = simpleExtract( maBcp47, aLanguage, aScript, aCountry, aVariants); + OUString aLanguage, aScript, aCountry, aRegion, aVariants; + Extraction eExt = simpleExtract( maBcp47, aLanguage, aScript, aCountry, aRegion, aVariants); if (eExt != EXTRACTED_NONE) { - if (eExt == EXTRACTED_LSC || eExt == EXTRACTED_LV) + if (eExt == EXTRACTED_LSC || eExt == EXTRACTED_LV || eExt == EXTRACTED_LR) { // Rebuild bcp47 with proper casing of tags. OUStringBuffer aBuf( aLanguage.getLength() + 1 + aScript.getLength() + - 1 + aCountry.getLength() + 1 + aVariants.getLength()); + 1 + aCountry.getLength() + 1 + aRegion.getLength() + 1 + aVariants.getLength()); aBuf.append( aLanguage); if (!aScript.isEmpty()) aBuf.append("-" + aScript); if (!aCountry.isEmpty()) aBuf.append("-" + aCountry); + if (!aRegion.isEmpty()) + aBuf.append("-" + aRegion); if (!aVariants.isEmpty()) aBuf.append("-" + aVariants); OUString aStr( aBuf.makeStringAndClear()); @@ -1352,7 +1362,7 @@ void LanguageTagImpl::convertLocaleToBcp47() // locale via LanguageTag::convertToBcp47(LanguageType) and // LanguageTag::convertToLocale(LanguageType) would instantiate another // LanguageTag. - maLocale = MsLangId::Conversion::convertLanguageToLocale( LANGUAGE_SYSTEM ); + maLocale = MsLangId::Conversion::convertLanguageToLocale( LANGUAGE_SYSTEM, false); } if (maLocale.Language.isEmpty()) { @@ -1496,7 +1506,7 @@ void LanguageTagImpl::convertLangToLocale() mbInitializedLangID = true; } // Resolve system here! The original is remembered as mbSystemLocale. - maLocale = MsLangId::Conversion::convertLanguageToLocale( mnLangID ); + maLocale = MsLangId::Conversion::convertLanguageToLocale( mnLangID, false); mbInitializedLocale = true; } @@ -1532,7 +1542,7 @@ void LanguageTag::convertFromRtlLocale() if (maLocale.Variant.isEmpty()) return; - OString aStr = OUStringToOString(maLocale.Language, RTL_TEXTENCODING_UTF8) + "_" + OUStringToOString(OUStringConcatenation(maLocale.Country + maLocale.Variant), + OString aStr = OUStringToOString(maLocale.Language, RTL_TEXTENCODING_UTF8) + "_" + OUStringToOString(Concat2View(maLocale.Country + maLocale.Variant), RTL_TEXTENCODING_UTF8); /* FIXME: let liblangtag parse this entirely with * lt_tag_convert_from_locale() but that needs a patch to pass the @@ -1572,7 +1582,7 @@ const OUString & LanguageTagImpl::getBcp47() const const OUString & LanguageTag::getBcp47( bool bResolveSystem ) const { - static const OUString theEmptyBcp47 = u""; + static constexpr OUString theEmptyBcp47 = u""_ustr; if (!bResolveSystem && mbSystemLocale) return theEmptyBcp47; @@ -2032,9 +2042,9 @@ void LanguageTag::setScriptType(LanguageTag::ScriptType st) bool LanguageTagImpl::cacheSimpleLSCV() { - OUString aLanguage, aScript, aCountry, aVariants; - Extraction eExt = simpleExtract( maBcp47, aLanguage, aScript, aCountry, aVariants); - bool bRet = (eExt == EXTRACTED_LSC || eExt == EXTRACTED_LV); + OUString aLanguage, aScript, aCountry, aRegion, aVariants; + Extraction eExt = simpleExtract( maBcp47, aLanguage, aScript, aCountry, aRegion, aVariants); + bool bRet = (eExt == EXTRACTED_LSC || eExt == EXTRACTED_LV || eExt == EXTRACTED_LR); if (bRet) { maCachedLanguage = aLanguage; @@ -2177,8 +2187,10 @@ LanguageTag & LanguageTag::makeFallback() aVec.emplace_back(aLanguage + "-" + aCountry); if (aLanguage == "zh") { - // For zh-HK or zh-MO also list zh-TW, for all other zh-XX also - // list zh-CN. + // For zh-HK or zh-MO also list zh-TW to get zh-Hant, for all + // other zh-XX also list zh-CN to get zh-Hans; both of which we + // use the legacy forms instead of the more correct script + // tags that unfortunately most pieces don't understand. if (aCountry == "HK" || aCountry == "MO") aVec.emplace_back(aLanguage + "-TW"); else if (aCountry != "CN") @@ -2376,7 +2388,7 @@ LanguageTag & LanguageTag::makeFallback() } // Original language-only. - if (aLanguage != maBcp47) + if (!aLanguage.isEmpty() && aLanguage != maBcp47) aVec.push_back( aLanguage); return aVec; @@ -2434,7 +2446,7 @@ bool LanguageTag::operator<( const LanguageTag & rLanguageTag ) const // static LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp47, - OUString& rLanguage, OUString& rScript, OUString& rCountry, OUString& rVariants ) + OUString& rLanguage, OUString& rScript, OUString& rCountry, OUString& rRegion, OUString& rVariants ) { Extraction eRet = EXTRACTED_NONE; const sal_Int32 nLen = rBcp47.getLength(); @@ -2458,6 +2470,7 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp rLanguage = "C"; rScript.clear(); rCountry.clear(); + rRegion.clear(); rVariants.clear(); } else if (nLen == 2 || nLen == 3) // ll or lll @@ -2467,6 +2480,7 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp rLanguage = rBcp47.toAsciiLowerCase(); rScript.clear(); rCountry.clear(); + rRegion.clear(); rVariants.clear(); eRet = EXTRACTED_LSC; } @@ -2478,11 +2492,25 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp { rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); rCountry = rBcp47.copy( nHyph1 + 1, 2).toAsciiUpperCase(); + rRegion.clear(); rScript.clear(); rVariants.clear(); eRet = EXTRACTED_LSC; } } + else if ( (nHyph1 == 2 && nLen == 6) // ll-rrr + || (nHyph1 == 3 && nLen == 7)) // lll-rrr + { + if (nHyph2 < 0) + { + rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); + rCountry.clear(); + rRegion = rBcp47.copy( nHyph1 + 1, 3); + rScript.clear(); + rVariants.clear(); + eRet = EXTRACTED_LR; + } + } else if ( (nHyph1 == 2 && nLen == 7) // ll-Ssss or ll-vvvv || (nHyph1 == 3 && nLen == 8)) // lll-Ssss or lll-vvvv { @@ -2495,6 +2523,7 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); rScript.clear(); rCountry.clear(); + rRegion.clear(); rVariants = rBcp47.copy( nHyph1 + 1); eRet = EXTRACTED_LV; } @@ -2504,6 +2533,7 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp rScript = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase(); rCountry.clear(); + rRegion.clear(); rVariants.clear(); eRet = EXTRACTED_LSC; } @@ -2517,10 +2547,24 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); rScript = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase(); rCountry = rBcp47.copy( nHyph2 + 1, 2).toAsciiUpperCase(); + rRegion.clear(); rVariants.clear(); eRet = EXTRACTED_LSC; } } + else if ( (nHyph1 == 2 && nHyph2 == 7 && nLen == 11) // ll-Ssss-rrr + || (nHyph1 == 3 && nHyph2 == 8 && nLen == 12)) // lll-Ssss-rrr + { + if (nHyph3 < 0) + { + rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); + rScript = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase(); + rCountry.clear(); + rRegion = rBcp47.copy( nHyph2 + 1, 3); + rVariants.clear(); + eRet = EXTRACTED_LR; + } + } else if ( (nHyph1 == 2 && nHyph2 == 7 && nHyph3 == 10 && nLen >= 15) // ll-Ssss-CC-vvvv[vvvv][-...] || (nHyph1 == 3 && nHyph2 == 8 && nHyph3 == 11 && nLen >= 16)) // lll-Ssss-CC-vvvv[vvvv][-...] { @@ -2531,10 +2575,26 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); rScript = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase(); rCountry = rBcp47.copy( nHyph2 + 1, 2).toAsciiUpperCase(); + rRegion.clear(); rVariants = rBcp47.copy( nHyph3 + 1); eRet = EXTRACTED_LV; } } + else if ( (nHyph1 == 2 && nHyph2 == 7 && nHyph3 == 11 && nLen >= 16) // ll-Ssss-rrr-vvvv[vvvv][-...] + || (nHyph1 == 3 && nHyph2 == 8 && nHyph3 == 12 && nLen >= 17)) // lll-Ssss-rrr-vvvv[vvvv][-...] + { + if (nHyph4 < 0) + nHyph4 = rBcp47.getLength(); + if (nHyph4 - nHyph3 > 4 && nHyph4 - nHyph3 <= 9) + { + rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); + rScript = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase(); + rCountry.clear(); + rRegion = rBcp47.copy( nHyph2 + 1, 3); + rVariants = rBcp47.copy( nHyph3 + 1); + eRet = EXTRACTED_LR; + } + } else if ( (nHyph1 == 2 && nHyph2 == 5 && nHyph3 == 7) // ll-CC-u-... || (nHyph1 == 3 && nHyph2 == 6 && nHyph3 == 8)) // lll-CC-u-... { @@ -2550,6 +2610,7 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp rLanguage = "es"; rScript.clear(); rCountry = "ES"; + rRegion.clear(); rVariants = "u-co-trad"; // not strictly a variant, but used to reconstruct the tag. eRet = EXTRACTED_LV; } @@ -2565,10 +2626,26 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); rScript.clear(); rCountry = rBcp47.copy( nHyph1 + 1, 2).toAsciiUpperCase(); + rRegion.clear(); rVariants = rBcp47.copy( nHyph2 + 1); eRet = EXTRACTED_LV; } } + else if ( (nHyph1 == 2 && nHyph2 == 6 && nLen >= 11) // ll-rrr-vvvv[vvvv][-...] + || (nHyph1 == 3 && nHyph2 == 7 && nLen >= 12)) // lll-rrr-vvvv[vvvv][-...] + { + if (nHyph3 < 0) + nHyph3 = rBcp47.getLength(); + if (nHyph3 - nHyph2 > 4 && nHyph3 - nHyph2 <= 9) + { + rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); + rScript.clear(); + rCountry.clear(); + rRegion = rBcp47.copy( nHyph1 + 1, 3); + rVariants = rBcp47.copy( nHyph2 + 1); + eRet = EXTRACTED_LR; + } + } else if ( (nHyph1 == 2 && nLen >= 8) // ll-vvvvv[vvv][-...] || (nHyph1 == 3 && nLen >= 9)) // lll-vvvvv[vvv][-...] { @@ -2579,6 +2656,7 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase(); rScript.clear(); rCountry.clear(); + rRegion.clear(); rVariants = rBcp47.copy( nHyph1 + 1); eRet = EXTRACTED_LV; } @@ -2592,6 +2670,7 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp rLanguage = "en"; rScript.clear(); rCountry = "GB"; + rRegion.clear(); rVariants = "oed"; eRet = EXTRACTED_LV; } @@ -2602,6 +2681,7 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp rLanguage = "es"; rScript.clear(); rCountry = "ES"; + rRegion.clear(); rVariants = "tradnl"; // this is nonsense, but... ignored. eRet = EXTRACTED_KNOWN_BAD; } @@ -2613,8 +2693,19 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp rLanguage.clear(); rScript.clear(); rCountry.clear(); + rRegion.clear(); rVariants.clear(); } + else + { + assert(rLanguage.getLength() == 2 || rLanguage.getLength() == 3 + || eRet == EXTRACTED_X_JOKER || eRet == EXTRACTED_X || eRet == EXTRACTED_C_LOCALE); + assert(rScript.isEmpty() || rScript.getLength() == 4); + assert(rCountry.isEmpty() || rRegion.isEmpty()); // [2ALPHA / 3DIGIT] + assert(rCountry.isEmpty() || rCountry.getLength() == 2); + assert(rRegion.isEmpty() || rRegion.getLength() == 3); + assert(rVariants.isEmpty() || rVariants.getLength() >= 4 || rVariants == "oed"); + } return eRet; } @@ -2683,10 +2774,8 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp ::std::vector< ::std::vector< OUString > > aListFallbacks( rList.size()); size_t i = 0; for (auto const& elem : rList) - { - ::std::vector< OUString > aTmp( LanguageTag(elem).getFallbackStrings( true)); - aListFallbacks[i++] = aTmp; - } + aListFallbacks[i++] = LanguageTag(elem).getFallbackStrings(true); + for (auto const& rfb : aFallbacks) { size_t nPosFb = 0; @@ -2828,9 +2917,9 @@ css::lang::Locale LanguageTag::convertToLocaleWithFallback( const OUString& rBcp // static -LanguageType LanguageTag::convertToLanguageTypeWithFallback( const css::lang::Locale& rLocale, bool bResolveSystem ) +LanguageType LanguageTag::convertToLanguageTypeWithFallback( const css::lang::Locale& rLocale ) { - if (rLocale.Language.isEmpty() && !bResolveSystem) + if (rLocale.Language.isEmpty()) return LANGUAGE_SYSTEM; return LanguageTag( rLocale).makeFallback().getLanguageType(); @@ -2838,7 +2927,8 @@ LanguageType LanguageTag::convertToLanguageTypeWithFallback( const css::lang::Lo // static -bool LanguageTag::isValidBcp47( const OUString& rString, OUString* o_pCanonicalized, bool bDisallowPrivate ) +bool LanguageTag::isValidBcp47( const OUString& rString, OUString* o_pCanonicalized, + LanguageTag::PrivateUse ePrivateUse ) { bool bValid = false; @@ -2865,30 +2955,37 @@ bool LanguageTag::isValidBcp47( const OUString& rString, OUString* o_pCanonicali if (pTag) { bValid = true; - if (bDisallowPrivate) + if (ePrivateUse != PrivateUse::ALLOW) { - const lt_string_t* pPrivate = lt_tag_get_privateuse( aVar.mpLangtag); - if (pPrivate && lt_string_length( pPrivate) > 0) - bValid = false; - else + do { + const char* pLang = nullptr; const lt_lang_t* pLangT = lt_tag_get_language( aVar.mpLangtag); if (pLangT) { - const char* pLang = lt_lang_get_tag( pLangT); + pLang = lt_lang_get_tag( pLangT); if (pLang && strcmp( pLang, I18NLANGTAG_QLT_ASCII) == 0) { - // Disallow 'qlt' privateuse code to prevent + // Disallow 'qlt' localuse code to prevent // confusion with our internal usage. bValid = false; + break; } } + if (ePrivateUse == PrivateUse::ALLOW_ART_X && pLang && strcmp( pLang, "art") == 0) + { + // Allow anything 'art' which includes 'art-x-...' and 'art-Latn-x-...'. + break; + } + const lt_string_t* pPrivate = lt_tag_get_privateuse( aVar.mpLangtag); + if (pPrivate && lt_string_length( pPrivate) > 0) + bValid = false; } + while (false); } if (o_pCanonicalized) *o_pCanonicalized = OUString::createFromAscii( pTag); free( pTag); - return bValid; } } else |