summary refs log tree commit diff stats
path: root/i18nlangtag/source/languagetag/languagetag.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'i18nlangtag/source/languagetag/languagetag.cxx')
-rw-r--r-- i18nlangtag/source/languagetag/languagetag.cxx 173
1 files changed, 135 insertions, 38 deletions
diff --git a/i18nlangtag/source/languagetag/languagetag.cxx b/i18nlangtag/source/languagetag/languagetag.cxx
index 29c85cd710fc..26a3d5db2c79 100644
--- a/i18nlangtag/source/languagetag/languagetag.cxx
+++ b/i18nlangtag/source/languagetag/languagetag.cxx
@@ -18,8 +18,9 @@
#include <sal/log.hxx>
#include <osl/file.hxx>
#include <rtl/locale.h>
-#include <tools/long.hxx>
+#include <o3tl/string_view.hxx>
#include <algorithm>
+#include <atomic>
#include <map>
#include <mutex>
#include <string_view>
@@ -36,6 +37,10 @@
#include <osl/detail/android-bootstrap.h>
#endif
+#ifdef EMSCRIPTEN
+#include <osl/detail/emscripten-bootstrap.h>
+#endif
+
using namespace com::sun::star;
namespace {
@@ -85,9 +90,9 @@ static const KnownTagSet & getKnowns()
namespace {
struct compareIgnoreAsciiCaseLess
{
- bool operator()( const OUString& r1, std::u16string_view r2 ) const
+ bool operator()( std::u16string_view r1, std::u16string_view r2 ) const
{
- return r1.compareToIgnoreAsciiCase( r2) < 0;
+ return o3tl::compareToIgnoreAsciiCase(r1, r2) < 0;
}
};
typedef ::std::map< OUString, LanguageTag::ImplPtr, compareIgnoreAsciiCaseLess > MapBcp47;
@@ -215,7 +220,7 @@ void LiblangtagDataRef::teardown()
void LiblangtagDataRef::setupDataPath()
{
-#if defined(ANDROID)
+#if defined(ANDROID) || defined(EMSCRIPTEN)
maDataPath = OString(lo_get_app_data_dir()) + "/share/liblangtag";
#else
// maDataPath is assumed to be empty here.
@@ -234,7 +239,7 @@ void LiblangtagDataRef::setupDataPath()
}
#endif
if (maDataPath.isEmpty())
- maDataPath = "|"; // assume system
+ maDataPath = "|"_ostr; // assume system
else
lt_db_set_datadir( maDataPath.getStr());
}
@@ -356,6 +361,7 @@ private:
EXTRACTED_NONE,
EXTRACTED_LSC,
EXTRACTED_LV,
+ EXTRACTED_LR,
EXTRACTED_C_LOCALE,
EXTRACTED_X,
EXTRACTED_X_JOKER,
@@ -370,6 +376,7 @@ private:
@return EXTRACTED_LSC if simple tag was detected (i.e. one that
would fulfill the isIsoODF() condition),
EXTRACTED_LV if a tag with variant was detected,
+ EXTRACTED_LR if a tag with 3-digit UN M.49 region code was detected,
EXTRACTED_C_LOCALE if a 'C' locale was detected,
EXTRACTED_X if x-... privateuse tag was detected,
EXTRACTED_X_JOKER if "*" joker was detected,
@@ -380,6 +387,7 @@ private:
OUString& rLanguage,
OUString& rScript,
OUString& rCountry,
+ OUString& rRegion,
OUString& rVariants );
/** Convert Locale to BCP 47 string without resolving system and creating
@@ -752,7 +760,7 @@ LanguageTag::ImplPtr LanguageTag::registerImpl() const
ImplPtr pImpl;
#if OSL_DEBUG_LEVEL > 0
- static size_t nCalls = 0;
+ static std::atomic_int nCalls = 0;
++nCalls;
SAL_INFO( "i18nlangtag", "LanguageTag::registerImpl: " << nCalls << " calls");
#endif
@@ -852,7 +860,7 @@ LanguageTag::ImplPtr LanguageTag::registerImpl() const
std::unique_lock aGuard( theMutex());
#if OSL_DEBUG_LEVEL > 0
- static tools::Long nRunning = 0;
+ static long nRunning = 0;
// Entering twice here is ok, which is needed for fallback init in
// getKnowns() in canonicalize() via pImpl->convertBcp47ToLocale() below,
// everything else is suspicious.
@@ -961,7 +969,7 @@ LanguageTag::ImplPtr LanguageTag::registerImpl() const
// May have involved canonicalize(), so compare with
// pImpl->maBcp47 instead of maBcp47!
aBcp47 = LanguageTagImpl::convertToBcp47(
- MsLangId::Conversion::convertLanguageToLocale( pImpl->mnLangID ));
+ MsLangId::Conversion::convertLanguageToLocale( pImpl->mnLangID, true));
bInsert = (aBcp47 == pImpl->maBcp47);
}
}
@@ -1116,20 +1124,22 @@ bool LanguageTagImpl::canonicalize()
// and want to determine if parsing it would be possible
// without using liblangtag just to see if it is a simple known
// locale or could fall back to one.
- OUString aLanguage, aScript, aCountry, aVariants;
- Extraction eExt = simpleExtract( maBcp47, aLanguage, aScript, aCountry, aVariants);
+ OUString aLanguage, aScript, aCountry, aRegion, aVariants;
+ Extraction eExt = simpleExtract( maBcp47, aLanguage, aScript, aCountry, aRegion, aVariants);
if (eExt != EXTRACTED_NONE)
{
- if (eExt == EXTRACTED_LSC || eExt == EXTRACTED_LV)
+ if (eExt == EXTRACTED_LSC || eExt == EXTRACTED_LV || eExt == EXTRACTED_LR)
{
// Rebuild bcp47 with proper casing of tags.
OUStringBuffer aBuf( aLanguage.getLength() + 1 + aScript.getLength() +
- 1 + aCountry.getLength() + 1 + aVariants.getLength());
+ 1 + aCountry.getLength() + 1 + aRegion.getLength() + 1 + aVariants.getLength());
aBuf.append( aLanguage);
if (!aScript.isEmpty())
aBuf.append("-" + aScript);
if (!aCountry.isEmpty())
aBuf.append("-" + aCountry);
+ if (!aRegion.isEmpty())
+ aBuf.append("-" + aRegion);
if (!aVariants.isEmpty())
aBuf.append("-" + aVariants);
OUString aStr( aBuf.makeStringAndClear());
@@ -1352,7 +1362,7 @@ void LanguageTagImpl::convertLocaleToBcp47()
// locale via LanguageTag::convertToBcp47(LanguageType) and
// LanguageTag::convertToLocale(LanguageType) would instantiate another
// LanguageTag.
- maLocale = MsLangId::Conversion::convertLanguageToLocale( LANGUAGE_SYSTEM );
+ maLocale = MsLangId::Conversion::convertLanguageToLocale( LANGUAGE_SYSTEM, false);
}
if (maLocale.Language.isEmpty())
{
@@ -1496,7 +1506,7 @@ void LanguageTagImpl::convertLangToLocale()
mbInitializedLangID = true;
}
// Resolve system here! The original is remembered as mbSystemLocale.
- maLocale = MsLangId::Conversion::convertLanguageToLocale( mnLangID );
+ maLocale = MsLangId::Conversion::convertLanguageToLocale( mnLangID, false);
mbInitializedLocale = true;
}
@@ -1532,7 +1542,7 @@ void LanguageTag::convertFromRtlLocale()
if (maLocale.Variant.isEmpty())
return;
- OString aStr = OUStringToOString(maLocale.Language, RTL_TEXTENCODING_UTF8) + "_" + OUStringToOString(OUStringConcatenation(maLocale.Country + maLocale.Variant),
+ OString aStr = OUStringToOString(maLocale.Language, RTL_TEXTENCODING_UTF8) + "_" + OUStringToOString(Concat2View(maLocale.Country + maLocale.Variant),
RTL_TEXTENCODING_UTF8);
/* FIXME: let liblangtag parse this entirely with
* lt_tag_convert_from_locale() but that needs a patch to pass the
@@ -1572,7 +1582,7 @@ const OUString & LanguageTagImpl::getBcp47() const
const OUString & LanguageTag::getBcp47( bool bResolveSystem ) const
{
- static const OUString theEmptyBcp47 = u"";
+ static constexpr OUString theEmptyBcp47 = u""_ustr;
if (!bResolveSystem && mbSystemLocale)
return theEmptyBcp47;
@@ -2032,9 +2042,9 @@ void LanguageTag::setScriptType(LanguageTag::ScriptType st)
bool LanguageTagImpl::cacheSimpleLSCV()
{
- OUString aLanguage, aScript, aCountry, aVariants;
- Extraction eExt = simpleExtract( maBcp47, aLanguage, aScript, aCountry, aVariants);
- bool bRet = (eExt == EXTRACTED_LSC || eExt == EXTRACTED_LV);
+ OUString aLanguage, aScript, aCountry, aRegion, aVariants;
+ Extraction eExt = simpleExtract( maBcp47, aLanguage, aScript, aCountry, aRegion, aVariants);
+ bool bRet = (eExt == EXTRACTED_LSC || eExt == EXTRACTED_LV || eExt == EXTRACTED_LR);
if (bRet)
{
maCachedLanguage = aLanguage;
@@ -2177,8 +2187,10 @@ LanguageTag & LanguageTag::makeFallback()
aVec.emplace_back(aLanguage + "-" + aCountry);
if (aLanguage == "zh")
{
- // For zh-HK or zh-MO also list zh-TW, for all other zh-XX also
- // list zh-CN.
+ // For zh-HK or zh-MO also list zh-TW to get zh-Hant, for all
+ // other zh-XX also list zh-CN to get zh-Hans; for both we use
+ // the legacy forms instead of the more correct script tags,
+ // which unfortunately most pieces don't understand.
if (aCountry == "HK" || aCountry == "MO")
aVec.emplace_back(aLanguage + "-TW");
else if (aCountry != "CN")
@@ -2376,7 +2388,7 @@ LanguageTag & LanguageTag::makeFallback()
}
// Original language-only.
- if (aLanguage != maBcp47)
+ if (!aLanguage.isEmpty() && aLanguage != maBcp47)
aVec.push_back( aLanguage);
return aVec;
@@ -2434,7 +2446,7 @@ bool LanguageTag::operator<( const LanguageTag & rLanguageTag ) const
// static
LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp47,
- OUString& rLanguage, OUString& rScript, OUString& rCountry, OUString& rVariants )
+ OUString& rLanguage, OUString& rScript, OUString& rCountry, OUString& rRegion, OUString& rVariants )
{
Extraction eRet = EXTRACTED_NONE;
const sal_Int32 nLen = rBcp47.getLength();
@@ -2458,6 +2470,7 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp
rLanguage = "C";
rScript.clear();
rCountry.clear();
+ rRegion.clear();
rVariants.clear();
}
else if (nLen == 2 || nLen == 3) // ll or lll
@@ -2467,6 +2480,7 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp
rLanguage = rBcp47.toAsciiLowerCase();
rScript.clear();
rCountry.clear();
+ rRegion.clear();
rVariants.clear();
eRet = EXTRACTED_LSC;
}
@@ -2478,11 +2492,25 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp
{
rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
rCountry = rBcp47.copy( nHyph1 + 1, 2).toAsciiUpperCase();
+ rRegion.clear();
rScript.clear();
rVariants.clear();
eRet = EXTRACTED_LSC;
}
}
+ else if ( (nHyph1 == 2 && nLen == 6) // ll-rrr
+ || (nHyph1 == 3 && nLen == 7)) // lll-rrr
+ {
+ if (nHyph2 < 0)
+ {
+ rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
+ rCountry.clear();
+ rRegion = rBcp47.copy( nHyph1 + 1, 3);
+ rScript.clear();
+ rVariants.clear();
+ eRet = EXTRACTED_LR;
+ }
+ }
else if ( (nHyph1 == 2 && nLen == 7) // ll-Ssss or ll-vvvv
|| (nHyph1 == 3 && nLen == 8)) // lll-Ssss or lll-vvvv
{
@@ -2495,6 +2523,7 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp
rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
rScript.clear();
rCountry.clear();
+ rRegion.clear();
rVariants = rBcp47.copy( nHyph1 + 1);
eRet = EXTRACTED_LV;
}
@@ -2504,6 +2533,7 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp
rScript = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() +
rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase();
rCountry.clear();
+ rRegion.clear();
rVariants.clear();
eRet = EXTRACTED_LSC;
}
@@ -2517,10 +2547,24 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp
rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
rScript = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase();
rCountry = rBcp47.copy( nHyph2 + 1, 2).toAsciiUpperCase();
+ rRegion.clear();
rVariants.clear();
eRet = EXTRACTED_LSC;
}
}
+ else if ( (nHyph1 == 2 && nHyph2 == 7 && nLen == 11) // ll-Ssss-rrr
+ || (nHyph1 == 3 && nHyph2 == 8 && nLen == 12)) // lll-Ssss-rrr
+ {
+ if (nHyph3 < 0)
+ {
+ rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
+ rScript = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase();
+ rCountry.clear();
+ rRegion = rBcp47.copy( nHyph2 + 1, 3);
+ rVariants.clear();
+ eRet = EXTRACTED_LR;
+ }
+ }
else if ( (nHyph1 == 2 && nHyph2 == 7 && nHyph3 == 10 && nLen >= 15) // ll-Ssss-CC-vvvv[vvvv][-...]
|| (nHyph1 == 3 && nHyph2 == 8 && nHyph3 == 11 && nLen >= 16)) // lll-Ssss-CC-vvvv[vvvv][-...]
{
@@ -2531,10 +2575,26 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp
rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
rScript = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase();
rCountry = rBcp47.copy( nHyph2 + 1, 2).toAsciiUpperCase();
+ rRegion.clear();
rVariants = rBcp47.copy( nHyph3 + 1);
eRet = EXTRACTED_LV;
}
}
+ else if ( (nHyph1 == 2 && nHyph2 == 7 && nHyph3 == 11 && nLen >= 16) // ll-Ssss-rrr-vvvv[vvvv][-...]
+ || (nHyph1 == 3 && nHyph2 == 8 && nHyph3 == 12 && nLen >= 17)) // lll-Ssss-rrr-vvvv[vvvv][-...]
+ {
+ if (nHyph4 < 0)
+ nHyph4 = rBcp47.getLength();
+ if (nHyph4 - nHyph3 > 4 && nHyph4 - nHyph3 <= 9)
+ {
+ rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
+ rScript = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase();
+ rCountry.clear();
+ rRegion = rBcp47.copy( nHyph2 + 1, 3);
+ rVariants = rBcp47.copy( nHyph3 + 1);
+ eRet = EXTRACTED_LR;
+ }
+ }
else if ( (nHyph1 == 2 && nHyph2 == 5 && nHyph3 == 7) // ll-CC-u-...
|| (nHyph1 == 3 && nHyph2 == 6 && nHyph3 == 8)) // lll-CC-u-...
{
@@ -2550,6 +2610,7 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp
rLanguage = "es";
rScript.clear();
rCountry = "ES";
+ rRegion.clear();
rVariants = "u-co-trad"; // not strictly a variant, but used to reconstruct the tag.
eRet = EXTRACTED_LV;
}
@@ -2565,10 +2626,26 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp
rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
rScript.clear();
rCountry = rBcp47.copy( nHyph1 + 1, 2).toAsciiUpperCase();
+ rRegion.clear();
rVariants = rBcp47.copy( nHyph2 + 1);
eRet = EXTRACTED_LV;
}
}
+ else if ( (nHyph1 == 2 && nHyph2 == 6 && nLen >= 11) // ll-rrr-vvvv[vvvv][-...]
+ || (nHyph1 == 3 && nHyph2 == 7 && nLen >= 12)) // lll-rrr-vvvv[vvvv][-...]
+ {
+ if (nHyph3 < 0)
+ nHyph3 = rBcp47.getLength();
+ if (nHyph3 - nHyph2 > 4 && nHyph3 - nHyph2 <= 9)
+ {
+ rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
+ rScript.clear();
+ rCountry.clear();
+ rRegion = rBcp47.copy( nHyph1 + 1, 3);
+ rVariants = rBcp47.copy( nHyph2 + 1);
+ eRet = EXTRACTED_LR;
+ }
+ }
else if ( (nHyph1 == 2 && nLen >= 8) // ll-vvvvv[vvv][-...]
|| (nHyph1 == 3 && nLen >= 9)) // lll-vvvvv[vvv][-...]
{
@@ -2579,6 +2656,7 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp
rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
rScript.clear();
rCountry.clear();
+ rRegion.clear();
rVariants = rBcp47.copy( nHyph1 + 1);
eRet = EXTRACTED_LV;
}
@@ -2592,6 +2670,7 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp
rLanguage = "en";
rScript.clear();
rCountry = "GB";
+ rRegion.clear();
rVariants = "oed";
eRet = EXTRACTED_LV;
}
@@ -2602,6 +2681,7 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp
rLanguage = "es";
rScript.clear();
rCountry = "ES";
+ rRegion.clear();
rVariants = "tradnl"; // this is nonsense, but... ignored.
eRet = EXTRACTED_KNOWN_BAD;
}
@@ -2613,8 +2693,19 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp
rLanguage.clear();
rScript.clear();
rCountry.clear();
+ rRegion.clear();
rVariants.clear();
}
+ else
+ {
+ assert(rLanguage.getLength() == 2 || rLanguage.getLength() == 3
+ || eRet == EXTRACTED_X_JOKER || eRet == EXTRACTED_X || eRet == EXTRACTED_C_LOCALE);
+ assert(rScript.isEmpty() || rScript.getLength() == 4);
+ assert(rCountry.isEmpty() || rRegion.isEmpty()); // [2ALPHA / 3DIGIT]
+ assert(rCountry.isEmpty() || rCountry.getLength() == 2);
+ assert(rRegion.isEmpty() || rRegion.getLength() == 3);
+ assert(rVariants.isEmpty() || rVariants.getLength() >= 4 || rVariants == "oed");
+ }
return eRet;
}
@@ -2683,10 +2774,8 @@ LanguageTagImpl::Extraction LanguageTagImpl::simpleExtract( const OUString& rBcp
::std::vector< ::std::vector< OUString > > aListFallbacks( rList.size());
size_t i = 0;
for (auto const& elem : rList)
- {
- ::std::vector< OUString > aTmp( LanguageTag(elem).getFallbackStrings( true));
- aListFallbacks[i++] = aTmp;
- }
+ aListFallbacks[i++] = LanguageTag(elem).getFallbackStrings(true);
+
for (auto const& rfb : aFallbacks)
{
size_t nPosFb = 0;
@@ -2828,9 +2917,9 @@ css::lang::Locale LanguageTag::convertToLocaleWithFallback( const OUString& rBcp
// static
-LanguageType LanguageTag::convertToLanguageTypeWithFallback( const css::lang::Locale& rLocale, bool bResolveSystem )
+LanguageType LanguageTag::convertToLanguageTypeWithFallback( const css::lang::Locale& rLocale )
{
- if (rLocale.Language.isEmpty() && !bResolveSystem)
+ if (rLocale.Language.isEmpty())
return LANGUAGE_SYSTEM;
return LanguageTag( rLocale).makeFallback().getLanguageType();
@@ -2838,7 +2927,8 @@ LanguageType LanguageTag::convertToLanguageTypeWithFallback( const css::lang::Lo
// static
-bool LanguageTag::isValidBcp47( const OUString& rString, OUString* o_pCanonicalized, bool bDisallowPrivate )
+bool LanguageTag::isValidBcp47( const OUString& rString, OUString* o_pCanonicalized,
+ LanguageTag::PrivateUse ePrivateUse )
{
bool bValid = false;
@@ -2865,30 +2955,37 @@ bool LanguageTag::isValidBcp47( const OUString& rString, OUString* o_pCanonicali
if (pTag)
{
bValid = true;
- if (bDisallowPrivate)
+ if (ePrivateUse != PrivateUse::ALLOW)
{
- const lt_string_t* pPrivate = lt_tag_get_privateuse( aVar.mpLangtag);
- if (pPrivate && lt_string_length( pPrivate) > 0)
- bValid = false;
- else
+ do
{
+ const char* pLang = nullptr;
const lt_lang_t* pLangT = lt_tag_get_language( aVar.mpLangtag);
if (pLangT)
{
- const char* pLang = lt_lang_get_tag( pLangT);
+ pLang = lt_lang_get_tag( pLangT);
if (pLang && strcmp( pLang, I18NLANGTAG_QLT_ASCII) == 0)
{
- // Disallow 'qlt' privateuse code to prevent
+ // Disallow 'qlt' localuse code to prevent
// confusion with our internal usage.
bValid = false;
+ break;
}
}
+ if (ePrivateUse == PrivateUse::ALLOW_ART_X && pLang && strcmp( pLang, "art") == 0)
+ {
+ // Allow anything 'art' which includes 'art-x-...' and 'art-Latn-x-...'.
+ break;
+ }
+ const lt_string_t* pPrivate = lt_tag_get_privateuse( aVar.mpLangtag);
+ if (pPrivate && lt_string_length( pPrivate) > 0)
+ bValid = false;
}
+ while (false);
}
if (o_pCanonicalized)
*o_pCanonicalized = OUString::createFromAscii( pTag);
free( pTag);
- return bValid;
}
}
else