summaryrefslogtreecommitdiffstats
path: root/sal/qa
diff options
context:
space:
mode:
author Stephan Bergmann <sbergman@redhat.com> 2017-09-13 08:28:32 +0200
committer Stephan Bergmann <sbergman@redhat.com> 2017-09-13 08:28:32 +0200
commit 08e78607ec6bc820c52ab3df1a5d3738e049b90d (patch)
tree 64a43efbf2e834663a4a29919f8724da67a1bb34 /sal/qa
parent Change assertion failure to SAL_WARN (diff)
download core-08e78607ec6bc820c52ab3df1a5d3738e049b90d.tar.gz
core-08e78607ec6bc820c52ab3df1a5d3738e049b90d.zip
Make reading UTF-8 strict
Consider non-shortest forms, surrogates, and representations of values larger than 0x10FFFF (which can even cover five or six bytes, for historical reasons) as "invalid" (they used to be considered as "undefined" instead).

This is in response to fc670f637d4271246691904fd649358ce2e7be59 "svtools: HTML import: don't put lone surrogates in OUString" (which can now be reverted again in a follow-up commit).

My fear would have been that some places in the code rely on the original, relaxed handling, but at least 'make check' still succeeded for me.

Change-Id: I017e6c04ed3c577c3694b417167f853987a1d1ce
Diffstat (limited to 'sal/qa')
-rw-r--r-- sal/qa/rtl/textenc/rtl_textcvt.cxx 332
1 files changed, 303 insertions, 29 deletions
diff --git a/sal/qa/rtl/textenc/rtl_textcvt.cxx b/sal/qa/rtl/textenc/rtl_textcvt.cxx
index d698bc22cd74..3c36852bebfc 100644
--- a/sal/qa/rtl/textenc/rtl_textcvt.cxx
+++ b/sal/qa/rtl/textenc/rtl_textcvt.cxx
@@ -453,6 +453,8 @@ public:
void testComplexCut();
+ void testInvalidUtf8();
+
void testSRCBUFFERTOSMALL();
void testMime();
@@ -465,6 +467,7 @@ public:
CPPUNIT_TEST(testSingleByte);
CPPUNIT_TEST(testComplex);
CPPUNIT_TEST(testComplexCut);
+ CPPUNIT_TEST(testInvalidUtf8);
CPPUNIT_TEST(testSRCBUFFERTOSMALL);
CPPUNIT_TEST(testMime);
CPPUNIT_TEST(testWindows);
@@ -2330,35 +2333,6 @@ void Test::testComplex() {
true,
false,
RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR },
- { RTL_TEXTENCODING_UTF8,
- RTL_CONSTASCII_STRINGPARAM(
- "\xC0\x80\xE0\x80\x81\xF0\x80\x80\x82\xF8\x80\x80\x80\x83"
- "\xFC\x80\x80\x80\x80\x84"),
- { 0x0000,0x0001,0x0002,0x0003,0x0004 },
- 5,
- false,
- true,
- false,
- false,
- RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR },
- { RTL_TEXTENCODING_UTF8,
- RTL_CONSTASCII_STRINGPARAM("\xED\xA1\x89\xED\xB4\x93"),
- { 0xD849,0xDD13 },
- 2,
- false,
- true,
- false,
- false,
- RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR },
- { RTL_TEXTENCODING_UTF8,
- RTL_CONSTASCII_STRINGPARAM("\xED\xA1\x89\x41"),
- { 0xD849,0x0041 },
- 2,
- false,
- true,
- false,
- false,
- RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR },
// Test Java UTF-8:
@@ -2664,6 +2638,306 @@ void Test::testComplexCut() {
#endif
}
+void Test::testInvalidUtf8() { // Each scoped case below creates a fresh converter, feeds it one malformed input, and checks the produced length (and text, where any), the info flags, and the number of source bytes consumed.
+ // UTF-8, invalid bytes: with INVALID_DEFAULT and FLUSH set, each of the four bytes 80 BF FE FF is expected to come out as its own U+FFFD, with info INVALID and all four bytes consumed:
+ {
+ auto const converter = rtl_createTextToUnicodeConverter(
+ RTL_TEXTENCODING_UTF8);
+ CPPUNIT_ASSERT(converter != nullptr);
+ sal_Unicode buf[TEST_STRING_SIZE];
+ sal_uInt32 info;
+ sal_Size converted;
+ auto const size = rtl_convertTextToUnicode(
+ converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\x80\xBF\xFE\xFF"),
+ buf, TEST_STRING_SIZE,
+ (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+ | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+ &info, &converted);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(4), size);
+ CPPUNIT_ASSERT_EQUAL(
+ OUString(u"\uFFFD\uFFFD\uFFFD\uFFFD"),
+ OUString(buf, sal_Int32(size)));
+ CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(4), converted);
+ rtl_destroyTextToUnicodeConverter(converter);
+ }
+ // UTF-8, non-shortest two-byte sequence: the bytes C0 80 are expected to yield a single U+FFFD, with info INVALID and both bytes consumed:
+ {
+ auto const converter = rtl_createTextToUnicodeConverter(
+ RTL_TEXTENCODING_UTF8);
+ CPPUNIT_ASSERT(converter != nullptr);
+ sal_Unicode buf[TEST_STRING_SIZE];
+ sal_uInt32 info;
+ sal_Size converted;
+ auto const size = rtl_convertTextToUnicode(
+ converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xC0\x80"),
+ buf, TEST_STRING_SIZE,
+ (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+ | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+ &info, &converted);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+ CPPUNIT_ASSERT_EQUAL(
+ OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+ CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(2), converted);
+ rtl_destroyTextToUnicodeConverter(converter);
+ }
+ // UTF-8, cut two-byte sequence: the single byte C0 is converted with INVALID_ERROR and without FLUSH; expected to produce no output, report SRCBUFFERTOSMALL, and consume at most one byte:
+ {
+ auto const converter = rtl_createTextToUnicodeConverter(
+ RTL_TEXTENCODING_UTF8);
+ CPPUNIT_ASSERT(converter != nullptr);
+ sal_Unicode buf[TEST_STRING_SIZE];
+ sal_uInt32 info;
+ sal_Size converted;
+ auto const size = rtl_convertTextToUnicode(
+ converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xC0"), buf,
+ TEST_STRING_SIZE,
+ (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR),
+ &info, &converted);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(0), size);
+ CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL, info);
+ CPPUNIT_ASSERT(converted <= 1);
+ rtl_destroyTextToUnicodeConverter(converter);
+ }
+ // UTF-8, non-shortest three-byte sequence: the bytes E0 9F BF are expected to yield a single U+FFFD, with info INVALID and all three bytes consumed:
+ {
+ auto const converter = rtl_createTextToUnicodeConverter(
+ RTL_TEXTENCODING_UTF8);
+ CPPUNIT_ASSERT(converter != nullptr);
+ sal_Unicode buf[TEST_STRING_SIZE];
+ sal_uInt32 info;
+ sal_Size converted;
+ auto const size = rtl_convertTextToUnicode(
+ converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xE0\x9F\xBF"),
+ buf, TEST_STRING_SIZE,
+ (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+ | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+ &info, &converted);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+ CPPUNIT_ASSERT_EQUAL(
+ OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+ CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(3), converted);
+ rtl_destroyTextToUnicodeConverter(converter);
+ }
+ // UTF-8, cut three-byte sequence: the bytes E0 80 are converted with INVALID_ERROR and without FLUSH; expected to produce no output, report SRCBUFFERTOSMALL, and consume at most two bytes:
+ {
+ auto const converter = rtl_createTextToUnicodeConverter(
+ RTL_TEXTENCODING_UTF8);
+ CPPUNIT_ASSERT(converter != nullptr);
+ sal_Unicode buf[TEST_STRING_SIZE];
+ sal_uInt32 info;
+ sal_Size converted;
+ auto const size = rtl_convertTextToUnicode(
+ converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xE0\x80"), buf,
+ TEST_STRING_SIZE,
+ (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR),
+ &info, &converted);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(0), size);
+ CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL, info);
+ CPPUNIT_ASSERT(converted <= 2);
+ rtl_destroyTextToUnicodeConverter(converter);
+ }
+ // UTF-8, cut three-byte sequence followed by more: the bytes E0 80 followed by '.' are expected to yield one U+FFFD and then the '.' unchanged, with info INVALID and all three bytes consumed:
+ {
+ auto const converter = rtl_createTextToUnicodeConverter(
+ RTL_TEXTENCODING_UTF8);
+ CPPUNIT_ASSERT(converter != nullptr);
+ sal_Unicode buf[TEST_STRING_SIZE];
+ sal_uInt32 info;
+ sal_Size converted;
+ auto const size = rtl_convertTextToUnicode(
+ converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xE0\x80."), buf,
+ TEST_STRING_SIZE,
+ (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+ | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+ &info, &converted);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(2), size);
+ CPPUNIT_ASSERT_EQUAL(
+ OUString(u"\uFFFD."), OUString(buf, sal_Int32(size)));
+ CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(3), converted);
+ rtl_destroyTextToUnicodeConverter(converter);
+ }
+ // UTF-8, surrogates: the two three-byte sequences ED A0 80 and ED B0 80 are expected to yield two U+FFFD (one per sequence), with info INVALID and all six bytes consumed:
+ {
+ auto const converter = rtl_createTextToUnicodeConverter(
+ RTL_TEXTENCODING_UTF8);
+ CPPUNIT_ASSERT(converter != nullptr);
+ sal_Unicode buf[TEST_STRING_SIZE];
+ sal_uInt32 info;
+ sal_Size converted;
+ auto const size = rtl_convertTextToUnicode(
+ converter, nullptr,
+ RTL_CONSTASCII_STRINGPARAM("\xED\xA0\x80\xED\xB0\x80"), buf,
+ TEST_STRING_SIZE,
+ (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+ | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+ &info, &converted);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(2), size);
+ CPPUNIT_ASSERT_EQUAL(
+ OUString(u"\uFFFD\uFFFD"), OUString(buf, sal_Int32(size)));
+ CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(6), converted);
+ rtl_destroyTextToUnicodeConverter(converter);
+ }
+ // UTF-8, non-shortest four-byte sequence: the bytes F0 8F BF BF are expected to yield a single U+FFFD, with info INVALID and all four bytes consumed:
+ {
+ auto const converter = rtl_createTextToUnicodeConverter(
+ RTL_TEXTENCODING_UTF8);
+ CPPUNIT_ASSERT(converter != nullptr);
+ sal_Unicode buf[TEST_STRING_SIZE];
+ sal_uInt32 info;
+ sal_Size converted;
+ auto const size = rtl_convertTextToUnicode(
+ converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xF0\x8F\xBF\xBF"),
+ buf, TEST_STRING_SIZE,
+ (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+ | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+ &info, &converted);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+ CPPUNIT_ASSERT_EQUAL(
+ OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+ CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(4), converted);
+ rtl_destroyTextToUnicodeConverter(converter);
+ }
+ // UTF-8, too-large four-byte sequence: the bytes F4 90 80 80 (bit pattern of 0x110000, just past 0x10FFFF) are expected to yield a single U+FFFD, with info INVALID and all four bytes consumed:
+ {
+ auto const converter = rtl_createTextToUnicodeConverter(
+ RTL_TEXTENCODING_UTF8);
+ CPPUNIT_ASSERT(converter != nullptr);
+ sal_Unicode buf[TEST_STRING_SIZE];
+ sal_uInt32 info;
+ sal_Size converted;
+ auto const size = rtl_convertTextToUnicode(
+ converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xF4\x90\x80\x80"),
+ buf, TEST_STRING_SIZE,
+ (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+ | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+ &info, &converted);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+ CPPUNIT_ASSERT_EQUAL(
+ OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+ CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(4), converted);
+ rtl_destroyTextToUnicodeConverter(converter);
+ }
+ // UTF-8, five-byte sequence: the bytes FB BF BF BF BF are expected to yield a single U+FFFD for the whole sequence, with info INVALID and all five bytes consumed:
+ {
+ auto const converter = rtl_createTextToUnicodeConverter(
+ RTL_TEXTENCODING_UTF8);
+ CPPUNIT_ASSERT(converter != nullptr);
+ sal_Unicode buf[TEST_STRING_SIZE];
+ sal_uInt32 info;
+ sal_Size converted;
+ auto const size = rtl_convertTextToUnicode(
+ converter, nullptr,
+ RTL_CONSTASCII_STRINGPARAM("\xFB\xBF\xBF\xBF\xBF"),
+ buf, TEST_STRING_SIZE,
+ (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+ | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+ &info, &converted);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+ CPPUNIT_ASSERT_EQUAL(
+ OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+ CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(5), converted);
+ rtl_destroyTextToUnicodeConverter(converter);
+ }
+ // UTF-8, six-byte sequence: the bytes FD BF BF BF BF BF are expected to yield a single U+FFFD for the whole sequence, with info INVALID and all six bytes consumed:
+ {
+ auto const converter = rtl_createTextToUnicodeConverter(
+ RTL_TEXTENCODING_UTF8);
+ CPPUNIT_ASSERT(converter != nullptr);
+ sal_Unicode buf[TEST_STRING_SIZE];
+ sal_uInt32 info;
+ sal_Size converted;
+ auto const size = rtl_convertTextToUnicode(
+ converter, nullptr,
+ RTL_CONSTASCII_STRINGPARAM("\xFD\xBF\xBF\xBF\xBF\xBF"),
+ buf, TEST_STRING_SIZE,
+ (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+ | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+ &info, &converted);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+ CPPUNIT_ASSERT_EQUAL(
+ OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+ CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(6), converted);
+ rtl_destroyTextToUnicodeConverter(converter);
+ }
+ // Java UTF-8, U+0000: using the RTL_TEXTENCODING_JAVA_UTF8 converter, a literal zero byte is expected to yield a single U+FFFD, with info INVALID and one byte consumed:
+ {
+ auto const converter = rtl_createTextToUnicodeConverter(
+ RTL_TEXTENCODING_JAVA_UTF8);
+ CPPUNIT_ASSERT(converter != nullptr);
+ sal_Unicode buf[TEST_STRING_SIZE];
+ sal_uInt32 info;
+ sal_Size converted;
+ auto const size = rtl_convertTextToUnicode(
+ converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\0"), buf,
+ TEST_STRING_SIZE,
+ (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+ | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+ &info, &converted);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+ CPPUNIT_ASSERT_EQUAL(
+ OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+ CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(1), converted);
+ rtl_destroyTextToUnicodeConverter(converter);
+ }
+ // Java UTF-8, U+10000: using the RTL_TEXTENCODING_JAVA_UTF8 converter, the u8 literal for U+10000 is expected to yield a single U+FFFD, with info INVALID and four bytes consumed:
+ {
+ auto const converter = rtl_createTextToUnicodeConverter(
+ RTL_TEXTENCODING_JAVA_UTF8);
+ CPPUNIT_ASSERT(converter != nullptr);
+ sal_Unicode buf[TEST_STRING_SIZE];
+ sal_uInt32 info;
+ sal_Size converted;
+ auto const size = rtl_convertTextToUnicode(
+ converter, nullptr, RTL_CONSTASCII_STRINGPARAM(u8"\U00010000"), buf,
+ TEST_STRING_SIZE,
+ (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+ | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+ &info, &converted);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+ CPPUNIT_ASSERT_EQUAL(
+ OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+ CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+ CPPUNIT_ASSERT_EQUAL(sal_Size(4), converted);
+ rtl_destroyTextToUnicodeConverter(converter);
+ }
+}
+
void Test::testSRCBUFFERTOSMALL() {
rtl_TextToUnicodeConverter cv = rtl_createTextToUnicodeConverter(
RTL_TEXTENCODING_EUC_JP);