summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Miklos Vajna <vmiklos@collabora.com> 2022-10-25 15:55:34 +0200
committer Xisco Fauli <xiscofauli@libreoffice.org> 2022-10-26 08:43:52 +0200
commit f3e629ff15dcf2710901dbb942cee9b3c4e38af3 (patch)
tree 2b9993a9988029fcefeb55934b156e97ed1e33f9
parent tdf#151060 sw PDF export: don't paint off-page part of drawing object (diff)
download core-f3e629ff15dcf2710901dbb942cee9b3c4e38af3.tar.gz
core-f3e629ff15dcf2710901dbb942cee9b3c4e38af3.zip
sw html import: fix handling of CDATA
In case the HTML contained markup like <![CDATA[...]]>, we simply ignored it during import, even though e.g. the ODT import handles that correctly. The reason for this is that the svtools/ HTMLParser had code to parse <!-- ... --> style comments, but not CDATA. Fix the problem by introducing a new HtmlTokenId::CDATA, producing matching token content in HTMLParser::GetNextToken_(), and finally mapping it to normal text on the Writer side. Note that HtmlTokenId doesn't allow non-on-off tokens past ONOFF_START, nor does it allow inserting a single token before ONOFF_START (that breaks getOnToken()), so for now just add a second, dummy token to avoid breakage. Change-Id: I605c3c21dc11986fda5d93d36148788a638e97b4 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/141813 Reviewed-by: Miklos Vajna <vmiklos@collabora.com> Tested-by: Jenkins (cherry picked from commit b38730ae0ae92ca49b84a45853c2ed098ee9064f) Reviewed-on: https://gerrit.libreoffice.org/c/core/+/141838 Reviewed-by: Xisco Fauli <xiscofauli@libreoffice.org>
-rw-r--r-- include/svtools/htmlkywd.hxx 1
-rw-r--r-- include/svtools/htmltokn.h 2
-rw-r--r-- svtools/qa/unit/testHtmlReader.cxx 23
-rw-r--r-- svtools/source/svhtml/htmlkywd.cxx 4
-rw-r--r-- svtools/source/svhtml/parhtml.cxx 39
-rw-r--r-- sw/source/filter/html/swhtml.cxx 1
6 files changed, 70 insertions, 0 deletions
diff --git a/include/svtools/htmlkywd.hxx b/include/svtools/htmlkywd.hxx
index 5d6b7e629fe7..9a84cddd37bf 100644
--- a/include/svtools/htmlkywd.hxx
+++ b/include/svtools/htmlkywd.hxx
@@ -32,6 +32,7 @@
#define OOO_STRING_SVTOOLS_HTML_base "base"
#define OOO_STRING_SVTOOLS_HTML_comment "!--"
#define OOO_STRING_SVTOOLS_HTML_doctype "!DOCTYPE"
+#define OOO_STRING_SVTOOLS_HTML_cdata "![cdata["
#define OOO_STRING_SVTOOLS_HTML_embed "embed"
#define OOO_STRING_SVTOOLS_HTML_horzrule "hr"
#define OOO_STRING_SVTOOLS_HTML_image "img"
diff --git a/include/svtools/htmltokn.h b/include/svtools/htmltokn.h
index bfa1f14d6812..9dca8a8f3ea7 100644
--- a/include/svtools/htmltokn.h
+++ b/include/svtools/htmltokn.h
@@ -58,6 +58,8 @@ enum class HtmlTokenId : sal_Int16
AREA, // Netscape 2.0
BASE, // HTML 3.0
COMMENT,
+ CDATA,
+ DUMMY, // so ONOFF_START is even
DOCTYPE,
EMBED, // Netscape 2.0 ignore </EMBED>
HORZRULE, // ignore </HR>
diff --git a/svtools/qa/unit/testHtmlReader.cxx b/svtools/qa/unit/testHtmlReader.cxx
index 146458a200eb..37f74e903bcc 100644
--- a/svtools/qa/unit/testHtmlReader.cxx
+++ b/svtools/qa/unit/testHtmlReader.cxx
@@ -27,6 +27,7 @@ public:
OUString m_aDocument;
int m_nLineBreakCount = 0;
+ OUString m_aCdata;
};
TestHTMLParser::TestHTMLParser(SvStream& rStream)
@@ -40,6 +41,8 @@ void TestHTMLParser::NextToken(HtmlTokenId nToken)
m_aDocument += aToken;
else if (nToken == HtmlTokenId::LINEBREAK)
++m_nLineBreakCount;
+ else if (nToken == HtmlTokenId::CDATA)
+ m_aCdata = aToken;
}
/// Tests HTMLParser.
@@ -76,6 +79,26 @@ CPPUNIT_TEST_FIXTURE(Test, testLineBreak)
// This was 2, <br></br> was interpreted as 2 line breaks in XHTML mode.
CPPUNIT_ASSERT_EQUAL(1, xParser->m_nLineBreakCount);
}
+
+CPPUNIT_TEST_FIXTURE(Test, testCdata)
+{
+ // Given a document where a CDATA section holding entity-like text sits between plain text:
+ SvMemoryStream aStream;
+ OString aDocument("A<![CDATA[B &uuml; &lt;]]>C");
+ aStream.WriteBytes(aDocument.getStr(), aDocument.getLength());
+ aStream.Seek(0);
+
+ // When parsing that HTML with the test parser, which records the text of the CDATA token:
+ tools::SvRef<TestHTMLParser> xParser = new TestHTMLParser(aStream);
+ xParser->CallParser();
+
+ // Then make sure that the CDATA token carries the section's text as-is (entities not decoded):
+ // Without the accompanying fix in place, this test would have failed with:
+ // - Expected: B &uuml; &lt;
+ // - Actual :
+ // i.e. the content inside CDATA was lost.
+ CPPUNIT_ASSERT_EQUAL(OUString("B &uuml; &lt;"), xParser->m_aCdata);
+}
}
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/svtools/source/svhtml/htmlkywd.cxx b/svtools/source/svhtml/htmlkywd.cxx
index 2d51910d85e9..584322fac8bc 100644
--- a/svtools/source/svhtml/htmlkywd.cxx
+++ b/svtools/source/svhtml/htmlkywd.cxx
@@ -27,6 +27,9 @@
#include <svtools/htmltokn.h>
#include <svtools/htmlkywd.hxx>
+// getOnToken() breaks if HtmlTokenId::ABBREVIATION_ON is odd, so enforce an even value at compile time.
+static_assert(static_cast<sal_Int16>(HtmlTokenId::ABBREVIATION_ON) % 2 == 0);
+
namespace {
template<typename T>
@@ -64,6 +67,7 @@ using HTML_TokenEntry = TokenEntry<HtmlTokenId>;
HTML_TokenEntry const aHTMLTokenTab[] = {
{std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_comment), HtmlTokenId::COMMENT},
{std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_doctype), HtmlTokenId::DOCTYPE},
+ {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_cdata), HtmlTokenId::CDATA},
{std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_anchor), HtmlTokenId::ANCHOR_ON},
{std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_abbreviation), HtmlTokenId::ABBREVIATION_ON}, // HTML 3.0
{std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_acronym), HtmlTokenId::ACRONYM_ON}, // HTML 3.0
diff --git a/svtools/source/svhtml/parhtml.cxx b/svtools/source/svhtml/parhtml.cxx
index c6962824b6b6..70d1da950172 100644
--- a/svtools/source/svhtml/parhtml.cxx
+++ b/svtools/source/svhtml/parhtml.cxx
@@ -1053,6 +1053,10 @@ HtmlTokenId HTMLParser::GetNextToken_()
do {
sTmpBuffer.appendUtf32( nNextCh );
nNextCh = GetNextChar();
+ if (std::u16string_view(sTmpBuffer) == u"![CDATA[")
+ {
+ break;
+ }
} while( '>' != nNextCh && '/' != nNextCh && !rtl::isAsciiWhiteSpace( nNextCh ) &&
!linguistic::IsControlChar(nNextCh) &&
IsParserWorking() && !rInput.eof() );
@@ -1151,6 +1155,41 @@ HtmlTokenId HTMLParser::GetNextToken_()
nNextCh = '>';
}
}
+ else if (nRet == HtmlTokenId::CDATA)
+ {
+ // Read until the closing ]]>.
+ bool bDone = false;
+ while (!bDone && !rInput.eof() && IsParserWorking())
+ {
+ if (nNextCh == '>')
+ {
+ if (sTmpBuffer.getLength() >= 2)
+ {
+ bDone = sTmpBuffer[sTmpBuffer.getLength() - 2] == ']'
+ && sTmpBuffer[sTmpBuffer.getLength() - 1] == ']';
+ if (bDone)
+ {
+ // Ignore ]] at the end.
+ sTmpBuffer.setLength(sTmpBuffer.getLength() - 2);
+ }
+ }
+ if (!bDone)
+ {
+ sTmpBuffer.appendUtf32(nNextCh);
+ }
+ }
+ else if (!linguistic::IsControlChar(nNextCh))
+ {
+ sTmpBuffer.appendUtf32(nNextCh);
+ }
+ if (!bDone)
+ {
+ nNextCh = GetNextChar();
+ }
+ }
+ aToken = sTmpBuffer;
+ sTmpBuffer.setLength(0);
+ }
else
{
// TokenString not needed anymore
diff --git a/sw/source/filter/html/swhtml.cxx b/sw/source/filter/html/swhtml.cxx
index e76421579e9f..c5b33a847560 100644
--- a/sw/source/filter/html/swhtml.cxx
+++ b/sw/source/filter/html/swhtml.cxx
@@ -1519,6 +1519,7 @@ void SwHTMLParser::NextToken( HtmlTokenId nToken )
break;
case HtmlTokenId::TEXTTOKEN:
+ case HtmlTokenId::CDATA:
// insert string without spanning attributes at the end.
if( !aToken.isEmpty() && ' '==aToken[0] && !IsReadPRE() )
{