summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKevin Suo <suokunlong@126.com>2022-10-11 10:04:16 +0800
committerXisco Fauli <xiscofauli@libreoffice.org>2022-11-01 11:17:40 +0100
commitbcb4fef4e321c6c0f1f9edd05b2f18ddb34792ed (patch)
treec5ac7a9026a93eba4916dae9e33fcb4329dfd970
parentbuild appstore packages with symbols/add targets to merge into universal build (diff)
downloadcore-bcb4fef4e321c6c0f1f9edd05b2f18ddb34792ed.tar.gz
core-bcb4fef4e321c6c0f1f9edd05b2f18ddb34792ed.zip
sdext.pdfimport: resolves tdf#104597: RTL script text runs are reversed
For the simple Arabic string: ٱلسَّلَامُ عَلَيْك, the xpdfimport binary generates the follwing (drawchar) sequences: كَ يْ لَ عَ مُ ا لَ سَّ ل ٱ (i.e., in reversed order, one character by one character). Before this patch, after pdfimport the text shows up as لَسَّلٱ كَيْلَعَ مُا, which is reversed. It was surposed to combine these characters into text frames in DrawXmlOptimizer::optimizeTextElements(Element& rParent) (sdext/source/pdfimport/\ tree/drawtreevisiting.cxx:677), but actually it was not combined successfully there. The single characters were then passed to sdext/source/pdfimport/tree/drawtreevisiting\ .cxx:105, one by one, in the hope that the strings could be mirrored. The mirroring failed, because one single character, even after mirroring, always equals itself. The DrawXmlOptimizer::optimizeTextElements failed to combine the characters into one text frame, because the condition: (rCurGC.Transformation == rNextGC.Transformation || notTransformed(rNextGC)) would never be true, as at least its horizontal position is different. A further analysis indicates that we do not need to check the transformation here at all, as this is an optimizer for a TextElement and in case a character is transformed then it would already be in a different draw element (thus will never be combined with this TextElement). After the fix of DrawXmlOptimizer::optimizeTextElements which now successfully combines the characters, there is another issue in the old PDFIProcessor::mirrorString function. It seems to mirror the characters within a word, but if a string contains two words, then the two words are not successfully switched (e.g. for string "abc def" it produces "cba fed" rather than "fed cba"), which is not suitable for the case of RTL which requires all the characters been reversed. Fix this by using comphelper::string::reverseString. Change-Id: Ifa210836f1d6666dd56205ff0d243adfb4114794 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/141231 Tested-by: Jenkins Reviewed-by: Thorsten Behrens <thorsten.behrens@allotropia.de> (cherry picked from commit 69e9925ded584113e52f84ef0ed7c224079fa061) Reviewed-on: https://gerrit.libreoffice.org/c/core/+/141319 Reviewed-by: Xisco Fauli <xiscofauli@libreoffice.org>
-rw-r--r--sdext/source/pdfimport/test/testdocs/tdf104597_textrun.pdfbin0 -> 21042 bytes
-rw-r--r--sdext/source/pdfimport/test/tests.cxx44
-rw-r--r--sdext/source/pdfimport/tree/drawtreevisiting.cxx17
-rw-r--r--sdext/source/pdfimport/tree/pdfiprocessor.cxx18
4 files changed, 48 insertions, 31 deletions
diff --git a/sdext/source/pdfimport/test/testdocs/tdf104597_textrun.pdf b/sdext/source/pdfimport/test/testdocs/tdf104597_textrun.pdf
new file mode 100644
index 000000000000..dcee96aa3169
--- /dev/null
+++ b/sdext/source/pdfimport/test/testdocs/tdf104597_textrun.pdf
Binary files differ
diff --git a/sdext/source/pdfimport/test/tests.cxx b/sdext/source/pdfimport/test/tests.cxx
index 77d98999d0d3..a520569c4164 100644
--- a/sdext/source/pdfimport/test/tests.cxx
+++ b/sdext/source/pdfimport/test/tests.cxx
@@ -29,6 +29,7 @@
#include <rtl/math.hxx>
#include <osl/file.hxx>
#include <comphelper/sequence.hxx>
+#include <comphelper/string.hxx>
#include <cppunit/TestAssert.h>
#include <cppunit/extensions/HelperMacros.h>
@@ -780,6 +781,48 @@ namespace
getXPath(pXmlDoc, xpath, "font-weight"));
}
+ void testTdf104597_textrun()
+ {
+#if HAVE_FEATURE_POPPLER
+ rtl::Reference<pdfi::PDFIRawAdaptor> xAdaptor(new pdfi::PDFIRawAdaptor(OUString(), getComponentContext()));
+ xAdaptor->setTreeVisitorFactory(createDrawTreeVisitorFactory());
+
+ OString aOutput;
+ CPPUNIT_ASSERT_MESSAGE("Converting PDF to ODF XML",
+ xAdaptor->odfConvert(m_directories.getURLFromSrc(u"/sdext/source/pdfimport/test/testdocs/tdf104597_textrun.pdf"),
+ new OutputWrapString(aOutput),
+ nullptr));
+
+ // std::cout << aOutput << std::endl;
+ xmlDocUniquePtr pXmlDoc(xmlParseDoc(reinterpret_cast<xmlChar const *>(aOutput.getStr())));
+
+ // Test for امُ عَلَيْكَ
+ // TODO: How to get the "عَلَيْكَ" in xpath, as shown after the <text:s> tag?
+ OString xpath = "//draw:frame[@draw:transform='matrix(917.222222222222 0 0 917.222222222222 14821.9583333333 2159.23861112778)']/draw:text-box/text:p/text:span";
+ OUString sContent = getXPathContent(pXmlDoc, xpath); // u"\nا\nُ\nم\n"
+ CPPUNIT_ASSERT_EQUAL(OUString(u"اُم"), sContent.replaceAll("\n", ""));
+
+ // Test for ٱلَّسَل‬ . It appears in the 3rd frame, i.e. after the امُ عَلَيْكَ which is in the 2nd frame (from left to right)
+ // thus these two frames together appear as ٱلَّسَل امُ عَلَيْكَ in Draw‬.
+ xpath = "//draw:frame[@draw:transform='matrix(917.222222222222 0 0 917.222222222222 17420.1666666667 2159.23861112778)']/draw:text-box/text:p/text:span";
+ sContent = getXPathContent(pXmlDoc, xpath);
+ CPPUNIT_ASSERT_EQUAL(OUString(u"ٱلَّسَل"), sContent.replaceAll("\n", ""));
+
+ // Test for "LibreOffice LTR"
+ // TODO: How to get the "LTR" as shown after the <text:s> tag?
+ xpath = "//draw:frame[@draw:transform='matrix(917.222222222222 0 0 917.222222222222 12779.375 5121.79583335)']/draw:text-box/text:p/text:span";
+ sContent = getXPathContent(pXmlDoc, xpath);
+ CPPUNIT_ASSERT_EQUAL(OUString(u"LibreOffice"), sContent.replaceAll("\n", ""));
+
+ /* Test for Chinese characters */
+ // Use last() instead of matrix below, because the matrix may be different on different OS due to fallback of Chinese fonts.
+ xpath = "//draw:frame[last()]/draw:text-box/text:p/text:span";
+ sContent = getXPathContent(pXmlDoc, xpath);
+ CPPUNIT_ASSERT_EQUAL(OUString(u"中文测试,中文"), sContent.replaceAll("\n", ""));
+#endif
+ }
+
+
CPPUNIT_TEST_SUITE(PDFITest);
CPPUNIT_TEST(testXPDFParser);
CPPUNIT_TEST(testOdfWriterExport);
@@ -791,6 +834,7 @@ namespace
CPPUNIT_TEST(testTdf78427_FontFeatures);
CPPUNIT_TEST(testTdf78427_FontWeight_MyraidProSemibold);
CPPUNIT_TEST(testTdf143959_nameFromFontFile);
+ CPPUNIT_TEST(testTdf104597_textrun);
CPPUNIT_TEST_SUITE_END();
};
diff --git a/sdext/source/pdfimport/tree/drawtreevisiting.cxx b/sdext/source/pdfimport/tree/drawtreevisiting.cxx
index ffc27c65f56c..5a811f20eede 100644
--- a/sdext/source/pdfimport/tree/drawtreevisiting.cxx
+++ b/sdext/source/pdfimport/tree/drawtreevisiting.cxx
@@ -32,6 +32,7 @@
#include <com/sun/star/i18n/CharacterClassification.hpp>
#include <com/sun/star/i18n/ScriptType.hpp>
#include <com/sun/star/i18n/DirectionProperty.hpp>
+#include <comphelper/string.hxx>
#include <string.h>
#include <string_view>
@@ -122,7 +123,7 @@ void DrawXmlEmitter::visit( TextElement& elem, const std::list< std::unique_ptr<
}
if (isRTL) // If so, reverse string
- str = PDFIProcessor::mirrorString( str );
+ str = ::comphelper::string::reverseString(str);
m_rEmitContext.rEmitter.beginTag( "text:span", aProps );
@@ -647,15 +648,6 @@ static bool isSpaces(TextElement* pTextElem)
return true;
}
-static bool notTransformed(const GraphicsContext& GC)
-{
- return
- rtl::math::approxEqual(GC.Transformation.get(0,0), 100.00) &&
- GC.Transformation.get(1,0) == 0.00 &&
- GC.Transformation.get(0,1) == 0.00 &&
- rtl::math::approxEqual(GC.Transformation.get(1,1), -100.00);
-}
-
void DrawXmlOptimizer::optimizeTextElements(Element& rParent)
{
if( rParent.Children.empty() ) // this should not happen
@@ -696,13 +688,12 @@ void DrawXmlOptimizer::optimizeTextElements(Element& rParent)
// line and space optimization; works only in strictly horizontal mode
// concatenate consecutive text elements unless there is a
- // font or text color or matrix change, leave a new span in that case
+ // font or text color change, leave a new span in that case
if( (pCur->FontId == pNext->FontId || isSpaces(pNext)) &&
rCurGC.FillColor.Red == rNextGC.FillColor.Red &&
rCurGC.FillColor.Green == rNextGC.FillColor.Green &&
rCurGC.FillColor.Blue == rNextGC.FillColor.Blue &&
- rCurGC.FillColor.Alpha == rNextGC.FillColor.Alpha &&
- (rCurGC.Transformation == rNextGC.Transformation || notTransformed(rNextGC))
+ rCurGC.FillColor.Alpha == rNextGC.FillColor.Alpha
)
{
pCur->updateGeometryWith( pNext );
diff --git a/sdext/source/pdfimport/tree/pdfiprocessor.cxx b/sdext/source/pdfimport/tree/pdfiprocessor.cxx
index ff105426ad51..701813942ba2 100644
--- a/sdext/source/pdfimport/tree/pdfiprocessor.cxx
+++ b/sdext/source/pdfimport/tree/pdfiprocessor.cxx
@@ -695,24 +695,6 @@ void PDFIProcessor::sortElements(Element* pEle)
pEle->Children.sort(lr_tb_sort);
}
-// helper method: get a mirrored string
-OUString PDFIProcessor::mirrorString( const OUString& i_rString )
-{
- const sal_Int32 nLen = i_rString.getLength();
- OUStringBuffer aMirror( nLen );
-
- sal_Int32 i = 0;
- while(i < nLen)
- {
- // read one code point
- const sal_uInt32 nCodePoint = i_rString.iterateCodePoints( &i );
-
- // and append it mirrored
- aMirror.appendUtf32( GetMirroredChar(nCodePoint) );
- }
- return aMirror.makeStringAndClear();
-}
-
}
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */