From c830ca306d55888c3f4222b7247bc0f9e6947bba Mon Sep 17 00:00:00 2001 From: Dennis Francis Date: Tue, 18 Oct 2022 16:14:37 +0530 Subject: vcl: re-exporting broken pdfs -> empty pages Certain pdf documents when loaded in LO_IMPORT_USE_PDFIUM=1 mode even if pdf-version < v1.6 sometimes have missing objects referred to by other objects for determining its stream length for instance. As a result parsing fails and results in a pdf with empty pages. A round trip through pdfium and exporting to v1.6 seems to cure the issue. Possibly it does some repairing work to determine the length of the stream in an independent pass through the file. Conflicts: vcl/source/filter/ipdf/pdfread.cxx Change-Id: Id09f67eddab4163ed12a3a3f3a73baf92e2912aa Reviewed-on: https://gerrit.libreoffice.org/c/core/+/141856 Tested-by: Jenkins Reviewed-by: Dennis Francis (cherry picked from commit 3f9e8ac6172f5b1dfd2869ee1c6aea4f24d3f480) Reviewed-on: https://gerrit.libreoffice.org/c/core/+/142137 Reviewed-by: Xisco Fauli --- include/vcl/filter/pdfdocument.hxx | 2 + vcl/Library_vcl.mk | 1 + vcl/inc/pdf/ExternalPDFStreams.hxx | 2 +- vcl/inc/pdf/pdfcompat.hxx | 42 ++++++++++++ vcl/source/filter/ipdf/pdfcompat.cxx | 114 +++++++++++++++++++++++++++++++++ vcl/source/filter/ipdf/pdfdocument.cxx | 14 ++++ vcl/source/filter/ipdf/pdfread.cxx | 110 +++---------------------------- 7 files changed, 183 insertions(+), 102 deletions(-) create mode 100644 vcl/inc/pdf/pdfcompat.hxx create mode 100644 vcl/source/filter/ipdf/pdfcompat.cxx diff --git a/include/vcl/filter/pdfdocument.hxx b/include/vcl/filter/pdfdocument.hxx index dd03029227d2..fbe0be89cdc6 100644 --- a/include/vcl/filter/pdfdocument.hxx +++ b/include/vcl/filter/pdfdocument.hxx @@ -576,6 +576,8 @@ public: //@{ /// Read elements from the start of the stream till its end. bool Read(SvStream& rStream); + /// Calls Read() first and if it fails it tries to fixup and then retry. 
+ bool ReadWithPossibleFixup(SvStream& rStream); void SetSignatureLine(std::vector&& rSignatureLine); void SetSignaturePage(size_t nPage); /// Sign the read document with xCertificate in the edit buffer. diff --git a/vcl/Library_vcl.mk b/vcl/Library_vcl.mk index 6910927bfde0..1d18d2325ad8 100644 --- a/vcl/Library_vcl.mk +++ b/vcl/Library_vcl.mk @@ -451,6 +451,7 @@ $(eval $(call gb_Library_add_exception_objects,vcl,\ vcl/source/filter/ipict/ipict \ vcl/source/filter/ipsd/ipsd \ vcl/source/filter/ipict/shape \ + vcl/source/filter/ipdf/pdfcompat \ vcl/source/filter/ipdf/pdfread \ vcl/source/filter/ipdf/pdfdocument \ vcl/source/filter/iras/iras \ diff --git a/vcl/inc/pdf/ExternalPDFStreams.hxx b/vcl/inc/pdf/ExternalPDFStreams.hxx index 7840217630c8..45b15f7a74bc 100644 --- a/vcl/inc/pdf/ExternalPDFStreams.hxx +++ b/vcl/inc/pdf/ExternalPDFStreams.hxx @@ -42,7 +42,7 @@ struct VCL_DLLPUBLIC ExternalPDFStream aPDFStream.WriteBytes(maDataContainer.getData(), maDataContainer.getSize()); aPDFStream.Seek(0); auto pPDFDocument = std::make_shared(); - if (!pPDFDocument->Read(aPDFStream)) + if (!pPDFDocument->ReadWithPossibleFixup(aPDFStream)) { SAL_WARN("vcl.pdfwriter", "PDFWriterImpl::writeReferenceXObject: reading the PDF document failed"); diff --git a/vcl/inc/pdf/pdfcompat.hxx b/vcl/inc/pdf/pdfcompat.hxx new file mode 100644 index 000000000000..8f629b3bc8ee --- /dev/null +++ b/vcl/inc/pdf/pdfcompat.hxx @@ -0,0 +1,42 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace vcl::pdf +{ +/// Convert points to inches, then scale by the given resolution (DPI). 
+inline double pointToPixel(const double fPoint, const double fResolutionDPI) +{ + return o3tl::convert(fPoint, o3tl::Length::pt, o3tl::Length::in) * fResolutionDPI; +} + +/// Decide if PDF data is old enough to be compatible. +bool isCompatible(SvStream& rInStream, sal_uInt64 nPos, sal_uInt64 nSize); + +/// Converts to highest supported format version (currently 1.6). +/// Usually used to deal with missing referenced objects in the +/// source pdf stream. +bool convertToHighestSupported(SvStream& rInStream, SvStream& rOutStream); + +/// Takes care of transparently downgrading the version of the PDF stream in +/// case it's too new for our PDF export. +bool getCompatibleStream(SvStream& rInStream, SvStream& rOutStream); + +BinaryDataContainer createBinaryDataContainer(SvStream& rStream); + +} // end of vcl::pdf namespace + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/vcl/source/filter/ipdf/pdfcompat.cxx b/vcl/source/filter/ipdf/pdfcompat.cxx new file mode 100644 index 000000000000..62413e585be9 --- /dev/null +++ b/vcl/source/filter/ipdf/pdfcompat.cxx @@ -0,0 +1,114 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include + +#include +#include +#include + +namespace vcl::pdf +{ +/// Decide if PDF data is old enough to be compatible. 
+bool isCompatible(SvStream& rInStream, sal_uInt64 nPos, sal_uInt64 nSize) +{ + if (nSize < 8) + return false; + + // %PDF-x.y + sal_uInt8 aFirstBytes[8]; + rInStream.Seek(nPos); + sal_uLong nRead = rInStream.ReadBytes(aFirstBytes, 8); + if (nRead < 8) + return false; + + if (aFirstBytes[0] != '%' || aFirstBytes[1] != 'P' || aFirstBytes[2] != 'D' + || aFirstBytes[3] != 'F' || aFirstBytes[4] != '-') + return false; + + sal_Int32 nMajor = o3tl::toInt32(std::string_view(reinterpret_cast(&aFirstBytes[5]), 1)); + sal_Int32 nMinor = o3tl::toInt32(std::string_view(reinterpret_cast(&aFirstBytes[7]), 1)); + return !(nMajor > 1 || (nMajor == 1 && nMinor > 6)); +} + +/// Converts to highest supported format version (1.6). +/// Usually used to deal with missing referenced objects in source +/// pdf stream. +bool convertToHighestSupported(SvStream& rInStream, SvStream& rOutStream) +{ + sal_uInt64 nPos = STREAM_SEEK_TO_BEGIN; + sal_uInt64 nSize = STREAM_SEEK_TO_END; + rInStream.Seek(nPos); + // Convert to PDF-1.6. + auto pPdfium = vcl::pdf::PDFiumLibrary::get(); + if (!pPdfium) + return false; + + // Read input into a buffer. + SvMemoryStream aInBuffer; + aInBuffer.WriteStream(rInStream, nSize); + + SvMemoryStream aSaved; + { + // Load the buffer using pdfium. + std::unique_ptr pPdfDocument + = pPdfium->openDocument(aInBuffer.GetData(), aInBuffer.GetSize(), OString()); + if (!pPdfDocument) + return false; + + // 16 means PDF-1.6. + if (!pPdfDocument->saveWithVersion(aSaved, 16)) + return false; + } + + aSaved.Seek(STREAM_SEEK_TO_BEGIN); + rOutStream.WriteStream(aSaved); + + return rOutStream.good(); +} + +/// Takes care of transparently downgrading the version of the PDF stream in +/// case it's too new for our PDF export. 
+bool getCompatibleStream(SvStream& rInStream, SvStream& rOutStream) +{ + sal_uInt64 nPos = STREAM_SEEK_TO_BEGIN; + sal_uInt64 nSize = STREAM_SEEK_TO_END; + bool bCompatible = isCompatible(rInStream, nPos, nSize); + rInStream.Seek(nPos); + if (bCompatible) + // Not converting. + rOutStream.WriteStream(rInStream, nSize); + else + convertToHighestSupported(rInStream, rOutStream); + + return rOutStream.good(); +} + +BinaryDataContainer createBinaryDataContainer(SvStream& rStream) +{ + // Save the original PDF stream for later use. + SvMemoryStream aMemoryStream; + if (!getCompatibleStream(rStream, aMemoryStream)) + return {}; + + const sal_uInt32 nStreamLength = aMemoryStream.TellEnd(); + + auto aPdfData = std::make_unique>(nStreamLength); + + aMemoryStream.Seek(STREAM_SEEK_TO_BEGIN); + aMemoryStream.ReadBytes(aPdfData->data(), aPdfData->size()); + if (aMemoryStream.GetError()) + return {}; + + return { std::move(aPdfData) }; +} + +} // end vcl::pdf namespace + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/vcl/source/filter/ipdf/pdfdocument.cxx b/vcl/source/filter/ipdf/pdfdocument.cxx index 493826e38f8f..a93083ce85a8 100644 --- a/vcl/source/filter/ipdf/pdfdocument.cxx +++ b/vcl/source/filter/ipdf/pdfdocument.cxx @@ -8,6 +8,8 @@ */ #include +#include +#include #include #include @@ -1349,6 +1351,18 @@ void PDFDocument::SetIDObject(size_t nID, PDFObjectElement* pObject) m_aIDObjects[nID] = pObject; } +bool PDFDocument::ReadWithPossibleFixup(SvStream& rStream) +{ + if (Read(rStream)) + return true; + + // Read failed, try a roundtrip through pdfium and then retry. + rStream.Seek(0); + SvMemoryStream aStandardizedStream; + vcl::pdf::convertToHighestSupported(rStream, aStandardizedStream); + return Read(aStandardizedStream); +} + bool PDFDocument::Read(SvStream& rStream) { // Check file magic. 
diff --git a/vcl/source/filter/ipdf/pdfread.cxx b/vcl/source/filter/ipdf/pdfread.cxx index e99682e0c0e4..c6bc4fd5b282 100644 --- a/vcl/source/filter/ipdf/pdfread.cxx +++ b/vcl/source/filter/ipdf/pdfread.cxx @@ -8,8 +8,7 @@ */ #include - -#include +#include #include #include @@ -23,99 +22,6 @@ using namespace com::sun::star; -namespace -{ -/// Convert to inch, then assume 96 DPI. -inline double pointToPixel(const double fPoint, const double fResolutionDPI) -{ - return o3tl::convert(fPoint, o3tl::Length::pt, o3tl::Length::in) * fResolutionDPI; -} - -/// Decide if PDF data is old enough to be compatible. -bool isCompatible(SvStream& rInStream, sal_uInt64 nPos, sal_uInt64 nSize) -{ - if (nSize < 8) - return false; - - // %PDF-x.y - sal_uInt8 aFirstBytes[8]; - rInStream.Seek(nPos); - sal_uLong nRead = rInStream.ReadBytes(aFirstBytes, 8); - if (nRead < 8) - return false; - - if (aFirstBytes[0] != '%' || aFirstBytes[1] != 'P' || aFirstBytes[2] != 'D' - || aFirstBytes[3] != 'F' || aFirstBytes[4] != '-') - return false; - - sal_Int32 nMajor = o3tl::toInt32(std::string_view(reinterpret_cast(&aFirstBytes[5]), 1)); - sal_Int32 nMinor = o3tl::toInt32(std::string_view(reinterpret_cast(&aFirstBytes[7]), 1)); - return !(nMajor > 1 || (nMajor == 1 && nMinor > 6)); -} - -/// Takes care of transparently downgrading the version of the PDF stream in -/// case it's too new for our PDF export. -bool getCompatibleStream(SvStream& rInStream, SvStream& rOutStream) -{ - sal_uInt64 nPos = STREAM_SEEK_TO_BEGIN; - sal_uInt64 nSize = STREAM_SEEK_TO_END; - bool bCompatible = isCompatible(rInStream, nPos, nSize); - rInStream.Seek(nPos); - if (bCompatible) - // Not converting. - rOutStream.WriteStream(rInStream, nSize); - else - { - // Downconvert to PDF-1.6. - auto pPdfium = vcl::pdf::PDFiumLibrary::get(); - if (!pPdfium) - return false; - - // Read input into a buffer. 
- SvMemoryStream aInBuffer; - aInBuffer.WriteStream(rInStream, nSize); - - SvMemoryStream aSaved; - { - // Load the buffer using pdfium. - std::unique_ptr pPdfDocument - = pPdfium->openDocument(aInBuffer.GetData(), aInBuffer.GetSize(), OString()); - if (!pPdfDocument) - return false; - - // 16 means PDF-1.6. - if (!pPdfDocument->saveWithVersion(aSaved, 16)) - return false; - } - - aSaved.Seek(STREAM_SEEK_TO_BEGIN); - rOutStream.WriteStream(aSaved); - } - - return rOutStream.good(); -} - -BinaryDataContainer createBinaryDataContainer(SvStream& rStream) -{ - // Save the original PDF stream for later use. - SvMemoryStream aMemoryStream; - if (!getCompatibleStream(rStream, aMemoryStream)) - return {}; - - const sal_uInt32 nStreamLength = aMemoryStream.TellEnd(); - - auto aPdfData = std::make_unique>(nStreamLength); - - aMemoryStream.Seek(STREAM_SEEK_TO_BEGIN); - aMemoryStream.ReadBytes(aPdfData->data(), aPdfData->size()); - if (aMemoryStream.GetError()) - return {}; - - return { std::move(aPdfData) }; -} - -} // end anonymous namespace - namespace vcl { size_t RenderPDFBitmaps(const void* pBuffer, int nSize, std::vector& rBitmaps, @@ -160,10 +66,12 @@ size_t RenderPDFBitmaps(const void* pBuffer, int nSize, std::vector& r // Returned unit is points, convert that to pixel. 
- const size_t nPageWidth = std::round(pointToPixel(nPageWidthPoints, fResolutionDPI) - * PDF_INSERT_MAGIC_SCALE_FACTOR); - const size_t nPageHeight = std::round(pointToPixel(nPageHeightPoints, fResolutionDPI) - * PDF_INSERT_MAGIC_SCALE_FACTOR); + const size_t nPageWidth + = std::round(vcl::pdf::pointToPixel(nPageWidthPoints, fResolutionDPI) + * PDF_INSERT_MAGIC_SCALE_FACTOR); + const size_t nPageHeight + = std::round(vcl::pdf::pointToPixel(nPageHeightPoints, fResolutionDPI) + * PDF_INSERT_MAGIC_SCALE_FACTOR); std::unique_ptr pPdfBitmap = pPdfium->createBitmap(nPageWidth, nPageHeight, /*nAlpha=*/1); if (!pPdfBitmap) @@ -222,7 +130,7 @@ size_t RenderPDFBitmaps(const void* pBuffer, int nSize, std::vector& r bool importPdfVectorGraphicData(SvStream& rStream, std::shared_ptr& rVectorGraphicData) { - BinaryDataContainer aDataContainer = createBinaryDataContainer(rStream); + BinaryDataContainer aDataContainer = vcl::pdf::createBinaryDataContainer(rStream); if (aDataContainer.isEmpty()) { SAL_WARN("vcl.filter", "ImportPDF: empty PDF data array"); @@ -433,7 +341,7 @@ size_t ImportPDFUnloaded(const OUString& rURL, std::vector& rG ::utl::UcbStreamHelper::CreateStream(rURL, StreamMode::READ | StreamMode::SHARE_DENYNONE)); // Save the original PDF stream for later use. - BinaryDataContainer aDataContainer = createBinaryDataContainer(*xStream); + BinaryDataContainer aDataContainer = vcl::pdf::createBinaryDataContainer(*xStream); if (aDataContainer.isEmpty()) return 0; -- cgit