diff options
-rw-r--r-- | external/pdfium/0012-svx-import-processed-PDF-text.patch.2 | 148 | ||||
-rw-r--r-- | external/pdfium/UnpackedTarball_pdfium.mk | 1 | ||||
-rw-r--r-- | svx/source/svdraw/svdpdf.cxx | 30 | ||||
-rw-r--r-- | svx/source/svdraw/svdpdf.hxx | 11 |
4 files changed, 173 insertions, 17 deletions
diff --git a/external/pdfium/0012-svx-import-processed-PDF-text.patch.2 b/external/pdfium/0012-svx-import-processed-PDF-text.patch.2 new file mode 100644 index 000000000000..cae9ec808aba --- /dev/null +++ b/external/pdfium/0012-svx-import-processed-PDF-text.patch.2 @@ -0,0 +1,148 @@ +From 7e8ecec81f102993e3fe73256415dcf049c09e29 Mon Sep 17 00:00:00 2001 +From: Ashod Nakashian <ashod.nakashian@collabora.co.uk> +Date: Tue, 5 Jun 2018 11:35:39 +0200 +Subject: [PATCH 12/14] svx: import processed PDF text + +--- + pdfium/core/fpdftext/cpdf_textpage.cpp | 29 ++++++++++++++++++++++++ + pdfium/core/fpdftext/cpdf_textpage.h | 2 ++ + pdfium/fpdfsdk/fpdf_editpage.cpp | 41 ++++++++++++++++++++++++++++++++++ + pdfium/public/fpdf_edit.h | 13 +++++++++++ + 4 files changed, 85 insertions(+) + +diff --git a/pdfium/core/fpdftext/cpdf_textpage.cpp b/pdfium/core/fpdftext/cpdf_textpage.cpp +index 5690698..4d7c48a 100644 +--- a/pdfium/core/fpdftext/cpdf_textpage.cpp ++++ b/pdfium/core/fpdftext/cpdf_textpage.cpp +@@ -1464,3 +1464,32 @@ Optional<PAGECHAR_INFO> CPDF_TextPage::GenerateCharInfo(wchar_t unicode) { + info.m_Origin.x, info.m_Origin.y); + return info; + } ++ ++WideString CPDF_TextPage::GetTextObjectText(CPDF_TextObject* pTextObj) ++{ ++ if (!m_bIsParsed) ++ return WideString(); ++ ++ float posy = 0; ++ bool IsContainPreChar = false; ++ bool IsAddLineFeed = false; ++ WideString strText; ++ for (const auto& charinfo : m_CharList) { ++ if (charinfo.m_pTextObj == pTextObj) { ++ IsContainPreChar = true; ++ IsAddLineFeed = false; ++ if (charinfo.m_Unicode) ++ strText += charinfo.m_Unicode; ++ } else if (charinfo.m_Unicode == 32) { ++ if (IsContainPreChar && charinfo.m_Unicode) { ++ strText += charinfo.m_Unicode; ++ IsContainPreChar = false; ++ IsAddLineFeed = false; ++ } ++ } else { ++ IsContainPreChar = false; ++ IsAddLineFeed = true; ++ } ++ } ++ return strText; ++} +diff --git a/pdfium/core/fpdftext/cpdf_textpage.h b/pdfium/core/fpdftext/cpdf_textpage.h +index 43a0312..7d5d5ec 
100644 +--- a/pdfium/core/fpdftext/cpdf_textpage.h ++++ b/pdfium/core/fpdftext/cpdf_textpage.h +@@ -105,6 +105,8 @@ class CPDF_TextPage { + WideString GetPageText(int start, int count) const; + WideString GetAllPageText() const { return GetPageText(0, CountChars()); } + ++ WideString GetTextObjectText(CPDF_TextObject* pTextObj); ++ + int CountRects(int start, int nCount); + bool GetRect(int rectIndex, CFX_FloatRect* pRect) const; + +diff --git a/pdfium/fpdfsdk/fpdf_editpage.cpp b/pdfium/fpdfsdk/fpdf_editpage.cpp +index f4a1688..f34d3b5 100644 +--- a/pdfium/fpdfsdk/fpdf_editpage.cpp ++++ b/pdfium/fpdfsdk/fpdf_editpage.cpp +@@ -27,6 +27,7 @@ + #include "core/fpdfapi/parser/cpdf_string.h" + #include "core/fpdfdoc/cpdf_annot.h" + #include "core/fpdfdoc/cpdf_annotlist.h" ++#include "core/fpdftext/cpdf_textpage.h" + #include "fpdfsdk/cpdfsdk_helpers.h" + #include "public/fpdf_formfill.h" + #include "third_party/base/logging.h" +@@ -732,6 +733,46 @@ FPDF_EXPORT int FPDF_CALLCONV FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object, + return ret_count; + } + ++FPDF_EXPORT int FPDF_CALLCONV ++FPDFTextObj_GetTextProcessed(FPDF_PAGEOBJECT text_object, ++ FPDF_TEXTPAGE page, ++ int char_start, ++ int char_count, ++ unsigned short* result) ++{ ++ if (!page || !text_object || char_start < 0 || char_count < 0 || !result) ++ return 0; ++ ++ CPDF_TextObject* pTxtObj = CPDFTextObjectFromFPDFPageObject(text_object); ++ CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(page); ++ int char_available = textpage->CountChars() - char_start; ++ if (char_available <= 0) ++ return 0; ++ ++ char_count = std::min(char_count, char_available); ++ if (char_count == 0) { ++ // Writing out "", which has a character count of 1 due to the NUL. 
++ *result = '\0'; ++ return 1; ++ } ++ ++ WideString str = textpage->GetTextObjectText(pTxtObj); ++ ++ if (str.GetLength() > static_cast<size_t>(char_count)) ++ str = str.Left(static_cast<size_t>(char_count)); ++ ++ // UTF16LE_Encode doesn't handle surrogate pairs properly, so it is expected ++ // the number of items to stay the same. ++ ByteString byte_str = str.UTF16LE_Encode(); ++ size_t byte_str_len = byte_str.GetLength(); ++ constexpr size_t kBytesPerCharacter = sizeof(unsigned short); ++ int ret_count = byte_str_len / kBytesPerCharacter; ++ ++ ASSERT(ret_count <= char_count + 1); // +1 to account for the NUL terminator. ++ memcpy(result, byte_str.GetBuffer(byte_str_len), byte_str_len); ++ return ret_count; ++} ++ + FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV + FPDFTextObj_GetColor(FPDF_PAGEOBJECT text_object, + unsigned int* R, +diff --git a/pdfium/public/fpdf_edit.h b/pdfium/public/fpdf_edit.h +index f249e64..e14b2a5 100644 +--- a/pdfium/public/fpdf_edit.h ++++ b/pdfium/public/fpdf_edit.h +@@ -1065,6 +1065,19 @@ FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object, + int char_count, + unsigned short* result); + ++// Get the processed text of a text object. ++// ++// text_object - Handle of text object returned by FPDFPageObj_NewTextObj ++// or FPDFPageObj_NewTextObjEx. ++// Return Value: ++// The number of characters (not bytes) written in result. ++FPDF_EXPORT int FPDF_CALLCONV ++FPDFTextObj_GetTextProcessed(FPDF_PAGEOBJECT text_object, ++ FPDF_TEXTPAGE page, ++ int char_start, ++ int char_count, ++ unsigned short* result); ++ + // Get the stroke RGBA of a text. Range of values: 0 - 255. + // + // path - the handle to the path object. 
+-- +2.16.3 + diff --git a/external/pdfium/UnpackedTarball_pdfium.mk b/external/pdfium/UnpackedTarball_pdfium.mk index 6880ac0b670b..5525e9ddf65d 100644 --- a/external/pdfium/UnpackedTarball_pdfium.mk +++ b/external/pdfium/UnpackedTarball_pdfium.mk @@ -25,6 +25,7 @@ pdfium_patches += 0008-svx-correct-the-positioning-of-PDF-Paths-and-the-str.patc pdfium_patches += 0009-svx-support-color-text-for-imported-PDFs.patch.2 pdfium_patches += 0010-svx-support-importing-forms-from-PDFs.patch.2 pdfium_patches += 0011-svx-correctly-possition-form-objects-from-PDF.patch.2 +pdfium_patches += 0012-svx-import-processed-PDF-text.patch.2 $(eval $(call gb_UnpackedTarball_UnpackedTarball,pdfium)) diff --git a/svx/source/svdraw/svdpdf.cxx b/svx/source/svdraw/svdpdf.cxx index f2fbd7a835ed..1946b61c29bb 100644 --- a/svx/source/svdraw/svdpdf.cxx +++ b/svx/source/svdraw/svdpdf.cxx @@ -225,13 +225,18 @@ void ImpSdrPdfImport::DoLoopActions(SvdProgressInfo* pProgrInfo, sal_uInt32* pAc << ", height: " << dPageHeight); SetupPageScale(dPageWidth, dPageHeight); + // Load the page text to extract it when we get text elements. + FPDF_TEXTPAGE pTextPage = FPDFText_LoadPage(pPdfPage); + const int nPageObjectCount = FPDFPage_CountObject(pPdfPage); for (int nPageObjectIndex = 0; nPageObjectIndex < nPageObjectCount; ++nPageObjectIndex) { FPDF_PAGEOBJECT pPageObject = FPDFPage_GetObject(pPdfPage, nPageObjectIndex); - ImportPdfObject(pPageObject, nPageObjectIndex); + ImportPdfObject(pPageObject, pTextPage, nPageObjectIndex); } + FPDFText_ClosePage(pTextPage); + #if 0 // Now do the text. 
FPDF_TEXTPAGE pTextPage = FPDFText_LoadPage(pPdfPage); @@ -990,8 +995,8 @@ void ImpSdrPdfImport::checkClip() } bool ImpSdrPdfImport::isClip() const { return !maClip.getB2DRange().isEmpty(); } - -void ImpSdrPdfImport::ImportPdfObject(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex) +void ImpSdrPdfImport::ImportPdfObject(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage, + int nPageObjectIndex) { if (pPageObject == nullptr) return; @@ -1000,7 +1005,7 @@ void ImpSdrPdfImport::ImportPdfObject(FPDF_PAGEOBJECT pPageObject, int nPageObje switch (nPageObjectType) { case FPDF_PAGEOBJ_TEXT: - ImportText(pPageObject, nPageObjectIndex); + ImportText(pPageObject, pTextPage, nPageObjectIndex); break; case FPDF_PAGEOBJ_PATH: ImportPath(pPageObject, nPageObjectIndex); @@ -1012,7 +1017,7 @@ void ImpSdrPdfImport::ImportPdfObject(FPDF_PAGEOBJECT pPageObject, int nPageObje SAL_WARN("sd.filter", "Got page object SHADING: " << nPageObjectIndex); break; case FPDF_PAGEOBJ_FORM: - ImportForm(pPageObject, nPageObjectIndex); + ImportForm(pPageObject, pTextPage, nPageObjectIndex); break; default: SAL_WARN("sd.filter", "Unknown PDF page object #" << nPageObjectIndex @@ -1021,7 +1026,8 @@ void ImpSdrPdfImport::ImportPdfObject(FPDF_PAGEOBJECT pPageObject, int nPageObje } } -void ImpSdrPdfImport::ImportForm(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex) +void ImpSdrPdfImport::ImportForm(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage, + int nPageObjectIndex) { SAL_WARN("sd.filter", "Got page object FORM: " << nPageObjectIndex); @@ -1036,14 +1042,15 @@ void ImpSdrPdfImport::ImportForm(FPDF_PAGEOBJECT pPageObject, int nPageObjectInd for (int nIndex = 0; nIndex < nCount; ++nIndex) { FPDF_PAGEOBJECT pFormObject = FPDFFormObj_GetSubObject(pPageObject, nIndex); - ImportPdfObject(pFormObject, -1); + ImportPdfObject(pFormObject, pTextPage, -1); } // Restore the old one. 
mCurMatrix = aOldMatrix; } -void ImpSdrPdfImport::ImportText(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex) +void ImpSdrPdfImport::ImportText(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage, + int nPageObjectIndex) { SAL_WARN("sd.filter", "Got page object TEXT: " << nPageObjectIndex); float left; @@ -1075,14 +1082,15 @@ void ImpSdrPdfImport::ImportText(FPDF_PAGEOBJECT pPageObject, int nPageObjectInd SAL_WARN("sd.filter", "Got TEXT origin: " << aPos); SAL_WARN("sd.filter", "Got TEXT Bounds: " << aRect); - const int nChars = FPDFTextObj_CountChars(pPageObject); + const int nChars = FPDFTextObj_CountChars(pPageObject) * 2; std::unique_ptr<sal_Unicode[]> pText(new sal_Unicode[nChars + 1]); // + terminating null unsigned short* pShortText = reinterpret_cast<unsigned short*>(pText.get()); - const int nActualChars = FPDFTextObj_GetText(pPageObject, 0, nChars, pShortText); + const int nActualChars + = FPDFTextObj_GetTextProcessed(pPageObject, pTextPage, 0, nChars, pShortText); if (nActualChars <= 0) { - SAL_WARN("sd.filter", "Got not TEXT"); + SAL_WARN("sd.filter", "Got no TEXT"); return; } diff --git a/svx/source/svdraw/svdpdf.hxx b/svx/source/svdraw/svdpdf.hxx index 460b508e83a8..da54b9a40fa8 100644 --- a/svx/source/svdraw/svdpdf.hxx +++ b/svx/source/svdraw/svdpdf.hxx @@ -42,6 +42,7 @@ class SdrObject; class SvdProgressInfo; typedef void* FPDF_DOCUMENT; typedef void* FPDF_PAGEOBJECT; +typedef void* FPDF_TEXTPAGE; // Helper Class to import PDF class ImpSdrPdfImport final @@ -86,7 +87,6 @@ class ImpSdrPdfImport final double d() const { return md; } double e() const { return me; } double f() const { return mf; } - /// Mutliply this * other. void Concatinate(const Matrix& other) { @@ -156,7 +156,6 @@ class ImpSdrPdfImport final /// Correct the vertical coordinate to start at the top. /// PDF coordinate system has orign at the bottom right. 
double correctVertOrigin(double offsetPts) const { return mdPageHeightPts - offsetPts; } - /// Convert PDF points to logic (twips). tools::Rectangle PointsToLogic(double left, double right, double top, double bottom) const; Point PointsToLogic(double x, double y) const; @@ -165,11 +164,12 @@ class ImpSdrPdfImport final void checkClip(); bool isClip() const; - void ImportPdfObject(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex); - void ImportForm(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex); + void ImportPdfObject(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage, + int nPageObjectIndex); + void ImportForm(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage, int nPageObjectIndex); void ImportImage(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex); void ImportPath(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex); - void ImportText(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex); + void ImportText(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage, int nPageObjectIndex); void ImportText(const Point& rPos, const Size& rSize, const OUString& rStr); void SetupPageScale(const double dPageWidth, const double dPageHeight); @@ -193,7 +193,6 @@ public: ~ImpSdrPdfImport(); int GetPageCount() const { return mnPageCount; } - size_t DoImport(SdrObjList& rDestList, size_t nInsPos, int nPageNumber, SvdProgressInfo* pProgrInfo = nullptr); }; |