summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--external/pdfium/0012-svx-import-processed-PDF-text.patch.2148
-rw-r--r--external/pdfium/UnpackedTarball_pdfium.mk1
-rw-r--r--svx/source/svdraw/svdpdf.cxx30
-rw-r--r--svx/source/svdraw/svdpdf.hxx11
4 files changed, 173 insertions, 17 deletions
diff --git a/external/pdfium/0012-svx-import-processed-PDF-text.patch.2 b/external/pdfium/0012-svx-import-processed-PDF-text.patch.2
new file mode 100644
index 000000000000..cae9ec808aba
--- /dev/null
+++ b/external/pdfium/0012-svx-import-processed-PDF-text.patch.2
@@ -0,0 +1,148 @@
+From 7e8ecec81f102993e3fe73256415dcf049c09e29 Mon Sep 17 00:00:00 2001
+From: Ashod Nakashian <ashod.nakashian@collabora.co.uk>
+Date: Tue, 5 Jun 2018 11:35:39 +0200
+Subject: [PATCH 12/14] svx: import processed PDF text
+
+---
+ pdfium/core/fpdftext/cpdf_textpage.cpp | 29 ++++++++++++++++++++++++
+ pdfium/core/fpdftext/cpdf_textpage.h | 2 ++
+ pdfium/fpdfsdk/fpdf_editpage.cpp | 41 ++++++++++++++++++++++++++++++++++
+ pdfium/public/fpdf_edit.h | 13 +++++++++++
+ 4 files changed, 85 insertions(+)
+
+diff --git a/pdfium/core/fpdftext/cpdf_textpage.cpp b/pdfium/core/fpdftext/cpdf_textpage.cpp
+index 5690698..4d7c48a 100644
+--- a/pdfium/core/fpdftext/cpdf_textpage.cpp
++++ b/pdfium/core/fpdftext/cpdf_textpage.cpp
+@@ -1464,3 +1464,32 @@ Optional<PAGECHAR_INFO> CPDF_TextPage::GenerateCharInfo(wchar_t unicode) {
+ info.m_Origin.x, info.m_Origin.y);
+ return info;
+ }
++
++// Returns the text of |pTextObj| as found in the parsed character list: its
++// own non-NUL characters, plus a space (U+0020) entry not owned by it when
++// that entry directly follows one of them. Empty if the page is unparsed.
++WideString CPDF_TextPage::GetTextObjectText(CPDF_TextObject* pTextObj)
++{
++  if (!m_bIsParsed)
++    return WideString();
++
++  // True while the previously visited entry belonged to |pTextObj|.
++  bool bPrevCharInObject = false;
++  WideString strText;
++  for (const auto& charinfo : m_CharList) {
++    if (charinfo.m_pTextObj == pTextObj) {
++      bPrevCharInObject = true;
++      if (charinfo.m_Unicode)
++        strText += charinfo.m_Unicode;
++    } else if (charinfo.m_Unicode == 32) {
++      // Keep at most one foreign space after a run of our own characters.
++      if (bPrevCharInObject)
++        strText += charinfo.m_Unicode;
++      bPrevCharInObject = false;
++    } else {
++      // Any other foreign character ends the current run.
++      bPrevCharInObject = false;
++    }
++  }
++  return strText;
++}
+diff --git a/pdfium/core/fpdftext/cpdf_textpage.h b/pdfium/core/fpdftext/cpdf_textpage.h
+index 43a0312..7d5d5ec 100644
+--- a/pdfium/core/fpdftext/cpdf_textpage.h
++++ b/pdfium/core/fpdftext/cpdf_textpage.h
+@@ -105,6 +105,8 @@ class CPDF_TextPage {
+ WideString GetPageText(int start, int count) const;
+ WideString GetAllPageText() const { return GetPageText(0, CountChars()); }
+
++ WideString GetTextObjectText(CPDF_TextObject* pTextObj);
++
+ int CountRects(int start, int nCount);
+ bool GetRect(int rectIndex, CFX_FloatRect* pRect) const;
+
+diff --git a/pdfium/fpdfsdk/fpdf_editpage.cpp b/pdfium/fpdfsdk/fpdf_editpage.cpp
+index f4a1688..f34d3b5 100644
+--- a/pdfium/fpdfsdk/fpdf_editpage.cpp
++++ b/pdfium/fpdfsdk/fpdf_editpage.cpp
+@@ -27,6 +27,7 @@
+ #include "core/fpdfapi/parser/cpdf_string.h"
+ #include "core/fpdfdoc/cpdf_annot.h"
+ #include "core/fpdfdoc/cpdf_annotlist.h"
++#include "core/fpdftext/cpdf_textpage.h"
+ #include "fpdfsdk/cpdfsdk_helpers.h"
+ #include "public/fpdf_formfill.h"
+ #include "third_party/base/logging.h"
+@@ -732,6 +733,46 @@ FPDF_EXPORT int FPDF_CALLCONV FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object,
+ return ret_count;
+ }
+
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_GetTextProcessed(FPDF_PAGEOBJECT text_object,
++                             FPDF_TEXTPAGE page,
++                             int char_start,
++                             int char_count,
++                             unsigned short* result)
++{
++  if (!page || !text_object || char_start < 0 || char_count < 0 || !result)
++    return 0;
++
++  // The handles may not resolve to a text object / text page; bail out then.
++  CPDF_TextObject* pTxtObj = CPDFTextObjectFromFPDFPageObject(text_object);
++  CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(page);
++  if (!pTxtObj || !textpage)
++    return 0;
++
++  const int char_available = textpage->CountChars() - char_start;
++  if (char_available <= 0)
++    return 0;
++
++  char_count = std::min(char_count, char_available);
++  if (char_count == 0) {
++    // Writing out "", which has a character count of 1 due to the NUL.
++    *result = '\0';
++    return 1;
++  }
++
++  WideString str = textpage->GetTextObjectText(pTxtObj);
++  if (str.GetLength() > static_cast<size_t>(char_count))
++    str = str.Left(static_cast<size_t>(char_count));
++  // UTF16LE_Encode doesn't handle surrogate pairs properly, so it is expected
++  // the number of items to stay the same.
++  ByteString byte_str = str.UTF16LE_Encode();
++  const size_t byte_str_len = byte_str.GetLength();
++  const int ret_count = static_cast<int>(byte_str_len / sizeof(unsigned short));
++  ASSERT(ret_count <= char_count + 1);  // +1 to account for the NUL terminator.
++  memcpy(result, byte_str.GetBuffer(byte_str_len), byte_str_len);
++  return ret_count;
++}
++
+ FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV
+ FPDFTextObj_GetColor(FPDF_PAGEOBJECT text_object,
+ unsigned int* R,
+diff --git a/pdfium/public/fpdf_edit.h b/pdfium/public/fpdf_edit.h
+index f249e64..e14b2a5 100644
+--- a/pdfium/public/fpdf_edit.h
++++ b/pdfium/public/fpdf_edit.h
+@@ -1065,6 +1065,19 @@ FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object,
+ int char_count,
+ unsigned short* result);
+
++// Get the processed text of a text object.
++//
++// text_object - Handle of text object returned by FPDFPageObj_NewTextObj
++// or FPDFPageObj_NewTextObjEx.
++// Return Value:
++// The number of characters (not bytes) written in result.
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_GetTextProcessed(FPDF_PAGEOBJECT text_object,
++ FPDF_TEXTPAGE page,
++ int char_start,
++ int char_count,
++ unsigned short* result);
++
+ // Get the stroke RGBA of a text. Range of values: 0 - 255.
+ //
+ // path - the handle to the path object.
+--
+2.16.3
+
diff --git a/external/pdfium/UnpackedTarball_pdfium.mk b/external/pdfium/UnpackedTarball_pdfium.mk
index 6880ac0b670b..5525e9ddf65d 100644
--- a/external/pdfium/UnpackedTarball_pdfium.mk
+++ b/external/pdfium/UnpackedTarball_pdfium.mk
@@ -25,6 +25,7 @@ pdfium_patches += 0008-svx-correct-the-positioning-of-PDF-Paths-and-the-str.patc
pdfium_patches += 0009-svx-support-color-text-for-imported-PDFs.patch.2
pdfium_patches += 0010-svx-support-importing-forms-from-PDFs.patch.2
pdfium_patches += 0011-svx-correctly-possition-form-objects-from-PDF.patch.2
+pdfium_patches += 0012-svx-import-processed-PDF-text.patch.2
$(eval $(call gb_UnpackedTarball_UnpackedTarball,pdfium))
diff --git a/svx/source/svdraw/svdpdf.cxx b/svx/source/svdraw/svdpdf.cxx
index f2fbd7a835ed..1946b61c29bb 100644
--- a/svx/source/svdraw/svdpdf.cxx
+++ b/svx/source/svdraw/svdpdf.cxx
@@ -225,13 +225,18 @@ void ImpSdrPdfImport::DoLoopActions(SvdProgressInfo* pProgrInfo, sal_uInt32* pAc
<< ", height: " << dPageHeight);
SetupPageScale(dPageWidth, dPageHeight);
+ // Load the page text to extract it when we get text elements.
+ FPDF_TEXTPAGE pTextPage = FPDFText_LoadPage(pPdfPage);
+
const int nPageObjectCount = FPDFPage_CountObject(pPdfPage);
for (int nPageObjectIndex = 0; nPageObjectIndex < nPageObjectCount; ++nPageObjectIndex)
{
FPDF_PAGEOBJECT pPageObject = FPDFPage_GetObject(pPdfPage, nPageObjectIndex);
- ImportPdfObject(pPageObject, nPageObjectIndex);
+ ImportPdfObject(pPageObject, pTextPage, nPageObjectIndex);
}
+ FPDFText_ClosePage(pTextPage);
+
#if 0
// Now do the text.
FPDF_TEXTPAGE pTextPage = FPDFText_LoadPage(pPdfPage);
@@ -990,8 +995,8 @@ void ImpSdrPdfImport::checkClip()
}
bool ImpSdrPdfImport::isClip() const { return !maClip.getB2DRange().isEmpty(); }
-
-void ImpSdrPdfImport::ImportPdfObject(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex)
+void ImpSdrPdfImport::ImportPdfObject(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage,
+ int nPageObjectIndex)
{
if (pPageObject == nullptr)
return;
@@ -1000,7 +1005,7 @@ void ImpSdrPdfImport::ImportPdfObject(FPDF_PAGEOBJECT pPageObject, int nPageObje
switch (nPageObjectType)
{
case FPDF_PAGEOBJ_TEXT:
- ImportText(pPageObject, nPageObjectIndex);
+ ImportText(pPageObject, pTextPage, nPageObjectIndex);
break;
case FPDF_PAGEOBJ_PATH:
ImportPath(pPageObject, nPageObjectIndex);
@@ -1012,7 +1017,7 @@ void ImpSdrPdfImport::ImportPdfObject(FPDF_PAGEOBJECT pPageObject, int nPageObje
SAL_WARN("sd.filter", "Got page object SHADING: " << nPageObjectIndex);
break;
case FPDF_PAGEOBJ_FORM:
- ImportForm(pPageObject, nPageObjectIndex);
+ ImportForm(pPageObject, pTextPage, nPageObjectIndex);
break;
default:
SAL_WARN("sd.filter", "Unknown PDF page object #" << nPageObjectIndex
@@ -1021,7 +1026,8 @@ void ImpSdrPdfImport::ImportPdfObject(FPDF_PAGEOBJECT pPageObject, int nPageObje
}
}
-void ImpSdrPdfImport::ImportForm(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex)
+void ImpSdrPdfImport::ImportForm(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage,
+ int nPageObjectIndex)
{
SAL_WARN("sd.filter", "Got page object FORM: " << nPageObjectIndex);
@@ -1036,14 +1042,15 @@ void ImpSdrPdfImport::ImportForm(FPDF_PAGEOBJECT pPageObject, int nPageObjectInd
for (int nIndex = 0; nIndex < nCount; ++nIndex)
{
FPDF_PAGEOBJECT pFormObject = FPDFFormObj_GetSubObject(pPageObject, nIndex);
- ImportPdfObject(pFormObject, -1);
+ ImportPdfObject(pFormObject, pTextPage, -1);
}
// Restore the old one.
mCurMatrix = aOldMatrix;
}
-void ImpSdrPdfImport::ImportText(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex)
+void ImpSdrPdfImport::ImportText(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage,
+ int nPageObjectIndex)
{
SAL_WARN("sd.filter", "Got page object TEXT: " << nPageObjectIndex);
float left;
@@ -1075,14 +1082,15 @@ void ImpSdrPdfImport::ImportText(FPDF_PAGEOBJECT pPageObject, int nPageObjectInd
SAL_WARN("sd.filter", "Got TEXT origin: " << aPos);
SAL_WARN("sd.filter", "Got TEXT Bounds: " << aRect);
- const int nChars = FPDFTextObj_CountChars(pPageObject);
+ const int nChars = FPDFTextObj_CountChars(pPageObject) * 2;
std::unique_ptr<sal_Unicode[]> pText(new sal_Unicode[nChars + 1]); // + terminating null
unsigned short* pShortText = reinterpret_cast<unsigned short*>(pText.get());
- const int nActualChars = FPDFTextObj_GetText(pPageObject, 0, nChars, pShortText);
+ const int nActualChars
+ = FPDFTextObj_GetTextProcessed(pPageObject, pTextPage, 0, nChars, pShortText);
if (nActualChars <= 0)
{
- SAL_WARN("sd.filter", "Got not TEXT");
+ SAL_WARN("sd.filter", "Got no TEXT");
return;
}
diff --git a/svx/source/svdraw/svdpdf.hxx b/svx/source/svdraw/svdpdf.hxx
index 460b508e83a8..da54b9a40fa8 100644
--- a/svx/source/svdraw/svdpdf.hxx
+++ b/svx/source/svdraw/svdpdf.hxx
@@ -42,6 +42,7 @@ class SdrObject;
class SvdProgressInfo;
typedef void* FPDF_DOCUMENT;
typedef void* FPDF_PAGEOBJECT;
+typedef void* FPDF_TEXTPAGE;
// Helper Class to import PDF
class ImpSdrPdfImport final
@@ -86,7 +87,6 @@ class ImpSdrPdfImport final
double d() const { return md; }
double e() const { return me; }
double f() const { return mf; }
-
/// Mutliply this * other.
void Concatinate(const Matrix& other)
{
@@ -156,7 +156,6 @@ class ImpSdrPdfImport final
/// Correct the vertical coordinate to start at the top.
/// PDF coordinate system has orign at the bottom right.
double correctVertOrigin(double offsetPts) const { return mdPageHeightPts - offsetPts; }
-
/// Convert PDF points to logic (twips).
tools::Rectangle PointsToLogic(double left, double right, double top, double bottom) const;
Point PointsToLogic(double x, double y) const;
@@ -165,11 +164,12 @@ class ImpSdrPdfImport final
void checkClip();
bool isClip() const;
- void ImportPdfObject(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex);
- void ImportForm(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex);
+ void ImportPdfObject(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage,
+ int nPageObjectIndex);
+ void ImportForm(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage, int nPageObjectIndex);
void ImportImage(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex);
void ImportPath(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex);
- void ImportText(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex);
+ void ImportText(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage, int nPageObjectIndex);
void ImportText(const Point& rPos, const Size& rSize, const OUString& rStr);
void SetupPageScale(const double dPageWidth, const double dPageHeight);
@@ -193,7 +193,6 @@ public:
~ImpSdrPdfImport();
int GetPageCount() const { return mnPageCount; }
-
size_t DoImport(SdrObjList& rDestList, size_t nInsPos, int nPageNumber,
SvdProgressInfo* pProgrInfo = nullptr);
};