summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAshod Nakashian <ashod.nakashian@collabora.co.uk>2018-04-22 10:48:51 -0400
committerJan Holesovsky <kendy@collabora.com>2018-06-07 10:45:29 +0200
commit67bdfba807b257ce977d61239568877f2bff1429 (patch)
tree25a149f3ced1278396efa8811113ee52810e9ce2
parentsvx: correctly possition form objects from PDF (diff)
downloadcore-67bdfba807b257ce977d61239568877f2bff1429.tar.gz
core-67bdfba807b257ce977d61239568877f2bff1429.zip
svx: import processed PDF text
Some PDFs don't include spaces in the text. Instead, they rely on the explicit positioning of each character to render visually separated words. LaTeX-generated PDFs seem to be prone to this approach, though not exclusively. Luckily, PDFium does process text and inserts "generated" spaces where necessary, which is what we retrieve and use as the text string while importing. Change-Id: Ic21fe6c8416ecaba66f06b6260f1d6b040ff12af
-rw-r--r--external/pdfium/0012-svx-import-processed-PDF-text.patch.2148
-rw-r--r--external/pdfium/UnpackedTarball_pdfium.mk1
-rw-r--r--svx/source/svdraw/svdpdf.cxx30
-rw-r--r--svx/source/svdraw/svdpdf.hxx11
4 files changed, 173 insertions, 17 deletions
diff --git a/external/pdfium/0012-svx-import-processed-PDF-text.patch.2 b/external/pdfium/0012-svx-import-processed-PDF-text.patch.2
new file mode 100644
index 000000000000..cae9ec808aba
--- /dev/null
+++ b/external/pdfium/0012-svx-import-processed-PDF-text.patch.2
@@ -0,0 +1,148 @@
+From 7e8ecec81f102993e3fe73256415dcf049c09e29 Mon Sep 17 00:00:00 2001
+From: Ashod Nakashian <ashod.nakashian@collabora.co.uk>
+Date: Tue, 5 Jun 2018 11:35:39 +0200
+Subject: [PATCH 12/14] svx: import processed PDF text
+
+---
+ pdfium/core/fpdftext/cpdf_textpage.cpp | 29 ++++++++++++++++++++++++
+ pdfium/core/fpdftext/cpdf_textpage.h | 2 ++
+ pdfium/fpdfsdk/fpdf_editpage.cpp | 41 ++++++++++++++++++++++++++++++++++
+ pdfium/public/fpdf_edit.h | 13 +++++++++++
+ 4 files changed, 85 insertions(+)
+
+diff --git a/pdfium/core/fpdftext/cpdf_textpage.cpp b/pdfium/core/fpdftext/cpdf_textpage.cpp
+index 5690698..4d7c48a 100644
+--- a/pdfium/core/fpdftext/cpdf_textpage.cpp
++++ b/pdfium/core/fpdftext/cpdf_textpage.cpp
+@@ -1464,3 +1464,32 @@ Optional<PAGECHAR_INFO> CPDF_TextPage::GenerateCharInfo(wchar_t unicode) {
+ info.m_Origin.x, info.m_Origin.y);
+ return info;
+ }
++
++WideString CPDF_TextPage::GetTextObjectText(CPDF_TextObject* pTextObj) // Builds the text of pTextObj from m_CharList, plus a space entry that directly follows its characters.
++{
++ if (!m_bIsParsed) // Page not parsed: return an empty string.
++ return WideString();
++
++ float posy = 0; // NOTE(review): never read in this function.
++ bool IsContainPreChar = false; // True only when the immediately preceding entry belonged to pTextObj.
++ bool IsAddLineFeed = false; // NOTE(review): assigned below but never read in this function.
++ WideString strText;
++ for (const auto& charinfo : m_CharList) {
++ if (charinfo.m_pTextObj == pTextObj) { // Entry belongs to the requested text object.
++ IsContainPreChar = true;
++ IsAddLineFeed = false;
++ if (charinfo.m_Unicode) // Entries with a zero code point are not appended.
++ strText += charinfo.m_Unicode;
++ } else if (charinfo.m_Unicode == 32) { // A space (U+0020) not attributed to pTextObj; presumably one of the "generated" spaces.
++ if (IsContainPreChar && charinfo.m_Unicode) { // Appended only right after one of our characters; the m_Unicode test is always true here (it is 32).
++ strText += charinfo.m_Unicode;
++ IsContainPreChar = false; // Ensures at most one such space is appended per run.
++ IsAddLineFeed = false;
++ }
++ } else { // Any other entry from a different object ends the current run.
++ IsContainPreChar = false;
++ IsAddLineFeed = true;
++ }
++ }
++ return strText;
++}
+diff --git a/pdfium/core/fpdftext/cpdf_textpage.h b/pdfium/core/fpdftext/cpdf_textpage.h
+index 43a0312..7d5d5ec 100644
+--- a/pdfium/core/fpdftext/cpdf_textpage.h
++++ b/pdfium/core/fpdftext/cpdf_textpage.h
+@@ -105,6 +105,8 @@ class CPDF_TextPage {
+ WideString GetPageText(int start, int count) const;
+ WideString GetAllPageText() const { return GetPageText(0, CountChars()); }
+
++ WideString GetTextObjectText(CPDF_TextObject* pTextObj);
++
+ int CountRects(int start, int nCount);
+ bool GetRect(int rectIndex, CFX_FloatRect* pRect) const;
+
+diff --git a/pdfium/fpdfsdk/fpdf_editpage.cpp b/pdfium/fpdfsdk/fpdf_editpage.cpp
+index f4a1688..f34d3b5 100644
+--- a/pdfium/fpdfsdk/fpdf_editpage.cpp
++++ b/pdfium/fpdfsdk/fpdf_editpage.cpp
+@@ -27,6 +27,7 @@
+ #include "core/fpdfapi/parser/cpdf_string.h"
+ #include "core/fpdfdoc/cpdf_annot.h"
+ #include "core/fpdfdoc/cpdf_annotlist.h"
++#include "core/fpdftext/cpdf_textpage.h"
+ #include "fpdfsdk/cpdfsdk_helpers.h"
+ #include "public/fpdf_formfill.h"
+ #include "third_party/base/logging.h"
+@@ -732,6 +733,46 @@ FPDF_EXPORT int FPDF_CALLCONV FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object,
+ return ret_count;
+ }
+
++FPDF_EXPORT int FPDF_CALLCONV // Writes the processed text of text_object, as found on page, into result as UTF-16LE; returns the number of 16-bit items written.
++FPDFTextObj_GetTextProcessed(FPDF_PAGEOBJECT text_object,
++ FPDF_TEXTPAGE page,
++ int char_start,
++ int char_count,
++ unsigned short* result)
++{
++ if (!page || !text_object || char_start < 0 || char_count < 0 || !result) // Reject null handles, a null buffer and negative ranges.
++ return 0;
++
++ CPDF_TextObject* pTxtObj = CPDFTextObjectFromFPDFPageObject(text_object);
++ CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(page);
++ int char_available = textpage->CountChars() - char_start; // NOTE(review): char_start is only compared against the page-wide count here; it is not used as an offset into the returned text.
++ if (char_available <= 0)
++ return 0;
++
++ char_count = std::min(char_count, char_available);
++ if (char_count == 0) {
++ // Writing out "", which has a character count of 1 due to the NUL.
++ *result = '\0';
++ return 1;
++ }
++
++ WideString str = textpage->GetTextObjectText(pTxtObj);
++
++ if (str.GetLength() > static_cast<size_t>(char_count)) // Truncate to at most char_count characters.
++ str = str.Left(static_cast<size_t>(char_count));
++
++ // UTF16LE_Encode doesn't handle surrogate pairs properly, so the number of
++ // items is expected to stay the same.
++ ByteString byte_str = str.UTF16LE_Encode();
++ size_t byte_str_len = byte_str.GetLength();
++ constexpr size_t kBytesPerCharacter = sizeof(unsigned short);
++ int ret_count = byte_str_len / kBytesPerCharacter;
++
++ ASSERT(ret_count <= char_count + 1); // +1 to account for the NUL terminator.
++ memcpy(result, byte_str.GetBuffer(byte_str_len), byte_str_len); // Copies up to char_count + 1 items, so result must have room for the NUL as well.
++ return ret_count;
++}
++
+ FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV
+ FPDFTextObj_GetColor(FPDF_PAGEOBJECT text_object,
+ unsigned int* R,
+diff --git a/pdfium/public/fpdf_edit.h b/pdfium/public/fpdf_edit.h
+index f249e64..e14b2a5 100644
+--- a/pdfium/public/fpdf_edit.h
++++ b/pdfium/public/fpdf_edit.h
+@@ -1065,6 +1065,19 @@ FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object,
+ int char_count,
+ unsigned short* result);
+
++// Get the processed text of a text object.
++//
++// text_object - Handle of text object returned by FPDFPageObj_NewTextObj
++// or FPDFPageObj_NewTextObjEx.
++// Return Value:
++// The number of characters (not bytes) written in result.
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_GetTextProcessed(FPDF_PAGEOBJECT text_object,
++ FPDF_TEXTPAGE page,
++ int char_start,
++ int char_count,
++ unsigned short* result);
++
+ // Get the stroke RGBA of a text. Range of values: 0 - 255.
+ //
+ // path - the handle to the path object.
+--
+2.16.3
+
diff --git a/external/pdfium/UnpackedTarball_pdfium.mk b/external/pdfium/UnpackedTarball_pdfium.mk
index 6880ac0b670b..5525e9ddf65d 100644
--- a/external/pdfium/UnpackedTarball_pdfium.mk
+++ b/external/pdfium/UnpackedTarball_pdfium.mk
@@ -25,6 +25,7 @@ pdfium_patches += 0008-svx-correct-the-positioning-of-PDF-Paths-and-the-str.patc
pdfium_patches += 0009-svx-support-color-text-for-imported-PDFs.patch.2
pdfium_patches += 0010-svx-support-importing-forms-from-PDFs.patch.2
pdfium_patches += 0011-svx-correctly-possition-form-objects-from-PDF.patch.2
+pdfium_patches += 0012-svx-import-processed-PDF-text.patch.2
$(eval $(call gb_UnpackedTarball_UnpackedTarball,pdfium))
diff --git a/svx/source/svdraw/svdpdf.cxx b/svx/source/svdraw/svdpdf.cxx
index f2fbd7a835ed..1946b61c29bb 100644
--- a/svx/source/svdraw/svdpdf.cxx
+++ b/svx/source/svdraw/svdpdf.cxx
@@ -225,13 +225,18 @@ void ImpSdrPdfImport::DoLoopActions(SvdProgressInfo* pProgrInfo, sal_uInt32* pAc
<< ", height: " << dPageHeight);
SetupPageScale(dPageWidth, dPageHeight);
+ // Load the page text to extract it when we get text elements.
+ FPDF_TEXTPAGE pTextPage = FPDFText_LoadPage(pPdfPage);
+
const int nPageObjectCount = FPDFPage_CountObject(pPdfPage);
for (int nPageObjectIndex = 0; nPageObjectIndex < nPageObjectCount; ++nPageObjectIndex)
{
FPDF_PAGEOBJECT pPageObject = FPDFPage_GetObject(pPdfPage, nPageObjectIndex);
- ImportPdfObject(pPageObject, nPageObjectIndex);
+ ImportPdfObject(pPageObject, pTextPage, nPageObjectIndex);
}
+ FPDFText_ClosePage(pTextPage);
+
#if 0
// Now do the text.
FPDF_TEXTPAGE pTextPage = FPDFText_LoadPage(pPdfPage);
@@ -990,8 +995,8 @@ void ImpSdrPdfImport::checkClip()
}
bool ImpSdrPdfImport::isClip() const { return !maClip.getB2DRange().isEmpty(); }
-
-void ImpSdrPdfImport::ImportPdfObject(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex)
+void ImpSdrPdfImport::ImportPdfObject(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage,
+ int nPageObjectIndex)
{
if (pPageObject == nullptr)
return;
@@ -1000,7 +1005,7 @@ void ImpSdrPdfImport::ImportPdfObject(FPDF_PAGEOBJECT pPageObject, int nPageObje
switch (nPageObjectType)
{
case FPDF_PAGEOBJ_TEXT:
- ImportText(pPageObject, nPageObjectIndex);
+ ImportText(pPageObject, pTextPage, nPageObjectIndex);
break;
case FPDF_PAGEOBJ_PATH:
ImportPath(pPageObject, nPageObjectIndex);
@@ -1012,7 +1017,7 @@ void ImpSdrPdfImport::ImportPdfObject(FPDF_PAGEOBJECT pPageObject, int nPageObje
SAL_WARN("sd.filter", "Got page object SHADING: " << nPageObjectIndex);
break;
case FPDF_PAGEOBJ_FORM:
- ImportForm(pPageObject, nPageObjectIndex);
+ ImportForm(pPageObject, pTextPage, nPageObjectIndex);
break;
default:
SAL_WARN("sd.filter", "Unknown PDF page object #" << nPageObjectIndex
@@ -1021,7 +1026,8 @@ void ImpSdrPdfImport::ImportPdfObject(FPDF_PAGEOBJECT pPageObject, int nPageObje
}
}
-void ImpSdrPdfImport::ImportForm(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex)
+void ImpSdrPdfImport::ImportForm(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage,
+ int nPageObjectIndex)
{
SAL_WARN("sd.filter", "Got page object FORM: " << nPageObjectIndex);
@@ -1036,14 +1042,15 @@ void ImpSdrPdfImport::ImportForm(FPDF_PAGEOBJECT pPageObject, int nPageObjectInd
for (int nIndex = 0; nIndex < nCount; ++nIndex)
{
FPDF_PAGEOBJECT pFormObject = FPDFFormObj_GetSubObject(pPageObject, nIndex);
- ImportPdfObject(pFormObject, -1);
+ ImportPdfObject(pFormObject, pTextPage, -1);
}
// Restore the old one.
mCurMatrix = aOldMatrix;
}
-void ImpSdrPdfImport::ImportText(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex)
+void ImpSdrPdfImport::ImportText(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage,
+ int nPageObjectIndex)
{
SAL_WARN("sd.filter", "Got page object TEXT: " << nPageObjectIndex);
float left;
@@ -1075,14 +1082,15 @@ void ImpSdrPdfImport::ImportText(FPDF_PAGEOBJECT pPageObject, int nPageObjectInd
SAL_WARN("sd.filter", "Got TEXT origin: " << aPos);
SAL_WARN("sd.filter", "Got TEXT Bounds: " << aRect);
- const int nChars = FPDFTextObj_CountChars(pPageObject);
+ const int nChars = FPDFTextObj_CountChars(pPageObject) * 2;
std::unique_ptr<sal_Unicode[]> pText(new sal_Unicode[nChars + 1]); // + terminating null
unsigned short* pShortText = reinterpret_cast<unsigned short*>(pText.get());
- const int nActualChars = FPDFTextObj_GetText(pPageObject, 0, nChars, pShortText);
+ const int nActualChars
+ = FPDFTextObj_GetTextProcessed(pPageObject, pTextPage, 0, nChars, pShortText);
if (nActualChars <= 0)
{
- SAL_WARN("sd.filter", "Got not TEXT");
+ SAL_WARN("sd.filter", "Got no TEXT");
return;
}
diff --git a/svx/source/svdraw/svdpdf.hxx b/svx/source/svdraw/svdpdf.hxx
index 460b508e83a8..da54b9a40fa8 100644
--- a/svx/source/svdraw/svdpdf.hxx
+++ b/svx/source/svdraw/svdpdf.hxx
@@ -42,6 +42,7 @@ class SdrObject;
class SvdProgressInfo;
typedef void* FPDF_DOCUMENT;
typedef void* FPDF_PAGEOBJECT;
+typedef void* FPDF_TEXTPAGE;
// Helper Class to import PDF
class ImpSdrPdfImport final
@@ -86,7 +87,6 @@ class ImpSdrPdfImport final
double d() const { return md; }
double e() const { return me; }
double f() const { return mf; }
-
/// Mutliply this * other.
void Concatinate(const Matrix& other)
{
@@ -156,7 +156,6 @@ class ImpSdrPdfImport final
/// Correct the vertical coordinate to start at the top.
/// PDF coordinate system has orign at the bottom right.
double correctVertOrigin(double offsetPts) const { return mdPageHeightPts - offsetPts; }
-
/// Convert PDF points to logic (twips).
tools::Rectangle PointsToLogic(double left, double right, double top, double bottom) const;
Point PointsToLogic(double x, double y) const;
@@ -165,11 +164,12 @@ class ImpSdrPdfImport final
void checkClip();
bool isClip() const;
- void ImportPdfObject(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex);
- void ImportForm(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex);
+ void ImportPdfObject(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage,
+ int nPageObjectIndex);
+ void ImportForm(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage, int nPageObjectIndex);
void ImportImage(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex);
void ImportPath(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex);
- void ImportText(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex);
+ void ImportText(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage, int nPageObjectIndex);
void ImportText(const Point& rPos, const Size& rSize, const OUString& rStr);
void SetupPageScale(const double dPageWidth, const double dPageHeight);
@@ -193,7 +193,6 @@ public:
~ImpSdrPdfImport();
int GetPageCount() const { return mnPageCount; }
-
size_t DoImport(SdrObjList& rDestList, size_t nInsPos, int nPageNumber,
SvdProgressInfo* pProgrInfo = nullptr);
};